Example #1
def seg(content):
        # Set your own model path
    MODELDIR="/home/liuqi/ltp/pyltp/ltp_data/"
    segmentor = Segmentor()
    segmentor.load(MODELDIR+"cws.model")
    tWords = segmentor.segment(content)
    return tWords
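Example #1 loads cws.model on every call and never releases it. Below is a minimal sketch of the full load / segment / release lifecycle, assuming the pyltp 0.2.x API used throughout these examples and a placeholder model directory:

# Sketch only: MODELDIR is a placeholder for your own ltp_data directory.
import os
from pyltp import Segmentor

MODELDIR = "/path/to/ltp_data"

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
try:
    words = segmentor.segment("中国进出口银行与中国银行加强合作")
    print("/".join(words))
finally:
    segmentor.release()  # free the underlying C++ model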
Example #2
def split_words(sentence = "中国进出口银行与中国银行加强合作",type_list=0):
    """分词,若type_list=True,则返回以列表返回分词后的结果。"""
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment(sentence)
    if type_list:
        return [i for i in words]
    return words
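A hypothetical call to split_words(), assuming MODELDIR points at the ltp_data directory as in the other examples:

# Hypothetical usage; requires MODELDIR to be defined at module level.
tokens = split_words("中国进出口银行与中国银行加强合作", type_list=True)
print(tokens)  # a plain Python list of the segmented words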
Example #3
class pnn_count():
	def __init__(self):
		self.mydict = {}
		self.segmentor = Segmentor()
		self.segmentor.load('cws.model')
		self.hash_dict()
		self.ltp_process()
	def ltp_process(self):
		sentence_num = 0
		right_num = 0;
		f = open('pnn_annotated.txt','r')
		for line in f:
			sentence_num += 1
			#print line
			line_array = line.split('\t')
			line = line_array[1]
			count = 0
			words = self.segmentor.segment(line)
			for i in words:
				if self.mydict.has_key(i):
					count = count + self.mydict[i]
			if count > 0:		
				answer = "positive"
				if line_array[0] == '1':
					right_num += 1
			elif count == 0:
				answer = "neuter"
				if line_array[0] == '0':
					right_num += 1
			else:
				answer = "negative"
				if line_array[0] == '-1':
					right_num += 1
			#print "My guess is %s" %answer
			#print "THe right answer is %s" %line_array[0]

			#print "result  %d" % count
		f.close()
		print "total sentence is %d, right answer is %d" %(sentence_num,right_num)
	def hash_dict(self):
		f = open('negative.txt','r')
		for line in f:
			line = line.strip('\n')
			line = line.strip('\r')
			self.mydict[line] = -1
		f.close()
		f = open('positive.txt','r')
		for line in f:
			line = line.strip('\n')
			line = line.strip('\r')
			self.mydict[line] = 1
		f.close()
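pnn_count.ltp_process() scores a sentence by summing word polarities via dict.has_key(), which only exists in Python 2. A membership-test version of the same scoring step (a sketch with hypothetical names) runs on both Python 2 and 3:

def score_words(words, polarity_dict):
    # Sum the polarity (+1 for positive, -1 for negative) of every segmented
    # word that appears in the lexicon; unknown words contribute nothing.
    return sum(polarity_dict[w] for w in words if w in polarity_dict)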
Example #4
def process(index):

	ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
	sys.path.append(os.path.join(ROOTDIR, "lib"))

	# Set your own model path
	MODELDIR=os.path.join(ROOTDIR, "ltp_data")

	segmentor = Segmentor()
	segmentor.load(os.path.join(MODELDIR, "cws.model"))

	finname = "o_"+str(index)+".txt"
	foutname = "p_"+str(index)+".txt"
	print finname
	count = 0
	fin = codecs.open(finname, encoding='utf-8')
	with codecs.open(foutname, 'w', encoding="utf-8") as fout:
		while 1:
			line = fin.readline()
			if not line:
			    break
			tmp = line.split(" ^ {")[1] # Get JSON
			tmp = "{"+tmp
			data = json.loads(tmp)
			content = data['content']
			# error_correction(content)
			content = content.strip()
			segmentation = ""
			for line in content.split("\n"):
				line = line.encode("utf-8")
				words = segmentor.segment(line)
				segmentation += "/".join(words)
				segmentation += "/"

			# Return type of the function is str, not unicode. Thus need to change into unicode.
			segmentation = unicode(segmentation, "utf-8")
			pinyin = add_pinyin(segmentation)
			obj = {}
			obj['flavor'] = data['flavor']
			obj['environment'] = data['environment']
			obj['service'] = data['service']
			obj['content'] = data['content']
			obj['segmentation'] = segmentation
			obj['pinyin'] = pinyin
			tmpstr = json.dumps(obj,ensure_ascii=False)
			fout.write(tmpstr)
			fout.write('\n')
			count += 1
			print count
		segmentor.release()
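Example #4 encodes every line to UTF-8 before segmenting and decodes the joined result afterwards, which is only necessary under Python 2. The segmentation-and-join step itself can be sketched separately (assuming an already-loaded segmentor):

def segment_content(segmentor, content):
    # Mirror Example #4: segment each line and join all words with "/",
    # leaving a trailing "/" just like the original loop does.
    parts = []
    for line in content.strip().split("\n"):
        words = segmentor.segment(line)
        parts.append("/".join(words))
    return "/".join(parts) + "/"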
Example #5
def segmentation(filename, output_filename):

    print "segmenting '%s' to '%s'" % (filename, output_filename)

    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"

    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    
    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0]+"_ner.txt", "w")

    for _line in lines:
        line = _line.rstrip("\r\n")
        
        words = segmentor.segment(line)
        postags = postagger.postag(words)
#        netags = recognizer.recognize(words, postags)
#        arcs = parser.parse(words, postags)

        for i in range(len(words)):
            f.write( "%s/%s\t" % (words[i], postags[i]))
#            if netags[i]!='O':
#                fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
#        fner.write("\n")

    f.close()
    fner.close()
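Example #5 loads four models (including the unused recognizer and parser) but never releases them. A minimal sketch of the same word/POS-tagging output with explicit cleanup, assuming the pyltp 0.2.x API and a placeholder model directory:

import os
from pyltp import Segmentor, Postagger

MODELDIR = "./ltp_data/"  # placeholder

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
try:
    words = segmentor.segment("中国进出口银行与中国银行加强合作")
    postags = postagger.postag(words)
    print("\t".join("%s/%s" % (w, p) for w, p in zip(words, postags)))
finally:
    postagger.release()
    segmentor.release()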
Example #6
    def __init__(self):
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`
        segmentor = Segmentor()
        segmentor.load(self.cws_model_path)
        self.words = segmentor.segment(data)
        # print("|".join(words))
        segmentor.release()


        postagger = Postagger() # 初始化实例
        postagger.load(self.pos_model_path)  # 加载模型
        self.postags = postagger.postag(self.words)  # 词性标注
        # print('\t'.join(postags))
        postagger.release()  # 释放模型


        recognizer = NamedEntityRecognizer() # 初始化实例
        recognizer.load(self.ner_model_path)  # 加载模型
        self.netags = recognizer.recognize(self.words, self.postags)  # 命名实体识别
        # print('\t'.join(netags))
        recognizer.release()  # 释放模型
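Example #6 refers to self.LTP_DATA_DIR and data, which must be defined elsewhere in the class. A self-contained sketch of the same segment, postag, recognize chain, with the model directory and input text passed in explicitly (pyltp 0.2.x API assumed):

import os
from pyltp import Segmentor, Postagger, NamedEntityRecognizer

def analyze(ltp_data_dir, text):
    # Run segmentation, POS tagging and NER over one piece of text,
    # releasing each model as soon as its output has been copied out.
    segmentor = Segmentor()
    segmentor.load(os.path.join(ltp_data_dir, "cws.model"))
    words = list(segmentor.segment(text))
    segmentor.release()

    postagger = Postagger()
    postagger.load(os.path.join(ltp_data_dir, "pos.model"))
    postags = list(postagger.postag(words))
    postagger.release()

    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(ltp_data_dir, "ner.model"))
    netags = list(recognizer.recognize(words, postags))
    recognizer.release()

    return words, postags, netags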
Example #7
def pyltp_words():
    from pyltp import Segmentor, Postagger
    segmentor = Segmentor()
    segmentor.load("/home/fredgan/github/pyltp/ltp_data/cws.model")
    # postagger = Postagger()    
    # postagger.load("~/github/pyltp/ltp_data/cpos.model")
    for line in open(sys.argv[1], 'r'):
        try:
            style,sentence = line.strip().split('\t')
        except:
            continue
        style_dic.setdefault(style, {})
        words = segmentor.segment(sentence)
        # postags = postagger.postag(words)
        for w in words:
            if w in style_dic[style]:
                style_dic[style][w] += 1
            else:
                style_dic[style][w] = 1

    for k,v in style_dic.iteritems():
        v_list = sorted(v.iteritems(), key = lambda d:d[1], reverse = True)
        print k+ "\t" + " ".join(map(lambda i:i[0] + ":" +str(i[1]), v_list[0:min(50,len(v_list))]))
Example #8
class EventInfoExtract():
    def __init__(self,modulePath,outfile):
        self.MODELDIR = modulePath
        self.adict = {
        '·' :'',
        '的':'',
        '了':'',
        '“':'',
        '”':'',
        '一次':''
        }
        self.segmentor=None
        self.postagger=None
        self.parser=None
        self.recognizer=None
        self.out_file=outfile
        
        
    def multiple_replace(self, text):
        rx = re.compile('|'.join(map(re.escape, self.adict)))
        def one_xlat(match):
            return self.adict[match.group(0)]
        return rx.sub(one_xlat, text)


    def InitModule(self):
        #print "正在加载LTP模型... ..."
        self.segmentor = Segmentor()
        #print os.path.join(self.MODELDIR, "cws.model")
        self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))   #分词模型,单文件
        
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model")) #词性标注模型,单文件
        
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))  #依存句法分析模型,单文件
        
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model")) #命名实体识别模型,单文件
        #print self.recognizer

    def release_module(self):
        '''
        release the model
        '''
        self.segmentor.release()
        self.segmentor=None
        self.postagger.release()
        self.postagger=None
        self.parser.release()
        self.parser=None
        self.recognizer.release()
        self.recognizer=None
        

    def Txtextraction_start(self,txt,out_file):
        """
        事实三元组的控制程序
        Args:
            txt: 待抽取的内容
        """
        txt = txt.strip()
        out_file = open(self.out_file, 'a')
        #try:
        #print "Execute here====-===="
        self.fact_triple_extract(txt,out_file)
        out_file.flush()
        out_file.close()
    
    def addresssTime_extract(self,inputtxt):
        #这个地方先做实体抽取,提取出人物、组织和相关的时间,首先分词,得到分词结果
        #words = self.segmentor.segment(inputtxt)
        sentences = inputtxt.split('。')
        #print sentences
        DataAndTime=[]
        for sentence in sentences:
            if len(sentence)<=1:
                continue
            #sentence = u"北京是中国首都"
            words = self.segmentor.segment(sentence)
            #print '\t'.join(words)
            postags = self.postagger.postag(words)
            netags = self.recognizer.recognize(words, postags)
            #print '\t'.join(postags)
            arcs = self.parser.parse(words, postags)
            #print "sentence;===========132123123123123"
            Dt={'date':'','address':''}
            if (("发生" in sentence or "遭" in sentence) and ("爆炸" in sentence or "事件" in sentence or "袭击" in sentence )) or (("恐怖" in sentence) or ("袭击" in sentence)):
                Flag=False
                #print '\t'.join(words)
                #print '\t'.join(postags)
                #print '\t'.join(postags)
                Addressbackups=[]
                Address =''
                for i in range(len(postags)-1):
                    if Flag==True:
                        if postags[i]=='ns'or postags[i]=='nd' or postags[i]=='n': # ns 地理名 nd方向名词 n一般名词
                            head = arcs[i].head
                            Address=Address+words[i]
                            if postags[head-1]=="n":
                                Address+=words[head-1]
                                head = arcs[head-1].head
                            if(words[head-1]=="在" or words[head-1]=="发生" or  words[head-1]=="袭击"  or words[head-1]=="遭" or words[head-1]=="遭遇" or words[head-1]=="将"):
                                Dt['address']=Address
                                break
                        else:
                            print "地址,",Address
                            Addressbackups.append(Address)
                            Address=''
                            Flag=False
                        continue
                    if postags[i]=='ns' and Flag == False:
                        #这个地方只会第一次进来。
                        head = arcs[i].head
                        Address = Address+words[i]
                        if (words[head-1]=="在" or words[head-1]=="发生" or  words[head-1]=="遭" or words[head-1]=="遭遇"  or words[head-1]=="将"):
                            Dt['address']=Address
                            break
                        #if postags[i+1]!='ns' or postags[i+1]!='nd' or postags[i+1]!='n':
                        #    print "wewewerwer====,",Address
                        #    Addressbackups.append(Address)
                        Flag = True 
                #print Addressbackups[0]
            if ("月" in sentence or '日' in sentence) and ("发生" in sentence or "袭击" in sentence):
                Flag = False
                Date=''
                Datebackup=[]
                for i in range(len(postags)-1):
                    if Flag==True:
                        if postags[i]=='nt':
                            #print words[i]
                            head = arcs[i].head
                            Date=Date+words[i]
                            if words[head-1]=="发生" or words[head-1]=="袭击":
                                Dt['date']=Date
                                break
                        else:
                            Datebackup.append(Date)
                            Date=''
                            Flag=False
                        continue
                    
                    if postags[i]=='nt' and Flag == False:
                        Date = Date+words[i]
                        #获取一下head
                        head = arcs[i].head
                        if words[head-1]=="发生" or words[head-1]=="袭击":
                            Dt['date']=Date
                            break
                        if postags[i+1]!='nt':
                            Datebackup.append(Date)
                        #index=i
                        Flag = True 
                if Dt['date']=='' and len(Datebackup):
                    Dt['date']=Datebackup[-1]
            if Dt['date']!='' or Dt['address']!='':
                DataAndTime.append(Dt)
                
        if len(DataAndTime)>1:
            for i in DataAndTime:
                if i['date']=="当天":
                    DataAndTime.remove(i)
        if len(DataAndTime)==0:
            Dt['date']=''
            Dt['address']=''
            DataAndTime.append(Dt)
        
        return DataAndTime
            
            

    def extraction_start(self, input_txt, out_file_name):
        """
        事实三元组抽取的总控程序
        Args:
            input_txt: 待抽取的文本
            out_file_name: 输出文件的名称
        """
        out_file = open(out_file_name, 'a')

        sentence_number = 0
        for text_line in input_txt.splitlines():
            sentence = text_line.strip()
            # 空行或长段(大于1000)直接跳过
            if sentence == "" or len(sentence) > 1000:
                continue
            try:
              sentence_one = sentence.split(" ")#"。"
              for num in range(len(sentence_one)-1):
                  self.fact_triple_extract(sentence, out_file)
                  out_file.flush()
            except:
                pass
            sentence_number += 1
            if sentence_number % 50 == 0:
                print "%d done" % (sentence_number)
        out_file.close()

    def attribute_define0(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-2][0]=='n'):
                            continue
                        else:
                            print "事件属性:","".join(words[index-i-1:index+1])
                            break

    def attribute_define1(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-2][0]=='n'):
                            continue
                        else:
                            if(i != 0):
                                print "事件属性:","".join(words[index-i-1:index+1])
                            break

    def num_define(self,text):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        for index in range(len(words)):
            if(postags[index]=='m'):  
                return words[index]
                        
    def attribute_define2(self,text,keywords):
        words = self.segmentor.segment(text)
        #postags = postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if words[index-i-1] not in ('发生', '是'):  #|(words[index-i-1]!='遭遇'):
                            continue
                        else:
                            if(i != 0):
                                attribute = "".join(words[index-i:index+1])
                                #attribute = multiple_replace(attribute)
                                print '==========='
                                if attribute in '恐怖袭击事件':
                                    return
                                return attribute
                            else:
                                return


    def organization_define(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-1][0]=='n')&(index-i-1 != 0):
                            continue
                        else:
                            if(words[index-1]=='组织')&(postags[index-2][0]!='n'):      
                                continue
                            if(i != 0):
                                print "组织:","".join(words[index-i:index])
                                return "".join(words[index-i:index])
    def organization_define1(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-1][0]=='n')&(index-i-1 != 0):
                            continue
                        else:
                            if(words[index-1]=='组织')&(postags[index-2][0]!='n'):      
                                continue
                            if(i != 0):
                                #print "组织:","".join(words[index-i:index])
                                return "".join(words[index-i:index])

    def fact_attribute_from_text(self,text):
        """
        """
        text = text.replace(',','。')
        sentence_one = text.split("。")
        
        fact_attribute = []
        for num in range(len(sentence_one)-1):
            if('袭击' in sentence_one[num]):
                #attribute_define0(sentence_one[num],'事件')
                    #print sentence_one[num]
                sentence_temp = self.multiple_replace(sentence_one[num])
                if('发生' in sentence_temp)|('遭遇' in sentence_temp):
                    #print '---------------',sentence_temp
                    temp_atrribut1 = self.attribute_define2(sentence_temp,'事件')
                    #print temp_atrribut1
                    if((temp_atrribut1)==None):
                        temp_atrribut2 = self.attribute_define2(sentence_temp,'袭击')
                        #print temp_atrribut2
                        if temp_atrribut2==None:
                            return
                        fact_attribute.append(str(temp_atrribut2))
                    else:
                        fact_attribute.append(str(temp_atrribut1))
        #print '------------------'
        if(len(fact_attribute)==0):
            #print '事件属性:unkown!'
            return 'None'
        else:
            #print '事件属性1:', len(fact_attribute),''.join(fact_attribute)
            #print '事件属性:',max(fact_attribute, key=len)
            return max(fact_attribute, key=len)

    def organization_from_text(self,text):
        """
        事实三元组抽取的总控程序
        Args:
            in_file_name: 输入文件的名称
            #out_file_name: 输出文件的名称
            begin_line: 读文件的起始行
            end_line: 读文件的结束行
        """
        sentence_one = text.split("。")
        #print '---------------------------',sentence_one[0]   
        ogniz = []
        for num in range(len(sentence_one)-1):
            if('负责' in sentence_one[num]):
                if('宣称' in sentence_one[num]):
                    #print sentence_one[num]
                    sentence_temp = sentence_one[num].replace('“','')
                    sentence_temp = sentence_temp.replace('”','')
                    temp_org = self.organization_define(sentence_temp,'宣称')
                    if(temp_org != None):
                        ogniz.append(temp_org)
            if(len(ogniz)==0):
                if('宣称' in sentence_one[num]):
                    #print sentence_one[num]
                    sentence_temp = sentence_one[num].replace('“','')
                    sentence_temp = sentence_temp.replace('”','')
                    temp_org = self.organization_define1(sentence_temp,'宣称')
                    if(temp_org != None):
                        ogniz.append(temp_org)
        if(len(ogniz)==0):
            #print '组织:unkown!'
            return 'unknown'
        else:
            #print '组织:',max(ogniz, key=len)
            #print ogniz
            return max(ogniz, key=len)

    def death_num_from_text(self,text):
        """
        事实三元组抽取的总控程序
        Args:
            in_file_name: 输入文件的名称
            #out_file_name: 输出文件的名称
            begin_line: 读文件的起始行
            end_line: 读文件的结束行
        """
        text = text.replace(',','。')
        text = text.replace('、','。')
        sentence_one = text.split("。")
        death_num = None
        hurt_num = None
        total_num = None
        #print '---------------------------',sentence_one[0]   
 
        for num in range(len(sentence_one)-1):
            if('死亡' in sentence_one[num])|('丧生' in sentence_one[num]):
                #print sentence_one[num]
                if(death_num == None):
                    death_num = self.num_define(sentence_one[num])
                    #print '死亡人数:',death_num
            if('受伤' in sentence_one[num]):
                #print sentence_one[num]        
                if(hurt_num == None):
                    hurt_num = self.num_define(sentence_one[num])
                    #print '受伤人数:',hurt_num
            if('伤亡' in sentence_one[num]):
                #print sentence_one[num]
                if(total_num == None):
                    total_num = self.num_define(sentence_one[num])
            #print type(death_num),type(hurt_num),type(total_num)
        return death_num,hurt_num,total_num
        


    def fact_triple_extract(self,sentence, out_file):
        #print sentence
        """
        对于给定的句子进行事实三元组抽取
        Args:
            sentence: 要处理的语句
        """
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)
        
        Entity_Address=[]
        Entity_Name = []
        
        for index in range(len(postags)):
            e1 = ''
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                if 'Ns' in netags[index]:
                    ni = index
                    if netags[ni][0] == 'B':
                        while netags[ni][0] != 'E':
                            ni += 1
                        e1 = ''.join(words[index:ni+1])
                    else:
                        e1 = words[ni]
                    Entity_Address.append(e1)
                if "Nh" in netags[index]:
                    ni = index
                    if netags[ni][0]=='B':
                        while netags[ni][0]!='E':
                            ni+=1
                        e1= ''.join(words[index:ni+1])
                    else:
                        e1=words[ni]
                        Entity_Name .append(e1)
        Entity_Address = list(set(Entity_Address))
        Entity_Name = list(set(Entity_Name))
        for i in Entity_Name:
            print i
        AddressTp =[]
        LocateAddress = []
        for index in range(len(postags)):
            # 抽取以谓词为中心的事实三元组
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # 主谓宾
                Flag = False
                if child_dict.has_key('SBV') and child_dict.has_key('VOB'):
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    out_file.write("主语谓语宾语关系\t(%s, %s, %s)\n" % (e1, r, e2))
                    for address in Entity_Address:
                        if address in e1 and ( ("袭击" in e1 or "袭击" in e2) or ("事件" in e2 or "事件" in e1)):
                            for name in Entity_Name:
                                if name in e1:
                                    Flag = False
                                    break
                            else:
                                Flag = True
                            if Flag == True:
                                for i in Entity_Address:
                                    if i in e1 or i in e2:
                                        AddressTp.append(i)    
                    out_file.flush()
    
                # 定语后置,动宾关系
                if arcs[index].relation == 'ATT':
                    if child_dict.has_key('VOB'):
                        e1 = self.complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        temp_string = r+e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            #print "定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2)
                            out_file.write("定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
                # 含有介宾关系的主谓动补关系
                if child_dict.has_key('SBV') and child_dict.has_key('CMP'):
                    #e1 = words[child_dict['SBV'][0]]
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if child_dict_list[cmp_index].has_key('POB'):
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                        #print "介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2)
                        out_file.write("介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2))
                        out_file.flush()
    
            # 尝试抽取命名实体有关的三元组
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                ni = index
                if netags[ni][0] == 'B':
                    while netags[ni][0] != 'E':
                        ni += 1
                    e1 = ''.join(words[index:ni+1])
                else:
                    e1 = words[ni]
                if arcs[ni].relation == 'ATT' and postags[arcs[ni].head-1] == 'n' and netags[arcs[ni].head-1] == 'O':
                    r = self.complete_e(words, postags, child_dict_list, arcs[ni].head-1)
                    if e1 in r:
                        r = r[(r.index(e1)+len(e1)):]
                    if arcs[arcs[ni].head-1].relation == 'ATT' and netags[arcs[arcs[ni].head-1].head-1] != 'O':
                        e2 = self.complete_e(words, postags, child_dict_list, arcs[arcs[ni].head-1].head-1)
                        mi = arcs[arcs[ni].head-1].head-1
                        li = mi
                        if netags[mi][0] == 'B':
                            while netags[mi][0] != 'E':
                                mi += 1
                            e = ''.join(words[li+1:mi+1])
                            e2 += e
                        if r in e2:
                            e2 = e2[(e2.index(r)+len(r)):]
                        if r+e2 in sentence:
                            #print "人名//地名//机构\t(%s, %s, %s)\n" % (e1, r, e2)
                            out_file.write("人名//地名//机构\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
                
        AddressTp = list(set(AddressTp))
        LocateAddress = AddressTp
        # 去掉是其他地址子串的地址,再把剩余地址拼接后输出一次
        Tp = list(LocateAddress)
        for i in LocateAddress:
            for k in AddressTp:
                if i != k and (i in k) and i in Tp:
                    Tp.remove(i)
        address = ''
        for i in Tp:
            address += i
        print "地点:", address
                

    def build_parse_child_dict(self,words, postags, arcs):
        """
        为句子中的每个词语维护一个保存句法依存儿子节点的字典
        Args:
            words: 分词列表
            postags: 词性列表
            arcs: 句法依存列表
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:
                    if child_dict.has_key(arcs[arc_index].relation):
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            #if child_dict.has_key('SBV'):
            #    print words[index],child_dict['SBV']
            child_dict_list.append(child_dict)
        return child_dict_list
    
    def complete_e(self,words, postags, child_dict_list, word_index):
        """
        完善识别的部分实体
        """
        child_dict = child_dict_list[word_index]
        prefix = ''
        if child_dict.has_key('ATT'):
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
        
        postfix = ''
        if postags[word_index] == 'v':
            if child_dict.has_key('VOB'):
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if child_dict.has_key('SBV'):
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
    
        return prefix + words[word_index] + postfix

    def attribute_define0(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-2][0]=='n'):
                            continue
                        else:
                            print "事件属性:","".join(words[index-i-1:index+1])
                            break
    
    def attribute_define1(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-2][0]=='n'):
                            continue
                        else:
                            if(i != 0):
                                print "事件属性:","".join(words[index-i-1:index+1])
                            break
    
    def attribute_define2(self,text,keywords):
        #print text
        words = self.segmentor.segment(text)
        print words
        #print self.segmentor
        #print '\t'.join(words)
        #postags = postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                #print words[index]
                if(words[index]==keywords):  
                    for i in range(index):
                        if words[index-i-1] not in ('发生', '是'):  #|(words[index-i-1]!='遭遇'):
                            continue
                        else:
                            if(i != 0):
                                attribute = "".join(words[index-i:index+1])
                                if attribute in '恐怖袭击事件':
                                    return
                                return attribute
                            else:
                                return
    
    
    def organization_define(self,text,keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)#词性标注
        if keywords in text:
            for index in range(len(words)):
                if(words[index]==keywords):               
                    for i in range(index):
                        if(postags[index-i-1][0]=='n'):
                            continue
                        else:
                            if(words[index-1]=='组织')&(postags[index-2][0]!='n'):      
                                continue
                            if(i != 0):
                                print "组织:","".join(words[index-i:index])
                                return "".join(words[index-i:index])
    
    
    
    def fact_attribute(self,in_file_name, out_file_name, begin_line, end_line):
        """
        事实三元组抽取的总控程序
        Args:
            in_file_name: 输入文件的名称
            #out_file_name: 输出文件的名称
            begin_line: 读文件的起始行
            end_line: 读文件的结束行
        """
        in_file = open(in_file_name, 'r')
        out_file = open(out_file_name, 'a')
        
        line_index = 1
        sentence_number = 0
        text_line = in_file.readline()
        while text_line:
            #小于起始段的直接跳过
            if line_index < begin_line:
                text_line = in_file.readline()
                line_index += 1
                continue
            if end_line != 0 and line_index > end_line:
                break
            sentence = text_line.strip()
            #长段(大于1000)直接跳过
            if sentence == "" or len(sentence) > 1000:
                text_line = in_file.readline()
                line_index += 1
                continue
            sentence_one = sentence.split(" ")#"。"
            
            for num in range(len(sentence_one)-1):
                self.attribute_define0(sentence_one[num],'事件')
                self.attribute_define2(sentence_one[num],'袭击')
            sentence_number += 1
            if sentence_number % 50 == 0:
                print "%d done" % (sentence_number)
            text_line = in_file.readline()
            line_index += 1
        in_file.close()
        out_file.close()
        '''
    
    def fact_attribute_from_text(text):
        """
        事实三元组抽取的总控程序
        Args:
            in_file_name: 输入文件的名称
            #out_file_name: 输出文件的名称
            begin_line: 读文件的起始行
            end_line: 读文件的结束行
        """
        text = text.replace(',','。')
        sentence_one = text.split("。")
        
        fact_attribute = []
        for num in range(len(sentence_one)-1):
            if('袭击' in sentence_one[num]):
                #attribute_define0(sentence_one[num],'事件')
                    #print sentence_one[num]
                sentence_temp = multiple_replace(sentence_one[num])
                if('发生' in sentence_temp)|('遭遇' in sentence_temp):
                    print '---------------',sentence_temp
                    temp_atrribut1 = self.attribute_define2(sentence_temp,'事件')
                    fact_attribute.append(str(temp_atrribut1))
                    if((temp_atrribut1)==None):
                        temp_atrribut2 = self.attribute_define2(sentence_temp,'袭击')
                        fact_attribute.append(str(temp_atrribut2))
        print '------------------'
        if(len(fact_attribute)==0):
            print '事件属性:unkown!'
            return 'unknown'
        else:
            print '事件属性1:', len(fact_attribute),fact_attribute
            print '事件属性:',max(fact_attribute, key=len)
            return max(fact_attribute, key=len)
            '''
    '''
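The class above leans heavily on the dependency parser: in the LTP convention, arcs[i].head is the 1-based index of word i's head (0 means the word hangs off the virtual root) and arcs[i].relation is the label (SBV, VOB, ATT, ...). A minimal sketch of reading those arcs, assuming the pyltp 0.2.x API and a placeholder model directory:

import os
from pyltp import Segmentor, Postagger, Parser

MODELDIR = "/path/to/ltp_data"  # placeholder

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
try:
    words = segmentor.segment("北京是中国首都")
    postags = postagger.postag(words)
    arcs = parser.parse(words, postags)
    for i, arc in enumerate(arcs):
        # arc.head is 1-based; 0 marks the virtual root.
        head = words[arc.head - 1] if arc.head > 0 else "ROOT"
        print("%s --%s--> %s" % (words[i], arc.relation, head))
finally:
    parser.release()
    postagger.release()
    segmentor.release()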
Example #9
class NERTagger(object):

    def __init__(self, model_dir_path, com_blacklist):
        # 初始化相关模型文件路径
        self.model_dir_path = model_dir_path
        self.cws_model_path = os.path.join(self.model_dir_path, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
        self.pos_model_path = os.path.join(self.model_dir_path, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
        self.ner_model_path = os.path.join(self.model_dir_path, 'ner.model')  # 命名实体识别模型路径,模型名称为`ner.model`

        # 初始化分词模型
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)

        # 初始化词性标注模型
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)

        # 初始化NER模型
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)

        # 初始化公司名黑名单
        self.com_blacklist = set()
        with open(com_blacklist,'r',encoding='UTF-8') as f_com_blacklist:
            for line in f_com_blacklist:
                if len(line.strip()) > 0:
                    self.com_blacklist.add(line.strip())


    def ner(self, text, entity_dict):
        words = self.segmentor.segment(text)  # 分词
        post_tags = self.postagger.postag(words)
        ner_tags = self.recognizer.recognize(words, post_tags)  # 命名实体识别
        # print('\t'.join(words))
        # print('\t'.join(post_tags))
        # print('\t'.join(ner_tags))
        # print('-' * 80)
        entity_list = []
        entity = ""
        for word, post_tag, ner_tag in zip(words, post_tags, ner_tags):
            tag = ner_tag[0]
            entity_type = ner_tag[2:]
            if tag == 'S' :
                entity_list.append((word, entity_type))
            elif tag in 'BIE':
                entity += word
                if tag == 'E':
                    #判断公司名黑名单
                    if entity in self.com_blacklist:
                        entity_list.append((entity, "n"))
                    else:
                        entity_list.append((entity, entity_type))
                    entity = ""
            elif tag == 'O':
                if post_tag == 'nt':
                    entity += word
                else:
                    if entity != "":
                        entity_list.append((entity, 'nt'))
                        entity = ""
                    # 排除错误数字识别,例如“大宗”
                    if post_tag == 'm' and not re.match("[0-9]+.*",word):
                        post_tag = 'n'
                    # 识别数字中的百分数
                    if post_tag == 'm' and re.match("[0-9.]+%",word):
                        post_tag = 'mp'
                    entity_list.append((word, post_tag))
        entity_list = self._ner_tag_by_dict(entity_dict, entity_list)
        return NERTaggedText(text, entity_list)

    def _ner_tag_by_dict(self, entity_dict, entity_list):
#        for item in entity_dict.items():
#            print("\t".join(item))
        i = 0
        while i < len(entity_list) - 1:
            has_entity = False
            for entity_len in range(4,1,-1):
                segment = "".join([ x[0] for x in entity_list[i:i+entity_len]])
                # segment_uni = segment.decode('utf-8')
                segment_uni = segment
                if segment_uni in entity_dict:
                    has_entity = True
                    entity_list[i] = (segment,entity_dict[segment_uni])
                    del entity_list[i+1:i+entity_len]
                    i = i + entity_len
                    break
            if not has_entity:
                i += 1
        return entity_list


    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
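NERTagger.ner() folds B/I/E-tagged tokens into whole entities; LTP's NER tags follow a BIES scheme with a type suffix (for example B-Ni ... E-Ni for a multi-word organization). A stripped-down sketch of just that merging step, over tags that have already been computed:

def merge_entities(words, ner_tags):
    # Collapse B-/I-/E- tag runs into single (entity, type) pairs, keep
    # S-tagged single-word entities, and ignore O-tagged words.
    entities, buf = [], ""
    for word, tag in zip(words, ner_tags):
        if tag.startswith("S-"):
            entities.append((word, tag[2:]))
        elif tag.startswith("B-") or tag.startswith("I-"):
            buf += word
        elif tag.startswith("E-"):
            buf += word
            entities.append((buf, tag[2:]))
            buf = ""
    return entities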
Example #10
class DSFN:
    """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析
    Attributes:
        default_user_dict_dir:str,用户自定义词典目录
        default_model_dir:str,ltp模型文件目录
    """

    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity(
        "../../entity_verb//entity_verb_result\\all_entity.json")
    default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP模型文件目录

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # 加载ltp模型
        #
        default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP模型文件目录
        self.segmentor = Segmentor()
        user_dict = "..\\source\\user.txt"
        segmentor_flag = self.segmentor.load_with_lexicon(
            os.path.join(default_model_dir, 'cws.model'), user_dict)
        # segmentor_flag = self.segmentor.load(os.path.join(default_model_dir, 'cws.model'))
        # 词性标注模型
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        # 命名实体识别模型
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        # 依存句法分析模型
        self.parser = Parser()
        parser_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))

        if segmentor_flag or postag_flag or ner_flag or parser_flag:  # 可能有错误
            print('load model failed')

    def segment(self, sentence, entity_postag=dict()):
        words = self.segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : List,分词后的结果
        entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生
        Returns
        -------
        words:WordUnit List,包括分词与词性标注的结果
        """
        words = []
        # 词性标注
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # 存储分词与词性标记后的词单元WordUnit,编号从1开始
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        # self.postagger.release() #释放
        return words

    def get_postag(self, word):
        """获得单个词的词性标注
        Args:
            word:str,单词
        Returns:
            pos_tag:str,该单词的词性标注
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """
        命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并
        Parameters
            words : WordUnit list,包括分词与词性标注结果
        Returns
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果
        """
        lemmas = []  # 存储分词后的结果
        postags = []  # 存储词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 命名实体识别
        netags = self.recognizer.recognize(lemmas, postags)
        print(netags)
        for netag in netags:
            print(netag)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """
        对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选)
        Args:
            words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果
        Returns
            *:sentenceUnit 句子单元
        """
        lemmas = []  # 分词结果
        postags = []  # 词性标注结果
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        # 依存句法分析
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """
        关闭与释放
        """
        # pynlpir.close()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def dsfn1_2_3_4COO(self, sentence, item1, item2):
        allTripes = []
        """
        判断两个实体是否属于DSFN1的情况,并输出三元组
        """
        if (item1.dependency == "ATT"):
            AttWord = item1.head_word
            AttWordDict = dict()
            AttWordStr = ""
            while AttWord.ID < item2.ID:
                AttWordDict[AttWord.ID] = AttWord.lemma
                # AttWordStr += AttWord.lemma
                if (AttWord.dependency == "ATT"):
                    AttWord = AttWord.head_word
                else:
                    break

            if (AttWord.ID == item2.ID):
                flag = True
                while flag:
                    len1 = len(AttWordDict)
                    AttList = AttWordDict.keys()
                    for id in range(item1.ID + 1, item2.ID):
                        item = sentence.get_word_by_id(id)
                        if item.head_word != None and item.head_word.ID in AttList and (
                                item.dependency == "ATT"):
                            AttWordDict[item.ID] = item.lemma
                    if len1 == len(AttWordDict):
                        flag = False
                    else:
                        flag = True
                AttWordDict = sorted(AttWordDict.items(),
                                     key=lambda item: item[0])
                AttWordStr = ""
                for i in AttWordDict:
                    AttWordStr += i[1]
                print("三元组:(" + item1.lemma + "," + AttWordStr + "," +
                      item2.lemma + ")")
                allTripes.append([item1.lemma, AttWordStr, item2.lemma])
        """
        判断两个实体是否属于DSFN1的情况,并输出三元组
        """
        """
        考虑DSFN2的情况
        """
        if item1.dependency == "SBV" and item1.head_word.postag == "v":
            pred1 = item1.head_word
            predDict = dict()
            predDict[pred1.ID] = pred1.lemma

            if item2.dependency == "VOB" and item2.head_word.postag == "v":
                pred2 = item2.head_word
                predDict[pred2.ID] = pred2.lemma
                if (len(predDict) == 1):
                    PredWordStr = ""
                    for i in predDict:
                        PredWordStr += predDict[i]
                    print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr +
                          "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, PredWordStr, item2.lemma])
                    """
                    新加,为了考虑“习近平视察和访问上海”的情况
                    """
                if len(predDict) == 2:
                    num = self.get_entity_num_between(pred1, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if word.dependency == "VOB" and word.head_word.ID == pred1.ID:
                            flagVOB = False
                    print("pred1:" + pred1.lemma + ",pred2:" + pred2.lemma +
                          ",num:" + str(num))
                    if num == 0:
                        if flagVOB == True:
                            print("DSFN2三元组:(" + item1.lemma + "," +
                                  pred1.lemma + "," + item2.lemma + ")")
                            allTripes.append(
                                [item1.lemma, pred1.lemma, item2.lemma])
                        if flagSBV == True:
                            print("DSFN2三元组:(" + item1.lemma + "," +
                                  pred2.lemma + "," + item2.lemma + ")")
                            allTripes.append(
                                [item1.lemma, pred2.lemma, item2.lemma])
        """
        DSFN3.0
        """
        pred = None
        prep = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
        elif item1.dependency == "FOB" and item2.dependency == "POB":  # 考虑介词为“被”的情况,如 “小王被小明所陷害”
            pred = item1.head_word
            prep = item2.head_word
            c = item1
            item1 = item2
            item2 = c
        if pred != None and prep != None:
            if prep.dependency == "ADV":
                if prep.head_word.ID == pred.ID:
                    pred2 = None
                    object = None
                    objectForPred2 = None
                    for i in range(pred.ID + 1, len(sentence.words) + 1):
                        item = sentence.get_word_by_id(i)

                        if item.dependency == "VOB" and item.head_word.ID == pred.ID:
                            object = item
                            objectDict = dict()
                            objectDict[object.ID] = object
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == object.ID:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(),
                                                key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            print("DSFN3三元组:(" + item1.lemma + "," +
                                  pred.lemma + "" + objectStr + "," +
                                  item2.lemma + ")")
                            allTripes.append([
                                item1.lemma, pred.lemma + "" + objectStr,
                                item2.lemma
                            ])
                    if object == None:
                        print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma +
                              "," + item2.lemma + ")")
                        allTripes.append(
                            [item1.lemma, pred.lemma, item2.lemma])
        """
        DSFN4
        """
        pred = None
        prep = None
        prep1 = None
        pred2 = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
            if prep.dependency == "CMP" and prep.head_word.postag == "v":
                pred2 = prep.head_word
                if pred2.ID == pred.ID:
                    print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" +
                          prep.lemma + "," + item2.lemma + ")")
                    allTripes.append([
                        item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma
                    ])
                else:
                    num = self.get_entity_num_between(pred, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if word.dependency == "VOB" and word.head_word.ID == pred.ID:
                            flagVOB = False
                    # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num))
                    if num == 0:
                        flag = True
                        for word in sentence.words:
                            if word.dependency == "CMP" and word.head_word.ID == pred.ID:
                                prep1 = word
                        if prep1 != None:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")")
                                allTripes.append([
                                    item1.lemma, pred.lemma + "" + prep1.lemma,
                                    item2.lemma
                                ])
                            # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                            if flagSBV == True:
                                allTripes.append([
                                    item1.lemma, pred2.lemma + "" + prep.lemma,
                                    item2.lemma
                                ])
                        else:
                            if flagVOB == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                                allTripes.append(
                                    [item1.lemma, pred.lemma, item2.lemma])
                            if flagSBV == True:
                                # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")")
                                allTripes.append([
                                    item1.lemma, pred2.lemma + "" + prep.lemma,
                                    item2.lemma
                                ])
        """
        DSFN5
        """
        # self.dsfn5and6(rawSentence,sentence,item1,item2)
        return allTripes

    def get_entity_num_between(self, verb1, verb2, sentence):
        """
        获得两个动词之间的实体数量
        Parameters
        ----------
        entity1 : WordUnit,动词1
        entity2 : WordUnit,动词2
        Returns:
            num:int,两动词间的实体数量
        """
        if verb1.ID > verb2.ID:
            c = verb1
            verb1 = verb2
            verb2 = c
        num = 0
        i = verb1.ID
        while i < verb2.ID - 1:
            if self.is_entity(sentence.words[i]):
                num += 1
            i += 1
        return num

    def is_entity(self, entry):
        """判断词单元是否是实体
        Args:
            entry:WordUnit,词单元
        Returns:
            *:bool,实体(True),非实体(False)
        """
        #候选实体词性列表
        entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'i']
        print(entry.lemma + " : " + entry.postag)
        if entry.postag in entity_postags:
            return True
        else:
            return False

    def dsfnAttCOO(self, sentence, item1, item2):
        item1Att = item1
        item2Att = item2
        while item1Att.dependency == "ATT":
            item1Att = item1Att.head_word

        allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2)
        if allTripe == None or len(allTripe) == 0:
            while item2Att.dependency == "ATT":
                item2Att = item2Att.head_word
            allTripe = self.dsfn1_2_3_4COO(sentence, item1, item2Att)
        if allTripe == None or len(allTripe) == 0:
            allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2Att)
        for tripe in allTripe:
            if tripe[0] == item1Att.lemma:
                tripe[0] = item1.lemma
            if tripe[2] == item2Att.lemma:
                tripe[2] = item2.lemma
        return allTripe

    def dsfn5COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2)
            # print(allTripes1)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1
            # print("allTripes1"+str(allTripes1))

    def dsfn6COO(self, sentence, item1, item2):
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2

    def dsfn5and6COO(self, sentence, item1, item2):
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO)
                for tripe in allTripe:
                    if tripe[0] == item1COO.lemma and tripe[
                            2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[
                            0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe

    def dsfnStartCOO3(self, rawSentence, entity1, entity2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        lemmas = dsfn.segment(rawSentence)
        words = dsfn.postag(lemmas)
        words_netag = dsfn.netag(words)
        sentence = dsfn.parse(words_netag)
        print(sentence.to_string())
        for item in sentence.words:
            if (item.lemma == entity1):
                item1 = item
            if (item.lemma == entity2):
                item2 = item
        if item1.ID > item2.ID:
            c = item1
            item1 = item2
            item2 = c
        itemCopy1 = item1
        itemCopy2 = item2
        allTripes = self.dsfnStartCOO2(sentence, item1, item2)
        if allTripes != None and len(allTripes) == 0:
            if item1.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'
                                ] and item1.dependency == "ATT":
                item1 = item1.head_word
                while item1.dependency == "ATT":
                    item1 = item1.head_word
                if 'n' in item1.postag and item1.postag not in [
                        'nh', 'ns', 'nz', 'ni'
                ]:
                    pass
                else:
                    item1 = itemCopy1

            if item2.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'
                                ] and item2.dependency == "ATT":
                item2 = item2.head_word
                while item2.dependency == "ATT":
                    item2 = item2.head_word
                if ('n' in item2.postag
                        or 'q' in item2.postag) and item2.postag not in [
                            'nh', 'ns', 'nz', 'ni'
                        ]:
                    pass
                else:
                    item2 = itemCopy2
            allTripes = self.dsfnStartCOO2(sentence, item1, item2)
            print("注意")
            print(allTripes)
            if len(allTripes) != 0:
                for tripe in allTripes:
                    if tripe[0] == item1.lemma:
                        tripe[0] = itemCopy1.lemma
                    elif tripe[2] == item1.lemma:
                        tripe[2] = itemCopy1.lemma

                    if tripe[0] == item2.lemma:
                        tripe[0] = itemCopy2.lemma
                    elif tripe[2] == item2.lemma:
                        tripe[2] = itemCopy2.lemma
                    print("12345")
                    resultList.append(tripe)
                print("最终结果")
                print(np.array(set([tuple(t) for t in resultList])))
        else:
            print("最终结果")
            print(np.array(set([tuple(t) for t in allTripes])))

    def dsfnStartCOO2(self, sentence, item1, item2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        """
        来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV]
        """
        print(item1.lemma)
        print(item2.lemma)
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
        if len(allTripes) == 0:
            print("11111111")
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes == None or len(allTripes) == 0:
                print("2222222")
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes == None or len(allTripes) == 0:
                    print("3333333")
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
                    # if allTripes == None or len(allTripes) == 0:
                    #     print("44444444444")
                    #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
        # print("第一次"+str(allTripes))
        if allTripes != None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        print("第二次")
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                pred1 = item.head_word

                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo is None or (
                                subForCoo is not None and subForCoo.ID
                                == word.ID):  # handle coordinated (COO) verbs: only when the coordinated verb has no subject of its own
                            # e.g. 习近平主席视察厦门,李克强总理访问香港
                            word.head_word = item

                            print(sentence.to_string())
                            allTripes = self.dsfn1_2_3_4COO(
                                sentence, item1, item2)
                            if len(allTripes) == 0:
                                # print("11111111")
                                allTripes = self.dsfn5COO(
                                    sentence, item1, item2)
                                if allTripes == None or len(allTripes) == 0:
                                    # print("2222222")
                                    allTripes = self.dsfn6COO(
                                        sentence, item1, item2)
                                    if allTripes == None or len(
                                            allTripes) == 0:
                                        print("3333333")
                                        allTripes = self.dsfn5and6COO(
                                            sentence, item1, item2)
                                        # if allTripes == None or len(allTripes) == 0:
                                        #     allTripes = self.dsfnAttCOO(sentence,item1,item2)
                            # print("第二次"+str(allTripes))
                            if allTripes != None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    # if tripe[0] == item1.lemma:
                                    #     tripe[0] = itemCopy1.lemma
                                    # elif tripe[2] == item1.lemma:
                                    #     tripe[2] = itemCopy1.lemma
                                    #
                                    # if tripe[0] == item2.lemma:
                                    #     tripe[0] = itemCopy2.lemma
                                    # elif tripe[2] == item2.lemma:
                                    #     tripe[2] = itemCopy2.lemma
                                    resultList.append(tripe)
        print(np.array(set([tuple(t) for t in resultList])))
        return resultList
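
# A minimal usage sketch for the triple-extraction methods above, assuming `dsfn` is the
# module-level instance of this class (the methods themselves call dsfn.segment / dsfn.parse
# for the LTP pipeline). The sentence and entity strings are illustrative only; the call
# prints the extracted (entity1, predicate, entity2) triples.
# dsfn.dsfnStartCOO3('李克强总理今天访问了北京大学。', '李克强', '北京大学')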
Example #11
0
from pyltp import Segmentor
import os
import re
from tqdm import tqdm

INPUT_PATH = '/home/brooksj/PycharmProjects/NLP12345/input'
LTPMODEL_PATH = os.path.join(INPUT_PATH, 'ltp_data_v3.4.0')
cws_model_path = os.path.join(LTPMODEL_PATH, 'cws.model')

seg = Segmentor()
seg.load_with_lexicon(cws_model_path, os.path.join(INPUT_PATH,
                                                   'lexicon_ex.txt'))

with open('./wiki.zh.txt.jian', 'r') as rf, open('./wiki.zh.segs.txt',
                                                 'w') as wf:
    wiki = tqdm(iter(rf.readlines()), desc=u'已分词0篇文章')
    i = 0
    for line in wiki:
        # str.split() does not take a regex; use re.split to break on whitespace
        for sent in re.split(r'\s+', line.strip()):
            if not sent:
                continue
            words = list(seg.segment(sent))
            wf.write(' '.join(words) + ' ')
        wf.write('\n')
        i += 1
        if i % 100 == 0:
            wiki.set_description(u'已分词%d篇文章' % i)
Example #12
0
import os
from pyltp import Segmentor
LTP_DATA_DIR = r'D:\python\ltp_data_v3.4.0'
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
segmentor = Segmentor()
segmentor.load(cws_model_path)
words=segmentor.segment('2019年,我国船舶工业以供给侧结构性改革为主线,不断推动行业向高质量发展转变。')
print(type(words))
print('\t'.join(words))
segmentor.release()


pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model path; the model file is `pos.model`

from pyltp import Postagger
postagger = Postagger()  # initialize the tagger
postagger.load(pos_model_path)  # load the model

words = ['元芳', '你', '怎么', '看']  # segmentation result
postags = postagger.postag(words)  # POS tagging

print('\t'.join(postags))
postagger.release()  # release the model
Example #13
0
class ZHProcessor:
    """
        This class is for processing xml for non-English languages (currently Chinese).

            dataset_unproc => dataset_whole.
    """
    def __init__(self):
        self.dp_dataset_unproc = path_parser.dataset_unproc
        self.dp_dataset_whole = path_parser.dataset_whole

        self.fp_top_unproc = path_parser.dataset_top_unproc
        self.fp_des_unproc = path_parser.dataset_des_unproc
        self.fp_top = path_parser.dataset_top
        self.fp_des = path_parser.dataset_des

        self.segmentor = Segmentor()
        self.segmentor.load(path_parser.cws)
        self.SIDE_PATTERN = '(?<=#s-{0}\n)[\s\S]*?(?=\n#e-{0})'
        # logger.info('CWS model fp: {0}'.format(path_parser.cws))

    @deprecated
    def stanford_stuff(self):
        # from stanfordcorenlp import StanfordCoreNLP
        # from nltk.parse.corenlp import CoreNLPTokenizer
        # import corenlp
        # self.nlp = StanfordCoreNLP('http://localhost',
        #                            port=9000,
        #                            timeout=30000)
        # self.nlp = StanfordCoreNLP(os.environ["CORENLP_HOME"], lang='zh', memory='4g')

        # self.props = {
        #     'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
        #     'pipelineLanguage': 'zh',
        #     'outputFormat': 'json'
        # }
        # seg = StanfordSegmenter()
        # seg.default_config('zh')
        # sent = u'这是斯坦福中文分词器测试'
        # print(seg.segment(sent))
        # with self.nlp as nlp:
        #     for sent in sents:
        #         print(sent)
        #         print(nlp.word_tokenize(sent))

        # with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        #     ann = client.annotate(text)
        #
        # sentence = ann.sentence[0]
        # assert corenlp.to_text(sentence) == text
        # print(sentence.text)
        # token = sentence.token[0]
        # print(token.lemma)
        pass

    @deprecated
    def sent2words(self, segmentor, sent):
        words = segmentor.segment(sent)
        logger.info('|'.join(words))

    def para2sents(self, paragraph):
        for sent in re.findall('[^!?。\.\!\?]+[!?。\.\!\?]?',
                               paragraph,
                               flags=re.U):
            yield sent

    def proc_content(self,
                     content,
                     is_headline,
                     use_sent_seg=True,
                     convert2simple=False):
        if use_sent_seg:
            content = self.para2sents(content)

        proc_lines = list()
        for sent in content:
            proc_lines.append('#s-sent')
            if convert2simple:
                sent = Converter('zh-hans').convert(sent)
            words = self.segmentor.segment(sent)
            # logger.info('words: {0}'.format(words))
            # logger.info('|'.join(words))
            proc_lines.append('\n'.join(words))
            proc_lines.append('#e-sent')

        if is_headline:
            proc_lines.insert(0, '#s-headline')
            proc_lines.append('#e-headline')
        else:
            proc_lines.insert(0, '#s-para')
            proc_lines.append('#e-para')

        return proc_lines

    def release_seg(self):
        self.segmentor.release()

    def get_xml_elements(self, xml_fp):
        def _get_para_info():
            para_matches = list(re.finditer(re.compile(para_pattern), text))

            if not para_matches:
                logger.error('No para in {0}'.format(xml_fp))
                raise AssertionError

            # logger.info('para_matches {0}'.format(para_matches))

            paras = list()
            para_spans = list()
            for para_m in para_matches:
                # if para_m.group() != '\n':
                paras.append(para_m.group())
                para_spans.append(para_m.span())

            # logger.info('paras: {0}'.format(paras))
            # logger.info('para_spans: {0}'.format(para_spans))

            para_info = list(zip(paras, para_spans))
            # logger.info('para_info {0}'.format(para_info))

            return para_info

        def _get_ind_headline_info():
            ind_headline_info = list()

            headline_matches = list(
                re.finditer(re.compile(headline_pattern), text))
            if headline_matches:
                headlines = list()
                headline_spans = list()
                for headline_m in headline_matches:
                    # if headline_m.group() != '\n':
                    headlines.append(headline_m.group())
                    headline_spans.append(headline_m.span())

                headline_info = list(zip(headlines, headline_spans))
                # logger.info('headline_info {0}'.format(headline_info))

                for h_info in headline_info:
                    h_start, h_end = h_info[1]
                    in_para = False
                    for p_info in para_info:
                        p_start, p_end = p_info[1]
                        if p_start <= h_start and h_end <= p_end:
                            in_para = True
                            # logger.info('headline in para ...')
                    if not in_para:
                        ind_headline_info.append(h_info)

                # logger.info('ind_headline_info {0}'.format(ind_headline_info))

            return ind_headline_info

        def _sort_paras_and_headlines():
            sorted_items = deepcopy(list(para_info))
            if ind_headline_info:
                for ind_h_info in ind_headline_info:
                    ind_h_start = ind_h_info[1][0]
                    p_span_starts = [p_info[1][0] for p_info in para_info]
                    # logger.info('p_span_starts: {0}'.format(p_span_starts))
                    insert_idx = None
                    for idx, p_span_start in enumerate(p_span_starts):
                        if ind_h_start < p_span_start:
                            insert_idx = idx
                            break
                    item_dict = {'content': ind_h_info[0], 'is_headline': True}

                    sorted_items.insert(insert_idx, item_dict)

            # deal with all paras left
            for idx, item in enumerate(sorted_items):
                if type(item) != dict:
                    item_dict = {'content': item[0], 'is_headline': False}
                    sorted_items[idx] = item_dict
            return sorted_items

        def _handle_nested_paras():
            for idx, item in enumerate(sorted_items):
                if item['is_headline']:
                    continue

                headline_matches = list(
                    re.finditer(re.compile(headline_pattern), item['content']))
                # headline_match = re.search(re.compile(headline_pattern), item['content'])

                if not headline_matches:
                    continue

                new_items = list()
                for headline_m in headline_matches:
                    inner_headline_item = {
                        'content': headline_m.group(),
                        'is_headline': True,
                    }

                    new_items.insert(0, inner_headline_item)

                rest_pattern = '(?<=\</h>\n)[\s\S]*'
                rest_match = re.search(re.compile(rest_pattern),
                                       item['content'])

                if rest_match:
                    # logger.error('No rest in para: {0} of {1}'.format(item['content'], xml_fp))
                    # raise AssertionError
                    rest_para_item = {
                        'content': rest_match.group(),
                        'is_headline': False,
                    }

                    new_items.insert(0, rest_para_item)

                del sorted_items[idx]

                for new_item in new_items:
                    sorted_items.insert(idx, new_item)

        root_pattern = '(?<=\<{0}>\n)[\s\S]*?(?=\n</{0}>)'
        para_pattern = root_pattern.format('p')
        headline_pattern = root_pattern.format('h')

        with io.open(xml_fp, encoding='utf-8', errors='ignore') as f:
            text = f.read()

        # logger.info('text: {0}'.format(text))

        para_info = _get_para_info()
        ind_headline_info = _get_ind_headline_info()
        sorted_items = _sort_paras_and_headlines()
        _handle_nested_paras()

        return sorted_items

    def dump_files(self, fp, text):
        with io.open(fp, mode='a', encoding='utf-8') as f:
            f.write(text)

    def proc_xml(self, xml_fp, out_fp):
        sorted_elements = self.get_xml_elements(xml_fp)
        proc_lines = list()
        # logger.info('sorted_elements: {0}'.format(sorted_elements))
        for element in sorted_elements:
            element_lines = self.proc_content(**element)
            # logger.info('element_lines: {0}'.format(element_lines))
            proc_lines.extend(element_lines)

        proc_lines.insert(0, '#s-doc')
        proc_lines.append('#e-doc')

        out_text = '\n'.join(proc_lines)

        self.dump_files(fp=out_fp, text=out_text)

    def proc_all_docs(self):
        xml_root = self.dp_dataset_unproc
        fns = [fn for fn in listdir(xml_root) if isfile(join(xml_root, fn))]

        for fn in tqdm(fns):
            xml_fp = join(xml_root, fn)
            out_fp = join(self.dp_dataset_whole, fn)
            self.proc_xml(xml_fp=xml_fp, out_fp=out_fp)

    def proc_side_top(self):
        proc_lines = list()
        with io.open(self.fp_top_unproc, encoding='utf-8') as f:
            for dom in doms_final:
                pattern = re.compile(self.SIDE_PATTERN.format(dom))
                topics = re.findall(pattern, f.read())[0].split('\n')
                logger.info('topics: {0}'.format(topics))
                top_proc_lines = self.proc_content(topics,
                                                   is_headline=False,
                                                   use_sent_seg=False)

                top_proc_lines.insert(0, '#s-{0}'.format(dom))
                top_proc_lines.append('#e-{0}'.format(dom))

                proc_lines.extend(top_proc_lines)
                f.seek(0, 0)

        with io.open(self.fp_top, mode='a', encoding='utf-8') as f:
            f.write('\n'.join(proc_lines))

    def proc_side_des(self):
        proc_lines = list()
        with io.open(self.fp_des_unproc, encoding='utf-8') as f:
            for dom in doms_final:
                pattern = re.compile(self.SIDE_PATTERN.format(dom))
                des_sents = re.findall(pattern, f.read())[0].split('\n')
                des_proc_lines = self.proc_content(des_sents,
                                                   is_headline=False,
                                                   use_sent_seg=False)

                des_proc_lines.insert(0, '#s-{0}'.format(dom))
                des_proc_lines.append('#e-{0}'.format(dom))

                proc_lines.extend(des_proc_lines)
                f.seek(0, 0)

        with io.open(self.fp_des, mode='a', encoding='utf-8') as f:
            f.write('\n'.join(proc_lines))
Example #14
0
def main(argv):
    parser = argparse.ArgumentParser(description='...')
    parser.add_argument('-d','--domain',default='AISpeech',action='store',help='which domain: AISpeech or SpeechLab')
    parser.add_argument('-w','--weight',default=-1,action='store',metavar='number',type=float,help='weight number')
    parser.add_argument('--test',action='store_true')
    args = parser.parse_args()
    lex_file = open(os.path.join(PATH_TO_DATA[args.domain],'rules.txt'), 'r')
    weight = args.weight
    if not args.test:
        out_lex_file = open(os.path.join(PATH_TO_DATA[args.domain],'rules.release.txt'), 'w')
    else:
        out_lex_file = open(os.path.join(PATH_TO_DATA[args.domain],'rules.test.release.txt'), 'w')
    cws_model_path = PATH_TO_SPLIT_WORDS  # segmentation model path; the model file is `cws.model`
    dict_path = os.path.join(PATH_TO_DATA[args.domain], 'dict.txt')  # domain-specific lexicon used to assist segmentation
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load_with_lexicon(cws_model_path, dict_path)  # load the model with the lexicon

    if concept_fst_dict!={}:
        concept_fst_dict.clear()
    if constraints_names!={}:
        constraints_names.clear()

    macro_patterns = {}
    all_patterns = []
    for line in lex_file:
        line=line.strip()
        if line == '' or line.startswith('%'):
            continue
        if '=>' not in line:
            # rule macro
            pat_name, pat = line.strip(';').split('=')
            macro_patterns['${'+pat_name+'}'] = extract_simple_rules(pat.strip(), macro_patterns)
        else:
            # ordinary rule
            pattern, node_info = line.split('=>')
            chunk_list = extract_simple_rules(pattern.strip(), macro_patterns)
            all_patterns.append((chunk_list, node_info))

    isyms = ["<eps>"]
    label_voc = {}
    osyms = ["<eps>", "<unk>"]
    word_voc = {} #["<unk>"] #<unk> should be defined manually
    for chunk_list,_ in all_patterns:
        for word in chunk_list:
            if word[0] not in ['(', ')', '|']:
                word = word.strip('?')
                word_voc[word] = 1

    osyms = osyms + list(word_voc)
    osyms_table = fst.SymbolTable()
    for idx,val in enumerate(osyms):
        osyms_table[val] = idx
    
    isyms_table = fst.SymbolTable()
    for idx,val in enumerate(isyms):
        isyms_table[val] = idx

    for pattern_idx, (pattern_chunk_list, node_info) in enumerate(all_patterns):
        # unique_rules = set()
        replace_mapping_dict = {}
        concept_fst = fst.StdTransducer(isyms=isyms_table, osyms=osyms_table)
        segment_stack = [{'start_of_this_segment':0, 'end_of_this_segment':0}]
        segment_stack[0]['value'] = '<eps>'
        cursor_head, cursor_tail = 0, 1
        argument_count = 0
        # print('Processing rule',pattern_chunk_list,'=>',node_info)
        for word in pattern_chunk_list:
            if word == '(':
                argument_count += 1
                segment_stack.append({'start_of_this_segment':cursor_tail, 'end_of_this_segment':0, 'value':segment_stack[-1]['value']})
                segment_stack[-1]['head_arc'] = [cursor_head, cursor_tail]
                cursor_tail += 1
                cursor_head = cursor_tail - 1
            elif word[0] == ')':
                if segment_stack[-1]['end_of_this_segment'] == 0:
                    segment_stack[-1]['end_of_this_segment'] = cursor_head
                else:
                    concept_fst.add_arc(cursor_head, segment_stack[-1]['end_of_this_segment'], '<eps>', '<eps>')
                    cursor_head = segment_stack[-1]['end_of_this_segment']
                if word == ')?':
                    concept_fst.add_arc(segment_stack[-1]['head_arc'][0], segment_stack[-1]['head_arc'][1], '<eps>', '<eps>')
                    concept_fst.add_arc(segment_stack[-1]['start_of_this_segment'], segment_stack[-1]['end_of_this_segment'], '<eps>', '<eps>')
                else:
                    concept_fst.add_arc(segment_stack[-1]['head_arc'][0], segment_stack[-1]['head_arc'][1], '<eps>', '<eps>')
                segment_stack.pop()
            elif word == '|':
                if segment_stack[-1]['end_of_this_segment'] == 0:
                    segment_stack[-1]['end_of_this_segment'] = cursor_head
                else:
                    concept_fst.add_arc(cursor_head, segment_stack[-1]['end_of_this_segment'], '<eps>', '<eps>')
                cursor_head = segment_stack[-1]['start_of_this_segment']
            else:
                if word[-1] == '?':
                    concept_fst.add_arc(cursor_head, cursor_tail, '<eps>', '<eps>')
                    word = word[:-1]
                else:
                    pass
                next_state = add_arc(concept_fst, cursor_head, cursor_tail, word, segment_stack[-1]['value'])
                cursor_head = cursor_tail
                cursor_tail = next_state
        if segment_stack[-1]['end_of_this_segment'] == 0:
            segment_stack[-1]['end_of_this_segment'] = cursor_head
        else:
            concept_fst.add_arc(cursor_head, segment_stack[-1]['end_of_this_segment'], '<eps>', '<eps>')
        final_state_idx = segment_stack[-1]['end_of_this_segment']
        concept_fst[final_state_idx].final = True
        
        concept_fst = concept_fst.inverse()
        concept_fst = concept_fst.determinize()
        concept_fst.minimize()
        concept_fst = concept_fst.inverse()
        
        t = concept_fst
        paths=list(t.paths())
        random.shuffle(paths)
        if not args.test:
            if extract_proper_num(len(paths))>len(paths):
                paths=paths*(extract_proper_num(len(paths))//len(paths))+paths[:extract_proper_num(len(paths))%len(paths)]
            else:
                paths=paths[:extract_proper_num(len(paths))]
        else:
            paths=paths[:2] if len(paths)>=2 else paths
        for output in paths:
            raw_path = []
            for arc in output:
                raw_path.append((t.osyms.find(arc.olabel), t.isyms.find(arc.ilabel)))
            path = raw_path
            input_seq = []
            output_seq = []
            for word, label in path:
                if word not in ['<eps>', u"ε"]:
                    input_seq.append(word)
                if label not in ['<eps>', u"ε"]:
                    if label == '_' and word not in ['<eps>', u"ε"]:
                        output_seq.append(word)
                    elif label != '_':
                        output_seq.append(label)
            
            pattern = input_seq
            sentence = [item if item[0] != '$' else ',' for item in pattern]
            tags = [item for item in pattern if item[0] == '$']
            sentence = ''.join(sentence)
            words = segmentor.segment(sentence)

            new_words = []
            tag_idx = 0
            for word in words:
                word = word
                if word == ',':
                    word = tags[tag_idx]
                    tag_idx += 1
                new_words.append(word)

            new_rule_simple = ' '.join(new_words)+' => '+node_info
            out_lex_file.write(new_rule_simple+'\n')
Example #15
0
par_model_path = os.path.join(LTP_DATA_DIR,
                              'parser.model')  # dependency parsing model path; the model file is `parser.model`
cws_model_path = os.path.join(LTP_DATA_DIR,
                              'cws.model')  # segmentation model path; the model file is `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR,
                              'pos.model')  # POS tagging model path; the model file is `pos.model`

segmentor = Segmentor()  # initialize the segmenter
segmentor.load(cws_model_path)  # load the model

labeller = SementicRoleLabeller()  # initialize the labeller
labeller.load(srl_model_path)  # load the model

postagger = Postagger()  # initialize the tagger
postagger.load(pos_model_path)  # load the model

words = segmentor.segment(
    '威尔士柯基犬是一种小型犬,它们的胆子很大,也相当机警,能高度警惕地守护家园,是最受欢迎的小型护卫犬之一。')  # segmentation
print('\t'.join(words))
postags = postagger.postag(words)  # POS tagging
print('\t'.join(postags))
parser = Parser()  # initialize the parser
parser.load(par_model_path)  # load the model
# words = ['元芳', '你', '怎么', '看']
# postags = ['nh', 'r', 'r', 'v']
arcs = parser.parse(words, postags)  # dependency parsing
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

# words = ['元芳', '你', '怎么', '看']
# postags = ['nh', 'r', 'r', 'v']
# arcs: result of the dependency parse
roles = labeller.label(words, postags, arcs)  # semantic role labelling
# print the result
Example #16
0
# The segmentation here is quite poor; the target words still have to be re-merged afterwards

news = open('./cleanNews.txt', 'r', encoding='utf-8').readlines()

segednews = open('./segedNews.txt', 'w', encoding='utf-8')

# Lexicon-assisted segmentation is critical here: 沙曼维亚济 must not be split into 沙曼 维亚济
import os
LTP_DIR = r'D:\Python-dev\ltp_data_v3.4.0'
cws_model_path = os.path.join(LTP_DIR, 'cws.model')  # segmentation model path

from pyltp import Segmentor
segmentor = Segmentor()  # initialize the segmenter
segmentor.load_with_lexicon(cws_model_path, './NE.txt')  # segment with the external lexicon
for idx, sent in enumerate(news):
    segSent = list(segmentor.segment(sent))  # ['国家主席', '江泽民', '访问', '了', '美国']
    news[idx] = segSent  # ['国家主席江泽民访问了美国']->[['国家主席', '江泽民', '访问', '了', '美国']]

for n in news:
    for word in n:
        segednews.write(word + ' ')
    segednews.write('\n')
segednews.close()

# Even with the external lexicon, the segmenter still splits some words that should stay together.
import re
news = open('./segedNews.txt', 'r', encoding='utf-8').readlines()
# e.g. 为 企业 改革 发展 建功立业 本报 北京 讯 中华 全国 总工会 今 发出 致 全国 各族 职工 慰问信 更加 紧密 地 团结 在 以 江泽民 同志 为 核心 的 党中央 周围
# Here 中华全国总工会 has been split apart, which ruins the entity.
# A regex that allows a space "none or once" between the parts can merge the pieces back together; see the sketch below.
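
# A minimal sketch of the regex rescue described above, assuming the segmented text is
# space-separated as in segedNews.txt. The name list and the remerge helper are
# illustrative, not part of the original code.
compound_names = ['中华全国总工会', '沙曼维亚济']

def remerge(seged_line, names=compound_names):
    for name in names:
        # build a pattern that tolerates zero or one space between every character,
        # e.g. 中华全国总工会 -> 中 ?华 ?全 ?国 ?总 ?工 ?会
        pattern = ' ?'.join(re.escape(ch) for ch in name)
        seged_line = re.sub(pattern, name, seged_line)
    return seged_line

# remerge('中华 全国 总工会 今 发出 慰问信') -> '中华全国总工会 今 发出 慰问信'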
relations = open('./relation_pos_neg.txt', 'r', encoding='utf-8').readlines()
from utils import inout
import index
from pyltp import Segmentor, Postagger

if __name__ == '__main__':

    segmentor = Segmentor()
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS), inout.getResourcePath('userDic.txt'))
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))

    infoList = inout.readListFromTxt('./dn_test.txt')

    for sentence in infoList:

        # segmentor.load(inout.getLTPPath(index.CWS))
        words = segmentor.segment(sentence)
        postags = postagger.postag(words)
        # result = zip(words,postags)
        # inout.printEscapeStr(result)


    segmentor.release()
    postagger.release()

    # recognizer = NamedEntityRecognizer()
    # recognizer.load(inout.getLTPPath(index.NER))
    # netags = recognizer.recognize(words, postags)
    # recognizer.release()

    # result = zip(words,postags)
    # inout.printEscapeStr(result)
import os
import chardet
from pyltp import Segmentor, Postagger, NamedEntityRecognizer

with open('../data/cont.txt', 'rb') as f:
    encoding = chardet.detect(f.readline())
print(encoding)

with open('../data/cont.txt','r',encoding='utf8') as f:
    content = f.read()
print(content)

for line in content.split('\n'):
    print(line)
    print('----')


seg = Segmentor()
seg.load(model_path)
words = seg.segment(content)
seg.release()


pos = Postagger()
pos.load(pos_path)
postag = pos.postag(words)
pos.release()
union = list(zip(list(words),list(postag)))
union_list = [x+' :'+y for x,y in union]


ner_path = os.path.abspath('./coach/ltp_data_v3.4.0/ner.model')
recognizer = NamedEntityRecognizer()
recognizer.load(ner_path)
# print(list(words))
Example #19
0
class LtpParser:
    def __init__(self):
        # path to the LTP models
        LTP_DATA_DIR = './ltp_data'

        # word segmentation model
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DATA_DIR, 'cws.model'), 'ltp_data/lexicon.txt')
        # self.segmentor.load(os.path.join(LTP_DATA_DIR,'cws.model'))

        # POS tagging model
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))

        # dependency parsing model
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DATA_DIR, 'parser.model'))

        # named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))

        # semantic role labelling model
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DATA_DIR, 'pisrl_win.model'))

    def format_label_role(self, words, postags):
        """
        语义角色标注
        :param self:
        :param words:
        :param postags:
        :return:
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}

        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def build_parse_child_dict(self, words, postags, arcs):
        """
        句法分析---为句子的每个词语维护一个保存语法依存儿子节点的字典
        :param words:
        :param postags:
        :param arcs:
        :return:
        """
        child_dict_list = []
        format_parse_list = []

        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-indexed
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # id of each word's dependency head
        relation = [arc.relation for arc in arcs]
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]

        for i in range(len(words)):
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''Main parsing entry point.'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_label_role(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
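
# A minimal usage sketch for the LtpParser class above. It assumes the ./ltp_data
# model files are present; the sentence is only an illustration.
if __name__ == '__main__':
    ltp = LtpParser()
    sent = '李克强总理今天访问了北京大学。'
    words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main(sent)
    for rel, word, idx, pos, head, head_idx, head_pos in format_parse_list:
        # each entry: dependency relation, word, index, POS, head word, head index, head POS
        print(rel, word, '<--', head)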
Example #20
0
#coding:utf-8
import sys, os
import json
import re
import pynlpir
pynlpir.open()
from pyltp import Segmentor, Postagger, Parser,NamedEntityRecognizer, SementicRoleLabeller
ROOTDIR =os.path.join(os.path.dirname(__file__),os.pardir)
sys.path.append(os.path.join(ROOTDIR, "lib"))
# set the model file path
MODELDIR=os.path.join(ROOTDIR, "ltp_data")

path = os.path.abspath(os.path.dirname(sys.argv[0]))
path_in = path+'/car_review_split.txt'
content_in  = open(path_in,'r')
path_out = path+'/test_word_list2.txt'
content_out = open(path_out,'w')
segmentor = Segmentor()
segmentor.load_with_lexicon(os.path.join(MODELDIR,"cws.model"),"/data0/dm/dict/dict.txt")
for line in content_in.readlines()[5000:10000]:
    print line
    line = re.sub("[\.\!\/_,$%^*(+\"\' ]+|[+——!,。?、~@#¥%……&*()]+".decode('utf-8'),"".decode('utf-8'),line.decode('utf-8'))
    line = line.encode('utf-8').strip()
    words = segmentor.segment(line)
    for j in words:
        content_out.write(j+' ')
content_out.close()
Example #21
0
class Extractor():
    def __init__(self):
        self.__clause_list = []
        self.__subclause_dict = {}
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []
        self.__netags_full_list = []

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list

    def split(self, words, postags):
        start = 0
        for j, w in enumerate(words):
            if w == ',' or w == ',' or w == '。':
                clause = Clause(start, j - 1)
                self.__clause_list.append(clause)
                start = j + 1

        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        PATH = ''
        self.__segmentor.load(PATH + 'cws.model')
        self.__postagger.load(PATH + 'pos.model')
        self.__recognizer.load(PATH + 'ner.model')
        self.__parser.load(PATH + 'parser.model')
        self.__labeller.load(PATH + 'pisrl.model')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []

    def resolve_conference(self, entity):
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref

    def resolve_all_conference(self):
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def chunk_str(self, data):
        sents = SentenceSplitter.split(data)
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                print(str(e))
                pass

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i for i, x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        relations = [
            i for i, x in enumerate(arcs)
            if x.head == root and x.relation == 'COO'
        ]
        relations.insert(0, root)

        prev_e1 = None
        e1 = None
        for rel in relations:

            left_arc = [
                i for i, x in enumerate(arcs)
                if x.head == rel and x.relation == 'SBV'
            ]

            if len(left_arc) > 1:
                pass
                # raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                e1 = prev_e1
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1,
                            [words[i] for i in range(leftmost, left_arc + 1)],
                            offset + leftmost)

            prev_e1 = e1

            right_arc = [
                i for i, x in enumerate(arcs)
                if x.head == rel and x.relation == 'VOB'
            ]

            e2_list = []
            if not right_arc:
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])

                items = [
                    i for i, x in enumerate(arcs)
                    if x.head == right_ext and x.relation == 'COO'
                ]
                items = right_arc + items

                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)

                    e2 = None

                    if count == 0:
                        e2 = Entity(
                            2,
                            [words[i] for i in range(leftmost, right_ext + 1)],
                            offset + leftmost)
                    else:
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(
                            2, [words[i] for i in itertools.chain(p1, p2)])

                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1
class LtpParser():
    def __init__(self):
        LTP_DIR = "./ltp_data"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

    '''Split long text into sentences.'''

    def seg_long_sents(self, content):
        return [
            sentence for sentence in re.split(
                r'[??!!。\n\r]',
                content.replace(' ', '').replace('\u3000', '').replace(
                    '——', '')) if sentence
        ]

    '''Basic LTP operations.'''

    def basic_parser(self, words):
        postags = list(self.postagger.postag(words))
        netags = self.recognizer.recognize(words, postags)
        return postags, netags

    '''Collect entity lists from the NER result.'''

    def format_entity(self, words, netags):
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []
        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        index = 0
        for item in zip(words, netags):
            word = item[0]
            ntag = item[1]
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word)
                    else:
                        place_entity_list.append(word)
                elif ntag[0] == "B":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                elif ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                else:
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""
            index += 1
        return place_entity_list

    '''Extract locations.'''

    def collect_locations(self, content):
        locations = []
        sents = self.seg_long_sents(content)
        for i in sents:
            words = list(self.segmentor.segment(i))
            postags, netags = self.basic_parser(words)
            locations += self.format_entity(words, netags)

        return locations
Example #23
0
class LTP:
    def __init__(
        self,
        ltp_data_path=None,
        seg_lexicon=None,
        pos_lexicon=None,
    ):
        if not ltp_data_path:
            raise ValueError('请指定ltp用到的模型所在路径!!!')

        self.ltp_data_path = ltp_data_path  # path to the LTP model directory
        self._cws_model_path = os.path.join(
            self.ltp_data_path, 'cws.model')  # segmentation model path; the model file is `cws.model`
        self._pos_model_path = os.path.join(
            self.ltp_data_path, 'pos.model')  # POS tagging model path; the model file is `pos.model`
        self._ner_model_path = os.path.join(
            self.ltp_data_path, 'ner.model')  # NER model path; the model file is `ner.model`

        self._segmentor = Segmentor()  # initialize the segmenter
        if seg_lexicon:
            self._segmentor.load_with_lexicon(
                self._cws_model_path, seg_lexicon)  # load the model; the second argument is the external lexicon path
        else:
            self._segmentor.load(self._cws_model_path)

        self._postagger = Postagger()  # initialize the tagger
        if pos_lexicon:
            self._postagger.load_with_lexicon(
                self._pos_model_path, pos_lexicon)  # load the model; the second argument is the external lexicon path
        else:
            self._postagger.load(self._pos_model_path)

        self._recognizer = NamedEntityRecognizer()  # initialize the recognizer
        self._recognizer.load(self._ner_model_path)  # load the model

    def cut(self, text):
        return self._segmentor.segment(text)

    def pos(self, text):
        words = self.cut(text)
        postags = self._postagger.postag(words)

        return zip(words, postags)

    def ner(self, text):
        """
        命名实体识别,提供三种命名识别,PER人名、LOC地名、ORG机构名
        :param text:
        :return:
        """
        # Nh代表人名, Ni代表机构名,Ns代表地点名字
        ner_dict = {'Nh': [], 'Ni': [], 'Ns': []}
        words = self.cut(text)
        postags = self._postagger.postag(words)
        nertags = self._recognizer.recognize(words, postags)

        ner_tmp = []
        for i, tag in enumerate(nertags):
            if tag == 'O':
                continue
            if tag.startswith('S'):
                tag = tag.split('-')[-1]
                ner_dict[tag].append(words[i])
            elif tag.startswith('B') or tag.startswith('I'):
                ner_tmp.append(words[i])
                continue
            elif tag.startswith('E'):
                ner_tmp.append(words[i])
                tag = tag.split('-')[-1]
                ner_dict[tag].append(''.join(ner_tmp))
                ner_tmp = []
        if ner_tmp:
            tag = list(nertags)[-1]
            tag = tag.split('-')[-1]
            ner_dict[tag].append(''.join(ner_tmp))

        ner_map = dict()
        ner_map['PER'] = ner_dict['Nh']
        ner_map['ORG'] = ner_dict['Ni']
        ner_map['LOC'] = ner_dict['Ns']

        return ner_map

    def release(self):
        self._segmentor.release()
        self._recognizer.release()
        self._postagger.release()
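
# A minimal usage sketch for the LTP wrapper above. The model directory path and
# the sentence are placeholders, not values from the original code.
if __name__ == '__main__':
    ltp = LTP(ltp_data_path='./ltp_data_v3.4.0')
    text = '李克强总理今天访问了北京大学。'
    print(list(ltp.cut(text)))   # segmented words
    print(list(ltp.pos(text)))   # (word, POS) pairs
    print(ltp.ner(text))         # {'PER': [...], 'ORG': [...], 'LOC': [...]}
    ltp.release()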
Example #24
0
# -*- coding: utf-8 -*-
from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer

ldir = 'AgriKG\\ltp\\cws.model'  # segmentation model
dicdir = 'word'  # external lexicon
text = "贵州财经大学要举办大数据比赛吗?那让欧几里得去问问看吧!其实是在贵阳花溪区吧。"

# Chinese word segmentation
segmentor = Segmentor()  # initialize the segmenter
segmentor.load_with_lexicon(ldir, dicdir)  # load the model with the external lexicon
words = segmentor.segment(text)  # segment
print(' '.join(words))  # join the segmented words
words = list(words)  # convert to a list
print(u"分词:", words)
segmentor.release()
Example #25
0
    f.close()

    # count how many news articles there are
    newscnt = 0
    for i in range(0, 100):
        if len(new[99-i]) != 0:
            newscnt = 100-i
            break
    '''

    # read every title and store its mean word vector in title
    title = []
    f = open(unicode('../Sentence/sentence/' + news + '/title.txt', 'utf8'),
             'r')
    for line in f:
        words = segmentor.segment(line.strip())
        word_vec_list = []
        for word in words:
            if word not in stoplist and word in model:
                word_vec_list.append(model[word])
        title.append(mean_vec(word_vec_list))
    f.close()

    print '标题数:', len(title)

    # read all labels
    f = open(unicode('../Sentence/label/' + news + '/label.txt', 'utf8'), 'r')
    labels = [line.strip().replace('+', '') for line in f]
    f.close()

    for label in labels:
Example #26
0
class LTPFunction:
    def __init__(self):
        self.segmentor = Segmentor()
        self.segmentor.load("model/cws.model")
        # self.segmentor.load_with_lexicon("model/cws.model", 'dict/segdict.txt') # 加载模型,第二个参数是您的外部词典文件路径
        self.postagger = Postagger()  # 初始化实例
        self.postagger.load('model/pos.model')  # 加载模型
        self.parser = Parser()  # 初始化实例
        self.parser.load('model/parser.model')  # 加载模型
        self.recognizer = NamedEntityRecognizer()  # 初始化实例
        self.recognizer.load('model/ner.model')

    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, 'instance'):
            cls.instance = super(LTPFunction, cls).__new__(cls)
        return cls.instance

    def ltp_seg(self, sentence):
        words = self.segmentor.segment(sentence)
        return [i for i in words]

    # POS tagging: takes the segmented word list, returns the list of POS tags
    def ltp_pos(self, word_list):
        # print(type(word_list))
        words_postags = self.postagger.postag(word_list)  # POS tagging
        # postagger.release()
        return [i for i in words_postags]

    # Entity extraction: takes the word list and POS list, returns sets of person, place and organization names
    def ltp_ner(self, word_list, words_postags):
        netags = self.recognizer.recognize(word_list, words_postags)
        # print(" ".join(netags))
        entity = ''
        tag = ''
        person_set = set()
        location_set = set()
        organization_set = set()
        for i in range(len(netags)):
            ner = netags[i].split('-')
            if ner[0] == 'O':
                if entity != '':
                    if tag == 'Nh':
                        person_set.add(entity)
                    if tag == 'Ns':
                        location_set.add(entity)
                    if tag == 'Ni':
                        organization_set.add(entity)
                entity = ''
                tag = ''
            elif ner[0] == 'S':
                if ner[1] == 'Nh':
                    person_set.add(word_list[i])
                if ner[1] == 'Ns':
                    location_set.add(word_list[i])
                if ner[1] == 'Ni':
                    organization_set.add(word_list[i])
            elif ner[0] == 'B':
                entity = entity + word_list[i]
                tag = ner[1]
            elif ner[0] == 'I':
                entity = entity + word_list[i]
                tag = ner[1]
            else:
                entity = entity + word_list[i]
                tag = ner[1]
                if tag == 'Nh':
                    person_set.add(entity)
                if tag == 'Ns':
                    location_set.add(entity)
                if tag == 'Ni':
                    organization_set.add(entity)
                entity = ''
                tag = ''
        return person_set, location_set, organization_set

    # Dependency parsing: takes the word list and POS list, returns the relation list and the head list
    def ltp_parser(self, word_list, pos_list):
        relation_list = []
        head_list = []
        arcs = self.parser.parse(word_list, pos_list)  # dependency parsing
        for arc in arcs:
            relation_list.append(arc.relation)
            head_list.append(arc.head)

        return relation_list, head_list

    # public entry point
    def ner_extract(self, title, content):

        # final, de-duplicated result sets
        person_set = set()
        location_set = set()
        organization_set = set()

        # extract information from the title
        title_words = self.ltp_seg(title)
        title_pos = self.ltp_pos(title_words)
        p_set, l_set, o_set = self.ltp_ner(title_words, title_pos)

        # merge the extracted sets into the results
        person_set = person_set | p_set
        location_set = location_set | l_set
        organization_set = organization_set | o_set
        '''
        # Extract the same information from the content (split it into sentences first)
        for sentence in re.split(r'[??!!。;;::\n\r]', content):
            if sentence:
                # print(sentence)
                sen_words = self.ltp_seg(sentence)
                sen_pos = self.ltp_pos(sen_words)
                p_set, l_set, o_set = self.ltp_ner(sen_words, sen_pos)
                
                # merge the extracted sets into the results
                person_set = person_set | p_set
                location_set = location_set | l_set
                organization_set = organization_set | o_set
        '''
        return list(person_set), list(location_set), list(organization_set)
Example #27
0
class LTP(object):
    def __init__(self):
        cws_model_path = os.path.join('../data/ltp_data_v3.4.0',
                                      'cws.model')  # segmentation model path; the model file is `cws.model`
        pos_model_path = os.path.join('../data/ltp_data_v3.4.0',
                                      'pos.model')  # POS tagging model path; the model file is `pos.model`
        ner_model_path = os.path.join(
            '../data/ltp_data_v3.4.0',
            'ner.model')  # NER model path; the model file is `ner.model`

        self.segmentor = Segmentor()  # initialize the segmenter
        self.segmentor.load(cws_model_path)  # load the model
        self.postagger = Postagger()  # initialize the tagger
        self.postagger.load(pos_model_path)  # load the model
        self.recognizer = NamedEntityRecognizer()  # initialize the recognizer
        self.recognizer.load(ner_model_path)  # load the model

    # word segmentation
    def segment(self, text):
        words = list(self.segmentor.segment(text))
        return words

    # POS tagging
    def postag(self, words):
        postags = list(self.postagger.postag(words))
        return postags

    # extract time expressions from the text
    def get_time(self, text):

        # segment and POS-tag the text
        words = self.segment(text)
        #print(words)
        postags = self.postag(words)
        #print(postags)

        time_lst = []

        i = 0
        for tag, word in zip(postags, words):
            if tag == 'nt':
                j = i
                # guard against running past the end of the sentence
                while j < len(postags) and (postags[j] == 'nt'
                                            or words[j] in ['至', '到']):
                    j += 1
                time_lst.append(''.join(words[i:j]))
            i += 1

        # drop time strings that are substrings of longer ones
        remove_lst = []
        for i in time_lst:
            for j in time_lst:
                if i != j and i in j:
                    remove_lst.append(i)

        text_time_lst = []
        for item in time_lst:
            if item not in remove_lst:
                text_time_lst.append(item)

        # print(text_time_lst)
        return text_time_lst

    # extract person, place and organization names
    def get_name(self, text):
        persons, places, orgs = set(), set(), set()

        words = self.segment(text)
        #print("words333333333333")
        postags = self.postag(words)
        #print(postags)
        netags = list(self.recognizer.recognize(words, postags))  # named entity recognition
        #print(netags)
        # print(netags)
        i = 0
        for tag, word in zip(netags, words):
            j = i
            # person names
            if 'Nh' in tag:
                if str(tag).startswith('S'):
                    persons.add(word)
                elif str(tag).startswith('B'):
                    union_person = word
                    while netags[j] != 'E-Nh':
                        j += 1
                        if j < len(words):
                            union_person += words[j]
                    persons.add(union_person)
            # place names
            if 'Ns' in tag:
                if str(tag).startswith('S'):
                    places.add(word)
                elif str(tag).startswith('B'):
                    union_place = word
                    while netags[j] != 'E-Ns':
                        j += 1
                        if j < len(words):
                            union_place += words[j]
                    places.add(union_place)
            # organization names
            if 'Ni' in tag:
                if str(tag).startswith('S'):
                    orgs.add(word)
                elif str(tag).startswith('B'):
                    union_org = word
                    while netags[j] != 'E-Ni':
                        j += 1
                        if j < len(words):
                            union_org += words[j]
                    orgs.add(union_org)

            i += 1

        # print('人名:', ','.join(persons))
        # print('地名:', ','.join(places))
        # print('组织机构:', ','.join(orgs))
        return persons, places, orgs

    # release the models
    def free_ltp(self):
        self.segmentor.release()
        self.postagger.release()
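
# A minimal usage sketch for the LTP class above. It assumes the ../data/ltp_data_v3.4.0
# models exist; the sentence is only an illustration.
if __name__ == '__main__':
    ltp = LTP()
    text = '2019年5月1日至5月4日,李克强在北京会见了来访的代表团。'
    print(ltp.get_time(text))   # e.g. a list of time expressions such as ['2019年5月1日至5月4日']
    persons, places, orgs = ltp.get_name(text)
    print(persons, places, orgs)
    ltp.free_ltp()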
Example #28
0
class LtpParser():
    def __init__(self):
        LTP_DIR = os.path.join(pwd_path, "../ltp_data_v3.4.0")
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

    '''Basic LTP operations.'''

    def basic_parser(self, words):
        postags = list(self.postagger.postag(words))
        netags = self.recognizer.recognize(words, postags)
        return postags, netags

    '''POS tagging with LTP'''

    def get_postag(self, words):
        return list(self.postagger.postag(words))

    '''Collect entity lists from the NER output'''

    def format_entity(self, words, netags, postags):
        name_entity_dist = {}
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []
        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        index = 0
        for item in zip(words, netags):
            word = item[0]
            ntag = item[1]
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word + '_%s ' % index)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word + '_%s ' % index)
                    else:
                        place_entity_list.append(word + '_%s ' % index)
                elif ntag[0] == "B":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                elif ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                else:
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""
            index += 1
        name_entity_dist['nhs'] = self.modify_entity(name_entity_list, words,
                                                     postags, 'nh')
        name_entity_dist['nis'] = self.modify_entity(organization_entity_list,
                                                     words, postags, 'ni')
        name_entity_dist['nss'] = self.modify_entity(place_entity_list, words,
                                                     postags, 'ns')
        return name_entity_dist

    '''Normalize entities in preparation for rebuild_wordspostags'''

    def modify_entity(self, entity_list, words, postags, tag):
        entity_modify = []
        if entity_list:
            for entity in entity_list:
                entity_dict = {}
                subs = entity.split(' ')[:-1]
                start_index = subs[0].split('_')[1]
                end_index = subs[-1].split('_')[1]
                entity_dict['stat_index'] = start_index
                entity_dict['end_index'] = end_index
                if start_index == entity_dict['end_index']:
                    consist = [
                        words[int(start_index)] + '/' +
                        postags[int(start_index)]
                    ]
                else:
                    consist = [
                        words[index] + '/' + postags[index]
                        for index in range(int(start_index),
                                           int(end_index) + 1)
                    ]
                entity_dict['consist'] = consist
                entity_dict['name'] = ''.join(
                    tmp.split('_')[0] for tmp in subs) + '/' + tag
                entity_modify.append(entity_dict)
        return entity_modify

    '''Rewrite words and postags based on the NER results'''

    def rebuild_wordspostags(self, name_entity_dist, words, postags):
        pre = ' '.join(
            [item[0] + '/' + item[1] for item in zip(words, postags)])
        post = pre
        for et, infos in name_entity_dist.items():
            if infos:
                for info in infos:
                    post = post.replace(' '.join(info['consist']),
                                        info['name'])
        post = [
            word for word in post.split(' ')
            if len(word.split('/')) == 2 and word.split('/')[0]
        ]
        words = [tmp.split('/')[0] for tmp in post]
        postags = [tmp.split('/')[1] for tmp in post]

        return words, postags

    '''Format the dependency parse'''

    def syntax_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        words = ['Root'] + words
        postags = ['w'] + postags
        tuples = list()
        for index in range(len(words) - 1):
            arc_index = arcs[index].head
            arc_relation = arcs[index].relation
            tuples.append([
                index + 1, words[index + 1], postags[index + 1],
                words[arc_index], postags[arc_index], arc_index, arc_relation
            ])

        return tuples

    '''Maintain, for every word in the sentence, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, tuples):
        child_dict_list = list()
        for index, word in enumerate(words):
            child_dict = dict()
            for arc in tuples:
                if arc[3] == word:
                    if arc[-1] in child_dict:
                        child_dict[arc[-1]].append(arc)
                    else:
                        child_dict[arc[-1]] = []
                        child_dict[arc[-1]].append(arc)
            child_dict_list.append([word, postags[index], index, child_dict])

        return child_dict_list

    '''Main parser entry point'''

    def parser_main(self, words, postags):
        tuples = self.syntax_parser(words, postags)
        child_dict_list = self.build_parse_child_dict(words, postags, tuples)
        return tuples, child_dict_list

    '''Basic linguistic analysis'''

    def basic_process(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags, netags = self.basic_parser(words)
        name_entity_dist = self.format_entity(words, netags, postags)
        words, postags = self.rebuild_wordspostags(name_entity_dist, words,
                                                   postags)
        return words, postags
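
# A hedged usage sketch for the LtpParser above: it assumes pwd_path is defined
# in this module and that ../ltp_data_v3.4.0 (relative to it) holds the LTP
# models, as in __init__; the sentence is only illustrative.
ltp = LtpParser()
words, postags = ltp.basic_process('李克强总理今天来到中国进出口银行考察。')
print(words)    # tokens, with recognized entities merged into single items
print(postags)  # POS tags; merged entities are tagged nh / ni / ns
tuples, child_dict_list = ltp.parser_main(words, postags)
print(tuples)   # [index, word, postag, head word, head postag, head index, relation]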
Example #29
0
    def ws_data(self):
        f = open("pnn_annotated.txt", 'r')
        total_line = 0
        orgin_attr = [0, 0, 0]
        judge_attr = [0, 0, 0]
        right = [0, 0, 0]
        segmentor = Segmentor()
        segmentor.load("cws.model")
        for line in f:
            total_line += 1
            # print 'line has been read'
            value_num = [0, 0]
            result = line.split('\t')
            ws_lst = segmentor.segment(result[1])
            # print 'this line is %s' % (line)

            for i in ws_lst:
                classify = ''
                try:
                    value = self.setiment_words[i]
                except:
                    pass
                else:
                    if value == 1:
                        print 'positive word:%s' % i
                        value_num[0] += 1
                    elif value == -1:
                        print 'negative word:%s' % i
                        value_num[1] += 1

            if value_num[0] == 0 and value_num[1] == 0:
                classify = 'neutral'
                judge_attr[0] += 1
            elif value_num[0] == value_num[1] != 0:
                classify = 'neutral'
                judge_attr[0] += 1
            elif value_num[0] > value_num[1]:
                classify = 'positive'
                judge_attr[1] += 1
            else:
                classify = 'negative'
                judge_attr[2] += 1

            print value_num
            print 'classify result: %s' % classify

            # count of the original (gold) emotion labels
            if result[0] == '0':
                orgin_attr[0] += 1
            elif result[0] == '1':
                orgin_attr[1] += 1
            else:
                orgin_attr[2] += 1

            if (int(result[0]) == 0 and value_num[0] == 0 and value_num[1] == 0):
                # print 'neutral'
                right[0] += 1
            elif (int(result[0]) == 0 and value_num[0] == value_num[1] != 0):
                # print 'neutral'
                right[0] += 1
            elif (int(result[0]) > 0 and value_num[0] >= value_num[1] and value_num[0] != 0):
                # print 'positive'
                right[1] += 1
            elif (int(result[0]) < 0 and value_num[0] < value_num[1] and value_num[1] != 0):
                # print 'negative'
                right[2] += 1

            # print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line))
        print 'original labels: neutral, positive, negative'
        print orgin_attr

        print 'predicted labels: neutral, positive, negative'
        print judge_attr

        print 'correct predictions: neutral, positive, negative'
        print right
        print (right[0] + right[1] + right[2])

        print 'total_line %f\n' % total_line
        print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line))
        segmentor.release()
Example #30
0
class LanguageProcessor(object):
    def __init__(self, configure):

        self.system_logger = logging.getLogger("system_log")

        self._sentence_splitter = SentenceSplitter

        self._segmentor = Segmentor()
        self._segmentor.load_with_lexicon(
            configure.nlp_data_root + "/cws.model",
            configure.nlp_data_root + "/cws.tsv")

        self._segmentor_without_dictionary = Segmentor()
        self._segmentor_without_dictionary.load(configure.nlp_data_root +
                                                "/cws.model")

        self._postagger = Postagger()
        self._postagger.load(configure.nlp_data_root + "/pos.model")

        self._ner_recognizer = NamedEntityRecognizer()
        self._ner_recognizer.load(configure.nlp_data_root + "/ner.model")

        self._dependency_parser = Parser()
        self._dependency_parser.load(configure.nlp_data_root + "/parser.model")

        self._srl = SementicRoleLabeller()
        self._srl.load(configure.nlp_data_root + "/pisrl.model")

        self._stopwords_file = configure.nlp_data_root + "/stopwords.txt"
        self._stopwords_set = set([
            tk.strip() for tk in codecs.open(self._stopwords_file, 'r',
                                             'utf-8').read().splitlines()
            if tk.strip() != ""
        ])

        self.entity_type_mapping_file = configure.entity_type_mapping_file
        self.entity_type_mapping = defaultdict()
        for line in codecs.open(self.entity_type_mapping_file, 'r',
                                'utf-8').read().splitlines():
            elems = line.split("\t")
            if len(elems) != 2:
                log_str = "Format error in file [%s] !!!\n" % self.entity_type_mapping_file
                self.system_logger.error(log_str)
                sys.stderr.write(log_str)
                continue
            self.entity_type_mapping[int(
                elems[0])] = "<" + str(elems[0]) + "_" + elems[1].strip() + ">"
        self.all_entity_replacements = list(self.entity_type_mapping.values())

        self.entity_type_exclusion_file = configure.entity_type_exclusion_file
        self.entity_type_exclusion_mapping = defaultdict()
        for line in codecs.open(self.entity_type_exclusion_file, 'r',
                                'utf-8').read().splitlines():
            elems = line.split("\t")
            if len(elems) != 2:
                log_str = "Format error in file [%s] !!!\n" % self.entity_type_exclusion_file
                self.system_logger.error(log_str)
                sys.stderr.write(log_str)
                continue
            self.entity_type_exclusion_mapping[int(
                elems[0])] = "<" + str(elems[0]) + "_" + elems[1].strip() + ">"
        self.entity_type_exclusion_set = set(
            self.entity_type_exclusion_mapping.keys())

        trie_tree, lexicon = generate_trie_tree(configure.nlp_data_root +
                                                "trust_list.tsv")
        self._lexicon = lexicon
        self._trie_tree = trie_tree

        self.entity_linker = EntityLinker()

        self.dialog_act_classifier = DialogActClassifier(
            configure.dialog_act_classifier_configure)

        self.emotion_classifier = EmotionClassifier(
            configure.emotion_classifier_configure)

        self.yes_no_classifier = YesNoClassifier(
            configure.attitude_classifier_configure)
        self.like_dislike_classifier = LikeDislikeClassifier(
            configure.attitude_classifier_configure)

        self.question_classifier = QuestionClassifier(
            configure.question_classifier_configure)
        self.question_response = ""

        self.noun_phrase_generator = noun_phrase_generator

        self.segmentor_plus = segmentor_plus

        self.turn_on = configure.turn_on

    def segment_chinese_sentence_without_dictionary(self, sentence):

        return list(self._segmentor_without_dictionary.segment(sentence))

    def generate_query(self, raw_sentence):

        # LTP cannot handle whitespace; it strips spaces during segmentation.
        # Therefore we replace each space with a 'safe' token (a comma here)
        # and restore it after segmentation.
        original_raw_sentence = raw_sentence
        spaces = {}
        raw_sentence = list(raw_sentence)
        for s in re.finditer(' ', original_raw_sentence):
            spaces[s.start()] = s
            assert raw_sentence[s.start()] == ' '
            raw_sentence[s.start()] = ','
        raw_sentence = ''.join(raw_sentence)

        splitted_sentences = list(SentenceSplitter.split(raw_sentence))
        structured_sentences = []
        sent_pos = 0
        sent_index = 0
        for one_sentence in splitted_sentences:
            sent_start = raw_sentence.index(one_sentence, sent_pos)
            sent_end = sent_start + len(one_sentence)
            sent_pos = sent_end

            tokens = list(self._segmentor.segment(one_sentence))
            tokens = list(
                self._resegment(tokens,
                                lexicon=self._lexicon,
                                trie_tree=self._trie_tree))

            postags = [None] * len(tokens)
            if "POS" in self.turn_on:
                postags = list(self._postagger.postag(tokens))

            ners = [None] * len(tokens)
            if "POS" in self.turn_on and "NER" in self.turn_on:
                ners = list(self._ner_recognizer.recognize(tokens, postags))

            arcs = [None] * len(tokens)
            if "POS" in self.turn_on and "DEP" in self.turn_on:
                arcs = self._dependency_parser.parse(tokens, postags)

            roles = [None] * len(tokens)
            if "POS" in self.turn_on and "DEP" in self.turn_on and "SRL" in self.turn_on:
                roles = list(self._srl.label(tokens, postags, arcs))

            arcs = list(arcs)

            token_list = []
            word_pos = 0
            sentence_length = 0
            for index, tk in enumerate(tokens):
                word_start = one_sentence.index(tk[0], word_pos)
                word_end = word_start + len(tk)
                word_pos = word_end

                # Recover token
                if tk == ',' and word_start + sent_start in spaces:
                    tk = ' '

                token = Token(
                    index, tk, tk, word_start, word_end, postags[index],
                    ners[index],
                    arcs[index].head if arcs[index] is not None else None,
                    arcs[index].relation if arcs[index] is not None else None,
                    word_start + sent_start, word_end + sent_start, [],
                    self._detect_stop_words(tk))

                token_list.append(token)
                if token.pos == "wp" or len(tk.strip()) == 0:
                    continue

                sentence_length += len(tk)

            if len(token_list) == 0:
                continue

            if roles != [None] * len(tokens):
                for role in roles:
                    token = token_list[role.index]
                    for arg in role.arguments:
                        token.semantic_roles.append(
                            (arg.name, arg.range.start, arg.range.end))

            np_chunks = self._generate_np_chunks(token_list)

            # Recover sentence
            one_sentence = list(one_sentence)
            for s in spaces:
                if s >= sent_start and s < sent_end:
                    n = s - sent_start
                    assert one_sentence[n] == ','
                    one_sentence[n] = ' '
            one_sentence = ''.join(one_sentence)

            sentence = Sentence(one_sentence, sent_start, sent_index,
                                sentence_length, token_list, np_chunks)
            sent_index += 1
            structured_sentences.append(sentence)

        return_query = Query(raw_sentence, splitted_sentences,
                             structured_sentences)

        if "POS" in self.turn_on and "ATT" in self.turn_on:
            self._detect_attitude(return_query)

        if "POS" in self.turn_on and "QTY" in self.turn_on:
            self._detect_question_type(return_query)

        if "ACT" in self.turn_on:
            self._dialog_act_detector(return_query)

        if "EMO" in self.turn_on:
            self._detect_emotion_type(return_query)

        if "ENL" in self.turn_on:
            for sentence_index, sentence in enumerate(
                    return_query.sentence_list):
                self._link_entity(sentence_index, return_query.sentence_list)
                return_query.full_entity_ids.extend(sentence.full_entity_ids)
                return_query.topic_entity_ids.extend(sentence.topic_entity_ids)
                return_query.normalized_text += sentence.normalized_text
                for entity_type, entity_list in sentence.type2entity.items():
                    return_query.type2entity[entity_type].extend(entity_list)

            if len(return_query.sentence_list) > 1:
                return_query.full_entity_ids = list(
                    set(return_query.full_entity_ids))
                return_query.topic_entity_ids = list(
                    set(return_query.topic_entity_ids))

                for entity_type in return_query.type2entity:
                    return_query.type2entity[entity_type] = list(
                        set(return_query.type2entity[entity_type]))

            if return_query.normalized_text in self.all_entity_replacements:
                return_query.single_entity = True

        return return_query

    def _detect_stop_words(self, word):
        return word.strip() in self._stopwords_set

    def _generate_np_chunks(self, token_list):
        return list(noun_phrase_generator(token_list))

    def _dialog_act_detector(self, query):
        self.dialog_act_classifier.classify(query)
        query.map_dialog_act_to_sentence_index()

    def _detect_question_type(self, query):
        self.question_classifier.classify(query)

    def _detect_emotion_type(self, query):
        self.emotion_classifier.classify(query)

    def _detect_attitude(self, query):
        self.yes_no_classifier.classify(query)
        self.like_dislike_classifier.classify(query)

    def _link_entity(self, sentence_index, sentence_list):
        entity_mention = self.entity_linker.linking(sentence_index,
                                                    sentence_list)
        sentence_list[sentence_index].update_entity(
            entity_mention, self.entity_type_mapping,
            self.entity_type_exclusion_set)
        sentence_list[sentence_index].normalize(self.entity_type_mapping)

    def _ploarity_detector(self, query):
        return []

    def _resegment(self, tokens, lexicon=None, trie_tree=None):
        return self.segmentor_plus(tokens,
                                   lexicon=lexicon,
                                   trie_tree=trie_tree)
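
# A standalone sketch of the whitespace trick used in generate_query above:
# pyltp drops spaces during segmentation, so spaces are first replaced by a
# 'safe' character (a comma here), their offsets remembered, and restored in
# the tokens afterwards. MODELDIR is an assumed local model path.
import os
import re
from pyltp import Segmentor

MODELDIR = './ltp_data'  # assumption: directory containing cws.model

raw = '我 喜欢 北京'
spaces = {m.start() for m in re.finditer(' ', raw)}
safe = ''.join(',' if i in spaces else ch for i, ch in enumerate(raw))

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, 'cws.model'))
tokens = list(segmentor.segment(safe))
segmentor.release()

# Walk the tokens, tracking character offsets, and turn the placeholder commas
# that sit on remembered space positions back into spaces.
recovered, pos = [], 0
for tk in tokens:
    start = safe.index(tk, pos)
    pos = start + len(tk)
    recovered.append(' ' if tk == ',' and start in spaces else tk)
print(recovered)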
Example #31
0
## Word segmentation, POS tagging and dependency parsing with pyltp.
## LTP_DATA_DIR, cws_model_path, pos_model_path and text are assumed to be
## defined earlier in the original script.
import os

from pyltp import Segmentor, Postagger, Parser

par_model_path = os.path.join(LTP_DATA_DIR,
                              'parser.model')  # dependency parser model, file name `parser.model`

segmentor = Segmentor()  # create the instance
segmentor.load_with_lexicon(cws_model_path, 'dict1.txt')  # load the model with a user lexicon
words = list(segmentor.segment(text))  # word segmentation
print(words, 'segmentation result')

## POS tagging
postagger = Postagger()  # create the instance
postagger.load(pos_model_path)  # load the model
postags = postagger.postag(words)  # POS tagging
tags = list(postags)
print(tags, 'POS tags')

## Dependency parsing
parser = Parser()  # create the instance
parser.load(par_model_path)  # load the model
Example #32
0
class LtpParser:
    def __init__(self):
        LTP_DIR = "/Users/benkangchen/pyltp/model"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''Semantic role labeling'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    '''Dependency parsing: maintain, for every word, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based; 0 is ROOT
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # head (parent) index of every word
        relation = [arc.relation for arc in arcs]  # dependency relation of every word
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # resolve the head word ('Root' for the root)
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parser entry point'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
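
# A hedged usage sketch for the LtpParser above; it assumes the model directory
# hard-coded in __init__ exists on this machine, and the sentence is illustrative.
ltp = LtpParser()
words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main(
    '李克强总理今天来到中国进出口银行考察。')
print(format_parse_list)  # [relation, word, index, postag, head word, head index, head postag]
print(roles_dict)         # {predicate index: {role name: [name, start, end]}}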
Example #33
0
trDemo = data.TextRank.TextRank()

# Analysis loop
i = 0
for rawline in f.readlines():
    # Process one JSON record per line
    rawline_json = json.loads(rawline)
    # Title of the article
    titleline = rawline_json['title']
    # Gold entities
    entity = set()
    eec = rawline_json["coreEntityEmotions"]
    for key in eec:
        entity.add(key["entity"])
    # Segment the title
    titleWords = segmentor.segment(titleline)
    # De-duplicated set of title words
    titleWordsSet = set()
    # Output line for the title
    titleCut = "TitleCut="
    for w in titleWords:
        # Keep only words longer than one character that contain no kickout separator
        if any(sep in w for sep in kickout):
            continue
        if len(str(w)) > 1:
            titleWordsSet.add(w)
            titleCut += w + " "
    # Content of the article
    SC = rawline_json["content"].strip()
    content = rawline_json["title"].strip() + ' ' + rawline_json["content"].strip()
Example #34
0
def cut_words_ltp(sentence):
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words = segmentor.segment(sentence)
    segmentor.release()
    return list(words)
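
# A hedged usage sketch for cut_words_ltp above; cws_model_path is assumed to
# point at a local cws.model. Note that the function loads and releases the
# model on every call, which is simple but slow when segmenting many sentences;
# the class-based examples keep one long-lived Segmentor instead.
cws_model_path = './ltp_data/cws.model'  # assumed path
print(cut_words_ltp('中国进出口银行与中国银行加强合作'))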
Example #35
0
class LtpParser:
    def __init__(self):
        LTP_DIR = "ltp_data_v3.4.0"
        self.segmentor = Segmentor(
            model_path=os.path.join(LTP_DIR, "cws.model"))
        # self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger(
            model_path=os.path.join(LTP_DIR, "pos.model"))
        # self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser(os.path.join(LTP_DIR, "parser.model"))
        # self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer(
            os.path.join(LTP_DIR, "ner.model"))
        # self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller(
            os.path.join(LTP_DIR, 'pisrl_win.model'))
        # self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    '''Semantic role labeling'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for index, role in roles:
            roles_dict[index] = {
                name: [name, arg[0], arg[1]]
                for name, arg in role
            }
        # for role in roles:
        #     roles_dict[role.index] = {arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    '''Dependency parsing: maintain, for every word, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for idx, (head, relation) in enumerate(arcs):
                if head == index + 1:  # arc indices start at 1: head is the parent index (ROOT is 0, words are 1, 2, 3, ...), relation is the dependency relation
                    if relation in child_dict:
                        child_dict[relation].append(idx)
                    else:
                        child_dict[relation] = []
                        child_dict[relation].append(idx)
            # for arc_index in range(len(arcs)):
            #     if arcs[arc_index].head == index+1:   #arcs的索引从1开始 arc. head 表示依存弧的父结点的索引。 ROOT 节点的索引是 0 ,第一个词开始的索引依次为1,2,3,···arc. relation 表示依存弧的关系。
            #         if arcs[arc_index].relation in child_dict:
            #             child_dict[arcs[arc_index].relation].append(arc_index)#添加
            #         else:
            #             child_dict[arcs[arc_index].relation] = []#新建
            #             child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)  # for every word: its children grouped by dependency relation
        rely_id = [head for head, relation in arcs]  # head (parent) index of every word
        relation = [relation for head, relation in arcs]  # dependency relation of every word
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # resolve the head word ('Root' for the root)
        for i in range(len(words)):
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parser entry point'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
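
# A hedged usage sketch for the Example #35 LtpParser, which targets the newer
# pyltp interface (models passed to the constructors, arcs/roles returned as
# plain tuples); it assumes a local ltp_data_v3.4.0 directory as in __init__.
ltp = LtpParser()
words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main(
    '李克强总理今天来到中国进出口银行考察。')
print(child_dict_list)  # per word: {relation: [child indices]}
print(roles_dict)       # {predicate index: {role name: [name, start, end]}}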
Example #36
0
    def bayes(self):
        segmentor = Segmentor()
        segmentor.load("cws.model")

        f = open('data/a_4.txt', 'r')
        # f = open('pnn_annotated.txt', 'r')
        # neutral, positive, negative
        class_freq = [0,0,0]
        # neutral, positive, negative
        word_total_count_freq = [0, 0, 0]
        each_word_count = [{}, {}, {}]

        accu = [0, 0]

        print 'train_set'
        for line in f:
            result = line.split('\t')
            ws_lst = segmentor.segment(result[1])
            # print line
            # neutral
            if result[0] == '0':
                class_freq[0] += 1
                for word in ws_lst:
                    word_total_count_freq[0] += 1
                    if each_word_count[0].get(word) is not None:
                        # print 'Not none'
                        each_word_count[0][word] += 1
                    else:
                        # print 'None'
                        each_word_count[0][word] = 1
            # positive
            elif result[0] == '1':
                class_freq[1] += 1
                for word in ws_lst:
                    word_total_count_freq[1] += 1
                    if each_word_count[1].get(word) is not None:
                        # print 'Not none'
                        each_word_count[1][word] += 1
                    else:
                        # print 'None'
                        each_word_count[1][word] = 1

            # negative
            elif result[0] == '-1':
                class_freq[2] += 1
                for word in ws_lst:
                    word_total_count_freq[2] += 1
                    if each_word_count[2].get(word) is not None:
                        # print 'Not none'
                        each_word_count[2][word] += 1
                    else:
                        # print 'None'
                        each_word_count[2][word] = 1

        # print class_freq
        # print word_total_count_freq
        # print each_word_count

        print 'total'
        total_class_count = class_freq[0] + class_freq[1] + class_freq[2]
        total_word_count = word_total_count_freq[0] + word_total_count_freq[1] + word_total_count_freq[2]
        print total_class_count
        # print total_word_count

        f.close()
        f1 = open('a_1.txt', 'r')

        # neutral, positive, negative
        orgin = [0, 0, 0]   # how many gold labels of each class
        judge = [0, 0, 0]   # how many predictions of each class
        judge_right = [0, 0, 0]

        print 'test_set_now'
        for line in f1:
            result = line.split('\t')
            # print result[1]
            ws_lst = segmentor.segment(result[1])
            # print test_line[test_count]
            max = 0
            tmp_result = 0
            for test_iter in range(3):
                processed_wst = []
                prob_this_class = 1
                for test_word in ws_lst:
                    if test_word not in processed_wst:
                        prob_this_class *= (each_word_count[test_iter].get(test_word, 0) + 1.0) / float(word_total_count_freq[test_iter] + total_word_count)
                        processed_wst.append(test_word)
                prob_this_class *= (float(class_freq[test_iter]) / float(total_class_count))

                if prob_this_class > max:
                    max = prob_this_class
                    tmp_result = test_iter

            if tmp_result == 0:
                test_result = '0'
                judge[0] += 1
            elif tmp_result == 1:
                test_result = '1'
                judge[1] += 1
            elif tmp_result == 2:
                test_result = '-1'
                judge[2] += 1

            if result[0] == test_result:
                accu[0] += 1
            else:
                accu[1] += 1

            if result[0] == '0':
                orgin[0] += 1
            elif result[0] == '1':
                orgin[1] += 1
            elif result[0] == '-1':
                orgin[2] += 1

            if result[0] == '0' == test_result:
                judge_right[0] += 1
            elif result[0] == '1' == test_result:
                judge_right[1] += 1
            elif result[0] == '-1' == test_result:
                judge_right[2] += 1

            # print 'result is %s'%test_result
            # print 'count are %d, %d'%(accu[0], accu[1])
            # print 'accuracy so far: %f'%(float(accu[0]) / float(accu[0] + accu[1]))


        f1.close()
        print 'origin'
        print orgin

        print 'judge'
        print judge

        print 'judge_right'
        print judge_right

        print 'total'
        print accu
        print 'accuracy this time is %f'%((float(accu[0]) / float(accu[0] + accu[1])))
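
# A small sketch of the scoring rule used in bayes() above: multinomial Naive
# Bayes with add-one smoothing over the distinct words of a test line. The
# smoothing denominator mirrors the author's choice of total token count
# (rather than vocabulary size).
def nb_score(words, class_word_counts, class_token_total, smoothing_total, class_prior):
    # P(c) * product over distinct w of (count(w, c) + 1) / (tokens in c + smoothing_total)
    score = class_prior
    for w in set(words):
        score *= (class_word_counts.get(w, 0) + 1.0) / (class_token_total + smoothing_total)
    return score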
Example #37
0
    def word_vec_case_set(cls,
                          word_model_file,
                          with_name=False,
                          merge_by='mosaic'):
        """
        获取词向量特征集,认为词条最多10个词
        如果以mosaic方式,每个词条被表示为50*10=500维
        如果以sum方式,每个词条被表示为50维
        :param word_model_file: 词向量模型文件
        :param with_name: 正样例是否包含人名
        :param merge_by: 词条中词项量的结合方式,mosaic或sum
        :return: 一个字典{pos_case:{正例},neg:{负例}}
        """
        segmentor = Segmentor()
        segmentor.load("../word2vec_process/model/cws.model")
        word_vec_model = word2vec.Word2Vec.load('../word2vec_process/model/' +
                                                word_model_file)
        case_dict = cls.load_case_set(with_name)
        word_vec_case_dict = {}

        if merge_by == 'mosaic':
            # Build the entry representation by concatenating word vectors (500 dims)
            pos_case_list = case_dict['pos_case']
            pos_case_vec_dict = {}
            for pos_case in pos_case_list:
                case_words = segmentor.segment(pos_case)
                case_vec = []
                is_useful = 0
                for word in case_words:
                    try:
                        # concatenate
                        case_vec.extend(word_vec_model[unicode(word)].tolist())
                        is_useful = 1
                    except Exception, e:
                        with open("./data/not_in_vocabulary.txt",
                                  'a') as out_file:
                            # record out-of-vocabulary words
                            out_file.write(word + '\n')
                # truncate or pad to exactly 500 dims
                if len(case_vec) > 500:
                    case_vec = case_vec[0:500]
                else:
                    while (len(case_vec) < 500):
                        case_vec.append(0)
                if is_useful:
                    pos_case_vec_dict[pos_case] = case_vec
            # Negative samples
            neg_case_list = case_dict['neg']
            neg_case_vec_dict = {}
            for neg_case in neg_case_list:
                case_words = segmentor.segment(neg_case)
                case_vec = []
                is_useful = 0
                for word in case_words:
                    try:
                        # concatenate
                        case_vec.extend(word_vec_model[unicode(word)].tolist())
                        is_useful = 1
                    except Exception, e:
                        with open("./data/not_in_vocabulary.txt",
                                  'a') as out_file:
                            # record out-of-vocabulary words
                            out_file.write(word + '\n')
                # truncate or pad to exactly 500 dims
                if len(case_vec) > 500:
                    case_vec = case_vec[0:500]
                else:
                    while (len(case_vec) < 500):
                        case_vec.append(0)
                if is_useful:
                    neg_case_vec_dict[neg_case] = case_vec
Example #38
0
class PreProcessor(object) :
    def __init__(self , cws_model_path=CWS_MODEL_PATH , stop_words_dir=STOP_WORDS_DIR) :
        self.raw_data = None
        self.processed_data = None
        self.words_dict = None
        self.STOP_WORDS = self._load_stop_words(stop_words_dir) 
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)

    def _load_stop_words(self , dir_name) :
        stop_words = set()
        cur_abs_dir_path = os.path.split(os.path.abspath(__file__))[0]
        dir_path = os.path.join(cur_abs_dir_path , dir_name)
        for file_name in os.listdir(dir_path) :
            file_path = os.path.join(dir_path , file_name) 
            with open(file_path) as f :
                for line in f :
                    word = line.strip()
                    stop_words.add(word)
        for symbol in SENT_SPLIT_SYMBOLS :
            stop_words.add(symbol)
        return stop_words

    def load_raw_data(self , path) :
        with open(path) as f :
            self.raw_data = json.load(f)
    
    def _split_sentence(self , content) :
        '''
        split the content into sentences
        '''
        sents = []
        paras = content.split("\n")
        for paragraph in paras :
            split_rst = re.split(ur"[%s]+" %(SENT_SPLIT_SYMBOLS) , paragraph) # has space 
            sents.extend(split_rst)
        return sents
    
    def _segment(self , unicode_line) :
        '''
        return : list of words
        '''
        utf8_line = unicode_line.strip().encode("utf8")
        words = list(self.segmentor.segment(utf8_line))
        return words
    
    def _make_doc_data(self , url , title_seged , sents_seged) :
        return { 'url' : url ,
                 'title' : title_seged ,
                 'content' : sents_seged
                 }

    def _add_word2words_dict(self , words) :
        for word in words :
            if word not in self.STOP_WORDS :
                word = word.lower() 
                self.words_dict.add(word)

    def do_preprocessing(self) :
        logging.info("do preprocessing ...")
        self.processed_data = dict()
        self.words_dict = set()
        for page_id , page_data in self.raw_data.items() :
            url = page_data['url']
            title = page_data["title"]
            content = page_data["content"]
            sents = self._split_sentence(content)
            # segment
            title_words = self._segment(title)
            content_words = []
            for sent in sents :
                content_words.extend(self._segment(sent))
                content_words.append(" ") # another space to avoid that they become one line when merging at output snippet 
            self.processed_data[page_id] = self._make_doc_data(url , title_words , content_words)
            self._add_word2words_dict(title_words + content_words)
        logging.info('done.')
    
    def save_doc_data(self , to_path) :
        logging.info("saving doc data to ` %s `" %(to_path) )
        with open(to_path , 'w') as of:
            json.dump(self.processed_data , of )
        logging.info("done.")

    def save_words_dict(self , to_path) :
        logging.info("saving words dict to ` %s `" %(to_path))
        words_list = list(self.words_dict)
        words_dict = {word : word_id for word_id , word in enumerate(words_list) }
        with open(to_path , 'w') as of :
            json.dump(words_dict , of , ensure_ascii=False) # json cannot serialize a `set`, so dump a dict instead
        logging.info("done.")
Example #39
0
from pyltp import Segmentor
from pyltp import Postagger

seg = Segmentor()
seg.load("../ltp_model/cws.model")
words = seg.segment("你是那人间的四月天。")
print("| ".join(words))

pos = Postagger()
pos.load("../ltp_model/pos.model")
# postag expects the segmented word list, not a whitespace-joined string
postags = pos.postag(words)

for word, postag in zip(words, postags):
    print(word + '/' + postag, end=' ')
Example #40
0
class Ltp(object):
    """docstring for Ltp"""
    def __init__(self):
        print 'Ltp:: __init__'
        self.segmentor = Segmentor()
        self.segmentor.load('model/cws.model')
        self.lexicon = None

    # end

    def load_lexicon(self, path):
        """ 加载专有名词词典
		@params <String> path 词典列表
		"""
        print 'Ltp:: load_lexicon'
        self.lexicon = Lexicon(path)

    # end

    def cut_to_word(self, data, tab_index=0):
        """ 把文本数据切分成词,返回词列表
		@params <list> file模块读取的数据
		@params <int> 按制表符分割后,需要分词的文本的索引
		@return <list> 词列表
		"""
        print 'Ltp:: cut_to_word'
        content = list()
        for line in data:
            line = line.strip()
            if line == "":
                continue
            text = line.split('\t')
            if len(text) < tab_index + 1:
                continue
            text = text[tab_index]

            proper_noun_list = list()
            if self.lexicon:
                text = self.lexicon.filter(text)

            words_list = self.segmentor.segment(text)
            for word in words_list:
                content.append(word)

        print 'Ltp:: cut_to_word done'
        return content

    # end

    def sentence_cut_to_words(self, sentence, tab_index=0):
        """
		把一句话分词,先不考虑专有名词
		"""
        print 'Ltp:: sentence_cut_to_words'
        sentence = sentence.strip()
        if sentence == "":
            return False
        text = sentence.split('\t')
        if len(text) < tab_index + 1:
            return False
        text = text[tab_index]

        words_line = ""
        words_list = self.segmentor.segment(text)
        for word in words_list:
            words_line += word + " "
        return words_line

    # end

    def article_cut_to_words(self, article, tab_index=0):
        """ 把一个文本文件分词
		@params article 是file读取的文本数据
		"""
        print 'Ltp:: article_cut_to_words'
        content = ""
        for line in article:
            sentence = self.sentence_cut_to_words(line, tab_index)
            if sentence is False:
                continue
            content += sentence + "\n"
        return content

    # end

    def get_word_freq(self, words_list):
        """ 统计词频
		@params <list> words_list 词列表
		@return <dict> word_freq_dict 词频字典
		"""
        print 'Ltp:: get_word_freq'
        word_freq_dict = dict()
        for word in words_list:
            if not word_freq_dict.has_key(word):
                word_freq_dict[word] = 0
            word_freq_dict[word] += 1
        print 'Ltp:: get_word_freq done'
        return word_freq_dict
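
# A hedged usage sketch for the Ltp wrapper above; 'model/cws.model' must exist
# as in __init__, and 'corpus.txt' is a hypothetical tab-separated input file.
ltp = Ltp()
with open('corpus.txt') as f:
    words = ltp.cut_to_word(f.readlines(), tab_index=0)
freq = ltp.get_word_freq(words)
top10 = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top10)  # ten most frequent words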
Example #41
0
    def segment(self, sent):
        segmentor = Segmentor()
        segmentor.load(os.path.join(MODELDIR, "cws.model"))
        words = segmentor.segment(sent)
        seg_sent = " ".join(words)
        return seg_sent
Example #42
0
ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR=os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-strings parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
class pnn_count():
	def __init__(self):
		self.mydict = {}
		self.lines = []
		self.lines_num = 3000
		self.c = [0,0,0] #PNN
		self.w_c = [{},{},{}]
		self.segmentor = Segmentor()
		self.segmentor.load('cws.model')
		self.read_file()
		self.train()
		self.test()
	def read_file(self):
		f = open('pnn_annotated.txt','r')
		self.lines = f.readlines()
		f.close()
	def train(self):
		for i in range(0,self.lines_num/5*4):
			line = self.lines[i]
			line.strip('\n')
			line_array = line.split('\t')
			line = line_array[1]
			words = self.segmentor.segment(line)
			if line_array[0] == '1':
				pos = 0
			elif line_array[0] =='0':
				pos = 1
			else:
				pos = 2
			for i in words:                          #calculate frequency
				if self.w_c[pos].has_key(i):
					self.w_c[pos][i] += 1
				else:
					for a in range(0,3):
						self.w_c[a][i] = 0
					self.w_c[pos][i] += 1
			self.c[pos] += 1

	def test(self):
		count = 0
		v = len(self.w_c[0])  # vocabulary size seen in training, used for add-one smoothing
		for a in range(self.lines_num / 5 * 4, len(self.lines)-1):
			wholeline = self.lines[a]
			print wholeline
			result = [0.0,0.0,0.0]
			line_array = wholeline.split('\t')
			line = line_array[1]
			words = self.segmentor.segment(line)
			for i in range(0,3):
				pci = 1.0 * self.c[i] / (self.lines_num/5 *4)
				pwci = 1.0
				sum_i = 0
				for q in self.w_c[i].keys():
					sum_i += self.w_c[i][q]
				for k in words:
					if self.w_c[i].has_key(k):
						pwci = pwci * (self.w_c[i][k] + 1) / (sum_i + v)
				result[i] = pci * pwci
			maxi = 0
			for i in range(0,3):
				if result[i]>result[maxi]:
					maxi = i
			if maxi ==0:
				if line_array[0] == '1':
					count += 1
				print "my guess is positive"
			elif maxi==1:
				if line_array[0] == '0':
					count += 1
				print "my guess is neuter"
			else:
				if line_array[0] == '-1':
					count += 1
				print "my guess is negative"
		print  count * 1.0 /(self.lines_num/5)