Example #1
def seg(content):
    # Set your own model path
    MODELDIR = "/home/liuqi/ltp/pyltp/ltp_data/"
    segmentor = Segmentor()
    segmentor.load(MODELDIR + "cws.model")
    tWords = segmentor.segment(content)
    return tWords
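The snippet above assumes `Segmentor` is already imported and leaves the model loaded after each call. A minimal self-contained sketch of the same idea, using a hypothetical `seg_once` helper and assuming the same model directory exists, copies the tokens out before releasing the model:

# Minimal sketch, assuming pyltp is installed and the model directory below exists.
import os
from pyltp import Segmentor

MODELDIR = "/home/liuqi/ltp/pyltp/ltp_data/"  # path taken from the snippet above

def seg_once(content):
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    try:
        return list(segmentor.segment(content))  # copy tokens out of the native vector
    finally:
        segmentor.release()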
Example #2
def split_words(sentence="中国进出口银行与中国银行加强合作", type_list=False):
    """Segment a sentence; if type_list is true, return the result as a Python list."""
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment(sentence)
    if type_list:
        return [i for i in words]
    return words
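A short usage sketch, assuming `MODELDIR` is configured as in the surrounding project; converting to a list, as the `type_list` branch does, keeps the tokens usable even after the segmentor is released:

# Hypothetical call; the sample sentence comes from the function signature above.
tokens = split_words("中国进出口银行与中国银行加强合作", type_list=True)
print(tokens)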
Example #3
class pnn_count():
	def __init__(self):
		self.mydict = {}
		self.segmentor = Segmentor()
		self.segmentor.load('cws.model')
		self.hash_dict()
		self.ltp_process()
	def ltp_process(self):
		sentence_num = 0
		right_num = 0;
		f = open('pnn_annotated.txt','r')
		for line in f:
			sentence_num += 1
			#print line
			line_array = line.split('\t')
			line = line_array[1]
			count = 0
			words = self.segmentor.segment(line)
			for i in words:
				if i in self.mydict:
					count = count + self.mydict[i]
			if count > 0:		
				answer = "positive"
				if line_array[0] == '1':
					right_num += 1
			elif count == 0:
				answer = "neuter"
				if line_array[0] == '0':
					right_num += 1
			else:
				answer = "negative"
				if line_array[0] == '-1':
					right_num += 1
			#print "My guess is %s" %answer
			#print "THe right answer is %s" %line_array[0]

			#print "result  %d" % count
		f.close()
		print "total sentence is %d, right answer is %d" %(sentence_num,right_num)
	def hash_dict(self):
		f = open('negative.txt','r')
		for line in f:
			line = line.strip('\n')
			line = line.strip('\r')
			self.mydict[line] = -1
		f.close()
		f = open('positive.txt','r')
		for line in f:
			line = line.strip('\n')
			line = line.strip('\r')
			self.mydict[line] = 1
		f.close()
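Instantiating the class runs the whole pipeline from `__init__`. A usage sketch, assuming `cws.model`, `positive.txt`, `negative.txt` and `pnn_annotated.txt` all sit in the working directory:

# Hypothetical driver for the class above (Python 2, like the original).
if __name__ == '__main__':
	pnn_count()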
Example #4
def process(index):

	ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
	sys.path.append(os.path.join(ROOTDIR, "lib"))

	# Set your own model path
	MODELDIR=os.path.join(ROOTDIR, "ltp_data")

	segmentor = Segmentor()
	segmentor.load(os.path.join(MODELDIR, "cws.model"))

	finname = "o_"+str(index)+".txt"
	foutname = "p_"+str(index)+".txt"
	print finname
	count = 0
	fin = codecs.open(finname, encoding='utf-8')
	with codecs.open(foutname, 'w', encoding="utf-8") as fout:
		while 1:
			line = fin.readline()
			if not line:
			    break
			tmp = line.split(" ^ {")[1] # Get JSON
			tmp = "{"+tmp
			data = json.loads(tmp)
			content = data['content']
			# error_correction(content)
			content = content.strip()
			segmentation = ""
			for line in content.split("\n"):
				line = line.encode("utf-8")
				words = segmentor.segment(line)
				segmentation += "/".join(words)
				segmentation += "/"

			# segment() returns a byte str, not unicode, so convert back to unicode here.
			segmentation = unicode(segmentation, "utf-8")
			pinyin = add_pinyin(segmentation)
			obj = {}
			obj['flavor'] = data['flavor']
			obj['environment'] = data['environment']
			obj['service'] = data['service']
			obj['content'] = data['content']
			obj['segmentation'] = segmentation
			obj['pinyin'] = pinyin
			tmpstr = json.dumps(obj,ensure_ascii=False)
			fout.write(tmpstr)
			fout.write('\n')
			count += 1
			print count
		segmentor.release()
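`process()` reads `o_<index>.txt`, writes `p_<index>.txt`, and relies on an `add_pinyin()` helper defined elsewhere in the project. A hypothetical driver, assuming the indexed input files exist:

# Hypothetical driver; the range of indices is an assumption.
if __name__ == "__main__":
	for idx in range(4):
		process(idx)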
Example #5
def segmentation(filename, output_filename):

    print "segmenting '%s' to '%s'" % (filename, output_filename)

    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"

    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    
    # named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    
    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0]+"_ner.txt", "w")

    for _line in lines:
        line = _line.rstrip("\r\n")
        
        words = segmentor.segment(line)
        postags = postagger.postag(words)
#        netags = recognizer.recognize(words, postags)
#        arcs = parser.parse(words, postags)

        for i in range(len(words)):
            f.write( "%s/%s\t" % (words[i], postags[i]))
#            if netags[i]!='O':
#                fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
#        fner.write("\n")

    f.close()
    fner.close()
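A usage sketch, assuming `./ltp_data/` holds the LTP models; note that the function loads four models on every call and releases none of them, so calling it in a loop would be expensive:

# Hypothetical call; file names are placeholders.
segmentation("corpus.txt", "corpus_segged.txt")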
Example #6
    def __init__(self):
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # word segmentation model, `cws.model`
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS tagging model, `pos.model`
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # named entity recognition model, `ner.model`
        segmentor = Segmentor()
        segmentor.load(self.cws_model_path)
        self.words = segmentor.segment(data)
        # print("|".join(words))
        segmentor.release()


        postagger = Postagger()  # initialize the POS tagger
        postagger.load(self.pos_model_path)  # load the model
        self.postags = postagger.postag(self.words)  # POS tagging
        # print('\t'.join(postags))
        postagger.release()  # release the model


        recognizer = NamedEntityRecognizer()  # initialize the recognizer
        recognizer.load(self.ner_model_path)  # load the model
        self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
        # print('\t'.join(netags))
        recognizer.release()  # release the model
Example #7
def pyltp_words():
    from pyltp import Segmentor, Postagger
    segmentor = Segmentor()
    segmentor.load("/home/fredgan/github/pyltp/ltp_data/cws.model")
    # postagger = Postagger()    
    # postagger.load("~/github/pyltp/ltp_data/cpos.model")
    for line in open(sys.argv[1], 'r'):
        try:
            style, sentence = line.strip().split('\t')
        except ValueError:  # skip malformed lines
            continue
        style_dic.setdefault(style, {})
        words = segmentor.segment(sentence)
        # postags = postagger.postag(words)
        for w in words:
            if w in style_dic[style]:
                style_dic[style][w] += 1
            else:
                style_dic[style][w] = 1

    for k,v in style_dic.iteritems():
        v_list = sorted(v.iteritems(), key = lambda d:d[1], reverse = True)
        print k+ "\t" + " ".join(map(lambda i:i[0] + ":" +str(i[1]), v_list[0:min(50,len(v_list))]))
Example #8
File: psg_proc.py Project: bsnsk/QA
def main():

    f = open("psgs.txt", "r")
    lines = [line.rstrip() for line in f.readlines()]
    f.close()

    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))

    f = open("../questions/q_facts_segged_clf.txt", "r")
    types = f.readlines()
    f.close()

    f = open("../questions/provided/q_facts.txt", "r")
    questions = [line.rstrip() for line in f.readlines()]
    f.close()

    f = open("psgs_segged.txt", "w")
    fans = open("zhidao_answer.txt", "w")
    i = 0
    qid = 0
    flag = 0

    while i < len(lines):
        line = lines[i]
        if (i % 50000 == 0):
            print "\r#\t%d" % i,
            sys.stdout.flush()
        if line.startswith("<question"):
            qid = int(line.split(" ")[1].split("=")[1].split(">")[0])
            flag = 0
            f.write(line + "\n")
        elif line.startswith("</doc") or line.startswith("</question"):
            f.write(line + "\n")
        elif line.startswith("<doc"):
            f.write(line + "\n" + lines[i+1] + "\n")
            i += 2
        else:
            L = len(line)
            s = 0
            for s in range(L):
                if line[s:].startswith("最佳答案:") \
                        or line[s:].startswith("[专业]答案")\
                        or line[s:].startswith("、"+questions[qid-1]):
                    break
            if line[s:].startswith("最佳答案"):
                s += 14
            elif line[s:].startswith("[专业]答案"):
                s += 15
            elif line[s:].startswith("、"+questions[qid-1]):
                s += len(questions[qid-1])+1
            if s < L and flag == 0:
                t = s + 1
                while t < L and line[t:].startswith("更多") == False\
                        and not (t+2<L and line[t]==" " and line[t+1] in "0123456789" and line[t+2] in "0123456789")\
                        and not line[t:].startswith("~")\
                        and not line[t:].startswith("?")\
                        and not line[t:].startswith("!")\
                        and not line[t:].startswith("。"):
                    t += 1
                if s < t and t-s < 200 and t-s > 1:
                    ans = line[s:t].rstrip(".。 ??,,")
                    if types[qid-1].rstrip() == "Q_number":
                        ans = first_con_number(ans)
                    fans.write("%d\t%s\n" % (qid, ans))
                    flag = 1
#            words = segmentor.segment(line)
#            postags = postagger.postag(words)
#            for j in range(len(words)):
#                f.write("%s/%s\t" % (words[j], postags[j]))
#            f.write("\n")
        i += 1
    f.close()
    fans.close()
Example #9
    # list = []
    # with open(file, 'r') as f:
    #     lines = f.readlines()
    #     for line in lines:
    #         str = line.split('\t')[1]
    #         try:
    #             list.index(str)
    #         except:
    #             list.append(str)
    #             print(str)
    # print(list)
    #[ '食品餐饮','旅游住宿', '金融服务', '医疗服务', '物流快递']

    from pyltp import Segmentor, Postagger
    seg = Segmentor()
    seg.load('cws.model')
    poser = Postagger()
    poser.load('pos.model')
    real_dir_path = os.path.split(os.path.realpath(__file__))[0]  # directory of this file
    stop_words_file = os.path.join(real_dir_path, 'stopwords.txt')
    # POS tags that are allowed through
    allow_pos_ltp = ('a', 'i', 'j', 'n', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz',
                     'v', 'ws')

    def cut_stopword_pos(s):
        words = seg.segment(''.join(s.split()))
        poses = poser.postag(words)
        stopwords = {}.fromkeys([
            line.rstrip()
            for line in open(stop_words_file, 'r', encoding='UTF-8')
        ])
Example #10
# -*- coding:utf-8 -*-
# segment
from pyltp import Segmentor
segmentor = Segmentor()
segmentor.load('./ltp-model/cws.model')

def segment(text):
    return segmentor.segment(text)
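The module-level model stays loaded for the lifetime of the process, so `segment()` is cheap to call repeatedly. A short usage sketch:

# segment() returns pyltp's native string vector; join or list() it as needed.
print("/".join(segment("元芳你怎么看")))
segmentor.release()  # release once the process is done segmenting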
Example #11
class AutoExtraction:
    """
    新闻言论自动抽取
    """
    def __init__(self):
        """
        初始化模型
        """
        self.seg_sent = SentenceSplitter()  # 分句
        self.seg = Segmentor()  # 分词
        self.seg.load(cws_model_path)
        self.pos = Postagger()  # 词性标注
        self.pos.load(pos_model_path)
        self.ner = NamedEntityRecognizer()  # 命名实体识别
        self.ner.load(ner_model_path)
        self.par = Parser()  # 依存分析
        self.par.load(par_model_path)
        self.similar_word = load_similar_word()  # 读取相似词列表

    def _sentence_split(self, content):
        sentences = self.seg_sent.split(content)
        return [s for s in sentences if len(s) != 0]

    def _del_punctuation(self, sent):
        """
        1.分词
        2.移除标点符号
        3.再次分词
        """
        sent_seg = self._cut(sent)
        item_list = [
            item.strip() for item in sent_seg if item.strip() not in all_punc
        ]
        sent_seg = self._cut(''.join(item_list))
        return sent_seg

    def _cut(self, sent):
        return ' '.join(self.seg.segment(sent))

    def _pos(self, sent):
        words = sent.split(' ')
        pos_tags = self.pos.postag(words)
        return list(pos_tags)

    def _ner(self, sent, pos_tags):
        sentence_tag = self.ner.recognize(sent.split(' '), pos_tags)
        return list(sentence_tag)

    def _par(self, sent, sentence_tag):
        arcs = self.par.parse(sent, sentence_tag)
        return [(arc.head, arc.relation) for arc in arcs]

    @classmethod
    def _exist_ner(cls, sentence_tag):
        """
        判断句子的ner结果是否存在实体,并返回实体内容
        """
        # Ni Ns Nhr
        ner_dic = defaultdict(int)
        ner_set = [
            'S-Ni', 'S-Ns', 'S-Nh', 'B-Ni', 'B-Ns', 'B-Nh', 'I-Ni', 'I-Ns',
            'I-Nh', 'E-Ni', 'E-Ns', 'E-Nh'
        ]
        i = 0
        while i < len(sentence_tag):
            for j in range(i, len(sentence_tag)):
                if sentence_tag[j] not in ner_set: break
            if j == i:
                i += 1
            else:
                ner_dic[i] = j
                i = j
        return ner_dic

    @classmethod
    def _tf_idf(cls, text_list):
        """
        计算tf-idf
        """
        tf_idf = TfidfVectorizer()
        return tf_idf.fit_transform(text_list)

    @classmethod
    def _cosine_sim(cls, x1, x2):
        """
        文本相似性
        """
        return cosine_similarity(x1, x2)

    def _has_next_sentence(self, x1, x2, threshold):
        """
        判断是否有下一句话
        """
        sim = self._cosine_sim(x1, x2)[0][0]
        if sim > threshold:
            print(sim)
            return True
        return False

    def process(self, content):
        """
        content: 输入的新闻预料
        return: 输出人物和对应言论
        """
        # 1. 分句
        sents = self._sentence_split(content)
        # 2. 分词、去标点
        sents_ = [self._del_punctuation(s) for s in sents]
        # 3. 词性标注
        postags = [self._pos(s) for s in sents_]
        # 4. 命名实体识别
        netags = [self._ner(s, p) for s, p in zip(sents_, postags)]
        # 5. 依存句法分析
        arcs_list = [
            self._par(w.split(' '), n) for w, n in zip(sents_, netags)
        ]
        # 6. tf-idf
        tf_idf_vec = self._tf_idf(sents_)

        extract_result = []
        for index, netag in enumerate(netags):
            ner_dic = self._exist_ner(netag)
            # print(ner_dic)
            if not ner_dic:  # 判断是否存在实体
                continue

            words = sents_[index].split(' ')
            # print(words)
            subject_verb = defaultdict(int)
            # (i, arc[0]-1)
            for i, arc in enumerate(arcs_list[index]):
                if arc[1] == 'SBV':  # [(subject_index, verb_index),...]
                    if (arc[0] - 1) not in subject_verb.keys():
                        subject_verb[arc[0] - 1] = i
                    else:
                        if i > subject_verb[arc[0] - 1]:
                            subject_verb[arc[0] - 1] = i
                            # print('words:{}\n ner:{}\n arcs:{}\n s_b:{}\n'.format(words,netags[index], arcs,subject_verb))

            for v, s in subject_verb.items():  # 根据句法分析获得的 实体索引 s 和 动词索引 v

                if words[v] in self.similar_word:  # 判断动词是否为相似词
                    # print('s:{},v:{}'.format(s,v))

                    # 如果SBV的 subject 不在实体,则任选一个距离s和v最近的实体作为 subject
                    if s in ner_dic.keys():
                        subject = ''.join(words[s:ner_dic[s]])
                    else:
                        l = [(n[0], n[1], s - n[0])
                             for n in list(ner_dic.items())
                             if n[0] < v and n[0] < s]
                        if l:  # 如果前面不存在实体, 则选择非实体词
                            start, end, _ = min(l, key=lambda x: x[2])
                            subject = ''.join(words[start:end])
                        else:
                            subject = words[s]

                    said = words[v]

                    #  判断下一句话是否与当前是同一个语境:1)存在下一句话 2)两句话相似 3)下一句不存在实体
                    speech = sents[index].split(words[v])[1]
                    if index < len(netags) - 1 and self._has_next_sentence(
                            tf_idf_vec[index], tf_idf_vec[index + 1],
                            0.1) and not self._exist_ner(netags[index + 1]):
                        # print('similar:{},{}'.format(sents[index], sents[index + 1]))
                        speech += sents[index + 1]

                    extract_result.append((subject, said, speech))

        return extract_result

    def release(self):
        """
        :释放模型
        """
        self.seg.release()
        self.pos.release()
        self.ner.release()
        self.par.release()
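The class depends on module-level names that are not shown in this snippet (`cws_model_path`, `pos_model_path`, `ner_model_path`, `par_model_path`, `all_punc`, `load_similar_word`). A hedged usage sketch, assuming those are defined elsewhere in the module:

# Hypothetical usage; `news_text` is a placeholder for a real news article.
extractor = AutoExtraction()
news_text = "..."
for speaker, said_word, speech in extractor.process(news_text):
    print(speaker, said_word, speech)
extractor.release()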
Example #12
    ret = []
    for text_word in text_words:
        if text_word not in stopwords:
            ret.append(text_word)
    return ret


def write_result(data: list):
    """
    将结果写入文件
    :param data: 结果
    :return: None
    """
    with open("data/preprocessed.json", "w", encoding="utf-8") as f:
        for line in data:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    print("Loading stop words and data!")
    stop_words = get_stop_words()
    need_segs = get_need_seg_file()
    print("Initializing Segmentor!")
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    print("Segmenting!")
    results = seg(stop_words, need_segs, segmentor)
    segmentor.release()
    write_result(results)
    print("Finish!")
Example #13
from pyltp import Segmentor, Postagger, Parser
import os
import pickle

LTP_DATA_DIR = r"D:\myprojects\LTP\ltp_data_v3.4.0"
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

segmentor = Segmentor()  # initialize instance
segmentor.load(cws_model_path)  # load the model
postagger = Postagger()
postagger.load(pos_model_path)
parser = Parser()
parser.load(par_model_path)

## =========== strip the extraneous leading/trailing parts ===========
# content=[
#     [('现在', 'nt'), ('已经', 'd'), ('坏', 'a'), ('了', 'u'), (',', 'wp'), ('假', 'a'), ('的', 'u'), ('数据线', 'n')]
#      ...
# ]
content = []
sen_feature = []  # for each sentence, store its words, POS tags and dependency parse

with open('./data/file.txt', 'rt', encoding='utf-8') as f1:
    for l in f1:
        line = l.split(',', 4)[4][2:-4]  # strip the extraneous leading/trailing parts
        words = segmentor.segment(line)  # word segmentation
        postags = postagger.postag(words)  # POS tagging
        arcs = list(parser.parse(words, postags))  # dependency parsing
        word_pos = list(zip(words, postags))
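The snippet loads three models and never releases them; a release step, mirroring the other examples in this collection, might follow once the file has been processed:

# Sketch: free the models after processing (not shown in the original snippet).
segmentor.release()
postagger.release()
parser.release()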
Example #14
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR=os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-strings parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
Example #15
class LtpParser:
    def __init__(self):
        print(111)
        LTP_DIR = "D:\\ltp_data\\ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))
        print(111)

    '''Semantic role labelling'''

    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    '''Dependency parsing: maintain, for every word in the sentence, a dict of its syntactic child nodes'''

    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc indices start at 1
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # dependency head ids
        relation = [arc.relation for arc in arcs]  # dependency relations
        heads = ['Root' if id == 0 else words[id - 1]
                 for id in rely_id]  # words of the dependency heads
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parsing entry point'''

    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
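A usage sketch, assuming the LTP models (including `pisrl_win.model`) are installed under `LTP_DIR`; the sentence is chosen to be consistent with the example in the comment inside `build_parse_child_dict`:

# Hypothetical usage of the class above.
ltp = LtpParser()
words, postags, child_dict_list, roles_dict, format_parse_list = \
    ltp.parser_main("李克强总理今天来我家了。")
print(words)
print(format_parse_list)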
Example #16
    data_processed.close()
    return entities


def dump_result_to_csv(target):
    writer = csv.writer(target, delimiter=',')
    writer.writerow(['entity', 'times', 'length', 'source'])
    for k, v in sorted_entities:
        writer.writerow([k, v[0], len(k), v[1]])


if __name__ == '__main__':
    segmentor = Segmentor()  # Initialize model
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    segmentor.load('cws.model')  # Load model
    postagger.load('pos.model')
    recognizer.load('ner.model')

    # Load raw data
    data_csv = pd.read_csv('../data.csv')

    # Fetch column 'title'
    datas = data_csv['title']

    # Run in different method according to arg.method
    if args.method == 'postagger':
        entities = PostagResult(datas, postagger, segmentor)
        target_file = open('target_postagger.csv', 'w', encoding='utf-8-sig')
    elif args.method == 'recognizer':
        entities = NameEntityResult(datas, postagger, segmentor, recognizer)
Example #17
import random
import os, re
from tqdm import tqdm
from pyltp import Segmentor
LTP_DIR = "/home/zxsong/workspace/ltp_data_v3.4.0"
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))


def split_sentences(article):
    '''
    Split an article into sentences.
    :param article: str
    :return: list(str)
    '''
    article = article.strip()
    para = re.sub(r'([。!!??\?])([^”’])', r"\1\n\2", article)  # single-character sentence enders
    para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)  # English ellipsis
    para = re.sub(r'(…{2})([^”’])', r"\1\n\2", para)  # Chinese ellipsis
    para = re.sub(r'([。!!??\?][”’])([^,。!!??\?])', r'\1\n\2', para)
    para = para.rstrip()
    return para.split("\n")
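A quick usage example of the regex-based splitter, reusing the sample paragraph that appears in another example in this collection:

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'
print(split_sentences(paragraph))
# ['中国进出口银行与中国银行加强合作。', '中国进出口银行与中国银行加强合作!']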


def cut_sentence(sentence, cut_level):
    '''
    Tokenize a sentence, using character-level segmentation.
    :param sentence: str
    :return: list(str)
    '''
    # TTnews contains the special symbol <Paragraph>; keep it as a special token
Example #18
# -*- coding: utf-8 -*-
from pyltp import Segmentor
import jieba
import sys

segmentor = Segmentor()
segmentor.load("model/cws.model")

reload(sys)
sys.setdefaultencoding('utf8')

# words = segmentor.segment("元芳你怎么看")
# print words
# print "|".join(words)
# segmentor.release()



def simple_word_segment():
    """
    Segment the simplified-Chinese wiki corpus with LTP's default settings,
    i.e. without adding a user dictionary and without personalized segmentation.
    :return:
    """
    with open('corpus/wiki_hans.txt', 'r') as in_file:
        count = 0
        for line in in_file.readlines():
            count += 1
            if count % 1000 == 0:
                print count
            try:
Example #19
class SentenceParse(ModelPath):
    def __init__(self):
        # 分词
        self.segmentor = Segmentor()
        self.segmentor.load(self.ltp_cws)

        # 词性标注
        self.postagger = Postagger()
        self.postagger.load(self.ltp_pos)

        # 命名实体识别
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ltp_ner)

        # 依存句法分析
        self.parser = Parser()
        self.parser.load(self.ltp_parser)

        # 语义角色标注
        self.labeller = SementicRoleLabeller()
        self.labeller.load(self.ltp_pisrl)

    def __call__(self, sentence, *args, **kwargs):
        words = self.get_words(sentence)
        postags = self.get_postagger(words)
        ner = self.get_recognizer(words, postags)
        arcs = self.get_parse(words, postags)
        # roles = self.get_rolelabel(words, postags, arcs)
        parse_result = {
            "sentence": sentence,
            "words": words,
            "postags": postags,
            "ner": ner,
            "arcs": [(arc.head, arc.relation) for arc in arcs],
            # "roles": [
            #     [role.index, [[arg.name, (arg.range.start, arg.range.end)] for arg in role.arguments]] for role in roles
            # ],
        }
        # print("*"*80)
        # for k, v in parse_result.items():
        #     print(k, v)
        return parse_result

    # , [arg.name, [arg.range.start, arg.range.end]]
    # for arg in role.arguments]
    def get_sentences(self, news):
        """
        分句
        :param news: str 新闻文本
        :return:  list 句子列表
        """
        return list(SentenceSplitter.split(news))

    def get_words(self, sentence):
        """
        分词
        :param sentence: str 句子
        :return: list 分词列表
        """
        return list(self.segmentor.segment(sentence))

    def get_postagger(self, words):
        """
        词性标注
        :param words:
        :return: list 词性
        """
        return list(self.postagger.postag(words))

    def get_recognizer(self, words, postags):
        """
        命名实体识别
        :param words: list 词列表
        :param postags: list 词性列表
        :return: list 命名实体列表
        """
        return list(self.recognizer.recognize(words, postags))

    def get_parse(self, words, postags):
        """
        依存句法分析
        :param words: list 词列表
        :param postags: list 词性列表
        :return: 依存关系
        """
        arcs = self.parser.parse(words, postags)
        return arcs

    def get_rolelabel(self, words, postags, arcs):
        """
        语义角色标注
        :param words: list 词列表
        :param postags: list 词性列表
        :param arcs: 依存句法分析结果
        :return: 语义角色标注
        """
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def release(self):
        """释放模型"""
        self.segmentor.release()
        self.recognizer.release()
        self.parser.release()
        self.postagger.release()
        self.labeller.release()
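A usage sketch, assuming the `ModelPath` base class supplies the `ltp_cws`, `ltp_pos`, `ltp_ner`, `ltp_parser` and `ltp_pisrl` paths:

# Hypothetical usage of SentenceParse.
sp = SentenceParse()
result = sp("中国进出口银行与中国银行加强合作")
print(result["words"])
print(result["arcs"])
sp.release()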
Example #20
# -*- coding: utf-8 -*-
from pyltp import Segmentor

modelPath = './data/ltp_data_v3.4.0/cws.model'
segmentor = Segmentor()
segmentor.load(modelPath)

sent = '在包含問題的所有解的解空間樹中,按照深度優先搜尋的策略,從根節點出發深度優先搜尋解空間樹'
words = segmentor.segment(sent)
print(' | '.join(words))
Example #21
# coding:utf-8

import gensim
from pyltp import Segmentor
from numpy import *  # for matrix operations
import copy  # for deep copies
import os

qin = 0  # 0 selects Liu Hui's paths, otherwise Qin Wentao's
# load the word segmentation model
segmentor = Segmentor()
if qin == 1:
    segmentor.load('D:/coding/Python2.7/ltp_data_v3.4.0/cws.model')
    model = gensim.models.Word2Vec.load('../Sentence/model_qin')
else:
    segmentor.load('/Users/liuhui/Desktop/实验室/LTP/ltp_data_v3.4.0/cws.model')
    model = gensim.models.Word2Vec.load('../Sentence/model')
vec_size = 100

stoplist = {}
f = open('../stopword.txt', 'r')
for line in f:
    word = line.strip()
    stoplist[word] = 1
f.close()


class Sent:
    def __init__(self, _newsid, _globalid, _paraid, _localid, _sentnum, _content, _vec):
        self.newsid = _newsid  # id of the news article this sentence belongs to
        self.globalid = _globalid  # index of this sentence within the article
Example #22
File: L2R.py Project: Goerwa/QA
    def  get_vec(self,filename):
        def list2str(l):
            r_str = ' '
            for i in l:
                r_str += str(int(i.split(':')[0]) + 10)  +':'+ i.split(':')[1]+ ' '
            return r_str
        def adjust_list(l,words):
            l.insert(0,'"')
            n = len(l)
            cut_list = []
            cut_list.append(l[0])
            for i in range(1,n):
                a = ''
                if cut_list[-1]+l[i] in words:
                    cut_list[-1] = cut_list[-1]+l[i]
                    continue
                while l[i] not in words and len(l[i]) > 1:
                    a = l[i][-1] + a
                    l[i] = l[i][:-1]
                if l[i] in words:
                    cut_list.append(l[i])
                if a != '':
                    if a in words:
                        cut_list.append(a)
            # print(cut_list)
            return cut_list
        def get_tf_idf(data,syn_dict):
            all_words = []
            dict_freq = {}  # 词频
            dict_doc = {}  # 文档数量
            all_num = 0
            words = []
            stop_words = self.stop_word
            tf_idf = {}
            all_sent = []
            for k in data.keys():
                # print(data[k])
                line_list = (data[k].replace('[','').replace(']','').split('", '))
                line_list[-1] = line_list[-1][:-1]
                for i in line_list:
                    line_i = i.split('\t')[1] + '"'
                    all_sent.append(line_i)
                    cut_line = '\t'.join(segmentor.segment(line_i))
                    words_list = cut_line.split('\t')  # 分词
                    is_adddoc = []
                    for word in words_list:
                        if word not in stop_words:
                            if word not in dict_freq.keys():
                                dict_freq[word] = 1
                            else:
                                dict_freq[word] += 1
                            if word not in is_adddoc: # 词出现的问题树
                                if word not in dict_doc.keys():
                                    dict_doc[word] = 1
                                else:
                                    dict_doc[word] += 1
                                is_adddoc.append(word)
            for k in dict_freq.keys():
                idf = math.log(self.N / dict_doc[k])
                tf_idf[k] = 1 + math.log(dict_freq[k])
                tf_idf[k] *= idf
            with open('AC/tf-idf.txt','w') as fr:
                for k in tf_idf.keys():
                    fr.write(k)
                    fr.write('\t')
                    fr.write(str(tf_idf[k]))
                    fr.write('\n')

        def get_feature_vec(q_list,a_list):
            feature  = []
            q_den = 1
            for word in q_list:
                q_den += tf_idf[word]**2
            for sa_list in a_list:
                vec_f = 0
                a_den = 1
                for wa in sa_list:
                    a_den += tf_idf[wa]**2
                    if wa in set(q_list):
                        vec_f += tf_idf[wa]
                den = (q_den * a_den)**0.5
                vec_f /= den
                feature.append(round(vec_f*1000,2))
            return feature

        def get_feature_bm25(q_list,a_list,all_words):
            all_wordsl = list(all_words)
            # print(all_wordsl[174017])
            feature = []
            s = BM25(a_list,all_wordsl)
            # s.simall(q_list)
            # print(s.simall(q_list))
            for i in s.simall(q_list):
                feature.append(i)
            # print(feature)
            return feature

        # def get_feature_sim(q_list,a_list):
        #     feature = []
        #     str_q = ''
        #     for q in q_list:
        #         str_q  = str_q + ' ' + q
        #     for as_list in a_list:
        #         str_sa = ''
        #         for a in as_list:
        #             str_sa = str_sa + ' ' + a
        #         # print(q_list,as_list,synonyms.compare(q_list,as_list))
        #         if len(str_sa) < 1 or len(str_q) < 1:
        #             feature.append(0.0)
        #         else:
        #             feature.append(round(synonyms.compare(str_q, str_sa,seg=False)*1000,3))
        #     return feature

        def get_feature_same(q,a_list):
            r = []
            for sa_list in a_list:
                n = 0
                for a in sa_list:
                    if a in q:
                        n += 1
                r.append(n)
            return r
        def get_DA(words_list):
            postags = postagger.postag(words_list)  # 词性标注
            pos_line = '\t'.join(postags)
            pos_list = pos_line.split('\t')
            # print(pos_list)
            # print(pos_list)
            if pos_list == ['']:
                return []
            netags = recognizer.recognize(words_list, pos_list)  # 命名实体识别
            ner_line = '\t'.join(netags)
            ner_list = ner_line.split('\t')

            arcs = parser.parse(words_list, pos_list)  # 句法分析
            arcs_line = "\t".join("%d %s" % (arc.head, arc.relation) for arc in arcs)
            arcs_list = arcs_line.split('\t')
            r = []
            rsyn = []
            for i in range(len(arcs_list)):
                # print(words_list[int(arcs_list[i][0])-1] + '_' + words_list[i],arcs_list[i][0])
                if pos_list[i][0]  in set({'n','v','a'}):
                    r.append(words_list[int(arcs_list[i][0]) - 1] + '_' + words_list[i])
            return r
        def get_feature_DA(q_list,a_list):
            feature = []
            feature_q = get_DA(q_list)
            feature_a = []
            for sa_list in a_list:
                feature_sa = get_DA(sa_list)
                # print(feature_q)
                # print(feature_sa)
                score = 0.0
                n = 0
                for sa in feature_sa:
                    for q in feature_q:
                        n += 1
                        # print(sa,q)
                        if sa == q:
                            score += 1
                        elif sa.split('_')[0] == q.split('_')[0]:
                            score += 0.5
                        elif sa.split('_')[1] == q.split('_')[1]:
                            score += 0.5
                        else:
                            n -= 1
                # print(score,n)
                if score > 0.4:
                    feature.append(score/n)
                else:
                    feature.append(0.0)
            # print(feature)
            return feature

        fd = open(filename, 'r')
        data = []
        data_dict = {}
        for line in fd:
            # print(line[:-1])
            # print(line[:-1].split('\t')[1])
            data.append(line[:-1])
        for i in range(0,len(data),2):
            data_dict[data[i]] = data[i+1]

        segmentor = Segmentor()
        segmentor.load('cws.model')
        postagger = Postagger()  # 初始化实例
        postagger.load('pos.model')  # 加载模型
        recognizer = NamedEntityRecognizer()  # 初始化实例
        recognizer.load('ner.model')  # 加载模型
        parser = Parser()
        parser.load('parser.model')
        tf_idf = {}
        all_word = []
        answer_vec = []
        fti = open('AC/tf-idf.txt', 'r')
        for line in fti:
            k = line[:-1].split('\t')[0]
            v = line[:-1].split('\t')[1]
            tf_idf[k] = round(float(v), 2)
            all_word.append(k)
        all_word = set(all_word)
        j = 0
        for k in data_dict.keys():
            # print(k,data_dict[k])
            # print(j)
            pos_a = []
            cut_line = '\t'.join(segmentor.segment(k.split('\t')[1][1:-1]))
            words_list = cut_line.split('\t')  # 分词
            words_list = adjust_list(words_list, all_word)
            q_list = []
            a_list = []
            for word in words_list:
                if word not in self.stop_word:
                    q_list.append(word)
            line_list = (data_dict[k].replace('[', '').replace(']', '').split('", '))
            line_list[-1] = line_list[-1][:-1]
            for i in line_list:
                sa_list = []
                line_i = i.split('\t')[1] + '"'
                i_n = i.split('\t')[0]
                pos_a.append(i_n)
                cut_line = '\t'.join(segmentor.segment(line_i[1:-1]))
                words_list = cut_line.split('\t')  # 分词
                words_list = adjust_list(words_list, all_word)
                for word in words_list:
                    if word not in self.stop_word:
                        sa_list.append(word)
                a_list.append(sa_list)
            # print(q_list)
            # print(a_list)
            feature_same = get_feature_same(k.split('\t')[1][1:-1],a_list)
            feature_vec = get_feature_vec(q_list,a_list)
            feature_bm25 = get_feature_bm25(q_list,a_list,all_word)
            # feature_sim = get_feature_sim(q_list, a_list)
            feature_DA = get_feature_DA(q_list,a_list)

            for ni in range(len(pos_a)):
                answer_vec.append(pos_a[ni] + ' 1:' + str(feature_vec[ni]) + ' 2:' + str(feature_DA[ni]) + ' 3:' +
                                  str(feature_same[ni]) + list2str(feature_bm25[ni]))
            j += 1
            if j % 500 == 0:
                print(j)
            # if j == 5:
            #     break
        with open('AC/train.txt', 'w') as fw:
            for avec in answer_vec:
                fw.write(avec)
                fw.write('\n')
Example #23
def build_feature(is_train=True):
    """
    Extract features from the raw data.
    :param is_train: training-mode flag
    :return: writes the extracted features to file
    """
    print("Initializing Segmentor!")
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    # 读取train json文件
    if is_train:
        with open(TRAIN_DATA, 'r', encoding='utf-8') as f:
            questions = [json.loads(line.strip()) for line in f.readlines()]
    else:
        with open(SEARCH_RESULT, 'r', encoding='utf-8') as f:
            questions = [json.loads(line.strip()) for line in f.readlines()]
        questions.sort(key=lambda item_: item_['qid'])  # 按qid升序排序
    # 读入passage json文件
    passage = {}
    with open(SEG_PASSAGE_DATA, encoding='utf-8') as f:
        for line in f.readlines():
            read = json.loads(line.strip())
            passage[read['pid']] = read['document']
    # 读入raw passage json文件
    passage_raw = {}
    with open(RAW_PASSAGE_DATA, encoding='utf-8') as f:
        for line in f.readlines():
            read = json.loads(line.strip())
            passage_raw[read['pid']] = read['document']

    # 建立特征矩阵
    feature = []
    ret = []

    for k in range(len(questions)):
        question = questions[k]
        sents, corpus = [], []
        if is_train:
            cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
            cv.fit(passage[question['pid']])
            tv = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
            tv.fit(passage[question['pid']])
            for sent in passage[question['pid']]:
                corpus.append(sent.split())

        else:
            for pid in question['answer_pid']:
                sents += passage[pid]
                for sent in passage[pid]:
                    corpus.append(sent.split())
            if len(sents) == 0:  # 没有检索到文档
                print("no answer pid: {}".format(question['qid']))
                continue
            cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
            cv.fit(sents)
            tv = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
            tv.fit(sents)

        # 提取 BM25 特征
        bm25_model = bm25.BM25(corpus)
        q = list(segmentor.segment(question['question']))
        scores = bm25_model.get_scores(q)

        if is_train:
            for i in range(len(passage[question['pid']])):
                ans_sent = passage[question['pid']][i]
                feature_array = extract_feature(q, ans_sent, cv, tv)
                feature_array.append(scores[i])
                feature.append(' '.join([str(attr)
                                         for attr in feature_array]) + '\n')
                sen = {}
                if passage_raw[
                        question['pid']][i] in question['answer_sentence']:
                    sen['label'] = 1
                else:
                    sen['label'] = 0
                sen['qid'] = question['qid']
                sen['question'] = question['question']
                sen['answer'] = passage[question['pid']][i]
                ret.append(sen)
        else:
            for i in range(len(sents)):
                feature_array = extract_feature(q, sents[i], cv, tv)
                feature_array.append(scores[i])
                feature.append(' '.join([str(attr)
                                         for attr in feature_array]) + '\n')
                sen = {
                    'label': 0,
                    'qid': question['qid'],
                    'question': question['question'],
                    'answer': sents[i]
                }
                ret.append(sen)
    # 特征写入文件
    feature_path = RAW_FEATURE if is_train else TEST_FEATURE
    with open(feature_path, 'w', encoding='utf-8') as f:
        f.writelines(feature)
    # 句子写入文件
    sentence_path = RAW_SENTENCE if is_train else TEST_SENTENCE
    with open(sentence_path, 'w', encoding='utf-8') as f:
        for sample in ret:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')
    segmentor.release()
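A hypothetical entry point, assuming the path constants (`TRAIN_DATA`, `SEARCH_RESULT`, `SEG_PASSAGE_DATA`, `RAW_PASSAGE_DATA`, `RAW_FEATURE`, `TEST_FEATURE`, `RAW_SENTENCE`, `TEST_SENTENCE`, `cws_model_path`) and the `extract_feature()` helper are defined elsewhere in the project:

# Hypothetical driver for the feature builder above.
if __name__ == '__main__':
    build_feature(is_train=True)   # features for the training set
    build_feature(is_train=False)  # features for the retrieved test passages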
Example #24
# -*- coding: utf-8 -*-
from pyltp import Segmentor
segmentor = Segmentor()
segmentor.load("/Users/xiamin/Downloads/ltp_data/cws.model")
words = segmentor.segment("元芳你怎么看")
print "|".join(words)
segmentor.release()
Example #25
class tokenization():
    def __init__(self):
        self.LTP_DATA_DIR = "/home/mm/Downloads/ltp_data_v3.4.0/"
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')

        self.segmentor = Segmentor()  # 初始化实例
        self.segmentor.load(self.cws_model_path)  # 加载模型
        self.train_res = self.read_train_res()  # 读取tag文本,防止里面有空格去掉空格
        # self.all_co_names = self.FDDC_co_list()

    def read_train_res(self):
        with open(
                '/home/mm/Documents/aliyun-FDDC-2018-Financial-Challenge-/chongzu.train'
        ) as rf:
            train_res = rf.read()
            train_res = re.sub(r'\(', '(', train_res)
            train_res = re.sub(r'\)', ')', train_res)
        return train_res

    def tokenize_enti(self, path11):
        texx, entity_string = convert2txt(path11)
        # sentences = re.split(r'。', texx)
        # sentences.sort(key=len, reverse=True)
        entities = list(set(re.split(r'[\s~、,;/]', entity_string)))
        entities.sort(key=len)
        entities_arrows_list = list(
            set([
                x if '~' in x else '' for x in re.split(r'\s', entity_string)
            ]))
        entities_arrows_list.sort(key=len, reverse=True)
        entities_arrows_list = entities_arrows_list[:-1]
        # 找出结果数据行并且把最后的回车符号去掉
        patt_index = re.findall(r'\d{4,10}', path11)[0]
        res_rows = re.findall(r'(?<=\n){}[^\n]+(?=\n)'.format(patt_index),
                              self.train_res)

        # 以下是整理train——res
        # 遍历结果,发现有简称全称的,把匹配的另一半加进去。
        """主要目的是修正train——res文件,里面有简称或者全称,并不统一,为了让简称全称都出现,
            使用正则提取对应的简称或全称,如果有顿号,把那些字串也分开提取,作为标注的标的,当然是先
            把字符长度小的匹配出来,分词之后也是先把长度长的连起来。没问题的"""
        res_paired = {}  # 临时定义一个res的列表,存储修改后的train res
        for x in range(len(res_rows)):
            res_row = res_rows[x]
            for y in range(6):
                res_paired[str(x) + str(y)] = [re.split(r'\t', res_row)[y]]

        for arrow_str in entities_arrows_list:

            for index, result_row in enumerate(res_rows):

                for indi, res_value in enumerate(re.split(r'\t', result_row)):
                    if indi in [0, 1, 4, 5]:
                        continue
                    res_value_list = res_value.split('、')

                    for res_value_split in res_value_list:
                        if res_value_split in entities and res_value_split in arrow_str:
                            # 找出配对的简称或者全称,添加,如果是股权/估值法/金额直接添加并且continue
                            niki, fullna = re.split(r'~', arrow_str)
                            fullna_first = fullna.split(',')[0]
                            niki_split_list = re.split(r'[/、]', niki)
                            # 对应的全称满足三个条件,长度/逗号  以及含有简称的几个字
                            if res_value_split in niki_split_list \
                                    and len(fullna_first) < 18 \
                                    and re.search(re.sub(r'(?<=[^屄\s])', '\s?', res_value_split), fullna_first):
                                res_paired[str(index) +
                                           str(indi)].append(fullna_first)
                            """ 由全称查简称时候要避免 公司/本公司/上市公司/发起人/申请人/,
                                含有这几个字的要剔除  """
                            if res_value_split == fullna_first:
                                # 对应的简称满足几个条件: 包含在全程里面,不长于4个字,不等于
                                for niki_split in niki_split_list:
                                    if re.search(re.sub(r'(?<=[^屄\s])', '\s?', fullna_first), niki_split)\
                                            and not re.search(r'(^公司$|^本公司$|环境$|^上市公司$|人$|资产|标的|交易|对方|发行|对象|股东|对手|单位)',re.sub(r'\s', '', niki_split)):
                                        res_paired[str(index) +
                                                   str(indi)].append(
                                                       niki_split)

        # 遍历公告的每一句,把每一句送进模型。
        # words_n_words = ''
        # for i in sentences:
        words = self.segmentor.segment(texx)
        words = ' '.join(words)
        # 分词要使用更好的策略,更长一些,避免太短的句子,重复循环浪费流程
        # # 下面是把所有目标主体合并在一起, 把55%股权这样的先分出来,
        # for ent in entities:
        #     # 把words中所有是实体的中间去掉空格。使用双层sub
        #     # 正则还是要多注释啊
        #     """ re.sub(r'(?<=\w)(?=\w)'','\s?',ent) 是把实体里面的每个字符中间插入“\s?”
        #     表示匹配任何以此序列出现但中间可能有空格的情况,分词之后join成空格分割的。然后找出words
        #     中出现这个序列的地方,将其换成没空格的"""
        #     if len(ent) > 1:
        #         if not re.search(r'([\d.]+%的?(?:股权|股份|权益))', ent):  # 如果没有股权关键字,直接加上空格匹配pattern
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', ent)
        #         elif len(ent) > 7: # 如果有股权关键字,且长度比较长,就把前面主体提出来,单独分词
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)',r'\s?', re.sub(r'的?[\d.]+%的?(股权|股份|权益)','', ent))
        #         else:
        #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', ent)
        #         # 下面一句把words中所有符合主体列表的项目,可能被分词分开的,重新合并起来,单独成行,在test时使用
        #         words = re.sub(r'{}'.format(patt_ent), '\s' + ent + '\s', words)

        # 然后把空格都换成回车,words竖起来了。
        # words = re.sub(r'\s', '\n', words)
        # words = re.sub(r'\n+', '\n', words)
        """把words中所有是结果键值的,后缀上tab键和结果索引号。否则后缀tab键和字母o
            目的是好的,就是让模型更容易找到目标,模型不需要判断开始和结束,
            但是这样的正则太难了, 我无法将所有合适的实体
            全部抽出来,而导致标注的缺失,那么还是把任务给模型了"""
        # for x in range(len(res_rows)):
        #     for y in range(6):
        #         index = str(x)+str(y)
        #         tags_list = res_paired[index]
        for index, tags_list in res_paired.items():
            # 表中的小表,可能有一个或多个成员,遍历一下,包括顿号分割的那些都可以标出来了,不影响合并好的实体字符串。
            for sub_res in sorted(tags_list, key=len, reverse=True):
                if not index.endswith('0') and len(sub_res) > 1:
                    patt_sub_res = re.sub(r'(?<=[^屄\s])', '\s?', sub_res)
                    if re.search(r'{}'.format(patt_sub_res), words):
                        spliter = re.findall(patt_sub_res, words)[0]
                        words_split_list = re.split(spliter, words)
                        spliter_tagged = re.sub(r'\s', '屄{}'.format(index[1]),
                                                spliter)
                        words = spliter_tagged.join(words_split_list)
                        # print(words)

                        # words=re.sub(patt_sub_res, sub_res)
                        # words= re.sub(r'{}(?=\n)'.format(sub_res), '\n{}\t{}\n'.format(sub_res, index), words)
            # train——result标注完了,现在标注o,就是把非数字结尾的行加上tab和o
        words = re.sub(r'\s', '\to\n', words)
        words = re.sub(r'(?<=屄\d)', '\n', words)
        words = re.sub(r'屄', '\t', words)
        # words_n_words += words

        # print(words)
        with open(
                '/home/mm/FDDC_datasets_dir/tokenized_datasets_for_anago/chongzu/'
                + res_paired['00'][0] + '.txt', 'w') as af:
            af.write(words)
            print(path11.split("/")[-1])
Example #26
import sys
import os
import re
import numpy as np
from pyltp import Segmentor
segmentor = Segmentor()
segmentor.load("/home/zzj/ltp_data_v3.4.0/cws.model")
dir = './tokenized_dir/'
filelist = os.listdir(dir)
abstract_avr_len = 0
article_avr_len = 0
coun = 0


# def clear_str(line):
#   emoji_pattern = re.compile(
#     u"(\ud83d[\ude00-\ude4f])|"  # emoticons
#     u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
#     u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
#     u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
#     u"(\ud83c[\udde0-\uddff])|"
#     u"[\U00010000-\U0010ffff]"  # flags (iOS)
#     "+", flags=re.UNICODE)
#
#   return emoji_pattern.sub(r'', line)
# def tokenize_stories(stories_dir,file):
#     """将weibo文件夹下对应文件分词,存储在tokenized_stories_dir/下一个个文件"""
#
#     num = 0
#     stories = []
#     stories.append(file)
Example #27
class PyltpAnalyzer(object):
    def __init__(self, fileDir=LTP_DATA_DIR):
        """

        :param filename:
        """
        print('77777&777777777777777')
        self.fileDir = fileDir
        # 初始化分词实例
        self.cws_model_path = os.path.join(
            self.fileDir, 'cws.model')  # 分词模型路径,模型名称为`cws.model`
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)  # 加载模型
        # 初始化标注实例
        self.pos_model_path = os.path.join(
            self.fileDir, 'pos.model')  # 词性标注模型路径,模型名称为`pos.model`
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)  # 加载模型

        # 初始化命名实体识别实例
        self.ner_model_path = os.path.join(
            self.fileDir, 'ner.model')  # NER model path; the model file is `ner.model`
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)  # 加载模型

        #依存句法分析
        self.par_model_path = os.path.join(
            self.fileDir, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
        self.parser = Parser()  # 初始化实例
        self.parser.load(self.par_model_path)  # 加载模型

    def loadSegmentorUserdict(self, user_dict):
        """
        载入用户分词词典
        :param user_dict:
        :return:
        """
        self.segmentor.load_with_lexicon(self.cws_model_path, user_dict)

    def segmentSentence(self, sentence):
        return list(self.segmentor.segment(sentence))

    def segment(self, sentences):
        """

        :param sentences: 句子列表
        :return:句子分词结果
        """
        wordsList = []
        if sentences:
            for sentence in sentences:
                wordsList.append(list(self.segmentor.segment(sentence)))
        return wordsList

    def postag(self, wordsList):
        """

        :param wordsList: 句子分词列表
        :return: 句子分词词性标注结果
        """
        postagsList = []
        if wordsList:
            for words in wordsList:
                postagsList.append(list(self.postagger.postag(words)))
        return postagsList

    def recognize(self, wordsList, postagsList):
        """

        :param wordsList: 句子分词列表
        :param postagsList: 句子标注列表
        :return: 句子命名实体识别结果
        """
        netagsList = []
        if wordsList and postagsList:
            if len(wordsList) == len(postagsList):
                for words, postags in zip(wordsList, postagsList):
                    netagsList.append(
                        list(self.recognizer.recognize(words, postags)))
            else:
                print(
                    "wordsList = {}, len(wordsList) = {} and postagsList = {}, len(postagsList) = {}"
                    .format(wordsList, len(wordsList), postagsList,
                            len(postagsList)))
        else:
            print("wordsList = {}  and postagsList = {}".format(
                wordsList, postagsList))

        return netagsList

    def dependencyParse(self, wordsList, postagsList):
        """

        :param wordsList: 句子分词列表
        :param postagsList: 句子标注列表
        :return: 句子句法分析结果
        """
        arcsList = []
        if wordsList and postagsList:
            if len(wordsList) == len(postagsList):
                for words, postags in zip(wordsList, postagsList):
                    arcsList.append(list(self.parser.parse(
                        words, postags)))  #arc.head 父节点, arc.relation 依存关系
            else:
                print(
                    "wordsList = {}, len(wordsList) = {} and postagsList = {}, len(postagsList) = {}"
                    .format(wordsList, len(wordsList), postagsList,
                            len(postagsList)))
        else:
            print("wordsList = {}  and postagsList = {}".format(
                wordsList, postagsList))

        return arcsList

    def finalize(self):
        """
        释放所有没用到的模型
        :return:
        """
        self.segmentor.release()  # 释放分词模型
        self.postagger.release()  # 释放词性模型
        self.recognizer.release()  # 释放命名实体模型
        self.parser.release()  # 释放依存句法模型
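A usage sketch, assuming `LTP_DATA_DIR` points at the unpacked LTP model directory:

# Hypothetical usage of PyltpAnalyzer.
analyzer = PyltpAnalyzer()
sentences = ["中国进出口银行与中国银行加强合作"]
wordsList = analyzer.segment(sentences)
postagsList = analyzer.postag(wordsList)
print(analyzer.recognize(wordsList, postagsList))
print(analyzer.dependencyParse(wordsList, postagsList))
analyzer.finalize()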
Example #28
            arguments_list.append(
                argument.getAttribute("content").encode("utf-8"))
            print("加入的元素为(" +
                  argument.getAttribute("content").encode("utf-8") + ")")
        relations_item = []
        relations_item.append(trigger_list)
        relations_item.append(arguments_list)
        relations_list.append(relations_item)
print("一共提取到(" + str(len(relations_list)) + ")组事件对")
arguments_list = []

MODELDIR = "/media/lyt312323529/c4175817-9d97-490b-95c6-636149e75a87/Graph_Generate/ltp_data"
print("正在加载LTP模型...")
segmentor = Segmentor()
p = os.path.join(MODELDIR, "cws.model")
segmentor.load(p)
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
print("加载完毕")

events_list = []
for i in range(len(relations_list)):
    event = []
    trigger = []
    print("\n\n处理触发词结果如下")
    if (relations_list[i][0][0] != "is") and (relations_list[i][0][0] != "de"):
        words = segmentor.segment(relations_list[i][0][0])
        #wordStr = "\t".join(words)
        #print(wordStr)
Example #29
def load_source(maindir, word_dict):
    n_processed = 0
    contents_dict = {}
    segmentor = Segmentor()
    segmentor.load(
        '/home/caory/github/table-detection/data/table-v5/ltp_data/cws.model')

    dirlist = os.listdir(maindir)
    for docid in dirlist:
        n_processed += 1
        print('Load Source: doc: %s, rate: %.2f%%' %
              (docid, 100.0 * n_processed / len(dirlist)))
        sys.stdout.flush()
        contents_dict[docid] = {}

        json_path = os.path.join(maindir, docid, 'pages_with_tables')
        if not os.path.exists(json_path):
            continue

        data = read_json(json_path)
        for pageid in data:
            contents_dict[docid][pageid] = {}
            size = data[pageid]['size']
            texts, curves, others, tables = [], [], [], []

            # 获取表格框
            pad, offset = 2, 5
            for box in data[pageid]['tables']:
                left = max(offset, int(math.floor(float(box[0])) - pad))
                right = min(int(math.ceil(float(box[2])) + pad),
                            size[0] - offset)
                top = max(offset,
                          int(math.floor(float(size[1] - box[3])) - pad))
                bottom = min(int(math.ceil(float(size[1] - box[1])) + pad),
                             size[1] - offset)
                if 0 <= left <= right < size[0] and 0 <= top <= bottom < size[
                        1]:
                    tables.append({'position': [left, right, top, bottom]})

            # Extract the text boxes
            for text in data[pageid]['texts']:
                # Get the position of every character
                chars = []
                for char in text['chars']:
                    left = int(math.floor(float(char['box'][0])))
                    right = int(math.floor(float(char['box'][2])))
                    top = int(math.floor(float(size[1] - char['box'][3])))
                    bottom = int(math.floor(float(size[1] - char['box'][1])))
                    if 0 <= left <= right < size[
                            0] and 0 <= top <= bottom < size[1]:
                        chars.append({
                            'position': [left, right, top, bottom],
                            'sentence': char['text'].strip()
                        })

                # Merge characters that are close to each other
                for char in chars:
                    merged = False
                    for i in range(len(texts)):
                        box = texts[i]
                        if char['position'][2] == texts[i]['position'][2] and \
                            char['position'][3] == texts[i]['position'][3] and \
                            text['type'] == texts[i]['type']:
                            if abs(char['position'][0] -
                                   texts[i]['position'][1]) <= 5:
                                texts[i]['position'][1] = char['position'][1]
                                merged = True
                                break
                            elif abs(char['position'][1] -
                                     texts[i]['position'][0]) <= 5:
                                texts[i]['position'][0] = char['position'][0]
                                merged = True
                                break
                    if not merged:
                        texts.append({
                            'position': char['position'],
                            'type': text['type'],
                            'sentence': text['text'].strip()
                        })

            # Special handling to detect page numbers
            for i in range(len(texts)):
                top = texts[i]['position'][2]
                bottom = texts[i]['position'][3]
                if 1.0 * top / size[1] <= 0.85:
                    continue
                is_page = True

                for j in range(len(texts)):
                    if j == i:
                        continue
                    other_top = texts[j]['position'][2]
                    other_bottom = texts[j]['position'][3]
                    if other_bottom >= top:
                        is_page = False
                        break

                if is_page:
                    texts[i]['type'] = 5

            # Turn underline-only text boxes into table lines
            new_texts = []
            for text in texts:
                isline = True
                if 'sentence' in text and text['type'] == 2:
                    for s in text['sentence']:
                        if s != '_':
                            isline = False
                    if isline and len(text['sentence']) >= 3:
                        pos = [
                            text['position'][0], text['position'][1],
                            text['position'][3] - 1, text['position'][3]
                        ]
                        curves.append({'position': pos, 'type': 1})
                    else:
                        new_texts.append(text)
                else:
                    new_texts.append(text)
            texts = new_texts

            # Extract the other boxes (images, etc.)
            for other in data[pageid]['others']:
                left = int(math.floor(float(other['box'][0])))
                right = int(math.floor(float(other['box'][2])))
                top = int(math.floor(float(size[1] - other['box'][3])))
                bottom = int(math.floor(float(size[1] - other['box'][1])))
                if 0 <= left <= right < size[0] and 0 <= top <= bottom < size[
                        1]:
                    others.append({
                        'position': [left, right, top, bottom],
                        'type': other['type']
                    })

            # Get the position of every curve / line segment
            curves = []
            curve_width = 2
            for curve in data[pageid]['curves']:
                left = int(math.floor(float(curve['box'][0])))
                right = int(math.floor(float(curve['box'][2])))
                top = int(math.floor(float(size[1] - curve['box'][3])))
                bottom = int(math.floor(float(size[1] - curve['box'][1])))
                line = None  # reset so a curve that is neither vertical nor horizontal is skipped
                if right - left <= curve_width and bottom - top > curve_width:
                    right = left
                    line = {
                        'position': [left, right, top, bottom],
                        'type': curve['type']
                    }
                elif right - left > curve_width and bottom - top <= curve_width:
                    bottom = top
                    line = {
                        'position': [left, right, top, bottom],
                        'type': curve['type']
                    }
                if line:
                    if 0 <= line['position'][0] <= line['position'][1] < size[0] and \
                        0 <= line['position'][2] <= line['position'][3] < size[1]:
                        curves.append(line)

            contents_dict[docid][pageid] = {
                'texts': texts,
                'size': size,
                'tables': tables,
                'others': others,
                'curves': curves
            }

    return contents_dict
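# A minimal usage sketch for load_source above (the directory path and the
# empty word_dict are placeholders, not values from the original project):
if __name__ == '__main__':
    contents = load_source('./table-docs', {})
    print('Loaded %d documents' % len(contents))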
Ejemplo n.º 30
0
import jieba
from jieba import posseg
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SentenceSplitter
from corpus import cws_model, pos_model, parser_model, ner_model, news_path, relation_extract_output_file

segmentor = Segmentor()
segmentor.load(cws_model)

postagger = Postagger()
postagger.load(pos_model)

parser = Parser()
parser.load(parser_model)

recognizer = NamedEntityRecognizer()
recognizer.load(ner_model)

sentencesplit = SentenceSplitter()


def extract_start(input_file_name, output_file_name, begin_line, end_line):
    in_file = open(input_file_name, 'r', encoding='utf8')
    out_file = open(output_file_name, 'w')

    # for line in in_file.readlines()[begin_line:end_line]:
    #     for sentence in sentencesplit.split(''.join(line.split()[:-1])):
    #         fact_extract(sentence, out_file)
    fact_extract('欧几里得是西元前三世纪的希腊数学家。', out_file)

    in_file.close()
    out_file.close()
Ejemplo n.º 31
0
from pyltp import Segmentor,Postagger
from Auth.common import *
CWS_MODEL_PATH='/home/hiro/ltp_data_v3.4.0/cws.model'
POS_MODEL_PATH='/home/hiro/ltp_data_v3.4.0/pos.model'


segmentor = Segmentor()
segmentor.load(CWS_MODEL_PATH)

postagger = Postagger()
postagger.load(POS_MODEL_PATH)

def get_real_words_hit(str1):

    words=segmentor.segment(str1)
    postags = postagger.postag(words)
    wordlist=[(words[i],postags[i]) for i in range(0,len(words))]

    punc = open("punctuation.txt", 'rb')
    pr = punc.read()
    pr = pr.decode('gbk')
    p = pr.split()
    lreal = []

    passlist = ['\\', "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", '/', '-', '$', '#']

    for word in wordlist:
        if word[0] in passlist:
            pass
        elif word[1] in ['n', 'v', 'a'] and word[0] not in p:
            lreal.append(word[0])
Ejemplo n.º 32
0
class pnn_count():
	def __init__(self):
		self.mydict = {}
		self.lines = []
		self.lines_num = 3000
		self.c = [0,0,0] #PNN
		self.w_c = [{},{},{}]
		self.segmentor = Segmentor()
		self.segmentor.load('cws.model')
		self.read_file()
		self.train()
		self.test()
	def read_file(self):
		f = open('pnn_annotated.txt','r')
		self.lines = f.readlines()
		f.close()
	def train(self):
		for i in range(0,self.lines_num/5*4):
			line = self.lines[i]
			line.strip('\n')
			line_array = line.split('\t')
			line = line_array[1]
			words = self.segmentor.segment(line)
			if line_array[0] == '1':
				pos = 0
			elif line_array[0] =='0':
				pos = 1
			else:
				pos = 2
			for i in words:                          #calculate frequency
				if self.w_c[pos].has_key(i):
					self.w_c[pos][i] += 1
				else:
					for a in range(0,3):
						self.w_c[a][i] = 0
					self.w_c[pos][i] += 1
			self.c[pos] += 1

	def test(self):
		count = 0
		v = len(self.w_c[0].keys())  # vocabulary size for add-one smoothing
		for a in range(self.lines_num / 5 * 4, len(self.lines)-1):
			wholeline = self.lines[a]
			print wholeline
			result = [0.0,0.0,0.0]
			line_array = wholeline.split('\t')
			line = line_array[1]
			words = self.segmentor.segment(line)
			for i in range(0,3):
				pci = 1.0 * self.c[i] / (self.lines_num/5 *4)
				pwci = 1.0
				sum_i = 0
				for q in self.w_c[i].keys():
					sum_i += self.w_c[i][q]
				for k in words:
					if self.w_c[i].has_key(k):
						pwci = pwci * (self.w_c[i][k] + 1) / (sum_i + v)
				result[i] = pci * pwci
			maxi = 0
			for i in range(0,3):
				if result[i]>result[maxi]:
					maxi = i
			if maxi ==0:
				if line_array[0] == '1':
					count += 1
				print "my guess is positive"
			elif maxi==1:
				if line_array[0] == '0':
					count += 1
				print "my guess is neuter"
			else:
				if line_array[0] == '-1':
					count += 1
				print "my guess is negative"
		print  count * 1.0 /(self.lines_num/5)
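# The probabilities computed in test() above follow multinomial Naive Bayes
# with add-one (Laplace) smoothing: P(c|d) is proportional to P(c) times the
# product over words of (count(w, c) + 1) / (N_c + V). A minimal standalone
# sketch of that scoring rule (function and argument names are illustrative,
# not taken from the class above):
def naive_bayes_score(words, class_doc_count, total_docs, class_word_counts, vocab_size):
    prior = float(class_doc_count) / total_docs        # P(c)
    total_words_in_class = sum(class_word_counts.values())
    score = prior
    for w in words:
        # add-one smoothing keeps unseen words from zeroing the product
        score *= (class_word_counts.get(w, 0) + 1.0) / (total_words_in_class + vocab_size)
    return score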
Ejemplo n.º 33
0
    def ws_data(self):
        f = open("pnn_annotated.txt", 'r')
        total_line = 0
        orgin_attr = [0, 0, 0]
        judge_attr = [0, 0, 0]
        right = [0, 0, 0]
        segmentor = Segmentor()
        segmentor.load("cws.model")
        for line in f:
            total_line += 1
            # print 'line has been read'
            value_num = [0, 0]
            result = line.split('\t')
            ws_lst = segmentor.segment(result[1])
            # print 'this line is %s' % (line)

            for i in ws_lst:
                classify = ''
                try:
                    value = self.setiment_words[i]
                except:
                    pass
                else:
                    if value == 1:
                        print 'positive word:%s' % i
                        value_num[0] += 1
                    elif value == -1:
                        print 'negative word:%s' % i
                        value_num[1] += 1

            if value_num[0] == 0 and value_num[1] == 0:
                classify = 'neutral'
                judge_attr[0] += 1
            elif value_num[0] == value_num[1] != 0:
                classify = 'neutral'
                judge_attr[0] += 1
            elif value_num[0] > value_num[1]:
                classify = 'positive'
                judge_attr[1] += 1
            else:
                classify = 'negative'
                judge_attr[2] += 1

            print value_num
            print 'classfiy result:%s' % classify

            # count of the original emotion labels
            if result[0] == '0':
                orgin_attr[0] += 1
            elif result[0] == '1':
                orgin_attr[1] += 1
            else:
                orgin_attr[2] += 1

            if (int(result[0]) == 0 and value_num[0] == 0 and value_num[1] == 0):
                # print 'neutral'
                right[0] += 1
            elif (int(result[0]) == 0 and value_num[0] == value_num[1] != 0):
                # print 'neutral'
                right[0] += 1
            elif (int(result[0]) > 0 and value_num[0] >= value_num[1] and value_num[0] != 0):
                # print 'positive'
                right[1] += 1
            elif (int(result[0]) < 0 and value_num[0] < value_num[1] and value_num[1] != 0):
                # print 'negative'
                right[2] += 1

            # print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line))
        print 'orgin\'s neutral, positive, negative'
        print orgin_attr

        print 'judge_attr neutral, positive, negative'
        print judge_attr

        print 'neutral, positive, negative'
        print right
        print (right[0] + right[1] + right[2])

        print 'total_line %f\n' % total_line
        print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line))
        segmentor.release()
Ejemplo n.º 34
0
# coding:utf-8
import math
import copy
from read_baselabel import read_baselabel
from read_word2vec import read_word2vec
import sys, os
from operator import itemgetter
from pyltp import Segmentor
from collections import Counter


segmentor = Segmentor()
segmentor.load("/data0/shenyanjun/ltp_data/cws.model")
path = os.path.abspath(os.path.dirname(sys.argv[0]))

path_rule_for_stock = path + "/stock_to_theme.txt"
path_base_label = path + "/stock_list_result.txt"
path_word2vec = path + "/word2vec_item_only.txt"
base_label = read_baselabel(path_base_label)
base_label_dic, stock_names = base_label.transpose()

word2vec = read_word2vec(path_word2vec)
word2vec_dic = word2vec.read_w2v()


def makeDict(path1):
    # Store the rules in a dict: key is the stock, value is the concepts that stock may map to
    dict = {}
    fin = open(path1, "r")
    for line in fin:
        line1 = line.strip().split("\t")
Ejemplo n.º 35
0
def pyltp_cut(sentence):
    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(sentence)  # segment the sentence
    segmentor.release()  # release the model
    return words
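# Note: pyltp_cut above reloads cws.model on every call, which is costly when
# segmenting many sentences. A minimal sketch of a cached variant (it assumes
# the same cws_model_path variable as above; the cache name is illustrative):
_cached_segmentor = None

def pyltp_cut_cached(sentence):
    global _cached_segmentor
    if _cached_segmentor is None:
        _cached_segmentor = Segmentor()        # load the model only once
        _cached_segmentor.load(cws_model_path)
    return list(_cached_segmentor.segment(sentence))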
Ejemplo n.º 36
0
        print p_name

if __name__ == "__main__" :
    argp = argparse.ArgumentParser(description="Online Classification System")
    argp.add_argument("-c" , "--classname" , choices=[classname.JUNK , classname.SENSITIVE] , required=True ,
                      help="Classification Type , junk-class or sensitive class ")
    argp.add_argument("-s" , "--sample_interval_mode" , choices=[sampleIntervalMode.LINE_MODE , sampleIntervalMode.CRLF_MODE] , 
                      default=sampleIntervalMode.LINE_MODE ,
                      help="The mode with describes what is the inverval symbol between samples , default is LINE_MODE")
    argp.add_argument("-i" , "--input" , type=str , default="stdin" , 
                      help="'sysin' for using standard input ; else file path is needed.")
    args = argp.parse_args()

    logging.info("loadding segmentor")
    segmentor = Segmentor()
    segmentor.load(CWS_MODEL_PATH)
    logging.info("done")

    # loading model
    if args.classname == classname.JUNK :
        model = TFIDFModel()
        model.load_model(JUNK_MODEL_PATH)
    else :
        model = BOOLModel()
        model.load_model(SENSITIVE_MODEL_PATH)
    
    #process the input file
    if args.input == "stdin" :
        ifo = sys.stdin
    else :
        ifo = open(args.input) # if error , just quit
Ejemplo n.º 37
0
class LtpParser():
    def __init__(self, ltp_model_dir):
        LTP_DIR = ltp_model_dir
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

    '''Basic LTP operations'''

    def basic_parser(self, words):
        postags = list(self.postagger.postag(words))
        netags = self.recognizer.recognize(words, postags)
        return postags, netags

    '''Get POS tags with LTP'''

    def get_postag(self, words):
        return list(self.postagger.postag(words))

    '''Build the output entity lists from the NER results'''

    def format_entity(self, words, netags, postags):
        name_entity_dist = {}
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []
        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        index = 0
        for item in zip(words, netags):
            word = item[0]
            ntag = item[1]
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word + '_%s ' % index)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word + '_%s ' % index)
                    else:
                        place_entity_list.append(word + '_%s ' % index)
                elif ntag[0] == "B":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                elif ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                else:
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""
            index += 1
        name_entity_dist['nhs'] = self.modify_entity(name_entity_list, words,
                                                     postags, 'nh')
        name_entity_dist['nis'] = self.modify_entity(organization_entity_list,
                                                     words, postags, 'ni')
        name_entity_dist['nss'] = self.modify_entity(place_entity_list, words,
                                                     postags, 'ns')
        return name_entity_dist

    '''Fix up entities in preparation for rebuild_wordspostags'''

    def modify_entity(self, entity_list, words, postags, tag):
        entity_modify = []
        if entity_list:
            for entity in entity_list:
                entity_dict = {}
                subs = entity.split(' ')[:-1]
                start_index = subs[0].split('_')[1]
                end_index = subs[-1].split('_')[1]
                entity_dict['stat_index'] = start_index
                entity_dict['end_index'] = end_index
                if start_index == entity_dict['end_index']:
                    consist = [
                        words[int(start_index)] + '/' +
                        postags[int(start_index)]
                    ]
                else:
                    consist = [
                        words[index] + '/' + postags[index]
                        for index in range(int(start_index),
                                           int(end_index) + 1)
                    ]
                entity_dict['consist'] = consist
                entity_dict['name'] = ''.join(
                    tmp.split('_')[0] for tmp in subs) + '/' + tag
                entity_modify.append(entity_dict)
        return entity_modify

    '''Rebuild words and postags based on the NER results'''

    def rebuild_wordspostags(self, name_entity_dist, words, postags):
        pre = ' '.join(
            [item[0] + '/' + item[1] for item in zip(words, postags)])
        post = pre
        for et, infos in name_entity_dist.items():
            if infos:
                for info in infos:
                    post = post.replace(' '.join(info['consist']),
                                        info['name'])
        post = [
            word for word in post.split(' ')
            if len(word.split('/')) == 2 and word.split('/')[0]
        ]
        words = [tmp.split('/')[0] for tmp in post]
        postags = [tmp.split('/')[1] for tmp in post]

        return words, postags

    '''Format the dependency relations'''

    def syntax_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        words = ['Root'] + words
        postags = ['w'] + postags
        tuples = list()
        for index in range(len(words) - 1):
            arc_index = arcs[index].head
            arc_relation = arcs[index].relation
            tuples.append([
                index + 1, words[index + 1], postags[index + 1],
                words[arc_index], postags[arc_index], arc_index, arc_relation
            ])

        return tuples

    '''Maintain, for every word in the sentence, a dict of its dependency children'''

    def build_parse_child_dict(self, words, postags, tuples):
        child_dict_list = list()
        for index, word in enumerate(words):
            child_dict = dict()
            for arc in tuples:
                if arc[3] == word:
                    if arc[-1] in child_dict:
                        child_dict[arc[-1]].append(arc)
                    else:
                        child_dict[arc[-1]] = []
                        child_dict[arc[-1]].append(arc)
            child_dict_list.append([word, postags[index], index, child_dict])

        return child_dict_list

    '''Main parser entry point'''

    def parser_main(self, words, postags):
        tuples = self.syntax_parser(words, postags)
        child_dict_list = self.build_parse_child_dict(words, postags, tuples)
        return tuples, child_dict_list

    '''Basic linguistic analysis'''

    def basic_process(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags, netags = self.basic_parser(words)
        name_entity_dist = self.format_entity(words, netags, postags)
        words, postags = self.rebuild_wordspostags(name_entity_dist, words,
                                                   postags)
        return words, postags
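# A minimal usage sketch for the LtpParser class above (the model directory
# './ltp_data' is an assumption; the sentence reuses an example from elsewhere
# in this collection):
if __name__ == '__main__':
    ltp = LtpParser('./ltp_data')
    words, postags = ltp.basic_process('欧几里得是西元前三世纪的希腊数学家。')
    tuples, child_dict_list = ltp.parser_main(words, postags)
    for item in tuples:
        print(item)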
Ejemplo n.º 38
0
# -*- coding: utf-8 -*-
# Author: MebiuW
# Weibo: @MebiuW
# Python version: 2.7
# Date: 2016/9/10

from pyltp import Segmentor
from pyltp import Postagger

from pyltp import NamedEntityRecognizer

segmentor1 = Segmentor()  # initialize the instance
segmentor1.load('/mnt/hgfs/share/ai2.0/bin/correct/ltp_data/cws.model')  # load the model

postagger = Postagger()  # initialize the instance
postagger.load('/mnt/hgfs/share/ai2.0/bin/correct/ltp_data/pos.model')  # load the model

recognizer = NamedEntityRecognizer()  # initialize the instance
recognizer.load('/mnt/hgfs/share/ai2.0/bin/correct/ltp_data/ner.model')  # load the model


# word segmentation
def segmentor(sentence):

    words = segmentor1.segment(sentence)  # segment the sentence
    # the result can be printed directly like this
    print '\t'.join(words)
    # or converted to a list for output
    words_list = list(words)
    segmentor1.release()  # release the model
    return words_list
Ejemplo n.º 39
0
    def __init__(self):
        """
        init method required. set batch_size, and load some resources.
        """
        self.batch_size = 128

        FLAGS = tf.app.flags.FLAGS
        tf.app.flags.DEFINE_string("ckpt_dir",
                                   "./checkpoint_cgrus/checkpoint/",
                                   "checkpoint location for the model")
        tf.app.flags.DEFINE_string("vocab_word_path",
                                   "predictor/word_freq.txt",
                                   "path of word vocabulary.")
        tf.app.flags.DEFINE_string("accusation_label_path",
                                   "predictor/accu.txt",
                                   "path of accusation labels.")

        tf.app.flags.DEFINE_string("article_label_path", "predictor/law.txt",
                                   "path of law labels.")

        tf.app.flags.DEFINE_float("learning_rate", 0.001, "learning rate")
        tf.app.flags.DEFINE_integer(
            "decay_steps", 1000, "how many steps before decay learning rate.")
        tf.app.flags.DEFINE_float("decay_rate", 1.0,
                                  "Rate of decay for learning rate.")
        tf.app.flags.DEFINE_integer("sentence_len", 400, "max sentence length")
        tf.app.flags.DEFINE_integer("num_sentences", 16, "number of sentences")
        tf.app.flags.DEFINE_integer("embed_size", 64, "embedding size")  #64
        tf.app.flags.DEFINE_integer("hidden_size", 128, "hidden size")  #128
        tf.app.flags.DEFINE_integer(
            "num_filters", 128,
            "number of filter for a filter map used in CNN.")  #128

        tf.app.flags.DEFINE_integer("embed_size_dpcnn", 64, "embedding size")
        tf.app.flags.DEFINE_integer("hidden_size_dpcnn", 128, "hidden size")
        #tf.app.flags.DEFINE_integer("num_filters_big", 128, "number of filter for a filter map used in CNN.")
        tf.app.flags.DEFINE_string(
            "model_dpcnn", "dp_cnn",
            "name of model:han,c_gru,c_gru2,gru,text_cnn")
        tf.app.flags.DEFINE_string("ckpt_dir_dpcnn",
                                   "predictor/checkpoint_dpcnn_big32/",
                                   "checkpoint location for the model")

        tf.app.flags.DEFINE_boolean(
            "is_training", False,
            "is training. true: training, false: testing/inference")
        tf.app.flags.DEFINE_string(
            "model", "c_gru", "name of model:han,c_gru,c_gru2,gru,text_cnn")
        #tf.app.flags.DEFINE_boolean("is_training_flag", False, "is traning.true:tranining,false:testing/inference")
        tf.app.flags.DEFINE_string('cws_model_path', 'predictor/cws.model',
                                   'cws.model path')
        tf.app.flags.DEFINE_string('pos_model_path', 'predictor/pos.model',
                                   'pos.model path')
        tf.app.flags.DEFINE_string('ner_model_path', 'predictor/ner.model',
                                   'ner.model path')
        tf.app.flags.DEFINE_string('gpu', '1', 'help to select gpu divice')
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu

        segm = Segmentor()
        segm.load(FLAGS.cws_model_path)  # LTP models
        post = Postagger()
        post.load(FLAGS.pos_model_path)
        recognizer = NamedEntityRecognizer()
        recognizer.load(FLAGS.ner_model_path)
        self.ltp_model = [segm, post, recognizer]

        filter_sizes = [2, 3, 4, 5
                        ]  #,6,7,8]#[2,3,4,5]#[6, 7, 8, 9, 10]  # [30,40,50] #8
        #filter_sizes_big= [2,3,4,5]#,6,7,8]#[2,3,4,5]#[6, 7, 8, 9, 10]  # [30,40,50] #8

        stride_length = 1

        #1.load label dict, restore model from checkpoint
        # 1.load label dict
        self.vocab_word2index = load_word_vocab(FLAGS.vocab_word_path)
        accusation_label2index = load_label_dict_accu(
            FLAGS.accusation_label_path)
        articles_label2index = load_label_dict_article(
            FLAGS.article_label_path)

        deathpenalty_label2index = {True: 1, False: 0}
        lifeimprisonment_label2index = {True: 1, False: 0}
        vocab_size = len(self.vocab_word2index)
        accusation_num_classes = len(accusation_label2index)
        article_num_classes = len(articles_label2index)
        deathpenalty_num_classes = len(deathpenalty_label2index)
        lifeimprisonment_num_classes = len(lifeimprisonment_label2index)

        # 2.restore checkpoint
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        graph = tf.Graph().as_default()
        with graph:
            self.model = HierarchicalAttention(accusation_num_classes,
                                               article_num_classes,
                                               deathpenalty_num_classes,
                                               lifeimprisonment_num_classes,
                                               FLAGS.learning_rate,
                                               self.batch_size,
                                               FLAGS.decay_steps,
                                               FLAGS.decay_rate,
                                               FLAGS.sentence_len,
                                               FLAGS.num_sentences,
                                               vocab_size,
                                               FLAGS.embed_size,
                                               FLAGS.hidden_size,
                                               num_filters=FLAGS.num_filters,
                                               model=FLAGS.model,
                                               filter_sizes=filter_sizes,
                                               stride_length=stride_length)
            saver_accu = tf.train.Saver()
            sess_accu = tf.Session(config=config)
            saver_accu.restore(sess_accu,
                               tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            self.sess = sess_accu

        # graph_big = tf.Graph().as_default()
        # with graph_big:
        #     self.model_dpcnn = HierarchicalAttention(accusation_num_classes, article_num_classes, deathpenalty_num_classes,lifeimprisonment_num_classes,
        #                             FLAGS.learning_rate, self.batch_size,FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, FLAGS.num_sentences,vocab_size,
        #                             FLAGS.embed_size_dpcnn, FLAGS.hidden_size_dpcnn,num_filters = FLAGS.num_filters, model = FLAGS.model_dpcnn, filter_sizes = filter_sizes,
        #                             stride_length = stride_length)
        #     saver_big = tf.train.Saver()
        #     sess_big = tf.Session(config=config)
        #     saver_big.restore(sess_big, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        #     self.sess_big=sess_big

        self.FLAGS = FLAGS
Ejemplo n.º 40
0
    def bayes(self):
        segmentor = Segmentor()
        segmentor.load("cws.model")

        f = open('data/a_4.txt', 'r')
        # f = open('pnn_annotated.txt', 'r')
        # neutral, positive, negative
        class_freq = [0,0,0]
        # neutral, positive, negative
        word_total_count_freq = [0, 0, 0]
        each_word_count = [{}, {}, {}]

        accu = [0, 0]

        print 'train_set'
        for line in f:
            result = line.split('\t')
            ws_lst = segmentor.segment(result[1])
            # print line
            # neutral
            if result[0] == '0':
                class_freq[0] += 1
                for word in ws_lst:
                    word_total_count_freq[0] += 1
                    if each_word_count[0].get(word) is not None:
                        # print 'Not none'
                        each_word_count[0][word] += 1
                    else:
                        # print 'None'
                        each_word_count[0][word] = 1
            # positive
            elif result[0] == '1':
                class_freq[1] += 1
                for word in ws_lst:
                    word_total_count_freq[1] += 1
                    if each_word_count[1].get(word) is not None:
                        # print 'Not none'
                        each_word_count[1][word] += 1
                    else:
                        # print 'None'
                        each_word_count[1][word] = 1

            # negative
            elif result[0] == '-1':
                class_freq[2] += 1
                for word in ws_lst:
                    word_total_count_freq[2] += 1
                    if each_word_count[2].get(word) is not None:
                        # print 'Not none'
                        each_word_count[2][word] += 1
                    else:
                        # print 'None'
                        each_word_count[2][word] = 1

        # print class_freq
        # print word_total_count_freq
        # print each_word_count

        print 'total'
        total_class_count = class_freq[0] + class_freq[1] + class_freq[2]
        total_word_count = word_total_count_freq[0] + word_total_count_freq[1] + word_total_count_freq[2]
        print total_class_count
        # print total_word_count

        f.close()
        f1 = open('a_1.txt', 'r')

        # neutral, positive, negative
        orgin = [0, 0, 0]   # gold label counts per class
        judge = [0, 0, 0]   # predicted label counts per class
        judge_right = [0, 0, 0]

        print 'test_set_now'
        for line in f1:
            result = line.split('\t')
            # print result[1]
            ws_lst = segmentor.segment(result[1])
            # print test_line[test_count]
            max = 0
            tmp_result = 0
            for test_iter in range(3):
                processed_wst = []
                prob_this_class = 1
                for test_word in ws_lst:
                    if test_word not in processed_wst:
                        prob_this_class *= (each_word_count[test_iter].get(test_word, 0) + 1.0) / float(word_total_count_freq[test_iter] + total_word_count)
                        processed_wst.append(test_word)
                prob_this_class *= (float(class_freq[test_iter]) / float(total_class_count))

                if prob_this_class > max:
                    max = prob_this_class
                    tmp_result = test_iter

            if tmp_result == 0:
                test_result = '0'
                judge[0] += 1
            elif tmp_result == 1:
                test_result = '1'
                judge[1] += 1
            elif tmp_result == 2:
                test_result = '-1'
                judge[2] += 1

            if result[0] == test_result:
                accu[0] += 1
            else:
                accu[1] += 1

            if result[0] == '0':
                orgin[0] += 1
            elif result[0] == '1':
                orgin[1] += 1
            elif result[0] == '-1':
                orgin[2] += 1

            if result[0] == '0' == test_result:
                judge_right[0] += 1
            elif result[0] == '1' == test_result:
                judge_right[1] += 1
            elif result[0] == '-1' == test_result:
                judge_right[2] += 1

            # print 'result is %s'%test_result
            # print 'count are %d, %d'%(accu[0], accu[1])
            # print 'accuracy so far: %f'%(float(accu[0]) / float(accu[0] + accu[1]))


        f1.close()
        print 'orgin'
        print orgin

        print 'judge'
        print judge

        print 'judge_right'
        print judge_right

        print 'total'
        print accu
        print 'accuracy this time is %f'%((float(accu[0]) / float(accu[0] + accu[1])))
Ejemplo n.º 41
0
#!/usr/bin/env python
# coding: utf-8
from pyltp import Segmentor


segmentor = Segmentor()
segmentor.load('/downloads/cws.model')


def segment(text):
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    words = segmentor.segment(text)
    return map(lambda x: x.decode('utf-8'), words)
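# segment() above assumes Python 2 (the built-in unicode type and byte-string
# output from pyltp). A rough Python 3 sketch, under the assumption that a
# Python 3 build of pyltp is used where segment() already yields str values:
def segment_py3(text):
    return list(segmentor.segment(text))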

Ejemplo n.º 42
0
class PreProcessor(object) :
    def __init__(self , cws_model_path=CWS_MODEL_PATH , stop_words_dir=STOP_WORDS_DIR) :
        self.raw_data = None
        self.processed_data = None
        self.words_dict = None
        self.STOP_WORDS = self._load_stop_words(stop_words_dir) 
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)

    def _load_stop_words(self , dir_name) :
        stop_words = set()
        cur_abs_dir_path = os.path.split(os.path.abspath(__file__))[0]
        dir_path = os.path.join(cur_abs_dir_path , dir_name)
        for file_name in os.listdir(dir_path) :
            file_path = os.path.join(dir_path , file_name) 
            with open(file_path) as f :
                for line in f :
                    word = line.strip()
                    stop_words.add(word)
        for symbol in SENT_SPLIT_SYMBOLS :
            stop_words.add(symbol)
        return stop_words

    def load_raw_data(self , path) :
        with open(path) as f :
            self.raw_data = json.load(f)
    
    def _split_sentence(self , content) :
        '''
        split content into sentences
        '''
        sents = []
        paras = content.split("\n")
        for paragraph in paras :
            split_rst = re.split(ur"[%s]+" %(SENT_SPLIT_SYMBOLS) , paragraph) # has space 
            sents.extend(split_rst)
        return sents
    
    def _segment(self , unicode_line) :
        '''
        return : list of words
        '''
        utf8_line = unicode_line.strip().encode("utf8")
        words = list(self.segmentor.segment(utf8_line))
        return words
    
    def _make_doc_data(self , url , title_seged , sents_seged) :
        return { 'url' : url ,
                 'title' : title_seged ,
                 'content' : sents_seged
                 }

    def _add_word2words_dict(self , words) :
        for word in words :
            if word not in self.STOP_WORDS :
                word = word.lower() 
                self.words_dict.add(word)

    def do_preprocessing(self) :
        logging.info("do preprocessing ...")
        self.processed_data = dict()
        self.words_dict = set()
        for page_id , page_data in self.raw_data.items() :
            url = page_data['url']
            title = page_data["title"]
            content = page_data["content"]
            sents = self._split_sentence(content)
            # segment
            title_words = self._segment(title)
            content_words = []
            for sent in sents :
                content_words.extend(self._segment(sent))
                content_words.append(" ") # another space to avoid that they become one line when merging at output snippet 
            self.processed_data[page_id] = self._make_doc_data(url , title_words , content_words)
            self._add_word2words_dict(title_words + content_words)
        logging.info('done.')
    
    def save_doc_data(self , to_path) :
        logging.info("saving doc data to ` %s `" %(to_path) )
        with open(to_path , 'w') as of:
            json.dump(self.processed_data , of )
        logging.info("done.")

    def save_words_dict(self , to_path) :
        logging.info("saving words dict to ` %s `" %(to_path))
        words_list = list(self.words_dict)
        words_dict = {word : word_id for word_id , word in enumerate(words_list) }
        with open(to_path , 'w') as of :
            json.dump(words_dict , of , ensure_ascii=False) # json not support `set`
        logging.info("done.")
Ejemplo n.º 43
0
degree_dict = {}
negative_dict = {}
for i in range(len(f_degree)):
    word = f_degree[i].strip().split('\001')[0]
    if i < 13:
        degree_dict[word] = 2
    if i >= 13 and i < 29:
        degree_dict[word] = 1
    if i >= 29:
        degree_dict[word] = 3
for line in f_negative:
    word = line.strip().split('\001')[0]
    negative_dict[word] = -1

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))


def parse(words_tags):
    #words = segmentor.segment(sentence)
    #words="清洁 不是 很 彻底   感觉 不是 正品".split()
    #print words
    #postags = postagger.postag(words)
    words = words_tags[0]
    postags = words_tags[1]
    mp = {}
    for index, i in enumerate(postags, 1):
Ejemplo n.º 44
0
class Sentence_Parser:
    def __init__(self):
        LTP_DIR = 'F:\project support\ltp_data_v3.4.0'
        # word segmentation
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, 'cws.model'))

        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, 'pos.model'))

        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, 'parser.model'))

        # named entity recognition (person, place, organization names, etc.)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, 'ner.model'))

        # semantic role labeling (agent, patient, time, location)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

    def format_labelrole(self, words, postags):
        """
        词义角色标注
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        # for item in roles_dict.items():
        #     print(item)
        return roles_dict

    def bulid_parser_child_dict(self, words, postags, arcs):
        """
        句法分析---为句子中的每个词语维护一个保存句法依存子节点的字典
        """
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation not in child_dict:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]
        # print(rely_id)
        relation = [arc.relation for arc in arcs]
        # for i in range(len(relation)):
        #     print(words[i], '_', postags[i], '_', i, '_', relation[i])
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]
        # print(heads)
        for i in range(len(words)):
            a = [
                relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                postags[rely_id[i] - 1]
            ]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """
        parser主函数
        """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.bulid_parser_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list

    def select(self, words, postags):
        """
        筛选出名词和形容词
        """
        co_model = Word2Vec.load('coseg_text.model')
        n_list0 = []
        a_list = []
        for i in range(len(postags)):
            if postags[i] == 'n':
                if len(words[i]) >= 2:
                    n_list0.append(words[i])
            if postags[i] == 'a':
                # if len(words[i]) >= 2:
                a_list.append(words[i])
        n_list0 = list(set(n_list0))
        a_list = list(set(a_list))
        # print(n_list0)
        # print(a_list)
        si_p = []
        for n in n_list0:
            try:
                s = co_model.similarity(n, '手机')
                si_p.append(s)
            except Exception as e:
                si_p.append(0)
        index_list = list(
            map(si_p.index, heapq.nlargest(int(0.8 * len(si_p)),
                                           si_p)))  # keep the nouns most similar to '手机' (phone)
        n_list = []
        for index in index_list:
            n_list.append(n_list0[index])
        # print(n_list)
        return n_list, a_list

    def simlarity(self, n_list0, a_list):
        """
        计算相似度,进行正逆向匹配,筛选出名词和形容词的最佳搭配
        """
        n_list0 = n_list0
        a_list = a_list
        co_model = Word2Vec.load('coseg_text.model')
        si_p = []
        for n in n_list0:
            try:
                s = co_model.similarity(n, '手机')
                si_p.append(s)
            except Exception as e:
                si_p.append(0)
        index_list = list(
            map(si_p.index, heapq.nlargest(int(0.8 * len(si_p)),
                                           si_p)))  # keep the nouns most similar to '手机' (phone)
        n_list = []
        for index in index_list:
            n_list.append(n_list0[index])

        # forward matching: for each noun, find its best adjective
        comment1_df = pd.DataFrame(columns=['comment_tag', 'similarity'],
                                   index=[np.arange(100)])
        index = 0
        for i in range(len(n_list)):
            f_si = 0
            for j in range(len(a_list)):
                try:
                    si = co_model.similarity(n_list[i], a_list[j])
                    if si >= f_si:
                        f_si = si
                        comment_tag = n_list[i] + a_list[j]
                    else:
                        f_si = f_si
                except Exception as e:
                    print('语料库中缺少该词', e)
            comment1_df.loc[index, ] = [comment_tag, f_si]
            index += 1
        comment1_df = comment1_df.sort_values(by='similarity',
                                              ascending=False,
                                              ignore_index=True)
        comment1_df.dropna(subset=['comment_tag'], inplace=True)
        # comment1_df = comment1_df.iloc[0: int(0.2*len(comment_df)), ]

        # backward matching: for each adjective, find its best noun
        comment2_df = pd.DataFrame(columns=['comment_tag', 'similarity'],
                                   index=[np.arange(100)])
        index = 0
        for i in range(len(a_list)):
            f_si = 0
            for j in range(len(n_list)):
                try:
                    si = co_model.similarity(n_list[j], a_list[i])
                    if si >= f_si:
                        f_si = si
                        comment_tag = n_list[j] + a_list[i]
                    else:
                        f_si = f_si
                except Exception as e:
                    print('语料库中缺少该词', e)
            comment2_df.loc[index, ] = [comment_tag, f_si]
            index += 1
        comment2_df = comment2_df.sort_values(by='similarity',
                                              ascending=False,
                                              ignore_index=True)
        comment2_df.dropna(subset=['comment_tag'], inplace=True)
        comment_df = pd.merge(comment1_df,
                              comment2_df,
                              on='comment_tag',
                              how='inner')
        comment_df.dropna(subset=['comment_tag'], inplace=True)
        return comment_df

    def cleandata(self, x):
        """
        对数据进行清洗,替换一些不规则的标点符号
        """
        pat = re.compile("[^\u4e00-\u9fa5^.^a-z^A-Z^0-9]")  # 只保留中英文,去掉符号
        x = x.replace(' ', ',')
        emoji.demojize(x)  # 去掉表情表情符号
        x = re.sub(pat, ',', x)
        return x
Ejemplo n.º 45
0
                        if view_word in sentence:
                            result = abstract(view_word, sentence)
                            if result != None:
                                news_list[num]['opinion'].append(result)
            # print(news_list[num])
    print('执行完成')

if __name__ == '__main__':
    with open(r'D:\Github_project\Project_one\算法模型\data\view_words.pk', 'rb') as f:
        view_words = pickle.load(f)


    # Load the models
    # # HIT LTP word segmentation
    segmentor = Segmentor()  # initialize the instance
    segmentor.load('D:\data\ltp_data_v3.4.0\cws.model')  # load the model
    # POS tagging
    postagger = Postagger()  # initialize the instance
    postagger.load('D:\data\ltp_data_v3.4.0\pos.model')  # load the model
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load(r'D:\data\ltp_data_v3.4.0\ner.model')  # load the model
    # dependency parsing
    parser = Parser()  # initialize the instance
    parser.load(r'D:\data\ltp_data_v3.4.0\parser.model')  # load the model
    # semantic role labeling
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(r'D:\data\ltp_data_v3.4.0\pisrl_win.model')  # load the model

    # with open(r'D:\Github_project\Project_one\算法模型\data\news_sports.pk', 'rb') as f:
    #     news_sports = pickle.load(f)
Ejemplo n.º 46
0
# -*- coding: utf-8 -*-

from pyltp import Segmentor
segmentor = Segmentor()
segmentor.load("/Users/lzy/Code/ltp_model/cws.model")


def word_seg(line,label="0"):
	words = segmentor.segment(line)
	s=" ".join(words)
	return s
Ejemplo n.º 47
0
'''
Created on 2018-12-25

@author: Zhukun Luo
Jiangxi university of finance and economics
'''
import os

import pandas as pd
import re

from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from pyltp import SentenceSplitter

LTP_DIR = r'D:\LTP\MODEL\ltp_data'  # path to the ltp model directory
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))  # 分词模型路径,模型名称为`cws.model`

postagger = Postagger()
postagger.load(os.path.join(LTP_DIR, "pos.model"))  # 词性标注模型路径,模型名称为`pos.model`

parser = Parser()
parser.load(os.path.join(LTP_DIR,
                         "parser.model"))  # dependency parsing model path; the model file is `parser.model`

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DIR,
                             "ner.model"))  # named entity recognition model path; the model file is `ner.model`

# labeller = SementicRoleLabeller()
# labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))  # semantic role labeling model path; note this is the `srl` directory, not a single file
Ejemplo n.º 48
0
# kernel method for select term
import re
import random
import os
from Config import *
from pyltp import Segmentor

segmentor = Segmentor()
segmentor.load('./model/cws.model')

def answer(questionDict):
    ### return answer for select term question
    '''
    for each in questionDict:
        print each,questionDict[each]
    '''

    candidateTermList = generate_candidate_term(questionDict['options'])
    compareSentenceList = generate_compare_sentence(questionDict['body'],candidateTermList)
    scoreList = rnnlm_score(compareSentenceList)
    answer = find_best_option(questionDict['options'],candidateTermList,scoreList)
    #print 'answer',answer
    return answer

def generate_candidate_term(optionList):
    ### generate candidate terms
    ### return candidate term list. [[A1,A2],[B1,B2],[C1,C2]]
    optionList = option_list_regular(optionList)
    candidateTermList = []
    # insert empty list for sentence length
    for i in range(len(optionList[0])):
Ejemplo n.º 49
0
class LTP(object):
    def __init__(self):
        cws_model_path = os.path.join('../data/ltp_data_v3.4.0',
                                      'cws.model')  # segmentation model path; the model file is `cws.model`
        pos_model_path = os.path.join('../data/ltp_data_v3.4.0',
                                      'pos.model')  # POS tagging model path; the model file is `pos.model`
        ner_model_path = os.path.join(
            '../data/ltp_data_v3.4.0',
            'ner.model')  # named entity recognition model path; the model file is `ner.model`

        self.segmentor = Segmentor()  # initialize the instance
        self.segmentor.load(cws_model_path)  # load the model
        self.postagger = Postagger()  # initialize the instance
        self.postagger.load(pos_model_path)  # load the model
        self.recognizer = NamedEntityRecognizer()  # initialize the instance
        self.recognizer.load(ner_model_path)  # load the model

    # word segmentation
    def segment(self, text):
        words = list(self.segmentor.segment(text))
        return words

    # POS tagging
    def postag(self, words):
        postags = list(self.postagger.postag(words))
        return postags

    # extract time expressions from the text
    def get_time(self, text):

        # run segmentation and POS tagging
        words = self.segment(text)
        #print(words)
        postags = self.postag(words)
        #print(postags)

        time_lst = []

        i = 0
        for tag, word in zip(postags, words):
            if tag == 'nt':
                j = i
                while postags[j] == 'nt' or words[j] in ['至', '到']:
                    j += 1
                time_lst.append(''.join(words[i:j]))
            i += 1

        # drop entries that are substrings of other entries
        remove_lst = []
        for i in time_lst:
            for j in time_lst:
                if i != j and i in j:
                    remove_lst.append(i)

        text_time_lst = []
        for item in time_lst:
            if item not in remove_lst:
                text_time_lst.append(item)

        # print(text_time_lst)
        return text_time_lst

    # extract person, place and organization names
    def get_name(self, text):
        persons, places, orgs = set(), set(), set()

        words = self.segment(text)
        #print("words333333333333")
        postags = self.postag(words)
        #print(postags)
        netags = list(self.recognizer.recognize(words, postags))  # named entity recognition
        #print(netags)
        # print(netags)
        i = 0
        for tag, word in zip(netags, words):
            j = i
            # person names
            if 'Nh' in tag:
                if str(tag).startswith('S'):
                    persons.add(word)
                elif str(tag).startswith('B'):
                    union_person = word
                    while netags[j] != 'E-Nh':
                        j += 1
                        if j < len(words):
                            union_person += words[j]
                    persons.add(union_person)
            # place names
            if 'Ns' in tag:
                if str(tag).startswith('S'):
                    places.add(word)
                elif str(tag).startswith('B'):
                    union_place = word
                    while netags[j] != 'E-Ns':
                        j += 1
                        if j < len(words):
                            union_place += words[j]
                    places.add(union_place)
            # organization names
            if 'Ni' in tag:
                if str(tag).startswith('S'):
                    orgs.add(word)
                elif str(tag).startswith('B'):
                    union_org = word
                    while netags[j] != 'E-Ni':
                        j += 1
                        if j < len(words):
                            union_org += words[j]
                    orgs.add(union_org)

            i += 1

        # print('人名:', ','.join(persons))
        # print('地名:', ','.join(places))
        # print('组织机构:', ','.join(orgs))
        return persons, places, orgs

    # release the models
    def free_ltp(self):
        self.segmentor.release()
        self.postagger.release()