Example #1
class LtpTokenizer(Tokenizer):
    def __init__(self, **kwargs):
        """
        Args:
            annotators: set that can include pos and ner.
            model: ltp model to use (path).
        """
        self.segmentor = Segmentor()  # initialize the segmenter
        self.recognizer = NamedEntityRecognizer()  # initialize the named entity recognizer
        self.postagger = Postagger()  # initialize the POS tagger

        self.segmentor.load(cws_model_path)  # load the segmentation model

        self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
        if {'pos', 'ner'} & self.annotators:
            self.postagger.load(pos_model_path)
        if {'ner'} & self.annotators:
            self.recognizer.load(ner_model_path)

    def tokenize(self, text):
        # We don't treat new lines as tokens.
        clean_text = text.replace('\n', ' ')

        words = list(self.segmentor.segment(clean_text))  # word segmentation
        postags = ['empty'] * len(words)
        netags = ['empty'] * len(words)

        if {'pos', 'ner'} & self.annotators:  # set intersection
            postags = list(self.postagger.postag(words))  # POS tagging
        if {'ner'} & self.annotators:
            netags = list(self.recognizer.recognize(words, postags))  # named entity recognition

        data = []
        tmp_idx = 0
        for i in range(len(words)):
            # Compute the token's character span in the original text
            start_ws = tmp_idx
            end_ws = tmp_idx + len(words[i])
            tmp_idx += len(words[i])

            data.append((
                words[i],
                text[start_ws:end_ws],
                (start_ws, end_ws),
                postags[i],
                words[i],  # lemma
                netags[i],
            ))

        # Set special option for non-entity tag: '' vs 'O' in spaCy
        return Tokens(data, self.annotators, opts={'non_ent': ''})
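A minimal usage sketch for the tokenizer above (the Tokenizer/Tokens base classes and the cws/pos/ner model path globals are assumed to be defined elsewhere in the module, as in DrQA-style tokenizer code):

tokenizer = LtpTokenizer(annotators={'pos', 'ner'})
tokens = tokenizer.tokenize('中国进出口银行与中国银行加强合作。')
# each token carries (text, span text, offsets, POS, lemma, NER) as built above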
def word_pos():
    # POS tagging with LTP
    candidate = pd.read_csv(r'../data/candidate_sentiment.csv', header=None)
    can_word = candidate[0].tolist()
    # add columns to hold the POS tags
    candidate.insert(2, 'ltp_pos', 0)
    candidate.insert(3, 'jieba_pos', 0)
    candidate.columns = ['word', 'freq', 'ltp_pos', 'jieba_pos']

    LTP_DATA_DIR = '../ltp_data_v3.4.0/ltp_data_v3.4.0'  # path to the LTP model directory
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS model path; the model file is `pos.model`
    
    
    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(can_word)  # POS tagging
    postagger.release()  # release the model
    postags = list(postags)
    candidate['ltp_pos'] = postags

    # POS tagging with jieba
    
    jieba_pos=[]
    for index,row in candidate.iterrows():
        s=row['word']
        words=pseg.cut(s)
        pos=[]
        for w in words:
            pos.append(w.flag)
        pos=' '.join(pos)
        jieba_pos.append(pos)
    
    candidate['jieba_pos']=jieba_pos
    # write the result back, including the header row
    candidate.to_csv(r'../data/candidate_sentiment.csv',index=None)
Example #3
def locationNER(text):
    # Step 1: word segmentation
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    words = list(segmentor.segment(text))  # segment
    # print('\t'.join(words))
    segmentor.release()

    # Step 2: POS tagging
    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    postagger.release()  # release the model

    # Step 3: recognize location and institution entities
    recognizer = NamedEntityRecognizer()  # initialize the recognizer
    recognizer.load(ner_model_path)  # load the model
    netags = list(recognizer.recognize(words, postags))  # named entity recognition
    recognizer.release()  # release the model

    results = []
    for i in range(len(netags)):
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            if 0 < i < len(words) - 1:
                results.append(words[i - 1] + words[i] + words[i + 1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    return results
def cut_words():
    # Segment words and drop empty lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('resource_new.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize the segmenter
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('module/cws.model',
                                'userdict.txt')  # load the model with a user dictionary
    postagger = Postagger()  # initialize the tagger
    postagger.load('module/pos.model')  # load the model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # segment
            pos_tags = postagger.postag(words)  # POS tagging
            for word, tag in zip(words, pos_tags):
                if tag != 'wp':
                    f.write(word)
                else:
                    f.write('\n')
            f.write('\n')
        else:
            continue
    cont.close()
    f.close()
    segmentor.release()
    postagger.release()
Example #5
def segmentsentence(sentence):
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    recognizer = NamedEntityRecognizer()

    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    # parser.load("./ltpdata/ltp_data_v3.4.0/parser.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")
    #############
    word_list = segmentor.segment(sentence)
    postags_list = postagger.postag(word_list)
    nertags = recognizer.recognize(word_list, postags_list)
    ############
    for word, ntag in zip(word_list, nertags):
        if ntag.endswith('Nh'):  # person-name entity, e.g. 'S-Nh' or 'B-Nh'
            entity_list.append(word)  # entity_list is assumed to be a module-level list
    print(" ".join(word_list))
    print(' '.join(nertags))
    ############
    segmentor.release()
    postagger.release()
    # parser.release()
    recognizer.release()
    return word_list
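A sketch of how this might be called; entity_list is assumed to be a module-level accumulator that the function appends person names to:

entity_list = []  # module-level accumulator assumed by segmentsentence()
words = segmentsentence('李克强总理今天来我家了!')
print(entity_list)  # words tagged as person names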
Example #6
def get_all_name(r_filename, w_file):
    # global nlp
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # path to the LTP model directory
    # word segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load the model
    # POS tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load the model
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    f_r = open(r_filename, "r", encoding="utf-8")
    f_w = open(w_file, "w", encoding="utf-8")
    count = 0
    for line in f_r:
        count += 1
        lines = line.strip("\n").replace(r"\n", "")
        # print("----------" + lines)
        words = segmentor.segment(lines)
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        sen = get_some_idea(line, netags, words)
        print(sen)
        if sen:
            for key in sen:
                sens = "\t".join(list(set([data[1] for data in sen[key]])))
                f_w.write(key + "\t" + sens + "\n")
    # nlp.close()
    segmentor.release()
    postagger.release()
    recognizer.release()
    f_r.close()
    f_w.close()
Example #7
    def get_postags(self, words):
        postagger = Postagger()  # initialize the tagger
        postagger.load(self.pos_model_path)  # load the model
        postags = postagger.postag(words)  # POS tagging
        print('\t'.join(postags))
        postagger.release()  # release the model
        return list(postags)
def ltp_pos_data():
    """POS tagging with LTP."""
    LTP_DATA_DIR = r'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # path to the LTP model directory
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS model path; the model file is `pos.model`

    from pyltp import Postagger
    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    result = []
    file = [(const.qc_train_seg, const.qc_train_pos),
            (const.qc_test_seg, const.qc_test_pos)]
    for i in range(2):
        with open(file[i][0], 'r', encoding='utf-8') as f:
            for line in f.readlines():
                attr = line.strip().split('\t')
                words = attr[1].split(" ")
                words_pos = postagger.postag(words)
                res = ' '.join([
                    "{}/_{}".format(words[i], words_pos[i])
                    for i in range(len(words))
                ])
                result.append("{}\t{}\n".format(attr[0], res))
        with open(file[i][1], 'w', encoding='utf-8') as f:
            f.writelines(result)
        result.clear()
    postagger.release()  # release the model
Example #9
class Ltp(NerModel):
    def __init__(self):
        super(Ltp, self).__init__()
        self._model_path = "./model/ltp/"
        self._seg = Segmentor()
        self._pos = Postagger()
        self._recognizer = NamedEntityRecognizer()
        self._load_model()
        self._object_str = "[INFO] This is an ltp object!"
        print("[INFO] All models are loaded!")

    def __repr__(self):
        return self._object_str

    def _load_model(self):
        self._seg.load(self._model_path + "cws.model")
        self._pos.load(self._model_path + "pos.model")
        self._recognizer.load(self._model_path + "ner.model")

    def get_entity(self, sentence):
        words = self._seg.segment(sentence)
        pos = self._pos.postag(words)
        ner = self._recognizer.recognize(words, pos)
        entity = [w for w, s in zip(words, ner) if s != 'O']
        if entity:
            return "".join(entity) if len(entity) > 1 else entity[0]
Example #10
class pyltp_model():
    def __init__(self, LTP_DATA_DIR='/Users/didi/Desktop/ltp_data_v3.4.0'):
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(
            LTP_DATA_DIR, 'ner.model')  # NER model path; the model file is `ner.model`
        self.segmentor = Segmentor()  # initialize the instances
        self.postagger = Postagger()
        self.recognizer = NamedEntityRecognizer()

        self.segmentor.load(cws_model_path)  # load the models
        self.postagger.load(pos_model_path)
        self.recognizer.load(ner_model_path)

    def token(self, sentence):
        words = list(self.segmentor.segment(sentence))  # word segmentation
        postags = list(self.postagger.postag(words))  # POS tagging
        netags = list(self.recognizer.recognize(words, postags))  # named entity recognition
        result = []
        for i, j in zip(words, netags):
            if j in ['S-Nh', 'S-Ni', 'S-Ns']:
                result.append(j)
                continue
            result.append(i)
        return result

    def close(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()  # release the models
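A brief usage sketch:

model = pyltp_model()  # assumes the default model directory exists
print(model.token('李克强今天去了中国银行。'))  # single-token entities are replaced by 'S-Nh'/'S-Ni'/'S-Ns'
model.close()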
def run():
    # Segment the text and keep selected POS
    cont = open('key/pinglun_filter_all1.txt', 'r', encoding='utf-8')
    segmentor = Segmentor()  # initialize the segmenter
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('cws.model', 'userdict.txt')  # load the model with a user dictionary
    postagger = Postagger()  # initialize the tagger
    postagger.load('pos.model')  # load the model
    nwordall = []
    for sentence in cont:
        nword = ['']
        words = segmentor.segment(sentence)  # word segmentation
        # print(' '.join(words))  # the default way to print the result
        postags = postagger.postag(words)  # POS tagging
        for word, tag in zip(words, postags):
            # Choose which POS to keep:
            # print(word + '/' + tag)
            # only adverbs:
            # if tag == 'd':
            # filter out single characters:
            # if (tag == 'n' or tag == 'd' or tag == 'a') and len(word) > 1:
            # nouns and adjectives, for finding adjectives close to nouns via word2vec similarity:
            # if (tag == 'a' or tag == 'n') and len(word) > 1:
            if tag == 'n' and len(word) > 1:
                # print(word + tag)
                nword.append(word)
        nwordall.append(nword)
    cont.close()
    # size is the embedding dimension (number of features), window the context
    # window, min_count the frequency threshold below which words are ignored,
    # workers the thread count; very high dimensions can cause problems
    model = models.word2vec.Word2Vec(nwordall, size=10, window=5, min_count=100, workers=80)
    print('#############################################')
    sim = model.most_similar(positive=[u'餐饮'])  # old gensim API (pre-4.0)
    for s in sim:
        print("word:%s,similar:%s " % (s[0], s[1]))
Example #12
class NLP:
    default_model_dir = r'D:\python-file\knowledge_extraction-master-tyz\ltp_data_v3.4.0'  # LTP model directory

    def __init__(self, model_dir=default_model_dir):
        self.default_model_dir = model_dir

        # POS tagging model
        self.postagger = Postagger()
        self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))

    def get_postag(self, word):
        """Get the POS tag of a single word.
        Args:
            word: str, the word
        Returns:
            pos_tag: str, the POS tag of the word
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def close(self):
        """
        Close and release the models.
        """
        self.postagger.release()
Example #13
    def _load_testset(self):
        """
        Load the test set.
        :return:
        """
        par_model_path = os.path.join(self.ltp_dir, 'parser.model')
        pos_model_path = os.path.join(self.ltp_dir, 'pos.model')
        postagger = Postagger()
        postagger.load(pos_model_path)
        parser = Parser()
        parser.load(par_model_path)

        examples = []
        with open(os.path.join(self.data_dir, self.file_name)) as f:
            for l in tqdm(f):
                l = json.loads(l)
                # Segmentation, POS tagging, dependency parsing. The Chinese NER model is a
                # character-level (BERT) model, so strings are converted to character lists;
                # its output format is (entity, type, begin, end).
                text_seg = jieba.lcut(l['text'], HMM=False)
                poses = ' '.join(postagger.postag(text_seg)).split()
                arcs = parser.parse(text_seg, poses)
                arcses = ' '.join("%d:%s" % (arc.head, arc.relation)
                                  for arc in arcs).split()
                examples.append(
                    self.align_bert_4_inference(l, text_seg, arcses))

        postagger.release()
        parser.release()
        return examples
Example #14
def postags_opt(words):
    # Set pyltp postagger model path
    LTP_DATA_DIR = '../ltp_data_v3.4.0'
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')

    # Init postagger
    postagger = Postagger()

    # Load model
    postagger.load(pos_model_path)

    # Get postags
    postags = postagger.postag(words)

    # Close postagger
    postagger.release()

    postags = list(postags)

    # Init result list
    saying_words = []

    # Filter with tag 'verb'
    for index, tag in enumerate(postags):
        if tag == 'v':
            saying_words.append(words[index])

    return saying_words
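A quick sketch of calling it on a pre-segmented word list (assuming the relative model path inside the function resolves):

words = ['他', '说', '市场', '正在', '复苏']
saying = postags_opt(words)  # keeps only the words tagged 'v'
print(saying)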
Example #15
    def ltp_word(self):
        """Run segmentation, POS tagging, and related analyses over the sentence."""
        # word segmentation
        segmentor = Segmentor()
        segmentor.load(os.path.join(MODELDIR, "cws.model"))
        words = segmentor.segment(self.content)
        #print("*************segmentation*****************")
        #print("\t".join(words))

        # POS tagging
        postagger = Postagger()
        postagger.load(os.path.join(MODELDIR, "pos.model"))
        postags = postagger.postag(words)
        #print("*************POS tagging*************")
        #print(type(postags))
        #print("\t".join(postags))

        # dependency parsing
        parser = Parser()
        parser.load(os.path.join(MODELDIR, "parser.model"))
        arcs = parser.parse(words, postags)
        #print("*************dependency parsing*************")
        #print(type(arcs))
        #print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        # split the heads and relations out of the parse result
        arcs_head = []
        arcs_relation = []
        for arc in arcs:
            arcs_head.append(arc.head)
            arcs_relation.append(arc.relation)

        # named entity recognition
        recognizer = NamedEntityRecognizer()
        recognizer.load(os.path.join(MODELDIR, "ner.model"))
        netags = recognizer.recognize(words, postags)
        #print("*************named entity recognition*************")
        #print("\t".join(netags))
        """
        # semantic role labeling
        labeller = SementicRoleLabeller()
        labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        roles = labeller.label(words, postags, arcs)
        print("*************semantic role labeling*************")
        for role in roles:
            print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
        """

        segmentor.release()
        postagger.release()
        parser.release()
        recognizer.release()
        #labeller.release()

        # call list_conversion to turn the results into lists
        words_result = list_conversion(words, postags, netags, arcs_head,
                                       arcs_relation)

        return words_result
Example #16
class LtpTree(DepTree):
    def __init__(self, dict_path=None):
        super(LtpTree, self).__init__()
        print("Loading the LTP models... ...")
        self.segmentor = Segmentor()
        if dict_path is None:
            self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dict_path)
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))
        print("Models loaded.")

    def parse(self, sentence):
        self.words = self.segmentor.segment(sentence)
        self.postags = self.postagger.postag(self.words)
        self.arcs = self.parser.parse(self.words, self.postags)
        for i in range(len(self.words)):
            if self.arcs[i].head == 0:
                self.arcs[i].relation = "ROOT"

    def release_model(self):
        # release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
def namedEntityRecognize(sentence):
    '''
    Run named entity recognition on the sentence with pyltp.
    Returns: 1) a list of (entity, tag) tuples and 2) the list of entity tags
    '''
    namedEntityTagTupleList = []

    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()
    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()

    # package the results as (word, tag) tuples
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))

    neTagList = '\t'.join(netags).split('\t')

    return namedEntityTagTupleList, neTagList
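A usage sketch (the inout/index helpers that resolve the model paths come from the surrounding project):

pairs, tags = namedEntityRecognize('中国进出口银行与中国银行加强合作。')
print(pairs)  # e.g. [('中国进出口银行', 'S-Ni'), ('与', 'O'), ...]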
Example #18
def ner_data():
    # segmentation model
    segmentor = Segmentor()
    segmentor.load('cws.model')
    # POS tagging model
    postagger = Postagger()
    postagger.load('pos.model')
    # named entity recognition model
    recognizer = NamedEntityRecognizer()
    recognizer.load('ner.model')
    # load the data to be segmented
    data_csv = pd.read_csv('../data.csv', encoding='utf-8-sig')
    datas = data_csv['title']

    util = Utils()
    data_processed = open('./data_processed_recognizer.csv', 'w', encoding='utf-8')
    for data in datas:
        words = segmentor.segment(data)
        postags = postagger.postag(words)
        word_split = ' '.join(words).split(' ')
        netags = recognizer.recognize(words, postags)
        netag_split = ' '.join(netags).split(' ')
        concat_word = util.concat(word_split, netag_split, tag='netags')
        data_processed.write(concat_word + '\n')
    data_processed.close()
    segmentor.release()
    postagger.release()
    recognizer.release()
Example #19
    def get_postag_list(self, word_list, model):
        # run POS tagging and return the tags as a list
        postag = Postagger()
        postag.load(model)
        postag_list = list(postag.postag(word_list))
        postag.release()
        return postag_list
Example #20
class LtpLanguageAnalysis(object):
    def __init__(self, model_dir="/home/xxx/ltp-3.4.0/ltp_data/"):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(model_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(model_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(model_dir, "parser.model"))

    def analyze(self, text):
        # word segmentation
        words = self.segmentor.segment(text)
        print('\t'.join(words))

        # POS tagging
        postags = self.postagger.postag(words)
        print('\t'.join(postags))

        # dependency parsing
        arcs = self.parser.parse(words, postags)
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    def release_model(self):
        # release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
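A short usage sketch (the model_dir below is hypothetical):

ltp = LtpLanguageAnalysis(model_dir='/path/to/ltp_data')
ltp.analyze('中国进出口银行与中国银行加强合作。')
ltp.release_model()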
Example #21
def extract_views(all_sents):
    segmentor = Segmentor()
    segmentor.load(r'/home/student/project-01/ltp_data/cws.model')
    postagger = Postagger()
    postagger.load(r'/home/student/project-01/ltp_data/pos.model')
    parser = Parser()
    parser.load(r'/home/student/project-01/ltp_data/parser.model')
    views_in_sents = []
    for i, sents in enumerate(all_sents):
        views_tmp = []
        for sent in sents:
            sent = sent.replace('\\n', '\n').strip()
            if len(sent) == 0:
                continue
            # words = list(jieba.cut(sent))
            words = list(segmentor.segment(sent))
            contains = contain_candidates(words)
            if len(contains) == 0:
                continue
            tags = list(postagger.postag(words))
            arcs = list(parser.parse(words, tags))
            sbv, head = get_sbv_head(arcs, words, tags)
            if sbv[0] is None or head[0] is None or head[0] not in contains:
                continue
            subj = sbv[0]
            view = clean_view(words[head[1] + 1:])
            views_tmp.append((subj, view, i))
        if len(views_tmp) > 0:
            views_in_sents.append({'sents': sents, 'views': views_tmp})
    segmentor.release()
    postagger.release()
    parser.release()
    return views_in_sents
Example #22
class Parse_Util(object):
    def __init__(self, lexicon_path='./data/lexicon'):
        # word segmentation
        self.segmentor = Segmentor()
        # self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)
        self.segmentor.load(cws_model_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        # dependency parsing
        self.parser = Parser()
        self.parser.load(par_model_path)
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        # jieba user dictionary
        # jieba.load_userdict(lexicon_path)

    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    # parse a sentence
    def parse_sentence(self, sentence):
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        # child_dict_list = ParseUtil.build_parse_child_dict(words, arcs)

        return words, postags, netags, arcs
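A usage sketch (assuming the cws/pos/par/ner model path globals are set):

pu = Parse_Util()
words, postags, netags, arcs = pu.parse_sentence('中国进出口银行与中国银行加强合作。')
print(list(words))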
Example #23
class LTP_word():
    """docstring for parser_word
    deal() processes text and returns five values: word list, POS tags, dependency arcs, semantic roles, and named entities
    release() frees the cached models"""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # segmentation instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging instance
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load the model
        self.recognizer = NamedEntityRecognizer()  # named entity recognition instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeling instance
        self.labeller.load(path.join(self.model_path, 'srl'))
    def deal(self, text):  # extract everything we need in one pass
        words = self.segmentor.segment(text)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labeling
        return words, postags, arcs, roles, netags
    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
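A usage sketch (the model directory is hypothetical; it must contain the cws/pos/ner/parser models, the srl model directory, and dictionary_kfc.txt):

ltp = LTP_word('/path/to/ltp_data')
words, postags, arcs, roles, netags = ltp.deal('中国进出口银行与中国银行加强合作。')
ltp.release()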
class pyltp_impl(Seg):
    def __init__(self, dictpath, mode='seg'):
        super().__init__(mode)

        from pyltp import Segmentor
        from pyltp import Postagger
        from pyltp import NamedEntityRecognizer
        self.ltp_seg = Segmentor()
        self.ltp_pos = Postagger()
        self.ltp_ner = NamedEntityRecognizer()

        self.ltp_seg.load(os.path.join(dictpath, 'cws.model'))

        if mode != 'seg':
            self.ltp_pos.load(os.path.join(dictpath, 'pos.model'))

        if mode == 'ner':
            self.ltp_ner.load(os.path.join(dictpath, 'ner.model'))

    def impl_func(self, sentence):
        seg_res = self.ltp_seg.segment(sentence)
        if self.mode == 'seg':
            return seg_res

        pos_res = self.ltp_pos.postag(seg_res)
        if self.mode == 'postag':
            return [(word, tag) for (word, tag) in zip(seg_res, pos_res)]

        ner_res = self.ltp_ner.recognize(seg_res, pos_res)
        return [(word, tag) for (word, tag) in zip(seg_res, ner_res)]
def get_postag_list(words_list):

    postag = Postagger()
    postag.load(pos_model_path)
    postag_list = list(postag.postag(words_list))
    postag.release()
    return postag_list
Example #26
def test_ltp(document):

    LTP_DATA_DIR = r"D:\anaconda\envs\TF+3.5\Lib\site-packages\pyltp-model"
    # path to the LTP model directory
    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # dependency parser model path; the model file is `parser.model`
    cws_model_path = os.path.join(LTP_DATA_DIR,
                                  'cws.model')  # segmentation model path; the model file is `cws.model`
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS model path; the model file is `pos.model`

    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(document)  # word segmentation
    print("\nSegmentation result:")
    print('\t'.join(words))
    segmentor.release()  # release the model

    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    print("\nPOS tagging result:")
    print('\t'.join(postags))
    postagger.release()  # release the model

    parser = Parser()  # initialize the parser
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\nDependency parsing result:")
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
class LtpParser:
    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''Semantic role labeling'''
    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    '''Dependency parsing: build, for each word in the sentence, a dict of its dependent children'''
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index+1:   # arc indices start from 1
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # dependency head ids
        relation = [arc.relation for arc in arcs]  # dependency relations
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # look up the head word for each id
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i]-1, postags[rely_id[i]-1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main parser entry point'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
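A usage sketch:

ltp = LtpParser()  # expects the models under ./ltp_data_v3.4.0
words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main('李克强总理今天来我家了!')
print(format_parse_list[0])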
Example #28
def main():
    segmentor = Segmentor()
    segmentor.load('./cws.model')
    postagger = Postagger()
    postagger.load('./pos.model')
    file_object = open(sys.argv[1], 'r')
    sid = []
    output_list = []
    try:
        all_lines = file_object.readlines()
        lc = 0
        tot = 0
        for line in all_lines:
            output = []
            lc += 1
            item = line.split('\t')
            sid.append(item[0])
            sentence = item[1][0:-1]
            #print sentence.decode('utf-8')
            if (len(sentence.replace(' ', '')) != len(sentence)):
                tot += 1
                print(lc)
            sentence = sentence.replace(' ', '')
            word = segmentor.segment(sentence.encode('utf-8'))
            pos = postagger.postag(word)
            tag = []
            word = list(word)
            pos = list(pos)
            for i in range(len(word)):
                word[i] = word[i].decode('utf-8')
                pos[i] = pos[i].decode('utf-8')
            word, pos = wordToChar(word, pos)
            for i in range(len(word)):
                tag.append('O')
            for i in range(len(word)):
                output.append(word[i] + ' ' + pos[i] + ' ' + tag[i] + '\n')
            output.append('\n')
            output_list.append(output)
        print(tot)
    finally:
        file_object.close()

    file_object = open(sys.argv[2], 'w')
    negative_num = 0
    for i in range(len(output_list)):
        ff = 0
        for j in range(len(output_list[i])):
            output_list[i][j].encode('utf-8')
            if (output_list[i][j] != '\n'
                    and output_list[i][j].split(' ')[2][0] != 'O'):
                ff = 1
            file_object.write(output_list[i][j])
        if (ff == 0): negative_num += 1
    print(negative_num)
    file_object.close()

    file_object = open('SID.txt', 'w')
    for i in range(len(sid)):
        file_object.write(sid[i] + '\n')
    file_object.close()
Example #29
class Opinion(object):
    def __init__(self, Dsent, industry_id):
        self.industry_id = industry_id
        self.Dsent = Dsent
        self.postagger = Postagger()  # initialize the tagger
        self.postagger.load_with_lexicon(pos_model_path,
                                         '%s/conf/posttags.txt' % dir_path)
        self.sql = mysqls()
        self.opinionword = read_opinion(self.industry_id)
        self.n_v = []

    def cut_word(self, sents):
        # word segmentation
        words = [i.encode('utf-8', 'ignore')
                 for i in norm_cut(sents)]  # HMM=False
        return words

    def word_sex(self, ):
        # get the POS tags
        postags = list(self.postagger.postag(self.words))  # POS tagging
        num = 0
        # the noun/verb immediately following an adverb
        for tag in postags:
            if tag in ['d']:
                if num + 1 < len(postags):
                    if num != 0 and postags[num + 1] in ['n', 'v']:
                        if self.words[num+1] not in self.opinionword \
                            and len(self.words[num + 1].decode('utf-8','ignore')) > 1:
                            self.n_v.append(self.words[num + 1])
            # adjectives, idioms, and distinguishing words
            if tag in ['a', 'i', 'b']:
                if self.words[num] not in self.opinionword\
                        and len(self.words[num].decode('utf-8','ignore')) > 1:
                    self.n_v.append(self.words[num])
            num += 1
        return postags

    def prepare(self, ):
        for id, sentences in self.Dsent.items():
            split_sentence = re.split(
                ur'[,,()()、: …~?。!. !?]?',
                sentences.decode('utf-8', 'ignore').strip())
            for sent in split_sentence:
                self.words = self.cut_word(sent.encode('utf-8', 'ignore'))
                self.postags = self.word_sex()
                cword = Counter(self.n_v)

                lresult = heapq.nlargest(500,
                                         cword.items(),
                                         key=lambda x: x[1])
                # lword = []
                # for rg in lresult:
                #     w, n = rg
                #     lword.append(w)
                # self.sql.insert(self.industry_id, lword)
        self.postagger.release()  # release the model
        # self.parser.release()  # release the parser
        # outfile.close()
        return lresult
def new_relation_find(words, sentence):
    """Discover new relations.

    :param words:
    :param sentence:
    :return:
    """
    # dict holding the candidate entities of the triple
    tuple_dict = dict()
    index0 = -1
    index1 = -1
    found = False
    for entity_word in entity_words:
        if sentence.find(entity_word) != -1:
            if tuple_dict:
                # a True return means the two entities overlap
                if has_same(tuple_dict[index0], entity_word):
                    continue
                index1 = sentence.find(entity_word)
                tuple_dict[index1] = entity_word
                found = True
                break
            else:
                index0 = sentence.find(entity_word)
                tuple_dict[index0] = entity_word
    if found is False:
        return "", "", ""
    # sorting would turn the dict into a list
    # tuple_dict = sorted(tuple_dict.items(), key=lambda d: d[0])
    words = "/".join(words).split("/")
    for key, value in tuple_dict.items():
        tuple_word = value
        words = init_words(tuple_word, words)
    # POS-tag the rebuilt word list
    postagger = Postagger()  # initialize the tagger
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS model path; the model file is `pos.model`
    postagger.load_with_lexicon(pos_model_path, 'data/postagger.txt')  # load the model
    postags = postagger.postag(words)  # POS tagging
    print('\t'.join(postags))
    postagger.release()  # release the model
    # look for a new relation word between the two entities
    relation_word = ""
    index_word = 0
    for index, postag in enumerate('\t'.join(postags).split('\t')):
        index_word += len(words[index])
        if index_word >= len(sentence):
            break
        if postag == 'v' and index_word - min(index0, index1) <= 2 and max(index0, index1) - index_word <= 2 \
                and not has_same(tuple_dict[index0], words[index]) and not has_same(tuple_dict[index1],
                                                                                    words[index]) \
                and words[index] not in wrong_relation:
            relation_word = words[index]
            break
    if relation_word == "":
        return "", "", ""
    return tuple_dict[min(index0,
                          index1)], tuple_dict[max(index0,
                                                   index1)], relation_word
Example #31
def segmentation(filename, output_filename):

    print "segmenting '%s' to '%s'" % (filename, output_filename)

    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"

    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    
    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0]+"_ner.txt", "w")

    for _line in lines:
        line = _line[:-1]
        if line and line[-1] in "\n\r":
            line = line[:-1]
        
        words = segmentor.segment(line)
        postags = postagger.postag(words)
#        netags = recognizer.recognize(words, postags)
#        arcs = parser.parse(words, postags)

        for i in range(len(words)):
            f.write( "%s/%s\t" % (words[i], postags[i]))
#            if netags[i]!='O':
#                fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
#        fner.write("\n")

    f.close()
    fner.close()
    segmentor.release()
    postagger.release()
    recognizer.release()
    parser.release()
Example #32
def words_cixing(words=["中国","进出口","银行","与","中国银行","加强","合作"],type_list=0,pos=0):
    """POS tagging. If type_list is truthy, the tags are returned as a list.
    Tag set: LTP uses the 863 POS tag set.
    Tag descriptions: http://www.ltp-cloud.com/intro/
    If type_list is truthy, returns ['ns', 'v', 'n', 'c', 'ni', 'v', 'v'].
    If pos is truthy, returns ['中国/ns', '进出口/v', '银行/n', '与/c', '中国银行/ni', '加强/v', '合作/v'].
    By default returns the tagger's native iterable of tags.
    """
    if type(words)==str:
        words=split_words(words)
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    postags = postagger.postag(words)
    # list-of-string parameters are supported since pyltp 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    if type_list :
        return [i for i in postags]
    if pos:
        return ['{}/{}'.format(k,v)for k,v in zip(words,[i for i in postags])]
    return postags
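A usage sketch, following the return values documented in the docstring:

print(words_cixing(type_list=1))  # ['ns', 'v', 'n', 'c', 'ni', 'v', 'v']
print(words_cixing(pos=1))        # ['中国/ns', '进出口/v', ...]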
Example #33
    def __init__(self):
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model path; the model file is `cws.model`
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS model path; the model file is `pos.model`
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # NER model path; the model file is `ner.model`
        segmentor = Segmentor()
        segmentor.load(self.cws_model_path)
        self.words = segmentor.segment(data)  # `data` is assumed to be defined at module level
        # print("|".join(self.words))
        segmentor.release()

        postagger = Postagger()  # initialize the tagger
        postagger.load(self.pos_model_path)  # load the model
        self.postags = postagger.postag(self.words)  # POS tagging
        # print('\t'.join(self.postags))
        postagger.release()  # release the model

        recognizer = NamedEntityRecognizer()  # initialize the recognizer
        recognizer.load(self.ner_model_path)  # load the model
        self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
        # print('\t'.join(self.netags))
        recognizer.release()  # release the model
Example #34
MODELDIR=os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameters are supported since pyltp 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)