Example No. 1
 def get_postags(self, words):
     postagger = Postagger()  # initialize an instance
     postagger.load(self.pos_model_path)  # load the model
     postags = list(postagger.postag(words))  # POS tagging; copy the result before releasing the model
     print('\t'.join(postags))
     postagger.release()  # release the model
     return postags
Example No. 2
def get_all_name(r_filename,w_file):
    # global nlp
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # path to the LTP model directory
    # word segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load the model
    # POS tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load the model
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    f_r=open(r_filename,"r",encoding="utf-8")
    f_w=open(w_file,"w",encoding="utf-8")
    count=0
    for line in f_r:
        count+=1
        lines=line.strip("\n").replace(r"\n","")
    #    print("----------"+lines)
        words = segmentor.segment(lines)
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        sen=get_some_idea(line,netags,words)
        print(sen)
        if sen:
            for key in sen:
                sens="\t".join(list(set([data[1] for data in sen[key]])))
                f_w.write(key +"\t"+sens +"\n")
    # nlp.close()
    segmentor.release()
    postagger.release()
    recognizer.release()
    f_r.close()
    f_w.close()
Example No. 3
def ltp_pos_data():
    """POS tagging with LTP."""
    LTP_DATA_DIR = r'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # path to the LTP model directory (raw string, so the backslashes are not escapes)
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # POS model path; the file is named `pos.model`

    from pyltp import Postagger
    postagger = Postagger()  # initialize an instance
    postagger.load(pos_model_path)  # load the model
    result = []
    file = [(const.qc_train_seg, const.qc_train_pos),
            (const.qc_test_seg, const.qc_test_pos)]
    for i in range(2):
        with open(file[i][0], 'r', encoding='utf-8') as f:
            for line in f.readlines():
                attr = line.strip().split('\t')
                words = attr[1].split(" ")
                words_pos = postagger.postag(words)
                res = ' '.join([
                    "{}/_{}".format(words[i], words_pos[i])
                    for i in range(len(words))
                ])
                result.append("{}\t{}\n".format(attr[0], res))
        with open(file[i][1], 'w', encoding='utf-8') as f:
            f.writelines(result)
        result.clear()
    postagger.release()  # release the model
Example No. 4
class pyltp_model():
    def __init__(self, LTP_DATA_DIR='/Users/didi/Desktop/ltp_data_v3.4.0'):
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(
            LTP_DATA_DIR, 'ner.model')  # NER model path; the file is named `ner.model`
        self.segmentor = Segmentor()  # initialize an instance
        self.postagger = Postagger()  # initialize an instance
        self.recognizer = NamedEntityRecognizer()  # initialize an instance

        self.segmentor.load(cws_model_path)  # load the model
        self.postagger.load(pos_model_path)  # load the model
        self.recognizer.load(ner_model_path)  # load the model

    def token(self, sentence):
        words = self.segmentor.segment(sentence)  # word segmentation
        words = list(words)
        postags = self.postagger.postag(words)  # POS tagging
        postags = list(postags)
        netags = self.recognizer.recognize(words, postags)  # named entity recognition
        netags = list(netags)
        result = []
        for i, j in zip(words, netags):
            if j in ['S-Nh', 'S-Ni', 'S-Ns']:
                result.append(j)
                continue
            result.append(i)
        return result

    def close(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()  # release the models
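A minimal usage sketch for the class above (the model directory path is an assumption):

model = pyltp_model('/path/to/ltp_data_v3.4.0')  # hypothetical model directory
tokens = model.token('中国进出口银行与中国银行加强合作。')  # single-word entities come back as their NE tags
print(tokens)
model.close()  # release all three models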
Example No. 5
def run():
    # segmentation + word selection
    cont = open('key/pinglun_filter_all1.txt','r',encoding='utf-8')
    segmentor = Segmentor()  # initialize an instance
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('cws.model', 'userdict.txt')  # load the model plus a user dictionary
    postagger = Postagger()  # initialize an instance
    postagger.load('pos.model')  # load the model
    nwordall = []
    for sentence in cont:
        nword = ['']
        words = segmentor.segment(sentence)  # word segmentation
        # the default way to print the result:
        # print(' '.join(words))
        postags = postagger.postag(words)  # POS tagging
        for word, tag in zip(words, postags):
            # choose which POS tags to output:
            # print(word + '/' + tag)
            # keep only adverbs:
            # if tag == 'd':
            # filter out single characters:
            # if (tag == 'n' or tag == 'd' or tag == 'a') and len(word) > 1:
            # use word2vec similarity to find adjectives close to nouns:
            # if (tag == 'a' or tag == 'n') and len(word) > 1:
            if tag == 'n' and len(word) > 1:
                # print(word + tag)
                nword.append(word)
        nwordall.append(nword)
    # size is the word-vector dimensionality (the feature count), window the context window, min_count drops words below that frequency, workers the thread count; very high dimensionality can cause problems
    model = models.word2vec.Word2Vec(nwordall, size=10, window=5, min_count=100, workers=80)
    print('#############################################')
    sim = model.most_similar(positive=[u'餐饮'])
    for s in sim:
        print("word:%s, similar:%s" % (s[0], s[1]))
Example No. 6
def get_postag_list(words_list):

    postag = Postagger()
    postag.load(pos_model_path)
    postag_list = list(postag.postag(words_list))
    postag.release()
    return postag_list
Example No. 7
class pyltp_impl(Seg):
    def __init__(self, dictpath, mode='seg'):
        super().__init__(mode)

        from pyltp import Segmentor
        from pyltp import Postagger
        from pyltp import NamedEntityRecognizer
        self.ltp_seg = Segmentor()
        self.ltp_pos = Postagger()
        self.ltp_ner = NamedEntityRecognizer()

        self.ltp_seg.load(os.path.join(dictpath, 'cws.model'))

        if mode != 'seg':
            self.ltp_pos.load(os.path.join(dictpath, 'pos.model'))

        if mode == 'ner':
            self.ltp_ner.load(os.path.join(dictpath, 'ner.model'))

    def impl_func(self, sentence):
        seg_res = self.ltp_seg.segment(sentence)
        if self.mode == 'seg':
            return seg_res

        pos_res = self.ltp_pos.postag(seg_res)
        if self.mode == 'postag':
            return [(word, tag) for (word, tag) in zip(seg_res, pos_res)]

        ner_res = self.ltp_ner.recognize(seg_res, pos_res)
        return [(word, tag) for (word, tag) in zip(seg_res, ner_res)]
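A minimal usage sketch, assuming Seg is the project's base class whose constructor stores mode (the model directory is hypothetical):

seg = pyltp_impl('/path/to/ltp_data_v3.4.0', mode='postag')
print(seg.impl_func('中国进出口银行与中国银行加强合作。'))  # [(word, pos_tag), ...]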
Example No. 8
def ner_data():
    # segmentation model
    segmentor = Segmentor()
    segmentor.load('cws.model')
    # POS tagging model
    postagger = Postagger()
    postagger.load('pos.model')
    # named entity model
    recognizer = NamedEntityRecognizer()
    recognizer.load('ner.model')
    # load the data to be segmented
    data_csv = pd.read_csv('../data.csv', encoding='utf-8-sig')
    datas = data_csv['title']

    util = Utils()
    data_processed = open('./data_processed_recognizer.csv', 'w', encoding='utf-8')
    for data in datas:
        words = segmentor.segment(data)
        postags = postagger.postag(words)
        word_split = ' '.join(words).split(' ')
        netags = recognizer.recognize(words, postags)
        netag_split = ' '.join(netags).split(' ')
        concat_word = util.concat(word_split, netag_split, tag='netags')
        data_processed.write(concat_word + '\n')
    data_processed.close()
    segmentor.release()
    postagger.release()
    recognizer.release()
Example No. 9
def test_ltp(document):

    LTP_DATA_DIR = r"D:\anaconda\envs\TF+3.5\Lib\site-packages\pyltp-model"
    # ltp模型目录的路径
    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # 依存句法分析模型路径,模型名称为`parser.model`
    cws_model_path = os.path.join(LTP_DATA_DIR,
                                  'cws.model')  # 分词模型路径,模型名称为`cws.model`
    pos_model_path = os.path.join(LTP_DATA_DIR,
                                  'pos.model')  # 词性标注模型路径,模型名称为`pos.model`

    segmentor = Segmentor()  # 初始化实例
    segmentor.load(cws_model_path)  # 加载模型
    words = segmentor.segment(document)  # 分词
    print("\nA")
    print("分词结果:")
    print('\t'.join(words))
    segmentor.release()  # 释放模型

    postagger = Postagger()  # initialize an instance
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    print("\n")
    print("POS tagging result:")
    print('\t'.join(postags))
    postagger.release()  # release the model

    parser = Parser()  # initialize an instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\n")
    print("Dependency parsing result:")
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model
Example No. 10
class Parse_Util(object):
    def __init__(self, lexicon_path='./data/lexicon'):
        # word segmentation
        self.segmentor = Segmentor()
        # self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)
        self.segmentor.load(cws_model_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        # dependency parsing
        self.parser = Parser()
        self.parser.load(par_model_path)
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        # jieba segmentation
        # jieba.load_userdict(lexicon_path)

    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    # parse a sentence
    def parse_sentence(self, sentence):
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        # child_dict_list = ParseUtil.build_parse_child_dict(words, arcs)

        return words, postags, netags, arcs
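A minimal usage sketch; cws_model_path, pos_model_path, par_model_path and ner_model_path are assumed to be module-level globals pointing at the LTP model files:

pu = Parse_Util()
words, postags, netags, arcs = pu.parse_sentence('中国进出口银行与中国银行加强合作。')
print(list(words), list(postags), list(netags))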
Example No. 11
class ModelLoader:
    __instance = None

    def __new__(cls):
        if cls.__instance is None:
            cls.__instance = super(ModelLoader, cls).__new__(cls)
            cls.__instance.__initialized = False
        return cls.__instance

    def __init__(self):
        if self.__initialized:
            return
        self.__initialized = True
        LTP_DIR = "./ltp_data"
        # customized segmentation, with post-processing that adjusts POS tags
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DIR, "cws.model"),
            os.path.join(LTP_DIR, 'customized.txt'))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

        self.sentenceSplitter = SentenceSplitter()
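Because __new__ caches the instance, constructing ModelLoader repeatedly reuses the models that were loaded the first time:

a = ModelLoader()
b = ModelLoader()
assert a is b  # singleton: the LTP models are loaded only once per process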
Example No. 12
    def __init__(self, config):
        self.config = config
        random_seed = config['random_seed']
        random.seed(random_seed)
        torch.manual_seed(random_seed) # cpu
        torch.cuda.manual_seed(random_seed) #gpu
        np.random.seed(random_seed) #numpy

        if self.config['use_bert']:
            self.tokenizer = BertTokenizer.from_pretrained(self.config['bert_model_name'], cache_dir=config['bert_dir'])
        elif self.config['use_xlnet']:
            self.tokenizer = XLNetTokenizer.from_pretrained('hfl/chinese-xlnet-base', cache_dir=config['xlnet_dir'])
        elif self.config['use_transformer'] or self.config['use_rnn_basic_encoder']:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', cache_dir=config['bert_dir'])
        else:
            raise Exception('Basic encoders other than these are not supported')
        self.latest_epoch = 0

        if self.config['cut_word_task'] or self.config['pos_tag_task'] or self.config['parser_task']:
            cws_model_path = os.path.join(self.config['ltp_path'], 'cws.model')
            segmentor = Segmentor()
            segmentor.load(cws_model_path)
            self.segmentor = segmentor
        if self.config['pos_tag_task'] or self.config['parser_task']:
            pos_model_path = os.path.join(self.config['ltp_path'], 'pos.model')
            postagger = Postagger()
            postagger.load(pos_model_path)
            self.postagger = postagger
        if self.config['parser_task']:
            parser_model_path = os.path.join(self.config['ltp_path'], 'parser.model')
            parser = Parser()
            parser.load(parser_model_path)
            self.parser = parser
Example No. 13
class LTP_word():
    """Wrapper around the pyltp pipeline.
    deal() processes text and returns five results: words, POS tags, dependency arcs, semantic roles, and named entities.
    release() frees the loaded models."""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # segmentation instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging instance
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load the model
        self.recognizer = NamedEntityRecognizer()  # named entity recognition instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labelling instance
        self.labeller.load(path.join(self.model_path, 'srl'))
    def deal(self, text):  # extract everything we need in one pass
        words = self.segmentor.segment(text)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labelling (4-argument form of the older pyltp API)
        return words, postags, arcs, roles, netags
    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
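A minimal usage sketch (a model directory laid out like LTP 3.x, including the `srl` folder and `dictionary_kfc.txt`, is an assumption):

ltp = LTP_word('/path/to/ltp_data')  # hypothetical path
words, postags, arcs, roles, netags = ltp.deal('中国进出口银行与中国银行加强合作。')
print(list(words), list(netags))
ltp.release()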
Example No. 14
class Ltp(NerModel):
    def __init__(self):
        super(Ltp, self).__init__()
        self._model_path = "./model/ltp/"
        self._seg = Segmentor()
        self._pos = Postagger()
        self._recognizer = NamedEntityRecognizer()
        self._load_model()
        self._object_str = "[INFO] This is an ltp object!"
        print("[INFO] All models are loaded!")

    def __repr__(self):
        return self._object_str

    def _load_model(self):
        self._seg.load(self._model_path + "cws.model")
        self._pos.load(self._model_path + "pos.model")
        self._recognizer.load(self._model_path + "ner.model")

    def get_entity(self, sentence):
        words = self._seg.segment(sentence)
        pos = self._pos.postag(words)
        ner = self._recognizer.recognize(words, pos)
        entity = [w for w, s in zip(words, ner) if s != 'O']
        if entity:
            return "".join(entity) if len(entity) > 1 else entity[0]
Example No. 15
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load the model plus a user lexicon
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load the model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load the model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """
        Release the loaded models.
        """
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
Example No. 16
def word_pos():
    # POS tagging with LTP
    candidate = pd.read_csv(r'../data/candidate_sentiment.csv', header=None)
    can_word = candidate[0].tolist()
    # add two new columns to hold the POS tags
    candidate.insert(2, 'ltp_pos', 0)
    candidate.insert(3, 'jieba_pos', 0)
    candidate.columns = ['word', 'freq', 'ltp_pos', 'jieba_pos']

    LTP_DATA_DIR = '../ltp_data_v3.4.0/ltp_data_v3.4.0'  # path to the LTP model directory
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS model path; the file is named `pos.model`
    
    
    postagger = Postagger()  # initialize an instance
    postagger.load(pos_model_path)  # load the model
    postags = list(postagger.postag(can_word))  # POS tagging; copy the result before releasing the model
    postagger.release()  # release the model
    candidate['ltp_pos'] = postags
    # POS tagging with jieba
    
    jieba_pos=[]
    for index,row in candidate.iterrows():
        s=row['word']
        words=pseg.cut(s)
        pos=[]
        for w in words:
            pos.append(w.flag)
        pos=' '.join(pos)
        jieba_pos.append(pos)
    
    candidate['jieba_pos']=jieba_pos
    # write back with the header row
    candidate.to_csv(r'../data/candidate_sentiment.csv', index=None)
Example No. 17
def locationNER(text):
    results = []
    # first, word segmentation
    segmentor = Segmentor()  # initialize an instance
    segmentor.load(cws_model_path)  # load the model
    words = list(segmentor.segment(text))  # segment; copy before release
    # print('\t'.join(words))
    segmentor.release()

    # then POS tagging
    postagger = Postagger()  # initialize an instance
    postagger.load(pos_model_path)  # load the model
    postags = list(postagger.postag(words))  # POS tagging; copy before release
    postagger.release()  # release the model

    # finally, recognize location/organization entities

    recognizer = NamedEntityRecognizer()  # initialize an instance
    recognizer.load(ner_model_path)  # load the model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    for i in range(0, len(netags)):
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            results.append(words[i-1] + words[i] + words[i+1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    recognizer.release()
    return results
Example No. 18
def cut_words():
    # segmentation + removal of empty lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('resource_new.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize an instance
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('module/cws.model',
                                'userdict.txt')  # load the model plus a user dictionary
    postagger = Postagger()  # initialize an instance
    postagger.load('module/pos.model')  # load the model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # 分词
            pos_tags = postagger.postag(words)  # 词性标注
            for word, tag in zip(words, pos_tags):
                if tag != 'wp':
                    f.write(word)
                else:
                    f.write('\n')
            f.write('\n')
        else:
            continue
    f.close()
    segmentor.release()
    postagger.release()
Example No. 19
def segmentsentence(sentence):
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    recognizer = NamedEntityRecognizer()

    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    # parser.load("./ltpdata/ltp_data_v3.4.0/parser.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")
    #############
    entity_list = []  # assumed in the original to be a module-level list; made local so the snippet runs standalone
    word_list = segmentor.segment(sentence)
    postags_list = postagger.postag(word_list)
    nertags = recognizer.recognize(word_list, postags_list)
    ############
    for word, ntag in zip(word_list, nertags):
        if ntag.endswith('Nh'):  # person tags are S-Nh/B-Nh/I-Nh/E-Nh, never bare 'Nh'
            entity_list.append(word)
    print(" ".join(word_list))
    print(' '.join(nertags))
    ############
    segmentor.release()
    postagger.release()
    # parser.release()
    recognizer.release()
    return word_list
Example No. 20
def postags_opt(words):
    # Set pyltp postagger model path
    LTP_DATA_DIR = '../ltp_data_v3.4.0'
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')

    # Init postagger
    postagger = Postagger()

    # Load model
    postagger.load(pos_model_path)

    # Get postags
    postags = postagger.postag(words)

    # Close postagger
    postagger.release()

    postags = list(postags)

    # Init result list
    saying_words = []

    # Filter with tag 'verb'
    for index, tag in enumerate(postags):
        if tag == 'v':
            saying_words.append(words[index])

    return saying_words
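A minimal usage sketch for postags_opt; the word list would normally come from a segmenter, and the relative model path above is assumed to exist:

print(postags_opt(['他', '说', '今天', '天气', '很', '好']))  # keeps only the words tagged 'v'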
Example No. 21
def extract_views(all_sents):
    segmentor = Segmentor()
    segmentor.load(r'/home/student/project-01/ltp_data/cws.model')
    postagger = Postagger()
    postagger.load(r'/home/student/project-01/ltp_data/pos.model')
    parser = Parser()
    parser.load(r'/home/student/project-01/ltp_data/parser.model')
    views_in_sents = []
    for i, sents in enumerate(all_sents):
        views_tmp = []
        for sent in sents:
            sent = sent.replace('\\n', '\n').strip()
            if len(sent) == 0:
                continue
            # words = list(jieba.cut(sent))
            words = list(segmentor.segment(sent))
            contains = contain_candidates(words)
            if len(contains) == 0:
                continue
            tags = list(postagger.postag(words))
            arcs = list(parser.parse(words, tags))
            sbv, head = get_sbv_head(arcs, words, tags)
            if sbv[0] is None or head[0] is None or head[0] not in contains:
                continue
            subj = sbv[0]
            view = clean_view(words[head[1] + 1:])
            views_tmp.append((subj, view, i))
        if len(views_tmp) > 0:
            views_in_sents.append({'sents': sents, 'views': views_tmp})
    segmentor.release()
    postagger.release()
    parser.release()
    return views_in_sents
Example No. 22
    def _load_testset(self):
        """
        加载测试集
        :return:
        """
        par_model_path = os.path.join(self.ltp_dir, 'parser.model')
        pos_model_path = os.path.join(self.ltp_dir, 'pos.model')
        postagger = Postagger()
        postagger.load(pos_model_path)
        parser = Parser()
        parser.load(par_model_path)

        examples = []
        with open(os.path.join(self.data_dir, self.file_name)) as f:
            for l in tqdm(f):
                l = json.loads(l)
                # segmentation / POS / NER: the Chinese NER model (BERT) is character-level, so list() turns a string into a character list; its output format is (entity, type, begin, end)
                text_seg = jieba.lcut(l['text'], HMM=False)
                poses = ' '.join(postagger.postag(text_seg)).split()
                arcs = parser.parse(text_seg, poses)
                arcses = ' '.join("%d:%s" % (arc.head, arc.relation)
                                  for arc in arcs).split()
                examples.append(
                    self.align_bert_4_inference(l, text_seg, arcses))

        postagger.release()
        parser.release()
        return examples
Example No. 23
class LtpTree(DepTree):
    def __init__(self, dict_path=None):
        super(DepTree, self).__init__()
        print("Loading the LTP models...")
        self.segmentor = Segmentor()
        if dict_path is None:
            self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dict_path)
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))
        print("Models loaded.")

    def parse(self, sentence):
        self.words = self.segmentor.segment(sentence)
        self.postags = self.postagger.postag(self.words)
        self.arcs = self.parser.parse(self.words, self.postags)
        for i in range(len(self.words)):
            if self.arcs[i].head == 0:
                self.arcs[i].relation = "ROOT"

    def release_model(self):
        # release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
Example No. 24
def namedEntityRecognize(sentence):
    '''
        Run named entity recognition with pyltp.
        Returns: 1) a list of (word, NE tag) tuples, 2) a list of NE tags
    '''
    namedEntityTagTupleList = []

    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS),
                                inout.getResourcePath('userDic.txt'))
    words = list(segmentor.segment(sentence))  # copy before release
    segmentor.release()
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = list(postagger.postag(words))  # copy before release
    postagger.release()
    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = list(recognizer.recognize(words, postags))  # copy before release
    recognizer.release()

    # pack the results into (word, tag) tuples
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))

    neTagList = list(netags)

    return namedEntityTagTupleList, neTagList
Example No. 25
 def get_postag_list(self, word_list, model):
     # get the POS tag list
     postag = Postagger()
     postag.load(model)
     postag_list = list(postag.postag(word_list))
     postag.release()
     return postag_list
Example No. 26
def init_pyltp(model_dir, dict_file=None):
    '''
    Initialize the pyltp modules.
    :param  model_dir   path to the models
    :param  dict_file   external user dictionary for the segmenter
    :return segmentor, postagger, parser, ner
    '''
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    ner = NamedEntityRecognizer()

    cws_model = os.path.join(model_dir, 'cws.model')
    pos_model = os.path.join(model_dir, 'pos.model')
    parser_model = os.path.join(model_dir, 'parser.model')
    ner_model = os.path.join(model_dir, 'ner.model')

    if dict_file:
        segmentor.load_with_lexicon(cws_model, dict_file)
    else:
        segmentor.load(cws_model)
    postagger.load(pos_model)
    ner.load(ner_model)
    parser.load(parser_model)
    return segmentor, postagger, parser, ner
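A minimal usage sketch for the initializer above (the model path is hypothetical):

segmentor, postagger, parser, ner = init_pyltp('/path/to/ltp_data_v3.4.0')
words = segmentor.segment('中国进出口银行与中国银行加强合作。')
postags = postagger.postag(words)
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in parser.parse(words, postags)))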
Example No. 27
class LtpLanguageAnalysis(object):
    def __init__(self, model_dir="/home/xxx/ltp-3.4.0/ltp_data/"):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(model_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(model_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(model_dir, "parser.model"))

    def analyze(self, text):
        # word segmentation
        words = self.segmentor.segment(text)
        print('\t'.join(words))

        # POS tagging
        postags = self.postagger.postag(words)
        print('\t'.join(postags))

        # dependency parsing
        arcs = self.parser.parse(words, postags)
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    def release_model(self):
        # release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
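A minimal usage sketch (the model directory is an assumption):

ltp = LtpLanguageAnalysis(model_dir='/path/to/ltp_data')
ltp.analyze('中国进出口银行与中国银行加强合作。')
ltp.release_model()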
Example No. 28
    def ltp_word(self):
        """Run segmentation, POS tagging, parsing and NER on the sentence."""
        # word segmentation
        segmentor = Segmentor()
        segmentor.load(os.path.join(MODELDIR, "cws.model"))
        words = segmentor.segment(self.content)
        # print("*************segmentation*****************")
        # print("\t".join(words))

        # POS tagging
        postagger = Postagger()
        postagger.load(os.path.join(MODELDIR, "pos.model"))
        postags = postagger.postag(words)
        # print("*************POS tagging*************")
        # print(type(postags))
        # print("\t".join(postags))

        # dependency parsing
        parser = Parser()
        parser.load(os.path.join(MODELDIR, "parser.model"))
        arcs = parser.parse(words, postags)
        # print("*************dependency parsing*************")
        # print(type(arcs))
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

        # split the heads and relations out of the parse result
        arcs_head = []
        arcs_relation = []
        for arc in arcs:
            arcs_head.append(arc.head)
            arcs_relation.append(arc.relation)

        # named entity recognition
        recognizer = NamedEntityRecognizer()
        recognizer.load(os.path.join(MODELDIR, "ner.model"))
        netags = recognizer.recognize(words, postags)
        # print("*************named entity recognition*************")
        # print("\t".join(netags))
        """
        # semantic role labelling
        labeller = SementicRoleLabeller()
        labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        roles = labeller.label(words, postags, arcs)
        print("*************semantic role labelling*************")
        for role in roles:
            print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
        """

        segmentor.release()
        postagger.release()
        parser.release()
        recognizer.release()
        #labeller.release()

        # call list_conversion to turn the results into plain lists
        words_result = list_conversion(words, postags, netags, arcs_head,
                                       arcs_relation)

        return words_result
Example No. 29
class LtpParser:
    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''semantic role labelling'''
    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    '''dependency parsing --- for each word in the sentence, keep a dict of its dependent children'''
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:   # arc heads are 1-based
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # head id of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # map head ids to the head words
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''main entry point'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
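A minimal usage sketch for LtpParser; each row of format_parse_list follows the layout shown in the comment above:

ltp = LtpParser()
words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main('李克强总理今天考察辽宁。')
for row in format_parse_list:
    print(row)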
Example No. 30
def main():
    segmentor = Segmentor()
    segmentor.load('./cws.model')
    postagger = Postagger()
    postagger.load('./pos.model')
    file_object = open(sys.argv[1], 'r')
    sid = []
    output_list = []
    try:
        all_lines = file_object.readlines()
        lc = 0
        tot = 0
        for line in all_lines:
            output = []
            lc += 1
            item = line.split('\t')
            sid.append(item[0])
            sentence = item[1][0:-1]
            # print(sentence)
            if len(sentence.replace(' ', '')) != len(sentence):
                tot += 1
                print(lc)
            sentence = sentence.replace(' ', '')
            # pyltp for Python 3 takes and returns str, so no encode/decode round-trip is needed
            word = list(segmentor.segment(sentence))
            pos = list(postagger.postag(word))
            word, pos = wordToChar(word, pos)
            tag = ['O'] * len(word)
            for i in range(len(word)):
                output.append(word[i] + ' ' + pos[i] + ' ' + tag[i] + '\n')
            output.append('\n')
            output_list.append(output)
        print(tot)
    finally:
        file_object.close()

    file_object = open(sys.argv[2], 'w')
    negative_num = 0
    for i in range(len(output_list)):
        ff = 0
        for j in range(len(output_list[i])):
            if (output_list[i][j] != '\n'
                    and output_list[i][j].split(' ')[2][0] != 'O'):
                ff = 1
            file_object.write(output_list[i][j])
        if ff == 0:
            negative_num += 1
    print(negative_num)
    file_object.close()

    file_object = open('SID.txt', 'w')
    for i in range(len(sid)):
        file_object.write(sid[i] + '\n')
    file_object.close()
Example No. 31
def segmentation(filename, output_filename):

    print("segmenting '%s' to '%s'" % (filename, output_filename))

    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"

    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    
    # Named Entity Recognize
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    
    # Parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    
    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0]+"_ner.txt", "w")

    for _line in lines:
        line = _line[:-1]
        if line[-1] in "\n\r":
            line = line[:-1]
        
        words = segmentor.segment(line)
        postags = postagger.postag(words)
#        netags = recognizer.recognize(words, postags)
#        arcs = parser.parse(words, postags)

        for i in range(len(words)):
            f.write( "%s/%s\t" % (words[i], postags[i]))
#            if netags[i]!='O':
#                fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
#        fner.write("\n")

    f.close()
    fner.close()
    segmentor.release()
    postagger.release()
    recognizer.release()
    parser.release()
Example No. 32
def words_cixing(words=["中国","进出口","银行","与","中国银行","加强","合作"], type_list=0, pos=0):
    """POS tagging. LTP uses the 863 POS tag set; see http://www.ltp-cloud.com/intro/
    If type_list is truthy, returns the tags as a plain list, e.g. ['ns', 'v', 'n', 'c', 'ni', 'v', 'v'].
    If pos is truthy, returns word/tag pairs, e.g. ['中国/ns', '进出口/v', '银行/n', '与/c', '中国银行/ni', '加强/v', '合作/v'].
    By default the native tag sequence is returned.
    """
    if isinstance(words, str):
        words = split_words(words)
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    postags = postagger.postag(words)
    # a list-of-strings parameter is supported since 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    if type_list:
        return list(postags)
    if pos:
        return ['{}/{}'.format(k, v) for k, v in zip(words, postags)]
    return postags
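The three return modes, as a short usage sketch (split_words and MODELDIR are assumed to be defined elsewhere in the module):

print(words_cixing(type_list=1))  # ['ns', 'v', 'n', 'c', 'ni', 'v', 'v']
print(words_cixing(pos=1))        # ['中国/ns', '进出口/v', ..., '合作/v']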
Example No. 33
    def __init__(self):
        self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model path; the file is named `cws.model`
        self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS model path; the file is named `pos.model`
        self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # NER model path; the file is named `ner.model`
        segmentor = Segmentor()
        segmentor.load(self.cws_model_path)
        self.words = list(segmentor.segment(data))  # copy before release
        # print("|".join(self.words))
        segmentor.release()

        postagger = Postagger()  # initialize an instance
        postagger.load(self.pos_model_path)  # load the model
        self.postags = list(postagger.postag(self.words))  # POS tagging; copy before release
        # print('\t'.join(self.postags))
        postagger.release()  # release the model

        recognizer = NamedEntityRecognizer()  # initialize an instance
        recognizer.load(self.ner_model_path)  # load the model
        self.netags = list(recognizer.recognize(self.words, self.postags))  # named entity recognition
        # print('\t'.join(self.netags))
        recognizer.release()  # release the model
Example No. 34
File: psg_proc.py  Project: bsnsk/QA
def main():

    f = open("psgs.txt", "r")
    lines = [line.rstrip() for line in f.readlines()]
    f.close()

    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))

    f = open("../questions/q_facts_segged_clf.txt", "r")
    types = f.readlines()
    f.close()

    f = open("../questions/provided/q_facts.txt", "r")
    questions = [line.rstrip() for line in f.readlines()]
    f.close()

    f = open("psgs_segged.txt", "w")
    fans = open("zhidao_answer.txt", "w")
    i = 0
    qid = 0
    flag = 0

    while i < len(lines):
        line = lines[i]
        if i % 50000 == 0:
            print("\r#\t%d" % i, end='')
            sys.stdout.flush()
        if line.startswith("<question"):
            qid = int(line.split(" ")[1].split("=")[1].split(">")[0])
            flag = 0
            f.write(line + "\n")
        elif line.startswith("</doc") or line.startswith("</question"):
            f.write(line + "\n")
        elif line.startswith("<doc"):
            f.write(line + "\n" + lines[i+1] + "\n")
            i += 2
        else:
            L = len(line)
            s = 0
            for s in range(L):
                if line[s:].startswith("最佳答案:") \
                        or line[s:].startswith("[专业]答案")\
                        or line[s:].startswith("、"+questions[qid-1]):
                    break
            if line[s:].startswith("最佳答案"):
                s += 14
            elif line[s:].startswith("[专业]答案"):
                s += 15
            elif line[s:].startswith("、"+questions[qid-1]):
                s += len(questions[qid-1])+1
            if s < L and flag == 0:
                t = s + 1
                while t < L and line[t:].startswith("更多") == False\
                        and not (t+2<L and line[t]==" " and line[t+1] in "0123456789" and line[t+2] in "0123456789")\
                        and not line[t:].startswith("~")\
                        and not line[t:].startswith("?")\
                        and not line[t:].startswith("!")\
                        and not line[t:].startswith("。"):
                    t += 1
                if s < t and t-s < 200 and t-s > 1:
                    ans = line[s:t].rstrip(".。 ??,,")
                    if types[qid-1].rstrip() == "Q_number":
                        ans = first_con_number(ans)
                    fans.write("%d\t%s\n" % (qid, ans))
                    flag = 1
#            words = segmentor.segment(line)
#            postags = postagger.postag(words)
#            for j in range(len(words)):
#                f.write("%s/%s\t" % (words[j], postags[j]))
#            f.write("\n")
        i += 1
    f.close()
    fans.close()
Example No. 35
# Set your own model path
MODELDIR=os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-strings parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)