def __init__(self,
                 articles_filename='articles.csv',
                 record_filename='record.csv',
                 rule_reference_filename='rule_reference.txt',
                 LTP_DIR="ltp_data_v3.4.0/",
                 filter_dictionary=['有限公司']):

        self.articles_filename = articles_filename
        self.record_filename = record_filename
        self.rule_reference_filename = rule_reference_filename
        ############################### load the LTP models #########################################
        self.LTP_DIR = LTP_DIR
        # segmentation model
        self.segmentor = pyltp.Segmentor()
        self.segmentor.load(os.path.join(self.LTP_DIR, "cws.model"))
        # POS tagging model
        self.postagger = pyltp.Postagger()
        self.postagger.load(os.path.join(self.LTP_DIR, 'pos.model'))
        # named entity recognition model
        self.recognizer = pyltp.NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.LTP_DIR, 'ner.model'))
        # dependency parser
        self.parser = pyltp.Parser()
        self.parser.load(os.path.join(self.LTP_DIR, 'parser.model'))
        self.filter_dictionary = filter_dictionary

        self.left_postags_dict = {}
        self.left_word_dict = {}
        self.mid_postags_dict = {}
        self.mid_word_dict = {}
        self.right_postags_dict = {}
        self.right_word_dict = {}
        self.CMP_dict = {}
        self.SBV_dict = {}
        self.VOB_dict = {}
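A hypothetical helper sketch (not in the original class; the method name and return shape are illustrative) of how the models loaded above are typically chained to collect the SBV/VOB/CMP relations tracked in the dicts:

    def _collect_relations(self, sentence):
        # hypothetical helper: chain segmentation -> POS tagging -> dependency parsing
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        relations = []
        for i, arc in enumerate(arcs):
            # arc.head is 1-based (0 denotes the pseudo ROOT); arc.relation is e.g. 'SBV', 'VOB', 'CMP'
            if arc.relation in ('SBV', 'VOB', 'CMP'):
                relations.append((arc.relation, words[i], words[arc.head - 1]))
        return relations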
    def __init__(self, model_dir_path, blacklist_path):
        '''
        model_dir_path: path to the pyltp model files
        blacklist_path: path to the company-name blacklist file
        '''
        # initialize the model file paths
        self.model_dir_path = model_dir_path
        self.cws_model_path = os.path.join(
            self.model_dir_path, 'cws.model')  # segmentation model, file name `cws.model`
        self.pos_model_path = os.path.join(
            self.model_dir_path, 'pos.model')  # POS tagging model, file name `pos.model`
        self.ner_model_path = os.path.join(
            self.model_dir_path, 'ner.model')  # named entity recognition model, file name `ner.model`

        # initialize the segmentation model
        self.segmentor = pyltp.Segmentor()
        self.segmentor.load(self.cws_model_path)

        # initialize the POS tagging model
        self.postagger = pyltp.Postagger()
        self.postagger.load(self.pos_model_path)

        # initialize the NER model
        self.recognizer = pyltp.NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)

        # initialize the company-name blacklist
        self.com_blacklist = set()
        with open(blacklist_path, 'r', encoding='utf-8') as f_com_blacklist:
            for line in f_com_blacklist:
                if len(line.strip()) > 0:
                    self.com_blacklist.add(line.strip())
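A hypothetical extraction method (not part of the original class; the name and tag handling are a sketch) showing how the NER model and the company blacklist above would typically be combined, assuming LTP's B/I/E/S-Ni tagging scheme for organization names:

    def extract_companies(self, sentence):
        # hypothetical helper: collect organization entities (Ni) that are not blacklisted
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        netags = list(self.recognizer.recognize(words, postags))
        companies, buf = [], []
        for word, tag in zip(words, netags):
            if tag in ('B-Ni', 'I-Ni'):        # start of / inside an organization name
                buf.append(word)
            elif tag in ('E-Ni', 'S-Ni'):      # end of / single-word organization name
                buf.append(word)
                name = ''.join(buf)
                if name not in self.com_blacklist:
                    companies.append(name)
                buf = []
            else:
                buf = []
        return companies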
 def build_files(self):
     """ 遍历原始文档,进行分词词性标注,去除停用词等,创建FileItem类集合 """
     files = []
     category_id = 0
     segmentor = pyltp.Segmentor()
     segmentor.load(
         r'C:\Users\51694\PycharmProjects\paper\ltp_model\cws.hyp')
     postagger = pyltp.Postagger()
     postagger.load(
         r'C:\Users\51694\PycharmProjects\paper\ltp_model\pos.hyp')
     parser = pyltp.Parser()
     parser.load(
         r'C:\Users\51694\PycharmProjects\paper\ltp_model\parser.hyp')
     for ids, path in enumerate(self.file_paths()):
         with open(path, 'r', encoding='utf-8') as f:
             try:
                 category = self.path2category(path)
                 if category not in self.category_ids:
                     self.category_ids[category] = category_id
                     category_id += 1
                 raw = self.process_line(f.read())
                 words = self.remove_stop_words(list(
                     segmentor.segment(raw)))
                 words = self.clean_specific(words)
                 pos = list(postagger.postag(words))
                 parse_result = list(parser.parse(words, pos))
                 files.append(
                     FileItem(ids, category, words, pos, parse_result))
             except UnicodeDecodeError:
                 logging.warning(path + ' failed to decode as UTF-8; please check the file encoding')
                 continue
     segmentor.release()
     postagger.release()
     parser.release()
     return files
def tag_text(segment_text_state, segment_text_queue, tag_result_queue):
    postagger = pyltp.Postagger()  # instantiate the POS tagger
    postagger.load(model_path)  # load the POS tagging model
    while (not segment_text_queue.empty()
           ) or segment_text_state.value == 'have':
        # get() removes and returns the item at the head of the queue.
        # If the queue is empty, get() blocks the process for up to `timeout` seconds;
        # if an item becomes available within that window it continues, otherwise it raises an exception.
        try:
            segment_text_list = segment_text_queue.get(block=True, timeout=0.1)
            tag_text_list = list()
            # each segmentation worker puts all of its texts into one list; one list item is one document
            for text in segment_text_list:
                words_list = text.split("|")
                postags_list = postagger.postag(words_list)
                tag_result_list = list()
                for word, postag in zip(words_list, postags_list):
                    tag_result_list.append(word + "/" + postag)
                tag_result_text = ' '.join(tag_result_list)
                tag_text_list.append(tag_result_text)
            # put() appends an item at the tail of the queue.
            # If the queue is full, put() blocks the child process until a slot frees up.
            tag_result_queue.put(tag_text_list, block=True, timeout=None)
        except Exception:
            # queue.Empty is raised when get() times out while the queue is still empty
            pass
    return
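A wiring sketch for the worker above, minimal and assumption-laden: `model_path` must already point at `pos.model`, and segmented texts use `|` as the word separator, as the loop expects.

import multiprocessing as mp

if __name__ == "__main__":
    manager = mp.Manager()
    segment_text_state = manager.Value('u', 'have')   # 'have' while producers are still feeding the queue
    segment_text_queue = manager.Queue()
    tag_result_queue = manager.Queue()

    segment_text_queue.put(["我|爱|自然|语言|处理"])    # one list = the output of one segmentation worker
    tagger = mp.Process(target=tag_text,
                        args=(segment_text_state, segment_text_queue, tag_result_queue))
    tagger.start()
    segment_text_state.value = 'done'                  # signal: no more input once the queue drains
    tagger.join()
    while not tag_result_queue.empty():
        print(tag_result_queue.get())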
Example #5
def word_posttagger(sentence):
    pos_ = pyltp.Postagger()
    pos_.load(pos_model_path)
    result = list(pos_.postag(sentence))  # `sentence` should be a list of words; copy the result before release
    print(type(result))
    print('\t'.join(result))
    pos_.release()
    return result
Example #6
 def __init__(self):
     model_path = '/home/lnn/Documents/postag/ltp_data_v3.4.0/'
     self.seg = pyltp.Segmentor()
     self.seg.load(model_path + 'cws.model')
     self.pos = pyltp.Postagger()
     self.pos.load(model_path + 'pos.model')
     self.parser = pyltp.Parser()
     self.parser.load(model_path + 'parser.model')
Example #7
def ltpSetup():
    LTP_DATA_DIR = './ltp_data_v3.4.0/'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    segmentor = pyltp.Segmentor()
    segmentor.load(cws_model_path)
    postagger = pyltp.Postagger()
    postagger.load(pos_model_path)
    return segmentor, postagger
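A usage sketch for the pair returned by ltpSetup (the sentence is illustrative; the model files under ./ltp_data_v3.4.0/ are assumed to exist):

segmentor, postagger = ltpSetup()
sentence = "我爱自然语言处理"                  # illustrative input
words = list(segmentor.segment(sentence))      # segmentation -> list of words
postags = list(postagger.postag(words))        # POS tags aligned with the words
print(list(zip(words, postags)))
segmentor.release()                            # free the native model memory
postagger.release()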
Example #8
def load_model():
    '''
    Load the segmentation and POS tagging models.
    '''
    segmentor = pyltp.Segmentor()
    segmentor.load("./ltp_data/cws.model")
    postagger = pyltp.Postagger()
    postagger.load("./ltp_data/pos.model")
    return segmentor, postagger
Example #9
def words_mark(array):
    # POS tagging model path; the model file is named `pos.model`
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    postagger = pyltp.Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(array)  # POS tagging
    pos_array = list(postags)  # materialize the result before releasing the model
    postagger.release()  # release the model
    return pos_array
Example #10
    def __init__(self,
                 config_lib="ltp",
                 config_dict=None,
                 config_stop=None,
                 config_dir=None,
                 seg_out_list=False):
        self.input_type = str
        self.config_dir = config_dir
        if config_dir is None:
            self.config_dir = 'E:/Data/' if 'windows' in platform.architecture(
            )[1].lower() else '/users/fanzfeng/Data/'

        self.stop_config = False
        if config_stop is not None and isinstance(
                config_stop, str) and os.path.exists(config_stop):
            self.stop_config = True
            with open(config_stop, "r", encoding="utf-8") as fp:
                self.stop_words = [
                    k.strip() for k in fp.readlines() if len(k.strip()) > 0
                ]
        elif isinstance(config_stop,
                        (list, tuple, set)) and len(config_stop) > 0:
            self.stop_config = True
            self.stop_words = config_stop

        self.all_cut = False
        self.seg_out_list = seg_out_list

        self.config_lib = config_lib
        if config_lib == "jieba":
            self.jieba_ner = "nr ns nt m".split()
            if config_dict is not None and isinstance(
                    config_dict, str) and os.path.exists(config_dict):
                jieba.load_userdict(config_dict)
            self.seg = jieba.cut
            self.pos_seg = pseg.cut
        elif config_lib == "ltp":
            import pyltp
            self.segmentor = pyltp.Segmentor()
            if config_dict is not None and isinstance(
                    config_dict, str) and os.path.exists(config_dict):
                self.segmentor.load_with_lexicon(
                    os.path.join(self.config_dir, "ltp_data_v3.4.0/cws.model"),
                    config_dict)
            else:
                self.segmentor.load(
                    os.path.join(self.config_dir, "ltp_data_v3.4.0/cws.model"))
            self.seg = self.segmentor.segment
            self.postagger = pyltp.Postagger()
            self.text_splitter = pyltp.SentenceSplitter.split
            self.postagger.load(
                os.path.join(self.config_dir, "ltp_data_v3.4.0/pos.model"))
            self.recognizer = pyltp.NamedEntityRecognizer()
            self.recognizer.load(
                os.path.join(self.config_dir, "ltp_data_v3.4.0/ner.model"))
Example #11
def sentence(
    articles: List[Dict],
    project: str = os.getcwd(),
    ltp_dir=os.path.abspath(os.path.join(os.path.realpath(__file__),
                                         "../..")) + '/ltp_data'
) -> List[Dict]:
    logger = hlogger(project)
    start_time = datetime.datetime.now()
    logger.info('Starting to process sentences')

    # load the LTP models
    # segmentation model
    segmentor = pyltp.Segmentor()
    segmentor.load(os.path.join(ltp_dir, "cws.model"))

    # POS tagging model
    postagger = pyltp.Postagger()
    postagger.load(os.path.join(ltp_dir, 'pos.model'))

    # named entity recognition model
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load(os.path.join(ltp_dir, 'ner.model'))

    if_force = False
    # if a lexicon exists, force the entity tagger to load it
    if os.path.exists(project + '/lexicon'):
        logger.info('Ner will use lexicon')
        if_force = force_segmentor()
        if_force.load(project + '/lexicon')

    logger.info('Processing sentences')

    results = []
    for article in articles:
        result = extract_information(article['id'], article['content'],
                                     segmentor, postagger, recognizer,
                                     if_force)
        results.extend(result)

    length = len(results)

    end_time = datetime.datetime.now()

    logger.info(
        'Sentences have been processed successfully, and there are %s sentences'
        % length)
    logger.info('FINISHED! using time : %s\n' % get_time(
        (end_time - start_time).seconds))

    return results
Example #12
    def do_pos(self, intxt):
        words = self.do_seg(intxt)
        if self.postagger is None:
            self.postagger = pyltp.Postagger()
            if self.debug:
                load_start = default_timer()
            self.postagger.load(os.path.join(self.model_dir, 'pos.model'))
            if self.debug:
                load_use = default_timer() - load_start
                self.loger.debug("load pos.model use [ %f ] s" % load_use)

        postags = self.postagger.postag(words)

        return words, list(postags)
Example #13
    def Postagger(self, words=None, sent=None):
        if self.__postagger is None:
            self.__postagger = pyltp.Postagger()
            if self.__seg_lexicon_path is None:
                self.__postagger.load(self.__pos_model_path)
            else:
                self.__postagger.load_with_lexicon(self.__pos_model_path, self.__seg_lexicon_path)

        postags = None
        if sent is not None:
            words = self.Segmentor(sent)
            postags = self.__postagger.postag(words)
        else:
            postags = self.__postagger.postag(words)
        return postags
Example #14
    def __init__(self):
        self.path = 'ltp_data_v3.4.0/'  # download from https://ltp.ai/download.html (version 3.4.0)
        self.segmentor = pp.Segmentor()
        self.segmentor.load(self.path + "cws.model")  # load the segmentation model

        self.postagger = pp.Postagger()
        self.postagger.load(self.path + "pos.model")  # load the POS tagging model

        self.recognizer = pp.NamedEntityRecognizer()
        self.recognizer.load(self.path + "ner.model")  # load the named entity recognition model

        self.parser = pp.Parser()
        self.parser.load(self.path + "parser.model")  # load the dependency parsing model

        self.labeller = pp.SementicRoleLabeller()
        self.labeller.load(self.path + "pisrl.model")  # load the semantic role labelling model
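A hypothetical helper method (not in the original class) sketching how the loaded components, including the role labeller, can be chained; the attribute names and API calls mirror the other examples in this collection:

    def srl(self, sentence):
        # hypothetical helper: full pipeline ending in semantic role labelling
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            # role.index is the predicate position; each argument covers a word span
            args = ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in role.arguments]
            print(words[role.index], " ".join(args))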
Example #15
    def __init__(self, *args, **kwargs):
        self.__LTP_DATA_DIR = 'D:\\NLP\\ltp_data'
        self.__cws_model_path = os.path.join(self.__LTP_DATA_DIR, 'cws.model')
        self.__pos_model_path = os.path.join(self.__LTP_DATA_DIR, 'pos.model')
        self.__par_model_path = os.path.join(self.__LTP_DATA_DIR,
                                             'parser.model')

        self.segmentor = pyltp.Segmentor()
        self.segmentor.load_with_lexicon(self.__cws_model_path,
                                         './../data/word_dict.txt')
        self.postagger = pyltp.Postagger()
        self.postagger.load(self.__pos_model_path)
        self.parser = pyltp.Parser()
        self.parser.load(self.__par_model_path)

        self.tags_dict = {}
Example #16
 def __init__(self, ltp_path, dependency=False):
     self.dependency = dependency
     cws_model_path = os.path.join(ltp_path, 'cws.model')
     pos_model_path = os.path.join(ltp_path, 'pos.model')
     ner_model_path = os.path.join(ltp_path, 'ner.model')
     dp_model_path = os.path.join(ltp_path, 'parser.model')
     self.seg = pyltp.Segmentor()
     self.pos = pyltp.Postagger()
     self.ner = pyltp.NamedEntityRecognizer()
     # self.srl = pyltp.SementicRoleLabeller()
     self.seg.load(cws_model_path)
     self.pos.load(pos_model_path)
     self.ner.load(ner_model_path)
     # self.srl.load(srl_model_path)
     if dependency:
         self.dp = pyltp.Parser()
         self.dp.load(dp_model_path)
Example #17
    def __init__(self, seg_model_path = 'ltp_data_v3/ltp_data_v3.4.0/cws.model', seg_lexicon_path = 'lexicon/lexicon_test',
                 pos_model_path = 'ltp_data_v3/ltp_data_v3.4.0/pos.model', rec_model_path = 'ltp_data_v3/ltp_data_v3.4.0/ner.model',
                 par_model_path = 'ltp_data_v3/ltp_data_v3.4.0/parser.model'):
        self.seg_lexicon_path = seg_lexicon_path
        self.segmentor = pyltp.Segmentor()
        self.seg_model_path = seg_model_path
        self.segmentor.load_with_lexicon(self.seg_model_path,self.seg_lexicon_path)

        self.postagger = pyltp.Postagger()
        self.pos_model_path = pos_model_path
        self.postagger.load(self.pos_model_path)

        self.recognizer = pyltp.NamedEntityRecognizer()
        self.rec_model_path = rec_model_path
        self.recognizer.load(rec_model_path)

        self.parser = pyltp.Parser()
        self.par_model_path = par_model_path
        self.parser.load(self.par_model_path)
Example #18
    def ltp_init(self):
        import pyltp
        LTP_DATA_DIR = '/nas/data/m1/panx2/lib/ltp/ltp_data_v3.4.0'
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

        self.model_ltp_splitter = pyltp.SentenceSplitter()
        self.model_ltp_segmentor = pyltp.Segmentor()
        self.model_ltp_segmentor.load(cws_model_path)
        self.model_ltp_postagger = pyltp.Postagger()
        self.model_ltp_postagger.load(pos_model_path)
        self.model_ltp_recognizer = pyltp.NamedEntityRecognizer()
        self.model_ltp_recognizer.load(ner_model_path)
        self.model_ltp_dparser = pyltp.Parser()
        self.model_ltp_dparser.load(par_model_path)

        self.parse = self._parse
        self.sent_seger = self.ltp_sent_seger
        self.tokenizer = self.ltp_tokenizer
        self.processor = self.ltp_processor
Example #19
    def _model_initialize(self):
        if self.__segmentor is None:
            self.__segmentor = pyltp.Segmentor()
            if self.__seg_lexicon_path is None:
                self.__segmentor.load(self.__seg_model_path)
            else:
                self.__segmentor.load_with_lexicon(self.__seg_model_path,
                                                   self.__seg_lexicon_path)

        if self.__postagger is None:
            self.__postagger = pyltp.Postagger()
            if self.__seg_lexicon_path is None:
                self.__postagger.load(self.__pos_model_path)
            else:
                self.__postagger.load_with_lexicon(self.__pos_model_path,
                                                   self.__seg_lexicon_path)

        if self.__recognizer is None:
            self.__recognizer = pyltp.NamedEntityRecognizer()
            self.__recognizer.load(self.__rec_model_path)

        if self.__parser is None:
            self.__parser = pyltp.Parser()
            self.__parser.load(self.__par_model_path)
Example #20
def ltp_process(sentence):
    stop_words = get_stops_words()  # load stop words, used to drop stop-word predicates (such as "是") from the SBV candidates

    # word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./cws.model")
    words = segmentor.segment(sentence)
    print("\t".join(words))
    segmentor.release()

    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./pos.model")
    postags = postagger.postag(words)
    # a list-of-strings parameter is supported since 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    print("\t".join(postags))
    postagger.release()

    # dependency parsing
    parser = pyltp.Parser()
    parser.load("./parser.model")
    arcs = parser.parse(words, postags)
    parser.release()

    # semantic role labelling; not actually used yet.
    # Placed up here deliberately: after confirming that the child of the current SBV relates to "说" (say),
    # that word has to be extracted; in short, a word that is SBV and also maps to A0 must be the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)

    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()

    Index_of_Subjet = 0

    for arc in arcs:
        #SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        # Relying on SBV alone may not be rigorous enough, although once an SBV
        # is found the dependent is necessarily A0.
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:

            SBV_set.append(words[arc.head - 1])  # arc.head is 1-based; store the predicate verb the SBV arc points to
            Subject_label_set.append(
                words[Index_of_Subjet])  # with an SBV arc, the word at this position must be the subject
            Word_of_speech_content.append(
                words[arc.head:])  # the slice after the predicate, i.e. roughly the reported content

            Index_of_Subjet += 1

        else:
            Index_of_Subjet += 1
            continue  # if the lists stay empty, the sentence is not worth analysing
    '''
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./ner.model")
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))
    
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)

    for role in roles:
        print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''

    return SBV_set, Subject_label_set, Word_of_speech_content  # returns three lists: the SBV head words (predicates), the corresponding subjects, and the reported content; note that any of them may be []
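A call sketch for the function above (the sentence is illustrative; the model files in the working directory and get_stops_words are assumed to be available):

sentence = "李克强总理表示,中国经济长期向好的基本面没有改变。"   # illustrative input
sbv, subjects, speech = ltp_process(sentence)
# sbv: predicates that head an SBV arc; subjects: the matching subject words;
# speech: the word slices after each predicate. All three may be empty.
print(sbv, subjects, speech)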
Example #21
import pandas as pd
import literature
import pyltp
import pickle
import os
from trigger_dict import TriggerDict
from math import inf

MIN_SENTENCE_NUM = 140
STOP_WORD_PATH = './相关词表/停用词词表.txt'
LTP_SEGMENT_MODE = './LTP_model/cws.model'
LTP_POS_MODE = './LTP_model/pos.model'
LTP_PARSE_MODE = './LTP_model/parser.model'
SEGMENTOR = pyltp.Segmentor()
POSTARGGER = pyltp.Postagger()
PARSER = pyltp.Parser()
with open('./相关词表/线索词词表.txt', 'r', encoding='utf-8') as f:
    CLUE_WORDS = f.read().splitlines()


def load_model():
    """ 加载LTP包的分词、词性标注、句法分析模型 """
    SEGMENTOR.load(LTP_SEGMENT_MODE)
    POSTARGGER.load(LTP_POS_MODE)
    PARSER.load(LTP_PARSE_MODE)


def release_model():
    """ Release the LTP segmentation, POS tagging, and dependency parsing models """
    SEGMENTOR.release()
    POSTARGGER.release()
    PARSER.release()
Example #22
#encoding:utf-8
from deepdive import *
from transform import *
import pyltp
import numpy as np
import os
import sys
# load the LTP models
LTP_DIR = "/root/transaction/udf/model/ltp_data_v3.4.0"

# segmentation model
segmentor = pyltp.Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))

# POS tagging model
postagger = pyltp.Postagger()
postagger.load(os.path.join(LTP_DIR, 'pos.model'))

# named entity recognition model
recognizer = pyltp.NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DIR, 'ner.model'))

# dependency parser
parser = pyltp.Parser()
parser.load(os.path.join(LTP_DIR, 'parser.model'))


@tsv_extractor
@returns(lambda doc_id="text", sentence_index="int", sentence_text="text",
         tokens="text[]", lemmas="text[]", pos_tags="text[]", ner_tags="text[]",
         dep_types="text[]", dep_tokens="int[]": [])
 def __init__(self):
     self.tagger = pyltp.Postagger()
     self.parser = pyltp.Parser()
     self.tagger.load(path_to_tagger)
     self.parser.load(path_to_parser)
Example #24
def pos_words(words):
    postagger = pyltp.Postagger()
    postagger.load(ltp_path + 'pos.model')
    postags_lst = [pos for pos in postagger.postag(words)]
    postagger.release()
    return postags_lst
Example #25
def make_instances(samples,
                   char_voc,
                   word_voc,
                   sentiment_words_path,
                   question2targets,
                   need_augment,
                   is_training,
                   use_extra_feature,
                   ner_dict_path,
                   pos_dict_path,
                   dtype=np.int32):
    # TODO: build sentiment words for own data
    positive_words, negative_words = load_sentiment_words(sentiment_words_path)

    if need_augment:
        samples = instance_augment(samples, question2targets)

    questions = unique_list([sample['question'] for sample in samples])
    question2skeleton = gather_skeleton_indicator(questions)

    if use_extra_feature:
        with open(ner_dict_path, 'rb') as infile:
            ner_dict = pickle.load(infile)
        with open(pos_dict_path, 'rb') as infile:
            pos_dict = pickle.load(infile)

        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        segmentor = pyltp.Segmentor()
        segmentor.load(cws_model_path)
        postagger = pyltp.Postagger()
        postagger.load(pos_model_path)
        recognizer = pyltp.NamedEntityRecognizer()
        recognizer.load(ner_model_path)

    assert set(positive_words) & set(negative_words) == set()

    sentiment_words = positive_words + negative_words
    sentiment_words.sort(key=lambda sen_word: len(sen_word), reverse=True)
    sentiment_words_dic = {}
    for sen_word in sentiment_words:
        if sen_word in positive_words:
            sentiment_words_dic[sen_word] = 1
        else:
            sentiment_words_dic[sen_word] = 0

    for sample in samples:

        assert isinstance(sample, dict)

        que_ww2v_index_sequence = []
        ans_ww2v_index_sequence = []
        ans = sample["answer"]
        que = sample["question"]

        for word in jieba.lcut(que):
            for _ in range(len(word)):
                que_ww2v_index_sequence.append(word_voc[word])
        for word in jieba.lcut(ans):
            for _ in range(len(word)):
                ans_ww2v_index_sequence.append(word_voc[word])
        que_ww2v_index_sequence = np.array(que_ww2v_index_sequence,
                                           dtype=dtype)
        ans_ww2v_index_sequence = np.array(ans_ww2v_index_sequence,
                                           dtype=dtype)

        def make_extra_index_sequence(str_):
            pos_end = []
            ner_end = []
            words = segmentor.segment(str_)
            postages = postagger.postag(words)
            netags = recognizer.recognize(words, postages)

            for word_index in range(len(list(words))):
                word = list(words)[word_index]
                for _ in range(len(word)):
                    pos_end.append(pos_dict[list(postages)[word_index]])
                    ner_temp = list(netags)[word_index]
                    if '-' in ner_temp:
                        ner_temp = ner_temp[ner_temp.index('-') + 1:]
                    ner_end.append(ner_dict[ner_temp])
            return np.array(ner_end, dtype=dtype), np.array(pos_end,
                                                            dtype=dtype)

        if use_extra_feature:
            que_ner_index_sequence, que_pos_index_sequence = make_extra_index_sequence(
                que)
            ans_ner_index_sequence, ans_pos_index_sequence = make_extra_index_sequence(
                ans)
        else:
            que_ner_index_sequence = np.array([0] * len(que), dtype=dtype)
            que_pos_index_sequence = np.array([0] * len(que), dtype=dtype)
            ans_ner_index_sequence = np.array([0] * len(ans), dtype=dtype)
            ans_pos_index_sequence = np.array([0] * len(ans), dtype=dtype)

        que_cw2v_index_sequence = [
            char_voc[char] for char in sample['question']
        ]
        que_cw2v_index_sequence = np.array(que_cw2v_index_sequence,
                                           dtype=dtype)
        ans_cw2v_index_sequence = [char_voc[char] for char in sample['answer']]
        ans_cw2v_index_sequence = np.array(ans_cw2v_index_sequence,
                                           dtype=dtype)
        que_skeleton_label = question2skeleton[que]

        assert len(que_cw2v_index_sequence) == len(que_ww2v_index_sequence)
        assert len(ans_cw2v_index_sequence) == len(ans_ww2v_index_sequence)

        if use_extra_feature:
            assert len(que_ner_index_sequence) == len(que_pos_index_sequence)
            assert len(ans_ner_index_sequence) == len(ans_pos_index_sequence)
            assert len(que_cw2v_index_sequence) == len(que_ner_index_sequence)
            assert len(ans_cw2v_index_sequence) == len(ans_ner_index_sequence)

        if len(que_cw2v_index_sequence) != len(que_skeleton_label):
            print(que)
            print(len(que_cw2v_index_sequence))
            print(len(que_skeleton_label))

        def make_sentiment_polarity_labels(str_):
            # 0:neutral, 1: positive 2: negative
            ans_temp = str_
            sentiment_polarity_labels = np.array([0] * len(str_), dtype=dtype)
            for sen_word in sentiment_words:
                if sen_word in ans_temp and sen_word in str_:
                    if sentiment_words_dic[sen_word] == 1:
                        sentiment_polarity_labels[str_.index(sen_word):str_.
                                                  index(sen_word) +
                                                  len(sen_word)] = np.ones(
                                                      len(sen_word))
                        ans_temp = ans_temp[:ans_temp.index(
                            sen_word)] + ans_temp[ans_temp.index(sen_word) +
                                                  len(sen_word):]
                    else:
                        sentiment_polarity_labels[str_.index(sen_word):str_.
                                                  index(sen_word) +
                                                  len(sen_word)] = np.array(
                                                      [2] * len(sen_word),
                                                      dtype=dtype)
                        ans_temp = ans_temp[:ans_temp.index(
                            sen_word)] + ans_temp[ans_temp.index(sen_word) +
                                                  len(sen_word):]
            return sentiment_polarity_labels

        def make_indicate_target_labels():
            # 0: not 1: is
            specify_target = str(sample["target"])
            indicate_target_labels = np.array([0] * len(que), dtype=dtype)
            if specify_target in que:
                indicate_target_labels[que.index(specify_target): que.index(specify_target) + len(specify_target)] = \
                    np.ones(len(specify_target), dtype=dtype)
            return indicate_target_labels

        ans_sentiment_polarity_labels = make_sentiment_polarity_labels(ans)
        que_sentiment_polarity_labels = make_sentiment_polarity_labels(que)
        indicate_target_labels = make_indicate_target_labels()

        sample.update({
            'que_ww2v_index_sequence':
            que_ww2v_index_sequence,
            'ans_ww2v_index_sequence':
            ans_ww2v_index_sequence,
            'que_cw2v_index_sequence':
            que_cw2v_index_sequence,
            'ans_cw2v_index_sequence':
            ans_cw2v_index_sequence,
            'ans_sentiment_polarity_labels':
            ans_sentiment_polarity_labels,
            'que_sentiment_polarity_labels':
            que_sentiment_polarity_labels,
            'que_indicate_target_labels':
            indicate_target_labels,
            'ans_indicate_target_labels':
            np.array([0] * len(ans), dtype=dtype),
            'que_skeleton_label':
            que_skeleton_label,
            'que_ner_index_sequence':
            que_ner_index_sequence,
            'que_pos_index_sequence':
            que_pos_index_sequence,
            'ans_ner_index_sequence':
            ans_ner_index_sequence,
            'ans_pos_index_sequence':
            ans_pos_index_sequence
        })
        # 'question_id': sen_voc[sample["question"]],
        # 'answer_id': sen_voc[sample["answer"]]

    return samples
Example #26
def ltp_process(sentence, old_SI={}):

    stop_words = get_stops_words()  # load stop words, used to drop stop-word predicates (such as "是") from the SBV candidates

    # word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./model/cws.model")
    words = segmentor.segment(sentence)
    segmentor.release()
    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./model/pos.model")
    postags = postagger.postag(words)
    # a list-of-strings parameter is supported since 0.1.5
    postagger.release()
    parser = pyltp.Parser()
    parser.load("./model/parser.model")
    arcs = parser.parse(words, postags)
    parser.release()
    noun_tags = ['nh','nd','n','ni','nl','ns','nt','nz']
    # nh:person name, nd:direction noun, n:general noun, ni:organization name, nl:location noun
    # ns: geographical name, nt:temporal noun, nz:other proper noun
    SI_words = {}  # word -> index
    for tag in noun_tags:
        # find the positions of words whose POS tag matches this noun tag
        SI_index = np.argwhere(np.array(postags)==tag).reshape(-1).tolist()
        # SI_index stores the positions of the words carrying this tag
        for j in SI_index:
            # record the word and its position (e.g. 蒋丽芸)
            SI_words[words[j]] = j

    #print(SI_words)

    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()

    Index_of_Subjet = 0
    SI_postags = {}
    si_SBV_postag = []
    si_VOB_postag = []
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        # Relying on SBV alone may not be rigorous enough, although once an SBV is found the dependent is necessarily A0.
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            #print(arc.head, words[arc.head -1])
            SBV_set.append(words[arc.head - 1])  # arc.head is 1-based; store the predicate verb the SBV arc points to
            # check the subject
            if words[Index_of_Subjet] in ['他','他们','你','你们','我','我们', '她','她们']:
                # coreference resolution:
                # if old_SI contains the same role, replace the pronoun with the highest-scoring entity.
                # A correction is still needed: noun phrases such as 习近平+总书记 should be one word,
                # or 习近平 should be given the same weight as 总书记.
                if old_SI:
                    # pick the entity with the highest focus score (assuming old_SI.params maps entity -> score)
                    words[Index_of_Subjet] = max(old_SI.params, key=old_SI.params.get)

                Subject_label_set.append(words[Index_of_Subjet])

            else:
                Subject_label_set.append(words[Index_of_Subjet])  # if it is not a pronoun, the word at this position must be the subject
                #SI_postag[words[Index_of_Subjet].split(':')[1]] = Index_of_Subjet
                if postags[arc.head -1] == 'v':
                    si_SBV_postag.append((words[Index_of_Subjet], Index_of_Subjet))

            Word_of_speech_content.append(intro_speech(''.join(words[arc.head:])))  # extract the reported speech content
            #print(intro_speech(''.join(words[arc.head:])))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_SBV_postag



        elif arc.relation == 'VOB' and words[arc.head -1] not in stop_words:
            # check the object
            if words[Index_of_Subjet] in ['他','他们','你','你们','我','我们', '她','她们']:
                # coreference resolution:
                # would use the object position and highest-scoring element of the previous sentence
                pass

            else:
                Subject_label_set.append(words[Index_of_Subjet])  # if it is not a pronoun, the word at this position must be the subject
                si_VOB_postag.append((words[Index_of_Subjet], Index_of_Subjet))

            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_VOB_postag

        else:
            Index_of_Subjet += 1
            continue  # if the lists stay empty, the sentence is not worth analysing

    Forcus_point = Si(SI_words, SI_postags,old_SI) # focus set
    # self.params needs to be updated
    Forcus_point.score()
    return SBV_set, Subject_label_set, Word_of_speech_content, Forcus_point  # returns the SBV head words (predicates), the corresponding subjects, the reported content, and the focus set; the lists may be empty
Example #27
def load_model():
    segmentor = pyltp.Segmentor()
    segmentor.load("./ltp_data/cws.model")
    postagger = pyltp.Postagger()
    postagger.load("./ltp_data/pos.model")
    return segmentor, postagger
Example #28
def postag(words):
    global postagger_
    if postagger_ is None:
        postagger_ = pyltp.Postagger()
        postagger_.load(ltp_models['pos'])
    return postagger_.postag(words)
Example #29
def ltp_process(sentence, old_SI={}):

    stop_words = get_stops_words()  # load stop words, used to drop stop-word predicates (such as "是") from the SBV candidates

    # word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./model/cws.model")
    words = segmentor.segment(sentence)
    #print("\t".join(words))
    segmentor.release()

    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./model/pos.model")
    postags = postagger.postag(words)
    # a list-of-strings parameter is supported since 0.1.5
    #print("\t".join(postags))
    postagger.release()

    # dependency parsing
    parser = pyltp.Parser()
    parser.load("./model/parser.model")
    arcs = parser.parse(words, postags)
    parser.release()

    # Placed up here deliberately: after confirming that the child of the current SBV relates to "说" (say),
    # that word has to be extracted; in short, a word that is SBV and also maps to A0 must be the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./model/pisrl.model")
    roles = labeller.label(words, postags, arcs)

    #print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) # dependency parse output

    noun_tags = ['nh', 'nd', 'n', 'ni', 'nl', 'ns', 'nt', 'nz']
    SI_words = {}  # word -> index
    for tag in noun_tags:
        SI_index = np.argwhere(np.array(postags) == tag).reshape(-1).tolist()
        for j in SI_index:
            SI_words[words[j]] = j

    #print(SI_words)

    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()

    Index_of_Subjet = 0
    SI_postags = {}
    si_SBV_postag = []
    si_VOB_postag = []
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        # Relying on SBV alone may not be rigorous enough, although once an SBV
        # is found the dependent is necessarily A0.
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            #print(arc.head, words[arc.head -1])
            SBV_set.append(words[arc.head - 1])  # arc.head is 1-based; store the predicate verb the SBV arc points to
            # check the subject
            if words[Index_of_Subjet] in [
                    '他', '他们', '你', '你们', '我', '我们', '她', '她们'
            ]:
                # coreference resolution:
                # if old_SI contains the same role, replace the pronoun with the highest-scoring entity.
                # A correction is still needed: noun phrases such as 习近平+总书记 should be one word,
                # or 习近平 should be given the same weight as 总书记.
                if old_SI:
                    # pick the entity with the highest focus score (assuming old_SI.params maps entity -> score)
                    words[Index_of_Subjet] = max(old_SI.params, key=old_SI.params.get)

                Subject_label_set.append(words[Index_of_Subjet])

            else:
                Subject_label_set.append(
                    words[Index_of_Subjet])  # if it is not a pronoun, the word at this position must be the subject
                #SI_postag[words[Index_of_Subjet].split(':')[1]] = Index_of_Subjet
                if postags[arc.head - 1] == 'v':
                    si_SBV_postag.append(
                        (words[Index_of_Subjet], Index_of_Subjet))

            Word_of_speech_content.append(
                intro_speech(''.join(words[arc.head:])))  # extract the reported speech content
            #print(intro_speech(''.join(words[arc.head:])))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_SBV_postag

        elif arc.relation == 'VOB' and words[arc.head - 1] not in stop_words:
            # check the object
            if words[Index_of_Subjet] in [
                    '他', '他们', '你', '你们', '我', '我们', '她', '她们'
            ]:
                # coreference resolution:
                # would use the object position and highest-scoring element of the previous sentence
                pass

            else:
                Subject_label_set.append(
                    words[Index_of_Subjet])  # if it is not a pronoun, the word at this position must be the subject

                si_VOB_postag.append((words[Index_of_Subjet], Index_of_Subjet))

            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_VOB_postag

        else:
            Index_of_Subjet += 1
            continue  # if the lists stay empty, the sentence is not worth analysing

    Forcus_point = Si(SI_words, SI_postags, old_SI)  # focus set
    # self.params needs to be updated
    Forcus_point.score()

    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./model/ner.model")
    netags = recognizer.recognize(words, postags)
    #print("\t".join(netags))
    '''
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)

    for role in roles:
        print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''
    return SBV_set, Subject_label_set, Word_of_speech_content, Forcus_point  # returns the SBV head words (predicates), the corresponding subjects, the reported content, and the focus set; the lists may be empty