Example #1
def ner(text):
    """
    """
    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(text)  # word segmentation
    # print ('\t'.join(words))
    segmentor.release()  # release the model

    postagger = Postagger()  # initialize the instance
    postagger.load(pos_model_path)  # load the model

    # words = ['元芳', '你', '怎么', '看']  # segmentation result
    postags = postagger.postag(words)  # POS tagging
    print("##" * 30)
    # print ('\t'.join(postags))
    postagger.release()  # release the model

    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load(ner_model_path)  # load the model
    # words = ['元芳', '你', '怎么', '看']
    # postags = ['nh', 'r', 'r', 'v']
    netags = recognizer.recognize(words, postags)  # named entity recognition
    recognizer.release()  # release the model
    words_list = []
    one = []  # buffer for the current multi-word entity (B-/I-/E- span)
    for word, flag in zip(words, netags):
        # print(word,flag)
        if flag.startswith("B-"):
            one = [word]
        elif flag.startswith("I-"):
            one.append(word)
        elif flag.startswith("E-"):
            one.append(word)
            words_list.append("".join(one))
        elif flag.startswith("S-"):
            words_list.append(word)
    # print(words_list)
    # return words_list,words, postags,netags
    return words_list
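A minimal usage sketch for the ner() helper above, assuming cws_model_path, pos_model_path, and ner_model_path are module-level variables pointing at an LTP 3.4 data directory (the directory path below is illustrative, not from the original example):

import os
from pyltp import Segmentor, Postagger, NamedEntityRecognizer

LTP_DATA_DIR = "/path/to/ltp_data_v3.4.0"  # hypothetical model directory
cws_model_path = os.path.join(LTP_DATA_DIR, "cws.model")
pos_model_path = os.path.join(LTP_DATA_DIR, "pos.model")
ner_model_path = os.path.join(LTP_DATA_DIR, "ner.model")

# B-/I-/E- tagged tokens are merged into one entity string; S- tags become
# single-word entities.
print(ner("中国进出口银行与中国银行加强合作。"))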
Example #2
    def word_vec_case_set(cls, word_model_file, with_name=False):
        """
        获取词向量特征集,认为词条最多10个词
        每个词条被表示为50*10的二维列表
        :param word_model_file: 词向量模型文件
        :param with_name: 正样例是否包含人名
        :return: 一个字典{pos_case:{正例},neg:{负例}}
        """
        segmentor = Segmentor()
        segmentor.load("../word2vec_process/model/cws.model")
        word_vec_model = word2vec.Word2Vec.load('../word2vec_process/model/' +
                                                word_model_file)
        case_dict = cls.load_case_set(with_name)
        word_vec_case_dict = {}

        # Build the entry representation by concatenating word vectors (500 dimensions in total)
        pos_case_list = case_dict['pos_case']
        pos_case_vec_dict = {}
        for pos_case in pos_case_list:
            case_words = segmentor.segment(pos_case)
            case_vec = []
            is_useful = 0
            for word in case_words:
                try:
                    # concatenate this word's vector
                    case_vec.append(word_vec_model[unicode(word)].tolist())
                    is_useful = 1
                except Exception:
                    with open("./data/not_in_vocabulary.txt", 'a') as out_file:
                        # record out-of-vocabulary words
                        out_file.write(word + '\n')
                    case_vec.append([0] * 50)
            # truncate if longer than 10 words, pad with zero vectors otherwise
            if len(case_vec) > 10:
                case_vec = case_vec[0:10]
            else:
                while len(case_vec) < 10:
                    case_vec.append([0] * 50)
            if is_useful:
                pos_case_vec_dict[pos_case] = case_vec
Example #3
    def __init__(self, cws_model_path, pos_model_path, ner_model_path,
                 parser_model_path, **args):
        Tokenizer_Base.__init__(self, **args)
        from pyltp import Segmentor, Postagger, NamedEntityRecognizer, Parser
        self.seg_ins = Segmentor()
        self.seg_ins.load(cws_model_path)
        self.pos_ins = Postagger()
        self.pos_ins.load(pos_model_path)
        if parser_model_path is not None and os.path.exists(parser_model_path):
            self.parser_ins = Parser()
            self.parser_ins.load(parser_model_path)
        else:
            self.parser_ins = None
        self.ner_ins = []

        for path in sorted(glob.glob(ner_model_path)):
            try:
                if os.path.getsize(path) > 1024:
                    self.ner_ins.append(NamedEntityRecognizer())
                    self.ner_ins[-1].load(path)
            except Exception as err:
                print(err)
Example #4
 def __init__(self, ltp_path, user_path):
     cws_model_path = os.path.join(ltp_path,
                                   'cws.model')  # segmentation model path; the file is named `cws.model`
     user_model_path = os.path.join(user_path, 'userdict.txt')  # user-defined dictionary
     pos_model_path = os.path.join(
         ltp_path, 'pos.model')  # POS tagging model path; the file is named `pos.model`
     sym_dict_path = os.path.join(user_path, 'reladict.txt')
     self.segmentor = Segmentor()  # initialize the instance
     self.segmentor.load_with_lexicon(cws_model_path,
                                      user_model_path)  # load the model with the user lexicon
     self.postagger = Postagger()  # initialize the instance
     self.postagger.load_with_lexicon(pos_model_path,
                                      user_model_path)  # load the model with the user lexicon
     # load the synonym lexicon
     self.list1 = []
     with open(sym_dict_path, mode='r', encoding='UTF-8') as f:
         for line in f.readlines():
             rela_array = line.strip("\n").split(",")
             tmplist = list(rela_array)
             self.list1.append(tmplist)
Example #5
    def __init__(self, data_dir):
        self.LTP_DATA_DIR = data_dir
        # word segmentation model
        cws_model = os.path.join(self.LTP_DATA_DIR, 'cws.model')
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model)
        # self.segmentor.load_with_lexicon(cws_model, DEFAULT_SYNONYMS_PATH)

        # POS tagging model
        pos_model = os.path.join(self.LTP_DATA_DIR, 'pos.model')
        self.postagger = Postagger()
        self.postagger.load(pos_model)

        # named entity recognition model
        ner_model = os.path.join(self.LTP_DATA_DIR, 'ner.model')
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model)

        # dependency parsing model
        parse_model = os.path.join(self.LTP_DATA_DIR, 'parser.model')
        self.parser = Parser()
        self.parser.load(parse_model)
Example #6
def main():
    cws_model_path = os.path.join(LTP_DATA_DIR,
                                  "cws.model")  # segmentation model path; the file is named `cws.model`
    segmentor = Segmentor()  # initialize the instance
    segmentor.load(cws_model_path)  # load the model

    train_dict = json.loads(
        open(os.path.join(DATA_SET_DIR, train_file_path), "r").readline())
    test_dict = json.loads(
        open(os.path.join(DATA_SET_DIR, test_file_path), "r").readline())
    contents = []

    for value in train_dict.values():
        contents.append(" ".join(
            [word for word in segmentor.segment(value["content"])]))

    for value in test_dict.values():
        contents.append(" ".join(
            [word for word in segmentor.segment(value["content"])]))

    contents_file = open("contents.txt", "w", encoding='utf-8')
    contents_file.write("\n".join(contents))
    contents_file.close()
    segmentor.release()
Example #7
    def __init__(self):
        LTP_DIR = "/home/python/ltp/ltp_data_v3.4.0"

        # word segmentation model, single file
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        # POS tagging model, single file
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        # dependency parsing model, single file
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        # named entity recognition model, single file
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        # semantic role labeling model, multiple files
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
Example #8
    def __init__(self,
                 in_file_path,
                 out_file_path,
                 model_path,
                 clean_output=False):
        self.logger = logging.getLogger("TripleIE")

        self.in_file_path = in_file_path
        self.out_file_path = out_file_path
        self.model_path = model_path
        self.clean_output = clean_output  # whether the output includes hints

        self.out_handle = None

        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(self.model_path, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.model_path, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(self.model_path, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.model_path, "ner.model"))
Example #9
    def segmentor(self):
        segmentor = Segmentor()  # initialize the instance
        # ldir = '/Users/ceil/PycharmProjects/pyltp/ltp/ltp_data_model/cws.model'
        # dicdir = 'C:\\Users\\K\\ltp_data_v3.4.0\\word.txt'  # custom dictionary

        # segmentor.load('/Users/ceil/PycharmProjects/personal/pyltp/ltp/ltp_data_model/cws.model')  # load the model
        segmentor.load('/home/student/project-01/ltp_data/cws.model')  # server path

        # segmentor.load_with_lexicon(ldir) # , dicdir)
        words = segmentor.segment(self.sentence)  # word segmentation
        # by default the result can be printed directly:
        # print '\t'.join(words)
        # or converted to a list:
        words_list = list(words)
        print('\n')
        print('Segmentation result:')
        for word in words_list:
            print(word, end=' ')

        segmentor.release()  # release the model
        self.words = words_list
        return self.words
Example #10
def parser(sent):
    from pyltp import Segmentor
    segmentor = Segmentor()  # initialize the instance
    segmentor.load('../../data/ltp_data/cws.model')  # load the model
    words = segmentor.segment(sent)  # word segmentation
    segmentor.release()
    from pyltp import Postagger
    postagger = Postagger()  # initialize the instance
    postagger.load('../../data/ltp_data/pos.model')  # load the model
    postags = postagger.postag(words)  # POS tagging
    postagger.release()
    from pyltp import NamedEntityRecognizer
    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load('../../data/ltp_data/ner.model')  # load the model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    recognizer.release()
    from pyltp import Parser
    parser = Parser()  # initialize the instance
    parser.load('../../data/ltp_data/parser.model')  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()
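A quick call sketch for the parser() helper above; the relative ../../data/ltp_data model paths are assumed to exist:

# Prints one "head:relation" pair per token; the token whose head is 0 with
# relation HED is the root of the dependency tree.
parser('元芳你怎么看')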
Example #11
def get_wordsList(content_after_DS, stopwords_flag=False):
    stopwords = []
    if stopwords_flag:
        stopwords = load_stopwords()

    wordSegmentor_ltp = Segmentor()
    cws_model_path = config.LTP_DATA_DIR + r'\cws.model'
    wordSegmentor_ltp.load(cws_model_path)

    words_list = []
    for text in content_after_DS.split():
        words_segment_list = list(wordSegmentor_ltp.segment(text))
        if stopwords_flag:
            words_segment_list = [
                word for word in words_segment_list if word not in stopwords
            ]
        words_list.extend(words_segment_list)

    wordSegmentor_ltp.release()
    words_list_str = ' '.join(words_list)

    return words_list_str
Example #12
def run():
    # word segmentation + removal of blank lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('key/pinglun_resource.txt','r',encoding='utf-8')
    # cont = open('key/text.txt','r',encoding='utf-8')
    f = open('key/cut_resouce_new.txt','w',encoding='utf-8')
    segmentor = Segmentor()  # initialize the instance
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('cws.model', 'userdict.txt')  # load the model with a user dictionary
    postagger = Postagger()  # initialize the instance
    postagger.load('pos.model')  # load the model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # word segmentation
            postags = postagger.postag(words)  # POS tagging
            for word, tag in zip(words, postags):
                if tag != 'wp':  # 'wp' is the punctuation tag
                    f.write(word + ' ')
                else:
                    f.write('\n')
            f.write('\n')
        else:
            continue
    f.close()
    cont.close()
    segmentor.release()
    postagger.release()
Example #13
    def ltp_module():
        LTP_DATA_DIR = 'ltp_data_v3.4.0/'
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
        srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')

        segmentor = Segmentor()
        postagger = Postagger()
        recognizer = NamedEntityRecognizer()
        parser = Parser()
        # labeller = SementicRoleLabeller()

        segmentor.load(cws_model_path)
        postagger.load(pos_model_path)
        recognizer.load(ner_model_path)
        parser.load(par_model_path)
        # labeller.load(srl_model_path)

        words = segmentor.segment('格力电器美的造')
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        arcs = parser.parse(words, postags)
        # roles = labeller.label(words, postags, arcs)
        words_list = list(words)
        postags_list = list(postags)
        segmentor.release()
        postagger.release()
        recognizer.release()
        parser.release()
        # labeller.release()

        for w in words_list:
            print(w)
        for p in postags_list:
            print(p)
        print('\t'.join(netags))
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
Example #14
    def init(self, base_dir, is_custom_seg_dict):
        segmentor_model = os.path.join(base_dir, 'cws.model')
        tagger_model = os.path.join(base_dir, 'pos.model')
        ner_model = os.path.join(base_dir, 'ner.model')
        parser_model = os.path.join(base_dir, 'parser.model')
        custom_seg_dict = os.path.join(dict_dir,
                                       'vertical_domain_baike_dict.txt')

        self.segmentor = Segmentor()
        if is_custom_seg_dict:
            self.segmentor.load_with_lexicon(segmentor_model, custom_seg_dict)
        else:
            self.segmentor.load(segmentor_model)

        self.tagger = Postagger()
        self.tagger.load(tagger_model)

        self.nertagger = NamedEntityRecognizer()
        self.nertagger.load(ner_model)

        self.parser = Parser()
        self.parser.load(parser_model)
Example #15
def get_name(line):
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # path to the LTP model directory

    # word segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load the model
    words = segmentor.segment(line)  # segment

    # POS tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load the model
    postags = postagger.postag(words)
    # postags = postagger.postag(['中国', '进出口', '银行', '与', '中国银行', '加强', '合作', '。'])
    res = []
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize the instance
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    netags = recognizer.recognize(words, postags)
    for i, data in enumerate(list(netags)):
        if data[2:] == "Nh":
            res.append(words[i])
    return list(set(res))
Example #16
    def __init__(self):
        #self.tfidf = joblib.load('model/tfidf.model')
        self.law = load_model('CNN_base_best_law.h5', custom_objects={'f1': f1})
        self.accu = load_model('CNN_base_best_accusation.h5', custom_objects={'f1': f1})
        self.time = load_model('CNN_base_best_time.h5', custom_objects={'f1': f1})
        self.batch_size = 128
        self.max_sequence_length = 175

        segmentor = Segmentor()  # initialize the segmenter (splits words)
        segmentor.load('/home/wshong/PycharmProjects/CAIL2018/text_pre_process/cws.model')  # load the segmentation model
        self.cut = segmentor.segment

        self.dict_path = '/home/wshong/PycharmProjects/CAIL2018/text_pre_process/word_dict.txt'
        self.word_dict=get_word_dict(self.dict_path)
        self.path = '/home/wshong/PycharmProjects/CAIL2018/text_pre_process/stopwords.txt'
        self.stopwords = []
        with open(self.path, 'r', encoding='utf-8') as fi:
            for line in fi.readlines():
                word = line.strip('\n')
                self.stopwords.append(word)
        self.pattern = re.compile(
            '([0-9]{4}年)([0-9][0-9]?月)?([0-9][0-9]?日)?(凌晨|上午|中午|下午|晚上|傍晚|晚|早上)?([0-9][0-9]?时)?([0-5]?[0-9]分)?(许|左右)?')
Example #17
 def __init__(
         self,
         data,
         stop_words_file='stop_words.txt',
         theta=0.5,
         # LTP_DATA_DIR=r'E:\ltp-models\\ltp_data_v3.4.0\\',  # path to the LTP model directory
         LTP_DATA_DIR=r'E:\ltp_models\ltp_data_v3.4.0\ltp_data_v3.4.0',  # path to the LTP model directory
         segmentor=Segmentor(),
         postagger=Postagger(),
 ):
     self.data = data
     self.stop_words_file = stop_words_file
     self.theta = theta
     self.LTP_DATA_DIR = LTP_DATA_DIR
     self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
     print(self.cws_model_path)
     self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
     self.segmentor = segmentor  # initialize the instance
     self.segmentor.load_with_lexicon(
         self.cws_model_path, self.LTP_DATA_DIR + 'dictionary.txt')  # load the model
     self.postagger = postagger  # initialize the instance
     self.postagger.load(self.pos_model_path)  # load the model
Example #18
    def __init__(self):
        # LTP_DIR = "./ltp_data"
        LTP_DIR = "/mnt/data/dev/model/ltp/ltp_data_v3.4.0/"

        # word segmentation model
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, 'cws.model'),
                                         'libs/userdict.txt')
        #         self.segmentor = Segmentor()
        #         self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
Example #19
    def __init__(self):
        LTP_DIR = "E:\\study\\Projects\\data-mining\\ltp\\ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        #self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), os.path.join(LTP_DIR, "word_dict"))  # load external lexicon

        self.postagger = Postagger()
        self.postagger.load_with_lexicon(os.path.join(LTP_DIR, "pos.model"), os.path.join(LTP_DIR, "n_word_dict"))  # load external lexicon

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))  # dependency parsing

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))  # named entity recognition

        self.labeller = SementicRoleLabeller()  # semantic role labeling
        self.labeller.load(os.path.join(LTP_DIR, "pisrl_win.model"))

        # load stopwords
        with open(LTP_DIR + '\\stopwords.txt', 'r', encoding='gbk') as fread:
            self.stopwords = set()
            for line in fread:
                self.stopwords.add(line.strip())
Example #20
def process(stop_words='stopwords.txt',
            craw_file='./output/craw_res.json',
            model_path='E:/pyltp/ltp_data_v3.4.0/cws.model') -> list:
    """
    将craw.py文件输出的结果进行分词和停用词处理.

    :param stop_words: 停用词文件.
    :param craw_file: craw.py文件输出的json结果文件.
    :param model_path: pyltp加载的模型文件路径.
    :return: list,返回处理结果列表
    """
    with open(stop_words,
              'r', encoding='utf-8') as f, open(craw_file,
                                                'r',
                                                encoding='utf-8') as f1:
        stop_words = set(f.read().split('\n'))  # read the stopword set

        from pyltp import Segmentor
        seg, res = Segmentor(), []  # initialize the segmenter
        seg.load(model_path)  # load the model
        for craw in [json.loads(line)
                     for line in f1]:  # parse each JSON line into a Python object
            title_lst = [
                word for word in seg.segment(craw['title'])
                if word not in stop_words
            ]
            para_lst = [
                word for word in seg.segment(craw['paragraphs'])
                if word not in stop_words
            ]
            res.append({
                'url': craw['url'],
                'segmented_title': title_lst,
                'segmented_paragraphs': para_lst,
                'file_name': craw['file_name']
            })
        seg.release()
        return res
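A minimal call sketch for process() above, assuming the default stopword file, crawl output, and cws.model paths all exist (the paths are the function's own defaults and purely illustrative):

if __name__ == '__main__':
    # Each result keeps the URL, the segmented title, the segmented
    # paragraphs, and the original file name.
    docs = process()
    print(len(docs), 'documents segmented')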
Example #21
def split_words(path, outpath):
    segmentor = Segmentor()
    if 'Windows' in platform.platform():
        segmentor.load('E:\\Github\\table-detection\\data\\table-v5\\ltp_data\\cws.model')
    elif 'Linux' in platform.platform():
        segmentor.load('/home/caory/github/table-detection/data/table-v5/ltp_data/cws.model')
    lines, sentences = [], []

    with codecs.open(path, 'r', 'utf8') as fo:
        for line in fo:
            lines.append(line.strip())

    for idx, line in enumerate(lines):
        print('%.4f%%' % (100.0 * idx / len(lines)))
        words = segmentor.segment(line)
        sentences.append(list(words))

    print(len(sentences))

    with codecs.open(outpath, 'w', 'utf8') as fw:
        for sentence in sentences:
            fw.write(' '.join(sentence) + '\n')
Example #22
    def __init__(self):
        if self.__initialized:
            return
        self.__initialized = True
        LTP_DIR = "./ltp_data"
        # customized segmentation, with post-processing to adjust POS tags
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DIR, "cws.model"),
            os.path.join(LTP_DIR, 'customized.txt'))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

        self.sentenceSplitter = SentenceSplitter()
Example #23
def postag_data():
    # word segmentation model
    segmentor = Segmentor()
    segmentor.load('cws.model')
    # POS tagging model
    postagger = Postagger()
    postagger.load('pos.model')
    
    # load the data to be segmented
    data_csv = pd.read_csv('../data.csv', encoding='utf-8-sig')
    datas = data_csv['title']

    util = Utils()
    data_processed = open('../data_processed_postagger.csv', 'w', encoding='utf-8')
    for data in datas:
        words = segmentor.segment(data)  # word segmentation
        postags = postagger.postag(words)  # POS tagging
        word_split = ' '.join(words).split(' ')
        postags_split = ' '.join(postags).split(' ')
        # concatenate the words
        concat_word = util.concat(word_split, postags_split, type='postags')
        data_processed.write(concat_word + '\n')
    data_processed.close()
Example #24
    def __init__(self, component_config: Dict[Text, Any] = None):
        super(LtpHelper, self).__init__(component_config)
        self.path = component_config['path']
        self.lexicon = component_config['lexicon']
        self.dimension = component_config['dimension']

        ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
        MODELDIR = os.path.join(ROOTDIR, self.path)
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"),
                                         self.lexicon)

        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, 'parser.model'))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(MODELDIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))
Example #25
def read_data():
    # read the unsegmented file

    segmentor = Segmentor()  # 初始化实例
    segmentor.load_with_lexicon(cws_model_path, data_path.user_dict)

    with open(data_path.train, encoding='utf-8') as fin:
        read_results = [json.loads(line.strip()) for line in fin.readlines()]
    answer_sentence = []
    question = []
    ans = []
    for item in read_results:
        answer_sentence.append(list(segmentor.segment(''.join(item['answer_sentence']).strip())))
        # print(list(jieba.cut(''.join(item['answer_sentence']))))
        que = item['question']
        if que[-1] == '?':
            que = que[:-1]
        question.append(list(segmentor.segment(que.strip())))
        ans.append(item['answer'])

    segmentor.release()  # release the model
    assert len(answer_sentence) == len(question)
    return answer_sentence, question, ans
Example #26
    def __init__(self, config):
        self.config = config
        random_seed = config['random_seed']
        random.seed(random_seed)
        torch.manual_seed(random_seed)  # cpu
        torch.cuda.manual_seed(random_seed)  #gpu
        np.random.seed(random_seed)  #numpy

        if self.config['use_bert']:
            self.tokenizer = BertTokenizer.from_pretrained(
                self.config['bert_model_name'], cache_dir=config['bert_dir'])
        elif self.config['use_xlnet']:
            self.tokenizer = XLNetTokenizer.from_pretrained(
                'hfl/chinese-xlnet-base', cache_dir=config['xlnet_dir'])
        else:
            raise Exception('Other basic encoders are not supported')
        self.latest_epoch = 0

        if self.config['cut_word_task']:
            cws_model_path = os.path.join(self.config['ltp_path'], 'cws.model')
            segmentor = Segmentor()
            segmentor.load(cws_model_path)
            self.segmentor = segmentor
Example #27
 def __init__(self, MODELDIR, exword_path='lexion'):
     self.MODELDIR = MODELDIR
     # self.output = {}
     self.words = None
     self.postags = None
     self.netags = None
     self.arcs = None
     self.exword_path = exword_path  # e.g. 'E:\LTP\ltp_data_v3.4.0\exwords.txt'
     # word segmentation
     self.segmentor = Segmentor()
     if not self.exword_path:
         # whether to load an extra lexicon
         self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
     else:
         self.segmentor.load_with_lexicon(
             os.path.join(self.MODELDIR, "cws.model"), self.exword_path)
     # model references
     # POS tagging
     self.postagger = Postagger()
     self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
     # dependency parsing
     self.parser = Parser()
     self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
Example #28
def seg_with_ltp(in_file, out_file_path, manual_seg_file):
    # initialize the model
    seg = Segmentor()  # create the instance
    seg.load("./ltp_data_v3.4.0/cws.model")  # load the segmentation model

    # save seg_result
    corpus = construct_corpus(in_file)
    f = open(out_file_path, "w", encoding='utf-8')
    for line in corpus:
        f.write("=".join(seg.segment(line)) + "\n")
        f.flush()
    f.close()

    # test QPS; the Baidu run is skipped for now because a delay was added
    corpus = construct_corpus(in_file, 500)
    start = time.time()
    for line in corpus:
        "=".join(seg.segment(line))
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)

    # test accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
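A usage sketch for seg_with_ltp() above; the input corpus, output path, and manually segmented gold file below are illustrative names, not from the original example:

qps, p, r, f1, line_aver_length = seg_with_ltp(
    'corpus.txt', 'ltp_seg_result.txt', 'manual_seg_gold.txt')
# QPS is measured on a 500-line sample; p/r/f1 come from evaluate().
print('qps=%.2f precision=%.4f recall=%.4f f1=%.4f' % (qps, p, r, f1))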
Example #29
	def __init__(self, theOptions):
		self.options = theOptions
		self.minus_verbs = self.loadMinusVerbs()
		###ltp
		# -*- coding: utf-8 -*-
		import os
		LTP_DATA_DIR = '/path/to/your/ltp_data'  # path to the LTP model directory
		cws_model_path = './/knols//ltp_data//cws.model'  # segmentation model path; the file is named `cws.model`
		pos_model_path = './/knols//ltp_data//pos.model'  # POS tagging model path; the file is named `pos.model`
		ner_model_path = './/knols//ltp_data//ner.model'  # NER model path; the file is named `ner.model`
		
		from pyltp import Segmentor
		self.segmentor = Segmentor()  # initialize the instance
		self.segmentor.load(cws_model_path)  # load the model
		#words = self.segmentor.segment('元芳你怎么看')  # word segmentation

		from pyltp import Postagger
		self.postagger = Postagger()  # initialize the instance
		self.postagger.load(pos_model_path)  # load the model
		
		from pyltp import NamedEntityRecognizer
		self.recognizer = NamedEntityRecognizer()  # initialize the instance
		self.recognizer.load(ner_model_path)  # load the model
Example #30
    def __init__(self, batch_size=400, num_epoch=100, threshold=0.105):
        self.raw_data = None
        self.train_data = []
        self.segmentor = Segmentor()
        self.segmentor.load(self.SEGMENT_PATH)
        self.stops = []
        # load stopwords
        stop_word_file = codecs.open(STOP_WORD_FILE, encoding='utf-8')
        for line in stop_word_file.readlines():
            if line != '':
                self.stops.append(line.strip())
        self.max_sequence_length = 0
        self.y_data = []
        self.buid_features()
        self.get_max_sequence_length()
        self.num_word = 0
        self.train_embedding = None  # word vectors of the training text
        self.batch_size = batch_size
        self.num_epoch = num_epoch

        self.out_dim = 17  # output dimension (number of classes)
        self.threshold = threshold  # a class is predicted only if its probability exceeds the threshold
        self.num_fold = 10