def ner(text):
    """Run LTP segmentation, POS tagging, and NER over `text`, then merge
    B-/I-/E-/S- tagged tokens into whole named entities."""
    segmentor = Segmentor()                # initialize instance
    segmentor.load(cws_model_path)         # load model
    words = segmentor.segment(text)        # word segmentation
    segmentor.release()                    # release model

    postagger = Postagger()                # initialize instance
    postagger.load(pos_model_path)         # load model
    postags = postagger.postag(words)      # POS tagging
    postagger.release()                    # release model

    recognizer = NamedEntityRecognizer()   # initialize instance
    recognizer.load(ner_model_path)        # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    recognizer.release()                   # release model

    words_list = []
    one = []  # buffer for a multi-word entity (fixes a NameError when the first tag is not B-)
    for word, flag in zip(words, netags):
        if flag.startswith("B-"):          # entity begins
            one = [word]
        elif flag.startswith("I-"):        # entity continues
            one.append(word)
        elif flag.startswith("E-"):        # entity ends: flush the buffer
            one.append(word)
            words_list.append("".join(one))
            one = []
        elif flag.startswith("S-"):        # single-word entity
            words_list.append(word)
    return words_list
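# A minimal usage sketch for `ner` above; it assumes `cws_model_path`,
# `pos_model_path`, and `ner_model_path` are module-level paths to the LTP
# `cws.model`, `pos.model`, and `ner.model` files.
if __name__ == '__main__':
    entities = ner('中国进出口银行与中国银行加强合作。')
    print(entities)  # expected: the merged entity strings found in the sentence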
def word_vec_case_set(cls, word_model_file, with_name=False):
    """Build the word-vector feature set; each entry is assumed to contain at
    most 10 words, so every entry is represented as a 10x50 2-D list.
    :param word_model_file: word-vector model file
    :param with_name: whether positive samples include person names
    :return: a dict {pos_case: {positive samples}, neg: {negative samples}}
    """
    segmentor = Segmentor()
    segmentor.load("../word2vec_process/model/cws.model")
    word_vec_model = word2vec.Word2Vec.load('../word2vec_process/model/' + word_model_file)
    case_dict = cls.load_case_set(with_name)
    word_vec_case_dict = {}
    # Represent each entry by concatenating its word vectors (500 dimensions).
    pos_case_list = case_dict['pos_case']
    pos_case_vec_dict = {}
    for pos_case in pos_case_list:
        case_words = segmentor.segment(pos_case)
        case_vec = []
        is_useful = 0
        for word in case_words:
            try:
                # concatenate this word's vector
                case_vec.append(word_vec_model[word].tolist())
                is_useful = 1
            except Exception:  # Python 3 syntax; the word is out of vocabulary
                with open("./data/not_in_vocabulary.txt", 'a') as out_file:
                    # record the missing word
                    out_file.write(word + '\n')
                case_vec.append([0] * 50)
        # truncate if too long, zero-pad if too short
        if len(case_vec) > 10:
            case_vec = case_vec[0:10]
        else:
            while len(case_vec) < 10:
                case_vec.append([0] * 50)
        if is_useful:
            pos_case_vec_dict[pos_case] = case_vec
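# A small illustrative helper, not part of the original code: the comment in
# the function above describes each entry as a concatenated 500-dim
# representation, which is one flatten away from the 10x50 matrix it builds.
def case_matrix_to_vector(case_vec):
    """Flatten a 10x50 word-vector matrix into a single 500-dim list."""
    assert len(case_vec) == 10 and all(len(row) == 50 for row in case_vec)
    return [x for row in case_vec for x in row]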
def __init__(self, cws_model_path, pos_model_path, ner_model_path, parser_model_path, **args):
    Tokenizer_Base.__init__(self, **args)
    from pyltp import Segmentor, Postagger, NamedEntityRecognizer, Parser
    self.seg_ins = Segmentor()
    self.seg_ins.load(cws_model_path)
    self.pos_ins = Postagger()
    self.pos_ins.load(pos_model_path)
    if parser_model_path is not None and os.path.exists(parser_model_path):
        self.parser_ins = Parser()
        self.parser_ins.load(parser_model_path)
    else:
        self.parser_ins = None
    # `ner_model_path` may be a glob pattern; load every matching model file
    # larger than 1 KB
    self.ner_ins = []
    for path in sorted(glob.glob(ner_model_path)):
        try:
            if os.path.getsize(path) > 1024:
                self.ner_ins.append(NamedEntityRecognizer())
                self.ner_ins[-1].load(path)
        except Exception as err:
            print(err)
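# A hedged companion-method sketch for the class above: the original only
# shows loading multiple NER models, so this merge policy (first non-'O' tag
# wins) is an assumption, not the author's method.
def recognize_all(self, words, postags):
    tag_lists = [ner.recognize(words, postags) for ner in self.ner_ins]
    merged = []
    for i in range(len(words)):
        # keep, per token, the first non-'O' tag any loaded model produced
        tag = 'O'
        for tags in tag_lists:
            if tags[i] != 'O':
                tag = tags[i]
                break
        merged.append(tag)
    return merged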
def __init__(self, ltp_path, user_path):
    cws_model_path = os.path.join(ltp_path, 'cws.model')       # segmentation model path; the model is named `cws.model`
    user_model_path = os.path.join(user_path, 'userdict.txt')  # user-defined dictionary
    pos_model_path = os.path.join(ltp_path, 'pos.model')       # POS-tagging model path; the model is named `pos.model`
    sym_dict_path = os.path.join(user_path, 'reladict.txt')
    self.segmentor = Segmentor()  # initialize instance
    self.segmentor.load_with_lexicon(cws_model_path, user_model_path)  # load model with the user lexicon
    self.postagger = Postagger()  # initialize instance
    self.postagger.load_with_lexicon(pos_model_path, user_model_path)  # load model with the user lexicon
    # load the synonym dictionary: one comma-separated group of related words per line
    self.list1 = []
    with open(sym_dict_path, mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            rela_array = line.strip("\n").split(",")
            self.list1.append(list(rela_array))
def __init__(self, data_dir):
    self.LTP_DATA_DIR = data_dir
    # segmentation model
    cws_model = os.path.join(self.LTP_DATA_DIR, 'cws.model')
    self.segmentor = Segmentor()
    self.segmentor.load(cws_model)
    # self.segmentor.load_with_lexicon(cws_model, DEFAULT_SYNONYMS_PATH)
    # POS-tagging model
    pos_model = os.path.join(self.LTP_DATA_DIR, 'pos.model')
    self.postagger = Postagger()
    self.postagger.load(pos_model)
    # named-entity-recognition model
    ner_model = os.path.join(self.LTP_DATA_DIR, 'ner.model')
    self.recongnizer = NamedEntityRecognizer()
    self.recongnizer.load(ner_model)
    # dependency-parsing model
    parse_model = os.path.join(self.LTP_DATA_DIR, 'parser.model')
    self.parser = Parser()
    self.parser.load(parse_model)
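# The models loaded above stay resident until released. A minimal companion
# method sketch for the same class (assumed, not in the original), mirroring
# the explicit `release()` calls used by the function-style snippets here:
def release(self):
    self.segmentor.release()
    self.postagger.release()
    self.recongnizer.release()
    self.parser.release()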
def main():
    cws_model_path = os.path.join(LTP_DATA_DIR, "cws.model")  # segmentation model path; the model is named `cws.model`
    segmentor = Segmentor()         # initialize instance
    segmentor.load(cws_model_path)  # load model
    train_dict = json.loads(
        open(os.path.join(DATA_SET_DIR, train_file_path), "r").readline())
    test_dict = json.loads(
        open(os.path.join(DATA_SET_DIR, test_file_path), "r").readline())
    contents = []
    for value in train_dict.values():
        contents.append(" ".join(segmentor.segment(value["content"])))
    for value in test_dict.values():
        contents.append(" ".join(segmentor.segment(value["content"])))
    segmentor.release()  # release model
    with open("contents.txt", "w", encoding='utf-8') as contents_file:
        contents_file.write("\n".join(contents))
def __init__(self):
    LTP_DIR = "/home/python/ltp/ltp_data_v3.4.0"
    # segmentation model, single file
    self.segmentor = Segmentor()
    self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
    # POS-tagging model, single file
    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
    # dependency-parsing model, single file
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_DIR, "parser.model"))
    # named-entity-recognition model, single file
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
    # semantic-role-labelling model, multiple files
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
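# A minimal usage sketch for the pipeline class above. `LtpPipeline` is an
# assumed name for the enclosing class (the original only shows __init__),
# and the sample sentence is illustrative.
nlp = LtpPipeline()
words = nlp.segmentor.segment('他叫汤姆去拿外衣。')
postags = nlp.postagger.postag(words)
arcs = nlp.parser.parse(words, postags)
roles = nlp.labeller.label(words, postags, arcs)
for role in roles:
    # role.index is the predicate's word index; each argument carries a name
    # (A0, A1, TMP, ...) and an inclusive word-index range
    print(role.index, [(arg.name, arg.range.start, arg.range.end)
                       for arg in role.arguments])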
def __init__(self, in_file_path, out_file_path, model_path, clean_output=False):
    self.logger = logging.getLogger("TripleIE")
    self.in_file_path = in_file_path
    self.out_file_path = out_file_path
    self.model_path = model_path
    self.clean_output = clean_output  # whether the output omits prompts
    self.out_handle = None
    self.segmentor = Segmentor()
    self.segmentor.load(os.path.join(self.model_path, "cws.model"))
    self.postagger = Postagger()
    self.postagger.load(os.path.join(self.model_path, "pos.model"))
    self.parser = Parser()
    self.parser.load(os.path.join(self.model_path, "parser.model"))
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(self.model_path, "ner.model"))
def segmentor(self):
    segmentor = Segmentor()  # initialize instance
    # segmentor.load_with_lexicon(model_path, dict_path)  # optionally load with a custom lexicon
    segmentor.load('/home/student/project-01/ltp_data/cws.model')  # server path
    words = segmentor.segment(self.sentence)  # word segmentation
    # convert the result to a list for output
    words_list = list(words)
    print('\nSegmentation result:')
    for word in words_list:
        print(word, end=' ')
    segmentor.release()  # release model
    self.words = words_list
    return self.words
def parser(sent):
    from pyltp import Segmentor
    segmentor = Segmentor()  # initialize instance
    segmentor.load('../../data/ltp_data/cws.model')  # load model
    words = segmentor.segment(sent)  # word segmentation
    segmentor.release()

    from pyltp import Postagger
    postagger = Postagger()  # initialize instance
    postagger.load('../../data/ltp_data/pos.model')  # load model
    postags = postagger.postag(words)  # POS tagging
    postagger.release()

    from pyltp import NamedEntityRecognizer
    recognizer = NamedEntityRecognizer()  # initialize instance
    recognizer.load('../../data/ltp_data/ner.model')  # load model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    recognizer.release()

    from pyltp import Parser
    parser = Parser()  # initialize instance
    parser.load('../../data/ltp_data/parser.model')  # load model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()
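# A small follow-on sketch (not part of the original) that turns the arcs
# printed above into readable (child, relation, head) triples. In pyltp,
# arc.head is a 1-based index into `words`, and 0 denotes the virtual root.
def arcs_to_triples(words, arcs):
    triples = []
    for i, arc in enumerate(arcs):
        head_word = 'ROOT' if arc.head == 0 else words[arc.head - 1]
        triples.append((words[i], arc.relation, head_word))
    return triples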
def get_wordsList(content_after_DS, stopwords_flag=False):
    stopwords = []
    if stopwords_flag:
        stopwords = load_stopwords()
    wordSegmentor_ltp = Segmentor()
    cws_model_path = config.LTP_DATA_DIR + r'\cws.model'
    wordSegmentor_ltp.load(cws_model_path)
    words_list = []
    for text in content_after_DS.split():
        words_segment_list = list(wordSegmentor_ltp.segment(text))
        words_list.extend(words_segment_list)
    if stopwords_flag:
        words_list = [word for word in words_list if word not in stopwords]
    wordSegmentor_ltp.release()
    words_list_str = ' '.join(words_list)
    return words_list_str
def run():
    # segmentation + dropping empty lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('key/pinglun_resource.txt', 'r', encoding='utf-8')
    # cont = open('key/text.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce_new.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize instance
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('cws.model', 'userdict.txt')  # load the model with a user dictionary
    postagger = Postagger()  # initialize instance
    postagger.load('pos.model')  # load model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # word segmentation
            postags = postagger.postag(words)    # POS tagging
            for word, tag in zip(words, postags):
                if tag != 'wp':  # non-punctuation: write the word
                    f.write(word + ' ')
                else:            # punctuation: start a new line
                    f.write('\n')
            f.write('\n')
    f.close()
    cont.close()
def ltp_module():
    LTP_DATA_DIR = 'ltp_data_v3.4.0/'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl.model')

    segmentor = Segmentor()
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    parser = Parser()
    # labeller = SementicRoleLabeller()
    segmentor.load(cws_model_path)
    postagger.load(pos_model_path)
    recognizer.load(ner_model_path)
    parser.load(par_model_path)
    # labeller.load(srl_model_path)

    words = segmentor.segment('格力电器美的造')
    postags = postagger.postag(words)
    netags = recognizer.recognize(words, postags)
    arcs = parser.parse(words, postags)
    # roles = labeller.label(words, postags, arcs)
    words_list = list(words)
    postags_list = list(postags)

    segmentor.release()
    postagger.release()
    recognizer.release()
    parser.release()
    # labeller.release()

    for w in words_list:
        print(w)
    for p in postags_list:
        print(p)
    print('\t'.join(netags))
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
def init(self, base_dir, is_custom_seg_dict):
    segmentor_model = os.path.join(base_dir, 'cws.model')
    tagger_model = os.path.join(base_dir, 'pos.model')
    ner_model = os.path.join(base_dir, 'ner.model')
    parser_model = os.path.join(base_dir, 'parser.model')
    custom_seg_dict = os.path.join(dict_dir, 'vertical_domain_baike_dict.txt')
    self.segmentor = Segmentor()
    if is_custom_seg_dict:
        self.segmentor.load_with_lexicon(segmentor_model, custom_seg_dict)
    else:
        self.segmentor.load(segmentor_model)
    self.tagger = Postagger()
    self.tagger.load(tagger_model)
    self.nertagger = NamedEntityRecognizer()
    self.nertagger.load(ner_model)
    self.parser = Parser()
    self.parser.load(parser_model)
def get_name(line):
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # LTP model directory path
    # word segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load model
    words = segmentor.segment(line)  # segment
    # POS tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load model
    postags = postagger.postag(words)
    # postags = postagger.postag(['中国', '进出口', '银行', '与', '中国银行', '加强', '合作', '。'])
    res = []
    # named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    netags = recognizer.recognize(words, postags)
    # entity tags look like 'S-Nh' / 'B-Nh' / ...; 'Nh' marks a person name
    for i, data in enumerate(list(netags)):
        if data[2:] == "Nh":
            res.append(words[i])
    return list(set(res))
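# A minimal sketch of calling the extractor above; the sample sentence is
# illustrative, and the model files are assumed to exist under
# `ltp_data_v3.4.0`.
if __name__ == '__main__':
    names = get_name('李明和王芳一起去了北京。')
    print(names)  # expected to contain the person names found in the sentence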
def __init__(self):
    # self.tfidf = joblib.load('model/tfidf.model')
    self.law = load_model('CNN_base_best_law.h5', custom_objects={'f1': f1})
    self.accu = load_model('CNN_base_best_accusation.h5', custom_objects={'f1': f1})
    self.time = load_model('CNN_base_best_time.h5', custom_objects={'f1': f1})
    self.batch_size = 128
    self.max_sequence_length = 175
    segmentor = Segmentor()  # initialize the instance used to split words
    segmentor.load('/home/wshong/PycharmProjects/CAIL2018/text_pre_process/cws.model')  # load the model used to cut text
    self.cut = segmentor.segment
    self.dict_path = '/home/wshong/PycharmProjects/CAIL2018/text_pre_process/word_dict.txt'
    self.word_dict = get_word_dict(self.dict_path)
    self.path = '/home/wshong/PycharmProjects/CAIL2018/text_pre_process/stopwords.txt'
    self.stopwords = []
    with open(self.path, 'r', encoding='utf-8') as fi:
        for line in fi.readlines():
            word = line.strip('\n')
            self.stopwords.append(word)
    # matches date/time expressions such as 2018年3月5日下午3时20分许
    self.pattern = re.compile(
        '([0-9]{4}年)([0-9][0-9]?月)?([0-9][0-9]?日)?(凌晨|上午|中午|下午|晚上|傍晚|晚|早上)?([0-9][0-9]?时)?([0-5]?[0-9]分)?(许|左右)?')
def __init__(
        self,
        data,
        stop_words_file='stop_words.txt',
        theta=0.5,
        LTP_DATA_DIR=r'E:\ltp_models\ltp_data_v3.4.0\ltp_data_v3.4.0',  # path to the LTP model directory
        segmentor=Segmentor(),   # note: default instances are created at definition time
        postagger=Postagger(),
):
    self.data = data
    self.stop_words_file = stop_words_file
    self.theta = theta
    self.LTP_DATA_DIR = LTP_DATA_DIR
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
    print(self.cws_model_path)
    self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
    self.segmentor = segmentor  # segmentor instance
    self.segmentor.load_with_lexicon(
        self.cws_model_path,
        os.path.join(self.LTP_DATA_DIR, 'dictionary.txt'))  # load model with the custom lexicon (os.path.join avoids a missing path separator)
    self.postagger = postagger  # POS tagger instance
    self.postagger.load(self.pos_model_path)  # load model
def __init__(self):
    # LTP_DIR = "./ltp_data"
    LTP_DIR = "/mnt/data/dev/model/ltp/ltp_data_v3.4.0/"
    # segmentation model, loaded together with a user dictionary
    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, 'cws.model'), 'libs/userdict.txt')
    # self.segmentor = Segmentor()
    # self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_DIR, "parser.model"))
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
def __init__(self):
    LTP_DIR = "E:\\study\\Projects\\data-mining\\ltp\\ltp_data_v3.4.0"
    self.segmentor = Segmentor()
    self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))  # the original left the segmentor unloaded; load the base model here
    # self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), os.path.join(LTP_DIR, "word_dict"))  # load with an external lexicon
    self.postagger = Postagger()
    self.postagger.load_with_lexicon(os.path.join(LTP_DIR, "pos.model"), os.path.join(LTP_DIR, "n_word_dict"))  # load with an external lexicon
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_DIR, "parser.model"))  # dependency parsing
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))  # named entity recognition
    self.labeller = SementicRoleLabeller()  # semantic role labelling
    self.labeller.load(os.path.join(LTP_DIR, "pisrl_win.model"))
    # load stopwords
    with open(LTP_DIR + '\\stopwords.txt', 'r', encoding='gbk') as fread:
        self.stopwords = set()
        for line in fread:
            self.stopwords.add(line.strip())
def process(stop_words='stopwords.txt',
            craw_file='./output/craw_res.json',
            model_path='E:/pyltp/ltp_data_v3.4.0/cws.model') -> list:
    """Segment the output of craw.py and filter out stopwords.
    :param stop_words: stopword file.
    :param craw_file: JSON result file produced by craw.py.
    :param model_path: path of the model loaded by pyltp.
    :return: list of processed results
    """
    with open(stop_words, 'r', encoding='utf-8') as f, open(craw_file, 'r', encoding='utf-8') as f1:
        stop_words = set(f.read().split('\n'))  # load the stopwords
        from pyltp import Segmentor
        seg, res = Segmentor(), []  # initialize the segmentor
        seg.load(model_path)        # load the model
        for craw in [json.loads(line) for line in f1]:  # parse one JSON record per line
            title_lst = [
                word for word in seg.segment(craw['title'])
                if word not in stop_words
            ]
            para_lst = [
                word for word in seg.segment(craw['paragraphs'])
                if word not in stop_words
            ]
            res.append({
                'url': craw['url'],
                'segmented_title': title_lst,
                'segmented_paragraphs': para_lst,
                'file_name': craw['file_name']
            })
        seg.release()
        return res
def split_words(path, outpath):
    segmentor = Segmentor()
    if 'Windows' in platform.platform():
        segmentor.load('E:\\Github\\table-detection\\data\\table-v5\\ltp_data\\cws.model')
    elif 'Linux' in platform.platform():
        segmentor.load('/home/caory/github/table-detection/data/table-v5/ltp_data/cws.model')
    lines, sentences = [], []
    with codecs.open(path, 'r', 'utf8') as fo:
        for line in fo:
            lines.append(line.strip())
    for idx, line in enumerate(lines):
        print('%.4f%%' % (100.0 * idx / len(lines)))
        # under Python 3, pyltp accepts and returns str, so no explicit
        # encode/decode is needed
        words = segmentor.segment(line)
        sentences.append(list(words))
    print(len(sentences))
    with codecs.open(outpath, 'w', 'utf8') as fw:
        for sentence in sentences:
            fw.write(' '.join(sentence) + '\n')
def __init__(self):
    if self.__initialized:
        return
    self.__initialized = True
    LTP_DIR = "./ltp_data"
    # customized segmentation; POS tags are adjusted in post-processing
    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(
        os.path.join(LTP_DIR, "cws.model"),
        os.path.join(LTP_DIR, 'customized.txt'))
    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_DIR, "parser.model"))
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
    self.sentenceSplitter = SentenceSplitter()
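# The `__initialized` guard above suggests a singleton; a minimal matching
# `__new__` sketch under that assumption (the enclosing class, here named
# `LtpSingleton`, is not shown in the original):
class LtpSingleton(object):
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.__initialized = False  # mangled to _LtpSingleton__initialized
        return cls._instance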
def postag_data():
    # segmentation model
    segmentor = Segmentor()
    segmentor.load('cws.model')
    # POS-tagging model
    postagger = Postagger()
    postagger.load('pos.model')
    # load the data to be segmented
    data_csv = pd.read_csv('../data.csv', encoding='utf-8-sig')
    datas = data_csv['title']
    util = Utils()
    with open('../data_processed_postagger.csv', 'w', encoding='utf-8') as data_processed:
        for data in datas:
            words = segmentor.segment(data)    # segment
            postags = postagger.postag(words)  # tag
            word_split = list(words)
            postags_split = list(postags)
            # join the words with their tags
            concat_word = util.concat(word_split, postags_split, type='postags')
            data_processed.write(concat_word + '\n')
    segmentor.release()
    postagger.release()
def __init__(self, component_config: Dict[Text, Any] = None):
    super(LtpHelper, self).__init__(component_config)
    self.path = component_config['path']
    self.lexicon = component_config['lexicon']
    self.dimension = component_config['dimension']
    ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
    MODELDIR = os.path.join(ROOTDIR, self.path)
    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), self.lexicon)
    self.postagger = Postagger()
    self.postagger.load(os.path.join(MODELDIR, "pos.model"))
    self.parser = Parser()
    self.parser.load(os.path.join(MODELDIR, 'parser.model'))
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(MODELDIR, "ner.model"))
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))
def read_data():
    # read the unsegmented file
    segmentor = Segmentor()  # initialize instance
    segmentor.load_with_lexicon(cws_model_path, data_path.user_dict)
    with open(data_path.train, encoding='utf-8') as fin:
        read_results = [json.loads(line.strip()) for line in fin.readlines()]
    answer_sentence = []
    question = []
    ans = []
    for item in read_results:
        answer_sentence.append(list(segmentor.segment(''.join(item['answer_sentence']).strip())))
        que = item['question']
        if que[-1] == '?':
            que = que[:-1]  # drop the trailing question mark
        question.append(list(segmentor.segment(que.strip())))
        ans.append(item['answer'])
    segmentor.release()  # release model
    assert len(answer_sentence) == len(question)
    return answer_sentence, question, ans
def __init__(self, config):
    self.config = config
    random_seed = config['random_seed']
    random.seed(random_seed)
    torch.manual_seed(random_seed)       # CPU
    torch.cuda.manual_seed(random_seed)  # GPU
    np.random.seed(random_seed)          # NumPy
    if self.config['use_bert']:
        self.tokenizer = BertTokenizer.from_pretrained(
            self.config['bert_model_name'], cache_dir=config['bert_dir'])
    elif self.config['use_xlnet']:
        self.tokenizer = XLNetTokenizer.from_pretrained(
            'hfl/chinese-xlnet-base', cache_dir=config['xlnet_dir'])
    else:
        raise Exception('Unsupported basic encoder')
    self.latest_epoch = 0
    if self.config['cut_word_task']:
        cws_model_path = os.path.join(self.config['ltp_path'], 'cws.model')
        segmentor = Segmentor()
        segmentor.load(cws_model_path)
        self.segmentor = segmentor
def __init__(self, MODELDIR, exword_path='lexion'):
    self.MODELDIR = MODELDIR
    # self.output = {}
    self.words = None
    self.postags = None
    self.netags = None
    self.arcs = None
    self.exword_path = exword_path  # e.g. 'E:\LTP\ltp_data_v3.4.0\exwords.txt'
    # word segmentation
    self.segmentor = Segmentor()
    if not self.exword_path:
        # no extra lexicon: load the plain model
        self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
    else:
        # load the model together with the extra lexicon
        self.segmentor.load_with_lexicon(
            os.path.join(self.MODELDIR, "cws.model"), self.exword_path)
    # POS tagging
    self.postagger = Postagger()
    self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
    # dependency parsing
    self.parser = Parser()
    self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
def seg_with_ltp(in_file, out_file_path, manual_seg_file):
    # initialize the model
    seg = Segmentor()  # create the instance
    seg.load("./ltp_data_v3.4.0/cws.model")  # load the segmentation model
    # save the segmentation result
    corpus = construct_corpus(in_file)
    with open(out_file_path, "w", encoding='utf-8') as f:
        for line in corpus:
            f.write("=".join(seg.segment(line)) + "\n")
            f.flush()
    # measure QPS (Baidu is skipped for now because of its added latency)
    corpus = construct_corpus(in_file, 500)
    start = time.time()
    for line in corpus:
        "=".join(seg.segment(line))
    end = time.time()
    qps = round(len(corpus) / (end - start), 2)
    # measure accuracy
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
def __init__(self, theOptions):
    self.options = theOptions
    self.minus_verbs = self.loadMinusVerbs()
    # LTP setup
    import os
    cws_model_path = './/knols//ltp_data//cws.model'  # segmentation model path; the model is named `cws.model`
    pos_model_path = './/knols//ltp_data//pos.model'  # POS-tagging model path; the model is named `pos.model`
    ner_model_path = './/knols//ltp_data//ner.model'  # NER model path; the model is named `ner.model`
    from pyltp import Segmentor
    self.segmentor = Segmentor()  # initialize instance
    self.segmentor.load(cws_model_path)  # load model
    # words = self.segmentor.segment('元芳你怎么看')  # segmentation example
    from pyltp import Postagger
    self.postagger = Postagger()  # initialize instance
    self.postagger.load(pos_model_path)  # load model
    from pyltp import NamedEntityRecognizer
    self.recognizer = NamedEntityRecognizer()  # initialize instance
    self.recognizer.load(ner_model_path)  # load model
def __init__(self, batch_size=400, num_epoch=100, threshold=0.105):
    self.raw_data = None
    self.train_data = []
    self.segmentor = Segmentor()
    self.segmentor.load(self.SEGMENT_PATH)
    self.stops = []
    # load stopwords
    stop_word_file = codecs.open(STOP_WORD_FILE, encoding='utf-8')
    for line in stop_word_file.readlines():
        if line != '':
            self.stops.append(line.strip())
    stop_word_file.close()
    self.max_sequence_length = 0
    self.y_data = []
    self.buid_features()
    self.get_max_sequence_length()
    self.num_word = 0
    self.train_embedding = None  # word vectors of the training text
    self.batch_size = batch_size
    self.num_epoch = num_epoch
    self.out_dim = 17  # output dimension (number of classes)
    self.threshold = threshold  # a class is predicted only when its probability exceeds the threshold
    self.num_fold = 10