def predict(self, sentence):
    """Predict named entities in *sentence*.

    Normalizes full-width characters to half-width, wraps the character
    sequence in ``<BOS>``/``<EOS>`` markers, extracts window features and
    runs the CRF model.  Characters whose predicted tag is not ``'O'``
    are concatenated; a space separates consecutive entities whose tag
    suffix changes.

    Returns a space-separated string of predicted entity characters.

    # TODO: convert the raw tag sequence into typed entity spans?
    """
    self.load_model()
    u_sent = q_to_b(sentence)  # full-width -> half-width normalization
    word_lists = [[u'<BOS>'] + [c for c in u_sent] + [u'<EOS>']]
    word_grams = [
        self.corpus.segment_by_window(word_list)
        for word_list in word_lists
    ]
    features = self.corpus.extract_feature(word_grams)
    y_predict = self.model.predict(features)
    # NOTE(review): assumes y_predict[0] aligns 1:1 with u_sent, i.e.
    # segment_by_window drops the <BOS>/<EOS> positions — confirm.
    entity = u''
    for index in range(len(y_predict[0])):
        if y_predict[0][index] != u'O':
            # New entity begins when the tag suffix differs from the
            # previous position's tag suffix.
            if index > 0 and y_predict[0][index][-1] != y_predict[0][index - 1][-1]:
                entity += u' '
            entity += u_sent[index]
        elif entity and entity[-1] != u' ':
            # Bug fix: `entity[-1]` raised IndexError when the sentence
            # started with 'O' tags and `entity` was still empty.
            entity += u' '
    return entity
def pre_process(self, fin):
    """Pre-process the corpus file *fin* into ``self.lines``.

    Each raw line is normalized from full-width to half-width
    characters, split on single spaces, and passed through the
    time / person-name / organization processors in order.  Lines that
    yield no words are skipped.
    """
    lines = self.load_corpus(fin)
    self.lines = []
    for line in lines:
        # Full-width -> half-width normalization before tokenizing;
        # the `if word` filter drops empty tokens from repeated spaces.
        words = [word for word in q_to_b(line.strip()).split(' ') if word]
        if not words:  # idiomatic emptiness test (was: len(words) <= 0)
            continue
        new_words = self.process_time(words)        # time expressions
        new_words = self.process_person(new_words)  # person names
        new_words = self.process_org(new_words)     # organizations
        self.lines.append(new_words)
def pre_process(cls):
    """Pre-process the training corpus.

    Reads raw lines (bytes, decoded as UTF-8), normalizes full-width
    characters to half-width, applies the time / name / organization
    processors, and writes the result — minus each line's first token —
    back out UTF-8-encoded to the configured processed-corpus path.
    """
    source_path = cls._config.get('ner', 'train_corpus_path')
    raw_lines = cls.read_corpus_from_file(source_path)
    processed = []
    for raw in raw_lines:
        tokens = q_to_b(raw.decode('utf-8').strip()).split(u' ')
        # Run the annotation-normalization pipeline in fixed order.
        for step in (cls.process_t, cls.process_nr, cls.process_k):
            tokens = step(tokens)
        # Drop the leading token (presumably a line id/date prefix —
        # TODO confirm against the corpus format).
        processed.append(' '.join(tokens[1:]))
    target_path = cls._config.get('ner', 'process_corpus_path')
    cls.write_corpus_to_file(data='\n'.join(processed).encode('utf-8'),
                             file_path=target_path)
def pre_process(cls):
    """Pre-process the training corpus.

    Reads the raw training file, normalizes full-width characters to
    half-width, runs the time / name / organization processors on each
    line, strips each line's first token, and writes the joined result
    to the configured processed-corpus path.
    """
    in_path = cls._config.get('ner', 'train_corpus_path')
    out_lines = []
    for raw in cls.read_corpus_from_file(in_path):
        words = q_to_b(raw.strip()).split(' ')
        words = cls.process_t(words)
        words = cls.process_nr(words)
        words = cls.process_k(words)
        # First token is dropped (presumably a line prefix — TODO
        # confirm against the corpus format).
        out_lines.append(' '.join(words[1:]))
    out_path = cls._config.get('ner', 'process_corpus_path')
    cls.write_corpus_to_file(data='\n'.join(out_lines), file_path=out_path)
def predict(self, sentence):
    """Predict named entities in *sentence*.

    Runs the CRF model over the half-width-normalized character
    sequence (wrapped with ``<BOS>``/``<EOS>``) and returns the
    characters tagged as entities, with a space between distinct
    entities.
    """
    self.load_model()
    u_sent = q_to_b(sentence)  # full-width -> half-width normalization
    word_lists = [[u'<BOS>'] + [c for c in u_sent] + [u'<EOS>']]
    word_grams = [self.corpus.segment_by_window(word_list)
                  for word_list in word_lists]
    features = self.corpus.extract_feature(word_grams)
    y_predict = self.model.predict(features)
    # NOTE(review): assumes y_predict[0] aligns 1:1 with u_sent — confirm.
    entity = u''
    for index in range(len(y_predict[0])):
        tag = y_predict[0][index]
        if tag != u'O':
            # Insert a separator when the tag suffix changes between
            # adjacent entity characters.
            if index > 0 and tag[-1] != y_predict[0][index - 1][-1]:
                entity += u' '
            entity += u_sent[index]
        elif entity and entity[-1] != u' ':
            # Bug fix: guard against IndexError when the first tags are
            # 'O' and `entity` is still empty.
            entity += u' '
    return entity
def predict(self, sentence):
    """Predict named entities in *sentence*.

    Normalizes the input to half-width characters, builds
    ``<BOS>``/``<EOS>``-wrapped window features, and decodes the CRF
    tag sequence into a space-separated string of entity characters.
    """
    self.load_model()
    u_sent = q_to_b(sentence)  # full-width -> half-width normalization
    word_lists = [[u'<BOS>'] + [c for c in u_sent] + [u'<EOS>']]
    word_grams = [
        self.corpus.segment_by_window(word_list)
        for word_list in word_lists
    ]
    features = self.corpus.extract_feature(word_grams)
    y_predict = self.model.predict(features)
    # NOTE(review): assumes y_predict[0] aligns 1:1 with u_sent, i.e.
    # segment_by_window drops the <BOS>/<EOS> positions — confirm.
    entity = u''
    for index in range(len(y_predict[0])):
        if y_predict[0][index] != u'O':
            # Separator between entities whose tag suffix changes.
            if index > 0 and y_predict[0][index][-1] != y_predict[0][index - 1][-1]:
                entity += u' '
            entity += u_sent[index]
        elif entity and entity[-1] != u' ':
            # Bug fix: `entity[-1]` raised IndexError when the sentence
            # began with 'O' tags and `entity` was still empty.
            entity += u' '
    return entity