Example #1
 def predict(self, sentence):
     """
     预测
     # TODO 加入标签到序列的转换?
     """
     self.load_model()
     u_sent = q_to_b(sentence)
     word_lists = [[u'<BOS>'] + [c for c in u_sent] + [u'<EOS>']]
     # Example value of word_lists[0]: each character becomes its own token, padded with BOS/EOS markers:
     # ['<BOS>', '新', '华', '社', '北', '京', '十', '二', '月', '三', '十', '一', '日', '电', '(', '中',
     # '央', '人', '民', '广', '播', '电', '台', '记', '者', '刘', '振', '英', '、', '新', '华', '社', '记', '者', '张',
     # '宿', '堂', ')', '今', '天', '是', '一', '九', '九', '七', '年', '的', '最', '后', '一', '天', '。', '辞', '旧',
     # '迎', '新', '之', '际', ',', '国', '务', '院', '总', '理', '李', '鹏', '今', '天', '上', '午', '来', '到', '北',
     # '京', '石', '景', '山', '发', '电', '总', '厂', '考', '察', ',', '向', '广', '大', '企', '业', '职', '工', '表',
     # '示', '节', '日', '的', '祝', '贺', ',', '向', '将', '要', '在', '节', '日', '期', '间', '坚', '守', '工', '作',
     # '岗', '位', '的', '同', '志', '们', '表', '示', '慰', '问', '<EOS>']
     word_grams = [
         self.corpus.segment_by_window(word_list)
         for word_list in word_lists
     ]
     features = self.corpus.extract_feature(word_grams)
     y_predict = self.model.predict(features)
     entity = u''
     for index in range(len(y_predict[0])):
         if y_predict[0][index] != u'O':
             if index > 0 and y_predict[0][index][-1] != y_predict[0][index - 1][-1]:
                 entity += u' '
             entity += u_sent[index]
         elif entity and entity[-1] != u' ':
             entity += u' '
     return entity
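All of these examples first normalize the input with q_to_b, which (as the comment in Example #2 below notes) converts full-width characters to half-width. The helper itself is not shown on this page; a minimal sketch of such a conversion, assuming the usual Unicode offset trick, could look like this:

def q_to_b(q_str):
    """Convert full-width (全角) characters to their half-width (半角) equivalents."""
    b_str = u''
    for ch in q_str:
        code = ord(ch)
        if code == 0x3000:                 # full-width space -> ASCII space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:     # full-width ASCII range -> half-width
            code -= 0xFEE0
        b_str += chr(code)
    return b_str

print(q_to_b(u'ABC,123'))  # -> 'ABC,123'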
Example #2
 def pre_process(self, fin):
     lines = self.load_corpus(fin)
     self.lines = []
     for line in lines:
         words = [word for word in q_to_b(line.strip()).split(' ') if word]  # full-width to half-width, then split on spaces
         if not words:
             continue
         new_words = self.process_time(words)  # handle time expressions
         new_words = self.process_person(new_words)  # handle person names
         new_words = self.process_org(new_words)  # handle organization names
         self.lines.append(new_words)
Example #3
 def pre_process(cls):
     """
     语料预处理 
     """
     train_corpus_path = cls._config.get('ner', 'train_corpus_path')
     lines = cls.read_corpus_from_file(train_corpus_path)
     new_lines = []
     for line in lines:
         words = q_to_b(line.decode('utf-8').strip()).split(u'  ')
         pro_words = cls.process_t(words)
         pro_words = cls.process_nr(pro_words)
         pro_words = cls.process_k(pro_words)
         new_lines.append('  '.join(pro_words[1:]))
     process_corpus_path = cls._config.get('ner', 'process_corpus_path')
     cls.write_corpus_to_file(data='\n'.join(new_lines).encode('utf-8'), file_path=process_corpus_path)
Example #4
 def pre_process(cls):
     """
     语料预处理
     """
     train_corpus_path = cls._config.get('ner', 'train_corpus_path')
     lines = cls.read_corpus_from_file(train_corpus_path)
     new_lines = []
     for line in lines:
         words = q_to_b(line.strip()).split('  ')
         pro_words = cls.process_t(words)
         pro_words = cls.process_nr(pro_words)
         pro_words = cls.process_k(pro_words)
         new_lines.append('  '.join(pro_words[1:]))
     process_corpus_path = cls._config.get('ner', 'process_corpus_path')
     cls.write_corpus_to_file(data='\n'.join(new_lines),
                              file_path=process_corpus_path)
Example #5
 def predict(self, sentence):
     """
     预测
     """
     self.load_model()
     u_sent = q_to_b(sentence)
     word_lists = [[u'<BOS>']+[c for c in u_sent]+[u'<EOS>']]
     word_grams = [self.corpus.segment_by_window(word_list) for word_list in word_lists]
     features = self.corpus.extract_feature(word_grams)
     y_predict = self.model.predict(features)
     entity = u''
     for index in range(len(y_predict[0])):
         if y_predict[0][index] != u'O':
             if index > 0 and y_predict[0][index][-1] != y_predict[0][index-1][-1]:
                 entity += u' '
             entity += u_sent[index]
         elif entity and entity[-1] != u' ':
             entity += u' '
     return entity
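predict pads the character sequence with <BOS>/<EOS> and then calls self.corpus.segment_by_window, which is not shown on this page. Assuming it slides a three-character window over the padded list, so each original character keeps its left and right neighbour as context and the number of windows matches len(u_sent) as the indexing in the loop requires, a sketch would be:

def segment_by_window(word_list, window=3):
    # Slide a fixed-size window over the token list; window size 3 is an assumption.
    return [word_list[i:i + window] for i in range(len(word_list) - window + 1)]

chars = [u'<BOS>'] + list(u'李鹏') + [u'<EOS>']
print(segment_by_window(chars))
# -> [['<BOS>', '李', '鹏'], ['李', '鹏', '<EOS>']]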
Example #6
 def predict(self, sentence):
     """
     预测
     """
     self.load_model()
     u_sent = q_to_b(sentence)
     word_lists = [[u'<BOS>'] + [c for c in u_sent] + [u'<EOS>']]
     word_grams = [
         self.corpus.segment_by_window(word_list)
         for word_list in word_lists
     ]
     features = self.corpus.extract_feature(word_grams)
     y_predict = self.model.predict(features)
     entity = u''
     for index in range(len(y_predict[0])):
         if y_predict[0][index] != u'O':
             if index > 0 and y_predict[0][index][-1] != y_predict[0][index - 1][-1]:
                 entity += u' '
             entity += u_sent[index]
         elif entity and entity[-1] != u' ':
             entity += u' '
     return entity
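Finally, the entity-assembly loop in these predict examples concatenates characters whose non-'O' tags share the same type suffix and separates entities with spaces. A self-contained illustration with made-up characters and hypothetical BIES-style tags (the real tag set depends on the trained model):

u_sent = u'李鹏在北京'
tags = [u'B_nr', u'E_nr', u'O', u'B_ns', u'E_ns']  # hypothetical tags: nr = person, ns = place

entity = u''
for index, tag in enumerate(tags):
    if tag != u'O':
        # A space starts a new entity when the type suffix changes between neighbours.
        if index > 0 and tag[-1] != tags[index - 1][-1]:
            entity += u' '
        entity += u_sent[index]
    elif entity and entity[-1] != u' ':
        entity += u' '

print(entity.split())  # -> ['李鹏', '北京']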