def read_instance(self, seg_lists, labels, mode='train'):
    """Convert segmented sentences plus labels into text/id instances.

    :param seg_lists: word-segmented sentences (list of word lists).
    :param labels: label strings, parallel to seg_lists.
    :param mode: lookup mode forwarded to label_alphabet.get_index.
    :return: (texts, ids) where texts[i] == [seg_list, char_list, label]
             and ids[i] == [word_id_list, char_id_list, label_id].
    """
    texts, ids = [], []
    for seg_list, label in zip(seg_lists, labels):
        label_id = self.label_alphabet.get_index(label, mode)
        word_id_list, char_list, char_id_list = [], [], []
        for word in seg_list:
            word_id_list.append(
                self.word_alphabet.get_index(normalize_word(word)))
            if self.specific_word(word):
                # Specific words are kept whole instead of being split
                # into individual characters.
                chars = [word]
                char_ids = [
                    self.char_alphabet.get_index(normalize_word(word))
                ]
            else:
                chars = list(word)
                char_ids = [
                    self.char_alphabet.get_index(normalize_word(c))
                    for c in word
                ]
            char_list.append(chars)
            char_id_list.append(char_ids)
        texts.append([seg_list, char_list, label])
        ids.append([word_id_list, char_id_list, label_id])
    return texts, ids
def inference(self, text):
    """Embed *text* and retrieve its nearest neighbour via faiss.

    :param text: raw input string.
    :return: (score, similar_text, label). Returns (1, None, None) when
             segmentation yields nothing, (0, None, None) when the faiss
             call fails, otherwise the best match's similarity score, the
             matched training text and its label (None when stored as
             the string 'None').
    """
    texts, ids = [], []
    seg_list = self.data.segment([text])[0]
    seg_list = self.synonyms_replace(seg_list)  # synonym replacement
    if len(seg_list) == 0:
        return 1, None, None
    char_list, char_id_list, word_id_list = [], [], []
    for word in seg_list:
        word_id = self.data.word_alphabet.get_index(normalize_word(word))
        word_id_list.append(word_id)
        chars, char_ids = [], []
        if self.data.specific_word(word):
            # Specific words are embedded whole, not per character.
            chars.append(word)
            char_ids.append(
                self.data.char_alphabet.get_index(normalize_word(word)))
        else:
            for char in word:
                chars.append(char)
                char_ids.append(
                    self.data.char_alphabet.get_index(normalize_word(char)))
        char_list.append(chars)
        char_id_list.append(char_ids)
    texts.append([seg_list, char_list])
    ids.append([word_id_list, char_id_list])
    batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, \
        batch_charrecover, mask = predict_batchfy_classification_with_label(
            ids, self.model.configs['gpu'], if_train=False)
    pred_represent = self.model(batch_word, batch_wordlen, batch_char,
                                batch_charlen, batch_charrecover, mask)
    # NOTE(review): .data.numpy() assumes the tensor lives on CPU — confirm
    # against the deployment config.
    pred_represent = pred_represent.data.numpy()
    # numpy replacement for faiss.normalize_L2: scale each ROW to unit L2
    # norm. BUG FIX: np.linalg.norm(x, ord=2) on a 2-D array is the
    # *matrix* 2-norm (largest singular value), not per-row vector norms;
    # it only coincided for the single-row batch built above. axis=1 with
    # keepdims gives identical output for batch size 1 and correct output
    # for any batch size.
    pred_represent = pred_represent / np.linalg.norm(
        pred_represent, axis=1, keepdims=True)
    pred_represent = pred_represent.tolist()[0]
    faiss_start = datetime.datetime.now()
    D, I = self.search(self.stub, pred_represent)
    logger.info('Faiss search costs: %s' %
                (datetime.datetime.now() - faiss_start).total_seconds())
    if D > 0 and I > 0:
        max_id = I[0][0]
        max_score = D
        max_similar_text = self.train_texts[max_id]
        pred_text = ''.join(max_similar_text[0])
        pred_label = max_similar_text[-1]
        if pred_label == 'None':
            pred_label = None
        return max_score, pred_text, pred_label
    else:
        # If the faiss call fails, return the default score and label.
        return 0, None, None
def inference(self, text, text_list, label_list):
    """Return the candidate scene text most similar to *text*.

    :param text: raw user input string.
    :param text_list: candidate scene texts.
    :param label_list: labels parallel to text_list.
    :return: (max_score, pred_text, pred_label), or (None, None, None)
             when segmentation of *text* produces no words.
    """
    scene_texts, scene_ids = self.data.read_scene_text_list(
        text_list, label_list)
    self.data.scene_texts, self.data.scene_ids = scene_texts, scene_ids
    self.scene_texts, scene_represents, scene_label_ids = get_represents(
        self.data, self.model, 'scene', self.model.configs)

    # Featurize the incoming user text.
    seg_list = self.data.segment([text])[0]
    if not seg_list:
        return None, None, None
    word_id_list, char_list, char_id_list = [], [], []
    for word in seg_list:
        word_id_list.append(
            self.data.word_alphabet.get_index(normalize_word(word)))
        if self.data.specific_word(word):
            # Specific words stay whole rather than being split to chars.
            chars = [word]
            char_ids = [
                self.data.char_alphabet.get_index(normalize_word(word))
            ]
        else:
            chars = list(word)
            char_ids = [
                self.data.char_alphabet.get_index(normalize_word(c))
                for c in word
            ]
        char_list.append(chars)
        char_id_list.append(char_ids)
    ids = [[word_id_list, char_id_list]]

    batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, \
        batch_charrecover, mask = predict_batchfy_classification_with_label(
            ids, self.model.configs['gpu'], if_train=False)
    pred_represent = self.model(batch_word, batch_wordlen, batch_char,
                                batch_charlen, batch_charrecover, mask)

    max_score, max_similar_text = self.cal_similarity(
        pred_represent, scene_represents)
    pred_text = ''.join(max_similar_text[0])
    pred_label = max_similar_text[-1]
    if pred_label == 'None':
        pred_label = None
    # confidence, closest text, closest label
    return max_score, pred_text, pred_label
def read_scene_text_list(self, text_list):
    """Split each sentence into characters and map them to char-alphabet ids.

    :param text_list: iterable of sentence strings.
    :return: (chars, ids) — one character list and one matching id list
             per sentence.
    """
    all_chars, all_ids = [], []
    for sentence in text_list:
        all_chars.append(list(sentence))
        all_ids.append([
            self.char_alphabet.get_index(normalize_word(ch))
            for ch in sentence
        ])
    return all_chars, all_ids
def build_alphabet(self):
    """Populate the word/char/label alphabets from the training data.

    Iterates self.seg_lists / self.labels, registers every normalized
    word, character and label, records the alphabet sizes and then calls
    fix_alphabet() so later get_index lookups cannot grow the alphabets.

    :return: per-sentence character lists (specific words kept whole,
             characters stored un-normalized).
    """
    char_lists = []
    for seg_list, label in zip(self.seg_lists, self.labels):
        char_list = []
        for word in seg_list:
            self.word_alphabet.add(normalize_word(word))
            if self.specific_word(word):
                # BUG FIX: register the *normalized* form. All lookup
                # sites (read_instance / inference) call
                # char_alphabet.get_index(normalize_word(word)), so
                # adding the raw word here left normalized variants
                # unresolvable. The raw word is still kept in char_list,
                # mirroring the non-specific branch below.
                self.char_alphabet.add(normalize_word(word))
                char_list.append(word)
            else:
                for char in word:
                    char_list.append(char)
                    self.char_alphabet.add(normalize_word(char))
        char_lists.append(char_list)
        self.label_alphabet.add(label)
    self.char_alphabet_size = self.char_alphabet.size()
    self.word_alphabet_size = self.word_alphabet.size()
    self.label_alphabet_size = self.label_alphabet.size()
    self.fix_alphabet()
    return char_lists
def inference_for_scene_with_glove(self, text, text_list, label_list):
    """Match *text* against scene texts using weighted glove representations.

    :param text: raw user input string.
    :param text_list: candidate scene texts.
    :param label_list: labels parallel to text_list.
    :return: (max_score, pred_text, pred_label); (1, None, None) when
             *text* is empty.
    """
    # Preprocess the candidate scene texts into chars and char ids.
    scene_chars, scene_ids = self.data.read_scene_text_list(text_list)
    # Per-character weights, then the weighted sentence representations.
    sen_weights = self.cal_char_weight(scene_chars, scene_ids)
    scene_represents = self.cal_scene_represents(scene_ids, sen_weights)

    # Featurize the current input text character by character.
    chars = list(text)
    if not chars:
        return 1, None, None
    ids = [
        self.data.char_alphabet.get_index(normalize_word(ch)) for ch in text
    ]
    input_weights = self.cal_char_weight([chars], [ids])
    pred_represent = self.cal_scene_represents([ids], input_weights)

    max_score, pred_text, pred_label = self.cal_similarity(
        pred_represent, scene_represents, text_list, label_list)
    if pred_label == 'None':
        pred_label = None
    # confidence, closest text, closest label
    return max_score, pred_text, pred_label