def add_alphabet(self, input_file):
    with open(input_file, 'r') as rf:
        for i, line in enumerate(rf):
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                # words made of digits are normalized to '0'
                word = normalize_word(word)
                feat = pairs[1].replace('[Lexi]', '')
                label = pairs[-1]
                self.words.append(word)
                self.feats.append(feat)
                self.labels.append(label)
                for char in word:
                    self.chars.append(char)
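
# The functions in this file rely on a `normalize_word` helper that is not shown
# here. A minimal sketch, assuming the common NCRF++-style convention of mapping
# every digit character to '0' (the actual helper may differ):
def normalize_word(word):
    """Replace every digit in `word` with '0' so all numbers share one form."""
    return ''.join('0' if ch.isdigit() else ch for ch in word)
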
def build_alphabet(self, alphabet_path):
    for line in self.corpus:
        line = ast.literal_eval(line)
        char, char_label, seg_list, intent = line['char'], line['char_label'], line['word'], line['intent']
        # lexicon: one flag per lexicon tree, collapsed into a single one-hot string
        for word in seg_list:
            lexi_feat = []
            for lexi_type, lb in self.trees.lexi_trees.items():
                lexi_feat.append(lb.search(word))
            for n in range(len(lexi_feat)):
                if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                    lexi_feat[n] = 0
                else:
                    lexi_feat[n] = 1
            # abstract the one-hot string into a single character
            lexi_feat = ''.join([str(i) for i in lexi_feat])
            self.char_alphabet.add(lexi_feat)
        # char
        for c in char:
            self.char_alphabet.add(normalize_word(c))
        # intent
        self.intent_alphabet.add(intent)
        # label
        for label in char_label:
            self.label_alphabet.add(label)
    # alphabet sizes
    self.char_alphabet_size = self.char_alphabet.size()
    self.intent_alphabet_size = self.intent_alphabet.size()
    self.label_alphabet_size = self.label_alphabet.size()
    # close the alphabets
    self.fix_alphabet()
    # write the alphabets
    if not os.path.exists(alphabet_path):
        with open(alphabet_path, 'wb') as wbf:
            pickle.dump(self.char_alphabet.instance2index, wbf)
            pickle.dump(self.intent_alphabet.instance2index, wbf)
            pickle.dump(self.label_alphabet.instance2index, wbf)
            pickle.dump(self.label_alphabet.instances, wbf)
            pickle.dump(self.char_alphabet_size, wbf)
            pickle.dump(self.intent_alphabet_size, wbf)
            pickle.dump(self.label_alphabet_size, wbf)
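
# build_alphabet serializes the alphabets as consecutive pickle.dump calls, so
# they must be read back in exactly the same order. A hedged sketch of a matching
# loader (the method name `load_alphabet` is illustrative, not from the source):
def load_alphabet(self, alphabet_path):
    with open(alphabet_path, 'rb') as rbf:
        self.char_alphabet.instance2index = pickle.load(rbf)
        self.intent_alphabet.instance2index = pickle.load(rbf)
        self.label_alphabet.instance2index = pickle.load(rbf)
        self.label_alphabet.instances = pickle.load(rbf)
        self.char_alphabet_size = pickle.load(rbf)
        self.intent_alphabet_size = pickle.load(rbf)
        self.label_alphabet_size = pickle.load(rbf)
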
def read_instance(self, input_file):
    chars, words, feats, labels = [], [], [], []
    char_ids, word_ids, feat_ids, label_ids = [], [], [], []
    # the dataset split is taken from the file-name prefix: train / dev / test
    name = input_file.split('/')[-1].split('_')[0]
    with open(input_file, 'r') as rf:
        for line in rf:
            if len(line) > 3:
                pairs = line.strip().split()
                word = pairs[0]
                # words made of digits are normalized to '0'
                word = normalize_word(word)
                feat = pairs[1].split(']', 1)[-1]
                label = pairs[-1]
                words.append(word)
                feats.append(feat)
                labels.append(label)
                word_id = self.word_alphabet[word]
                feat_id = self.feat_alphabet[feat]
                label_id = self.label_alphabet[label]
                word_ids.append(word_id)
                feat_ids.append(feat_id)
                label_ids.append(label_id)
                char_list = []
                char_id = []
                for char in word:
                    char_list.append(char)
                    char_id.append(self.char_alphabet[char])
                chars.append(char_list)
                char_ids.append(char_id)
            else:
                if name == 'train' and len(words) > 0 and (len(words) < config.max_sent_length):
                    self.train_texts.append([chars, words, feats, labels])
                    self.train_ids.append([char_ids, word_ids, feat_ids, label_ids])
                elif name == 'dev' and len(words) > 0 and (len(words) < config.max_sent_length):
                    self.dev_texts.append([chars, words, feats, labels])
                    self.dev_ids.append([char_ids, word_ids, feat_ids, label_ids])
                elif name == 'test' and len(words) > 0 and (len(words) < config.max_sent_length):
                    self.test_texts.append([chars, words, feats, labels])
                    self.test_ids.append([char_ids, word_ids, feat_ids, label_ids])
                chars, words, feats, labels = [], [], [], []
                char_ids, word_ids, feat_ids, label_ids = [], [], [], []
    # do not drop the last sentence if the file does not end with a blank line
    if len(words) > 0 and (len(words) < config.max_sent_length):
        if name == 'train':
            self.train_texts.append([chars, words, feats, labels])
            self.train_ids.append([char_ids, word_ids, feat_ids, label_ids])
        elif name == 'dev':
            self.dev_texts.append([chars, words, feats, labels])
            self.dev_ids.append([char_ids, word_ids, feat_ids, label_ids])
        elif name == 'test':
            self.test_texts.append([chars, words, feats, labels])
            self.test_ids.append([char_ids, word_ids, feat_ids, label_ids])
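
# read_instance infers the dataset split from the file-name prefix before the
# first underscore ('train', 'dev' or 'test') and expects a whitespace-separated,
# CoNLL-style file: word first, a bracketed feature column such as "[Lexi]xxx",
# the label last, and a blank line between sentences. A hedged usage sketch with
# illustrative paths (not taken from the source):
#
#   data.read_instance('corpus/train_slot.txt')  # fills self.train_texts / self.train_ids
#   data.read_instance('corpus/dev_slot.txt')    # fills self.dev_texts / self.dev_ids
#   data.read_instance('corpus/test_slot.txt')   # fills self.test_texts / self.test_ids
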
def getRelationInstance(tokens, entities, relations, names, data):
    X = []
    Y = []
    cnt_neg = 0

    for i in tqdm(range(len(relations))):
        doc_relation = relations[i]
        doc_token = tokens[i]
        doc_entity = entities[i]  # entities are sorted by start offset
        doc_name = names[i]

        row_num = doc_entity.shape[0]

        for latter_idx in range(row_num):
            for former_idx in range(row_num):
                if former_idx < latter_idx:
                    former = doc_entity.iloc[former_idx]
                    latter = doc_entity.iloc[latter_idx]

                    if former['text'] == latter['text']:
                        continue

                    gold_relations = doc_relation[(
                        ((doc_relation['entity1_text'] == former['text']) &
                         (doc_relation['entity2_text'] == latter['text'])) |
                        ((doc_relation['entity1_text'] == latter['text']) &
                         (doc_relation['entity2_text'] == former['text'])))]
                    # if gold_relations.shape[0] == 0:
                    #     raise RuntimeError("{}: entity {} and {} has strange relations".format(doc_name, former['id'], latter['id']))

                    context_token = doc_token
                    former_tf_start, former_tf_end = former['tf_start'], former['tf_end']
                    latter_tf_start, latter_tf_end = latter['tf_start'], latter['tf_end']

                    if context_token.shape[0] > data.max_seq_len:
                        # truncate
                        logging.debug("exceed max_seq_len {} {}".format(doc_name, context_token.shape[0]))
                        context_token = context_token.iloc[:data.max_seq_len]

                    words = []
                    postags = []
                    cap = []
                    chars = []
                    positions1 = []
                    positions2 = []
                    former_token = []
                    latter_token = []
                    tok_idx = 0  # token position inside the (possibly truncated) context
                    for _, token in context_token.iterrows():
                        if data.number_normalized:
                            word = normalize_word(token['text'])
                        else:
                            word = token['text']
                        entity_word = my_utils1.normalizeWord(token['text'])
                        words.append(data.word_alphabet.get_index(word))
                        postags.append(data.feature_alphabets[data.feature_name2id['[POS]']].get_index(token['postag']))
                        cap.append(data.feature_alphabets[data.feature_name2id['[Cap]']].get_index(
                            str(my_utils.featureCapital(token['text']))))
                        char_for1word = []
                        for char in word:
                            char_for1word.append(data.char_alphabet.get_index(char))
                        chars.append(char_for1word)

                        # relative position of the token w.r.t. the former entity
                        if tok_idx < former_tf_start:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(former_tf_start - tok_idx))
                        elif tok_idx > former_tf_end:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(former_tf_end - tok_idx))
                        else:
                            positions1.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(0))
                            former_token.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        # relative position of the token w.r.t. the latter entity
                        if tok_idx < latter_tf_start:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(latter_tf_start - tok_idx))
                        elif tok_idx > latter_tf_end:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(latter_tf_end - tok_idx))
                        else:
                            positions2.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[POSITION]']].get_index(0))
                            latter_token.append(data.re_feature_alphabets[
                                data.re_feature_name2id['[ENTITY]']].get_index(entity_word))

                        tok_idx += 1

                    if len(former_token) == 0:
                        # the truncated part contains the entity, so fall back to the text in doc_entity
                        splitted = my_utils.my_tokenize(former['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                former_token.append(data.re_feature_alphabets[
                                    data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))
                    if len(latter_token) == 0:
                        splitted = my_utils.my_tokenize(latter['text'])
                        for s in splitted:
                            s = s.strip()
                            if s != "":
                                latter_token.append(data.re_feature_alphabets[
                                    data.re_feature_name2id['[ENTITY]']].get_index(my_utils1.normalizeWord(s)))

                    assert len(former_token) > 0
                    assert len(latter_token) > 0

                    features = {
                        'tokens': words,
                        'postag': postags,
                        'cap': cap,
                        'char': chars,
                        'positions1': positions1,
                        'positions2': positions2
                    }
                    features['e1_type'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_TYPE]']].get_index(former['type'])
                    features['e2_type'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_TYPE]']].get_index(latter['type'])
                    features['e1_token'] = former_token
                    features['e2_token'] = latter_token

                    features['tok_num_betw'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[TOKEN_NUM]']].get_index(latter['tf_start'] - former['tf_end'])

                    entity_between = doc_entity[(
                        (doc_entity['start'] >= former['end']) & (doc_entity['end'] <= latter['start']))]
                    features['et_num'] = data.re_feature_alphabets[
                        data.re_feature_name2id['[ENTITY_NUM]']].get_index(entity_between.shape[0])

                    X.append(features)

                    # the first non-'None' gold relation is the answer; otherwise '</unk>' (negative instance)
                    gold_answer = '</unk>'
                    for _, gold_relation in gold_relations.iterrows():
                        if gold_relation['type'] != 'None':
                            gold_answer = gold_relation['type']
                            break

                    Y.append(data.re_feature_alphabets[
                        data.re_feature_name2id['[RELATION]']].get_index(gold_answer))
                    if gold_answer == '</unk>':
                        cnt_neg += 1

                    # if gold_relations.iloc[0]['type']=='None' and gold_relations.iloc[1]['type']=='None':
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index('</unk>'))
                    #     cnt_neg += 1
                    # else:
                    #     gold_answer = gold_relations.iloc[0]['type'] if gold_relations.iloc[0]['type']!='None' else gold_relations.iloc[1]['type']
                    #     Y.append(data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']].get_index(gold_answer))

    neg = 100.0 * cnt_neg / len(Y)
    logging.info("positive instance {}%, negative instance {}%".format(100 - neg, neg))
    return X, Y
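
# A hedged helper (not part of the source) that isolates the relative-position
# logic used for positions1/positions2 above, handy for checking the feature by hand:
def relative_positions(num_tokens, tf_start, tf_end):
    """Positive offsets before the entity, 0 inside it, negative after it."""
    offsets = []
    for t in range(num_tokens):
        if t < tf_start:
            offsets.append(tf_start - t)
        elif t > tf_end:
            offsets.append(tf_end - t)
        else:
            offsets.append(0)
    return offsets

# relative_positions(7, 3, 4) == [3, 2, 1, 0, 0, -1, -2]
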
def read_instanceFromBuffer(in_lines, word_alphabet, char_alphabet, feature_alphabets, label_alphabet,
                            number_normalized, max_sent_length,
                            char_padding_size=-1, char_padding_symbol='</pad>'):
    feature_num = len(feature_alphabets)
    instance_texts = []
    instance_Ids = []
    words = []
    features = []
    chars = []
    labels = []
    word_Ids = []
    feature_Ids = []
    char_Ids = []
    label_Ids = []
    for line in in_lines:
        if len(line) > 2:
            pairs = line.strip().split()
            word = pairs[0]  # already a str under Python 3, no decoding needed
            if number_normalized:
                word = normalize_word(word)
            label = pairs[-1]
            words.append(word)
            labels.append(label)
            word_Ids.append(word_alphabet.get_index(word))
            label_Ids.append(label_alphabet.get_index(label))
            # get features
            feat_list = []
            feat_Id = []
            for idx in range(feature_num):
                feat_idx = pairs[idx + 1].split(']', 1)[-1]
                feat_list.append(feat_idx)
                feat_Id.append(feature_alphabets[idx].get_index(feat_idx))
            features.append(feat_list)
            feature_Ids.append(feat_Id)
            # get chars
            char_list = []
            char_Id = []
            for char in word:
                char_list.append(char)
            if char_padding_size > 0:
                char_number = len(char_list)
                if char_number < char_padding_size:
                    char_list = char_list + [char_padding_symbol] * (char_padding_size - char_number)
                assert len(char_list) == char_padding_size
            else:
                # no padding
                pass
            for char in char_list:
                char_Id.append(char_alphabet.get_index(char))
            chars.append(char_list)
            char_Ids.append(char_Id)
        else:
            if (max_sent_length < 0) or (len(words) < max_sent_length):
                instance_texts.append([words, features, chars, labels])
                instance_Ids.append([word_Ids, feature_Ids, char_Ids, label_Ids])
            words = []
            features = []
            chars = []
            labels = []
            word_Ids = []
            feature_Ids = []
            char_Ids = []
            label_Ids = []
    return instance_texts, instance_Ids
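
# Worked example of the char padding above: with char_padding_size=5 and the
# word "cat", char_list becomes ['c', 'a', 't', '</pad>', '</pad>'] before it is
# mapped through char_alphabet; with the default char_padding_size=-1 the list
# is left unpadded.
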
def inference(self, text, intent, session_keep, previous_intent, trace_id=''):
    """
    :param text:
    :param intent: intent of the current turn
    :param session_keep:
    :param previous_intent: intent of the previous turn
    :param trace_id:
    :return:
    """
    # in a multi-turn session, fall back to the previous turn's intent when the current one is empty
    if session_keep and intent is None:
        intent = previous_intent
    if intent is None:
        # 'None' is stored as a str in label_alphabet
        intent = 'None'
    instance, instance_ids = [], []
    # process the current utterance
    new_char, seq_char, seq_char_id_list, seq_label, seq_label_id_list = [], [], [], [], []
    char, seg_list = list(text), self.process(self.stub, text, trace_id)
    # lexicon features in one-hot form
    lexicons = []
    # word level: character index spans for each segmented word
    word_indices = []
    start = 0
    for word in seg_list:
        end = start + len(word)
        lexi_feat = []
        for lexi_type, lb in self.data.trees.lexi_trees.items():
            lexi_feat.append(lb.search(word))
        for n in range(len(lexi_feat)):
            if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                lexi_feat[n] = 0
            else:
                lexi_feat[n] = 1
        lexi_feat = ''.join([str(i) for i in lexi_feat])
        lexicons.append(lexi_feat)
        word_indices.append([start, end])
        # char level
        if '1' in lexi_feat:
            # e.g. '0010000': the word hits a lexicon, use the one-hot string as a single char
            seq_char.append(lexi_feat)
            seq_char_id_list.append(self.data.char_alphabet.get_index(lexi_feat))
            new_char.append(''.join(char[start:end]))
        else:
            # e.g. '0000000': no lexicon hit, keep the raw characters
            for c in word:
                seq_char.append(c)
                seq_char_id_list.append(self.data.char_alphabet.get_index(normalize_word(c)))
                new_char.append(c)
        start = end
    # intent
    intent_id = self.data.intent_alphabet.get_index(intent)
    instance.append([seq_char, [intent]])
    instance_ids.append([seq_char_id_list, [intent_id]])
    # batch the single instance
    batch_char, batch_intent, batch_char_len, mask, batch_char_recover, _ = \
        batch_char_sequence_labeling_process(instance_ids, self.gpu, self.char_max_length, False)
    tag_seq = self.model(batch_char, batch_intent, batch_char_len, mask)
    # recover label strings from tag ids
    pred_result = self.predict_recover_label(tag_seq, mask, self.data.label_alphabet)
    pred_result = list(np.array(pred_result).reshape(len(seq_char), ))
    result = self.slot_concat(new_char, pred_result)
    return result
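
# A hedged usage sketch (the wrapper class name, intent values and text are
# illustrative only, not from the source):
#
#   predictor = SlotPredictor(...)  # hypothetical object holding model, data and stub
#   result = predictor.inference('打开卧室的灯', intent=None, session_keep=True,
#                                previous_intent='open_device')
#   # the current intent is empty, so the previous turn's intent is reused
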
def read_instance(self):
    """
    Read the full corpus here without truncation; truncation is done in functions.py.
    :return:
    """
    texts, ids = [], []
    for idx, line in enumerate(self.corpus):
        line = ast.literal_eval(line)
        intent_id_list = []
        # a word whose lexicon feature is e.g. '0010000' is merged into a single label
        seq_char, seq_char_id_list, seq_label, seq_label_id_list = [], [], [], []
        char, char_label, seg_list, intent = line['char'], line['char_label'], line['word'], line['intent']
        # lexicon features in one-hot form
        lexicons = []
        # character index spans for each segmented word
        word_indices = []
        start = 0
        flag = True  # set to False to skip the rest of the current corpus line
        for word in seg_list:
            if flag is True:
                end = start + len(word)
                lexi_feat = []
                for lexi_type, lb in self.trees.lexi_trees.items():
                    lexi_feat.append(lb.search(word))
                for n in range(len(lexi_feat)):
                    if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                        lexi_feat[n] = 0
                    else:
                        lexi_feat[n] = 1
                lexi_feat = ''.join([str(i) for i in lexi_feat])
                lexicons.append(lexi_feat)
                word_indices.append([start, end])
                # char level
                if '1' in lexi_feat:
                    # e.g. '0010000': the word hits a lexicon
                    seq_char.append(lexi_feat)
                    seq_char_id_list.append(self.char_alphabet.get_index(lexi_feat))
                    # e.g. ["B-room", "I-room", "I-room"]
                    specific_word_label = char_label[start:end]
                    tmp_label = [swl.split('-')[-1] for swl in specific_word_label]
                    if len(set(tmp_label)) > 1:
                        # the word spans more than one label type, so filter out this line
                        print('Be filtered: %s' % line['text'], word, tmp_label)
                        flag = False
                    else:
                        assert len(set(tmp_label)) == 1
                        if tmp_label[0] == 'O':
                            tmp_label = 'O'
                        else:
                            tmp_label = 'B' + '-' + tmp_label[0]
                        seq_label += [tmp_label]
                        seq_label_id_list += [self.label_alphabet.get_index(tmp_label)]
                else:
                    # e.g. '0000000': no lexicon hit, keep the raw characters and labels
                    for c in word:
                        seq_char.append(c)
                        seq_char_id_list.append(self.char_alphabet.get_index(normalize_word(c)))
                    seq_label += char_label[start:end]
                    seq_label_id_list += [self.label_alphabet.get_index(cl) for cl in char_label[start:end]]
                start = end
            else:
                break  # move on to the next corpus line
        intent_id_list.append(self.intent_alphabet.get_index(intent))
        if idx % 10000 == 0:
            logger.info('read instance : %s' % idx)
        if flag is True:
            # text, char, intent, sequence_label
            texts.append([line['text'], seq_char, intent, seq_label])
            ids.append([seq_char_id_list, intent_id_list, seq_label_id_list])
    # save the corpus in its new form for easier debugging
    output_path = self.data_config['data']['output']
    with open(output_path, 'w') as wf:
        for text in texts:
            line_data = dict()
            line_data['text'] = text[0]
            line_data['char'] = text[1]
            line_data['intent'] = text[2]
            line_data['char_label'] = text[-1]
            wf.write(json.dumps(line_data, ensure_ascii=False) + '\n')
    return texts, ids
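
# Each line written to output_path is a JSON object; a hedged example (the text,
# intent and non-"B-room" labels are illustrative, and '0010000' stands for a
# word that was merged into its lexicon one-hot string):
#
#   {"text": "打开卧室的灯", "char": ["打", "开", "0010000", "的", "灯"],
#    "intent": "open_device", "char_label": ["O", "O", "B-room", "O", "O"]}
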