def create_examples(self, lines, example_type, cached_file):
    """Build one ``InputExample`` per record, caching the list on disk.

    When ``cached_file`` already exists its content is loaded with
    ``torch.load`` and returned without reading ``lines``. Otherwise the
    examples are built, written to ``cached_file`` with ``torch.save`` and
    returned.

    Args:
        lines: Sized iterable of records indexable by the keys ``'HADM_ID'``,
            ``'token'``, ``'tags'``, ``'code'`` and ``'relations'``.
        example_type: Text used as the guid prefix and in the progress-bar
            description.
        cached_file: Object exposing ``exists()`` that ``torch.load`` and
            ``torch.save`` accept as a location.

    Returns:
        The list of examples, either loaded from the cache or freshly built.
    """
    if cached_file.exists():
        tools.logger.info("Loading samples from cached files %s", cached_file)
        return torch.load(cached_file)

    pbar = progressbar.ProgressBar(
        n_total=len(lines), desc=f'create {example_type} samples')
    examples = []
    for i, line in enumerate(lines):
        hadm_id = line['HADM_ID']
        guid = '%s-%s-%d' % (example_type, hadm_id, i)
        # Float tokens (presumably NaN left by missing values -- confirm
        # against the data loader) are blanked out. The placeholder must be
        # the empty string: ``create_features`` recovers the tokens with
        # ``text_a.split(' ')``, and a ``' '`` placeholder would come back as
        # two empty fields, shifting every later token by one against its
        # tag and code. ``isinstance`` also covers float subclasses.
        sentence = ['' if isinstance(t, float) else t for t in line['token']]
        # text_a is the untokenized text of the (single) sequence; text_b is
        # not used for this task.
        examples.append(
            InputExample(guid=guid,
                         text_a=' '.join(sentence),
                         text_b=None,
                         label=line['tags'],
                         code=line['code'],  # brat entity codes (T1, T2, ...)
                         relations=line['relations'],  # gold brat relations
                         hadm_id=hadm_id))
        pbar(step=i)
    tools.logger.info("Saving examples into cached file %s", cached_file)
    torch.save(examples, cached_file)
    return examples
def train_epoch(self, data_loader):
    """Train the model for one pass over ``data_loader``.

    Each batch is moved to ``self.device`` and run through
    ``self.model.forward_loss``. The loss is averaged when ``self.n_gpu``
    names two or more devices and divided by
    ``self.gradient_accumulation_steps`` when that is greater than one.
    Gradients are clipped to ``self.grad_clip`` after every backward pass and
    the optimizer steps once every ``self.gradient_accumulation_steps``
    batches, which also increments ``self.global_step``.

    Args:
        data_loader: Sized iterable yielding 5-tuples of tensors
            ``(input_ids, input_mask, segment_ids, label_ids, input_lens)``.

    Returns:
        ``{'loss': <running average of the per-batch loss>}``.
    """
    progress = progressbar.ProgressBar(n_total=len(data_loader), desc='Training')
    loss_meter = tools.AverageMeter()
    for step, batch in enumerate(data_loader):
        self.model.train()
        on_device = tuple(tensor.to(self.device) for tensor in batch)
        input_ids, input_mask, segment_ids, label_ids, input_lens = on_device
        input_lens = input_lens.cpu().detach().numpy().tolist()
        _, loss = self.model.forward_loss(input_ids, segment_ids, input_mask,
                                          label_ids, input_lens)
        if len(self.n_gpu.split(',')) >= 2:
            loss = loss.mean()
        if self.gradient_accumulation_steps > 1:
            loss = loss / self.gradient_accumulation_steps
        # NOTE: with ``self.fp16`` set, no backward pass and no clipping is
        # performed here; only the full-precision path is implemented.
        if not self.fp16:
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.grad_clip)
        if (step + 1) % self.gradient_accumulation_steps == 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.global_step += 1
        step_loss = loss.item()
        loss_meter.update(step_loss, n=1)
        progress(step=step, info={'loss': step_loss})
    if 'cuda' in str(self.device):
        torch.cuda.empty_cache()
    return {'loss': loss_meter.avg}
def valid_epoch(self, data_loader):
    """Evaluate the model on ``data_loader`` and score predicted entities.

    ``self.entity_score`` is reset first. For each sequence, the gold labels
    and predicted tags between position 0 and the first ``[SEP]`` label are
    passed to ``self.entity_score.update``; a sequence with no ``[SEP]``
    label is not scored.

    Args:
        data_loader: Sized iterable yielding 5-tuples of tensors
            ``(input_ids, input_mask, segment_ids, label_ids, input_lens)``.

    Returns:
        ``(info, class_info)``. ``info`` holds each entry of the first value
        returned by ``self.entity_score.result()`` under a ``valid_`` prefix,
        plus ``valid_loss``; ``class_info`` is the second value returned by
        ``self.entity_score.result()``.
    """
    progress = progressbar.ProgressBar(n_total=len(data_loader), desc='Evaluating')
    self.entity_score.reset()
    loss_meter = tools.AverageMeter()
    for step, batch in enumerate(data_loader):
        on_device = tuple(tensor.to(self.device) for tensor in batch)
        input_ids, input_mask, segment_ids, label_ids, input_lens = on_device
        input_lens = input_lens.cpu().detach().numpy().tolist()
        self.model.eval()
        with torch.no_grad():
            features, loss = self.model.forward_loss(
                input_ids, segment_ids, input_mask, label_ids, input_lens)
            tags, _ = self.model.crf._obtain_labels(
                features, self.id2label, input_lens)
        batch_loss = loss.item()
        loss_meter.update(val=batch_loss, n=input_ids.size(0))
        print('tags[0]:', str(tags[0]))
        progress(step=step, info={'loss': batch_loss})

        gold_rows = label_ids.to('cpu').numpy().tolist()
        for row, gold_ids in enumerate(gold_rows):
            gold_path = []
            pred_path = []
            # Position 0 is skipped; scanning stops at the first [SEP].
            for col in range(1, len(gold_ids)):
                gold_id = gold_ids[col]
                if gold_id == self.label2id['[SEP]']:
                    self.entity_score.update(pred_paths=[pred_path],
                                             label_paths=[gold_path])
                    break
                gold_path.append(self.id2label[gold_id])
                pred_path.append(tags[row][col])

    valid_info, class_info = self.entity_score.result()
    info = {}
    for key, value in valid_info.items():
        info[f'valid_{key}'] = value
    info['valid_loss'] = loss_meter.avg
    if 'cuda' in str(self.device):
        torch.cuda.empty_cache()
    return info, class_info
def create_features(self, examples, max_seq_len, cached_file):
    """Convert examples into padded word-piece features, cached on disk.

    When ``cached_file`` already exists its content is loaded with
    ``torch.load`` and returned. Otherwise every example is tokenized with
    ``self.tokenizer``, wrapped in ``[CLS]`` / ``[SEP]``, truncated and
    zero-padded to ``max_seq_len``, and the resulting list is written to
    ``cached_file`` with ``torch.save``.

    Label handling per source token: the first word piece keeps the token's
    label; later pieces keep ``'O'`` or switch to the ``I-`` form of the
    label. Tokens equal to ``'<CRLF>'`` are dropped together with their
    label and code.

    Args:
        examples: Sized iterable of objects exposing ``guid``, ``hadm_id``,
            ``text_a``, ``label``, ``code`` and ``relations``.
        max_seq_len: Length every id/mask/label/code list is brought to.
        cached_file: Object exposing ``exists()`` that ``torch.load`` and
            ``torch.save`` accept as a location.

    Returns:
        The list of ``InputFeature`` objects. ``new_tokens`` is stored
        unpadded, and ``input_len`` is the length before padding.
        ``code_position`` maps ``(first, last)`` positions of each non-``'0'``
        code to that code, with positions counted without the leading
        ``[CLS]``.
    """
    if cached_file.exists():
        tools.logger.info('Loading features from cached file %s', cached_file)
        return torch.load(cached_file)

    label2id = {label: i for i, label in enumerate(self.get_labels())}
    pbar = progressbar.ProgressBar(
        n_total=len(examples),
        desc='creating the specified features of examples')
    features = []
    for example_id, example in enumerate(examples):
        hamd_id = example.hadm_id
        text_list = example.text_a.split(' ')
        label_list = example.label
        code_list = example.code
        relation_list = example.relations

        new_tokens = ['[CLS]']
        new_segment_ids = [0]
        new_label_ids = [label2id['[CLS]']]
        new_code = ['0']
        for text, label, code in zip(text_list, label_list, code_list):
            if text == '<CRLF>':
                continue
            for idx, token in enumerate(self.tokenizer.tokenize(text)):
                new_tokens.append(token)
                new_segment_ids.append(0)
                if idx == 0 or label == 'O':
                    new_label_ids.append(label2id[label])
                else:
                    # Continuation pieces of an entity token get the I- label.
                    new_label_ids.append(label2id['I-' + label.split('-')[1]])
                new_code.append(code)
        assert len(new_tokens) == len(new_segment_ids)
        assert len(new_tokens) == len(new_label_ids)
        assert len(new_tokens) == len(new_code)

        # Leave room for the trailing [SEP].
        if len(new_tokens) >= max_seq_len:
            keep = max_seq_len - 1
            new_tokens = new_tokens[:keep]
            new_segment_ids = new_segment_ids[:keep]
            new_label_ids = new_label_ids[:keep]
            new_code = new_code[:keep]
        new_tokens.append('[SEP]')
        new_segment_ids.append(0)
        new_label_ids.append(label2id['[SEP]'])
        new_code.append('0')

        input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
        input_mask = [1] * len(input_ids)
        input_len = len(new_label_ids)
        if len(input_ids) < max_seq_len:
            pad_zero = [0] * (max_seq_len - len(input_ids))
            input_ids.extend(pad_zero)
            input_mask.extend(pad_zero)
            new_segment_ids.extend(pad_zero)
            new_label_ids.extend(pad_zero)
            new_code.extend(['0'] * len(pad_zero))
        assert len(input_ids) == max_seq_len
        assert len(input_mask) == max_seq_len
        assert len(new_segment_ids) == max_seq_len
        assert len(new_label_ids) == max_seq_len
        assert len(new_code) == max_seq_len

        # First and last position of every entity code. The groups are
        # iterated directly instead of going through ``groupby.apply`` with a
        # function that reads the grouping column, a pattern pandas has
        # deprecated. Group order and handling of missing keys are unchanged.
        df_temp = pd.DataFrame({'input_ids': input_ids, 'code': new_code})
        code_position = {}
        for key, group in df_temp.groupby('code'):
            if key == '0':
                continue
            positions = group.index.tolist()
            # Shift by one so positions are counted without the leading [CLS].
            code_position[(positions[0] - 1, positions[-1] - 1)] = key

        if example_id < 2:
            tools.logger.info('*** Examples: ***')
            tools.logger.info("guid: %s", example.guid)
            tools.logger.info("tokens: %s",
                              " ".join([str(x) for x in new_tokens]))
            tools.logger.info("input_ids: %s",
                              " ".join([str(x) for x in input_ids]))
            tools.logger.info("input_mask: %s",
                              " ".join([str(x) for x in input_mask]))
            tools.logger.info("segment_ids: %s",
                              " ".join([str(x) for x in new_segment_ids]))
            tools.logger.info("old label name: %s ", " ".join(example.label))
            tools.logger.info("new label ids: %s",
                              " ".join([str(x) for x in new_label_ids]))

        features.append(
            InputFeature(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=new_segment_ids,
                label_id=new_label_ids,
                input_len=input_len,
                code=new_code,
                new_tokens=new_tokens,
                relations=relation_list,  # gold standard
                hamd_id=hamd_id,
                code_position=code_position))
        pbar(step=example_id)
    tools.logger.info('Saving features into cached file %s', cached_file)
    torch.save(features, cached_file)
    return features
def run_end2end_realtion_extration(args):
    """Run the NER model on the relation test set and save RE-ready samples.

    For every test feature the model's predicted tags are scored to obtain
    the found entities. Each entity gets the gold code stored at its span in
    ``code_position``, or a fresh ``TFP_<n>`` code when the span matches no
    gold entity. Gold ``Has_Value`` relations are kept and randomly sampled
    ``(e1, e2)`` pairs labelled ``'0'`` are added. Finally word pieces are
    merged back into whole tokens.

    The collected samples (dicts with keys ``HADM_ID``, ``token``, ``tags``,
    ``relations``, ``entities``, ``code``) are written as text to
    ``End2End_Test/re_e2e_test.txt`` and pickled to
    ``End2End_Test/re_e2e_test_by_fold_<fold>.pkl`` under
    ``base.config['data_dir']``.

    Args:
        args: Namespace read for ``model_path``, ``device``, ``data_name``,
            ``eval_max_seq_len`` and ``fold``; ``resume_path`` is set on it.
    """
    from main.common import progressbar
    # NOTE(review): not referenced by this name below; presumably the import
    # is what makes ``common.ner_utils`` resolvable -- confirm before removing.
    from main.common import ner_utils  # noqa: F401
    from main.common.tools import save_pickle
    from random import choice
    import pandas as pd
    import copy

    args.resume_path = args.model_path
    processor = BertProcessor(vocab_path=args.resume_path / 'vocab.txt',
                              do_lower_case=True)
    label_list = processor.get_labels()  # all labels
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    tools.logger.info(f'loading prtrained model from {args.resume_path}')
    model = BERTLSTMCRF.from_pretrained(args.resume_path,
                                        label2id=label2id,
                                        device=args.device)
    model.to(args.device)

    data_dir = base.config['data_dir']
    test_data = processor.get_test(data_dir / 'RE_Test/re_test.pkl')
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_file=data_dir /
        f'End2End_Test/cached_{args.data_name}_e2e_test_examples.pkl')
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_file=data_dir /
        f'End2End_Test/cached_{args.data_name}_e2e_test_features.pkl')

    pbar = progressbar.ProgressBar(
        n_total=len(test_features),
        desc='Testing End2End relation extraction performace')
    entity_score = common.ner_utils.SeqEntityScore(id2label)
    entity_score.reset()
    model.eval()

    ner_output_samples = []
    for step, one_sample in enumerate(test_features):
        # Reset per sample so ``entity_score.founds`` only holds the entities
        # of the current sample.
        entity_score.reset()
        hadm_id = one_sample.hamd_id
        relations = one_sample.relations
        code_position = one_sample.code_position
        new_tokens = one_sample.new_tokens[1:-1]  # drop [CLS] / [SEP]

        # Each sample is run as a batch of one.
        batch = (
            torch.tensor([one_sample.input_ids], dtype=torch.long),
            torch.tensor([one_sample.input_mask], dtype=torch.long),
            torch.tensor([one_sample.segment_ids], dtype=torch.long),
            torch.tensor([one_sample.label_id], dtype=torch.long),
            torch.tensor([one_sample.input_len], dtype=torch.long),
        )
        batch = tuple(t.to(args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, input_lens = batch
        input_lens = input_lens.cpu().detach().numpy().tolist()
        with torch.no_grad():
            features, loss = model.forward_loss(input_ids, segment_ids,
                                                input_mask, label_ids,
                                                input_lens)
            tags, _ = model.crf._obtain_labels(features, id2label, input_lens)
        label_ids = label_ids.to('cpu').numpy().tolist()
        pbar(step=step, info={'loss': loss.item()})

        # Score gold vs. predicted tags between position 0 and [SEP].
        for i, label in enumerate(label_ids):
            gold_path = []
            pred_path = []
            for j in range(1, len(label)):
                if label[j] == label2id['[SEP]']:
                    entity_score.update(pred_paths=[pred_path],
                                        label_paths=[gold_path])
                    break
                gold_path.append(id2label[label[j]])
                pred_path.append(tags[i][j])

        # Gold positive relations.
        relation_pairs_NER = []
        relation_pairs_dict = {}
        count_y1 = 0
        for relation in relations:
            if relation[3] == 'Has_Value':
                relation_pairs_NER.append(relation)
                relation_pairs_dict.setdefault(relation[0],
                                               []).append(relation[1])
                count_y1 += 1

        # Attach a code to every predicted entity.
        found_entities = []
        found_e1_T_code = []
        found_e2_T_code = []
        TFN_idx = 1
        for entity in entity_score.founds:
            e_type, start, end = entity[0], entity[1], entity[2]
            key = (start, end)
            if key in code_position:
                T_code = code_position[key]
            else:
                T_code = 'TFP_' + str(TFN_idx)
                TFN_idx += 1
            found_entities.append((T_code, e_type, start, end))
            if e_type == 'Measurement':
                found_e2_T_code.append(T_code)
            else:
                found_e1_T_code.append(T_code)

        pred_tags = tags[0][1:-1]
        assert len(new_tokens) == len(pred_tags)
        pred_codes = ['0'] * len(new_tokens)
        for T_code, _e_type, start, end in found_entities:
            for idx in range(start, end + 1):
                pred_codes[idx] = T_code

        # Randomly sample negative (e1, e2) pairs. The loop makes
        # ``count_y1 + 2`` attempts and only runs when at least two
        # non-Measurement entities were found.
        b = 1
        count_y0 = 0
        relations_temp = copy.deepcopy(relation_pairs_dict)
        while count_y0 <= (count_y1 + b) and len(found_e1_T_code) > 1:
            e1_random = choice(found_e1_T_code)
            try:
                e1_random_correspond_e2 = relations_temp[e1_random]
                other_e2_codes = list(
                    set(found_e2_T_code).difference(
                        set(e1_random_correspond_e2)))
                e2_random = choice(other_e2_codes)
            except (KeyError, IndexError):
                # KeyError: e1 has no recorded partner yet.
                # IndexError: e1 is already paired with every found e2.
                if len(found_e2_T_code) > 0:
                    e2_random = choice(found_e2_T_code)
                else:
                    count_y0 += 1
                    continue
            if e1_random not in relations_temp:
                relations_temp[e1_random] = [e2_random]
                relation_pairs_NER.append(
                    (e1_random, e2_random, 'RTN_' + str(count_y0), '0'))
            elif e2_random not in relations_temp[e1_random]:
                relations_temp[e1_random].append(e2_random)
                relation_pairs_NER.append(
                    (e1_random, e2_random, 'RTN_' + str(count_y0), '0'))
            count_y0 += 1

        # Merge word pieces back into whole tokens, then drop the '##' rows.
        # Done on plain lists: chained ``df.iloc[row][0] += ...`` does not
        # write back to the frame under pandas Copy-on-Write.
        merged_tokens = _merge_word_pieces(new_tokens)
        kept_rows = [
            row for row, token in enumerate(merged_tokens)
            if not token.startswith('##')
        ]
        new_tokens = [merged_tokens[row] for row in kept_rows]
        pred_tags = [pred_tags[row] for row in kept_rows]
        pred_codes = [pred_codes[row] for row in kept_rows]

        # First/last token position per entity code, in sorted code order.
        # Groups are iterated directly instead of using ``groupby.apply``
        # with a function that reads the grouping column.
        df_temp = pd.DataFrame({
            'token': new_tokens,
            'codes': pred_codes,
            'tags': pred_tags
        })
        new_found_entities = []
        for key, group in df_temp.groupby('codes'):
            if key == '0':
                continue
            rows = group.index.tolist()
            e_type = group['tags'].iloc[0].split('-')[1]
            new_found_entities.append((key, e_type, rows[0], rows[-1]))

        ner_output_samples.append({
            'HADM_ID': hadm_id,
            'token': new_tokens,
            'tags': pred_tags,
            'relations': relation_pairs_NER,
            'entities': new_found_entities,
            'code': pred_codes
        })

    file_path = data_dir / 'End2End_Test/re_e2e_test.txt'
    with open(file_path, 'w') as new_f:
        new_f.write(str(ner_output_samples))
    e2e_test_file_path = (
        data_dir / f'End2End_Test/re_e2e_test_by_fold_{args.fold}.pkl')
    tools.logger.info(f'Saving e2e_test_file into {e2e_test_file_path}')
    save_pickle(ner_output_samples, e2e_test_file_path)


def _merge_word_pieces(tokens):
    """Return a copy of ``tokens`` with each run of ``##`` pieces appended
    (without the ``##`` marker) to the entry just before the run.

    The ``##`` entries themselves are left in place; the caller filters them
    out afterwards.
    """
    merged = list(tokens)
    row = 0
    while row < len(merged):
        if not merged[row].startswith('##'):
            row += 1
            continue
        run_end = row
        suffix = ''
        while run_end < len(merged) and merged[run_end].startswith('##'):
            suffix += merged[run_end].replace('##', '')
            run_end += 1
        merged[row - 1] += suffix
        row = run_end
    return merged
def run_test(args):
    """Evaluate the saved NER model on the test set and append the scores to
    a dated result file.

    The model is loaded from ``args.model_path`` and test examples/features
    are built (or loaded from their cache files). Entity-level scores and the
    average loss are logged and appended to
    ``<result_dir>/<arch>/<arch>_test_result_<today>.txt``: one summary line,
    then one line per entity class.

    Args:
        args: Namespace read for ``model_path``, ``do_lower_case``,
            ``device``, ``data_name``, ``eval_max_seq_len``,
            ``eval_batch_size`` and ``arch``; ``resume_path`` is set on it.
    """
    from main.common import progressbar
    # NOTE(review): the next two names are not referenced directly below;
    # ``ner_utils`` presumably makes ``common.ner_utils`` resolvable --
    # confirm before removing either import.
    from main.common import ner_utils  # noqa: F401
    from main.common.tools import save_pickle  # noqa: F401

    args.resume_path = args.model_path
    processor = BertProcessor(args.resume_path / 'vocab.txt',
                              args.do_lower_case)
    label_list = processor.get_labels()  # all labels
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    model = BERTLSTMCRF.from_pretrained(args.resume_path,
                                        label2id=label2id,
                                        device=args.device)
    tools.logger.info(f'loaded model from {args.resume_path}')
    model.to(args.device)

    data_dir = base.config['data_dir']
    test_data = processor.get_test(data_dir / 'test/new_nihss_ner_test.pkl')
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_file=data_dir /
        f'test/cached/cached_{args.data_name}_test_examples')
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_file=data_dir /
        f'test/cached/cached_{args.data_name}_test_features_{args.eval_max_seq_len}'
    )
    test_dataset = processor.create_dataset(test_features)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=SequentialSampler(test_dataset),
                                 batch_size=args.eval_batch_size)

    tools.logger.info('****** Running Testing Model ******')
    tools.logger.info(' Num test examples = %d', len(test_examples))
    pbar = progressbar.ProgressBar(n_total=len(test_dataloader),
                                   desc='Testing')
    entity_score = common.ner_utils.SeqEntityScore(id2label)
    entity_score.reset()
    test_loss = tools.AverageMeter()
    model.eval()
    for step, batch in enumerate(test_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, input_lens = batch
        input_lens = input_lens.cpu().detach().numpy().tolist()
        with torch.no_grad():
            features, loss = model.forward_loss(input_ids, segment_ids,
                                                input_mask, label_ids,
                                                input_lens)
            tags, _ = model.crf._obtain_labels(features, id2label, input_lens)
        test_loss.update(val=loss.item(), n=input_ids.size(0))
        pbar(step=step, info={'loss': loss.item()})

        # Score gold vs. predicted tags between position 0 and [SEP].
        label_ids = label_ids.to('cpu').numpy().tolist()
        for i, label in enumerate(label_ids):
            gold_path = []
            pred_path = []
            for j in range(1, len(label)):
                if label[j] == label2id['[SEP]']:
                    entity_score.update(pred_paths=[pred_path],
                                        label_paths=[gold_path])
                    break
                gold_path.append(id2label[label[j]])
                pred_path.append(tags[i][j])

    test_info, class_info = entity_score.result()
    info = {f'test_{key}': value for key, value in test_info.items()}
    info['test_loss'] = test_loss.avg
    if 'cuda' in str(args.device):
        torch.cuda.empty_cache()

    show_info = 'Test: ' + " -".join(
        [f' {key}: {value:.4f}' for key, value in info.items()])
    tools.logger.info(show_info)
    tools.logger.info("The entity scores of test data : ")

    result_path = base.config['result_dir'] / args.arch
    # parents=True so a missing result_dir does not abort after evaluation.
    result_path.mkdir(parents=True, exist_ok=True)
    result_file_path = result_path / f'{args.arch}_test_result_{str(datetime.date.today())}.txt'
    tools.logger.info(f'Saving test data to {result_file_path}')
    with open(str(result_file_path), 'a') as f:
        f.write(show_info + '\n')
        for key, value in class_info.items():
            class_line = f'Entity: {key} \t' + "-\t".join(
                [f' {key_}: {value_:.4f} ' for key_, value_ in value.items()])
            tools.logger.info(class_line)
            f.write(class_line + '\n')