Exemple #1
0
 def create_examples(self, lines, example_type, cached_file):
     """Build ``InputExample`` objects from raw records, caching the result.

     If ``cached_file`` already exists, its contents are loaded with
     ``torch.load`` and returned as-is. Otherwise one example is built per
     entry of ``lines`` and the resulting list is written to
     ``cached_file`` with ``torch.save``.

     Args:
         lines: Sequence of mappings providing the keys ``'HADM_ID'``,
             ``'token'``, ``'tags'``, ``'code'`` and ``'relations'``.
         example_type: Tag used as the GUID prefix and in the progress-bar
             description.
         cached_file: Path-like object supporting ``exists()``.

     Returns:
         The list of examples (loaded from cache or freshly built).
     """
     if cached_file.exists():
         tools.logger.info("Loading samples from cached files %s",
                           cached_file)
         return torch.load(cached_file)

     pbar = progressbar.ProgressBar(
         n_total=len(lines), desc=f'create {example_type} samples')
     examples = []
     for i, line in enumerate(lines):
         hadm_id = line['HADM_ID']
         # Float entries cannot be joined as text, so they are replaced by a
         # blank. isinstance() also covers float subclasses such as
         # numpy.float64. Presumably these are NaN placeholders coming from
         # the upstream table -- TODO confirm.
         tokens = [' ' if isinstance(t, float) else t for t in line['token']]
         examples.append(
             InputExample(
                 guid='%s-%s-%d' % (example_type, hadm_id, i),
                 # Single-sequence task: only text_a is populated.
                 text_a=' '.join(tokens),
                 text_b=None,
                 label=line['tags'],
                 code=line['code'],  # brat entity codes (T1, T2, ...)
                 relations=line['relations'],  # brat gold relations
                 hadm_id=hadm_id))
         pbar(step=i)

     tools.logger.info("Saving examples into cached file %s", cached_file)
     torch.save(examples, cached_file)
     return examples
Exemple #2
0
    def train_epoch(self, data_loader):
        """Run one training pass over ``data_loader``.

        Each batch is moved to ``self.device`` and fed through
        ``self.model.forward_loss``. The loss is averaged when more than one
        GPU id is listed in ``self.n_gpu`` and scaled down by
        ``self.gradient_accumulation_steps`` when that is greater than one.
        The optimizer steps every ``self.gradient_accumulation_steps``
        batches.

        Args:
            data_loader: Iterable of 5-tuples
                ``(input_ids, input_mask, segment_ids, label_ids, input_lens)``.

        Returns:
            ``{'loss': <average of the per-batch (scaled) losses>}``.
        """
        progress = progressbar.ProgressBar(n_total=len(data_loader),
                                           desc='Training')
        loss_meter = tools.AverageMeter()

        for step, batch in enumerate(data_loader):
            self.model.train()
            on_device = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, input_lens = on_device
            input_lens = input_lens.cpu().detach().numpy().tolist()

            _features, loss = self.model.forward_loss(input_ids, segment_ids,
                                                      input_mask, label_ids,
                                                      input_lens)
            # self.n_gpu is a comma-separated string of device ids.
            if len(self.n_gpu.split(',')) > 1:
                loss = loss.mean()
            if self.gradient_accumulation_steps > 1:
                loss = loss / self.gradient_accumulation_steps

            # NOTE(review): when self.fp16 is set, no backward pass is run
            # here at all -- the fp16 path is an unimplemented stub.
            if not self.fp16:
                loss.backward()
                clip_grad_norm_(self.model.parameters(), self.grad_clip)

            if not (step + 1) % self.gradient_accumulation_steps:
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.global_step += 1

            batch_loss = loss.item()
            loss_meter.update(batch_loss, n=1)
            progress(step=step, info={'loss': batch_loss})

        summary = {'loss': loss_meter.avg}
        if 'cuda' in str(self.device):
            torch.cuda.empty_cache()
        return summary
Exemple #3
0
 def valid_epoch(self, data_loader):
     """Evaluate the model on every batch of ``data_loader``.

     For each batch the loss is accumulated and, per sequence, the gold and
     predicted tag paths between the first position and the first
     ``[SEP]`` label are handed to ``self.entity_score``. Scores are
     aggregated once, after all batches have been processed.

     Args:
         data_loader: Iterable of 5-tuples
             ``(input_ids, input_mask, segment_ids, label_ids, input_lens)``.

     Returns:
         ``(info, class_info)`` where ``info`` maps ``'valid_<metric>'`` to
         the values from ``self.entity_score.result()`` plus
         ``'valid_loss'``, and ``class_info`` is the second element returned
         by ``self.entity_score.result()``.
     """
     pbar = progressbar.ProgressBar(n_total=len(data_loader),
                                    desc='Evaluating')
     self.entity_score.reset()
     valid_loss = tools.AverageMeter()
     sep_id = self.label2id['[SEP]']

     self.model.eval()
     for step, batch in enumerate(data_loader):
         batch = tuple(t.to(self.device) for t in batch)
         input_ids, input_mask, segment_ids, label_ids, input_lens = batch
         input_lens = input_lens.cpu().detach().numpy().tolist()
         with torch.no_grad():
             features, loss = self.model.forward_loss(
                 input_ids, segment_ids, input_mask, label_ids, input_lens)
             tags, _ = self.model.crf._obtain_labels(
                 features, self.id2label, input_lens)
         valid_loss.update(val=loss.item(), n=input_ids.size(0))
         pbar(step=step, info={'loss': loss.item()})

         for i, label_row in enumerate(label_ids.to('cpu').numpy().tolist()):
             gold_path = []
             pred_path = []
             for j, label_id in enumerate(label_row):
                 if j == 0:
                     # Position 0 is skipped (the leading special token).
                     continue
                 if label_id == sep_id:
                     # The sequence ends at the first [SEP]; score it.
                     self.entity_score.update(pred_paths=[pred_path],
                                              label_paths=[gold_path])
                     break
                 gold_path.append(self.id2label[label_id])
                 pred_path.append(tags[i][j])

     # Aggregate only after the whole loader has been consumed; doing this
     # inside the loop would return after the very first batch.
     valid_info, class_info = self.entity_score.result()
     info = {f'valid_{key}': value for key, value in valid_info.items()}
     info['valid_loss'] = valid_loss.avg
     if 'cuda' in str(self.device):
         torch.cuda.empty_cache()
     return info, class_info
Exemple #4
0
    def create_features(self, examples, max_seq_len, cached_file):
        """Convert examples into fixed-length ``InputFeature`` objects.

        If ``cached_file`` exists, the features are loaded from it with
        ``torch.load``. Otherwise every example is word-piece tokenized,
        wrapped in ``[CLS]`` / ``[SEP]``, truncated or zero-padded to
        ``max_seq_len``, and the result is saved to ``cached_file``.

        Label handling for a word split into several pieces: the first
        piece keeps the word's label, later pieces keep ``'O'`` or get the
        ``'I-<type>'`` variant of the word's label. Every piece inherits
        the word's entity code.

        Args:
            examples: Iterable of objects exposing ``guid``, ``hadm_id``,
                ``text_a``, ``label``, ``code`` and ``relations``.
            max_seq_len: Length every padded sequence must have.
            cached_file: Path-like object supporting ``exists()``.

        Returns:
            The list of features (loaded from cache or freshly built).
        """
        if cached_file.exists():
            tools.logger.info('Loading features from cached file %s',
                              cached_file)
            return torch.load(cached_file)

        label2id = {label: i for i, label in enumerate(self.get_labels())}
        pbar = progressbar.ProgressBar(
            n_total=len(examples),
            desc='creating the specified features of examples')
        features = []
        for example_id, example in enumerate(examples):
            hamd_id = example.hadm_id
            text_list = example.text_a.split(' ')
            relation_list = example.relations

            new_tokens = ['[CLS]']
            new_segment_ids = [0]
            new_label_ids = [label2id['[CLS]']]
            new_code = ['0']

            # NOTE(review): zip() stops at the shortest input, so words,
            # labels and codes are assumed to be aligned one-to-one.
            for text, label, code in zip(text_list, example.label,
                                         example.code):
                if text == '<CRLF>':
                    continue
                for idx, token in enumerate(self.tokenizer.tokenize(text)):
                    new_tokens.append(token)
                    new_segment_ids.append(0)
                    if idx == 0 or label == 'O':
                        new_label_ids.append(label2id[label])
                    else:
                        # Continuation piece of an entity word: force the
                        # inside tag of the same entity type.
                        new_label_ids.append(
                            label2id['I-' + label.split('-')[1]])
                    new_code.append(code)

            assert len(new_tokens) == len(new_segment_ids)
            assert len(new_tokens) == len(new_label_ids)
            assert len(new_tokens) == len(new_code)

            # Leave room for the trailing [SEP].
            if len(new_tokens) >= max_seq_len:
                keep = max_seq_len - 1
                new_tokens = new_tokens[:keep]
                new_segment_ids = new_segment_ids[:keep]
                new_label_ids = new_label_ids[:keep]
                new_code = new_code[:keep]

            new_tokens.append('[SEP]')
            new_segment_ids.append(0)
            new_label_ids.append(label2id['[SEP]'])
            new_code.append('0')

            input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
            input_mask = [1] * len(input_ids)
            # Unpadded length, recorded before padding below.
            input_len = len(new_label_ids)

            if len(input_ids) < max_seq_len:
                pad_zero = [0] * (max_seq_len - len(input_ids))
                input_ids.extend(pad_zero)
                input_mask.extend(pad_zero)
                new_segment_ids.extend(pad_zero)
                new_label_ids.extend(pad_zero)
                new_code.extend(['0'] * len(pad_zero))

            assert len(input_ids) == max_seq_len
            assert len(input_mask) == max_seq_len
            assert len(new_segment_ids) == max_seq_len
            assert len(new_label_ids) == max_seq_len
            assert len(new_code) == max_seq_len

            # Map (first, last) position of every entity code to the code.
            # Positions are shifted by one so they do not count [CLS].
            # Iterating the groups directly avoids reading the grouping
            # column inside groupby.apply (deprecated since pandas 2.2).
            code_frame = pd.DataFrame({'code': new_code})
            code_position = {}
            for key, group in code_frame.groupby('code'):
                if key == '0':
                    continue
                first = int(group.index[0])
                last = int(group.index[-1])
                code_position[(first - 1, last - 1)] = key

            if example_id < 2:
                tools.logger.info('*** Examples: ***')
                tools.logger.info("guid: %s" % (example.guid))
                tools.logger.info("tokens: %s" %
                                  " ".join([str(x) for x in new_tokens]))
                tools.logger.info("input_ids: %s" %
                                  " ".join([str(x) for x in input_ids]))
                tools.logger.info("input_mask: %s" %
                                  " ".join([str(x) for x in input_mask]))
                tools.logger.info(
                    "segment_ids: %s" %
                    " ".join([str(x) for x in new_segment_ids]))
                tools.logger.info("old label name: %s " %
                                  " ".join(example.label))
                tools.logger.info("new label ids: %s" %
                                  " ".join([str(x)
                                            for x in new_label_ids]))

            features.append(
                InputFeature(
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=new_segment_ids,
                    label_id=new_label_ids,
                    input_len=input_len,
                    code=new_code,
                    new_tokens=new_tokens,  # unpadded, includes [CLS]/[SEP]
                    relations=relation_list,  # gold standard
                    hamd_id=hamd_id,
                    code_position=code_position))

            pbar(step=example_id)

        tools.logger.info('Saving features into cached file %s',
                          cached_file)
        torch.save(features, cached_file)
        return features
Exemple #5
0
def run_end2end_realtion_extration(args):
    """Run NER on the relation-extraction test set and dump RE input samples.

    For every test sample the function:

    1. predicts tags with the BERT-LSTM-CRF model and collects the entities
       found by ``SeqEntityScore``;
    2. assigns each found entity its gold brat code when its span matches
       ``code_position``, or a synthetic ``'TFP_<n>'`` code otherwise;
    3. keeps the gold ``'Has_Value'`` relations and randomly samples extra
       (entity, measurement) pairs labelled ``'0'``;
    4. merges ``##`` word pieces back into whole words;
    5. stores tokens, tags, codes, entities and relations for the sample.

    The collected samples are written both as text
    (``End2End_Test/re_e2e_test.txt``) and as a pickle
    (``End2End_Test/re_e2e_test_by_fold_<fold>.pkl``).

    Args:
        args: Namespace providing ``model_path``, ``device``, ``data_name``,
            ``eval_max_seq_len`` and ``fold``. ``args.resume_path`` is set
            to ``args.model_path`` as a side effect.
    """
    from main.common import progressbar
    from main.common import ner_utils
    from main.common.tools import save_pickle
    from random import choice
    import pandas as pd
    import copy

    args.resume_path = args.model_path
    processor = BertProcessor(vocab_path=args.resume_path / 'vocab.txt',
                              do_lower_case=True)
    label_list = processor.get_labels()  # all labels
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    tools.logger.info(f'loading prtrained model from {args.resume_path}')
    model = BERTLSTMCRF.from_pretrained(args.resume_path,
                                        label2id=label2id,
                                        device=args.device)
    model.to(args.device)

    data_dir = base.config['data_dir']
    test_data = processor.get_test(data_dir / 'RE_Test/re_test.pkl')

    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_file=data_dir /
        f'End2End_Test/cached_{args.data_name}_e2e_test_examples.pkl')
    test_datasets = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_file=data_dir /
        f'End2End_Test/cached_{args.data_name}_e2e_test_features.pkl')

    pbar = progressbar.ProgressBar(
        n_total=len(test_datasets),
        desc='Testing End2End relation extraction performace')

    entity_score = common.ner_utils.SeqEntityScore(id2label)
    entity_score.reset()
    sep_id = label2id['[SEP]']

    model.eval()
    ner_output_samples = []
    for step, one_sample in enumerate(test_datasets):
        # Entities are collected per sample, so start from a clean scorer.
        entity_score.reset()
        hadm_id = one_sample.hamd_id
        relations = one_sample.relations
        code_position = one_sample.code_position
        new_tokens = one_sample.new_tokens[1:-1]  # drop [CLS] / [SEP]

        # Batch of size one.
        batch = (
            torch.tensor([one_sample.input_ids], dtype=torch.long),
            torch.tensor([one_sample.input_mask], dtype=torch.long),
            torch.tensor([one_sample.segment_ids], dtype=torch.long),
            torch.tensor([one_sample.label_id], dtype=torch.long),
            torch.tensor([one_sample.input_len], dtype=torch.long),
        )
        batch = tuple(t.to(args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, input_lens = batch
        input_lens = input_lens.cpu().detach().numpy().tolist()

        with torch.no_grad():
            features, loss = model.forward_loss(input_ids, segment_ids,
                                                input_mask, label_ids,
                                                input_lens)
            tags, _ = model.crf._obtain_labels(features, id2label, input_lens)

        label_ids = label_ids.to('cpu').numpy().tolist()
        pbar(step=step, info={'loss': loss.item()})

        for i, label_row in enumerate(label_ids):
            gold_path = []
            pred_path = []
            for j, label_id in enumerate(label_row):
                if j == 0:
                    continue
                if label_id == sep_id:
                    entity_score.update(pred_paths=[pred_path],
                                        label_paths=[gold_path])
                    break
                gold_path.append(id2label[label_id])
                pred_path.append(tags[i][j])

        # Gold positive pairs: only 'Has_Value' relations are kept.
        relation_pairs_NER = []
        relation_pairs_dict = {}
        count_y1 = 0
        for relation in relations:
            if relation[3] == 'Has_Value':
                relation_pairs_NER.append(relation)
                relation_pairs_dict.setdefault(relation[0],
                                               []).append(relation[1])
                count_y1 += 1

        # Give every predicted entity a code: the gold one when the span
        # matches exactly, a synthetic 'TFP_<n>' one otherwise.
        found_entities = []
        found_e1_T_code = []
        found_e2_T_code = []
        TFN_idx = 1
        for entity in entity_score.founds:
            e_type = entity[0]
            span = (entity[1], entity[2])
            if span in code_position:
                T_code = code_position[span]
            else:
                T_code = 'TFP_' + str(TFN_idx)
                TFN_idx += 1
            found_entities.append((T_code, e_type, entity[1], entity[2]))
            if e_type == 'Measurement':
                found_e2_T_code.append(T_code)
            else:
                found_e1_T_code.append(T_code)

        pred_tags = tags[0][1:-1]
        assert len(new_tokens) == len(pred_tags)
        pred_codes = ['0'] * len(new_tokens)
        for e in found_entities:
            for idx in range(e[2], e[3] + 1):
                pred_codes[idx] = e[0]

        # Randomly sample pairs that are not gold relations (label '0').
        # count_y0 counts attempts, so the loop is bounded by the number of
        # positives plus a small slack b.
        b = 1
        count_y0 = 0
        relations_temp = copy.deepcopy(relation_pairs_dict)
        while count_y0 <= (count_y1 + b) and len(found_e1_T_code) > 1:
            e1_random = choice(found_e1_T_code)
            try:
                e1_random_correspond_e2 = relations_temp[e1_random]
                other_e2_codes = list(
                    set(found_e2_T_code).difference(
                        set(e1_random_correspond_e2)))
                e2_random = choice(other_e2_codes)
            except (KeyError, IndexError):
                # KeyError: e1 has no recorded partner yet.
                # IndexError: every found measurement is already paired.
                if len(found_e2_T_code) > 0:
                    e2_random = choice(found_e2_T_code)
                else:
                    count_y0 += 1
                    continue

            # relations_temp is a deep copy, so its lists can be extended
            # in place without touching the gold dictionary.
            paired = relations_temp.setdefault(e1_random, [])
            if e2_random not in paired:
                paired.append(e2_random)
                relation_pairs_NER.append(
                    (e1_random, e2_random, 'RTN_' + str(count_y0), '0'))
            count_y0 += 1

        # Merge '##' word pieces back into the preceding whole word.
        df_temp = pd.DataFrame({
            'token': new_tokens,
            'codes': pred_codes,
            'tags': pred_tags
        })
        token_col = df_temp.columns.get_loc('token')
        n_rows = df_temp.shape[0]
        current_row = 0
        while current_row < n_rows:
            if not df_temp.iloc[current_row, token_col].startswith('##'):
                current_row += 1
                continue
            run_len = 0
            suffix = ''
            while (current_row + run_len < n_rows
                   and df_temp.iloc[current_row + run_len,
                                    token_col].startswith('##')):
                suffix += df_temp.iloc[current_row + run_len,
                                       token_col].replace('##', '')
                run_len += 1
            # Write through 2-D .iloc: chained ``df.iloc[r][0] += ...`` may
            # modify a temporary copy instead of the frame. A piece run at
            # row 0 has no preceding word, so nothing is appended (index -1
            # would wrongly hit the last row).
            if current_row > 0:
                df_temp.iloc[current_row - 1, token_col] += suffix
            current_row += run_len

        df_temp = df_temp[df_temp['token'].str.startswith('##') == False]
        new_tokens = df_temp['token'].values.tolist()
        pred_tags = df_temp['tags'].values.tolist()
        pred_codes = df_temp['codes'].values.tolist()

        # Recompute entity spans in whole-word positions.
        df_temp = df_temp.reset_index(drop=True)
        new_found_entities = []
        for key, group in df_temp.groupby('codes'):
            if key == '0':
                continue
            e_type = group['tags'].iloc[0].split('-')[1]
            new_found_entities.append(
                (key, e_type, int(group.index[0]), int(group.index[-1])))

        ner_output_samples.append({
            'HADM_ID': hadm_id,
            'token': new_tokens,
            'tags': pred_tags,
            'relations': relation_pairs_NER,
            'entities': new_found_entities,
            'code': pred_codes
        })

    file_path = data_dir / 'End2End_Test/re_e2e_test.txt'
    with open(file_path, 'w') as new_f:
        new_f.write(str(ner_output_samples))

    e2e_test_file_path = (
        data_dir / f'End2End_Test/re_e2e_test_by_fold_{args.fold}.pkl')
    tools.logger.info(f'Saving e2e_test_file into {e2e_test_file_path}')
    save_pickle(ner_output_samples, e2e_test_file_path)
Exemple #6
0
def run_test(args):
    """Evaluate a saved model on the NER test set and append a report.

    Loads the model from ``args.model_path``, builds (or loads cached)
    test examples and features, scores the predictions with
    ``SeqEntityScore`` and appends the overall and per-entity metrics to
    ``<result_dir>/<arch>/<arch>_test_result_<today>.txt``.

    Args:
        args: Namespace providing ``model_path``, ``do_lower_case``,
            ``device``, ``data_name``, ``eval_max_seq_len``,
            ``eval_batch_size`` and ``arch``. ``args.resume_path`` is set
            to ``args.model_path`` as a side effect.
    """
    from main.common import progressbar
    from main.common import ner_utils
    from main.common.tools import save_pickle
    args.resume_path = args.model_path
    processor = BertProcessor(args.resume_path / 'vocab.txt',
                              args.do_lower_case)
    label_list = processor.get_labels()  # all labels
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    model = BERTLSTMCRF.from_pretrained(args.resume_path,
                                        label2id=label2id,
                                        device=args.device)
    tools.logger.info(f'loaded model from {args.resume_path}')
    model.to(args.device)

    data_dir = base.config['data_dir']
    test_data = processor.get_test(data_dir / 'test/new_nihss_ner_test.pkl')
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_file=data_dir /
        f'test/cached/cached_{args.data_name}_test_examples')
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_file=data_dir /
        f'test/cached/cached_{args.data_name}_test_features_{args.eval_max_seq_len}'
    )

    test_dataset = processor.create_dataset(test_features)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=SequentialSampler(test_dataset),
                                 batch_size=args.eval_batch_size)

    tools.logger.info('****** Running Testing Model ******')
    tools.logger.info(' Num test examples = %d', len(test_examples))

    pbar = progressbar.ProgressBar(n_total=len(test_dataloader),
                                   desc='Testing')

    entity_score = common.ner_utils.SeqEntityScore(id2label)
    entity_score.reset()
    test_loss = tools.AverageMeter()
    sep_id = label2id['[SEP]']

    model.eval()
    for step, batch in enumerate(test_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, input_lens = batch
        input_lens = input_lens.cpu().detach().numpy().tolist()
        with torch.no_grad():
            features, loss = model.forward_loss(input_ids, segment_ids,
                                                input_mask, label_ids,
                                                input_lens)
            tags, _ = model.crf._obtain_labels(features, id2label, input_lens)
        test_loss.update(val=loss.item(), n=input_ids.size(0))
        pbar(step=step, info={'loss': loss.item()})

        for i, label_row in enumerate(label_ids.to('cpu').numpy().tolist()):
            gold_path = []
            pred_path = []
            for j, label_id in enumerate(label_row):
                if j == 0:
                    # Position 0 is skipped (the leading special token).
                    continue
                if label_id == sep_id:
                    # The sequence ends at the first [SEP]; score it.
                    entity_score.update(pred_paths=[pred_path],
                                        label_paths=[gold_path])
                    break
                gold_path.append(id2label[label_id])
                pred_path.append(tags[i][j])

    test_info, class_info = entity_score.result()
    info = {f'test_{key}': value for key, value in test_info.items()}
    info['test_loss'] = test_loss.avg
    if 'cuda' in str(args.device):
        torch.cuda.empty_cache()

    show_info = 'Test: ' + " -".join(
        [f' {key}: {value:.4f}' for key, value in info.items()])
    tools.logger.info(show_info)
    tools.logger.info("The entity scores of test data : ")

    result_path = base.config['result_dir'] / args.arch
    # parents=True so a missing result_dir does not abort the run after the
    # whole evaluation has already been computed.
    result_path.mkdir(parents=True, exist_ok=True)
    result_file_path = result_path / f'{args.arch}_test_result_{str(datetime.date.today())}.txt'
    tools.logger.info(f'Saving test data to {result_file_path}')
    # Append mode: repeated runs on the same day share one report file.
    with open(str(result_file_path), 'a+') as f:
        f.write(show_info + '\n')
        for key, value in class_info.items():
            entity_line = f'Entity: {key} \t' + "-\t".join(
                [f' {key_}: {value_:.4f} ' for key_, value_ in value.items()])
            tools.logger.info(entity_line)
            f.write(entity_line + '\n')