Example #1
    def create_dataset(self, features, is_sorted=False):
        # Convert to Tensors and build dataset
        if is_sorted:
            logger.info("sorted data by th length of input")
            features = sorted(features,
                              key=lambda x: x.input_len,
                              reverse=True)
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_trigger_mask = torch.tensor([f.trigger_mask for f in features],
                                        dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
        all_input_lens = torch.tensor([f.input_len for f in features],
                                      dtype=torch.long)
        all_one_hot_labels = torch.tensor([f.one_hot_labels for f in features],
                                          dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask,
                                all_trigger_mask, all_segment_ids,
                                all_label_ids, all_input_lens,
                                all_one_hot_labels)

        return dataset
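
A minimal usage sketch for the dataset returned above, assuming `processor` is an instance of the surrounding class and `features` came from its `create_features` method; the tensors come back from each batch in the same order they were packed into the TensorDataset.

from torch.utils.data import DataLoader, RandomSampler

# Hypothetical call: the names `processor` and `features` are assumptions.
dataset = processor.create_dataset(features, is_sorted=False)
loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=16)
for batch in loader:
    (input_ids, input_mask, trigger_mask, segment_ids,
     label_ids, input_lens, one_hot_labels) = batch  # all torch.long tensors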
Example #2
    def create_examples(self, lines, example_type, cached_file, save_cache):
        '''
		Creates examples for data
		'''
        label_list = self.get_labels()
        if cached_file and cached_file.exists():
            logger.info("Loading examples from cached file %s", cached_file)
            examples = torch.load(cached_file)
        else:
            pbar = ProgressBar(n_total=len(lines), desc='create examples')
            examples = []
            for i, line in enumerate(lines):
                # if i > 20: break  # for quick debugging
                guid = '%s-%d' % (example_type, i)
                label = line['tags']
                text_a = line['info']
                text_b = None
                match = line["cira_match"]

                if self.test_mode == 4 and sum(match) < 4:
                    continue
                else:
                    examples.append(
                        InputExample(guid=guid,
                                     text_a=text_a,
                                     text_b=text_b,
                                     label=label,
                                     match=match))
                pbar(step=i)

            if save_cache:
                logger.info("Saving examples into cached file %s", cached_file)

                torch.save(examples, cached_file)
        return examples
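
A hedged usage sketch for the cache handling above, assuming a pathlib.Path cache location like the `config['data_dir']` paths used in the later examples; the names here are placeholders.

from pathlib import Path

# Hypothetical call: `processor` and `lines` are assumed to exist already.
cached = Path("data") / "cached_train_seq_examples"  # placeholder path
examples = processor.create_examples(lines=lines,
                                     example_type="train",
                                     cached_file=cached,
                                     save_cache=True)  # torch.save()s the list on first run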
Example #3
    def read_type_data(cls, input_file, type):

        with jsonlines.open(input_file) as reader:
            lines = []
            for line in reader:
                e_d = line["guid"].split("_")[1]
                # e_m = sum(line["cira_match"])
                # datasets["%s_%d"%(e_d,e_m)].append(line)
                if e_d in type:
                    lines.append(line)
        logger.info("type {} number = {}".format(type, len(lines)))
        return lines
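
An illustrative record layout assumed by read_type_data, using the field names seen in create_examples above; the second "_"-separated field of the guid ("e"/"m"/"h" in the later test code) is the difficulty code the filter inspects.

# Toy record, not real data: shows which guid field the filter looks at.
line = {"guid": "dev_e_17",
        "info": "raw sentence text",
        "tags": ["O", "B-COM", "I-COM"],
        "cira_match": [1, 1, 1, 1]}
assert line["guid"].split("_")[1] == "e"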
Example #4
    def create_features(self,
                        examples,
                        max_seq_len,
                        cached_file,
                        save_cache=False):
        '''
		# The convention in BERT is:
		# (a) For sequence pairs:
		#  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
		#  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
		# (b) For single sequences:
		#  tokens:   [CLS] the dog is hairy . [SEP]
		#  type_ids:   0   0   0   0  0     0   0
		'''
        max_length = 0
        if cached_file and cached_file.exists():
            logger.info("Loading features from cached file %s", cached_file)
            features = torch.load(cached_file)
        else:
            label_list = self.get_labels()
            label2id = {label: i for i, label in enumerate(label_list, 0)}

            pbar = ProgressBar(n_total=len(examples), desc='create features')
            features = []

            for ex_id, example in enumerate(examples):

                # textlist = []
                # for sentence in example.text_a:
                #     textlist.extend(list(sentence))
                textlist = list(example.text_a)
                if len(textlist) > max_length:
                    max_length = len(textlist)

                tokens = self.tokenizer.tokenize(textlist)
                labels = example.label
                match = example.match
                if len(tokens) >= max_seq_len - 2:
                    tokens = tokens[0:(max_seq_len - 2)]
                    labels = labels[0:(max_seq_len - 2)]
                ntokens = []
                segment_ids = []
                label_ids = []
                one_hot_labels = []
                ntokens.append("[CLS]")
                segment_ids.append(0)
                mask_tags = [1] * len(label2id)
                mask_tags[label2id["[CLS]"]] = 0
                one_hot_labels.append(mask_tags)
                label_ids.append(label2id["[CLS]"])

                possible_tags = [1] * len(label2id)
                trigger_mask = []
                trigger_mask.append(0)
                if sum(match) < 4:
                    possible_tags[0] = 0

                    #assert  match[0]==1
                    # if match[0] < 1:
                    # 	possible_tags[1:3] = [0,0]
                    if match[1] < 1:
                        possible_tags[3:5] = [0, 0]
                    #assert match[2]==1
                    # if match[2] < 1:
                    # 	possible_tags[5:7] = [0, 0]
                    if match[3] < 1:
                        possible_tags[7:9] = [0, 0]

                for i, token in enumerate(tokens):
                    ntokens.append(token)
                    segment_ids.append(0)
                    label_ids.append(label2id[labels[i]])
                    if "ROU" in labels[i]:
                        trigger_mask.append(1)
                    else:
                        trigger_mask.append(0)

                    if (sum(match) < 4 and labels[i] == 'O'
                            and token not in range(7993, 8029)
                            and token not in range(8039, 8051)):
                        one_hot_labels.append(possible_tags)

                    else:
                        mask_tags = [1] * len(label2id)
                        mask_tags[label2id[labels[i]]] = 0
                        one_hot_labels.append(mask_tags)

                ntokens.append("[SEP]")
                segment_ids.append(0)
                label_ids.append(label2id["[SEP]"])
                mask_tags = [1] * len(label2id)
                mask_tags[label2id["[SEP]"]] = 0
                one_hot_labels.append(mask_tags)
                trigger_mask.append(0)

                input_ids = self.tokenizer.convert_tokens_to_ids(ntokens)
                input_mask = [1] * len(input_ids)

                input_len = len(label_ids)

                while len(input_ids) < max_seq_len:
                    input_ids.append(0)
                    input_mask.append(0)
                    segment_ids.append(0)
                    label_ids.append(0)
                    one_hot_labels.append([1] * len(label2id))
                    trigger_mask.append(0)

                assert len(input_ids) == max_seq_len
                assert len(input_mask) == max_seq_len
                assert len(segment_ids) == max_seq_len
                assert len(label_ids) == max_seq_len
                assert len(one_hot_labels) == max_seq_len
                assert len(trigger_mask) == max_seq_len

                for i in range(len(one_hot_labels)):
                    if len(one_hot_labels[i]) < 11:
                        logger.info(
                            "one-hot labels: pos:%d, %s" %
                            (i, " ".join([str(x) for x in one_hot_labels[i]])))
                        # if ex_id < 2:
                        logger.info("*** Example ***")
                        logger.info("guid: %s" % (example.guid))
                        logger.info("tokens: %s" %
                                    " ".join([str(x) for x in tokens]))
                        logger.info("input_ids: %s" %
                                    " ".join([str(x) for x in input_ids]))
                        logger.info("input_mask: %s" %
                                    " ".join([str(x) for x in input_mask]))
                        logger.info("segment_ids: %s" %
                                    " ".join([str(x) for x in segment_ids]))
                        logger.info("label: %s id: %s" %
                                    (" ".join(example.label), " ".join(
                                        [str(x) for x in label_ids])))

                features.append(
                    InputFeature(input_ids=input_ids,
                                 input_mask=input_mask,
                                 trigger_mask=trigger_mask,
                                 segment_ids=segment_ids,
                                 label_id=label_ids,
                                 one_hot_labels=one_hot_labels,
                                 input_len=input_len))

                pbar(step=ex_id)
            if save_cache:
                logger.info("Saving features into cached file %s", cached_file)
                torch.save(features, cached_file)
        logger.info("max_seq_lenth = {}".format(max_lenth))
        return features
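
A minimal sketch of the per-position label mask built above, with a toy label set (the real label list comes from get_labels(); the length check above suggests it has 11 labels): each labelled position gets a vector of ones with a zero at its gold label index, and padding positions keep all ones.

# Toy label set for illustration only.
label2id = {"[CLS]": 0, "O": 1, "B-COM": 2, "[SEP]": 3}
mask_tags = [1] * len(label2id)
mask_tags[label2id["B-COM"]] = 0
print(mask_tags)  # -> [1, 1, 0, 1]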
Example #5
def run_train(args):
    processor = BertProcessor(vocab_path=os.path.join(
        args.pretrained_model,
        'vocab.txt',
    ),
                              test_mode=args.test_mode,
                              do_lower_case=args.do_lower_case)
    #processor.tokenizer.save_vocabulary (str (args.model_path))
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    train_cache_sample = config['data_dir'] / f"cached_train_seq_examples"
    train_cache_feature = config['data_dir'] / f"cached_train_seq_features"
    if args.type:
        train_data = processor.read_type_data(os.path.join(
            config['data_dir'], "train.jsonl"),
                                              type=args.type)
        valid_data = processor.read_type_data(os.path.join(
            config['data_dir'], "dev.jsonl"),
                                              type=args.type)
        train_cache_sample = config[
            'data_dir'] / f"cached_train_seq_examples_{args.type}"
        train_cache_feature = config[
            'data_dir'] / f"cached_train_seq_features_{args.type}"
    else:
        train_data = processor.read_data(
            os.path.join(config['data_dir'], "train.jsonl"))
        valid_data = processor.read_data(
            os.path.join(config['data_dir'], "dev.jsonl"))
    if args.early_stop:
        early_stopping = EarlyStopping(patience=3,
                                       monitor="f1",
                                       baseline=0,
                                       mode='max')
    else:
        early_stopping = None

    train_dataset = convert_data_to_tensor(
        processor=processor,
        args=args,
        data=train_data,
        type="train",
        cache_sample_path=train_cache_sample,
        cache_feature_path=train_cache_feature,
        save_cache=False)

    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    valid_dataset = convert_data_to_tensor(
        processor=processor,
        args=args,
        data=valid_data,
        type="dev",
        cache_sample_path=config['data_dir'] / f"cached_dev_seq_examples",
        cache_feature_path=config['data_dir'] / f"cached_dev_seq_features",
        save_cache=False)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    # model = BERTCRF
    #
    # bert_config = BertConfig.from_json_file(os.path.join(args.pretrained_model,"config.json"))
    # bert_config.num_hidden_layers = args.depth
    # if args.resume_path:
    # 	args.resume_path = Path (args.resume_path)
    # 	model = model.from_pretrained (args.resume_path, label2id=label2id, device=args.device,config=bert_config)
    #
    # else:
    # 	model = model.from_pretrained (args.pretrained_model, label2id=label2id, device=args.device,config=bert_config)

    bert_config = BertConfig.from_json_file(
        os.path.join(args.pretrained_model, "config.json"))
    model = CNNLSTMCRF(config=bert_config,
                       label2id=label2id,
                       device=args.device)
    ckpt = torch.load(os.path.join(args.pretrained_model, "pytorch_model.bin"))

    if "state_dict" in ckpt:
        state_dict = ckpt["state_dict"]
    else:
        state_dict = ckpt
    for key in list(state_dict.keys()):
        if 'embedding' in key:
            new_key = key.replace("bert.embeddings.", "")  # strip the 'bert.embeddings.' prefix
            state_dict[new_key] = state_dict.pop(key)
    try:
        model.BERTEm.load_state_dict(state_dict, strict=True)
    except Exception as e:
        print(e)

    model = model.to(args.device)

    t_total = int(
        len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)

    optimizer = RMSprop(model.parameters(), lr=args.learning_rate)

    lr_scheduler = BERTReduceLROnPlateau(optimizer,
                                         lr=args.learning_rate,
                                         mode=args.mode,
                                         factor=0.5,
                                         patience=1,
                                         verbose=1,
                                         epsilon=1e-8,
                                         cooldown=0,
                                         min_lr=0,
                                         eps=1e-8)

    model_checkpoint = ModelCheckpoint(checkpoint_dir=args.model_path,
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    tb_logger = Tensorboard_Logger(
        log_dir=os.path.join(args.model_path, config['output']))

    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        logger=logger,
        tb_logger=tb_logger,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        label2id=label2id,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        early_stopping=early_stopping,
        partial=args.partial,
        trigger=args.trigger)

    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  epochs=args.epochs,
                  seed=args.seed)
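
A short sketch of the key renaming performed in the checkpoint-loading block above: the 'bert.embeddings.' prefix is stripped so that only the embedding weights line up with the submodule model.BERTEm.

# Illustrative key only; actual checkpoint keys depend on the pretrained model.
key = "bert.embeddings.word_embeddings.weight"
new_key = key.replace("bert.embeddings.", "")
print(new_key)  # word_embeddings.weight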
Example #6
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert_crf', type=str)
    parser.add_argument("--type", default='', type=str)
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--do_predict", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--early_stop", action='store_true')
    parser.add_argument('--data_name', default='datagrand', type=str)
    parser.add_argument('--optimizer',
                        default='adam',
                        type=str,
                        choices=['adam', 'lookahead'])
    parser.add_argument('--markup',
                        default='bios',
                        type=str,
                        choices=['bio', 'bios'])
    parser.add_argument('--checkpoint', default=900000, type=int)
    parser.add_argument("--epochs", default=30, type=int)
    parser.add_argument('--fold', default=0, type=int)
    # --resume_path = src/output/checkpoints/bert_lstm_crf_bios_fold_0/checkpoint-epoch-30'
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--mode", default='max', type=str)
    parser.add_argument("--monitor", default='f1', type=str)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted",
                        default=1,
                        type=int,
                        help='1:True  0:False ')
    parser.add_argument("--n_gpu",
                        type=str,
                        default='0',
                        help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=2)
    parser.add_argument("--train_batch_size", default=16, type=int)
    parser.add_argument('--eval_batch_size', default=64, type=int)
    parser.add_argument("--train_max_seq_len", default=256, type=int)
    parser.add_argument("--eval_max_seq_len", default=256, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.05, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=5.0, type=float)
    parser.add_argument("--learning_rate", default=1e-4, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument("--no_cuda", action='store_true')
    parser.add_argument("--partial", action='store_true')
    parser.add_argument("--trigger", action='store_true')
    parser.add_argument("--test_mode", type=int, default=0)
    parser.add_argument("--pretrained_model",
                        type=str,
                        default="pretrained_model")
    parser.add_argument("--depth", type=int)
    args = parser.parse_args()

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    if args.type:
        args.arch += f"_{args.type}"

    # name_str = "_bs-{}_lr-{}_len-{}".format(args.train_batch_size,args.learning_rate,args.train_max_seq_len)

    args.model_path = config['output'] / args.arch
    args.model_path.mkdir(exist_ok=True)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, args.model_path / 'training_args.bin')
    seed_everything(args.seed)
    init_logger(log_file=args.model_path / f"{args.arch}.log")

    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        run_train(args)

    if args.do_test:
        run_test(args)
Example #7
def run_test(args):
    processor = BertProcessor(os.path.join(args.pretrained_model, 'vocab.txt'),
                              args.do_lower_case,
                              test_mode=args.test_mode)
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    # id2label = {i: label for i, label in enumerate (label_list)}
    bert_config = BertConfig.from_json_file(
        os.path.join(args.pretrained_model, "config.json"))
    bert_config.num_hidden_layers = args.depth
    model = CNNLSTMCRF(config=bert_config,
                       label2id=label2id,
                       device=args.device)
    ckpt = torch.load(os.path.join(args.resume_path, "pytorch_model.bin"))

    if "state_dict" in ckpt:
        state_dict = ckpt["state_dict"]
    else:
        state_dict = ckpt
    try:
        model.load_state_dict(state_dict, strict=True)
    except Exception as e:
        print(e)

    model = model.to(args.device)

    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        logger=logger,
        tb_logger=None,
        optimizer=None,
        lr_scheduler=None,
        label2id=label2id,
        grad_clip=args.grad_clip,
        model_checkpoint=None,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        partial=args.partial,
        trigger=args.trigger)
    split = True
    if split:
        diff = ["e", "m", "h"]

        results = {}
        for d in diff:
            test_data = processor.read_type_data(os.path.join(
                config['data_dir'], "test_gold_all.jsonl"),
                                                 type=d)
            test_dataset = convert_data_to_tensor(processor=processor,
                                                  args=args,
                                                  data=test_data,
                                                  type=d,
                                                  cache_sample_path=None,
                                                  cache_feature_path=None,
                                                  save_cache=False)
            test_sampler = SequentialSampler(test_dataset)
            test_dataloader = DataLoader(test_dataset,
                                         sampler=test_sampler,
                                         batch_size=args.eval_batch_size)
            info, ex_info, class_info = trainer.valid_epoch(test_dataloader)
            results[d] = [class_info, ex_info, info]

        res = json.dumps(results)
        with open(os.path.join(args.model_path, 'result.json'), 'w') as fileObject:
            fileObject.write(res)
        prf = ["precision", "recall", "f1"]
        ex_prf = ["ex_p", "ex_r", "ex_f1"]
        types = ["COM", "INV", "ROU", "AMO"]
        logger.info("Eval results:")
        for d in diff:
            values = []
            class_info, ex_info, info = results[d]
            for t in types:
                cv = class_info[t]
                for k in prf:
                    values.append("{:.4f}".format(cv[k]))
            for k in prf:
                values.append("{:.4f}".format(info[k]))
            for k in ex_prf:
                values.append("{:.4f}".format(ex_info[k]))
            show_info = f'diff:{d},' + ",".join(values)
            logger.info(show_info)
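
For reference, a sketch of the nested result shapes the logging loop above assumes, with placeholder zeros rather than real scores; class_info is keyed by entity type, info holds the overall scores, and ex_info the example-level scores.

# Placeholder values, illustrative only.
class_info = {"COM": {"precision": 0.0, "recall": 0.0, "f1": 0.0}}  # one dict per type
info = {"precision": 0.0, "recall": 0.0, "f1": 0.0}                 # overall scores
ex_info = {"ex_p": 0.0, "ex_r": 0.0, "ex_f1": 0.0}                  # example-level scores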
Example #8
def run_train(args):
    processor = BertProcessor(vocab_path=os.path.join(
        args.pretrained_model,
        'vocab.txt',
    ),
                              test_mode=args.test_mode,
                              do_lower_case=args.do_lower_case)
    #processor.tokenizer.save_vocabulary (str (args.model_path))
    label_list = processor.get_labels()
    label2id = {label: i for i, label in enumerate(label_list)}
    train_cache_sample = config['data_dir'] / f"cached_train_seq_examples"
    train_cache_feature = config['data_dir'] / f"cached_train_seq_features"
    if args.type:
        train_data = processor.read_type_data(os.path.join(
            config['data_dir'], "train.jsonl"),
                                              type=args.type)
        valid_data = processor.read_type_data(os.path.join(
            config['data_dir'], "dev.jsonl"),
                                              type=args.type)
        train_cache_sample = config[
            'data_dir'] / f"cached_train_seq_examples_{args.type}"
        train_cache_feature = config[
            'data_dir'] / f"cached_train_seq_features_{args.type}"
    else:
        train_data = processor.read_data(
            os.path.join(config['data_dir'], "train.jsonl"))
        valid_data = processor.read_data(
            os.path.join(config['data_dir'], "dev.jsonl"))
    if args.early_stop:
        early_stopping = EarlyStopping(patience=3,
                                       monitor="f1",
                                       baseline=0,
                                       mode='max')
    else:
        early_stopping = None

    train_dataset = convert_data_to_tensor(
        processor=processor,
        args=args,
        data=train_data,
        type="train",
        cache_sample_path=train_cache_sample,
        cache_feature_path=train_cache_feature,
        save_cache=False)

    if args.sorted:
        train_sampler = SequentialSampler(train_dataset)
    else:
        train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    valid_dataset = convert_data_to_tensor(
        processor=processor,
        args=args,
        data=valid_data,
        type="dev",
        cache_sample_path=config['data_dir'] / f"cached_dev_seq_examples",
        cache_feature_path=config['data_dir'] / f"cached_dev_seq_features",
        save_cache=False)
    valid_sampler = SequentialSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=valid_sampler,
                                  batch_size=args.eval_batch_size)

    model = BERTCRF

    bert_config = BertConfig.from_json_file(
        os.path.join(args.pretrained_model, "config.json"))
    bert_config.num_hidden_layers = args.depth
    if args.resume_path:
        args.resume_path = Path(args.resume_path)
        model = model.from_pretrained(args.resume_path,
                                      label2id=label2id,
                                      device=args.device,
                                      config=bert_config)

    else:
        model = model.from_pretrained(args.pretrained_model,
                                      label2id=label2id,
                                      device=args.device,
                                      config=bert_config)

    model = model.to(args.device)

    t_total = int(
        len(train_dataloader) / args.gradient_accumulation_steps * args.epochs)

    bert_param_optimizer = list(model.bert.named_parameters())
    crf_param_optimizer = list(model.crf.named_parameters())
    linear_param_optimizer = list(model.classifier.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in bert_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01,
        'lr':
        args.learning_rate
    }, {
        'params': [
            p for n, p in bert_param_optimizer
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0,
        'lr':
        args.learning_rate
    }, {
        'params': [
            p for n, p in crf_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01,
        'lr':
        0.001
    }, {
        'params':
        [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0,
        'lr':
        0.001
    }, {
        'params': [
            p for n, p in linear_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01,
        'lr':
        0.001
    }, {
        'params': [
            p for n, p in linear_param_optimizer
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0,
        'lr':
        0.001
    }]
    if args.optimizer == 'adam':
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    else:
        raise ValueError("unknown optimizer")
    lr_scheduler = BERTReduceLROnPlateau(optimizer,
                                         lr=args.learning_rate,
                                         mode=args.mode,
                                         factor=0.5,
                                         patience=1,
                                         verbose=1,
                                         epsilon=1e-8,
                                         cooldown=0,
                                         min_lr=0,
                                         eps=1e-8)

    model_checkpoint = ModelCheckpoint(checkpoint_dir=args.model_path,
                                       mode=args.mode,
                                       monitor=args.monitor,
                                       arch=args.arch,
                                       save_best_only=args.save_best)

    # **************************** training model ***********************
    logger.info("***** Running training *****")
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    tb_logger = Tensorboard_Logger(
        log_dir=os.path.join(args.model_path, config['output']))

    trainer = Trainer(
        n_gpu=args.n_gpu,
        model=model,
        logger=logger,
        tb_logger=tb_logger,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        label2id=label2id,
        grad_clip=args.grad_clip,
        model_checkpoint=model_checkpoint,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        early_stopping=early_stopping,
        partial=args.partial,
        trigger=args.trigger)

    trainer.train(train_data=train_dataloader,
                  valid_data=valid_dataloader,
                  epochs=args.epochs,
                  seed=args.seed)
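
A toy illustration of the no_decay filter used when building optimizer_grouped_parameters above: parameter names containing any no_decay substring fall into the weight_decay=0.0 groups, everything else keeps weight decay.

# Hypothetical parameter names, for illustration only.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
names = ['encoder.layer.0.attention.self.query.weight',
         'encoder.layer.0.attention.self.query.bias',
         'encoder.layer.0.output.LayerNorm.weight']
decayed = [n for n in names if not any(nd in n for nd in no_decay)]
print(decayed)  # only the query projection weight keeps weight decay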