Example #1
    random.seed(config.seed)
    np.random.seed(config.seed)

    # tokenizers
    special_token_dict = {
        "speaker1_token": "<speaker1>",
        "speaker2_token": "<speaker2>"
    }
    if config.tokenizer == "ws":
        tokenizer = WhiteSpaceTokenizer(word_count_path=config.word_count_path,
                                        vocab_size=config.vocab_size,
                                        special_token_dict=special_token_dict)
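    # separate whitespace tokenizer with a large vocabulary, used by the sentence-level evaluation metrics below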
    eval_tokenizer = WhiteSpaceTokenizer(config.word_count_path, 100000)

    # data loaders
    intrinsic_stat_reporter = StatisticsReporter()
    with open(config.dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)
    mlog("----- Loading test data -----")
    test_data_source = DataSource(data=dataset["test"],
                                  config=config,
                                  tokenizer=tokenizer)
    mlog(str(test_data_source.statistics))

    # metrics calculator
    metrics = SentenceMetrics(config.eval_word_embedding_path, eval_tokenizer)
    evaluator_config = ConfigFromDict({
        "model_path": config.evaluator_model_path,
        "model_size": "large",
    })
    evaluator_tokenizer = ModRobertaTokenizer(
Example #2
    torch.cuda.manual_seed(config.seed)
    random.seed(config.seed)
    np.random.seed(config.seed)

    # tokenizers
    special_token_dict = {}
    if config.tokenizer == "ws":
        tokenizer = WhiteSpaceTokenizer(
            word_count_path=config.word_count_path,
            vocab_size=config.vocab_size,
            special_token_dict=special_token_dict
        )

    # data loaders & number reporters
    trn_reporter = StatisticsReporter()
    dev_reporter = StatisticsReporter()
    with open(config.dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)
    mlog("----- Loading training data -----")
    train_data_source = DataSource(
        data=dataset["train"],
        config=config,
        tokenizer=tokenizer
    )
    mlog(str(train_data_source.statistics))
    mlog("----- Loading dev data -----")
    dev_data_source = DataSource(
        data=dataset["dev"],
        config=config,
        tokenizer=tokenizer
Example #3
def run_train(config):

    # tokenizers
    tokenizer = ModBertTokenizer('base', cache_dir=config.cache_dir)
    label_token_dict = {
        f"label_{label_idx}_token": label
        for label_idx, label in enumerate(config.joint_da_seg_recog_labels)
    }
    label_token_dict.update({
        "pad_token": "<pad>",
        "bos_token": "<t>",
        "eos_token": "</t>"
    })
    label_tokenizer = CustomizedTokenizer(token_dict=label_token_dict)

    # metrics calculator
    metrics = DAMetrics()

    # define logger
    MODEL_NAME = config.model
    LOG_FILE_NAME = "{}.seed_{}.{}".format(
        MODEL_NAME, config.seed,
        time.strftime("%Y%m%d-%H%M%S", time.localtime())[-6:])
    if config.filename_note:
        LOG_FILE_NAME += f".{config.filename_note}"
        experiment.set_name(config.filename_note)
    experiment.log_text(LOG_FILE_NAME)

    # data loaders & number reporters
    trn_reporter = StatisticsReporter()
    dev_reporter = StatisticsReporter()
    mlog("----- Loading training data -----", config, LOG_FILE_NAME)
    train_data_source = SpeechDataSource(split="train",
                                         config=config,
                                         tokenizer=tokenizer,
                                         label_tokenizer=label_tokenizer)
    mlog(str(train_data_source.statistics), config, LOG_FILE_NAME)

    mlog("----- Loading dev data -----", config, LOG_FILE_NAME)
    dev_data_source = SpeechDataSource(split="dev",
                                       config=config,
                                       tokenizer=tokenizer,
                                       label_tokenizer=label_tokenizer)
    mlog(str(dev_data_source.statistics), config, LOG_FILE_NAME)

    # build model
    if config.model == "bert_attn_ed":
        Model = BertAttnEDSeqLabeler
    elif config.model == "speech_attn_ed":
        Model = SpeechAttnEDSeqLabeler
    else:
        print("no model specified")
        exit(1)
    model = Model(config, tokenizer, label_tokenizer, freeze=config.freeze)

    # model adaptation
    if torch.cuda.is_available():
        mlog("----- Using GPU -----", config, LOG_FILE_NAME)
        model = model.cuda()
    if config.model_path:
        model.load_model(config.model_path)
        mlog("----- Model loaded -----", config, LOG_FILE_NAME)
        mlog(f"model path: {config.model_path}", config, LOG_FILE_NAME)

    trainable_parameters = [
        param for param in model.parameters() if param.requires_grad
    ]
    total_params_count = sum([x.numel() for x in trainable_parameters])
    print("Total params count: ", total_params_count)

    # Build optimizer
    optimizer = optim.AdamW(model.parameters(),
                            lr=config.init_lr,
                            weight_decay=config.l2_penalty)

    # Build lr scheduler
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        mode="min",
        factor=config.lr_decay_rate,
        patience=2,
    )

    # log hyper parameters
    start_time = time.time()
    mlog("----- Hyper-parameters -----", config, LOG_FILE_NAME)
    for k, v in sorted(dict(config.__dict__).items()):
        mlog("{}: {}".format(k, v), config, LOG_FILE_NAME)
    for name, param in model.named_parameters():
        mlog(
            "{}: {}; Grad: {}".format(name, param.size(), param.requires_grad),
            config, LOG_FILE_NAME)

    # data stats for batching
    train_dialogs_keys = train_data_source.dialog_keys
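    # keep a copy of the dialog keys so they can be reshuffled each epoch without touching the data source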
    shuffle_dialogs = train_dialogs_keys[:]

    # TRAIN
    n_step = 0
    best_score = -9999
    for epoch in range(1, config.n_epochs + 1):
        lr = list(lr_scheduler.optimizer.param_groups)[0]["lr"]
        if lr <= config.min_lr:
            break

        random.shuffle(shuffle_dialogs)
        n_batch = 0
        for dialog_idx in shuffle_dialogs:
            if config.frame_features:
                dialog_frames = train_data_source.load_frames(dialog_idx)
            else:
                dialog_frames = []
            dialog_length = train_data_source.get_dialog_length(dialog_idx)
            turn_keys = list(range(dialog_length))
            random.shuffle(turn_keys)

            if config.debug and n_step > 30:
                break

            for offset in range(0, dialog_length, config.batch_size):
                model.zero_grad()
                model.train()
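                # take the next batch_size shuffled turn indices from this dialog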
                turn_idx = turn_keys[offset:offset + config.batch_size]
                batch_data = train_data_source.get_batch_features(
                    dialog_idx, dialog_frames, turn_idx)

                # Forward
                ret_data, ret_stat = model.train_step(batch_data)

                # Backward
                loss = ret_data["loss"]
                loss.backward()
                if config.gradient_clip > 0.0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   config.gradient_clip)
                optimizer.step()
                optimizer.zero_grad()

                # update
                trn_reporter.update_data(ret_stat)

                # Check training loss
                if n_step > 0 and n_step % config.check_loss_after_n_step == 0:
                    log_s = f"{time.time()-start_time:.2f}s Epoch {epoch} batch {n_batch} - "
                    log_s += trn_reporter.to_string()
                    mlog(log_s, config, LOG_FILE_NAME)
                    trn_reporter.clear()

                # evaluate
                if n_step > 0 and n_step % config.validate_after_n_step == 0:
                    model.eval()

                    log_s = f"<Dev> learning rate: {lr}\n"
                    mlog(log_s, config, LOG_FILE_NAME)

                    current_score, metrics_results, dev_reporter = eval_split(
                        model,
                        dev_data_source,
                        "dev",
                        config,
                        label_tokenizer,
                        metrics,
                        LOG_FILE_NAME,
                        dev_reporter=dev_reporter,
                        write_pred=False)
                    if not config.debug:
                        experiment.log_metrics(metrics_results)

                    # Save model if it has better monitor measurement
                    if current_score > best_score:
                        best_score = current_score
                        if config.save_model:
                            this_model_path = f"{config.model_save_path}/model"
                            if not os.path.exists(this_model_path):
                                os.makedirs(this_model_path)

                            torch.save(
                                model.state_dict(),
                                f"{this_model_path}/{LOG_FILE_NAME}.model.pt")
                            mlog(
                                f"model saved to {this_model_path}/{LOG_FILE_NAME}.model.pt",
                                config, LOG_FILE_NAME)

                            #if torch.cuda.is_available():
                            #    model = model.cuda()

                    # Decay learning rate
                    lr_scheduler.step(dev_reporter.get_value("monitor"))
                    dev_reporter.clear()

                # Finished a step
                n_batch += 1
                n_step += 1

    # Evaluate on test dataset at the end of training
    mlog("----- EVALUATING at end of training -----", config, LOG_FILE_NAME)
    mlog("----- Loading test data -----", config, LOG_FILE_NAME)
    test_data_source = SpeechDataSource(split='test',
                                        config=config,
                                        tokenizer=tokenizer,
                                        label_tokenizer=label_tokenizer)
    mlog(str(test_data_source.statistics), config, LOG_FILE_NAME)
    if config.save_model:
        model_path = f"{this_model_path}/{LOG_FILE_NAME}.model.pt"
        model.load_model(model_path)
        print(f"model path: {model_path}")
    model.eval()

    #if config.debug:
    #    exit(0)

    for set_name, data_source in [("DEV", dev_data_source),
                                  ("TEST", test_data_source)]:
        current_score, metrics_results, dev_reporter = eval_split(
            model,
            data_source,
            set_name,
            config,
            label_tokenizer,
            metrics,
            LOG_FILE_NAME,
            dev_reporter=None,
            write_pred=True)

        lazy_s = f"DSER, DER, F1, LWER:\n {100*metrics_results['DSER']}\t{100*metrics_results['DER']}\t{100*metrics_results['Macro F1']}\t\t{100*metrics_results['Macro LWER']}\n"
        mlog(lazy_s, config, LOG_FILE_NAME)
Example #4
def run_train(config):
    # tokenizers
    tokenizer = ModBertTokenizer('base', cache_dir=config.cache_dir)
    label_token_dict = {
        "pad_token": "<pad>",
        "bos_token": "<t>",
        "eos_token": "</t>",
    }
    label_token_dict.update({
        f"label_{label_idx}_token": label
        for label_idx, label in enumerate(config.joint_da_seg_recog_labels)
    })
    label_tokenizer = CustomizedTokenizer(token_dict=label_token_dict)

    # metrics calculator
    metrics = DAMetrics()

    # define logger
    MODEL_NAME = config.model
    LOG_FILE_NAME = "{}.seed_{}.{}".format(
        MODEL_NAME, config.seed,
        time.strftime("%Y%m%d-%H%M%S", time.localtime())[-6:])
    if config.filename_note:
        LOG_FILE_NAME += f".{config.filename_note}"
        experiment.set_name(config.filename_note)
    experiment.log_text(LOG_FILE_NAME)

    # data loaders & number reporters
    trn_reporter = StatisticsReporter()
    mlog("----- Loading dev data -----", config, LOG_FILE_NAME)
    dev_data_source = SpeechXTSource(split="dev",
                                     config=config,
                                     tokenizer=tokenizer,
                                     label_tokenizer=label_tokenizer)
    mlog(str(dev_data_source.statistics), config, LOG_FILE_NAME)

    mlog("----- Loading training data -----", config, LOG_FILE_NAME)
    if config.debug:
        train_data_source = dev_data_source
    else:
        train_data_source = SpeechXTSource(split="train",
                                           config=config,
                                           tokenizer=tokenizer,
                                           label_tokenizer=label_tokenizer)
    mlog(str(train_data_source.statistics), config, LOG_FILE_NAME)

    # build model
    if config.model == 'speech_xt':
        model = SpeechTransformerLabeler(config,
                                         tokenizer,
                                         label_tokenizer,
                                         freeze=config.freeze)
    elif config.model == 'speech_bl':
        model = SpeechBaselineLabeler(config,
                                      tokenizer,
                                      label_tokenizer,
                                      freeze=config.freeze)
    else:
        print("No model specified, exiting")
        exit(1)

    # model adaptation
    if torch.cuda.is_available():
        mlog("----- Using GPU -----", config, LOG_FILE_NAME)
        model = model.cuda()
    if config.model_path:
        model.load_model(config.model_path)
        mlog("----- Model loaded -----", config, LOG_FILE_NAME)
        mlog(f"model path: {config.model_path}", config, LOG_FILE_NAME)

    this_model_path = f"{config.model_save_path}/model"

    # Build optimizer
    trainable_parameters = [
        param for param in model.named_parameters() if param[1].requires_grad
    ]
    total_params_count = sum([x[1].numel() for x in trainable_parameters])
    print("Total params count: ", total_params_count)

    warmup_steps = math.ceil(train_data_source.statistics['n_turns'] *
                             config.n_epochs / config.batch_size *
                             0.1)  # warm up over ~10% of the estimated training steps
    # additional steps because of different loading schemes
    t_total = math.ceil(1.5 * train_data_source.statistics['n_turns'] *
                        config.n_epochs / config.batch_size)
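    # exclude bias and LayerNorm parameters from weight decay (standard practice for transformer fine-tuning)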
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in trainable_parameters
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        config.lr_decay_rate
    }, {
        'params': [
            p for n, p in trainable_parameters
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = transformers.AdamW(
        optimizer_grouped_parameters,
        lr=config.init_lr,
        weight_decay=config.lr_decay_rate,
        correct_bias=False,
    )

    # Build lr scheduler
    #lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #    optimizer=optimizer,
    #    mode="min",
    #    factor=config.lr_decay_rate,
    #    patience=2,
    #)
    print("warmup/total steps:", warmup_steps, t_total)
    lr_scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    # log hyper parameters
    start_time = time.time()
    mlog("----- Hyper-parameters -----", config, LOG_FILE_NAME)
    for k, v in sorted(dict(config.__dict__).items()):
        mlog("{}: {}".format(k, v), config, LOG_FILE_NAME)
    for name, param in model.named_parameters():
        mlog(
            "{}: {}; Grad: {}".format(name, param.size(), param.requires_grad),
            config, LOG_FILE_NAME)

    # data stats for batching
    train_dialogs_keys = train_data_source.dialog_keys
    shuffle_dialogs = train_dialogs_keys[:]

    # TRAIN
    n_step = 0
    best_score = -9999
    best_loss = np.inf
    for epoch in range(1, config.n_epochs + 1):
        #lr = list(lr_scheduler.optimizer.param_groups)[0]["lr"]
        #if lr <= config.min_lr:
        #    break
        lr = lr_scheduler.get_last_lr()[0]

        random.shuffle(shuffle_dialogs)
        n_batch = 0
        for dialog_idx in shuffle_dialogs:
            if config.frame_features:
                dialog_frames = train_data_source.load_frames(dialog_idx)
            else:
                dialog_frames = []
            dialog_length = train_data_source.get_dialog_length(dialog_idx)
            turn_keys = list(range(dialog_length))
            random.shuffle(turn_keys)

            #if config.debug and n_step > 30:
            #    break

            for offset in range(0, dialog_length, config.batch_size):
                model.zero_grad()
                model.train()
                turn_idx = turn_keys[offset:offset + config.batch_size]
                batch_data = train_data_source.get_batch_features(
                    dialog_idx, dialog_frames, turn_idx)

                # Forward
                ret_data, ret_stat = model.train_step(batch_data)

                # Backward
                loss = ret_data["loss"]
                loss.backward()
                if config.gradient_clip > 0.0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   config.gradient_clip)
                optimizer.step()
                optimizer.zero_grad()
                lr_scheduler.step()

                # update
                trn_reporter.update_data(ret_stat)

                # Check training loss
                if n_step > 0 and n_step % config.check_loss_after_n_step == 0:
                    log_s = f"{time.time()-start_time:.2f}s Epoch {epoch} batch {n_batch} step {n_step} - Training loss on this batch: "
                    log_s += trn_reporter.to_string()
                    mlog(log_s, config, LOG_FILE_NAME)
                    trn_reporter.clear()

                # evaluate
                if n_step > 0 and n_step % config.validate_after_n_step == 0:
                    model.eval()

                    log_s = f"<Dev> learning rate: {lr}\n"
                    mlog(log_s, config, LOG_FILE_NAME)

                    current_score, metrics_results, split_loss = eval_split(
                        model, dev_data_source, "dev",
                        config, label_tokenizer, metrics,
                        LOG_FILE_NAME, write_pred=False)
                    print("Split loss & best loss ", split_loss, best_loss)
                    print("Split score & best score ", current_score,
                          best_score)
                    if not config.debug:
                        experiment.log_metrics(metrics_results)

                    if current_score > best_score:
                        best_score = current_score
                    # Save model if it has better monitor measurement
                    if split_loss < best_loss:
                        best_loss = split_loss
                        if config.save_model:
                            this_model_path = f"{config.model_save_path}/model"
                            if not os.path.exists(this_model_path):
                                os.makedirs(this_model_path)

                            torch.save(
                                model.state_dict(),
                                f"{this_model_path}/{LOG_FILE_NAME}.model.pt")
                            torch.save(
                                config,
                                f"{this_model_path}/{LOG_FILE_NAME}.config")
                            mlog(
                                f"model saved to {this_model_path}/{LOG_FILE_NAME}.model.pt",
                                config, LOG_FILE_NAME)

                # Finished a step
                n_batch += 1
                n_step += 1

        mlog("----- EVALUATING at end of epoch -----", config, LOG_FILE_NAME)
        mlog(f"End of epoch: {epoch}", config, LOG_FILE_NAME)
        current_score, metrics_results, split_loss = eval_split(
            model,
            dev_data_source,
            "dev",
            config,
            label_tokenizer,
            metrics,
            LOG_FILE_NAME,
            write_pred=False)
        print("Split loss & best loss ", split_loss, best_loss)
        print("Split score & best score ", current_score, best_score)
        if not config.debug:
            experiment.log_metrics(metrics_results)
        if current_score > best_score:
            best_score = current_score
        if split_loss < best_loss:
            best_loss = split_loss
            if config.save_model:
                torch.save(model.state_dict(),
                           f"{this_model_path}/{LOG_FILE_NAME}.model.pt")
                torch.save(config, f"{this_model_path}/{LOG_FILE_NAME}.config")
                mlog(
                    f"model saved to {this_model_path}/{LOG_FILE_NAME}.model.pt",
                    config, LOG_FILE_NAME)

        # Decay learning rate at end of epoch
        #lr_scheduler.step(best_loss)

    # Evaluate on test dataset at the end of training
    mlog("----- EVALUATING at end of training -----", config, LOG_FILE_NAME)
    mlog("----- Loading test data -----", config, LOG_FILE_NAME)
    test_data_source = SpeechXTSource(split='test',
                                      config=config,
                                      tokenizer=tokenizer,
                                      label_tokenizer=label_tokenizer)
    mlog(str(test_data_source.statistics), config, LOG_FILE_NAME)
    if config.save_model:
        model_path = f"{this_model_path}/{LOG_FILE_NAME}.model.pt"
        model.load_model(model_path)
        print(f"model path: {model_path}")
    model.eval()

    for set_name, data_source in [("DEV", dev_data_source),
                                  ("TEST", test_data_source)]:
        current_score, metrics_results, split_loss = eval_split(
            model,
            data_source,
            set_name,
            config,
            label_tokenizer,
            metrics,
            LOG_FILE_NAME,
            write_pred=True)
        print("Split loss: ", split_loss)
        diff = (metrics_results['Macro F1'] - metrics_results['DER']) * 100

        lazy_s = f"DSER, DER, F1, LWER:\n {100*metrics_results['DSER']}\t{100*metrics_results['DER']}\t{100*metrics_results['Macro F1']}\t{diff}\t{100*metrics_results['Macro LWER']}\n"
        mlog(lazy_s, config, LOG_FILE_NAME)
Example #5
def run_train(config):
    # tokenizers
    special_token_dict = {
        "speaker1_token": "<speaker1>",
        "speaker2_token": "<speaker2>"
    }
    tokenizer = WhiteSpaceTokenizer(word_count_path=config.word_count_path,
                                    vocab_size=config.vocab_size,
                                    special_token_dict=special_token_dict)
    label_token_dict = {
        f"label_{label_idx}_token": label
        for label_idx, label in enumerate(config.joint_da_seg_recog_labels)
    }
    label_token_dict.update({
        "pad_token": "<pad>",
        "bos_token": "<t>",
        "eos_token": "</t>"
    })
    label_tokenizer = CustomizedTokenizer(token_dict=label_token_dict)

    # metrics calculator
    metrics = DAMetrics()

    # define logger
    MODEL_NAME = config.model
    LOG_FILE_NAME = "{}.seed_{}.{}".format(
        MODEL_NAME, config.seed,
        time.strftime("%Y%m%d-%H%M%S", time.localtime()))
    if config.filename_note:
        LOG_FILE_NAME += f".{config.filename_note}"

    # data loaders & number reporters
    trn_reporter = StatisticsReporter()
    dev_reporter = StatisticsReporter()
    with open(config.dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)
    mlog("----- Loading training data -----", config, LOG_FILE_NAME)
    train_data_source = DataSource(data=dataset["train"],
                                   config=config,
                                   tokenizer=tokenizer,
                                   label_tokenizer=label_tokenizer)
    mlog(str(train_data_source.statistics), config, LOG_FILE_NAME)

    mlog("----- Loading dev data -----", config, LOG_FILE_NAME)
    dev_data_source = DataSource(data=dataset["dev"],
                                 config=config,
                                 tokenizer=tokenizer,
                                 label_tokenizer=label_tokenizer)
    mlog(str(dev_data_source.statistics), config, LOG_FILE_NAME)

    # build model
    if config.model == "ed":
        Model = EDSeqLabeler
    elif config.model == "attn_ed":
        Model = AttnEDSeqLabeler
    else:
        print("no model specified")
        exit(1)
    model = Model(config, tokenizer, label_tokenizer)

    # model adaptation
    if torch.cuda.is_available():
        mlog("----- Using GPU -----", config, LOG_FILE_NAME)
        model = model.cuda()
    if config.model_path:
        model.load_model(config.model_path)
        mlog("----- Model loaded -----", config, LOG_FILE_NAME)
        mlog(f"model path: {config.model_path}", config, LOG_FILE_NAME)

    # Build optimizer
    optimizer = optim.AdamW(model.parameters(),
                            lr=config.init_lr,
                            weight_decay=config.l2_penalty)

    # Build lr scheduler
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        mode="min",
        factor=config.lr_decay_rate,
        patience=2,
    )

    # log hyper parameters
    start_time = time.time()
    mlog("----- Hyper-parameters -----", config, LOG_FILE_NAME)
    for k, v in sorted(dict(config.__dict__).items()):
        mlog("{}: {}".format(k, v), config, LOG_FILE_NAME)

    # here we go
    n_step = 0
    for epoch in range(1, config.n_epochs + 1):
        lr = list(lr_scheduler.optimizer.param_groups)[0]["lr"]
        if lr <= config.min_lr:
            break

        # Train
        n_batch = 0
        train_data_source.epoch_init(shuffle=True)
        while True:
            batch_data = train_data_source.next(config.batch_size)
            if batch_data is None:
                break

            # Forward
            model.train()
            ret_data, ret_stat = model.train_step(batch_data)

            # Backward
            loss = ret_data["loss"]
            loss.backward()
            if config.gradient_clip > 0.0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.gradient_clip)
            optimizer.step()
            optimizer.zero_grad()

            # update
            trn_reporter.update_data(ret_stat)

            # Check loss
            if n_step > 0 and n_step % config.check_loss_after_n_step == 0:
                log_s = f"{time.time()-start_time:.2f}s Epoch {epoch} batch {n_batch} - "
                log_s += trn_reporter.to_string()
                mlog(log_s, config, LOG_FILE_NAME)
                trn_reporter.clear()

            # Evaluate on dev dataset
            if n_step > 0 and n_step % config.validate_after_n_step == 0:
                model.eval()

                log_s = f"<Dev> learning rate: {lr}\n"
                mlog(log_s, config, LOG_FILE_NAME)

                pred_labels, true_labels = [], []
                dev_data_source.epoch_init(shuffle=False)
                while True:
                    batch_data = dev_data_source.next(config.eval_batch_size)
                    if batch_data is None:
                        break

                    ret_data, ret_stat = model.evaluate_step(batch_data)
                    dev_reporter.update_data(ret_stat)
                    ret_data, ret_stat = model.test_step(batch_data)

                    refs = batch_data["Y"][:, 1:].tolist()
                    hyps = ret_data["symbols"].tolist()
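                    # convert label ids back to label strings, truncating both sequences at the reference <eos>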
                    for true_label_ids, pred_label_ids in zip(refs, hyps):
                        end_idx = true_label_ids.index(
                            label_tokenizer.eos_token_id)
                        true_labels.append([
                            label_tokenizer.id2word[label_id]
                            for label_id in true_label_ids[:end_idx]
                        ])
                        pred_labels.append([
                            label_tokenizer.id2word[label_id]
                            for label_id in pred_label_ids[:end_idx]
                        ])

                log_s = f"\n<Dev> - {time.time()-start_time:.3f}s - "
                log_s += dev_reporter.to_string()
                mlog(log_s, config, LOG_FILE_NAME)
                metrics_results = metrics.batch_metrics(
                    true_labels, pred_labels)
                experiment.log_metrics(metrics_results)
                log_s = \
                    f"\tDSER:            {100*metrics_results['DSER']:.2f}\n" \
                    f"\tseg WER:         {100*metrics_results['strict segmentation error']:.2f}\n" \
                    f"\tDER:             {100*metrics_results['DER']:.2f}\n" \
                    f"\tjoint WER:       {100*metrics_results['strict joint error']:.2f}\n" \
                    f"\tMacro F1:        {100*metrics_results['Macro F1']:.2f}\n" \
                    f"\tMicro F1:        {100*metrics_results['Micro F1']:.2f}\n" \
                    f"\tMacro LWER:      {100*metrics_results['Macro LWER']:.2f}\n" \
                    f"\tMicro LWER:      {100*metrics_results['Micro LWER']:.2f}\n"
                mlog(log_s, config, LOG_FILE_NAME)

                # Save the current model checkpoint
                if config.save_model:
                    if not os.path.exists(f"{config.task_data_dir}/model/"):
                        os.makedirs(f"{config.task_data_dir}/model/")

                    torch.save(
                        model.state_dict(),
                        f"{config.task_data_dir}/model/{LOG_FILE_NAME}.model.pt"
                    )
                    mlog(
                        f"model saved to {config.task_data_dir}/model/{LOG_FILE_NAME}.model.pt",
                        config, LOG_FILE_NAME)

                    if torch.cuda.is_available():
                        model = model.cuda()

                # Decay learning rate
                lr_scheduler.step(dev_reporter.get_value("monitor"))
                dev_reporter.clear()

            # Finished a step
            n_batch += 1
            n_step += 1

    # Evaluate on test dataset at the end of training
    mlog("----- EVALUATING at end of training -----", config, LOG_FILE_NAME)
    mlog("----- Loading test data -----", config, LOG_FILE_NAME)
    test_data_source = DataSource(data=dataset["test"],
                                  config=config,
                                  tokenizer=tokenizer,
                                  label_tokenizer=label_tokenizer)
    mlog(str(test_data_source.statistics), config, LOG_FILE_NAME)
    model.eval()

    for set_name, data_source in [("DEV", dev_data_source),
                                  ("TEST", test_data_source)]:
        pred_labels, true_labels = [], []
        data_source.epoch_init(shuffle=False)
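        # write reference and predicted label sequences side by side to a per-split results file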
        RES_FILE_NAME = set_name + "_" + LOG_FILE_NAME
        s = "LABELS\tPREDS"
        reslog(s, RES_FILE_NAME)
        while True:
            batch_data = data_source.next(config.eval_batch_size)
            if batch_data is None:
                break

            ret_data, ret_stat = model.test_step(batch_data)

            refs = batch_data["Y"][:, 1:].tolist()
            hyps = ret_data["symbols"].tolist()
            for true_label_ids, pred_label_ids in zip(refs, hyps):
                end_idx = true_label_ids.index(label_tokenizer.eos_token_id)
                true_syms = [
                    label_tokenizer.id2word[label_id]
                    for label_id in true_label_ids[:end_idx]
                ]
                pred_syms = [
                    label_tokenizer.id2word[label_id]
                    for label_id in pred_label_ids[:end_idx]
                ]
                s = " ".join(true_syms) + "\t" + " ".join(pred_syms)
                reslog(s, RES_FILE_NAME)
                true_labels.append(true_syms)
                pred_labels.append(pred_syms)

        log_s = f"\n<{set_name}> - {time.time()-start_time:.3f}s - "
        mlog(log_s, config, LOG_FILE_NAME)
        metrics_results = metrics.batch_metrics(true_labels, pred_labels)
        log_s = \
            f"\tDSER:            {100*metrics_results['DSER']:.2f}\n" \
            f"\tseg WER:         {100*metrics_results['strict segmentation error']:.2f}\n" \
            f"\tDER:             {100*metrics_results['DER']:.2f}\n" \
            f"\tjoint WER:       {100*metrics_results['strict joint error']:.2f}\n" \
            f"\tMacro F1:        {100*metrics_results['Macro F1']:.2f}\n" \
            f"\tMicro F1:        {100*metrics_results['Micro F1']:.2f}\n" \
            f"\tMacro LWER:      {100*metrics_results['Macro LWER']:.2f}\n" \
            f"\tMicro LWER:      {100*metrics_results['Micro LWER']:.2f}\n"
        mlog(log_s, config, LOG_FILE_NAME)