def main(args):
    """Run the BERT + GAN out-of-domain (OOD) detection experiment.

    Loads the BERT and data configuration, builds the dataset processor and
    the Generator / Discriminator / BERT encoder, then runs the phases
    selected by ``args.do_train``, ``args.do_eval`` and ``args.do_test`` and
    finally dumps the collected statistics under ``args.output_dir``.

    Args:
        args: argument namespace. The setup part reads ``seed``, ``model``,
            ``mode``, ``maxlen``, ``minlen``, ``ood``, ``bert_type``,
            ``dataset``, ``data_file``, ``output_dir`` and ``fine_tune``.

    Raises:
        ValueError: if ``args.dataset`` is neither ``'oos-eval'`` nor ``'smp'``.
    """
    logger.info('Checking...')
    cuda_available = torch.cuda.is_available()
    print('torch.cuda.is_available:', cuda_available)
    if cuda_available:
        # current_device() initialises CUDA and raises on CPU-only hosts,
        # so only query it when a device is actually present.
        print('torch.cuda.current_device:', torch.cuda.current_device())
    logger.info('device: {}'.format(device))
    logger.info('ood: {}'.format(args.ood))
    SEED = args.seed
    gross_result['seed'] = args.seed
    logger.info('seed: {}'.format(SEED))
    logger.info('model: {}'.format(args.model))
    check_manual_seed(SEED)
    check_args(args)
    logger.info('mode: {}'.format(args.mode))
    logger.info('maxlen: {}'.format(args.maxlen))
    logger.info('minlen: {}'.format(args.minlen))

    logger.info('Loading config...')
    bert_config = Config('config/bert.ini')
    bert_config = bert_config(args.bert_type)

    # for oos-eval dataset
    data_config = Config('config/data.ini')
    data_config = data_config(args.dataset)

    # Prepare data processor
    # Join the data directory and the file name into a single path.
    data_path = os.path.join(data_config['DataDir'], data_config[args.data_file])
    label_path = data_path.replace('.json', '.label')

    with open(data_path, 'r', encoding='utf-8') as fp:
        source = json.load(fp)

    # Per-split statistics: sample count, OOD ('chat' domain) vs in-domain
    # counts, and the histogram of text lengths sorted by length.
    for split_name in source:
        n_ood = 0
        n_id = 0
        text_len = {}
        for line in source[split_name]:
            if line['domain'] == 'chat':
                n_ood += 1
            else:
                n_id += 1
            length = len(line['text'])
            text_len[length] = text_len.get(length, 0) + 1
        print(split_name, n_ood + n_id)
        print('ood', n_ood)
        print('id', n_id)
        print(sorted(text_len.items(), key=lambda d: d[0], reverse=False))

    if args.dataset == 'oos-eval':
        processor = OOSProcessor(bert_config, maxlen=32)
    elif args.dataset == 'smp':
        if args.mode == -1:
            processor = SMPProcessor(bert_config, maxlen=32)
            print('processor')
        else:
            processor = SMPProcessor_v2(bert_config, maxlen=32)
            print('processor_v2')
    else:
        raise ValueError('The dataset {} is not supported.'.format(args.dataset))

    processor.load_label(label_path)  # Adding label_to_id and id_to_label to processor.

    n_class = len(processor.id_to_label)
    print('label: ', processor.id_to_label)
    # vars() returns the namespace's own __dict__, so the keys added below
    # also become attributes of ``args``.
    config = vars(args)
    config['gan_save_path'] = os.path.join(args.output_dir, 'save', 'gan.pt')
    config['bert_save_path'] = os.path.join(args.output_dir, 'save', 'bert.pt')
    config['n_class'] = n_class

    logger.info('config:')
    logger.info(config)

    # The GAN architecture is chosen by name from the ``model`` package.
    model = import_module('model.' + args.model)

    D = model.Discriminator(config)
    G = model.Generator(config)
    E = BertModel.from_pretrained(bert_config['PreTrainModelDir'])  # Bert encoder

    # The encoder is trainable only when fine-tuning was requested.
    fine_tune = bool(args.fine_tune)
    for param in E.parameters():
        param.requires_grad = fine_tune

    D.to(device)
    G.to(device)
    E.to(device)

    global_step = 0

    def train(train_dataset, dev_dataset):
        train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=2)

        global best_dev
        nonlocal global_step
        n_sample = len(train_dataloader)
        early_stopping = EarlyStopping(args.patience, logger=logger)
        # Loss function
        adversarial_loss = torch.nn.BCELoss().to(device)
        classified_loss = torch.nn.CrossEntropyLoss().to(device)

        # Optimizers
        optimizer_G = torch.optim.Adam(G.parameters(), lr=args.G_lr)  # optimizer for generator
        optimizer_D = torch.optim.Adam(D.parameters(), lr=args.D_lr)  # optimizer for discriminator
        optimizer_E = AdamW(E.parameters(), args.bert_lr)

        G_total_train_loss = []
        D_total_fake_loss = []
        D_total_real_loss = []
        FM_total_train_loss = []
        D_total_class_loss = []
        valid_detection_loss = []
        valid_oos_ind_precision = []
        valid_oos_ind_recall = []
        valid_oos_ind_f_score = []

        all_features = []
        result = dict()

        for i in range(args.n_epoch):

            # Initialize model state
            G.train()
            D.train()
            E.train()

            G_train_loss = 0
            D_fake_loss = 0
            D_real_loss = 0
            FM_train_loss = 0
            D_class_loss = 0

            for sample in tqdm.tqdm(train_dataloader):
                sample = (i.to(device) for i in sample)
                token, mask, type_ids, y = sample
                batch = len(token)

                ood_sample = (y==0.0)
                # weight = torch.ones(len(ood_sample)).to(device) - ood_sample * args.beta
                # real_loss_func = torch.nn.BCELoss(weight=weight).to(device)

                # the label used to train generator and discriminator.
                valid_label = FloatTensor(batch, 1).fill_(1.0).detach()
                fake_label = FloatTensor(batch, 1).fill_(0.0).detach()

                optimizer_E.zero_grad()
                sequence_output, pooled_output = E(token, mask, type_ids)
                real_feature = pooled_output

                # train D on real
                optimizer_D.zero_grad()
                real_f_vector, discriminator_output, classification_output = D(real_feature, return_feature=True)
                discriminator_output = discriminator_output.squeeze()
                real_loss = adversarial_loss(discriminator_output, (y != 0.0).float())
                # real_loss = real_loss_func(discriminator_output, (y != 0.0).float())
                if n_class > 2:  # 大于2表示除了训练判别器还要训练分类器
                    class_loss = classified_loss(classification_output, y.long())
                    real_loss += class_loss
                    D_class_loss += class_loss.detach()
                real_loss.backward()

                if args.do_vis:
                    all_features.append(real_f_vector.detach())

                # # train D on fake
                if args.model == 'lstm_gan' or args.model == 'cnn_gan':
                    z = FloatTensor(np.random.normal(0, 1, (batch, 32, args.G_z_dim))).to(device)
                else:
                    z = FloatTensor(np.random.normal(0, 1, (batch, args.G_z_dim))).to(device)
                fake_feature = G(z).detach()
                fake_discriminator_output = D.detect_only(fake_feature)
                # fake_loss = args.beta * adversarial_loss(fake_discriminator_output, fake_label)
                fake_loss = adversarial_loss(fake_discriminator_output, fake_label)
                fake_loss.backward()
                optimizer_D.step()

                if args.fine_tune:
                    optimizer_E.step()

                # train G
                optimizer_G.zero_grad()
                if args.model == 'lstm_gan' or args.model == 'cnn_gan':
                    z = FloatTensor(np.random.normal(0, 1, (batch, 32, args.G_z_dim))).to(device)
                else:
                    z = FloatTensor(np.random.normal(0, 1, (batch, args.G_z_dim))).to(device)
                fake_f_vector, D_decision = D.detect_only(G(z), return_feature=True)
                gd_loss = adversarial_loss(D_decision, valid_label)
                fm_loss = torch.abs(torch.mean(real_f_vector.detach(), 0) - torch.mean(fake_f_vector, 0)).mean()
                g_loss = gd_loss + 0 * fm_loss
                g_loss.backward()
                optimizer_G.step()

                global_step += 1

                D_fake_loss += fake_loss.detach()
                D_real_loss += real_loss.detach()
                G_train_loss += g_loss.detach() + fm_loss.detach()
                FM_train_loss += fm_loss.detach()

            # logger.info('[Epoch {}] Train: D_fake_loss: {}'.format(i, D_fake_loss / n_sample))
            # logger.info('[Epoch {}] Train: D_real_loss: {}'.format(i, D_real_loss / n_sample))
            # logger.info('[Epoch {}] Train: D_class_loss: {}'.format(i, D_class_loss / n_sample))
            # logger.info('[Epoch {}] Train: G_train_loss: {}'.format(i, G_train_loss / n_sample))
            # logger.info('[Epoch {}] Train: FM_train_loss: {}'.format(i, FM_train_loss / n_sample))
            # logger.info('---------------------------------------------------------------------------')

            D_total_fake_loss.append(D_fake_loss / n_sample)
            D_total_real_loss.append(D_real_loss / n_sample)
            D_total_class_loss.append(D_class_loss / n_sample)
            G_total_train_loss.append(G_train_loss / n_sample)
            FM_total_train_loss.append(FM_train_loss / n_sample)

            if dev_dataset:
                # logger.info('#################### eval result at step {} ####################'.format(global_step))
                eval_result = eval(dev_dataset)

                valid_detection_loss.append(eval_result['detection_loss'])
                valid_oos_ind_precision.append(eval_result['oos_ind_precision'])
                valid_oos_ind_recall.append(eval_result['oos_ind_recall'])
                valid_oos_ind_f_score.append(eval_result['oos_ind_f_score'])

                # 1 表示要保存模型
                # 0 表示不需要保存模型
                # -1 表示不需要模型,且超过了patience,需要early stop
                signal = early_stopping(-eval_result['eer'])
                if signal == -1:
                    break
                elif signal == 0:
                    pass
                elif signal == 1:
                    save_gan_model(D, G, config['gan_save_path'])
                    if args.fine_tune:
                        save_model(E, path=config['bert_save_path'], model_name='bert')

                # logger.info(eval_result)
                # logger.info('valid_eer: {}'.format(eval_result['eer']))
                # logger.info('valid_oos_ind_precision: {}'.format(eval_result['oos_ind_precision']))
                # logger.info('valid_oos_ind_recall: {}'.format(eval_result['oos_ind_recall']))
                # logger.info('valid_oos_ind_f_score: {}'.format(eval_result['oos_ind_f_score']))
                # logger.info('valid_auc: {}'.format(eval_result['auc']))
                # logger.info(
                #     'valid_fpr95: {}'.format(ErrorRateAt95Recall(eval_result['all_binary_y'], eval_result['y_score'])))

        if args.patience >= args.n_epoch:
            save_gan_model(D, G, config['gan_save_path'])
            if args.fine_tune:
                save_model(E, path=config['bert_save_path'], model_name='bert')

        freeze_data['D_total_fake_loss'] = D_total_fake_loss
        freeze_data['D_total_real_loss'] = D_total_real_loss
        freeze_data['D_total_class_loss'] = D_total_class_loss
        freeze_data['G_total_train_loss'] = G_total_train_loss
        freeze_data['FM_total_train_loss'] = FM_total_train_loss
        freeze_data['valid_real_loss'] = valid_detection_loss
        freeze_data['valid_oos_ind_precision'] = valid_oos_ind_precision
        freeze_data['valid_oos_ind_recall'] = valid_oos_ind_recall
        freeze_data['valid_oos_ind_f_score'] = valid_oos_ind_f_score

        best_dev = -early_stopping.best_score

        if args.do_vis:
            all_features = torch.cat(all_features, 0).cpu().numpy()
            result['all_features'] = all_features
        return result

    def eval(dataset):
        """Evaluate the current encoder and discriminator on ``dataset``.

        Args:
            dataset: dataset whose batches unpack to
                ``(token, mask, type_ids, y)`` and whose ``dataset`` attribute
                is indexed as ``[:, -1]`` to obtain the label column.

        Returns:
            dict: ``detection_loss``, ``eer``, ``all_detection_binary_preds``,
            ``detection_acc``, ``all_binary_y``, ``oos_ind_precision``,
            ``oos_ind_recall``, ``oos_ind_f_score``, ``y_score`` and ``auc``;
            additionally ``class_loss`` and ``class_acc`` when ``n_class > 2``.

        Side effects:
            Puts G, D and E into eval mode and stores the labels, binary
            predictions and scores in ``freeze_data``.
        """
        dev_dataloader = DataLoader(dataset, batch_size=args.predict_batch_size, shuffle=False, num_workers=2)
        result = dict()

        # Loss functions
        detection_loss_fn = torch.nn.BCELoss().to(device)
        classified_loss = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)

        G.eval()
        D.eval()
        E.eval()

        all_detection_preds = []
        all_class_preds = []

        for sample in tqdm.tqdm(dev_dataloader):
            # Labels are read from ``dataset.dataset`` below, so the batch
            # label is not needed here.
            token, mask, type_ids, _ = (t.to(device) for t in sample)

            # -------------------------evaluate D------------------------- #
            with torch.no_grad():
                # BERT encodes the sentence to a feature vector.
                _, pooled_output = E(token, mask, type_ids)
                real_feature = pooled_output

                if n_class > 2:
                    # More than two classes: the classifier head is evaluated
                    # in addition to the discriminator.
                    _, discriminator_output, classification_output = D(real_feature, return_feature=True)
                    all_detection_preds.append(discriminator_output)
                    all_class_preds.append(classification_output)
                else:
                    # Detection only.
                    _, discriminator_output = D.detect_only(real_feature, return_feature=True)
                    all_detection_preds.append(discriminator_output)

        all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
        all_binary_y = (all_y != 0).long()  # label 0 is oos
        all_detection_preds = torch.cat(all_detection_preds, 0).cpu()
        # reshape(-1) rather than squeeze(): a single-sample dataset would
        # otherwise collapse to a 0-d tensor and no longer match the labels.
        detection_scores = all_detection_preds.reshape(-1)
        all_detection_binary_preds = convert_to_int_by_threshold(detection_scores)

        # Compute the detection loss over the whole dataset.
        detection_loss = detection_loss_fn(detection_scores, all_binary_y.float())
        result['detection_loss'] = detection_loss

        if n_class > 2:
            class_one_hot_preds = torch.cat(all_class_preds, 0).detach().cpu()  # per-class scores
            class_loss = classified_loss(class_one_hot_preds, all_y)  # compute loss
            all_class_preds = torch.argmax(class_one_hot_preds, 1)  # predicted label
            class_acc = metrics.ind_class_accuracy(all_class_preds, all_y, oos_index=0)  # accuracy for ind class
            logger.info(metrics.classification_report(all_y, all_class_preds, target_names=processor.id_to_label))

        # report
        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(all_detection_binary_preds, all_binary_y)
        detection_acc = metrics.accuracy(all_detection_binary_preds, all_binary_y)

        y_score = detection_scores.tolist()
        eer = metrics.cal_eer(all_binary_y, y_score)

        result['eer'] = eer
        result['all_detection_binary_preds'] = all_detection_binary_preds
        result['detection_acc'] = detection_acc
        result['all_binary_y'] = all_binary_y
        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['y_score'] = y_score
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        if n_class > 2:
            result['class_loss'] = class_loss
            result['class_acc'] = class_acc

        # NOTE(review): the 'vaild_all_pred' key is misspelled; it is kept
        # as-is because readers of the pickled data may rely on it.
        freeze_data['valid_all_y'] = all_y
        freeze_data['vaild_all_pred'] = all_detection_binary_preds
        freeze_data['valid_score'] = y_score

        return result

    def test(dataset):
        """Load the saved checkpoints and evaluate them on ``dataset``.

        Args:
            dataset: dataset whose batches unpack to
                ``(token, mask, type_ids, y)`` and whose ``dataset`` attribute
                is indexed as ``[:, -1]`` to obtain the label column.

        Returns:
            dict: ``detection_loss``, ``eer``, ``all_detection_binary_preds``,
            ``detection_acc``, ``all_binary_y``, ``all_y``,
            ``oos_ind_precision``, ``oos_ind_recall``, ``oos_ind_f_score``,
            ``score``, ``y_score`` and ``auc``; additionally ``class_loss`` and
            ``class_acc`` when ``n_class > 2`` and ``all_features`` (NumPy
            array) when ``args.do_vis`` is set.

        Side effects:
            Overwrites the weights of D and G (and of E when fine-tuning) with
            the saved checkpoints, puts the models into eval mode and stores
            labels, binary predictions and scores in ``freeze_data``.
        """
        # load BERT and GAN
        load_gan_model(D, G, config['gan_save_path'])
        if args.fine_tune:
            load_model(E, path=config['bert_save_path'], model_name='bert')

        test_dataloader = DataLoader(dataset, batch_size=args.predict_batch_size, shuffle=False, num_workers=2)
        result = dict()

        # Loss functions
        detection_loss_fn = torch.nn.BCELoss().to(device)
        classified_loss = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)

        G.eval()
        D.eval()
        E.eval()

        all_detection_preds = []
        all_class_preds = []
        all_features = []

        for sample in tqdm.tqdm(test_dataloader):
            # Labels are read from ``dataset.dataset`` below, so the batch
            # label is not needed here.
            token, mask, type_ids, _ = (t.to(device) for t in sample)

            # -------------------------evaluate D------------------------- #
            with torch.no_grad():
                # BERT encodes the sentence to a feature vector.
                _, pooled_output = E(token, mask, type_ids)
                real_feature = pooled_output

                if n_class > 2:
                    # More than two classes: the classifier head is evaluated
                    # in addition to the discriminator.
                    f_vector, discriminator_output, classification_output = D(real_feature, return_feature=True)
                    all_detection_preds.append(discriminator_output)
                    all_class_preds.append(classification_output)
                else:
                    # Detection only.
                    f_vector, discriminator_output = D.detect_only(real_feature, return_feature=True)
                    all_detection_preds.append(discriminator_output)
                if args.do_vis:
                    all_features.append(f_vector)

        all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
        all_binary_y = (all_y != 0).long()  # label 0 is oos
        all_detection_preds = torch.cat(all_detection_preds, 0).cpu()
        # Flatten once and reuse: BCELoss needs prediction and target to have
        # the same shape, and reshape(-1) (unlike squeeze()) keeps a 1-D
        # tensor even for a single-sample dataset.
        detection_scores = all_detection_preds.reshape(-1)
        all_detection_binary_preds = convert_to_int_by_threshold(detection_scores)

        # Compute the detection loss over the whole dataset.
        detection_loss = detection_loss_fn(detection_scores, all_binary_y.float())
        result['detection_loss'] = detection_loss

        if n_class > 2:
            class_one_hot_preds = torch.cat(all_class_preds, 0).detach().cpu()  # per-class scores
            class_loss = classified_loss(class_one_hot_preds, all_y)  # compute loss
            all_class_preds = torch.argmax(class_one_hot_preds, 1)  # predicted label
            class_acc = metrics.ind_class_accuracy(all_class_preds, all_y, oos_index=0)  # accuracy for ind class
            logger.info(metrics.classification_report(all_y, all_class_preds, target_names=processor.id_to_label))

        # report
        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(all_detection_binary_preds, all_binary_y)
        detection_acc = metrics.accuracy(all_detection_binary_preds, all_binary_y)

        y_score = detection_scores.tolist()
        eer = metrics.cal_eer(all_binary_y, y_score)

        result['eer'] = eer
        result['all_detection_binary_preds'] = all_detection_binary_preds
        result['detection_acc'] = detection_acc
        result['all_binary_y'] = all_binary_y
        result['all_y'] = all_y
        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['score'] = y_score
        result['y_score'] = y_score
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        if n_class > 2:
            result['class_loss'] = class_loss
            result['class_acc'] = class_acc
        if args.do_vis:
            all_features = torch.cat(all_features, 0).cpu().numpy()
            result['all_features'] = all_features

        freeze_data['test_all_y'] = all_y.tolist()
        freeze_data['test_all_pred'] = all_detection_binary_preds.tolist()
        freeze_data['test_score'] = y_score

        return result

    def get_fake_feature(num_output):
        """Generate ``num_output`` fake feature vectors.

        Noise is pushed through G in chunks of ``args.predict_batch_size`` and
        the feature vector that ``D.detect_only`` returns for each generated
        sample is collected.

        Args:
            num_output: number of fake feature vectors to produce.

        Returns:
            numpy.ndarray: the collected feature vectors, concatenated along
            the first axis.
        """
        G.eval()
        batch = args.predict_batch_size
        # The LSTM / CNN generators take a noise sequence instead of a vector.
        sequence_noise = args.model == 'lstm_gan' or args.model == 'cnn_gan'
        fake_features = []
        with torch.no_grad():
            for start in range(0, num_output, batch):
                n_rows = min(num_output, start + batch) - start  # last chunk may be short
                if sequence_noise:
                    shape = (n_rows, 32, args.G_z_dim)
                else:
                    shape = (n_rows, args.G_z_dim)
                # Moved to ``device`` explicitly, as the training loop does.
                z = FloatTensor(np.random.normal(0, 1, size=shape)).to(device)
                f_vector, _ = D.detect_only(G(z), return_feature=True)
                fake_features.append(f_vector)
            return torch.cat(fake_features, 0).cpu().numpy()

    # ---- training phase ----
    if args.do_train:
        # Pick the splits to read. 'binary*' data files take precedence over
        # the dataset name; within them ``args.mode`` decides whether the
        # length-filtered reader (which also returns a length value) is used.
        # NOTE(review): the 'smp' branch unpacks a 2-tuple from read_dataset
        # while the eval/test phases do not -- confirm what each processor
        # actually returns.
        binary_file = config['data_file'].startswith('binary')
        if binary_file and args.mode != -1:
            text_train_set, text_train_len = processor.read_dataset(
                data_path, ['train'], args.mode, args.maxlen, args.minlen)
            text_dev_set, text_dev_len = processor.read_dataset(
                data_path, ['val'], args.mode, args.maxlen, args.minlen)
            print('--------------')
            print('text_train_set', text_train_set)
            print('text_train_len', text_train_len)
            print('text_dev_set', text_dev_set)
            print('text_dev_len', text_dev_len)
        elif binary_file:
            print('==============')
            text_train_set = processor.read_dataset(data_path, ['train'])
            text_dev_set = processor.read_dataset(data_path, ['val'])
        elif config['dataset'] == 'oos-eval':
            text_train_set = processor.read_dataset(data_path, ['train', 'oos_train'])
            text_dev_set = processor.read_dataset(data_path, ['val', 'oos_val'])
        elif config['dataset'] == 'smp':
            text_train_set, text_train_len = processor.read_dataset(data_path, ['train'])
            text_dev_set, text_dev_len = processor.read_dataset(data_path, ['val'])

        # With --ood the 'chat' domain samples are removed from the training set.
        if args.ood:
            text_train_set = [item for item in text_train_set if item['domain'] != 'chat']

        train_features = processor.convert_to_ids(text_train_set)
        train_dataset = OOSDataset(train_features)
        dev_features = processor.convert_to_ids(text_dev_set)
        dev_dataset = OOSDataset(dev_features)

        train_result = train(train_dataset, dev_dataset)
        # save_feature(train_result['all_features'], os.path.join(args.output_dir, 'train_feature'))


    # ---- evaluation phase: score the validation split and record metrics ----
    if args.do_eval:
        logger.info('#################### eval result at step {} ####################'.format(global_step))
        # 'binary*' data files take precedence over the dataset name.
        # NOTE(review): the training phase unpacks a 2-tuple from
        # read_dataset for 'smp' while this phase does not -- confirm what
        # each processor actually returns.
        binary_file = config['data_file'].startswith('binary')
        if binary_file and args.mode != -1:
            text_dev_set, text_dev_len = processor.read_dataset(data_path, ['val'], args.mode, args.maxlen, args.minlen)
            print('--------------')
            print('text_dev_set', text_dev_set)
            print('text_dev_len', text_dev_len)
        elif binary_file:
            print('==============')
            text_dev_set = processor.read_dataset(data_path, ['val'])
        elif config['dataset'] == 'oos-eval':
            text_dev_set = processor.read_dataset(data_path, ['val', 'oos_val'])
        elif config['dataset'] == 'smp':
            text_dev_set = processor.read_dataset(data_path, ['val'])

        dev_features = processor.convert_to_ids(text_dev_set)
        dev_dataset = OOSDataset(dev_features)
        eval_result = eval(dev_dataset)

        logger.info('eval_eer: {}'.format(eval_result['eer']))
        logger.info('eval_oos_ind_precision: {}'.format(eval_result['oos_ind_precision']))
        logger.info('eval_oos_ind_recall: {}'.format(eval_result['oos_ind_recall']))
        logger.info('eval_oos_ind_f_score: {}'.format(eval_result['oos_ind_f_score']))
        logger.info('eval_auc: {}'.format(eval_result['auc']))
        # Computed once and reused for both the log line and the summary.
        eval_fpr95 = ErrorRateAt95Recall(eval_result['all_binary_y'], eval_result['y_score'])
        logger.info('eval_fpr95: {}'.format(eval_fpr95))

        gross_result['eval_oos_ind_precision'] = eval_result['oos_ind_precision']
        gross_result['eval_oos_ind_recall'] = eval_result['oos_ind_recall']
        gross_result['eval_oos_ind_f_score'] = eval_result['oos_ind_f_score']
        gross_result['eval_eer'] = eval_result['eer']
        gross_result['eval_fpr95'] = eval_fpr95
        gross_result['eval_auc'] = eval_result['auc']

    # ---- test phase: score the test split, save reports and plots ----
    if args.do_test:
        logger.info('#################### test result at step {} ####################'.format(global_step))
        # 'binary*' data files take precedence over the dataset name.
        binary_file = config['data_file'].startswith('binary')
        if binary_file and args.mode != -1:
            # The test split is read with fixed arguments (0, -1, -1) rather
            # than args.mode / args.maxlen / args.minlen.
            text_test_set, text_test_len = processor.read_dataset(data_path, ['test'], 0, -1, -1)
            print('--------------')
            print('text_test_len', text_test_len)
        elif binary_file:
            print('==============')
            text_test_set = processor.read_dataset(data_path, ['test'])
        elif config['dataset'] == 'oos-eval':
            text_test_set = processor.read_dataset(data_path, ['test', 'oos_test'])
        elif config['dataset'] == 'smp':
            text_test_set = processor.read_dataset(data_path, ['test'])

        test_features = processor.convert_to_ids(text_test_set)
        test_dataset = OOSDataset(test_features)
        test_result = test(test_dataset)

        logger.info('test_eer: {}'.format(test_result['eer']))
        logger.info('test_ood_ind_precision: {}'.format(test_result['oos_ind_precision']))
        logger.info('test_ood_ind_recall: {}'.format(test_result['oos_ind_recall']))
        logger.info('test_ood_ind_f_score: {}'.format(test_result['oos_ind_f_score']))
        logger.info('test_auc: {}'.format(test_result['auc']))
        # Computed once and reused for both the log line and the summary.
        test_fpr95 = ErrorRateAt95Recall(test_result['all_binary_y'], test_result['y_score'])
        logger.info('test_fpr95: {}'.format(test_fpr95))
        my_plot_roc(test_result['all_binary_y'], test_result['y_score'],
                    os.path.join(args.output_dir, 'roc_curve.png'))
        save_result(test_result, os.path.join(args.output_dir, 'test_result'))
        # save_feature(test_result['all_features'], os.path.join(args.output_dir, 'test_feature'))
        gross_result['test_oos_ind_precision'] = test_result['oos_ind_precision']
        gross_result['test_oos_ind_recall'] = test_result['oos_ind_recall']
        gross_result['test_oos_ind_f_score'] = test_result['oos_ind_f_score']
        gross_result['test_eer'] = test_result['eer']
        gross_result['test_fpr95'] = test_fpr95
        gross_result['test_auc'] = test_result['auc']

        # Write out the misclassified cases.
        if config['dataset'] == 'oos-eval':
            texts = [line[0] for line in text_test_set]
        elif config['dataset'] == 'smp':
            texts = [line['text'] for line in text_test_set]
        else:
            raise ValueError('The dataset {} is not supported.'.format(args.dataset))

        output_cases(texts, test_result['all_binary_y'], test_result['all_detection_binary_preds'],
                     os.path.join(args.output_dir, 'test_cases.csv'), processor)

        # confusion matrix
        plot_confusion_matrix(test_result['all_binary_y'], test_result['all_detection_binary_preds'],
                              args.output_dir)

        if args.do_vis:
            # Half as many generated features as there are test samples.
            n_fake = len(test_dataset) // 2
            # Real test features followed by the generated ones.
            features = np.concatenate([test_result['all_features'], get_fake_feature(n_fake)], axis=0)
            # Project every feature vector to 2-D with t-SNE.
            features = TSNE(n_components=2, verbose=1, n_jobs=-1).fit_transform(features)
            # One label per row; generated samples are labelled -1.
            if n_class > 2:
                real_labels = test_result['all_y']
            else:
                real_labels = test_result['all_binary_y']
            labels = np.concatenate([real_labels, np.array([-1] * n_fake)], 0).reshape((-1, 1))
            # Columns: the two t-SNE coordinates followed by the label.
            data = np.concatenate([features, labels], 1)
            fig = scatter_plot(data, processor)
            fig.savefig(os.path.join(args.output_dir, 'plot.png'))
            fig.show()
            freeze_data['feature_label'] = data
            # plot_train_test(train_result['all_features'], test_result['all_features'], args.output_dir)

    # ---- persist everything collected during this run ----
    with open(os.path.join(config['output_dir'], 'freeze_data.pkl'), 'wb') as f:
        pickle.dump(freeze_data, f)

    # The validation entries are only filled in by eval() and the test
    # entries only by test(); skip a CSV whose phase did not run instead of
    # failing on a missing key after all the work has been done.
    if 'valid_all_y' in freeze_data and 'valid_score' in freeze_data:
        df = pd.DataFrame(data={'valid_y': freeze_data['valid_all_y'],
                                'valid_score': freeze_data['valid_score'],
                                })
        df.to_csv(os.path.join(config['output_dir'], 'valid_score.csv'))

    if 'test_all_y' in freeze_data and 'test_score' in freeze_data:
        df = pd.DataFrame(data={'test_y': freeze_data['test_all_y'],
                                'test_score': freeze_data['test_score']
                                })
        df.to_csv(os.path.join(config['output_dir'], 'test_score.csv'))

    if args.result != 'no':
        pd_result = pd.DataFrame(gross_result)
        result_csv = args.result + '_gross_result.csv'
        # NOTE(review): seeds 16 and 8192 look like the first and last seed of
        # an external sweep -- confirm against the launcher script.
        if args.seed == 16:
            # Seed 16 (re)creates the file and writes the header row.
            pd_result.to_csv(result_csv, index=False)
        else:
            # Every other seed appends its row without a header.
            pd_result.to_csv(result_csv, index=False, mode='a', header=False)
        if args.seed == 8192:
            # Seed 8192 additionally runs std_mean over the accumulated file.
            print(args.result)
            std_mean(result_csv)
# Example #2
# 0
def main(args):
    """Train, evaluate and/or test a BERT-based TextCNN out-of-scope classifier.

    Depending on ``args.do_train`` / ``args.do_eval`` / ``args.do_test`` the
    function trains the model, scores the validation split and scores the test
    split. Metrics are logged, stored in the module-level ``gross_result`` and
    ``freeze_data`` containers, and written below ``args.output_dir``.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``seed``,
            ``bert_type``, ``dataset``, ``data_file``, ``output_dir``,
            ``fine_tune``, ``train_batch_size``,
            ``gradient_accumulation_steps``, ``patience``, ``lr``, ``n_epoch``,
            ``predict_batch_size``, ``do_train``, ``do_eval``, ``do_test`` and
            ``result``. ``vars(args)`` is mutated: ``model_save_path`` and
            ``n_class`` are added to it.

    Raises:
        ValueError: If ``args.dataset`` is neither ``'oos-eval'`` nor ``'smp'``.
    """
    logger.info('Checking...')
    SEED = args.seed
    check_manual_seed(SEED)
    check_args(args)
    logger.info('seed: {}'.format(args.seed))
    gross_result['seed'] = args.seed

    logger.info('Loading config...')
    bert_config = BertConfig('config/bert.ini')
    bert_config = bert_config(args.bert_type)

    # for oos-eval dataset
    data_config = Config('config/data.ini')
    data_config = data_config(args.dataset)

    # Prepare data processor: join the data directory and the file name into
    # one path; the label file sits next to it with a '.label' suffix.
    data_path = os.path.join(data_config['DataDir'],
                             data_config[args.data_file])
    label_path = data_path.replace('.json', '.label')

    if args.dataset == 'oos-eval':
        processor = OOSProcessor(bert_config, maxlen=32)
    elif args.dataset == 'smp':
        processor = SMPProcessor(bert_config, maxlen=32)
    else:
        raise ValueError('The dataset {} is not supported.'.format(
            args.dataset))

    # Adds label_to_id and id_to_label to the processor.
    processor.load_label(label_path)

    n_class = len(processor.id_to_label)
    config = vars(args)  # the argument namespace as a dict
    config['model_save_path'] = os.path.join(args.output_dir, 'save',
                                             'bert.pt')
    config['n_class'] = n_class

    logger.info('config:')
    logger.info(config)

    model = TextCNN(bert_config, n_class)  # Bert encoder
    if args.fine_tune:
        model.unfreeze_bert_encoder()
    else:
        model.freeze_bert_encoder()
    model.to(device)

    global_step = 0

    def load_split(split):
        """Read the ``split`` ('train', 'val' or 'test') examples from ``data_path``.

        For the non-binary oos-eval data the matching ``oos_<split>`` section
        is read as well; binary data files and the smp dataset only have the
        plain section.
        """
        sections = [split]
        is_binary = config['data_file'].startswith('binary')
        if not is_binary and config['dataset'] == 'oos-eval':
            sections.append('oos_' + split)
        return processor.read_dataset(data_path, sections)

    def train(train_dataset, dev_dataset):
        """Train ``model`` on ``train_dataset`` with optional early stopping.

        When ``dev_dataset`` is truthy it is evaluated after every epoch; the
        early-stopping helper decides whether to checkpoint or stop. Loss
        curves are drawn into ``args.output_dir`` and stored in ``freeze_data``.
        """
        nonlocal global_step
        accumulation_steps = args.gradient_accumulation_steps
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=args.train_batch_size //
                                      accumulation_steps,
                                      shuffle=True,
                                      num_workers=2)

        n_sample = len(train_dataloader)
        early_stopping = EarlyStopping(args.patience, logger=logger)
        # Loss function
        classified_loss = torch.nn.CrossEntropyLoss().to(device)

        # Optimizers
        optimizer = AdamW(model.parameters(), args.lr)

        train_loss = []
        # Always bound so the bookkeeping at the end works without a dev set.
        valid_loss = []
        valid_ind_class_acc = []
        iteration = 0
        for epoch in range(args.n_epoch):

            model.train()

            total_loss = 0
            for step, sample in enumerate(tqdm.tqdm(train_dataloader)):
                token, mask, type_ids, y = (t.to(device) for t in sample)

                logits = model(token, mask, type_ids)
                loss = classified_loss(logits, y.long())
                total_loss += loss.item()
                loss = loss / accumulation_steps
                loss.backward()
                # Update parameters every `accumulation_steps` batches. The
                # decision is keyed on the batch index: keying it on
                # `global_step` (which only advances on an update) would never
                # fire for accumulation_steps > 1. The last batch of the epoch
                # also flushes whatever gradients are still pending.
                is_last_batch = (step + 1) == n_sample
                if (step + 1) % accumulation_steps == 0 or is_last_batch:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            logger.info('[Epoch {}] Train: train_loss: {}'.format(
                epoch, total_loss / n_sample))
            logger.info('-' * 30)

            train_loss.append(total_loss / n_sample)
            iteration += 1

            if dev_dataset:
                logger.info(
                    '#################### eval result at step {} ####################'
                    .format(global_step))
                eval_result = eval(dev_dataset)

                valid_loss.append(eval_result['loss'])
                valid_ind_class_acc.append(eval_result['ind_class_acc'])

                # signal ==  1: save the model
                # signal ==  0: do not save the model
                # signal == -1: do not save; patience exceeded, stop early
                signal = early_stopping(eval_result['accuracy'])
                if signal == -1:
                    break
                elif signal == 1:
                    save_model(model,
                               path=config['model_save_path'],
                               model_name='bert')

        from utils.visualization import draw_curve
        draw_curve(train_loss, iteration, 'train_loss', args.output_dir)
        if dev_dataset:
            draw_curve(valid_loss, iteration, 'valid_loss', args.output_dir)
            draw_curve(valid_ind_class_acc, iteration,
                       'valid_ind_class_accuracy', args.output_dir)

        # With patience >= n_epoch the final weights are saved unconditionally.
        if args.patience >= args.n_epoch:
            save_model(model,
                       path=config['model_save_path'],
                       model_name='bert')

        freeze_data['train_loss'] = train_loss
        freeze_data['valid_loss'] = valid_loss

    def _predict(dataset):
        """Run ``model`` over ``dataset`` without gradient tracking.

        Returns:
            Tuple ``(all_y, all_binary_y, all_pred, y_score, mean_loss)``:
            gold label ids, gold binary labels (0 where the label id is 0),
            arg-max predictions, the softmax probability of class index 1 per
            example (a list), and the cross-entropy loss averaged over batches
            (a float).
        """
        dataloader = DataLoader(dataset,
                                batch_size=args.predict_batch_size,
                                shuffle=False,
                                num_workers=2)
        n_sample = len(dataloader)
        model.eval()

        # Loss function
        classified_loss = torch.nn.CrossEntropyLoss().to(device)
        all_pred = []
        all_logit = []
        total_loss = 0
        for sample in tqdm.tqdm(dataloader):
            token, mask, type_ids, y = (t.to(device) for t in sample)

            with torch.no_grad():
                logit = model(token, mask, type_ids)
                all_logit.append(logit)
                all_pred.append(torch.argmax(logit, 1))
                # .item() keeps the running loss a float, as in train().
                total_loss += classified_loss(logit, y.long()).item()

        # The last column of the dataset array holds the label id.
        all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()
        all_binary_y = (all_y != 0).long()  # label 0 is oos
        all_pred = torch.cat(all_pred, 0).cpu()
        all_logit = torch.cat(all_logit, 0).cpu()
        y_score = all_logit.softmax(1)[:, 1].tolist()
        return all_y, all_binary_y, all_pred, y_score, total_loss / n_sample

    def eval(dataset):
        """Score ``dataset`` with the current model weights.

        Returns a dict with the classification report plus ``eer``,
        ``ind_class_acc``, ``loss``, the ``oos_ind_*`` scores, ``auc``,
        ``y_score`` and ``all_binary_y``. Validation labels, predictions and
        scores are also recorded in ``freeze_data``.
        """
        all_y, all_binary_y, all_pred, y_score, mean_loss = _predict(dataset)

        result = dict()
        ind_class_acc = metrics.ind_class_accuracy(all_pred, all_y)
        report = metrics.classification_report(all_y,
                                               all_pred,
                                               output_dict=True)
        result.update(report)
        eer = metrics.cal_eer(all_binary_y, y_score)

        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
            all_pred, all_binary_y)

        result['eer'] = eer
        result['ind_class_acc'] = ind_class_acc
        result['loss'] = mean_loss

        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        result['y_score'] = y_score
        result['all_binary_y'] = all_binary_y

        freeze_data['valid_all_y'] = all_y
        # NOTE(review): the key is spelled 'vaild' (sic); kept unchanged
        # because readers of freeze_data.pkl may depend on it.
        freeze_data['vaild_all_pred'] = all_pred
        freeze_data['valid_score'] = y_score

        return result

    def test(dataset):
        """Load the saved checkpoint and score ``dataset`` with it.

        Returns a dict with the classification report plus ``eer``,
        ``ind_class_acc``, ``loss``, ``all_y``, ``all_pred``,
        ``all_binary_y``, the ``oos_ind_*`` scores, ``auc`` and ``y_score``.
        Test labels, predictions and scores are also recorded in
        ``freeze_data``.
        """
        load_model(model, path=config['model_save_path'], model_name='bert')
        all_y, all_binary_y, all_pred, y_score, mean_loss = _predict(dataset)

        result = dict()
        # classification report
        ind_class_acc = metrics.ind_class_accuracy(all_pred, all_y)
        report = metrics.classification_report(all_y,
                                               all_pred,
                                               output_dict=True)
        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(
            all_pred, all_binary_y)
        result.update(report)
        # EER is only meaningful for the binary (oos vs. in-domain) decision.
        eer = metrics.cal_eer(all_binary_y, y_score)

        result['eer'] = eer
        result['ind_class_acc'] = ind_class_acc
        result['loss'] = mean_loss
        result['all_y'] = all_y.tolist()
        result['all_pred'] = all_pred.tolist()
        result['all_binary_y'] = all_binary_y

        freeze_data['test_all_y'] = all_y.tolist()
        freeze_data['test_all_pred'] = all_pred.tolist()
        freeze_data['test_score'] = y_score

        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        result['y_score'] = y_score
        return result

    if args.do_train:
        text_train_set = load_split('train')
        text_dev_set = load_split('val')

        train_features = processor.convert_to_ids(text_train_set)
        train_dataset = OOSDataset(train_features)
        dev_features = processor.convert_to_ids(text_dev_set)
        dev_dataset = OOSDataset(dev_features)

        train(train_dataset, dev_dataset)

    if args.do_eval:
        logger.info(
            '#################### eval result at step {} ####################'.
            format(global_step))
        text_dev_set = load_split('val')

        dev_features = processor.convert_to_ids(text_dev_set)
        dev_dataset = OOSDataset(dev_features)
        eval_result = eval(dev_dataset)
        eval_fpr95 = ErrorRateAt95Recall(eval_result['all_binary_y'],
                                         eval_result['y_score'])
        logger.info('eval_eer: {}'.format(eval_result['eer']))
        logger.info('eval_oos_ind_precision: {}'.format(
            eval_result['oos_ind_precision']))
        logger.info('eval_oos_ind_recall: {}'.format(
            eval_result['oos_ind_recall']))
        logger.info('eval_oos_ind_f_score: {}'.format(
            eval_result['oos_ind_f_score']))
        logger.info('eval_auc: {}'.format(eval_result['auc']))
        logger.info('eval_fpr95: {}'.format(eval_fpr95))
        gross_result['eval_eer'] = eval_result['eer']
        gross_result['eval_auc'] = eval_result['auc']
        gross_result['eval_fpr95'] = eval_fpr95
        gross_result['eval_oos_ind_precision'] = eval_result[
            'oos_ind_precision']
        gross_result['eval_oos_ind_recall'] = eval_result['oos_ind_recall']
        gross_result['eval_oos_ind_f_score'] = eval_result['oos_ind_f_score']

    if args.do_test:
        logger.info(
            '#################### test result at step {} ####################'.
            format(global_step))
        text_test_set = load_split('test')

        test_features = processor.convert_to_ids(text_test_set)
        test_dataset = OOSDataset(test_features)
        test_result = test(test_dataset)
        save_result(test_result, os.path.join(args.output_dir, 'test_result'))
        test_fpr95 = ErrorRateAt95Recall(test_result['all_binary_y'],
                                         test_result['y_score'])
        logger.info('test_eer: {}'.format(test_result['eer']))
        logger.info('test_ood_ind_precision: {}'.format(
            test_result['oos_ind_precision']))
        logger.info('test_ood_ind_recall: {}'.format(
            test_result['oos_ind_recall']))
        logger.info('test_ood_ind_f_score: {}'.format(
            test_result['oos_ind_f_score']))
        logger.info('test_auc: {}'.format(test_result['auc']))
        logger.info('test_fpr95: {}'.format(test_fpr95))

        my_plot_roc(test_result['all_binary_y'], test_result['y_score'],
                    os.path.join(args.output_dir, 'roc_curve.png'))
        # NOTE(review): same call as the save above; looks redundant —
        # confirm before removing.
        save_result(test_result, os.path.join(args.output_dir, 'test_result'))

        gross_result['test_eer'] = test_result['eer']
        gross_result['test_auc'] = test_result['auc']
        gross_result['test_fpr95'] = test_fpr95
        gross_result['test_oos_ind_precision'] = test_result[
            'oos_ind_precision']
        gross_result['test_oos_ind_recall'] = test_result['oos_ind_recall']
        gross_result['test_oos_ind_f_score'] = test_result['oos_ind_f_score']

        # Write out the misclassified cases.
        if config['dataset'] == 'oos-eval':
            texts = [line[0] for line in text_test_set]
        elif config['dataset'] == 'smp':
            texts = [line['text'] for line in text_test_set]
        else:
            raise ValueError('The dataset {} is not supported.'.format(
                args.dataset))

        output_cases(texts, test_result['all_y'], test_result['all_pred'],
                     os.path.join(args.output_dir, 'test_cases.csv'),
                     processor)

        # confusion matrix
        plot_confusion_matrix(test_result['all_y'], test_result['all_pred'],
                              args.output_dir)

    with open(os.path.join(config['output_dir'], 'freeze_data.pkl'),
              'wb') as f:
        pickle.dump(freeze_data, f)

    # NOTE(review): the two score tables below read keys that are only set by
    # eval() / test(); presumably every run is expected to enable those
    # stages — confirm.
    df = pd.DataFrame(
        data={
            'valid_y': freeze_data['valid_all_y'],
            'valid_score': freeze_data['valid_score'],
        })
    df.to_csv(os.path.join(config['output_dir'], 'valid_score.csv'))

    df = pd.DataFrame(
        data={
            'test_y': freeze_data['test_all_y'],
            'test_score': freeze_data['test_score']
        })
    df.to_csv(os.path.join(config['output_dir'], 'test_score.csv'))

    if args.result != 'no':
        pd_result = pd.DataFrame(gross_result)
        result_csv = args.result + '_gross_result.csv'
        # Seed 16 starts a fresh file with a header; every other seed appends
        # a row; seed 8192 additionally triggers the mean/std summary.
        if args.seed == 16:
            pd_result.to_csv(result_csv, index=False)
        else:
            pd_result.to_csv(result_csv,
                             index=False,
                             mode='a',
                             header=False)
        if args.seed == 8192:
            print(args.result)
            std_mean(result_csv)
def main(args):
    logger.info('Checking...')
    print('torch.cuda.is_available:', torch.cuda.is_available())
    # print('torch.cuda.current_device:', torch.cuda.current_device())
    logger.info('device: {}'.format(device))
    logger.info('ood: {}'.format(args.ood))
    SEED = args.seed
    gross_result['seed'] = args.seed
    logger.info('seed: {}'.format(SEED))
    logger.info('model: {}'.format(args.model))
    check_manual_seed(SEED)
    check_args(args)
    if 0 <= args.beta <= 1:
        logger.info('beta: {}'.format(args.beta))
    logger.info('mode: {}'.format(args.mode))
    logger.info('num_outcomes: {}'.format(args.num_outcomes))
    logger.info('D_updates: {}'.format(args.D_updates))
    logger.info('G_updates: {}'.format(args.G_updates))
    # logger.info('maxlen: {}'.format(args.maxlen))
    # logger.info('minlen: {}'.format(args.minlen))
    # logger.info('optim_mode: {}'.format(args.optim_mode))
    # logger.info('length_weight: {}'.format(args.length_weight))
    # logger.info('sample_weight: {}'.format(args.sample_weight))

    logger.info('Loading config...')
    bert_config = Config('config/bert.ini')
    bert_config = bert_config(args.bert_type)

    # for oos-eval dataset
    data_config = Config('config/data.ini')
    data_config = data_config(args.dataset)

    # Prepare data processor
    data_path = os.path.join(data_config['DataDir'], data_config[args.data_file])  # 把目录和文件名合成一个路径
    label_path = data_path.replace('.json', '.label')

    if args.dataset == 'oos-eval':
        processor = OOSProcessor(bert_config, maxlen=32)
    elif args.dataset == 'smp':
        if args.mode == -1:
            processor = SMPProcessor(bert_config, maxlen=32)
            print('processor')
        else:
            processor = SMPProcessor_v3(bert_config, maxlen=32)
            print('processor_v3')
    else:
        raise ValueError('The dataset {} is not supported.'.format(args.dataset))

    processor.load_label(label_path)  # Adding label_to_id and id_to_label ot processor.

    n_class = len(processor.id_to_label)
    print('label: ', processor.id_to_label)
    config = vars(args)  # 返回参数字典
    config['gan_save_path'] = os.path.join(args.output_dir, 'save', 'gan.pt')
    config['bert_save_path'] = os.path.join(args.output_dir, 'save', 'bert.pt')
    config['n_class'] = n_class

    logger.info('config:')
    logger.info(config)

    model = import_module('model.' + args.model)

    D = model.Discriminator(config)
    G = model.Generator(config)
    E = BertModel.from_pretrained(bert_config['PreTrainModelDir'])  # Bert encoder

    if args.fine_tune:
        for param in E.parameters():
            param.requires_grad = True
    else:
        for param in E.parameters():
            param.requires_grad = False

    D.to(device)
    G.to(device)
    E.to(device)

    global_step = 0

    triplet_loss = CategoricalLoss(atoms=args.num_outcomes, v_max=args.positive_skew, v_min=args.negative_skew)
    triplet_loss.to(device)

    # define anchors
    # e.g. uniform and normal
    unif = np.random.uniform(-1, 1, 1000)
    count, bins = np.histogram(unif, args.num_outcomes)
    anchor0 = count / sum(count)  # for ood

    normal = np.random.normal(0, 0.1, 1000)
    count, bins = np.histogram(normal, args.num_outcomes)
    anchor1 = count / sum(count)  # for ind

    def train(train_dataset, dev_dataset):
        train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=2)

        global best_dev
        nonlocal global_step
        n_sample = len(train_dataloader)
        early_stopping = EarlyStopping(args.patience, logger=logger)
        # Loss function
        adversarial_loss = torch.nn.BCELoss().to(device)
        classified_loss = torch.nn.CrossEntropyLoss().to(device)

        num_outcomes = args.num_outcomes

        # Optimizers
        # optimizer_G = torch.optim.Adam(G.parameters(), lr=args.G_lr)  # optimizer for generator
        # optimizer_D = torch.optim.Adam(D.parameters(), lr=args.D_lr)  # optimizer for discriminator
        optimizer_E = AdamW(E.parameters(), args.bert_lr)

        optimizer_G = torch.optim.Adam(G.parameters(), lr=args.G_lr, betas=(args.beta1, args.beta2),weight_decay=args.weight_decay, eps=args.adam_eps)
        optimizer_D = torch.optim.Adam(D.parameters(), lr=args.D_lr, betas=(args.beta1, args.beta2),weight_decay=args.weight_decay)
        decayG = torch.optim.lr_scheduler.ExponentialLR(optimizer_G, gamma=1 - args.decay)
        decayD = torch.optim.lr_scheduler.ExponentialLR(optimizer_D, gamma=1 - args.decay)

        G_total_train_loss = []
        D_total_fake_loss = []
        D_total_real_loss = []
        FM_total_train_loss = []
        D_total_class_loss = []
        valid_detection_loss = []
        valid_oos_ind_precision = []
        valid_oos_ind_recall = []
        valid_oos_ind_f_score = []

        all_features = []
        result = dict()

        for i in range(args.n_epoch):

            # Initialize model state
            G.train()
            D.train()
            E.train()

            G_train_loss = 0
            # G_d_loss = 0
            D_fake_loss = 0
            D_real_loss = 0
            # FM_train_loss = 0
            D_class_loss = 0

            G_features = []

            for sample in tqdm.tqdm(train_dataloader):
                sample = (i.to(device) for i in sample)
                if args.dataset == 'smp':
                    token, mask, type_ids, knowledge_tag, y = sample
                    batch = len(token)

                    ood_sample = (y == 0.0).float()
                    # weight = torch.ones(len(ood_sample)).to(device) - ood_sample * args.beta
                    # real_loss_func = torch.nn.BCELoss(weight=weight).to(device)

                    # length weight
                    length_sample = FloatTensor([0] * batch)
                    if args.minlen != -1:
                        short_sample = (mask[:, args.minlen] == 0).float()
                        length_sample = length_sample.add(short_sample)
                    if args.maxlen != -1:
                        long_sample = mask[:, args.maxlen].float()
                        length_sample = length_sample.add(long_sample)

                    # get knowledge sample weight by knowledge_tag
                    exclude_sample = knowledge_tag

                    # initailize weight
                    weight = torch.ones(batch).to(device)

                    # optimize without weights
                    if args.optim_mode == 0 and 0 <= args.beta <= 1:
                        weight -= ood_sample * args.beta

                    # only optimize length by weight
                    if args.optim_mode == 1:
                        # set all exclude_sample's weight to 0
                        weight -= exclude_sample
                        length_sample -= exclude_sample
                        length_sample = (length_sample > 0).float()
                        weight -= length_sample * (1 - args.length_weight)

                        # set ood sample weight
                        if 0 <= args.beta <= 1:
                            ood_sample -= exclude_sample
                            ood_sample = (ood_sample > 0).float()
                            temp = torch.ones(batch).to(device)
                            temp -= ood_sample * args.beta
                            weight *= temp

                    # only optimize sample by weight
                    if args.optim_mode == 2:
                        # set all length_sample's weight to 0
                        weight -= length_sample

                        exclude_sample -= length_sample
                        exclude_sample = (exclude_sample > 0).float()
                        weight -= exclude_sample * (1 - args.sample_weight)

                        # set ood sample weight
                        if 0 <= args.beta <= 1:
                            ood_sample -= length_sample
                            ood_sample = (ood_sample > 0).float()
                            temp = torch.ones(batch)
                            temp -= ood_sample * args.beta
                            weight *= temp

                    # optimize length and sample by weight
                    # if args.optim_mode == 3:
                    #     alpha = 0.5
                    #     beta = 0.5
                    #     weight = torch.ones(len(length_sample)).to(device) \
                    #              - alpha * length_sample * (1 - args.length_weight) \
                    #              - beta * exclude_sample * (1 - args.sample_weight)

                if args.dataset == 'oos-eval':
                    token, mask, type_ids, y = sample
                    batch = len(token)

                    ood_sample = (y == 0.0).float()
                    # weight = torch.ones(len(ood_sample)).to(device) - ood_sample * args.beta
                    # real_loss_func = torch.nn.BCELoss(weight=weight).to(device)

                    # length weight
                    length_sample = FloatTensor([0] * batch)
                    if args.minlen != -1:
                        short_sample = (mask[:, args.minlen] == 0).float()
                        length_sample = length_sample.add(short_sample)
                    if args.maxlen != -1:
                        long_sample = mask[:, args.maxlen].float()
                        length_sample = length_sample.add(long_sample)

                    # initailize weight
                    weight = torch.ones(batch).to(device)

                    # optimize without weights
                    if args.optim_mode == 0 and 0 <= args.beta <= 1:
                        weight -= ood_sample * args.beta

                    # only optimize length by weight
                    if args.optim_mode == 1:
                        weight -= length_sample * (1 - args.length_weight)

                        # set ood sample weight
                        if 0 <= args.beta <= 1:
                            ood_sample -= length_sample
                            ood_sample = (ood_sample > 0).float()
                            temp = torch.ones(batch).to(device)
                            temp -= ood_sample * args.beta
                            weight *= temp

                real_loss_func = torch.nn.BCELoss(weight=weight).to(device)

                anchor_ood = torch.zeros((batch, num_outcomes), dtype=torch.float).to(device) + torch.tensor(anchor0, dtype=torch.float).to(device)
                anchor_ind = torch.zeros((batch, num_outcomes), dtype=torch.float).to(device) + torch.tensor(anchor1, dtype=torch.float).to(device)

                optimizer_E.zero_grad()
                sequence_output, pooled_output = E(token, mask, type_ids)
                real_feature = pooled_output

                for t in range(args.D_updates):
                    # train D on real
                    optimizer_D.zero_grad()
                    real_f_vector, discriminator_output, classification_output = D(real_feature, return_feature=True)
                    discriminator_output = discriminator_output.log_softmax(1).exp()
                    discriminator_output = discriminator_output.squeeze()

                    # loss for ood and ind
                    # set anchor according to y
                    y_ood = (y != 0.0).float()
                    anchors =  torch.zeros((batch, num_outcomes), dtype=torch.float).to(device)
                    for i, anchor in enumerate(anchors):
                        if y_ood[i] == 1:
                            anchors[i] += torch.tensor(anchor1, dtype=torch.float).to(device)
                        else:
                            anchors[i] += torch.tensor(anchor0, dtype=torch.float).to(device)

                    # weighted triplet loss
                    real_loss = triplet_loss(anchors, discriminator_output, skewness=args.positive_skew, direction=y_ood, weight=weight)#

                    if n_class > 2:  # 大于2表示除了训练判别器还要训练分类器
                        class_loss = classified_loss(classification_output, y.long())
                        real_loss += class_loss
                        D_class_loss += class_loss.detach()
                    real_loss.backward()

                    if args.do_vis:
                        all_features.append(real_f_vector.detach())

                    # # train D on fake
                    # uniform (-1,1)
                    # z = FloatTensor(np.random.uniform(-1, 1, (batch, args.G_z_dim))).to(device)
                    # normal (0,1)
                    z = FloatTensor(np.random.normal(0, 1, (batch, args.G_z_dim))).to(device)
                    fake_feature = G(z).detach()
                    fake_discriminator_output = D.detect_only(fake_feature).log_softmax(1).exp()

                    # beta of fake
                    if 0 <= args.beta <= 1:
                        fake_loss = args.beta * \
                                    triplet_loss(anchor_ood, fake_discriminator_output, skewness=args.positive_skew)
                    else:
                        fake_loss = triplet_loss(anchor_ood, fake_discriminator_output, skewness=args.positive_skew)
                    fake_loss.backward()
                    optimizer_D.step()
                decayD.step()

                if args.fine_tune:
                    optimizer_E.step()

                for t in range(args.G_updates):
                    # train G
                    optimizer_G.zero_grad()

                    sequence_output, pooled_output = E(token, mask, type_ids)
                    real_feature = pooled_output
                    discriminator_output = D.detect_only(real_feature).log_softmax(1).exp()
                    discriminator_output = discriminator_output.squeeze()

                    # uniform (-1,1)
                    # z = FloatTensor(np.random.uniform(-1, 1, (batch, args.G_z_dim))).to(device)
                    # normal (0,1)
                    z = FloatTensor(np.random.normal(0, 1, (batch, args.G_z_dim))).to(device)
                    fake_f_vector, D_decision = D.detect_only(G(z), return_feature=True)
                    D_decision = D_decision.log_softmax(1).exp()

                    if args.do_vis:
                        G_features.append(fake_f_vector.detach())

                    # todo discriminator_output 区分 ood 与 ind
                    if args.relativisticG:
                        gd_loss = -triplet_loss(anchor_ind, D_decision,skewness=args.negative_skew) + triplet_loss(discriminator_output, D_decision)
                    else:
                        gd_loss = -triplet_loss(anchor_ind, D_decision,skewness=args.negative_skew) + triplet_loss(anchor_ood, D_decision, skewness=args.positive_skew)

                    # feature matching loss
                    fm_loss = torch.abs(torch.mean(real_f_vector.detach(), 0) - torch.mean(fake_f_vector, 0)).mean()
                    # fm_loss = feature_matching_loss(torch.mean(fake_f_vector, 0), torch.mean(real_f_vector.detach(), 0))

                    g_loss = gd_loss + 0 * fm_loss
                    g_loss.backward()
                    optimizer_G.step()
                decayG.step()

                global_step += 1

                D_fake_loss += fake_loss.detach()
                D_real_loss += real_loss.detach()
                # G_d_loss += g_loss.detach()
                G_train_loss += g_loss.detach() + fm_loss.detach()
                # FM_train_loss += fm_loss.detach()

            # logger.info('[Epoch {}] Train: D_fake_loss: {}'.format(i, D_fake_loss / n_sample))
            # logger.info('[Epoch {}] Train: D_real_loss: {}'.format(i, D_real_loss / n_sample))
            # logger.info('[Epoch {}] Train: D_class_loss: {}'.format(i, D_class_loss / n_sample))
            # logger.info('[Epoch {}] Train: G_train_loss: {}'.format(i, G_train_loss / n_sample))
            # logger.info('[Epoch {}] Train: G_d_loss: {}'.format(i, G_d_loss / n_sample))
            # logger.info('[Epoch {}] Train: FM_train_loss: {}'.format(i, FM_train_loss / n_sample))
            # logger.info('---------------------------------------------------------------------------')

            D_total_fake_loss.append(D_fake_loss / n_sample)
            D_total_real_loss.append(D_real_loss / n_sample)
            # D_total_class_loss.append(D_class_loss / n_sample)
            G_total_train_loss.append(G_train_loss / n_sample)
            # FM_total_train_loss.append(FM_train_loss / n_sample)

            if dev_dataset:
                # logger.info('#################### eval result at step {} ####################'.format(global_step))
                eval_result = eval(dev_dataset)

                if args.do_vis and args.do_g_eval_vis:
                    G_features = torch.cat(G_features, 0).cpu().numpy()

                    features = np.concatenate([eval_result['all_features'], G_features], axis=0)
                    features = TSNE(n_components=2, verbose=1, n_jobs=-1).fit_transform(features)
                    labels = np.concatenate([eval_result['all_binary_y'], np.array([-1] * len(G_features))], 0).reshape(-1, 1)

                    data = np.concatenate([features, labels], 1)
                    fig = scatter_plot(data, processor)
                    fig.savefig(os.path.join(args.output_dir, 'plot_epoch_' + str(i) + '.png'))

                valid_detection_loss.append(eval_result['detection_loss'])
                valid_oos_ind_precision.append(eval_result['oos_ind_precision'])
                valid_oos_ind_recall.append(eval_result['oos_ind_recall'])
                valid_oos_ind_f_score.append(eval_result['oos_ind_f_score'])

                # 1 表示要保存模型
                # 0 表示不需要保存模型
                # -1 表示不需要模型,且超过了patience,需要early stop
                signal = early_stopping(-eval_result['eer'])
                if signal == -1:
                    break
                elif signal == 0:
                    pass
                elif signal == 1:
                    save_gan_model(D, G, config['gan_save_path'])
                    if args.fine_tune:
                        save_model(E, path=config['bert_save_path'], model_name='bert')

                # logger.info(eval_result)
                # logger.info('valid_eer: {}'.format(eval_result['eer']))
                # logger.info('valid_oos_ind_precision: {}'.format(eval_result['oos_ind_precision']))
                # logger.info('valid_oos_ind_recall: {}'.format(eval_result['oos_ind_recall']))
                # logger.info('valid_oos_ind_f_score: {}'.format(eval_result['oos_ind_f_score']))
                # logger.info('valid_auc: {}'.format(eval_result['auc']))
                # logger.info(
                #     'valid_fpr95: {}'.format(ErrorRateAt95Recall(eval_result['all_binary_y'], eval_result['y_score'])))

        if args.patience >= args.n_epoch:
            save_gan_model(D, G, config['gan_save_path'])
            if args.fine_tune:
                save_model(E, path=config['bert_save_path'], model_name='bert')

        freeze_data['D_total_fake_loss'] = D_total_fake_loss
        freeze_data['D_total_real_loss'] = D_total_real_loss
        freeze_data['D_total_class_loss'] = D_total_class_loss
        freeze_data['G_total_train_loss'] = G_total_train_loss
        freeze_data['FM_total_train_loss'] = FM_total_train_loss
        freeze_data['valid_real_loss'] = valid_detection_loss
        freeze_data['valid_oos_ind_precision'] = valid_oos_ind_precision
        freeze_data['valid_oos_ind_recall'] = valid_oos_ind_recall
        freeze_data['valid_oos_ind_f_score'] = valid_oos_ind_f_score

        best_dev = -early_stopping.best_score

        if args.do_vis:
            all_features = torch.cat(all_features, 0).cpu().numpy()
            result['all_features'] = all_features
        return result

    def eval(dataset):
        """Score ``dataset`` with the current ``E``/``D`` and compute detection metrics.

        Every batch is encoded by ``E``; the pooled output is fed to ``D`` (the
        joint call when ``n_class > 2``, otherwise ``D.detect_only``). The
        detection output is softmax-normalised and each row is compared with the
        OOD and IND anchors through ``triplet_loss``; the per-sample score is
        ``d_ood / (d_ind + d_ood)``.

        Args:
            dataset: Object accepted by ``DataLoader`` whose ``.dataset``
                attribute can be indexed as ``[:, -1]`` to obtain the label
                column. Label ``0`` is treated as out-of-scope.

        Returns:
            dict with the keys ``detection_loss``, ``eer``,
            ``all_detection_binary_preds``, ``detection_acc``, ``all_binary_y``,
            ``oos_ind_precision``, ``oos_ind_recall``, ``oos_ind_f_score``,
            ``y_score``, ``auc`` and ``fpr95``; plus ``class_loss`` and
            ``class_acc`` when ``n_class > 2`` and ``all_features`` when
            ``args.do_vis`` is set.

        Side effects:
            Puts ``G``, ``D`` and ``E`` in eval mode, logs a classification
            report when ``n_class > 2`` and stores labels, binary predictions
            and scores in ``freeze_data``.
        """
        dev_dataloader = DataLoader(dataset, batch_size=args.predict_batch_size, shuffle=False, num_workers=2)
        result = dict()

        # Loss functions
        detection_criterion = torch.nn.BCELoss().to(device)
        classified_loss = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)

        G.eval()
        D.eval()
        E.eval()

        all_detection_preds = []
        all_class_preds = []
        all_features = []

        # The anchors are identical for every batch, so build them once up front.
        anchor_ood = torch.zeros(args.num_outcomes, dtype=torch.float).to(device) + torch.tensor(anchor0, dtype=torch.float).to(device)
        anchor_ind = torch.zeros(args.num_outcomes, dtype=torch.float).to(device) + torch.tensor(anchor1, dtype=torch.float).to(device)

        for sample in tqdm.tqdm(dev_dataloader):
            sample = (i.to(device) for i in sample)
            if args.dataset == 'smp':
                token, mask, type_ids, knowledge_tag, y = sample
            if args.dataset == 'oos-eval':
                token, mask, type_ids, y = sample

            # -------------------------evaluate D------------------------- #
            with torch.no_grad():
                # BERT encodes the sentence into a feature vector
                sequence_output, pooled_output = E(token, mask, type_ids)
                real_feature = pooled_output

                if n_class > 2:
                    # More than two classes: D also returns the classifier output
                    f_vector, discriminator_output, classification_output = D(real_feature, return_feature=True)
                    all_class_preds.append(classification_output)
                else:
                    # Detection head only
                    f_vector, discriminator_output = D.detect_only(real_feature, return_feature=True)

                discriminator_output = discriminator_output.log_softmax(1).exp()

                if args.do_vis:
                    all_features.append(f_vector)

                # Per-sample score: relative distance to the OOD anchor.
                batch_scores = []
                for output in discriminator_output:
                    d_ood = triplet_loss(anchor_ood, output, skewness=args.positive_skew)
                    d_ind = triplet_loss(anchor_ind, output, skewness=args.negative_skew)
                    batch_scores.append(d_ood / (d_ind + d_ood))
                all_detection_preds.extend(batch_scores)

        all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()  # label column, one entry per sample
        all_binary_y = (all_y != 0).long()  # label 0 is oos -> 0, everything else -> 1

        all_detection_preds = FloatTensor(all_detection_preds).cpu()
        all_detection_binary_preds = convert_to_int_by_threshold(all_detection_preds.squeeze())

        # Compute the detection loss on the continuous scores
        result['detection_loss'] = detection_criterion(all_detection_preds, all_binary_y.float())

        if n_class > 2:
            class_one_hot_preds = torch.cat(all_class_preds, 0).detach().cpu()  # per-class outputs
            class_loss = classified_loss(class_one_hot_preds, all_y)  # compute loss
            all_class_preds = torch.argmax(class_one_hot_preds, 1)  # predicted label
            class_acc = metrics.ind_class_accuracy(all_class_preds, all_y, oos_index=0)  # accuracy for ind class
            logger.info(metrics.classification_report(all_y, all_class_preds, target_names=processor.id_to_label))

        # report
        oos_ind_precision, oos_ind_recall, oos_ind_fscore, _ = metrics.binary_recall_fscore(all_detection_binary_preds, all_binary_y)
        detection_acc = metrics.accuracy(all_detection_binary_preds, all_binary_y)

        y_score = all_detection_preds.squeeze().tolist()
        # EER is a threshold-sweep metric, so it must be fed the continuous scores
        # (as test() does), not the already-thresholded 0/1 predictions.
        eer = metrics.cal_eer(all_binary_y, y_score)

        result['eer'] = eer
        result['all_detection_binary_preds'] = all_detection_binary_preds
        result['detection_acc'] = detection_acc
        result['all_binary_y'] = all_binary_y
        result['oos_ind_precision'] = oos_ind_precision
        result['oos_ind_recall'] = oos_ind_recall
        result['oos_ind_f_score'] = oos_ind_fscore
        result['y_score'] = y_score
        result['auc'] = roc_auc_score(all_binary_y, y_score)
        result['fpr95'] = ErrorRateAt95Recall(all_binary_y, y_score)
        if n_class > 2:
            result['class_loss'] = class_loss
            result['class_acc'] = class_acc
        if args.do_vis:
            all_features = torch.cat(all_features, 0).cpu().numpy()
            result['all_features'] = all_features

        freeze_data['valid_all_y'] = all_y
        freeze_data['vaild_all_pred'] = all_detection_binary_preds
        freeze_data['valid_score'] = y_score

        return result

    def test(dataset):
        """Reload the saved checkpoints and score ``dataset`` for the final test report.

        The GAN is restored from ``config['gan_save_path']`` (and the encoder
        from ``config['bert_save_path']`` when ``args.fine_tune`` is set). Each
        batch is then encoded by ``E`` and passed to ``D``; the softmax-normalised
        detection output of every sample is compared with the OOD and IND anchors
        via ``triplet_loss`` and scored as ``d_ood / (d_ind + d_ood)``.

        Args:
            dataset: Object accepted by ``DataLoader`` whose ``.dataset``
                attribute can be indexed as ``[:, -1]`` to obtain the label
                column. Label ``0`` is treated as out-of-scope.

        Returns:
            dict with detection loss, EER, binary predictions, accuracy, labels,
            precision / recall / f-score, scores, AUC and FPR95; plus the
            classification loss / accuracy when ``n_class > 2`` and the feature
            matrix when ``args.do_vis`` is set.

        Side effects:
            Loads model weights, puts ``G``, ``D`` and ``E`` in eval mode, logs a
            classification report when ``n_class > 2`` and stores labels,
            binary predictions and scores in ``freeze_data``.
        """
        # Restore the checkpointed GAN and, if it was fine-tuned, the encoder.
        load_gan_model(D, G, config['gan_save_path'])
        if args.fine_tune:
            load_model(E, path=config['bert_save_path'], model_name='bert')

        loader = DataLoader(dataset, batch_size=args.predict_batch_size, shuffle=False, num_workers=2)

        # Loss functions
        bce = torch.nn.BCELoss().to(device)
        cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)

        for net in (G, D, E):
            net.eval()

        scores = []
        class_outputs = []
        feature_batches = []

        for tensors in tqdm.tqdm(loader):
            tensors = (t.to(device) for t in tensors)
            if args.dataset == 'smp':
                token, mask, type_ids, knowledge_tag, y = tensors
            elif args.dataset == 'oos-eval':
                token, mask, type_ids, y = tensors

            zeros = torch.zeros(args.num_outcomes, dtype=torch.float).to(device)
            anchor_ood = zeros + torch.tensor(anchor0, dtype=torch.float).to(device)
            anchor_ind = zeros + torch.tensor(anchor1, dtype=torch.float).to(device)

            # -------------------------evaluate D------------------------- #
            with torch.no_grad():
                # BERT encodes the sentence into a feature vector
                _, pooled = E(token, mask, type_ids)

                if n_class > 2:
                    # More than two classes: D also returns the classifier output
                    f_vector, detection_out, class_out = D(pooled, return_feature=True)
                    class_outputs.append(class_out)
                else:
                    # Detection head only
                    f_vector, detection_out = D.detect_only(pooled, return_feature=True)

                detection_probs = detection_out.log_softmax(1).exp()

                if args.do_vis:
                    feature_batches.append(f_vector)

                # Per-sample score: relative distance to the OOD anchor.
                for row in detection_probs:
                    d_ood = triplet_loss(anchor_ood, row, skewness=args.positive_skew)
                    d_ind = triplet_loss(anchor_ind, row, skewness=args.negative_skew)
                    scores.append(d_ood / (d_ind + d_ood))

        all_y = LongTensor(dataset.dataset[:, -1].astype(int)).cpu()  # label column
        all_binary_y = (all_y != 0).long()  # label 0 is oos

        score_tensor = FloatTensor(scores).cpu()
        binary_preds = convert_to_int_by_threshold(score_tensor.squeeze())

        # Detection loss on the continuous scores
        result = {'detection_loss': bce(score_tensor, all_binary_y.float())}

        if n_class > 2:
            class_scores = torch.cat(class_outputs, 0).detach().cpu()
            class_loss = cross_entropy(class_scores, all_y)
            class_labels = torch.argmax(class_scores, 1)
            class_acc = metrics.ind_class_accuracy(class_labels, all_y, oos_index=0)  # accuracy for ind class
            logger.info(metrics.classification_report(all_y, class_labels, target_names=processor.id_to_label))

        # report
        precision, recall, fscore, _ = metrics.binary_recall_fscore(binary_preds, all_binary_y)
        detection_acc = metrics.accuracy(binary_preds, all_binary_y)

        y_score = score_tensor.squeeze().tolist()
        eer = metrics.cal_eer(all_binary_y, y_score)

        result.update({
            'eer': eer,
            'all_detection_binary_preds': binary_preds,
            'detection_acc': detection_acc,
            'all_binary_y': all_binary_y,
            'all_y': all_y,
            'oos_ind_precision': precision,
            'oos_ind_recall': recall,
            'oos_ind_f_score': fscore,
            'score': y_score,
            'y_score': y_score,
            'auc': roc_auc_score(all_binary_y, y_score),
            'fpr95': ErrorRateAt95Recall(all_binary_y, y_score),
        })
        if n_class > 2:
            result['class_loss'] = class_loss
            result['class_acc'] = class_acc
        if args.do_vis:
            result['all_features'] = torch.cat(feature_batches, 0).cpu().numpy()

        freeze_data['test_all_y'] = all_y.tolist()
        freeze_data['test_all_pred'] = binary_preds.tolist()
        freeze_data['test_score'] = y_score

        return result

    def get_fake_feature(num_output):
        """Generate ``num_output`` fake feature vectors with the generator.

        Standard-normal noise is drawn in chunks of ``args.predict_batch_size``,
        passed through ``G`` and then through
        ``D.detect_only(..., return_feature=True)``; the returned feature
        vectors of all chunks are concatenated along axis 0.

        Args:
            num_output: Number of fake samples to generate. Must be positive:
                with no chunks there is nothing for ``torch.cat`` to join.

        Returns:
            numpy array holding the discriminator features of the generated
            samples, one row per sample.
        """
        G.eval()
        fake_features = []
        batch = args.predict_batch_size
        # lstm_gan / cnn_gan draw a 32-step noise sequence per sample; every
        # other model draws a single noise vector.
        sequence_noise = args.model == 'lstm_gan' or args.model == 'cnn_gan'
        with torch.no_grad():
            for start in range(0, num_output, batch):
                end = min(num_output, start + batch)
                if sequence_noise:
                    noise_shape = (end - start, 32, args.G_z_dim)
                else:
                    noise_shape = (end - start, args.G_z_dim)
                # Put the noise on the model device, as the training loop does,
                # so G never receives a tensor from a different device.
                z = FloatTensor(np.random.normal(0, 1, size=noise_shape)).to(device)
                fake_feature = G(z)
                f_vector, _ = D.detect_only(fake_feature, return_feature=True)
                fake_features.append(f_vector)
            return torch.cat(fake_features, 0).cpu().numpy()

    # ------------------------------ training ------------------------------ #
    # Read the train / dev splits, optionally drop OOD samples from the training
    # data, convert both splits to id features and run train().
    if args.do_train:
        if config['data_file'].startswith('binary'):
            # The training split is pre-excluded only when optim_mode is 0; the
            # other modes optimize length or sample by weight instead.
            text_train_set = processor.read_dataset(
                data_path, ['train'], args.mode, args.maxlen, args.minlen,
                pre_exclude=(args.optim_mode == 0))
            text_dev_set = processor.read_dataset(
                data_path, ['val'], args.mode, args.maxlen, args.minlen,
                pre_exclude=False)
        elif config['dataset'] == 'oos-eval':
            text_train_set = processor.read_dataset(data_path, ['train', 'oos_train'])
            text_dev_set = processor.read_dataset(data_path, ['val', 'oos_val'])
        elif config['dataset'] == 'smp':
            # NOTE(review): the result is unpacked into (samples, length) here,
            # while the eval / test sections use the return value of the same
            # call directly - confirm which form matches the processor.
            text_train_set, text_train_len = processor.read_dataset(data_path, ['train'])
            text_dev_set, text_dev_len = processor.read_dataset(data_path, ['val'])

        # With the ood option on, out-of-domain samples are removed from training.
        if config['ood']:
            if config['dataset'] == 'smp':
                text_train_set = [sample for sample in text_train_set if sample['domain'] != 'chat']
            elif config['dataset'] == 'oos-eval':
                text_train_set = [sample for sample in text_train_set if sample[1] != 'oos']

        train_features = processor.convert_to_ids(text_train_set)
        dev_features = processor.convert_to_ids(text_dev_set)

        if config['dataset'] == 'oos-eval':
            train_dataset = OOSDataset(train_features)
            dev_dataset = OOSDataset(dev_features)
        elif config['dataset'] == 'smp':
            train_dataset = SMPDataset(train_features)
            dev_dataset = SMPDataset(dev_features)

        train_result = train(train_dataset, dev_dataset)
        # save_feature(train_result['all_features'], os.path.join(args.output_dir, 'train_feature'))

    # ----------------------------- validation ----------------------------- #
    # Read the dev split, score it with eval() and record the metrics in the
    # log, in gross_result and in freeze_data.
    if args.do_eval:
        logger.info('#################### eval result at step {} ####################'.format(global_step))

        if config['data_file'].startswith('binary'):
            # The dev set is neither optimized by weight nor pre-excluded.
            text_dev_set = processor.read_dataset(
                data_path, ['val'], args.mode, args.maxlen, args.minlen, pre_exclude=False)
        elif config['dataset'] == 'oos-eval':
            text_dev_set = processor.read_dataset(data_path, ['val', 'oos_val'])
        elif config['dataset'] == 'smp':
            text_dev_set = processor.read_dataset(data_path, ['val'])

        dev_features = processor.convert_to_ids(text_dev_set)

        if config['dataset'] == 'oos-eval':
            dev_dataset = OOSDataset(dev_features)
        elif config['dataset'] == 'smp':
            dev_dataset = SMPDataset(dev_features)

        eval_result = eval(dev_dataset)

        # Log the headline metrics.
        logger.info('eval_eer: {}'.format(eval_result['eer']))
        logger.info('eval_oos_ind_precision: {}'.format(eval_result['oos_ind_precision']))
        logger.info('eval_oos_ind_recall: {}'.format(eval_result['oos_ind_recall']))
        logger.info('eval_oos_ind_f_score: {}'.format(eval_result['oos_ind_f_score']))
        logger.info('eval_auc: {}'.format(eval_result['auc']))
        logger.info('eval_fpr95: {}'.format(
            ErrorRateAt95Recall(eval_result['all_binary_y'], eval_result['y_score'])))

        # Keep this insertion order: gross_result is later written out as CSV columns.
        gross_result['eval_oos_ind_precision'] = eval_result['oos_ind_precision']
        gross_result['eval_oos_ind_recall'] = eval_result['oos_ind_recall']
        gross_result['eval_oos_ind_f_score'] = eval_result['oos_ind_f_score']
        gross_result['eval_eer'] = eval_result['eer']
        gross_result['eval_fpr95'] = ErrorRateAt95Recall(
            eval_result['all_binary_y'], eval_result['y_score'])
        gross_result['eval_auc'] = eval_result['auc']

        freeze_data['eval_result'] = eval_result

    # -------------------------------- test -------------------------------- #
    # Read the test split, score it with test(), then log / save the metrics,
    # the ROC curve, the error cases, the confusion matrix and (optionally) a
    # t-SNE plot of real vs. generated features.
    if args.do_test:
        logger.info('#################### test result at step {} ####################'.format(global_step))

        if config['data_file'].startswith('binary') or config['dataset'] == 'smp':
            # For binary data files the test set is always kept unchanged.
            text_test_set = processor.read_dataset(data_path, ['test'])
        elif config['dataset'] == 'oos-eval':
            text_test_set = processor.read_dataset(data_path, ['test', 'oos_test'])

        test_features = processor.convert_to_ids(text_test_set)

        if config['dataset'] == 'oos-eval':
            test_dataset = OOSDataset(test_features)
        elif config['dataset'] == 'smp':
            test_dataset = SMPDataset(test_features)

        test_result = test(test_dataset)

        # Log the headline metrics.
        logger.info('test_eer: {}'.format(test_result['eer']))
        logger.info('test_ood_ind_precision: {}'.format(test_result['oos_ind_precision']))
        logger.info('test_ood_ind_recall: {}'.format(test_result['oos_ind_recall']))
        logger.info('test_ood_ind_f_score: {}'.format(test_result['oos_ind_f_score']))
        logger.info('test_auc: {}'.format(test_result['auc']))
        logger.info('test_fpr95: {}'.format(
            ErrorRateAt95Recall(test_result['all_binary_y'], test_result['y_score'])))

        my_plot_roc(test_result['all_binary_y'], test_result['y_score'],
                    os.path.join(args.output_dir, 'roc_curve.png'))
        save_result(test_result, os.path.join(args.output_dir, 'test_result'))
        # save_feature(test_result['all_features'], os.path.join(args.output_dir, 'test_feature'))

        # Keep this insertion order: gross_result is later written out as CSV columns.
        gross_result['test_oos_ind_precision'] = test_result['oos_ind_precision']
        gross_result['test_oos_ind_recall'] = test_result['oos_ind_recall']
        gross_result['test_oos_ind_f_score'] = test_result['oos_ind_f_score']
        gross_result['test_eer'] = test_result['eer']
        gross_result['test_fpr95'] = ErrorRateAt95Recall(
            test_result['all_binary_y'], test_result['y_score'])
        gross_result['test_auc'] = test_result['auc']

        freeze_data['test_result'] = test_result

        # Write out the misclassified cases.
        if config['dataset'] == 'oos-eval':
            texts = [line[0] for line in text_test_set]
        elif config['dataset'] == 'smp':
            texts = [line['text'] for line in text_test_set]
        else:
            raise ValueError('The dataset {} is not supported.'.format(args.dataset))

        output_cases(texts, test_result['all_binary_y'], test_result['all_detection_binary_preds'],
                     os.path.join(args.output_dir, 'test_cases.csv'), processor)

        # confusion matrix
        plot_confusion_matrix(test_result['all_binary_y'], test_result['all_detection_binary_preds'],
                              args.output_dir)

        if args.do_vis:
            # Real test features followed by half as many generated ones.
            num_fake = len(test_dataset) // 2
            features = np.concatenate([test_result['all_features'], get_fake_feature(num_fake)], axis=0)
            # Project everything to 2-D for plotting.
            features = TSNE(n_components=2, verbose=1, n_jobs=-1).fit_transform(features)
            # Real samples keep their (multi-class or binary) label; generated ones get -1.
            real_labels = test_result['all_y'] if n_class > 2 else test_result['all_binary_y']
            labels = np.concatenate([real_labels, np.array([-1] * num_fake)], 0).reshape((-1, 1))
            # One row per point: the two t-SNE coordinates plus the label column.
            data = np.concatenate([features, labels], 1)
            fig = scatter_plot(data, processor)
            fig.savefig(os.path.join(args.output_dir, 'plot.png'))
            fig.show()
            freeze_data['feature_label'] = data
            # plot_train_test(train_result['all_features'], test_result['all_features'], args.output_dir)

    # --------------------------- persist results --------------------------- #
    # Dump everything collected in freeze_data, then export the validation and
    # test scores next to it as CSV files.
    with open(os.path.join(config['output_dir'], 'freeze_data.pkl'), 'wb') as f:
        pickle.dump(freeze_data, f)

    df = pd.DataFrame({
        'valid_y': freeze_data['valid_all_y'],
        'valid_score': freeze_data['valid_score'],
    })
    df.to_csv(os.path.join(config['output_dir'], 'valid_score.csv'))

    df = pd.DataFrame({
        'test_y': freeze_data['test_all_y'],
        'test_score': freeze_data['test_score'],
    })
    df.to_csv(os.path.join(config['output_dir'], 'test_score.csv'))

    if args.result != 'no':
        pd_result = pd.DataFrame(gross_result)
        # Seed 16 starts a fresh file with a header row; every other seed
        # appends its row to the existing file without a header.
        starts_new_file = args.seed == 16
        pd_result.to_csv(args.result + '_gross_result.csv', index=False,
                         mode='w' if starts_new_file else 'a', header=starts_new_file)
        # Seed 8192 additionally prints the result name and runs std_mean on the file.
        if args.seed == 8192:
            print(args.result)
            std_mean(args.result + '_gross_result.csv')