def main():
    """Train the BERT binary classifier on one dev fold and report test metrics.

    When ``LOAD_TEST_SPLIT`` is false, a 90/10 train-dev/test split is carved
    out of the module-level ``X``/``y`` and stored in the module-level
    ``X_train_dev``/``X_test``/``y_train_dev``/``y_test``; otherwise those
    four globals are expected to exist already.  The train-dev portion is
    then split with ``KFold`` and only the first fold is trained.

    Side effects: logs the metrics through ``logger`` and, if
    ``args.output_path`` is set, pickles the final predictions to that path.
    """
    # ``global`` applies to the whole function body regardless of the branch
    # it is written in, so both declarations are stated unconditionally.
    global X, y
    global X_train_dev, X_test, y_train_dev, y_test

    from sklearn.model_selection import ShuffleSplit, KFold

    if not LOAD_TEST_SPLIT:
        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
        train_index, test_index = next(ss.split(y))
        X_train_dev = [X[idx] for idx in train_index]
        X_test = [X[idx] for idx in test_index]
        y_train_dev = [y[idx] for idx in train_index]
        y_test = [y[idx] for idx in test_index]

    # No ``random_state`` here: without ``shuffle=True`` it never influenced
    # the split, and recent scikit-learn versions reject the combination with
    # a ValueError.  The folds produced are the same contiguous ones as before.
    kf = KFold(n_splits=NUM_FOLD)

    gold_list = None
    for fold, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', fold + 1)
        X_train = [X_train_dev[idx] for idx in train_index]
        X_dev = [X_train_dev[idx] for idx in dev_index]
        y_train = [y_train_dev[idx] for idx in train_index]
        y_dev = [y_train_dev[idx] for idx in dev_index]

        gold_list, pred_list = train(X_train, y_train, X_dev, y_dev, X_test,
                                     y_test)
        # Only the first fold is trained; cross-fold voting is disabled.
        break

    # With a single fold there is nothing to vote over: the fold's test
    # predictions are the final predictions (the log line below is kept as-is).
    final_pred = pred_list

    logger('Final test by majority voting:')
    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_multi_metrics(gold_list, final_pred)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_single_metrics(gold_list, final_pred)
    logger('Single only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    logger('Jaccard:', jaccard_score(gold_list, final_pred))
    logger('Bert Binary', args)

    if args.output_path is not None:
        with open(args.output_path, 'bw') as _f:
            pkl.dump(final_pred, _f)
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    """Fine-tune a ``BinaryBertClassifier`` and evaluate it on the test split.

    The encoder is optimised with ``AdamW`` and the decoder with ``Adam``,
    each optionally driven by its own cosine schedule.  The dev split is
    evaluated through ``eval`` at the end of every epoch (and, when
    ``args.eval_every`` is set, roughly four times per epoch); training stops
    as soon as ``eval`` reports ``exit_training``.

    Args:
        X_train, y_train: training inputs and labels.
        X_dev, y_dev: dev inputs and labels used for early stopping.
        X_test, y_test: test inputs and labels used for the final report.

    Returns:
        tuple: ``(gold, preds)`` - the concatenated test labels and the
        argmax predictions of the model, both as numpy arrays.

    Side effects: requires CUDA, logs through ``logger`` and saves the model
    to ``nlpcc_bert.pt`` in the working directory.
    """
    train_data = TrainDataReader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    dev_data = TrainDataReader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False)

    test_data = TrainDataReader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

    model = BinaryBertClassifier(hidden_dim=SRC_HIDDEN_DIM,
                                 num_label=NUM_EMO,
                                 args=args)
    model.init_encoder(BERT_MODEL)
    model.cuda()

    loss_criterion = nn.CrossEntropyLoss()

    # Encoder setup: bias and LayerNorm weights are kept out of weight decay.
    learning_rate, adam_epsilon, weight_decay = ENCODER_LEARNING_RATE, 1e-8, 0
    no_decay = ['bias', 'LayerNorm.weight']
    encoder_named = [(n, p) for n, p in model.named_parameters()
                     if n.startswith('encoder')]
    encoder_optimizer_grouped_parameters = [{
        'params': [
            p for n, p in encoder_named
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': weight_decay,
    }, {
        'params': [
            p for n, p in encoder_named if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0,
    }]
    encoder_optimizer = AdamW(encoder_optimizer_grouped_parameters,
                              lr=learning_rate,
                              eps=adam_epsilon)

    # Decoder setup
    decoder_optimizer_grouped_parameters = [{
        'params':
        [p for n, p in model.named_parameters() if n.startswith("decoder")],
        'lr': args.de_lr,
    }]
    decoder_optimizer = optim.Adam(decoder_optimizer_grouped_parameters)

    if args.glorot_init:
        logger('use glorot initialization')
        for group in decoder_optimizer_grouped_parameters:
            nn_utils.glorot_init(group['params'])

    if args.huang_init:
        nn_utils.huang_init(model.named_parameters(),
                            uniform=not args.normal_init,
                            startswith='decoder')

    if args.scheduler:
        epoch_to_step = int(len(train_data) / BATCH_SIZE)
        encoder_scheduler = get_cosine_schedule_with_warmup(
            encoder_optimizer,
            num_warmup_steps=WARMUP_EPOCH * epoch_to_step,
            num_training_steps=STOP_EPOCH * epoch_to_step,
            min_lr_ratio=args.min_lr_ratio)
        # The decoder schedule must wrap the decoder optimizer.  It used to be
        # attached to ``encoder_optimizer``, which left the decoder learning
        # rate unscheduled and let this schedule overwrite the encoder's.
        decoder_scheduler = get_cosine_schedule_with_warmup(
            decoder_optimizer,
            num_warmup_steps=0,  # NOTE: decoder warmup is hard-coded to 0
            num_training_steps=STOP_EPOCH * epoch_to_step,
            min_lr_ratio=args.min_lr_ratio)

    es = EarlyStopping(patience=PATIENCE)
    best_model = None
    exit_training = False
    # Evaluate about four times per epoch; clamp to 1 so that a training set
    # smaller than four batches does not cause a modulo-by-zero below.
    EVAL_EVERY = max(1, int(len(train_data) / BATCH_SIZE / 4))

    update_step = 0
    # ``max_epoch`` is inclusive: epochs 1..max_epoch are run.
    for epoch in range(1, args.max_epoch + 1):
        logger('Epoch: ' + str(epoch) + '===================================')
        train_loss = 0.0

        for src, mask, label in tqdm(train_loader, total=len(train_loader)):
            model.train()
            update_step += 1
            # Schedulers are advanced before the optimizer update, as in the
            # original ordering of this loop.
            if args.scheduler:
                encoder_scheduler.step()
                decoder_scheduler.step()

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            decoder_logit = model(src.cuda(), mask.cuda())

            loss = loss_criterion(
                decoder_logit.view(-1, decoder_logit.shape[-1]),
                label.view(-1).cuda())

            loss.backward()
            # Gradient clipping is no longer built into AdamW.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            encoder_optimizer.step()
            decoder_optimizer.step()
            # Weight the batch loss by batch size; averaged per sample below.
            train_loss += loss.item() * src.shape[0]
            del decoder_logit, loss

            if args.eval_every is not None and update_step % EVAL_EVERY == 0:
                model, best_model, exit_training = eval(
                    model, best_model, loss_criterion, es, dev_loader,
                    dev_data)
                if exit_training:
                    break

        logger(f"Training Loss for epoch {epoch}:",
               train_loss / len(train_data))
        # If early stopping already fired inside the epoch, ``model`` is the
        # restored best model; do not feed it to the early stopper again.
        if exit_training:
            break
        model, best_model, exit_training = eval(model, best_model,
                                                loss_criterion, es, dev_loader,
                                                dev_data)
        if exit_training:
            break

    pred_list = []
    gold_list = []
    model.eval()
    with torch.no_grad():
        for _data, _mask, _label in test_loader:
            decoder_logit = model(_data.cuda(), _mask.cuda())
            pred_list.append(np.argmax(decoder_logit.cpu().numpy(), axis=-1))
            gold_list.append(_label.numpy())
            del decoder_logit

    torch.save(model, 'nlpcc_bert.pt')
    preds = np.concatenate(pred_list, axis=0)
    gold = np.concatenate(gold_list, axis=0)

    logger("NOTE, this is on the test set")
    metric = get_metrics(gold, preds)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_multi_metrics(gold, preds)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    logger('Jaccard:', jaccard_score(gold, preds))

    return gold, preds
def eval(model, best_model, loss_criterion, es, dev_loader, dev_data):
    """Evaluate ``model`` on the dev split and update early-stopping state.

    NOTE: the name shadows the ``eval`` builtin; it is kept because callers
    in this file use it.

    Args:
        model: model under training; called as ``model(data, mask)``.
        best_model: best checkpoint so far, or ``None`` if none exists yet.
        loss_criterion: loss applied to the flattened logits and labels.
        es: early stopper exposing ``step``, ``is_best`` and ``cur_patience``.
        dev_loader: iterable of ``(data, mask, label)`` batches.
        dev_data: dev dataset; only its length is used, to average the loss.

    Returns:
        tuple: ``(model, best_model, exit_training)``.  When early stopping
        fires, ``model`` is the best checkpoint and ``exit_training`` is True.

    Raises:
        ValueError: if ``args.criterion`` is not one of ``loss``, ``macro``,
            ``micro``, ``h_loss`` or ``jaccard``.
    """
    pred_list = []
    gold_list = []
    test_loss_sum = 0.0
    exit_training = False
    model.eval()
    with torch.no_grad():
        for _data, _mask, _label in dev_loader:
            decoder_logit = model(_data.cuda(), _mask.cuda())
            test_loss = loss_criterion(
                decoder_logit.view(-1, decoder_logit.shape[-1]),
                _label.view(-1).cuda())
            # Weight by batch size so the logged value is a per-sample mean.
            test_loss_sum += test_loss.item() * _data.shape[0]
            gold_list.append(_label.numpy())
            pred_list.append(np.argmax(decoder_logit.cpu().numpy(), axis=-1))
            del decoder_logit, test_loss

    preds = np.concatenate(pred_list, axis=0)
    gold = np.concatenate(gold_list, axis=0)
    metric = get_metrics(gold, preds)
    jaccard = jaccard_score(gold, preds)
    logger("Evaluation results:")
    logger("Evaluation Loss", test_loss_sum / len(dev_data))

    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4], 'micro P', metric[5], 'micro R', metric[6])
    metric_2 = get_multi_metrics(gold, preds)
    logger('Multi only: h_loss:', metric_2[0], 'macro F', metric_2[1],
           'micro F', metric_2[4])
    logger('Jaccard:', jaccard)

    # Every criterion is expressed so that lower is better for ``es.step``.
    candidates = {
        'loss': test_loss_sum,
        'macro': 1 - metric[1],
        'micro': 1 - metric[4],
        'h_loss': metric[0],
        'jaccard': 1 - jaccard,
    }
    if args.criterion not in candidates:
        raise ValueError(
            f'unknown early-stopping criterion: {args.criterion!r}')
    criterion = candidates[args.criterion]

    if es.step(criterion):  # overfitting
        logger('overfitting, loading best model ...')
        # If stopping fires before any checkpoint was stored, keep the current
        # model rather than handing ``None`` back to the caller.
        if best_model is not None:
            model = best_model
        exit_training = True
    elif es.is_best():
        logger('saving best model ...')
        best_model = deepcopy(model)
    else:
        logger(f'patience {es.cur_patience} not best model , ignoring ...')
        if best_model is None:
            best_model = deepcopy(model)

    return model, best_model, exit_training
Esempio n. 4
0
def main():
    """Cross-validate the LSTM classifier and report majority-voted test metrics.

    Reads the module-level ``X_train_dev``/``X_test``/``y_train_dev``/
    ``y_test``, optionally permutes the label columns according to
    ``args.shuffle_emo``, builds the GloVe tokenizer and embedding, trains one
    model per ``KFold`` split (only the first when ``args.no_cross`` is set)
    and combines the per-fold test predictions with ``find_majority``.

    Side effects: rebinds the global ``y_train_dev``/``y_test`` when labels
    are permuted, logs through ``logger`` and, if ``args.output_path`` is
    set, pickles the final predictions to that path.
    """
    global X_train_dev, X_test, y_train_dev, y_test
    if args.shuffle_emo is not None:
        # ``shuffle_emo`` is a whitespace-separated list of column indices.
        new_order = np.asarray([int(tok) for tok in args.shuffle_emo.split()])
        y_train_dev = np.asarray(y_train_dev).T[new_order].T
        y_test = np.asarray(y_test).T[new_order].T

    glove_tokenizer.build_tokenizer(X_train_dev + X_test,
                                    vocab_size=VOCAB_SIZE)
    glove_tokenizer.build_embedding(GLOVE_EMB_PATH, dataset_name=data_set_name)

    from sklearn.model_selection import KFold

    # ``random_state`` only has an effect together with ``shuffle=True``, and
    # recent scikit-learn versions raise a ValueError when it is passed
    # without shuffling.  Shuffle exactly when a seed was supplied so that
    # ``args.dev_split_seed`` actually controls the dev split.
    split_seed = args.dev_split_seed
    kf = KFold(n_splits=args.folds,
               shuffle=split_seed is not None,
               random_state=split_seed)

    all_preds = []
    gold_list = None

    for fold, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', fold + 1)
        X_train = [X_train_dev[idx] for idx in train_index]
        X_dev = [X_train_dev[idx] for idx in dev_index]
        y_train = [y_train_dev[idx] for idx in train_index]
        y_dev = [y_train_dev[idx] for idx in dev_index]

        gold_list, pred_list = train(X_train, y_train, X_dev, y_dev, X_test,
                                     y_test)
        all_preds.append(pred_list)
        if args.no_cross:
            break

    # Stack to (fold, row, column) and vote per cell across the fold axis.
    all_preds = np.stack(all_preds, axis=0)

    shape = all_preds[0].shape
    mj = np.zeros(shape)
    for row in range(shape[0]):
        for col in range(shape[1]):
            mj[row, col] = find_majority(all_preds[:, row, col].reshape(-1))[0]

    final_pred = mj

    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_multi_metrics(gold_list, final_pred)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_single_metrics(gold_list, final_pred)
    logger('Single only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])

    logger('Final Jaccard:', jaccard_score(gold_list, final_pred))
    logger(os.path.basename(__file__))
    logger(args)

    if args.output_path is not None:
        with open(args.output_path, 'bw') as _f:
            pkl.dump(final_pred, _f)
Esempio n. 5
0
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    """Train a ``CCLSTMClassifier`` with early stopping and test it.

    Encoder and decoder parameters get separate learning rates
    (``args.en_lr`` / ``args.de_lr``) in a single ``Adam`` optimizer.  The dev
    split is evaluated through ``eval`` about four times per epoch and
    training stops as soon as ``eval`` reports ``exit_training``.

    Args:
        X_train, y_train: training inputs and labels.
        X_dev, y_dev: dev inputs and labels used for early stopping.
        X_test: test inputs.
        y_test: test labels, used only for the final metric report.

    Returns:
        tuple: ``(gold, preds)`` - ``y_test`` as a numpy array and the
        concatenated greedy-decoded test predictions.

    Side effects: requires CUDA and logs through ``logger``.
    """
    train_set = TrainDataReader(X_train, y_train, MAX_LEN_DATA)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

    dev_set = TrainDataReader(X_dev, y_dev, MAX_LEN_DATA)
    dev_loader = DataLoader(dev_set, batch_size=BATCH_SIZE * 3, shuffle=False)

    test_set = TestDataReader(X_test, MAX_LEN_DATA)
    test_loader = DataLoader(test_set,
                             batch_size=BATCH_SIZE * 3,
                             shuffle=False)

    # Model initialize
    model = CCLSTMClassifier(emb_dim=SRC_EMB_DIM,
                             hidden_dim=SRC_HIDDEN_DIM,
                             num_label=NUM_EMO,
                             vocab_size=glove_tokenizer.get_vocab_size(),
                             args=args)

    def _in_encoder_group(name):
        # With fixed embeddings the embedding table is left out of the
        # optimizer; every other encoder parameter is included.
        if not name.startswith("encoder"):
            return False
        return not (args.fix_emb and 'encoder.embeddings' in name)

    named_params = list(model.named_parameters())
    para_group = [{
        'params': [p for n, p in named_params if _in_encoder_group(n)],
        'lr': args.en_lr,
    }, {
        'params': [p for n, p in named_params if n.startswith("decoder")],
        'lr': args.de_lr,
    }]

    loss_criterion = nn.CrossEntropyLoss()  # only forwarded to eval()
    optimizer = optim.Adam(para_group)
    if args.scheduler:
        epoch_to_step = len(train_set) / BATCH_SIZE
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(WARMUP_EPOCH * epoch_to_step),
            num_training_steps=int(STOP_EPOCH * epoch_to_step),
            min_lr_ratio=args.min_lr_ratio)

    if args.glorot_init:
        logger('use glorot initialization')
        for group in para_group:
            nn_utils.glorot_init(group['params'])

    # Loaded after the optional glorot init, so the embedding values set here
    # are the ones the model starts training with.
    model.load_encoder_embedding(glove_tokenizer.get_embeddings(),
                                 fix_emb=args.fix_emb)
    model.cuda()

    # Start training
    # Evaluate about four times per epoch; clamp to 1 so that a training set
    # smaller than four batches does not cause a modulo-by-zero below.
    EVAL_EVERY = max(1, int(len(train_set) / BATCH_SIZE / 4))
    best_model = None
    es = EarlyStopping(patience=PATIENCE)
    update_step = 0
    exit_training = False
    for epoch in range(1, MAX_EPOCH + 1):
        logger('Training on epoch=%d -------------------------' % (epoch))
        train_loss_sum = 0.0

        for src, src_len, trg in tqdm(train_loader, total=len(train_loader)):
            model.train()
            update_step += 1

            optimizer.zero_grad()

            elmo_src = elmo_encode(src)

            loss = model.loss(src.cuda(), src_len.cuda(), elmo_src.cuda(),
                              trg.cuda())

            loss.backward()
            # Weight the batch loss by batch size; averaged per sample below.
            train_loss_sum += loss.item() * src.shape[0]

            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            if args.scheduler:
                scheduler.step()

            if update_step % EVAL_EVERY == 0:
                model, best_model, exit_training = eval(
                    model, best_model, loss_criterion, es, dev_loader, dev_set)
                if exit_training:
                    break

        logger(f"Training Loss for epoch {epoch}:",
               train_loss_sum / len(train_set))
        if exit_training:
            break

    # final_testing
    model.eval()
    preds = []
    logger("Testing:")
    with torch.no_grad():
        # The progress total is the real batch count: the test loader uses
        # ``BATCH_SIZE * 3``, not ``BATCH_SIZE``.
        for src, src_len in tqdm(test_loader, total=len(test_loader)):
            elmo_src = elmo_encode(src)
            pred = model.greedy_decode_batch(src.cuda(), src_len.cuda(),
                                             elmo_src.cuda())
            preds.append(pred.cpu().numpy())
            del pred

    preds = np.concatenate(preds, axis=0)
    gold = np.asarray(y_test)
    logger("NOTE, this is on the test set")
    metric = get_metrics(gold, preds)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_multi_metrics(gold, preds)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    logger('Jaccard:', jaccard_score(gold, preds))
    return gold, preds
Esempio n. 6
0
def eval(model, best_model, loss_criterion, es, dev_loader, dev_set):
    """Evaluate ``model`` on the dev split and update early-stopping state.

    NOTE: the name shadows the ``eval`` builtin; it is kept because callers
    in this file use it.

    Args:
        model: model under training; must provide ``greedy_decode_batch`` and
            ``loss``.
        best_model: best checkpoint so far, or ``None`` if none exists yet.
        loss_criterion: unused here (the loss comes from ``model.loss``);
            kept so the signature stays compatible with existing callers.
        es: early stopper exposing ``step``, ``is_best`` and ``cur_patience``.
        dev_loader: iterable of ``(src, src_len, trg)`` batches.
        dev_set: dev dataset; only its length is used, to average the loss.

    Returns:
        tuple: ``(model, best_model, exit_training)``.  When early stopping
        fires, ``model`` is the best checkpoint and ``exit_training`` is True.

    Raises:
        ValueError: if ``args.criterion`` is not one of ``loss``, ``macro``,
            ``micro``, ``h_loss`` or ``jaccard``.
    """
    exit_training = False
    model.eval()
    test_loss_sum = 0.0
    preds = []
    gold = []
    logger("Evaluating:")
    with torch.no_grad():
        for src, src_len, trg in dev_loader:
            elmo_src = elmo_encode(src)
            src_gpu, len_gpu, elmo_gpu = (src.cuda(), src_len.cuda(),
                                          elmo_src.cuda())

            # Accumulate the dev loss the same way the training loop does.
            # It used to stay at 0, which made the logged "Evaluation Loss"
            # and the 'loss' early-stopping criterion meaningless.
            batch_loss = model.loss(src_gpu, len_gpu, elmo_gpu, trg.cuda())
            test_loss_sum += batch_loss.item() * src.shape[0]

            pred = model.greedy_decode_batch(src_gpu, len_gpu, elmo_gpu)

            gold.append(trg.numpy())
            preds.append(pred.cpu().numpy())
            del pred, batch_loss

    preds = np.concatenate(preds, axis=0)
    gold = np.concatenate(gold, axis=0)
    metric = get_metrics(gold, preds)
    jaccard = jaccard_score(gold, preds)
    logger("Evaluation results:")
    logger("Evaluation Loss", test_loss_sum / len(dev_set))

    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4], 'micro P', metric[5], 'micro R', metric[6])
    metric_2 = get_multi_metrics(gold, preds)
    logger('Multi only: h_loss:', metric_2[0], 'macro F', metric_2[1],
           'micro F', metric_2[4])
    logger('Jaccard:', jaccard)

    # Every criterion is expressed so that lower is better for ``es.step``.
    candidates = {
        'loss': test_loss_sum,
        'macro': 1 - metric[1],
        'micro': 1 - metric[4],
        'h_loss': metric[0],
        'jaccard': 1 - jaccard,
    }
    if args.criterion not in candidates:
        raise ValueError(
            f'unknown early-stopping criterion: {args.criterion!r}')
    criterion = candidates[args.criterion]

    if es.step(criterion):  # overfitting
        logger('overfitting, loading best model ...')
        # If stopping fires before any checkpoint was stored, keep the current
        # model rather than handing ``None`` back to the caller.
        if best_model is not None:
            model = best_model
        exit_training = True
    elif es.is_best():
        logger('saving best model ...')
        best_model = deepcopy(model)
    else:
        logger(f'patience {es.cur_patience} not best model , ignoring ...')
        if best_model is None:
            best_model = deepcopy(model)

    return model, best_model, exit_training