def main():
    if not LOAD_TEST_SPLIT:
        global X, y
    else:
        global X_train_dev, X_test, y_train_dev, y_test

    from sklearn.model_selection import ShuffleSplit, KFold

    if not LOAD_TEST_SPLIT:
        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
        ss.get_n_splits(X, y)
        train_index, test_index = next(ss.split(y))
        X_train_dev, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train_dev, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

    # shuffle=True is required for random_state to take effect in current scikit-learn
    kf = KFold(n_splits=NUM_FOLD, shuffle=True, random_state=0)
    gold_list = None
    # all_preds = []
    for i, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', i + 1)
        X_train, X_dev = [X_train_dev[i] for i in train_index], \
                         [X_train_dev[i] for i in dev_index]
        y_train, y_dev = [y_train_dev[i] for i in train_index], \
                         [y_train_dev[i] for i in dev_index]
        gold_list, pred_list = train(X_train, y_train, X_dev, y_dev, X_test, y_test)
        # all_preds.append(pred_list)
        break

    # all_preds = np.stack(all_preds, axis=0)
    # shape = all_preds[0].shape
    # mj = np.zeros(shape)
    # for m in range(shape[0]):
    #     for n in range(shape[1]):
    #         mj[m, n] = find_majority(np.asarray(all_preds[:, m, n]).reshape((-1)))[0]

    final_pred = pred_list
    logger('Final test by majority voting:')
    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    metric = get_multi_metrics(gold_list, final_pred)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    metric = get_single_metrics(gold_list, final_pred)
    logger('Single only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    logger('Jaccard:', jaccard_score(gold_list, final_pred))
    logger('Bert Binary', args)

    if args.output_path is not None:
        with open(args.output_path, 'bw') as _f:
            pkl.dump(final_pred, _f)
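# For reference, a self-contained sketch of the split pattern main() relies on:
# ShuffleSplit carves out a fixed 10% test set, then KFold yields train/dev folds
# over the remaining 90%. The toy data and names below are illustrative only.
def _split_demo():
    from sklearn.model_selection import ShuffleSplit, KFold

    X = ['sentence %d' % i for i in range(100)]        # toy inputs
    y = [[i % 2, (i // 2) % 2] for i in range(100)]    # toy multi-label targets

    ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
    train_dev_idx, test_idx = next(ss.split(X, y))
    X_train_dev = [X[i] for i in train_dev_idx]
    y_train_dev = [y[i] for i in train_dev_idx]
    X_test = [X[i] for i in test_idx]
    y_test = [y[i] for i in test_idx]

    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for fold, (train_idx, dev_idx) in enumerate(kf.split(X_train_dev), start=1):
        print('fold %d: %d train / %d dev / %d test'
              % (fold, len(train_idx), len(dev_idx), len(X_test)))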
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    train_data = TrainDataReader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    dev_data = TrainDataReader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data, batch_size=BATCH_SIZE, shuffle=False)
    test_data = TrainDataReader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

    is_broken = False
    model = BinaryBertClassifier(hidden_dim=SRC_HIDDEN_DIM, num_label=NUM_EMO, args=args)
    model.init_encoder(BERT_MODEL)
    model.cuda()

    loss_criterion = nn.CrossEntropyLoss()

    # Encoder setup
    learning_rate, adam_epsilon, weight_decay, warmup_steps = ENCODER_LEARNING_RATE, 1e-8, 0, 0
    no_decay = ['bias', 'LayerNorm.weight']
    encoder_optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay) and n.startswith('encoder')
            ],
            'weight_decay': weight_decay
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay) and n.startswith('encoder')
            ],
            'weight_decay': 0.0
        },
    ]
    encoder_optimizer = AdamW(encoder_optimizer_grouped_parameters,
                              lr=learning_rate, eps=adam_epsilon)

    # Decoder setup
    decoder_optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters() if n.startswith("decoder")],
        'lr': args.de_lr
    }]
    decoder_optimizer = optim.Adam(decoder_optimizer_grouped_parameters)

    if args.glorot_init:
        logger('use glorot initialization')
        for group in decoder_optimizer_grouped_parameters:
            nn_utils.glorot_init(group['params'])

    if args.huang_init:
        nn_utils.huang_init(model.named_parameters(),
                            uniform=not args.normal_init,
                            startswith='decoder')

    if args.scheduler:
        epoch_to_step = int(len(train_data) / BATCH_SIZE)
        encoder_scheduler = get_cosine_schedule_with_warmup(
            encoder_optimizer,
            num_warmup_steps=WARMUP_EPOCH * epoch_to_step,
            num_training_steps=STOP_EPOCH * epoch_to_step,
            min_lr_ratio=args.min_lr_ratio)
        decoder_scheduler = get_cosine_schedule_with_warmup(
            decoder_optimizer,
            num_warmup_steps=0,  # NOTE: decoder warmup steps set to 0, hardcoded warning
            num_training_steps=STOP_EPOCH * epoch_to_step,
            min_lr_ratio=args.min_lr_ratio)

    es = EarlyStopping(patience=PATIENCE)
    best_model = None
    exit_training = None
    EVAL_EVERY = int(len(train_data) / BATCH_SIZE / 4)
    update_step = 0
    for epoch in range(1, args.max_epoch):
        logger('Epoch: ' + str(epoch) + '===================================')
        train_loss = 0
        for i, (src, mask, label) in tqdm(enumerate(train_loader),
                                          total=int(len(train_data) / BATCH_SIZE)):
            model.train()
            update_step += 1

            if args.scheduler:
                encoder_scheduler.step()
                decoder_scheduler.step()

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            decoder_logit = model(src.cuda(), mask.cuda())
            loss = loss_criterion(decoder_logit.view(-1, decoder_logit.shape[-1]),
                                  label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), max_grad_norm
            )  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)

            encoder_optimizer.step()
            decoder_optimizer.step()
            # scheduler.step()

            train_loss += loss.data.cpu().numpy() * src.shape[0]
            del decoder_logit, loss
            # break

            if update_step % EVAL_EVERY == 0 and args.eval_every is not None:
                model, best_model, exit_training = eval(
                    model, best_model, loss_criterion, es, dev_loader, dev_data)
                if exit_training:
                    break

        logger(f"Training Loss for epoch {epoch}:", train_loss / len(train_data))
        model, best_model, exit_training = eval(model, best_model, loss_criterion,
                                                es, dev_loader, dev_data)
        if exit_training:
            break

    pred_list = []
    gold_list = []
    model.eval()
    for _, (_data, _mask, _label) in enumerate(test_loader):
        with torch.no_grad():
            decoder_logit = model(_data.cuda(), _mask.cuda())
            pred_list.append(np.argmax(decoder_logit.data.cpu().numpy(), axis=-1))
            gold_list.append(_label.numpy())
            del decoder_logit
        # break

    torch.save(model, 'nlpcc_bert.pt')

    # pred_list_2 = np.concatenate(pred_list, axis=0)[:, 1]
    preds = np.concatenate(pred_list, axis=0)
    gold = np.concatenate(gold_list, axis=0)
    binary_gold = gold
    binary_preds = preds

    logger("NOTE, this is on the test set")
    metric = get_metrics(binary_gold, binary_preds)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    metric = get_multi_metrics(binary_gold, binary_preds)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # show_classification_report(binary_gold, binary_preds)
    logger('Jaccard:', jaccard_score(gold, preds))
    return binary_gold, binary_preds
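# The get_cosine_schedule_with_warmup(..., min_lr_ratio=...) used above appears to be a
# project-local variant rather than the stock transformers helper (which has no
# min_lr_ratio argument). A minimal sketch of such a schedule built on
# torch.optim.lr_scheduler.LambdaLR, for reference only:
import math
from torch.optim.lr_scheduler import LambdaLR

def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps,
                                    num_training_steps, min_lr_ratio=0.0):
    """Linear warmup followed by cosine decay that floors at min_lr_ratio."""
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            # linear warmup from 0 up to the base learning rate
            return float(current_step) / float(max(1, num_warmup_steps))
        # cosine decay from 1.0 down to min_lr_ratio over the remaining steps
        progress = float(current_step - num_warmup_steps) / \
            float(max(1, num_training_steps - num_warmup_steps))
        cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
        return min_lr_ratio + (1.0 - min_lr_ratio) * cosine
    return LambdaLR(optimizer, lr_lambda)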
def eval(model, best_model, loss_criterion, es, dev_loader, dev_data):
    pred_list = []
    gold_list = []
    test_loss_sum = 0
    exit_training = False
    model.eval()
    for _, (_data, _mask, _label) in enumerate(dev_loader):
        with torch.no_grad():
            decoder_logit = model(_data.cuda(), _mask.cuda())
            test_loss = loss_criterion(decoder_logit.view(-1, decoder_logit.shape[-1]),
                                       _label.view(-1).cuda())
            test_loss_sum += test_loss.data.cpu().numpy() * _data.shape[0]
            gold_list.append(_label.numpy())
            pred_list.append(np.argmax(decoder_logit.data.cpu().numpy(), axis=-1))
            del decoder_logit, test_loss
        # break

    preds = np.concatenate(pred_list, axis=0)
    gold = np.concatenate(gold_list, axis=0)
    metric = get_metrics(gold, preds)
    # report_all(gold_list, pred_list)
    jaccard = jaccard_score(gold, preds)

    logger("Evaluation results:")
    # show_classification_report(binary_gold, binary_preds)
    logger("Evaluation Loss", test_loss_sum / len(dev_data))
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4],
           'micro P', metric[5], 'micro R', metric[6])
    metric_2 = get_multi_metrics(gold, preds)
    logger('Multi only: h_loss:', metric_2[0], 'macro F', metric_2[1], 'micro F', metric_2[4])
    logger('Jaccard:', jaccard)

    if args.criterion == 'loss':
        criterion = test_loss_sum
    elif args.criterion == 'macro':
        criterion = 1 - metric[1]
    elif args.criterion == 'micro':
        criterion = 1 - metric[4]
    elif args.criterion == 'h_loss':
        criterion = metric[0]
    elif args.criterion == 'jaccard':
        criterion = 1 - jaccard
    else:
        raise ValueError

    if es.step(criterion):  # overfitting
        del model
        logger('overfitting, loading best model ...')
        model = best_model
        exit_training = True
    else:
        if es.is_best():
            if best_model is not None:
                del best_model
            logger('saving best model ...')
            best_model = deepcopy(model)
        else:
            logger(f'patience {es.cur_patience} not best model , ignoring ...')

    if best_model is None:
        best_model = deepcopy(model)

    return model, best_model, exit_training
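# EarlyStopping above is a project utility whose implementation is not shown here.
# A minimal sketch of the interface eval() relies on (step() returns True once the
# monitored value has stopped improving for `patience` checks, is_best() flags a new
# best, cur_patience counts non-improving checks); the details are assumptions:
class EarlyStopping:
    def __init__(self, patience=7):
        self.patience = patience
        self.cur_patience = 0
        self.best = None
        self._is_best = False

    def step(self, value):
        # Lower is better: eval() passes a loss, 1 - F1, or 1 - Jaccard.
        if self.best is None or value < self.best:
            self.best = value
            self.cur_patience = 0
            self._is_best = True
            return False
        self._is_best = False
        self.cur_patience += 1
        return self.cur_patience > self.patience

    def is_best(self):
        return self._is_best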
def main():
    global X_train_dev, X_test, y_train_dev, y_test

    if args.shuffle_emo is not None:
        new_order = np.asarray([int(tmp) for tmp in args.shuffle_emo.split()])
        y_train_dev = np.asarray(y_train_dev).T[new_order].T
        y_test = np.asarray(y_test).T[new_order].T

    glove_tokenizer.build_tokenizer(X_train_dev + X_test, vocab_size=VOCAB_SIZE)
    glove_tokenizer.build_embedding(GLOVE_EMB_PATH, dataset_name=data_set_name)

    from sklearn.model_selection import ShuffleSplit, KFold
    # shuffle=True is required for the seed to take effect in current scikit-learn
    kf = KFold(n_splits=args.folds, shuffle=True, random_state=args.dev_split_seed)
    # kf.get_n_splits(X_train_dev)

    all_preds = []
    gold_list = None
    for i, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', i + 1)
        X_train, X_dev = [X_train_dev[i] for i in train_index], \
                         [X_train_dev[i] for i in dev_index]
        y_train, y_dev = [y_train_dev[i] for i in train_index], \
                         [y_train_dev[i] for i in dev_index]
        gold_list, pred_list = train(X_train, y_train, X_dev, y_dev, X_test, y_test)
        all_preds.append(pred_list)
        if args.no_cross:
            break

    # Majority vote over the per-fold predictions for every (sample, label) entry
    all_preds = np.stack(all_preds, axis=0)
    shape = all_preds[0].shape
    mj = np.zeros(shape)
    for m in range(shape[0]):
        for n in range(shape[1]):
            mj[m, n] = find_majority(np.asarray(all_preds[:, m, n]).reshape((-1)))[0]
    final_pred = mj

    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    metric = get_multi_metrics(gold_list, final_pred)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    metric = get_single_metrics(gold_list, final_pred)
    logger('Single only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    logger('Final Jaccard:', jaccard_score(gold_list, final_pred))
    logger(os.path.basename(__file__))
    logger(args)

    if args.output_path is not None:
        with open(args.output_path, 'bw') as _f:
            pkl.dump(final_pred, _f)
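# find_majority is another project utility; the voting loop above only relies on it
# returning the most frequent value at index [0]. A hedged sketch based on
# collections.Counter:
from collections import Counter

def find_majority(values):
    # Return (majority_value, count) for the per-fold predictions of one
    # (sample, label) cell.
    value, count = Counter(list(values)).most_common(1)[0]
    return value, count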
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    train_set = TrainDataReader(X_train, y_train, MAX_LEN_DATA)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
    dev_set = TrainDataReader(X_dev, y_dev, MAX_LEN_DATA)
    dev_loader = DataLoader(dev_set, batch_size=BATCH_SIZE * 3, shuffle=False)
    test_set = TestDataReader(X_test, MAX_LEN_DATA)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE * 3, shuffle=False)

    # Model initialize
    model = CCLSTMClassifier(emb_dim=SRC_EMB_DIM,
                             hidden_dim=SRC_HIDDEN_DIM,
                             num_label=NUM_EMO,
                             vocab_size=glove_tokenizer.get_vocab_size(),
                             args=args)

    if args.fix_emb:
        para_group = [
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if n.startswith("encoder") and 'encoder.embeddings' not in n
                ],
                'lr': args.en_lr
            },
            {
                'params': [p for n, p in model.named_parameters() if n.startswith("decoder")],
                'lr': args.de_lr
            },
        ]
    else:
        para_group = [
            {
                'params': [p for n, p in model.named_parameters() if n.startswith("encoder")],
                'lr': args.en_lr
            },
            {
                'params': [p for n, p in model.named_parameters() if n.startswith("decoder")],
                'lr': args.de_lr
            },
        ]

    loss_criterion = nn.CrossEntropyLoss()  # reduction='sum'
    optimizer = optim.Adam(para_group)

    if args.scheduler:
        epoch_to_step = len(train_set) / BATCH_SIZE
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(WARMUP_EPOCH * epoch_to_step),
            num_training_steps=int(STOP_EPOCH * epoch_to_step),
            min_lr_ratio=args.min_lr_ratio)

    if args.glorot_init:
        logger('use glorot initialization')
        for group in para_group:
            nn_utils.glorot_init(group['params'])

    model.load_encoder_embedding(glove_tokenizer.get_embeddings(), fix_emb=args.fix_emb)
    model.cuda()

    # Start training
    EVAL_EVERY = int(len(train_set) / BATCH_SIZE / 4)
    best_model = None
    es = EarlyStopping(patience=PATIENCE)
    update_step = 0
    exit_training = False
    for epoch in range(1, MAX_EPOCH + 1):
        logger('Training on epoch=%d -------------------------' % (epoch))
        train_loss_sum = 0
        # print('Current encoder learning rate', scheduler.get_lr())
        # print('Current decoder learning rate', scheduler.get_lr())
        for i, (src, src_len, trg) in tqdm(enumerate(train_loader),
                                           total=int(len(train_set) / BATCH_SIZE)):
            model.train()
            update_step += 1
            # print('i=%d: ' % (i))
            # trg = torch.index_select(trg, 1, torch.LongTensor(list(range(1, len(EMOS)+1))))

            optimizer.zero_grad()
            elmo_src = elmo_encode(src)
            loss = model.loss(src.cuda(), src_len.cuda(), elmo_src.cuda(), trg.cuda())
            loss.backward()
            train_loss_sum += loss.data.cpu().numpy() * src.shape[0]

            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            if args.scheduler:
                scheduler.step()

            if update_step % EVAL_EVERY == 0:
                model, best_model, exit_training = eval(
                    model, best_model, loss_criterion, es, dev_loader, dev_set)
                if exit_training:
                    break

        logger(f"Training Loss for epoch {epoch}:", train_loss_sum / len(train_set))
        model, best_model, exit_training = eval(model, best_model, loss_criterion,
                                                es, dev_loader, dev_set)
        if exit_training:
            break

    # final_testing
    model.eval()
    preds = []
    logger("Testing:")
    for i, (src, src_len) in tqdm(enumerate(test_loader),
                                  total=int(len(test_set) / BATCH_SIZE)):
        with torch.no_grad():
            elmo_src = elmo_encode(src)
            pred = model.greedy_decode_batch(src.cuda(), src_len.cuda(), elmo_src.cuda())
            preds.append(pred.cpu().numpy())
            del pred

    preds = np.concatenate(preds, axis=0)
    gold = np.asarray(y_test)
    binary_gold = gold
    binary_preds = preds

    logger("NOTE, this is on the test set")
    metric = get_metrics(binary_gold, binary_preds)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    metric = get_multi_metrics(binary_gold, binary_preds)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # show_classification_report(binary_gold, binary_preds)
    logger('Jaccard:', jaccard_score(gold, preds))
    return binary_gold, binary_preds
def eval(model, best_model, loss_criterion, es, dev_loader, dev_set):
    # Evaluate
    exit_training = False
    model.eval()
    # NOTE: test_loss_sum is never updated below (decoding only, no loss computed),
    # so the 'loss' criterion is not meaningful for this script.
    test_loss_sum = 0
    preds = []
    gold = []
    logger("Evaluating:")
    for i, (src, src_len, trg) in tqdm(enumerate(dev_loader),
                                       total=int(len(dev_set) / BATCH_SIZE),
                                       disable=True):
        with torch.no_grad():
            elmo_src = elmo_encode(src)
            pred = model.greedy_decode_batch(src.cuda(), src_len.cuda(), elmo_src.cuda())
            gold.append(trg.data.numpy())
            preds.append(pred.cpu().numpy())
            del pred

    preds = np.concatenate(preds, axis=0)
    gold = np.concatenate(gold, axis=0)
    # binary_gold = conver_to_binary(gold)
    # binary_preds = conver_to_binary(preds)

    metric = get_metrics(gold, preds)
    jaccard = jaccard_score(gold, preds)
    logger("Evaluation results:")
    # show_classification_report(binary_gold, binary_preds)
    logger("Evaluation Loss", test_loss_sum / len(dev_set))
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4],
           'micro P', metric[5], 'micro R', metric[6])
    metric_2 = get_multi_metrics(gold, preds)
    logger('Multi only: h_loss:', metric_2[0], 'macro F', metric_2[1], 'micro F', metric_2[4])
    logger('Jaccard:', jaccard)

    if args.criterion == 'loss':
        criterion = test_loss_sum
    elif args.criterion == 'macro':
        criterion = 1 - metric[1]
    elif args.criterion == 'micro':
        criterion = 1 - metric[4]
    elif args.criterion == 'h_loss':
        criterion = metric[0]
    elif args.criterion == 'jaccard':
        criterion = 1 - jaccard
    else:
        raise ValueError

    if es.step(criterion):  # overfitting
        del model
        logger('overfitting, loading best model ...')
        model = best_model
        exit_training = True
    else:
        if es.is_best():
            if best_model is not None:
                del best_model
            logger('saving best model ...')
            best_model = deepcopy(model)
        else:
            logger(f'patience {es.cur_patience} not best model , ignoring ...')

    if best_model is None:
        best_model = deepcopy(model)

    return model, best_model, exit_training