def main(): test_df = pd.read_csv(TEST_PATH) with timer('preprocessing text'): test_df['comment_text'] = test_df['comment_text'].astype(str) test_df = test_df.fillna(0) with timer('load embedding'): tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True) X_text = convert_lines(test_df["comment_text"].fillna("DUMMY_VALUE"), max_len, tokenizer) with timer('train'): model = BertForSequenceClassification(bert_config, num_labels=n_labels) model.load_state_dict(torch.load(model_path)) model = model.to(device) test_dataset = torch.utils.data.TensorDataset( torch.tensor(X_text, dtype=torch.long)) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size * 2, shuffle=False) test_pred = inference(model, test_loader, device, n_labels) del model gc.collect() torch.cuda.empty_cache() submission = pd.DataFrame.from_dict({ 'id': test_df['id'], 'prediction': test_pred.reshape(-1) }) submission.to_csv('submission.csv', index=False) LOGGER.info(submission.head())
def main(): train_df = pd.read_csv(TRAIN_PATH).sample(train_size+valid_size, random_state=seed) y = np.where(train_df['target'] >= 0.5, 1, 0) y_aux = train_df[AUX_COLUMNS].values identity_columns_new = [] for column in identity_columns + ['target']: train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True, False) if column != "target": identity_columns_new.append(column + "_bin") weights = np.ones((len(train_df),)) / 4 weights += (train_df[identity_columns].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(np.int) / 4 weights += (((train_df["target"].values >= 0.5).astype(bool).astype(np.int) + (train_df[identity_columns].fillna(0).values < 0.5).sum(axis=1).astype(bool).astype(np.int)) > 1).astype( bool).astype(np.int) / 4 weights += (((train_df["target"].values < 0.5).astype(bool).astype(np.int) + (train_df[identity_columns].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(np.int)) > 1).astype( bool).astype(np.int) / 4 loss_weight = 1.0 / weights.mean() with timer('preprocessing text'): #df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]] train_df['comment_text'] = train_df['comment_text'].astype(str) train_df = train_df.fillna(0) with timer('load embedding'): tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True) X_text, train_lengths = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"), max_len, tokenizer) test_df = train_df[train_size:] with timer('train'): X_train, y_train, y_aux_train, w_train = X_text[:train_size], y[:train_size], y_aux[:train_size], weights[ :train_size] X_val, y_val, y_aux_val, w_val = X_text[train_size:], y[train_size:], y_aux[train_size:], weights[ train_size:] trn_lengths, val_lengths = train_lengths[:train_size], train_lengths[train_size:] model = BertForSequenceClassification.from_pretrained(WORK_DIR, cache_dir=None, num_labels=n_labels) model.zero_grad() model = model.to(device) y_train = np.concatenate((y_train.reshape(-1, 1), w_train.reshape(-1, 1), y_aux_train), axis=1) y_val = np.concatenate((y_val.reshape(-1, 1), w_val.reshape(-1, 1), y_aux_val), axis=1) train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float)) valid = torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.long), torch.tensor(y_val, dtype=torch.float)) ran_sampler = torch.utils.data.RandomSampler(train_dataset) len_sampler = LenMatchBatchSampler(ran_sampler, batch_size=batch_size, drop_last=False) train_loader = torch.utils.data.DataLoader(train_dataset, batch_sampler=len_sampler) valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size * 2, shuffle=False) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] num_train_optimization_steps = int(epochs * train_size / batch_size / accumulation_steps) total_step = int(epochs * train_size / batch_size) optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=0.05, t_total=num_train_optimization_steps) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) #criterion = torch.nn.BCEWithLogitsLoss().to(device) criterion = CustomLoss(loss_weight).to(device) LOGGER.info(f"Starting 1 epoch...") tr_loss, train_losses = train_one_epoch(model, train_loader, criterion, optimizer, device, accumulation_steps, total_step, n_labels) LOGGER.info(f'Mean train loss: {round(tr_loss,5)}') torch.save(model.state_dict(), '{}_dic'.format(exp)) valid_loss, oof_pred = validate(model, valid_loader, criterion, device, n_labels) del model gc.collect() torch.cuda.empty_cache() test_df["pred"] = oof_pred[:, 0] test_df = convert_dataframe_to_bool(test_df) bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns) LOGGER.info(bias_metrics_df) score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df)) LOGGER.info(f'final score is {score}') test_df.to_csv("oof.csv", index=False) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss'); plt.legend(); plt.xticks(xs); plt.xlabel('Iter') plt.savefig("loss.png")