def test(
    start_epoch=0,
    additional_epoch=90,
    lr=0.0001,
    optim="adam",
    leaky_relu=False,
    ndcg_gain_in_train="exp2",
    sigma=1.0,
    double_precision=False,
    standardize=False,
    small_dataset=False,
    debug=False,
    output_dir="/tmp/ranking_output/",
):
    """Load a trained LambdaRank checkpoint and score the test fold.

    Builds the same [136, 64, 16] LambdaRank network used by ``train``,
    restores its weights from the checkpoint directory, and runs inference
    query-by-query over the test loader, computing the per-document rank
    order implied by the predicted scores.

    :param start_epoch: int, kept for signature parity with ``train`` (unused here)
    :param additional_epoch: int, kept for signature parity (unused here)
    :param lr: float, kept for signature parity (unused here)
    :param optim: str, kept for signature parity (unused here)
    :param leaky_relu: boolean, activation choice passed to LambdaRank
    :param ndcg_gain_in_train: str, kept for signature parity (unused here)
    :param sigma: float, score-scale hyperparameter passed to LambdaRank
    :param double_precision: boolean, use float64 tensors when True
    :param standardize: boolean, NOT supported in test mode (see below)
    :param small_dataset: boolean, kept for signature parity (loader is
        currently hard-coded to the small dataset)
    :param debug: boolean, kept for signature parity (unused here)
    :param output_dir: str, TensorBoard log directory
    """
    print("start_epoch:{}, additional_epoch:{}, lr:{}".format(
        start_epoch, additional_epoch, lr))
    # SummaryWriter is created for its side effect of ensuring output_dir
    # exists; nothing is logged in the visible evaluation path.
    writer = SummaryWriter(output_dir)

    precision = torch.float64 if double_precision else torch.float32

    # get validation and test data:
    data_fold = 'Fold1'
    # NOTE(review): loader is hard-coded to small_dataset=True here,
    # ignoring the small_dataset argument — kept to preserve behavior.
    valid_loader, df_valid, test_loader, df_test = load_train_vali_data(
        data_fold, small_dataset=True)
    print(test_loader.num_features)

    if standardize:
        # FIX: the original referenced ``train_loader``, which does not
        # exist in this function and raised NameError. Standardization at
        # test time needs the scaler fitted on the training data; fail
        # loudly until that scaler is plumbed through.
        raise NotImplementedError(
            "standardize=True requires the training-data scaler, "
            "which is not available in test()")

    lambdarank_structure = [136, 64, 16]
    net = LambdaRank(lambdarank_structure, leaky_relu=leaky_relu,
                     double_precision=double_precision, sigma=sigma)
    device = get_device()
    net.to(device)
    # FIX: the original path used a backslash ("ckptdir\lambdarank-...")
    # which is a single literal filename on POSIX, not a path inside
    # ckptdir/. Use a forward slash, which works on all platforms.
    net.load_state_dict(torch.load("ckptdir/lambdarank-136-64-16-scale-1.0"))
    print(net)
    ckptfile = get_ckptdir('lambdarank', lambdarank_structure, sigma)

    net.eval()
    with torch.no_grad():
        y_pred_batch = []  # accumulated per-query predictions
        for X, Y in test_loader.generate_batch_per_query():
            X_tensor = torch.tensor(X, dtype=precision, device=device)
            y_pred = net(X_tensor)
            y_pred_batch.append(y_pred)
            # compute the rank order of each document from its predicted
            # score. FIX: move the tensor to host NumPy (and flatten the
            # (n, 1) output) before handing it to pandas — the original
            # passed a possibly-CUDA torch tensor straight to DataFrame.
            scores = y_pred.detach().cpu().numpy().ravel()
            rank_df = pd.DataFrame({"Y": scores, "doc": np.arange(Y.shape[0])})
            rank_df = rank_df.sort_values("Y").reset_index(drop=True)
            rank_order = rank_df.sort_values("doc").index.values + 1
def train(
    start_epoch=0,
    additional_epoch=100,
    lr=0.0001,
    optim="adam",
    leaky_relu=False,
    ndcg_gain_in_train="exp2",
    sigma=1.0,
    double_precision=False,
    standardize=False,
    small_dataset=False,
    debug=False,
    output_dir="/tmp/ranking_output/",
):
    """Train a LambdaRank model with per-query lambda gradients.

    For each query, pairwise lambdas are computed from the label
    differences weighted by |delta-NDCG|, accumulated across ``batch_size``
    queries, and applied via ``y_pred.backward(lambda)`` before each
    optimizer step (gradient accumulation).

    :param start_epoch: int, epoch to resume from
    :param additional_epoch: int, number of epochs to run
    :param lr: float, learning rate
    :param optim: str, "adam" or "sgd"
    :param leaky_relu: boolean, activation choice for LambdaRank
    :param ndcg_gain_in_train: str, "exp2" or "identity" gain for NDCG
    :param sigma: float, score-scale hyperparameter
    :param double_precision: boolean, use float64 tensors when True
    :param standardize: boolean, scale features with a scaler fitted on train
    :param small_dataset: boolean, use the reduced dataset
    :param debug: boolean, extra diagnostics (param dumps, train-set eval)
    :param output_dir: str, TensorBoard log directory
    :raises ValueError: on unknown ``optim`` or ``ndcg_gain_in_train``
    """
    print("start_epoch:{}, additional_epoch:{}, lr:{}".format(
        start_epoch, additional_epoch, lr))
    writer = SummaryWriter(output_dir)

    precision = torch.float64 if double_precision else torch.float32

    # get training and validation data:
    data_fold = 'Fold1'
    train_loader, df_train, valid_loader, df_valid = load_train_vali_data(
        data_fold, small_dataset)
    if standardize:
        df_train, scaler = train_loader.train_scaler_and_transform()
        df_valid = valid_loader.apply_scaler(scaler)

    lambdarank_structure = [136, 64, 16]
    net = LambdaRank(lambdarank_structure, leaky_relu=leaky_relu,
                     double_precision=double_precision, sigma=sigma)
    device = get_device('LambdaRank')
    net.to(device)
    net.apply(init_weights)
    print(net)
    ckptfile = get_ckptdir('lambdarank', lambdarank_structure, sigma)

    if optim == "adam":
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    elif optim == "sgd":
        optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9)
    else:
        raise ValueError(
            "Optimization method {} not implemented".format(optim))
    print(optimizer)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=10,
                                                gamma=0.75)

    # 2**9 caps the ideal-DCG truncation; presumably matches the data's
    # maximum documents per query — TODO confirm against NDCG's contract.
    ideal_dcg = NDCG(2**9, ndcg_gain_in_train)

    for i in range(start_epoch, start_epoch + additional_epoch):
        net.train()
        net.zero_grad()

        count = 0
        batch_size = 200  # queries accumulated per optimizer step
        grad_batch, y_pred_batch = [], []
        for X, Y in train_loader.generate_batch_per_query(shuffle=True):
            if np.sum(Y) == 0:
                # negative session, cannot learn useful signal
                continue
            N = 1.0 / ideal_dcg.maxDCG(Y)

            X_tensor = torch.tensor(X, dtype=precision, device=device)
            y_pred = net(X_tensor)
            y_pred_batch.append(y_pred)

            # compute the rank order of each document by its LABEL
            # (1 = lowest label): sort by Y, then restore document order
            # and read off the resulting positions.
            rank_df = pd.DataFrame({"Y": Y, "doc": np.arange(Y.shape[0])})
            rank_df = rank_df.sort_values("Y").reset_index(drop=True)
            rank_order = rank_df.sort_values("doc").index.values + 1

            with torch.no_grad():
                pos_pairs_score_diff = 1.0 + torch.exp(sigma *
                                                       (y_pred - y_pred.t()))

                Y_tensor = torch.tensor(Y, dtype=precision,
                                        device=device).view(-1, 1)
                rel_diff = Y_tensor - Y_tensor.t()
                pos_pairs = (rel_diff > 0).type(precision)
                neg_pairs = (rel_diff < 0).type(precision)
                Sij = pos_pairs - neg_pairs
                if ndcg_gain_in_train == "exp2":
                    gain_diff = torch.pow(2.0, Y_tensor) - torch.pow(
                        2.0, Y_tensor.t())
                elif ndcg_gain_in_train == "identity":
                    gain_diff = Y_tensor - Y_tensor.t()
                else:
                    raise ValueError(
                        "ndcg_gain method not supported yet {}".format(
                            ndcg_gain_in_train))

                rank_order_tensor = torch.tensor(
                    rank_order, dtype=precision, device=device).view(-1, 1)
                decay_diff = 1.0 / torch.log2(rank_order_tensor +
                                              1.0) - 1.0 / torch.log2(
                                                  rank_order_tensor.t() + 1.0)

                delta_ndcg = torch.abs(N * gain_diff * decay_diff)
                lambda_update = sigma * (0.5 * (1 - Sij) -
                                         1 / pos_pairs_score_diff) * delta_ndcg
                lambda_update = torch.sum(lambda_update, 1, keepdim=True)

                assert lambda_update.shape == y_pred.shape
                check_grad = torch.sum(lambda_update, (0, 1)).item()
                # FIX: the original checked only +inf and NaN; -inf
                # slipped through. np.isfinite covers all three.
                if not np.isfinite(check_grad):
                    # NOTE(review): interactive debugger hook kept from the
                    # original; will hang non-interactive runs if a
                    # non-finite gradient ever occurs.
                    import ipdb
                    ipdb.set_trace()
                grad_batch.append(lambda_update)

            # optimization is to similar to RankNetListWise, but to
            # maximize NDCG: lambda_update scales with gain and decay
            count += 1
            if count % batch_size == 0:
                for grad, pred in zip(grad_batch, y_pred_batch):
                    pred.backward(grad / batch_size)
                if count % (4 * batch_size) == 0 and debug:
                    net.dump_param()
                optimizer.step()
                net.zero_grad()
                # grad_batch, y_pred_batch used for gradient_acc
                grad_batch, y_pred_batch = [], []

        # FIX: flush the final partial accumulation batch — the original
        # silently discarded gradients for the last (count % batch_size)
        # queries of every epoch.
        if grad_batch:
            for grad, pred in zip(grad_batch, y_pred_batch):
                pred.backward(grad / batch_size)
            optimizer.step()
            net.zero_grad()

        print(
            get_time(),
            "training dataset at epoch {}, total queries: {}".format(
                i, count))
        if debug:
            eval_cross_entropy_loss(net, device, train_loader, i, writer,
                                    phase="Train")

        if i % 5 == 0 and i != start_epoch:
            print(get_time(), "eval for epoch: {}".format(i))
            eval_cross_entropy_loss(net, device, valid_loader, i, writer)
            eval_ndcg_at_k(net, device, df_valid, valid_loader, 100000,
                           [10, 30], i, writer)
        if i % 10 == 0 and i != start_epoch:
            save_to_ckpt(ckptfile, i, net, optimizer, scheduler)
        scheduler.step()

    # save the last ckpt
    save_to_ckpt(ckptfile, start_epoch + additional_epoch, net, optimizer,
                 scheduler)

    # save the final model
    torch.save(net.state_dict(), ckptfile)
    ndcg_result = eval_ndcg_at_k(net, device, df_valid, valid_loader, 100000,
                                 [10, 30], start_epoch + additional_epoch,
                                 writer)
    print(
        get_time(), "finish training " + ", ".join(
            ["NDCG@{}: {:.5f}".format(k, ndcg_result[k])
             for k in ndcg_result]), '\n\n')
def train_rank_net(
    start_epoch=0,
    additional_epoch=100,
    lr=0.0001,
    optim="adam",
    train_algo=SUM_SESSION,
    double_precision=False,
    standardize=False,
    small_dataset=False,
    debug=False,
    output_dir="/tmp/ranking_output/",
):
    """Train a RankNet-style model with the chosen pairwise algorithm.

    :param start_epoch: int, epoch to resume from
    :param additional_epoch: int, number of epochs to run
    :param lr: float, learning rate
    :param optim: str, "adam" or "sgd"
    :param train_algo: str, one of BASELINE, SUM_SESSION, ACC_GRADIENT
    :param double_precision: boolean, use float64 tensors when True
    :param standardize: boolean, scale features with a scaler fitted on train
    :param small_dataset: boolean, use the reduced dataset
    :param debug: boolean, passed through to the training loops
    :param output_dir: str, TensorBoard log directory
    :raises ValueError: on unknown ``optim`` or ``train_algo``
    """
    print("start_epoch:{}, additional_epoch:{}, lr:{}".format(
        start_epoch, additional_epoch, lr))
    writer = SummaryWriter(output_dir)

    precision = torch.float64 if double_precision else torch.float32

    # get training and validation data:
    data_fold = 'Fold1'
    train_loader, df_train, valid_loader, df_valid = load_train_vali_data(
        data_fold, small_dataset)
    if standardize:
        df_train, scaler = train_loader.train_scaler_and_transform()
        df_valid = valid_loader.apply_scaler(scaler)

    # a separate inference net mirrors the training net's weights for eval
    net, net_inference, ckptfile = get_train_inference_net(
        train_algo, train_loader.num_features, start_epoch, double_precision)
    device = get_device()
    net.to(device)
    net_inference.to(device)

    # initialize to make training faster
    net.apply(init_weights)

    if optim == "adam":
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    elif optim == "sgd":
        optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9)
    else:
        raise ValueError(
            "Optimization method {} not implemented".format(optim))
    print(optimizer)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=10,
                                                gamma=0.75)

    loss_func = None
    if train_algo == BASELINE:
        loss_func = torch.nn.BCELoss()
        loss_func.to(device)

    losses = []

    for i in range(start_epoch, start_epoch + additional_epoch):
        net.zero_grad()
        net.train()

        if train_algo == BASELINE:
            epoch_loss = baseline_pairwise_training_loop(i,
                                                         net,
                                                         loss_func,
                                                         optimizer,
                                                         train_loader,
                                                         precision=precision,
                                                         device=device,
                                                         debug=debug)
        elif train_algo in [SUM_SESSION, ACC_GRADIENT]:
            epoch_loss = factorized_training_loop(i,
                                                  net,
                                                  None,
                                                  optimizer,
                                                  train_loader,
                                                  training_algo=train_algo,
                                                  precision=precision,
                                                  device=device,
                                                  debug=debug)
        else:
            # FIX: the original fell through here and crashed later with
            # an opaque NameError on epoch_loss.
            raise ValueError(
                "Training algorithm {} not implemented".format(train_algo))

        # FIX: the original called scheduler.step() at the TOP of the
        # epoch loop, i.e. before any optimizer.step() — which skips the
        # initial learning rate and triggers the PyTorch >=1.1 ordering
        # warning. Step the LR schedule after the epoch's updates.
        scheduler.step()

        losses.append(epoch_loss)
        print('=' * 20 + '\n', get_time(),
              'Epoch{}, loss : {}'.format(i, losses[-1]), '\n' + '=' * 20)

        # save to checkpoint every 5 step, and run eval
        if i % 5 == 0 and i != start_epoch:
            save_to_ckpt(ckptfile, i, net, optimizer, scheduler)
            net_inference.load_state_dict(net.state_dict())
            eval_model(net_inference, device, df_valid, valid_loader, i,
                       writer)

    # save the last ckpt
    save_to_ckpt(ckptfile, start_epoch + additional_epoch, net, optimizer,
                 scheduler)

    # final evaluation
    net_inference.load_state_dict(net.state_dict())
    ndcg_result = eval_model(net_inference, device, df_valid, valid_loader,
                             start_epoch + additional_epoch, writer)

    # save the final model
    torch.save(net.state_dict(), ckptfile)
    print(
        get_time(), "finish training " + ", ".join(
            ["NDCG@{}: {:.5f}".format(k, ndcg_result[k])
             for k in ndcg_result]), '\n\n')
sep=' ', names=[ 'qid', 'did1', 'did2', 'did3', 'did4', 'did5', 'did6', 'did7', 'did8', 'did9', 'did10' ]) leaky_relu = False ndcg_gain_in_train = "exp2" sigma = 1.0 #writer = SummaryWriter(output_dir) double_precision = False precision = torch.float64 if double_precision else torch.float32 # get training and validation data: data_fold = 'Fold1' valid_loader, df_valid, test_loader, df_test = load_train_vali_data( 'Fold1', small_dataset=True) print(test_loader.num_features) qid_list = df_test.loc[:, 'qid'].values breakpoint lambdarank_structure = [136, 64, 16] net = LambdaRank(lambdarank_structure, leaky_relu=leaky_relu, double_precision=double_precision, sigma=sigma) device = get_device() net.to(device) net.load_state_dict( torch.load(