def stochastic_neural_sort(s, n_samples, tau, mask, beta=1.0, log_scores=True, eps=1e-10):
    """
    Stochastic neural sort. Please note that memory complexity grows by factor n_samples.
    Code taken from "Stochastic Optimization of Sorting Networks via Continuous Relaxations", ICLR 2019.
    Minor modifications applied to the original code (masking).
    :param s: values to sort, shape [batch_size, slate_length, 1]
    :param n_samples: number of samples (approximations) for each permutation matrix
    :param tau: temperature for the final softmax function
    :param mask: mask indicating padded elements
    :param beta: scale parameter for the Gumbel distribution
    :param log_scores: whether to apply the logarithm function to scores prior to Gumbel perturbation
    :param eps: epsilon for the logarithm function
    :return: approximate permutation matrices of shape [n_samples, batch_size, slate_length, slate_length]
    """
    dev = get_torch_device()

    batch_size = s.size()[0]
    n = s.size()[1]
    s_positive = s + torch.abs(s.min())
    samples = beta * sample_gumbel([n_samples, batch_size, n, 1], device=dev)
    if log_scores:
        s_positive = torch.log(s_positive + eps)

    s_perturb = (s_positive + samples).view(n_samples * batch_size, n, 1)
    mask_repeated = mask.repeat_interleave(n_samples, dim=0)

    P_hat = deterministic_neural_sort(s_perturb, tau, mask_repeated)
    P_hat = P_hat.view(n_samples, batch_size, n, n)
    return P_hat
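# Hedged usage sketch (illustration only, not part of the library): a toy call to
# stochastic_neural_sort() above. Assumes torch and this module's helpers (get_torch_device,
# sample_gumbel, deterministic_neural_sort) are in scope and that get_torch_device() resolves
# to the same device the tensors below live on.
def _example_stochastic_neural_sort():
    scores = torch.tensor([[0.3, 0.9, 0.1]]).unsqueeze(-1)  # [batch=1, slate=3, 1]
    mask = torch.zeros(1, 3, dtype=torch.bool)              # no padded positions
    p_hat = stochastic_neural_sort(scores, n_samples=8, tau=1.0, mask=mask)
    return p_hat  # [8, 1, 3, 3] sampled relaxed permutation matrices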
def mrr(y_pred, y_true, ats=None):
    """
    Mean Reciprocal Rank at k.
    Compute MRR at ranks given by ats or at the maximum rank if ats is None.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param ats: optional list of ranks for MRR evaluation, if None, maximum rank is used
    :return: MRR values for each slate and evaluation position, shape [batch_size, len(ats)]
    """
    y_true = y_true.clone()
    y_pred = y_pred.clone()

    if ats is None:
        ats = [y_true.shape[1]]

    true_sorted_by_preds = __apply_mask_and_get_true_sorted_by_preds(y_pred, y_true)

    values, indices = torch.max(true_sorted_by_preds, dim=1)
    indices = indices.type_as(values).unsqueeze(dim=0).t().expand(len(y_true), len(ats))

    dev = get_torch_device()
    ats_rep = torch.tensor(data=ats, device=dev, dtype=torch.float32).expand(len(y_true), len(ats))

    within_at_mask = (indices < ats_rep).type(torch.float32)

    result = torch.tensor(1.0) / (indices + torch.tensor(1.0))

    zero_sum_mask = torch.sum(values) == 0.0
    result[zero_sum_mask] = 0.0

    result = result * within_at_mask

    return result
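# Hedged usage sketch (illustration only): a toy call to mrr() above with made-up scores;
# assumes the module helpers used by mrr() are in scope and the resolved device is the CPU.
def _example_mrr():
    y_pred = torch.tensor([[0.9, 0.3, 0.5]])   # model scores for one slate of three documents
    y_true = torch.tensor([[0.0, 0.0, 1.0]])   # the only relevant item is ranked second by the scores
    return mrr(y_pred, y_true, ats=[1, 3])     # expected [[0.0, 0.5]]: miss at rank 1, hit at rank 2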
def dcg(y_pred, y_true, ats=None, gain_function=lambda x: torch.pow(2, x) - 1):
    # y_true and y_pred have dimensions [listing, document_score]
    # returns a tensor with dcg values at positions specified by 'ats' with dimensions [listing, dcg_at]
    y_true = y_true.clone()
    y_pred = y_pred.clone()

    actual_length = y_true.shape[1]

    if ats is None:
        ats = [actual_length]
    ats = [min(at, actual_length) for at in ats]

    true_sorted_by_preds = __apply_mask_and_get_true_sorted_by_preds(y_pred, y_true)

    dev = get_torch_device()
    discounts = (torch.tensor(1) / torch.log2(
        torch.arange(true_sorted_by_preds.shape[1], dtype=torch.float) + 2.0)).to(device=dev)

    gains = gain_function(true_sorted_by_preds)
    discounted_gains = (gains * discounts)[:, :np.max(ats)]

    cum_dcg = torch.cumsum(discounted_gains, dim=1)
    ats_tensor = torch.tensor(ats, dtype=torch.long) - torch.tensor(1)
    dcg = cum_dcg[:, ats_tensor]
    return dcg
def deterministic_neural_sort(s, tau, mask):
    """
    Deterministic neural sort.
    Code taken from "Stochastic Optimization of Sorting Networks via Continuous Relaxations", ICLR 2019.
    Minor modifications applied to the original code (masking).
    :param s: values to sort, shape [batch_size, slate_length, 1]
    :param tau: temperature for the final softmax function
    :param mask: mask indicating padded elements
    :return: approximate permutation matrices of shape [batch_size, slate_length, slate_length]
    """
    dev = get_torch_device()

    n = s.size()[1]
    one = torch.ones((n, 1), dtype=torch.float32, device=dev)
    s = s.masked_fill(mask[:, :, None], -1e8)
    A_s = torch.abs(s - s.permute(0, 2, 1))
    A_s = A_s.masked_fill(mask[:, :, None] | mask[:, None, :], 0.0)

    B = torch.matmul(A_s, torch.matmul(one, torch.transpose(one, 0, 1)))

    temp = [n - m + 1 - 2 * (torch.arange(n - m, device=dev) + 1) for m in mask.squeeze(-1).sum(dim=1)]
    temp = [t.type(torch.float32) for t in temp]
    temp = [torch.cat((t, torch.zeros(n - len(t), device=dev))) for t in temp]
    scaling = torch.stack(temp).type(torch.float32).to(dev)  # type: ignore

    s = s.masked_fill(mask[:, :, None], 0.0)
    C = torch.matmul(s, scaling.unsqueeze(-2))

    P_max = (C - B).permute(0, 2, 1)
    P_max = P_max.masked_fill(mask[:, :, None] | mask[:, None, :], -np.inf)
    P_max = P_max.masked_fill(mask[:, :, None] & mask[:, None, :], 1.0)

    sm = torch.nn.Softmax(-1)
    P_hat = sm(P_max / tau)
    return P_hat
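# Hedged usage sketch (illustration only): a toy call to deterministic_neural_sort() above;
# assumes torch, numpy as np and get_torch_device() from this module are in scope and that the
# resolved device matches the tensors below.
def _example_deterministic_neural_sort():
    scores = torch.tensor([[0.3, 0.9, 0.1]]).unsqueeze(-1)  # [batch=1, slate=3, 1]
    mask = torch.zeros(1, 3, dtype=torch.bool)              # no padded positions
    p_hat = deterministic_neural_sort(scores, tau=1.0, mask=mask)
    return p_hat  # [1, 3, 3]; each row approximates a one-hot permutation row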
def mrr(y_pred, y_true, ats=None):
    # y_true and y_pred have dimensions [listing, document_score]
    # returns a tensor with mrr values at positions specified by 'ats' with dimensions [listing, mrr_at]
    y_true = y_true.clone()
    y_pred = y_pred.clone()

    if ats is None:
        ats = [y_true.shape[1]]

    true_sorted_by_preds = __apply_mask_and_get_true_sorted_by_preds(y_pred, y_true)

    values, indices = torch.max(true_sorted_by_preds, dim=1)
    indices = indices.type_as(values).unsqueeze(dim=0).t().expand(len(y_true), len(ats))

    dev = get_torch_device()
    ats_rep = torch.tensor(data=ats, device=dev, dtype=torch.float32).expand(len(y_true), len(ats))

    within_at_mask = (indices < ats_rep).type(torch.float32)

    result = torch.tensor(1.0) / (indices + torch.tensor(1.0))

    zero_sum_mask = torch.sum(values) == 0.0
    result[zero_sum_mask] = 0.0

    result = result * within_at_mask

    return result
def weighted_ordinal_2(y_pred, y_true, n, padded_value_indicator=PADDED_Y_VALUE):
    """
    Ordinal loss with per-document weights equal to the ground truth label + 1, so more relevant
    documents contribute more to the loss (padded documents receive weight 0).
    :param y_pred: predictions from the model, shape [batch_size, slate_length, n]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param n: number of ordinal values, int
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: loss value, a torch.Tensor
    """
    device = get_torch_device()
    y_pred = y_pred.clone()
    weights = y_true + 1
    y_true = with_ordinals(y_true.clone(), n)

    mask = y_true == padded_value_indicator
    valid_mask = y_true != padded_value_indicator

    ls = BCELoss(reduction='none')(y_pred, y_true)
    ls[mask] = 0.0

    document_loss = torch.sum(ls, dim=2)
    sum_valid = torch.sum(valid_mask, dim=2).type(torch.float32) > torch.tensor(0.0, dtype=torch.float32, device=device)

    loss_output = torch.sum(document_loss * weights) / torch.sum(sum_valid)
    return loss_output
def bce(y_pred, y_true, indices=None, padded_value_indicator=PADDED_Y_VALUE):
    """
    Binary Cross-Entropy loss.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: loss value, a torch.Tensor
    """
    device = get_torch_device()
    y_pred = y_pred.clone()
    y_true = y_true.clone()

    mask = y_true == padded_value_indicator
    valid_mask = y_true != padded_value_indicator

    ls = BCELoss(reduction='none')(y_pred, y_true)
    ls[mask] = 0.0

    document_loss = torch.sum(ls, dim=-1)
    sum_valid = torch.sum(valid_mask, dim=-1).type(torch.float32) > torch.tensor(0.0, dtype=torch.float32, device=device)

    loss_output = torch.sum(document_loss) / torch.sum(sum_valid)
    return loss_output
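# Hedged usage sketch (illustration only): a toy call to bce() above; note BCELoss expects y_pred
# to already be probabilities in [0, 1]. Values are made up; assumes the module's helpers
# (get_torch_device, PADDED_Y_VALUE, BCELoss) are in scope and the resolved device is the CPU.
def _example_bce():
    y_pred = torch.tensor([[0.8, 0.1, 0.6]])  # per-document probabilities for one slate
    y_true = torch.tensor([[1.0, 0.0, 1.0]])  # binary relevance labels, no padded entries here
    return bce(y_pred, y_true)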
def with_ordinals(y, n):
    # y dimensions: [batch, listing]
    # output dimensions: [batch, listing, n (number of ordinal values)]
    dev = get_torch_device()
    one_to_n = torch.arange(start=1, end=n + 1, dtype=torch.float, device=dev)
    unsqueezed = y.unsqueeze(2).repeat(1, 1, n)
    mask = unsqueezed == PADDED_Y_VALUE
    ordinals = (unsqueezed >= one_to_n).type(torch.float)
    ordinals[mask] = PADDED_Y_VALUE
    return ordinals
def with_smoothed_ordinals(y, n, padded_value_indicator=PADDED_Y_VALUE):
    # y dimensions: [batch, listing]; output dimensions: [batch, listing, n]
    # Smoothed ordinal targets: for a document with label L, position k (1-based) receives k / L
    # for k <= L and 0.0 for k > L; padded documents keep padded_value_indicator in every position.
    dev = get_torch_device()
    one_to_n = torch.arange(start=1, end=n + 1, dtype=torch.float, device=dev)
    unsqueezed = y.unsqueeze(2).repeat(1, 1, n)
    mask = unsqueezed == padded_value_indicator
    upper_mask = (unsqueezed < one_to_n)
    new_ordinals = (unsqueezed >= one_to_n).type(torch.int) * one_to_n
    new_ordinals = new_ordinals / (torch.max(new_ordinals, -1))[0].unsqueeze(-1)
    new_ordinals[upper_mask] = 0.0
    new_ordinals[mask] = padded_value_indicator
    return new_ordinals
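# Hedged worked example (illustration only): a toy call to with_smoothed_ordinals() above;
# assumes torch and get_torch_device() are in scope and the resolved device is the CPU.
def _example_with_smoothed_ordinals():
    y = torch.tensor([[3.0, 1.0]])                  # labels for one slate of two documents
    # expected output: [[[1/3, 2/3, 1.0], [1.0, 0.0, 0.0]]]
    return with_smoothed_ordinals(y, n=3)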
def with_ordinals(y, n):
    """
    Helper function for ordinal loss, transforming input labels to ordinal values.
    :param y: labels, shape [batch_size, slate_length]
    :param n: number of ordinals
    :return: ordinals, shape [batch_size, slate_length, n]
    """
    dev = get_torch_device()
    one_to_n = torch.arange(start=1, end=n + 1, dtype=torch.float, device=dev)
    unsqueezed = y.unsqueeze(2).repeat(1, 1, n)
    mask = unsqueezed == PADDED_Y_VALUE
    ordinals = (unsqueezed >= one_to_n).type(torch.float)
    ordinals[mask] = PADDED_Y_VALUE
    return ordinals
def with_ordinals(y, n, padded_value_indicator=PADDED_Y_VALUE):
    """
    Helper function for ordinal loss, transforming input labels to ordinal values.
    :param y: labels, shape [batch_size, slate_length]
    :param n: number of ordinals
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: ordinals, shape [batch_size, slate_length, n]
    """
    dev = get_torch_device()
    one_to_n = torch.arange(start=1, end=n + 1, dtype=torch.float, device=dev)
    unsqueezed = y.unsqueeze(2).repeat(1, 1, n)
    mask = unsqueezed == padded_value_indicator
    ordinals = (unsqueezed >= one_to_n).type(torch.float)
    ordinals[mask] = padded_value_indicator
    return ordinals
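# Hedged worked example (illustration only): a toy call to with_ordinals() above; assumes torch
# and get_torch_device() are in scope, the resolved device is the CPU, and -1 marks padding.
def _example_with_ordinals():
    y = torch.tensor([[2.0, 0.0, -1.0]])            # labels; the last document is padding
    # expected output: [[[1, 1, 0], [0, 0, 0], [-1, -1, -1]]]
    return with_ordinals(y, n=3, padded_value_indicator=-1)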
def dcg(y_pred, y_true, ats=None, gain_function=lambda x: torch.pow(2, x) - 1, padding_indicator=PADDED_Y_VALUE):
    """
    Discounted Cumulative Gain at k.
    Compute DCG at ranks given by ats or at the maximum rank if ats is None.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param ats: optional list of ranks for DCG evaluation, if None, maximum rank is used
    :param gain_function: callable, gain function for the ground truth labels, e.g. torch.pow(2, x) - 1
    :param padding_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: DCG values for each slate and evaluation position, shape [batch_size, len(ats)]
    """
    y_true = y_true.clone()
    y_pred = y_pred.clone()

    actual_length = y_true.shape[1]

    if ats is None:
        ats = [actual_length]
    ats = [min(at, actual_length) for at in ats]

    true_sorted_by_preds = __apply_mask_and_get_true_sorted_by_preds(y_pred, y_true, padding_indicator)

    dev = get_torch_device()
    discounts = (torch.tensor(1) / torch.log2(
        torch.arange(true_sorted_by_preds.shape[1], dtype=torch.float) + 2.0)).to(device=dev)

    gains = gain_function(true_sorted_by_preds)
    discounted_gains = (gains * discounts)[:, :np.max(ats)]

    cum_dcg = torch.cumsum(discounted_gains, dim=1)
    ats_tensor = torch.tensor(ats, dtype=torch.long) - torch.tensor(1)
    dcg = cum_dcg[:, ats_tensor]
    return dcg
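# Hedged usage sketch (illustration only): a toy call to dcg() above; values are made up and the
# module helpers used by dcg() are assumed to be in scope, with the resolved device set to the CPU.
def _example_dcg():
    y_pred = torch.tensor([[0.5, 0.2, 0.9]])   # scores rank the label-2 document first
    y_true = torch.tensor([[1.0, 0.0, 2.0]])   # graded relevance labels
    # labels sorted by score are [2, 1, 0]; with 2^x - 1 gains, DCG@1 = 3.0 and DCG@3 ≈ 3.63
    return dcg(y_pred, y_true, ats=[1, 3])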
def bce(y_pred, y_true, padded_value_indicator=PADDED_Y_VALUE):
    # Binary Cross-Entropy summed over non-padded documents and averaged over non-empty slates.
    dev = get_torch_device()
    y_pred = y_pred.clone()
    y_true = y_true.clone()

    mask = y_true == padded_value_indicator
    valid_mask = y_true != padded_value_indicator

    ls = BCELoss(reduction='none')(y_pred, y_true)
    ls[mask] = 0.0

    document_loss = torch.sum(ls, dim=-1)
    sum_valid = torch.sum(valid_mask, dim=-1).type(torch.float32) > torch.tensor(0.0, dtype=torch.float32, device=dev)

    loss_output = torch.sum(document_loss) / torch.sum(sum_valid)
    return loss_output
def __rank_slates(dataloader: DataLoader, model: LTRModel, dstore):
    # Score every slate with the model and gather the per-document fields (original rank, label,
    # ids, sources, targets and scores) in the model's ranking order, grouped by field name.
    model.eval()
    device = get_torch_device()

    out = collections.defaultdict(list)
    with torch.no_grad():
        for xb, yb, indices, qb, hb in wrap_dl(dataloader, dstore, return_all=True):
            x_id = xb[:, :, -3].long().to(device)
            q_src = xb[:, :, -2].long().to(device)
            x_tgt = xb[:, :, -1].long().to(device)
            rank = indices.to(device=device)

            y_true = yb.to(device=device)
            input_indices = torch.ones_like(y_true).type(torch.long)
            mask = (y_true == losses.PADDED_Y_VALUE)
            scores = model.score(xb.to(device), mask, input_indices)
            scores[mask] = float('-inf')

            _, indices = scores.sort(descending=True, dim=-1)
            res_y = torch.gather(y_true, dim=1, index=indices).cpu()
            res_rank = torch.gather(rank, dim=1, index=indices).cpu()
            res_x_id = torch.gather(x_id, dim=1, index=indices).cpu()
            res_q_src = torch.gather(q_src, dim=1, index=indices).cpu()
            res_x_tgt = torch.gather(x_tgt, dim=1, index=indices).cpu()
            res_scores = scores.gather(index=indices, dim=1).cpu()

            out['rank'].append(res_rank)
            out['label'].append(res_y)
            out['qid'].append(qb)
            out['kid'].append(res_x_id)
            out['q_src'].append(res_q_src)
            out['x_tgt'].append(res_x_tgt)
            out['scores'].append(res_scores)

    return out
def __rank_slates(dataloader: DataLoader, model: LTRModel) -> Tuple[torch.Tensor, torch.Tensor]:
    reranked_X = []
    reranked_y = []

    model.eval()
    device = get_torch_device()
    with torch.no_grad():
        for xb, yb, _ in dataloader:
            X = xb.type(torch.float32).to(device=device)
            y_true = yb.to(device=device)

            input_indices = torch.ones_like(y_true).type(torch.long)
            mask = (y_true == losses.PADDED_Y_VALUE)
            scores = model.score(X, mask, input_indices)
            scores[mask] = float('-inf')

            _, indices = scores.sort(descending=True, dim=-1)
            indices_X = torch.unsqueeze(indices, -1).repeat_interleave(X.shape[-1], -1)
            reranked_X.append(torch.gather(X, dim=1, index=indices_X).cpu())
            reranked_y.append(torch.gather(y_true, dim=1, index=indices).cpu())

    combined_X = torch.cat(reranked_X)
    combined_y = torch.cat(reranked_y)
    return combined_X, combined_y
def ordinal(y_pred, y_true, n, padded_value_indicator=PADDED_Y_VALUE):
    # dimensions: [batch, listing]
    dev = get_torch_device()
    y_pred = y_pred.clone()
    y_true = with_ordinals(y_true.clone(), n)

    mask = y_true == padded_value_indicator
    valid_mask = y_true != padded_value_indicator

    ls = BCELoss(reduction='none')(y_pred, y_true)
    ls[mask] = 0.0

    document_loss = torch.sum(ls, dim=2)
    sum_valid = torch.sum(valid_mask, dim=2).type(torch.float32) > torch.tensor(0.0, dtype=torch.float32, device=dev)

    loss_output = torch.sum(document_loss) / torch.sum(sum_valid)
    return loss_output
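# Hedged usage sketch (illustration only): a toy call to ordinal() above; y_pred must contain
# per-ordinal probabilities of shape [batch, slate, n]. Values are made up; the module's helpers
# (with_ordinals, get_torch_device, BCELoss, PADDED_Y_VALUE) are assumed to be in scope on the CPU.
def _example_ordinal():
    y_pred = torch.tensor([[[0.9, 0.4], [0.2, 0.1]]])  # [batch=1, slate=2, n=2]
    y_true = torch.tensor([[2.0, 0.0]])                 # graded labels in {0, 1, 2}
    return ordinal(y_pred, y_true, n=2)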
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.job_dir, args.run_id, args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)
    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info("will save data in {output_dir}".format(output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    datasets = {role: load_libsvm_dataset_role(role, config.data.path, config.data.slate_length) for role in args.roles}

    n_features = [ds.shape[-1] for ds in datasets.values()]
    assert all_equal(n_features), f"Last dimensions of datasets must match but got {n_features}"

    # gpu support
    dev = get_torch_device()
    logger.info("Will use device {}".format(dev.type))

    # instantiate model
    model = make_model(n_features=n_features[0], **asdict(config.model, recurse=False))
    model.load_state_dict(load_state_dict_from_file(args.input_model_path, dev))
    logger.info(f"loaded model weights from {args.input_model_path}")

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    assert config.click_model is not None, "click_model must be defined in config for this run"
    click_model = instantiate_from_recursive_name_args(name_args=config.click_model)

    ranked_slates = rank_slates(datasets, model, config)

    clicked_slates = {role: click_on_slates(slates, click_model, include_empty=False) for role, slates in ranked_slates.items()}

    # save clickthrough datasets
    for role, slates in clicked_slates.items():
        write_to_libsvm_without_masked(os.path.join(paths.output_dir, f"{role}.txt"), *slates)

    # calculate metrics
    metered_slates = {role: metrics_on_clicked_slates(slates) for role, slates in clicked_slates.items()}

    for role, metrics in metered_slates.items():
        metrics_df = pd.DataFrame(metrics)
        logger.info(f"{role} metrics summary:")
        logger.info(metrics_df.mean())
        metrics_df.to_csv(os.path.join(paths.output_dir, f"{role}_metrics.csv"), index=False)
        pd.DataFrame(metrics_df.mean()).T.to_csv(os.path.join(paths.output_dir, f"{role}_metrics_mean.csv"), index=False)

    if urlparse(args.job_dir).scheme == "gs":
        copy_local_to_gs(paths.local_base_output_path, args.job_dir)
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.output, args.run_id, args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)
    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info("will save data in {output_dir}".format(output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    # train_ds, val_ds
    train_ds, val_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
    )

    n_features = train_ds.shape[-1]
    assert n_features == val_ds.shape[-1], "Last dimensions of train_ds and val_ds do not match!"

    # train_dl, val_dl
    train_dl, val_dl = create_data_loaders(
        train_ds, val_ds, num_workers=config.data.num_workers, batch_size=config.data.batch_size)

    # gpu support
    dev = get_torch_device()
    logger.info("Model training will execute on {}".format(dev.type))

    # instantiate model
    model = make_model(**asdict(config.model, recurse=False), n_features=n_features)
    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    # load optimizer, loss and LR scheduler
    optimizer = getattr(optim, config.optimizer.name)(params=model.parameters(), **config.optimizer.args)
    loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)
    if config.lr_scheduler.name:
        scheduler = getattr(optim.lr_scheduler, config.lr_scheduler.name)(optimizer, **config.lr_scheduler.args)
    else:
        scheduler = None

    with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():
        # run training
        result = fit(
            **asdict(config.training),
            model=model,
            loss_func=loss_func,
            optimizer=optimizer,
            scheduler=scheduler,
            train_dl=train_dl,
            valid_dl=val_dl,
            config=config,
            device=dev,
            output_dir=paths.output_dir,
            tensorboard_output_path=paths.tensorboard_output_path)

    dump_experiment_result(args, config, paths.output_dir, result)

    assert_expected_metrics(result, config.expected_metrics)
def neuralNDCG(y_pred, y_true, padded_value_indicator=PADDED_Y_VALUE, temperature=1., powered_relevancies=True,
               k=None, stochastic=False, n_samples=32, beta=0.1, log_scores=True):
    """
    NeuralNDCG loss introduced in "NeuralNDCG: Direct Optimisation of a Ranking Metric via Differentiable
    Relaxation of Sorting" - https://arxiv.org/abs/2102.07831. Based on the NeuralSort algorithm.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :param temperature: temperature for the NeuralSort algorithm
    :param powered_relevancies: whether to apply 2^x - 1 gain function, x otherwise
    :param k: rank at which the loss is truncated
    :param stochastic: whether to calculate the stochastic variant
    :param n_samples: how many stochastic samples are taken, used if stochastic == True
    :param beta: beta parameter for NeuralSort algorithm, used if stochastic == True
    :param log_scores: log_scores parameter for NeuralSort algorithm, used if stochastic == True
    :return: loss value, a torch.Tensor
    """
    dev = get_torch_device()

    if k is None:
        k = y_true.shape[1]

    mask = (y_true == padded_value_indicator)
    # Choose the deterministic/stochastic variant
    if stochastic:
        P_hat = stochastic_neural_sort(y_pred.unsqueeze(-1), n_samples=n_samples, tau=temperature, mask=mask,
                                       beta=beta, log_scores=log_scores)
    else:
        P_hat = deterministic_neural_sort(y_pred.unsqueeze(-1), tau=temperature, mask=mask).unsqueeze(0)

    # Perform sinkhorn scaling to obtain doubly stochastic permutation matrices
    P_hat = sinkhorn_scaling(P_hat.view(P_hat.shape[0] * P_hat.shape[1], P_hat.shape[2], P_hat.shape[3]),
                             mask.repeat_interleave(P_hat.shape[0], dim=0), tol=1e-6, max_iter=50)
    P_hat = P_hat.view(int(P_hat.shape[0] / y_pred.shape[0]), y_pred.shape[0], P_hat.shape[1], P_hat.shape[2])

    # Mask P_hat and apply to true labels, i.e. approximately sort them
    P_hat = P_hat.masked_fill(mask[None, :, :, None] | mask[None, :, None, :], 0.)
    y_true_masked = y_true.masked_fill(mask, 0.).unsqueeze(-1).unsqueeze(0)
    if powered_relevancies:
        y_true_masked = torch.pow(2., y_true_masked) - 1.

    ground_truth = torch.matmul(P_hat, y_true_masked).squeeze(-1)
    discounts = (torch.tensor(1.) / torch.log2(torch.arange(y_true.shape[-1], dtype=torch.float) + 2.)).to(dev)
    discounted_gains = ground_truth * discounts

    if powered_relevancies:
        idcg = dcg(y_true, y_true, ats=[k]).permute(1, 0)
    else:
        idcg = dcg(y_true, y_true, ats=[k], gain_function=lambda x: x).permute(1, 0)

    discounted_gains = discounted_gains[:, :, :k]
    ndcg = discounted_gains.sum(dim=-1) / (idcg + DEFAULT_EPS)
    idcg_mask = idcg == 0.
    ndcg = ndcg.masked_fill(idcg_mask.repeat(ndcg.shape[0], 1), 0.)

    assert (ndcg < 0.).sum() == 0, "every ndcg should be non-negative"
    if idcg_mask.all():
        return torch.tensor(0.)

    mean_ndcg = ndcg.sum() / ((~idcg_mask).sum() * ndcg.shape[0])  # type: ignore
    return -1. * mean_ndcg  # -1 cause we want to maximize NDCG
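# Hedged training-step sketch (illustration only): using neuralNDCG() above as the objective for
# an arbitrary scorer; `scores`, `labels` and `optimizer` are placeholders supplied by the caller,
# not part of this module.
def _example_neuralndcg_step(scores, labels, optimizer):
    # scores: [batch_size, slate_length] model outputs that require grad
    # labels: [batch_size, slate_length] graded relevance, PADDED_Y_VALUE marks padding
    loss = neuralNDCG(scores, labels, temperature=1.0, stochastic=False)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()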
def neuralNDCG_transposed(y_pred, y_true, padded_value_indicator=PADDED_Y_VALUE, temperature=1.,
                          powered_relevancies=True, k=None, stochastic=False, n_samples=32, beta=0.1,
                          log_scores=True, max_iter=50, tol=1e-6):
    """
    NeuralNDCG Transposed loss introduced in "NeuralNDCG: Direct Optimisation of a Ranking Metric via Differentiable
    Relaxation of Sorting" - https://arxiv.org/abs/2102.07831. Based on the NeuralSort algorithm.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :param temperature: temperature for the NeuralSort algorithm
    :param powered_relevancies: whether to apply 2^x - 1 gain function, x otherwise
    :param k: rank at which the loss is truncated
    :param stochastic: whether to calculate the stochastic variant
    :param n_samples: how many stochastic samples are taken, used if stochastic == True
    :param beta: beta parameter for NeuralSort algorithm, used if stochastic == True
    :param log_scores: log_scores parameter for NeuralSort algorithm, used if stochastic == True
    :param max_iter: maximum iteration count for Sinkhorn scaling
    :param tol: tolerance for Sinkhorn scaling
    :return: loss value, a torch.Tensor
    """
    dev = get_torch_device()

    if k is None:
        k = y_true.shape[1]

    mask = (y_true == padded_value_indicator)

    if stochastic:
        P_hat = stochastic_neural_sort(y_pred.unsqueeze(-1), n_samples=n_samples, tau=temperature, mask=mask,
                                       beta=beta, log_scores=log_scores)
    else:
        P_hat = deterministic_neural_sort(y_pred.unsqueeze(-1), tau=temperature, mask=mask).unsqueeze(0)

    # Perform sinkhorn scaling to obtain doubly stochastic permutation matrices
    P_hat_masked = sinkhorn_scaling(P_hat.view(P_hat.shape[0] * y_pred.shape[0], y_pred.shape[1], y_pred.shape[1]),
                                    mask.repeat_interleave(P_hat.shape[0], dim=0), tol=tol, max_iter=max_iter)
    P_hat_masked = P_hat_masked.view(P_hat.shape[0], y_pred.shape[0], y_pred.shape[1], y_pred.shape[1])
    discounts = (torch.tensor(1) / torch.log2(torch.arange(y_true.shape[-1], dtype=torch.float) + 2.)).to(dev)

    # This takes care of the @k metric truncation - if something is @>k, it is useless and gets 0.0 discount
    discounts[k:] = 0.
    discounts = discounts[None, None, :, None]

    # Here the discounts become expected discounts
    discounts = torch.matmul(P_hat_masked.permute(0, 1, 3, 2), discounts).squeeze(-1)
    if powered_relevancies:
        gains = torch.pow(2., y_true) - 1
        discounted_gains = gains.unsqueeze(0) * discounts
        idcg = dcg(y_true, y_true, ats=[k]).squeeze()
    else:
        gains = y_true
        discounted_gains = gains.unsqueeze(0) * discounts
        idcg = dcg(y_true, y_true, ats=[k], gain_function=lambda x: x).squeeze()

    ndcg = discounted_gains.sum(dim=2) / (idcg + DEFAULT_EPS)
    idcg_mask = idcg == 0.
    ndcg = ndcg.masked_fill(idcg_mask, 0.)

    assert (ndcg < 0.).sum() == 0, "every ndcg should be non-negative"
    if idcg_mask.all():
        return torch.tensor(0.)

    mean_ndcg = ndcg.sum() / ((~idcg_mask).sum() * ndcg.shape[0])  # type: ignore
    return -1. * mean_ndcg  # -1 cause we want to maximize NDCG
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.job_dir, args.run_id, args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)
    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info("will save data in {output_dir}".format(output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    train_ds, val_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
    )

    # load dstore and use as feature func
    dstore = Dstore(**config.dstore)

    n_features = train_ds.shape[-1]
    n_features = dstore.get_n_features(n_features, config)

    train_dl, val_dl = create_data_loaders(
        train_ds, val_ds, num_workers=config.data.num_workers, batch_size=config.data.batch_size, dstore=dstore)

    if dstore.prefetch:
        dstore.run_prefetch([train_dl, val_dl])

    # gpu support
    dev = get_torch_device()
    logger.info("Will use device {}".format(dev.type))

    # instantiate model
    model = make_model(n_features=n_features, dstore=dstore, **asdict(config.model, recurse=False))
    model.load_state_dict(load_state_dict_from_file(args.input_model_path, dev))
    logger.info(f"loaded model weights from {args.input_model_path}")

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    datasets = {'vali': val_dl}
    ranked_slates = rank_slates(datasets, model, dstore, config)

    # save output
    for role, out in ranked_slates.items():
        write_out_dir(paths.output_dir, role, out, dstore)

    print('DONE')
def run(args):
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    paths = PathsContainer.from_args(args.job_dir, args.run_id, args.config_file_name)

    create_output_dirs(paths.output_dir)
    logger = init_logger(paths.output_dir)

    logger.info(f"created paths container {paths}")

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))
    print("Shared in main", config.data.shared)

    # train_ds, val_ds, test_ds
    train_ds, val_ds, test_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
        test_ds_role=config.data.test_ds_role,
        sigma=config.data.noise,
        shared=config.data.shared)

    n_features = train_ds.shape[-1]
    assert n_features == val_ds.shape[-1], "Last dimensions of train_ds and val_ds do not match!"

    # train_dl, val_dl, test_dl
    train_dl, val_dl, test_dl = create_data_loaders(
        train_ds, val_ds, test_ds, num_workers=config.data.num_workers, batch_size=config.data.batch_size)

    # gpu support
    dev = get_torch_device()
    logger.info("Model training will execute on {}".format(dev.type))

    # instantiate model
    use_distillation = True if config.distillation_loss else False
    full_pipeline = True if use_distillation and "full" in config.distillation_loss.name else False
    fit_size = config.teacher_model.fc_model['sizes'][-1] if full_pipeline and \
        config.teacher_model.fc_model['sizes'][-1] != config.model.fc_model['sizes'][-1] else None
    print("Fit size", fit_size)
    model = make_model(n_features=n_features, **asdict(config.model, recurse=False), fit_size=fit_size,
                       distillation=full_pipeline, seq_len=config.data.slate_length)

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    # load optimizer, loss and LR scheduler
    if hasattr(optim, config.optimizer.name):
        optimizer = getattr(optim, config.optimizer.name)(params=model.parameters(), **config.optimizer.args)
    # if hasattr(optimizers, config.optimizer.name):
    #     optimizer = getattr(optimizers, config.optimizer.name)(params=model.parameters(), **config.optimizer.args)

    if config.lr_scheduler.name:
        scheduler = getattr(optim.lr_scheduler, config.lr_scheduler.name)(optimizer, **config.lr_scheduler.args)
    else:
        scheduler = None

    loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)

    if args.evaluate:
        test_metrics = compute_metrics(config.metrics, model, test_dl, dev)
        print(test_metrics)
        sys.exit()

    if use_distillation:
        if full_pipeline:
            assert config.teacher_model.transformer.h == config.model.transformer.h
        teacher_model = make_model(n_features=n_features, **asdict(config.teacher_model, recurse=False),
                                   distillation=full_pipeline, fit_size=None)
        if torch.cuda.device_count() > 1:
            teacher_model = CustomDataParallel(teacher_model)
            logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
        teacher_model.to(dev)

        loss_func = partial(getattr(losses, config.distillation_loss.name), gt_loss_func=loss_func,
                            **config.distillation_loss.args)

        with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():  # type: ignore
            result, model = fit_with_distillation(
                student_model=model,
                teacher_model=teacher_model,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dl=train_dl,
                valid_dl=val_dl,
                config=config,
                device=dev,
                output_dir=paths.output_dir,
                tensorboard_output_path=paths.tensorboard_output_path,
                full=full_pipeline,
                **asdict(config.training))
    else:
        with torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr():  # type: ignore
            # run training
            result, model = fit(
                model=model,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dl=train_dl,
                valid_dl=val_dl,
                config=config,
                device=dev,
                output_dir=paths.output_dir,
                tensorboard_output_path=paths.tensorboard_output_path,
                **asdict(config.training))

    # Reload best model
    sd = torch.load(os.path.join(paths.output_dir, "best_model.pkl"))
    model.load_state_dict(sd)

    test_metrics = compute_metrics(config.metrics, model, test_dl, dev)
    result['test_metrics'] = test_metrics
    print(result)
    dump_experiment_result(args, config, paths.output_dir, result)

    if urlparse(args.job_dir).scheme == "gs":
        copy_local_to_gs(paths.local_base_output_path, args.job_dir)

    assert_expected_metrics(result, config.expected_metrics)
    return test_metrics['ndcg_10']