Example #1
def stochastic_neural_sort(s, n_samples, tau, mask, beta=1.0, log_scores=True, eps=1e-10):
    """
    Stochastic neural sort. Note that memory complexity grows by a factor of n_samples.
    Code taken from "Stochastic Optimization of Sorting Networks via Continuous Relaxations", ICLR 2019.
    Minor modifications applied to the original code (masking).
    :param s: values to sort, shape [batch_size, slate_length]
    :param n_samples: number of samples (approximations) for each permutation matrix
    :param tau: temperature for the final softmax function
    :param mask: mask indicating padded elements
    :param beta: scale parameter for the Gumbel distribution
    :param log_scores: whether to apply the logarithm function to scores prior to Gumbel perturbation
    :param eps: epsilon for the logarithm function
    :return: approximate permutation matrices of shape [n_samples, batch_size, slate_length, slate_length]
    """
    dev = get_torch_device()

    batch_size = s.size()[0]
    n = s.size()[1]
    s_positive = s + torch.abs(s.min())
    samples = beta * sample_gumbel([n_samples, batch_size, n, 1], device=dev)
    if log_scores:
        s_positive = torch.log(s_positive + eps)

    s_perturb = (s_positive + samples).view(n_samples * batch_size, n, 1)
    # tile the mask sample-major so it lines up with the flattened view above
    mask_repeated = mask.repeat(n_samples, 1)

    P_hat = deterministic_neural_sort(s_perturb, tau, mask_repeated)
    P_hat = P_hat.view(n_samples, batch_size, n, n)
    return P_hat
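A minimal usage sketch (hypothetical values and shapes; assumes this module's helpers such as get_torch_device and sample_gumbel are in scope, and that scores carry a trailing singleton dimension, as the neuralNDCG caller below passes them):

import torch

scores = torch.rand(2, 5).unsqueeze(-1)        # [batch_size=2, slate_length=5, 1]
mask = torch.zeros(2, 5, dtype=torch.bool)     # True marks padded slots
mask[1, 3:] = True                             # second slate has two padded slots

# P_hat: [n_samples=8, batch_size=2, slate_length=5, slate_length=5]
P_hat = stochastic_neural_sort(scores, n_samples=8, tau=1.0, mask=mask)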
Example #2
def mrr(y_pred, y_true, ats=None):
    """
    Mean Reciprocal Rank at k.

    Compute MRR at ranks given by ats or at the maximum rank if ats is None.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param ats: optional list of ranks for MRR evaluation, if None, maximum rank is used
    :return: MRR values for each slate and evaluation position, shape [batch_size, len(ats)]
    """
    y_true = y_true.clone()
    y_pred = y_pred.clone()

    if ats is None:
        ats = [y_true.shape[1]]

    true_sorted_by_preds = __apply_mask_and_get_true_sorted_by_preds(y_pred, y_true)

    values, indices = torch.max(true_sorted_by_preds, dim=1)
    indices = indices.type_as(values).unsqueeze(dim=0).t().expand(len(y_true), len(ats))

    dev = get_torch_device()

    ats_rep = torch.tensor(data=ats, device=dev, dtype=torch.float32).expand(len(y_true), len(ats))

    within_at_mask = (indices < ats_rep).type(torch.float32)

    result = torch.tensor(1.0) / (indices + torch.tensor(1.0))

    # zero out slates that contain no relevant documents (max true label == 0)
    zero_sum_mask = values == 0.0
    result[zero_sum_mask] = 0.0

    result = result * within_at_mask

    return result
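To make the tensor gymnastics concrete, here is a hand-rolled reciprocal rank for a single slate using only plain torch (illustrative values, not part of the library):

import torch

y_pred = torch.tensor([[0.2, 0.9, 0.5]])   # model scores for one slate
y_true = torch.tensor([[0.0, 0.0, 1.0]])   # only the third item is relevant

order = y_pred.argsort(dim=1, descending=True)   # tensor([[1, 2, 0]])
true_sorted = y_true.gather(1, order)            # tensor([[0., 1., 0.]])

first_hit = true_sorted.argmax(dim=1)            # rank 1 (0-based)
rr = 1.0 / (first_hit.float() + 1.0)             # tensor([0.5000])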
Example #3
def dcg(y_pred, y_true, ats=None, gain_function=lambda x: torch.pow(2, x) - 1):
    # y_true and y_pred have dimensions [listing, document_score]
    # returns a tensor with dcg values at positions specified by 'ats' with dimensions [listing, dcg_at]
    y_true = y_true.clone()
    y_pred = y_pred.clone()

    actual_length = y_true.shape[1]

    if ats is None:
        ats = [actual_length]
    ats = [min(at, actual_length) for at in ats]

    true_sorted_by_preds = __apply_mask_and_get_true_sorted_by_preds(
        y_pred, y_true)

    dev = get_torch_device()

    discounts = (torch.tensor(1) / torch.log2(
        torch.arange(true_sorted_by_preds.shape[1], dtype=torch.float) + 2.0)
                 ).to(device=dev)

    gains = gain_function(true_sorted_by_preds)

    discounted_gains = (gains * discounts)[:, :np.max(ats)]

    cum_dcg = torch.cumsum(discounted_gains, dim=1)

    ats_tensor = torch.tensor(ats, dtype=torch.long) - torch.tensor(1)

    dcg = cum_dcg[:, ats_tensor]

    return dcg
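A self-contained check of the gain/discount arithmetic (illustrative values only):

import torch

# labels already sorted by predicted score, one slate of length 3
true_sorted = torch.tensor([[3.0, 0.0, 2.0]])

gains = torch.pow(2.0, true_sorted) - 1.0              # [7., 0., 3.]
discounts = 1.0 / torch.log2(torch.arange(3.0) + 2.0)  # [1., 0.6309, 0.5]

dcg_at_3 = (gains * discounts).sum(dim=1)              # 7*1 + 0 + 3*0.5 = 8.5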
Example #4
def deterministic_neural_sort(s, tau, mask):
    """
    Deterministic neural sort.
    Code taken from "Stochastic Optimization of Sorting Networks via Continuous Relaxations", ICLR 2019.
    Minor modifications applied to the original code (masking).
    :param s: values to sort, shape [batch_size, slate_length]
    :param tau: temperature for the final softmax function
    :param mask: mask indicating padded elements
    :return: approximate permutation matrices of shape [batch_size, slate_length, slate_length]
    """
    dev = get_torch_device()

    n = s.size()[1]
    one = torch.ones((n, 1), dtype=torch.float32, device=dev)
    s = s.masked_fill(mask[:, :, None], -1e8)
    A_s = torch.abs(s - s.permute(0, 2, 1))
    A_s = A_s.masked_fill(mask[:, :, None] | mask[:, None, :], 0.0)

    B = torch.matmul(A_s, torch.matmul(one, torch.transpose(one, 0, 1)))

    # per-slate scaling coefficients (n_valid + 1 - 2i for i = 1..n_valid), zero-padded below
    temp = [n - m + 1 - 2 * (torch.arange(n - m, device=dev) + 1) for m in mask.squeeze(-1).sum(dim=1)]
    temp = [t.type(torch.float32) for t in temp]
    temp = [torch.cat((t, torch.zeros(n - len(t), device=dev))) for t in temp]
    scaling = torch.stack(temp).type(torch.float32).to(dev)  # type: ignore

    s = s.masked_fill(mask[:, :, None], 0.0)
    C = torch.matmul(s, scaling.unsqueeze(-2))

    P_max = (C - B).permute(0, 2, 1)
    P_max = P_max.masked_fill(mask[:, :, None] | mask[:, None, :], -np.inf)
    P_max = P_max.masked_fill(mask[:, :, None] & mask[:, None, :], 1.0)
    sm = torch.nn.Softmax(-1)
    P_hat = sm(P_max / tau)
    return P_hat
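A usage sketch (hypothetical values; assumes this module's get_torch_device is in scope). With a small temperature the rows of P_hat concentrate on single positions, so applying P_hat to the scores approximately sorts them in descending order:

import torch

scores = torch.tensor([[0.1, 2.0, 1.0]]).unsqueeze(-1)   # [batch=1, n=3, 1]
mask = torch.zeros(1, 3, dtype=torch.bool)               # nothing is padded

P_hat = deterministic_neural_sort(scores, tau=0.01, mask=mask)
approx_sorted = torch.matmul(P_hat, scores).squeeze(-1)  # ~ [2.0, 1.0, 0.1]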
Example #5
def mrr(y_pred, y_true, ats=None):
    # y_true and y_pred have dimensions [listing, document_score]
    # returns a tensor with mrr values at positions specified by 'ats' with dimensions [listing, mrr_at]
    y_true = y_true.clone()
    y_pred = y_pred.clone()

    if ats is None:
        ats = [y_true.shape[1]]

    true_sorted_by_preds = __apply_mask_and_get_true_sorted_by_preds(
        y_pred, y_true)

    values, indices = torch.max(true_sorted_by_preds, dim=1)
    indices = indices.type_as(values).unsqueeze(dim=0).t().expand(
        len(y_true), len(ats))

    dev = get_torch_device()

    ats_rep = torch.tensor(data=ats, device=dev,
                           dtype=torch.float32).expand(len(y_true), len(ats))

    within_at_mask = (indices < ats_rep).type(torch.float32)

    result = torch.tensor(1.0) / (indices + torch.tensor(1.0))

    # zero out slates that contain no relevant documents (max true label == 0)
    zero_sum_mask = values == 0.0
    result[zero_sum_mask] = 0.0

    result = result * within_at_mask

    return result
Example #6
def weighted_ordinal_2(y_pred,
                       y_true,
                       n,
                       padded_value_indicator=PADDED_Y_VALUE):
    """
    Ordinal loss.
    :param y_pred: predictions from the model, shape [batch_size, slate_length, n]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param n: number of ordinal values, int
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: loss value, a torch.Tensor
    """
    device = get_torch_device()

    y_pred = y_pred.clone()
    # per-document weights; with a padding indicator of -1, padded documents
    # receive weight 0 in the weighted sum below
    weights = y_true + 1

    y_true = with_ordinals(y_true.clone(), n)

    mask = y_true == padded_value_indicator
    valid_mask = y_true != padded_value_indicator

    ls = BCELoss(reduction='none')(y_pred, y_true)
    ls[mask] = 0.0

    document_loss = torch.sum(ls, dim=2)
    sum_valid = torch.sum(valid_mask, dim=2).type(
        torch.float32) > torch.tensor(0.0, dtype=torch.float32, device=device)

    loss_output = torch.sum(document_loss * weights) / torch.sum(sum_valid)

    return loss_output
Example #7
def bce(y_pred, y_true, indices=None, padded_value_indicator=PADDED_Y_VALUE):
    """
    Binary Cross-Entropy loss.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param indices: unused by this implementation
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: loss value, a torch.Tensor
    """
    device = get_torch_device()

    y_pred = y_pred.clone()
    y_true = y_true.clone()

    mask = y_true == padded_value_indicator
    valid_mask = y_true != padded_value_indicator

    ls = BCELoss(reduction='none')(y_pred, y_true)
    ls[mask] = 0.0

    document_loss = torch.sum(ls, dim=-1)
    sum_valid = torch.sum(valid_mask, dim=-1).type(
        torch.float32) > torch.tensor(0.0, dtype=torch.float32, device=device)

    loss_output = torch.sum(document_loss) / torch.sum(sum_valid)

    return loss_output
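The masking pattern is easy to check in isolation; a self-contained sketch with plain torch, assuming a padding indicator of -1 (targets are clamped because recent PyTorch versions require BCELoss targets in [0, 1]):

import torch
from torch.nn import BCELoss

PADDED = -1.0                                       # assumed padding indicator

y_pred = torch.tensor([[0.8, 0.3, 0.5]])
y_true = torch.tensor([[1.0, 0.0, PADDED]])

mask = y_true == PADDED
ls = BCELoss(reduction='none')(y_pred, y_true.clamp(min=0.0))
ls[mask] = 0.0                                      # padded slots contribute nothing

document_loss = ls.sum(dim=-1)                      # one loss per slate
has_valid = (~mask).any(dim=-1).float()             # slates with any real document
loss = document_loss.sum() / has_valid.sum()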
Example #8
def with_ordinals(y, n):
    # y dimensions: [batch, listing]
    # output dimensions: [batch, listing, n (number of ordinal values]
    dev = get_torch_device()
    one_to_n = torch.arange(start=1, end=n + 1, dtype=torch.float, device=dev)
    unsqueezed = y.unsqueeze(2).repeat(1, 1, n)
    mask = unsqueezed == PADDED_Y_VALUE
    ordinals = (unsqueezed >= one_to_n).type(torch.float)
    ordinals[mask] = PADDED_Y_VALUE
    return ordinals
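A quick worked example of the encoding (illustrative values): with n = 3, a label of 2 passes the thresholds 1 and 2 but not 3, so it becomes [1, 1, 0]:

import torch

y = torch.tensor([[2.0, 0.0]])                     # [batch=1, slate=2]
one_to_n = torch.arange(1, 4, dtype=torch.float)   # thresholds 1, 2, 3

ordinals = (y.unsqueeze(2) >= one_to_n).float()
# tensor([[[1., 1., 0.],     label 2 passes thresholds 1 and 2
#          [0., 0., 0.]]])   label 0 passes none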
Example #9
def with_smoothed_ordinals(y, n, padded_value_indicator=PADDED_Y_VALUE):
    """
    Helper function for ordinal loss, transforming input labels to smoothed ordinal values:
    instead of a hard 0/1 encoding, targets ramp up linearly towards the label.
    :param y: labels, shape [batch_size, slate_length]
    :param n: number of ordinals
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: smoothed ordinals, shape [batch_size, slate_length, n]
    """
    dev = get_torch_device()
    one_to_n = torch.arange(start=1, end=n + 1, dtype=torch.float, device=dev)
    unsqueezed = y.unsqueeze(2).repeat(1, 1, n)
    mask = unsqueezed == padded_value_indicator

    upper_mask = unsqueezed < one_to_n
    new_ordinals = (unsqueezed >= one_to_n).type(torch.int) * one_to_n
    # normalise by the largest passed threshold; thresholds above the label are
    # zeroed afterwards, which also removes any NaNs produced by all-zero rows
    new_ordinals = new_ordinals / torch.max(new_ordinals, dim=-1, keepdim=True)[0]
    new_ordinals[upper_mask] = 0.0
    new_ordinals[mask] = padded_value_indicator
    return new_ordinals
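Unlike the hard 0/1 encoding, the smoothed variant ramps targets up linearly towards the label; a self-contained trace (illustrative values):

import torch

y = torch.tensor([[3.0]])                          # one label, n = 4 thresholds
one_to_n = torch.arange(1, 5, dtype=torch.float)

passed = (y.unsqueeze(2) >= one_to_n).float() * one_to_n   # [1., 2., 3., 0.]
smoothed = passed / passed.max(-1, keepdim=True)[0]        # divide by the label
smoothed[y.unsqueeze(2) < one_to_n] = 0.0
# tensor([[[0.3333, 0.6667, 1.0000, 0.0000]]])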
Example #10
def with_ordinals(y, n):
    """
    Helper function for ordinal loss, transforming input labels to ordinal values.
    :param y: labels, shape [batch_size, slate_length]
    :param n: number of ordinals
    :return: ordinals, shape [batch_size, slate_length, n]
    """
    dev = get_torch_device()
    one_to_n = torch.arange(start=1, end=n + 1, dtype=torch.float, device=dev)
    unsqueezed = y.unsqueeze(2).repeat(1, 1, n)
    mask = unsqueezed == PADDED_Y_VALUE
    ordinals = (unsqueezed >= one_to_n).type(torch.float)
    ordinals[mask] = PADDED_Y_VALUE
    return ordinals
Example #11
def with_ordinals(y, n, padded_value_indicator=PADDED_Y_VALUE):
    """
    Helper function for ordinal loss, transforming input labels to ordinal values.
    :param y: labels, shape [batch_size, slate_length]
    :param n: number of ordinals
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: ordinals, shape [batch_size, slate_length, n]
    """
    dev = get_torch_device()
    one_to_n = torch.arange(start=1, end=n + 1, dtype=torch.float, device=dev)
    unsqueezed = y.unsqueeze(2).repeat(1, 1, n)
    mask = unsqueezed == padded_value_indicator
    ordinals = (unsqueezed >= one_to_n).type(torch.float)
    ordinals[mask] = padded_value_indicator
    return ordinals
Example #12
def dcg(y_pred,
        y_true,
        ats=None,
        gain_function=lambda x: torch.pow(2, x) - 1,
        padding_indicator=PADDED_Y_VALUE):
    """
    Discounted Cumulative Gain at k.

    Compute DCG at ranks given by ats or at the maximum rank if ats is None.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param ats: optional list of ranks for DCG evaluation, if None, maximum rank is used
    :param gain_function: callable, gain function for the ground truth labels, e.g. torch.pow(2, x) - 1
    :param padding_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: DCG values for each slate and evaluation position, shape [batch_size, len(ats)]
    """
    y_true = y_true.clone()
    y_pred = y_pred.clone()

    actual_length = y_true.shape[1]

    if ats is None:
        ats = [actual_length]
    ats = [min(at, actual_length) for at in ats]

    true_sorted_by_preds = __apply_mask_and_get_true_sorted_by_preds(
        y_pred, y_true, padding_indicator)

    dev = get_torch_device()

    discounts = (torch.tensor(1) / torch.log2(
        torch.arange(true_sorted_by_preds.shape[1], dtype=torch.float) + 2.0)
                 ).to(device=dev)

    gains = gain_function(true_sorted_by_preds)

    discounted_gains = (gains * discounts)[:, :np.max(ats)]

    cum_dcg = torch.cumsum(discounted_gains, dim=1)

    ats_tensor = torch.tensor(ats, dtype=torch.long) - torch.tensor(1)

    dcg = cum_dcg[:, ats_tensor]

    return dcg
Example #13
def bce(y_pred, y_true, padded_value_indicator=PADDED_Y_VALUE):
    """
    Binary Cross-Entropy loss.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: loss value, a torch.Tensor
    """
    dev = get_torch_device()

    y_pred = y_pred.clone()
    y_true = y_true.clone()

    mask = y_true == padded_value_indicator
    valid_mask = y_true != padded_value_indicator

    ls = BCELoss(reduction='none')(y_pred, y_true)
    ls[mask] = 0.0

    document_loss = torch.sum(ls, dim=-1)
    sum_valid = torch.sum(valid_mask, dim=-1).type(
        torch.float32) > torch.tensor(0.0, dtype=torch.float32, device=dev)

    loss_output = torch.sum(document_loss) / torch.sum(sum_valid)

    return loss_output
Example #14
def __rank_slates(dataloader: DataLoader, model: LTRModel, dstore):
    """
    Score every slate with the model and return its metadata (rank, label,
    qid, etc.) reordered by descending score.
    """
    model.eval()
    device = get_torch_device()

    out = collections.defaultdict(list)

    with torch.no_grad():
        for xb, yb, indices, qb, hb in wrap_dl(dataloader, dstore, return_all=True):
            x_id = xb[:, :, -3].long().to(device)
            q_src = xb[:, :, -2].long().to(device)
            x_tgt = xb[:, :, -1].long().to(device)
            rank = indices.to(device=device)
            y_true = yb.to(device=device)

            input_indices = torch.ones_like(y_true).type(torch.long)
            mask = (y_true == losses.PADDED_Y_VALUE)
            scores = model.score(xb.to(device), mask, input_indices)

            scores[mask] = float('-inf')

            _, indices = scores.sort(descending=True, dim=-1)
            res_y = torch.gather(y_true, dim=1, index=indices).cpu()
            res_rank = torch.gather(rank, dim=1, index=indices).cpu()
            res_x_id = torch.gather(x_id, dim=1, index=indices).cpu()
            res_q_src = torch.gather(q_src, dim=1, index=indices).cpu()
            res_x_tgt = torch.gather(x_tgt, dim=1, index=indices).cpu()
            res_scores = scores.gather(index=indices, dim=1).cpu()

            out['rank'].append(res_rank)
            out['label'].append(res_y)
            out['qid'].append(qb)
            out['kid'].append(res_x_id)
            out['q_src'].append(res_q_src)
            out['x_tgt'].append(res_x_tgt)
            out['scores'].append(res_scores)

    return out
Example #15
def __rank_slates(dataloader: DataLoader, model: LTRModel) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Score every slate with the model and return features and labels reordered
    by descending score.
    """
    reranked_X = []
    reranked_y = []
    model.eval()
    device = get_torch_device()
    with torch.no_grad():
        for xb, yb, _ in dataloader:
            X = xb.type(torch.float32).to(device=device)
            y_true = yb.to(device=device)

            input_indices = torch.ones_like(y_true).type(torch.long)
            mask = (y_true == losses.PADDED_Y_VALUE)
            scores = model.score(X, mask, input_indices)

            scores[mask] = float('-inf')

            _, indices = scores.sort(descending=True, dim=-1)
            indices_X = torch.unsqueeze(indices, -1).repeat_interleave(X.shape[-1], -1)
            reranked_X.append(torch.gather(X, dim=1, index=indices_X).cpu())
            reranked_y.append(torch.gather(y_true, dim=1, index=indices).cpu())

    combined_X = torch.cat(reranked_X)
    combined_y = torch.cat(reranked_y)
    return combined_X, combined_y
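The core rerank move in both variants is a masked sort followed by torch.gather; stripped down to a self-contained sketch:

import torch

scores = torch.tensor([[0.2, 0.9, float('-inf')]])  # padded slot already masked
y_true = torch.tensor([[1.0, 2.0, -1.0]])

_, indices = scores.sort(descending=True, dim=-1)    # tensor([[1, 0, 2]])
reranked_y = torch.gather(y_true, dim=1, index=indices)
# tensor([[ 2.,  1., -1.]])   labels reordered by score, padding pushed last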
Example #16
def ordinal(
        y_pred,
        y_true,
        n,
        padded_value_indicator=PADDED_Y_VALUE):
    """
    Ordinal loss.
    :param y_pred: predictions from the model, shape [batch_size, slate_length, n]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param n: number of ordinal values, int
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :return: loss value, a torch.Tensor
    """
    dev = get_torch_device()

    y_pred = y_pred.clone()
    y_true = with_ordinals(y_true.clone(), n)

    mask = y_true == padded_value_indicator
    valid_mask = y_true != padded_value_indicator

    ls = BCELoss(reduction='none')(y_pred, y_true)
    ls[mask] = 0.0

    document_loss = torch.sum(ls, dim=2)
    sum_valid = torch.sum(valid_mask, dim=2).type(
        torch.float32) > torch.tensor(0.0, dtype=torch.float32, device=dev)

    loss_output = torch.sum(document_loss) / torch.sum(sum_valid)

    return loss_output
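A usage sketch (hypothetical shapes; assumes this module's with_ordinals, PADDED_Y_VALUE and get_torch_device are in scope, and that y_pred already holds per-threshold probabilities in (0, 1)):

import torch

n = 3
y_true = torch.tensor([[2.0, 0.0, 1.0]])   # [batch=1, slate=3], no padding here
y_pred = torch.rand(1, 3, n)               # [batch, slate, n]

loss = ordinal(y_pred, y_true, n=n)        # scalar torch.Tensor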
Example #17
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.job_dir, args.run_id, args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)

    create_output_dirs(paths.output_dir)
    logger = init_logger(paths.output_dir)

    logger.info("will save data in {output_dir}".format(output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    datasets = {role: load_libsvm_dataset_role(role, config.data.path, config.data.slate_length) for role in args.roles}

    n_features = [ds.shape[-1] for ds in datasets.values()]
    assert all_equal(n_features), f"Last dimensions of datasets must match but got {n_features}"

    # gpu support
    dev = get_torch_device()
    logger.info("Will use device {}".format(dev.type))

    # instantiate model
    model = make_model(n_features=n_features[0], **asdict(config.model, recurse=False))

    model.load_state_dict(load_state_dict_from_file(args.input_model_path, dev))
    logger.info(f"loaded model weights from {args.input_model_path}")

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(torch.cuda.device_count()))
    model.to(dev)

    assert config.click_model is not None, "click_model must be defined in config for this run"
    click_model = instantiate_from_recursive_name_args(name_args=config.click_model)

    ranked_slates = rank_slates(datasets, model, config)

    clicked_slates = {role: click_on_slates(slates, click_model, include_empty=False) for role, slates in ranked_slates.items()}

    # save clickthrough datasets
    for role, slates in clicked_slates.items():
        write_to_libsvm_without_masked(os.path.join(paths.output_dir, f"{role}.txt"), *slates)

    # calculate metrics
    metered_slates = {role: metrics_on_clicked_slates(slates) for role, slates in clicked_slates.items()}

    for role, metrics in metered_slates.items():
        metrics_df = pd.DataFrame(metrics)
        logger.info(f"{role} metrics summary:")
        logger.info(metrics_df.mean())
        metrics_df.to_csv(os.path.join(paths.output_dir, f"{role}_metrics.csv"), index=False)
        pd.DataFrame(metrics_df.mean()).T.to_csv(os.path.join(paths.output_dir, f"{role}_metrics_mean.csv"), index=False)

    if urlparse(args.job_dir).scheme == "gs":
        copy_local_to_gs(paths.local_base_output_path, args.job_dir)
Example #18
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.output, args.run_id,
                                     args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)

    create_output_dirs(paths.output_dir)
    logger = init_logger(paths.output_dir)

    logger.info("will save data in {output_dir}".format(
        output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    # train_ds, val_ds
    train_ds, val_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
    )

    n_features = train_ds.shape[-1]
    assert n_features == val_ds.shape[
        -1], "Last dimensions of train_ds and val_ds do not match!"

    # train_dl, val_dl
    train_dl, val_dl = create_data_loaders(train_ds,
                                           val_ds,
                                           num_workers=config.data.num_workers,
                                           batch_size=config.data.batch_size)

    # gpu support
    dev = get_torch_device()
    logger.info("Model training will execute on {}".format(dev.type))

    # instantiate model
    model = make_model(**asdict(config.model, recurse=False),
                       n_features=n_features)
    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(
            torch.cuda.device_count()))
    model.to(dev)

    # load optimizer, loss and LR scheduler
    optimizer = getattr(optim,
                        config.optimizer.name)(params=model.parameters(),
                                               **config.optimizer.args)
    loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)
    if config.lr_scheduler.name:
        scheduler = getattr(optim.lr_scheduler, config.lr_scheduler.name)(
            optimizer, **config.lr_scheduler.args)
    else:
        scheduler = None

    with torch.autograd.detect_anomaly(
    ) if config.detect_anomaly else dummy_context_mgr():
        # run training
        result = fit(**asdict(config.training),
                     model=model,
                     loss_func=loss_func,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     train_dl=train_dl,
                     valid_dl=val_dl,
                     config=config,
                     device=dev,
                     output_dir=paths.output_dir,
                     tensorboard_output_path=paths.tensorboard_output_path)

    dump_experiment_result(args, config, paths.output_dir, result)

    assert_expected_metrics(result, config.expected_metrics)
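The optimizer/loss wiring above relies on a small reflection pattern, getattr plus functools.partial; a self-contained illustration with literal strings standing in for the config values:

from functools import partial

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 1)

# config.optimizer.name -> "Adam", config.optimizer.args -> {"lr": 1e-3}
optimizer = getattr(optim, "Adam")(params=model.parameters(), lr=1e-3)

# config.loss.name selects a function; partial bakes in config.loss.args
def example_loss(y_pred, y_true, weight=1.0):
    return weight * ((y_pred - y_true) ** 2).mean()

loss_func = partial(example_loss, weight=0.5)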
Example #19
def neuralNDCG(y_pred, y_true, padded_value_indicator=PADDED_Y_VALUE, temperature=1., powered_relevancies=True, k=None,
               stochastic=False, n_samples=32, beta=0.1, log_scores=True):
    """
    NeuralNDCG loss introduced in "NeuralNDCG: Direct Optimisation of a Ranking Metric via Differentiable
    Relaxation of Sorting" - https://arxiv.org/abs/2102.07831. Based on the NeuralSort algorithm.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :param temperature: temperature for the NeuralSort algorithm
    :param powered_relevancies: whether to apply 2^x - 1 gain function, x otherwise
    :param k: rank at which the loss is truncated
    :param stochastic: whether to calculate the stochastic variant
    :param n_samples: how many stochastic samples are taken, used if stochastic == True
    :param beta: beta parameter for NeuralSort algorithm, used if stochastic == True
    :param log_scores: log_scores parameter for NeuralSort algorithm, used if stochastic == True
    :return: loss value, a torch.Tensor
    """
    dev = get_torch_device()

    if k is None:
        k = y_true.shape[1]

    mask = (y_true == padded_value_indicator)
    # Choose the deterministic/stochastic variant
    if stochastic:
        P_hat = stochastic_neural_sort(y_pred.unsqueeze(-1), n_samples=n_samples, tau=temperature, mask=mask,
                                       beta=beta, log_scores=log_scores)
    else:
        P_hat = deterministic_neural_sort(y_pred.unsqueeze(-1), tau=temperature, mask=mask).unsqueeze(0)

    # Perform sinkhorn scaling to obtain doubly stochastic permutation matrices
    # (mask tiled sample-major to line up with the flattened first dimension)
    P_hat = sinkhorn_scaling(P_hat.view(P_hat.shape[0] * P_hat.shape[1], P_hat.shape[2], P_hat.shape[3]),
                             mask.repeat(P_hat.shape[0], 1), tol=1e-6, max_iter=50)
    P_hat = P_hat.view(int(P_hat.shape[0] / y_pred.shape[0]), y_pred.shape[0], P_hat.shape[1], P_hat.shape[2])

    # Mask P_hat and apply it to the true labels, i.e. approximately sort them
    P_hat = P_hat.masked_fill(mask[None, :, :, None] | mask[None, :, None, :], 0.)
    y_true_masked = y_true.masked_fill(mask, 0.).unsqueeze(-1).unsqueeze(0)
    if powered_relevancies:
        y_true_masked = torch.pow(2., y_true_masked) - 1.

    ground_truth = torch.matmul(P_hat, y_true_masked).squeeze(-1)
    discounts = (torch.tensor(1.) / torch.log2(torch.arange(y_true.shape[-1], dtype=torch.float) + 2.)).to(dev)
    discounted_gains = ground_truth * discounts

    if powered_relevancies:
        idcg = dcg(y_true, y_true, ats=[k]).permute(1, 0)
    else:
        idcg = dcg(y_true, y_true, ats=[k], gain_function=lambda x: x).permute(1, 0)

    discounted_gains = discounted_gains[:, :, :k]
    ndcg = discounted_gains.sum(dim=-1) / (idcg + DEFAULT_EPS)
    idcg_mask = idcg == 0.
    ndcg = ndcg.masked_fill(idcg_mask.repeat(ndcg.shape[0], 1), 0.)

    assert (ndcg < 0.).sum() == 0, "every ndcg should be non-negative"
    if idcg_mask.all():
        return torch.tensor(0.)

    mean_ndcg = ndcg.sum() / ((~idcg_mask).sum() * ndcg.shape[0])  # type: ignore
    return -1. * mean_ndcg  # negated, since we minimize the loss but want to maximize NDCG
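A usage sketch for the deterministic variant (hypothetical shapes; assumes this module's helpers are in scope):

import torch

y_pred = torch.rand(2, 4, requires_grad=True)   # model scores
y_true = torch.tensor([[2.0, 0.0, 1.0, -1.0],
                       [0.0, 1.0, 0.0, 0.0]])   # -1 marks a padded document

loss = neuralNDCG(y_pred, y_true, temperature=1.0, k=3)
loss.backward()                                 # differentiable end to end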
Example #20
def neuralNDCG_transposed(y_pred, y_true, padded_value_indicator=PADDED_Y_VALUE, temperature=1.,
                          powered_relevancies=True, k=None, stochastic=False, n_samples=32, beta=0.1, log_scores=True,
                          max_iter=50, tol=1e-6):
    """
    NeuralNDCG Transposed loss introduced in "NeuralNDCG: Direct Optimisation of a Ranking Metric via Differentiable
    Relaxation of Sorting" - https://arxiv.org/abs/2102.07831. Based on the NeuralSort algorithm.
    :param y_pred: predictions from the model, shape [batch_size, slate_length]
    :param y_true: ground truth labels, shape [batch_size, slate_length]
    :param padded_value_indicator: an indicator of the y_true index containing a padded item, e.g. -1
    :param temperature: temperature for the NeuralSort algorithm
    :param powered_relevancies: whether to apply 2^x - 1 gain function, x otherwise
    :param k: rank at which the loss is truncated
    :param stochastic: whether to calculate the stochastic variant
    :param n_samples: how many stochastic samples are taken, used if stochastic == True
    :param beta: beta parameter for NeuralSort algorithm, used if stochastic == True
    :param log_scores: log_scores parameter for NeuralSort algorithm, used if stochastic == True
    :param max_iter: maximum iteration count for Sinkhorn scaling
    :param tol: tolerance for Sinkhorn scaling
    :return: loss value, a torch.Tensor
    """
    dev = get_torch_device()

    if k is None:
        k = y_true.shape[1]

    mask = (y_true == padded_value_indicator)

    if stochastic:
        P_hat = stochastic_neural_sort(y_pred.unsqueeze(-1), n_samples=n_samples, tau=temperature, mask=mask,
                                       beta=beta, log_scores=log_scores)
    else:
        P_hat = deterministic_neural_sort(y_pred.unsqueeze(-1), tau=temperature, mask=mask).unsqueeze(0)

    # Perform sinkhorn scaling to obtain doubly stochastic permutation matrices
    P_hat_masked = sinkhorn_scaling(P_hat.view(P_hat.shape[0] * y_pred.shape[0], y_pred.shape[1], y_pred.shape[1]),
                                    mask.repeat(P_hat.shape[0], 1), tol=tol, max_iter=max_iter)
    P_hat_masked = P_hat_masked.view(P_hat.shape[0], y_pred.shape[0], y_pred.shape[1], y_pred.shape[1])
    discounts = (torch.tensor(1) / torch.log2(torch.arange(y_true.shape[-1], dtype=torch.float) + 2.)).to(dev)

    # This takes care of the @k metric truncation - if something is @>k, it is useless and gets 0.0 discount
    discounts[k:] = 0.
    discounts = discounts[None, None, :, None]

    # Here the discounts become expected discounts
    discounts = torch.matmul(P_hat_masked.permute(0, 1, 3, 2), discounts).squeeze(-1)
    if powered_relevancies:
        gains = torch.pow(2., y_true) - 1
        discounted_gains = gains.unsqueeze(0) * discounts
        idcg = dcg(y_true, y_true, ats=[k]).squeeze()
    else:
        gains = y_true
        discounted_gains = gains.unsqueeze(0) * discounts
        # idcg must use the same (linear) gain function as the gains above
        idcg = dcg(y_true, y_true, ats=[k], gain_function=lambda x: x).squeeze()

    ndcg = discounted_gains.sum(dim=2) / (idcg + DEFAULT_EPS)
    idcg_mask = idcg == 0.
    ndcg = ndcg.masked_fill(idcg_mask, 0.)

    assert (ndcg < 0.).sum() == 0, "every ndcg should be non-negative"
    if idcg_mask.all():
        return torch.tensor(0.)

    mean_ndcg = ndcg.sum() / ((~idcg_mask).sum() * ndcg.shape[0])  # type: ignore
    return -1. * mean_ndcg  # negated, since we minimize the loss but want to maximize NDCG
Example #21
def run():
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    args = parse_args()

    paths = PathsContainer.from_args(args.job_dir, args.run_id,
                                     args.config_file_name)

    os.makedirs(paths.base_output_path, exist_ok=True)

    create_output_dirs(paths.output_dir)
    logger = init_logger(paths.output_dir)

    logger.info("will save data in {output_dir}".format(
        output_dir=paths.base_output_path))

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    train_ds, val_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
    )

    # load dstore and use as feature func
    dstore = Dstore(**config.dstore)
    n_features = train_ds.shape[-1]
    n_features = dstore.get_n_features(n_features, config)

    train_dl, val_dl = create_data_loaders(train_ds,
                                           val_ds,
                                           num_workers=config.data.num_workers,
                                           batch_size=config.data.batch_size,
                                           dstore=dstore)

    if dstore.prefetch:
        dstore.run_prefetch([train_dl, val_dl])

    # gpu support
    dev = get_torch_device()
    logger.info("Will use device {}".format(dev.type))

    # instantiate model
    model = make_model(n_features=n_features,
                       dstore=dstore,
                       **asdict(config.model, recurse=False))

    model.load_state_dict(load_state_dict_from_file(args.input_model_path,
                                                    dev))
    logger.info(f"loaded model weights from {args.input_model_path}")

    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(
            torch.cuda.device_count()))
    model.to(dev)

    datasets = {'vali': val_dl}

    ranked_slates = rank_slates(datasets, model, dstore, config)

    # save output
    for role, out in ranked_slates.items():
        write_out_dir(paths.output_dir, role, out, dstore)

    logger.info("Done.")
Example #22
def run(args):
    # reproducibility
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    paths = PathsContainer.from_args(args.job_dir, args.run_id,
                                     args.config_file_name)

    create_output_dirs(paths.output_dir)

    logger = init_logger(paths.output_dir)
    logger.info(f"created paths container {paths}")

    # read config
    config = Config.from_json(paths.config_path)
    logger.info("Config:\n {}".format(pformat(vars(config), width=1)))

    output_config_path = os.path.join(paths.output_dir, "used_config.json")
    execute_command("cp {} {}".format(paths.config_path, output_config_path))

    print("Shared in main", config.data.shared)
    # train_ds, val_ds, test_ds
    train_ds, val_ds, test_ds = load_libsvm_dataset(
        input_path=config.data.path,
        slate_length=config.data.slate_length,
        validation_ds_role=config.data.validation_ds_role,
        test_ds_role=config.data.test_ds_role,
        sigma=config.data.noise,
        shared=config.data.shared)

    n_features = train_ds.shape[-1]
    assert n_features == val_ds.shape[
        -1], "Last dimensions of train_ds and val_ds do not match!"

    # train_dl, val_dl, test_dl
    train_dl, val_dl, test_dl = create_data_loaders(
        train_ds,
        val_ds,
        test_ds,
        num_workers=config.data.num_workers,
        batch_size=config.data.batch_size)

    # gpu support
    dev = get_torch_device()
    logger.info("Model training will execute on {}".format(dev.type))

    # instantiate model
    use_distillation = bool(config.distillation_loss)
    full_pipeline = use_distillation and "full" in config.distillation_loss.name

    # a projection ("fit") layer is needed only when the teacher's and the
    # student's final FC sizes differ
    fit_size = None
    if full_pipeline and config.teacher_model.fc_model['sizes'][-1] != config.model.fc_model['sizes'][-1]:
        fit_size = config.teacher_model.fc_model['sizes'][-1]
    logger.info("Fit size: {}".format(fit_size))
    model = make_model(n_features=n_features,
                       **asdict(config.model, recurse=False),
                       fit_size=fit_size,
                       distillation=full_pipeline,
                       seq_len=config.data.slate_length)
    if torch.cuda.device_count() > 1:
        model = CustomDataParallel(model)
        logger.info("Model training will be distributed to {} GPUs.".format(
            torch.cuda.device_count()))
    model.to(dev)

    # load optimizer, loss and LR scheduler
    if hasattr(optim, config.optimizer.name):
        optimizer = getattr(optim,
                            config.optimizer.name)(params=model.parameters(),
                                                   **config.optimizer.args)
    else:
        raise ValueError(
            "Optimizer {} not found in torch.optim".format(config.optimizer.name))
    if config.lr_scheduler.name:
        scheduler = getattr(optim.lr_scheduler, config.lr_scheduler.name)(
            optimizer, **config.lr_scheduler.args)
    else:
        scheduler = None
    loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)

    if args.evaluate:
        test_metrics = compute_metrics(config.metrics, model, test_dl, dev)
        print(test_metrics)
        sys.exit()

    if use_distillation:
        if full_pipeline:
            assert config.teacher_model.transformer.h == config.model.transformer.h
        teacher_model = make_model(n_features=n_features,
                                   **asdict(config.teacher_model,
                                            recurse=False),
                                   distillation=full_pipeline,
                                   fit_size=None)
        if torch.cuda.device_count() > 1:
            teacher_model = CustomDataParallel(teacher_model)
            logger.info(
                "Model training will be distributed to {} GPUs.".format(
                    torch.cuda.device_count()))
        teacher_model.to(dev)
        loss_func = partial(getattr(losses, config.distillation_loss.name),
                            gt_loss_func=loss_func,
                            **config.distillation_loss.args)
        with torch.autograd.detect_anomaly(
        ) if config.detect_anomaly else dummy_context_mgr():  # type: ignore
            result, model = fit_with_distillation(
                student_model=model,
                teacher_model=teacher_model,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dl=train_dl,
                valid_dl=val_dl,
                config=config,
                device=dev,
                output_dir=paths.output_dir,
                tensorboard_output_path=paths.tensorboard_output_path,
                full=full_pipeline,
                **asdict(config.training))

    else:
        with torch.autograd.detect_anomaly(
        ) if config.detect_anomaly else dummy_context_mgr():  # type: ignore
            # run training
            result, model = fit(
                model=model,
                loss_func=loss_func,
                optimizer=optimizer,
                scheduler=scheduler,
                train_dl=train_dl,
                valid_dl=val_dl,
                config=config,
                device=dev,
                output_dir=paths.output_dir,
                tensorboard_output_path=paths.tensorboard_output_path,
                **asdict(config.training))
    # reload the best model checkpoint
    sd = torch.load(os.path.join(paths.output_dir, "best_model.pkl"))
    model.load_state_dict(sd)
    test_metrics = compute_metrics(config.metrics, model, test_dl, dev)
    result['test_metrics'] = test_metrics
    print(result)
    dump_experiment_result(args, config, paths.output_dir, result)

    if urlparse(args.job_dir).scheme == "gs":
        copy_local_to_gs(paths.local_base_output_path, args.job_dir)

    assert_expected_metrics(result, config.expected_metrics)
    return test_metrics['ndcg_10']