# Standard-library and third-party imports used by the evaluation functions
# below. The slate/bandit types (SlateSlots, SlateContext, SlateQuery,
# SlateItemValues, SlateItemFeatures, the *SlateMetric classes, LogSample,
# SlateEstimatorInput, BanditsEstimatorInput, ActionSpace, MultiClassPolicy,
# Evaluator, Trainer, and the dataset classes) come from ReAgent's OPE
# modules, which the original files import.
import logging
import os
import random
import time
from pathlib import PurePath
from typing import Iterable, Sequence, Tuple

import numpy as np


def evaluate(
    experiments: Iterable[Tuple[Iterable[SlateEstimator], int]],
    log_dataset: TrainingDataset,
    log_distribution: RewardDistribution,
    tgt_dataset: TrainingDataset,
    tgt_distribution: RewardDistribution,
    log_queries: Sequence[TrainingQuery],
    slate_size: int,
    item_size: int,
    metric_func: str,
    max_num_workers: int,
    device=None,
):
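    """Run slate off-policy evaluation experiments over sampled log queries.

    For each (estimators, num_samples) pair, num_samples queries are drawn
    from log_queries, logging and target item distributions are built from
    the two relevance datasets, the ground-truth slate reward is computed
    under the requested metric, and the resulting LogSamples are evaluated in
    parallel by Evaluator.
    """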
    log_length = len(log_queries)
    slots = SlateSlots(slate_size)

    logging.info("Generating log...")
    st = time.perf_counter()
    tasks = []
    total_samples = 0
    for estimators, num_samples in experiments:
        samples = []
        if num_samples * 10 > log_length:
            logging.warning(f"not enough log data: need {num_samples * 10} "
                            f"queries, have {log_length}")
            continue
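        # sample distinct query indices from the log, without replacement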
        query_choices = np.random.choice(log_length,
                                         num_samples,
                                         replace=False)
        for i in query_choices:
            q = log_queries[i]
            context = SlateContext(SlateQuery((q.query_id, *(q.query_terms))),
                                   slots)
            url_relevances = q.url_relevances
            if len(url_relevances) > item_size:
                # cap the candidate set at the item_size most relevant URLs
                url_relevances = {
                    k: v
                    for k, v in sorted(url_relevances.items(),
                                       key=lambda item: item[1],
                                       reverse=True)[:item_size]
                }
            items = url_relevances.keys()
            log_item_rewards = log_dataset.item_relevances(
                q.query_id, q.query_terms, items)
            log_item_probs = log_distribution(log_item_rewards)
            tgt_item_rewards = tgt_dataset.item_relevances(
                q.query_id, q.query_terms, items)
            tgt_item_probs = tgt_distribution(tgt_item_rewards)
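            # expected item occupancy of each slot under the target policy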
            tgt_slot_expectation = tgt_item_probs.slot_item_expectations(slots)
            gt_item_rewards = SlateItemValues(url_relevances)
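            # pick the slate metric; NDCG needs the ground-truth item rewards
            # to compute its ideal-DCG normalization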
            if metric_func == "dcg":
                metric = DCGSlateMetric(device=device)
            elif metric_func == "err":
                metric = ERRSlateMetric(4.0, device=device)
            else:
                metric = NDCGSlateMetric(gt_item_rewards, device=device)
            slot_weights = metric.slot_weights(slots)
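            # a deterministic target policy produces its slate with
            # probability 1.0 and is logged directly; otherwise the logged
            # slate is sampled from the logging distribution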
            if tgt_item_probs.is_deterministic:
                tgt_slate_prob = 1.0
                log_slate = tgt_item_probs.sample_slate(slots)
            else:
                tgt_slate_prob = float("nan")
                log_slate = log_item_probs.sample_slate(slots)
            log_slate_prob = log_item_probs.slate_probability(log_slate)
            log_rewards = log_slate.slot_values(gt_item_rewards)
            log_reward = metric.calculate_reward(slots, log_rewards, None,
                                                 slot_weights)
            gt_slot_rewards = tgt_slot_expectation.expected_rewards(
                gt_item_rewards)
            gt_reward = metric.calculate_reward(slots, gt_slot_rewards, None,
                                                slot_weights)
            samples.append(
                LogSample(
                    context,
                    metric,
                    log_slate,
                    log_reward,
                    log_slate_prob,
                    None,
                    log_item_probs,
                    tgt_slate_prob,
                    None,
                    tgt_item_probs,
                    gt_reward,
                    slot_weights,
                ))
            total_samples += 1
        tasks.append((estimators, SlateEstimatorInput(samples)))
    dt = time.perf_counter() - st
    logging.info(f"Generating log done: {total_samples} samples in {dt}s")

    logging.info("start evaluating...")
    st = time.perf_counter()
    evaluator = Evaluator(tasks, max_num_workers)
    Evaluator.report_results(evaluator.evaluate())
    logging.info(f"evaluating done in {time.perf_counter() - st}s")
def evaluate_all(
    experiments: Iterable[Tuple[Iterable[Estimator], int]],
    dataset: UCIMultiClassDataset,
    log_trainer: Trainer,
    log_epsilon: float,
    tgt_trainer: Trainer,
    tgt_epsilon: float,
    max_num_workers: int,
    random_reward_prob: float = 0.0,
    device=None,
):
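    """Contextual-bandit off-policy evaluation on a UCI multiclass dataset.

    Logging and target classifiers are loaded from cached pickles (or trained
    and saved), wrapped in MultiClassPolicy with their respective epsilons,
    and used to generate logged samples whose reward is 1.0 when the chosen
    action matches the true label (optionally replaced by random 0/1 noise).
    """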
    action_space = ActionSpace(dataset.num_actions)
    config_path = PurePath(dataset.config_file)
    data_name = config_path.stem
    log_model_name = data_name + "_" + log_trainer.__class__.__name__ + ".pickle"
    log_model_file = str(config_path.with_name(log_model_name))
    tgt_model_name = data_name + "_" + tgt_trainer.__class__.__name__ + ".pickle"
    tgt_model_file = str(config_path.with_name(tgt_model_name))

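    # reuse cached models when available; train and save them otherwise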
    log_trainer.load_model(log_model_file)
    tgt_trainer.load_model(tgt_model_file)
    train_choices = []
    if not log_trainer.is_trained or not tgt_trainer.is_trained:
        (
            train_x,
            train_y,
            train_r,
            val_x,
            val_y,
            val_r,
            test_x,
            test_y,
            test_r,
            train_choices,
        ) = dataset.train_val_test_split((0.2, 0.8))
        trainer_data = TrainingData(train_x, train_y, None, val_x, val_y, None)
        if not log_trainer.is_trained:
            log_trainer.train(trainer_data)
            log_trainer.save_model(log_model_file)
        if not tgt_trainer.is_trained:
            tgt_trainer.train(trainer_data)
            tgt_trainer.save_model(tgt_model_file)

    log_results = log_trainer.predict(dataset.features)
    assert log_results.probabilities is not None
    log_policy = MultiClassPolicy(action_space, log_results.probabilities,
                                  log_epsilon)

    tgt_results = tgt_trainer.predict(dataset.features)
    assert tgt_results.probabilities is not None
    tgt_policy = MultiClassPolicy(action_space, tgt_results.probabilities,
                                  tgt_epsilon)

    tasks = []
    # if pre-trained models were loaded, no training split was made and all
    # queries are eligible as test queries
    test_queries = list(set(range(len(dataset))) - set(train_choices))
    for estimators, num_samples in experiments:
        samples = []
        for _ in range(num_samples):
            qid = random.sample(test_queries, 1)
            label = int(dataset.labels[qid].item())
            log_action, log_action_probabilities = log_policy(qid)
            log_reward = 1.0 if log_action.value == label else 0.0
            tgt_action, tgt_action_probabilities = tgt_policy(qid)
            ground_truth_reward = 1.0 if tgt_action.value == label else 0.0
            item_feature = dataset.features[qid]
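            # with probability random_reward_prob, replace the logged reward
            # with a random 0/1 reward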
            random_reward = random.random() < random_reward_prob
            samples.append(
                LogSample(
                    context=qid,
                    log_action=log_action,
                    log_reward=random.randint(0, 1)
                    if random_reward else log_reward,
                    log_action_probabilities=log_action_probabilities,
                    tgt_action_probabilities=tgt_action_probabilities,
                    tgt_action=tgt_action,
                    ground_truth_reward=ground_truth_reward,
                    item_feature=item_feature,
                ))
        tasks.append(
            (estimators, BanditsEstimatorInput(action_space, samples, False)))

    evaluator = Evaluator(tasks, max_num_workers)
    results = evaluator.evaluate()
    Evaluator.report_results(results)
    return results
def evaluate_all(
    experiments: Iterable[Tuple[Iterable[Estimator], int]],
    dataset: UCIMultiClassDataset,
    log_trainer: Trainer,
    log_epsilon: float,
    tgt_trainer: Trainer,
    tgt_epsilon: float,
    max_num_workers: int,
    device=None,
):
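    """Contextual-bandit off-policy evaluation on a UCI multiclass dataset.

    Contexts are sampled uniformly from the full dataset, and the logged
    reward is 1.0 when the logging policy's action matches the true label.
    """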
    action_space = ActionSpace(dataset.num_actions)
    config_path = PurePath(dataset.config_file)
    data_name = config_path.stem
    log_model_name = data_name + "_" + log_trainer.__class__.__name__ + ".pickle"
    log_model_file = str(config_path.with_name(log_model_name))
    tgt_model_name = data_name + "_" + tgt_trainer.__class__.__name__ + ".pickle"
    tgt_model_file = str(config_path.with_name(tgt_model_name))

    log_trainer.load_model(log_model_file)
    tgt_trainer.load_model(tgt_model_file)
    if not log_trainer.is_trained or not tgt_trainer.is_trained:
        (
            train_x,
            train_y,
            train_r,
            val_x,
            val_y,
            val_r,
            test_x,
            test_y,
            test_r,
        ) = dataset.train_val_test_split((0.8, 0.8))
        trainer_data = TrainingData(train_x, train_y, None, val_x, val_y, None)
        if not log_trainer.is_trained:
            log_trainer.train(trainer_data)
            log_trainer.save_model(log_model_file)
        if not tgt_trainer.is_trained:
            tgt_trainer.train(trainer_data)
            tgt_trainer.save_model(tgt_model_file)

    log_results = log_trainer.predict(dataset.features)
    log_policy = MultiClassPolicy(action_space, log_results.probabilities,
                                  log_epsilon)

    tgt_results = tgt_trainer.predict(dataset.features)
    tgt_policy = MultiClassPolicy(action_space, tgt_results.probabilities,
                                  tgt_epsilon)

    tasks = []
    total_queries = len(dataset)
    for estimators, num_samples in experiments:
        samples = []
        for _ in range(num_samples):
            qid = random.randrange(total_queries)
            label = int(dataset.labels[qid].item())
            log_action, log_action_probabilities = log_policy(qid)
            log_reward = 1.0 if log_action.value == label else 0.0
            tgt_action, tgt_action_probabilities = tgt_policy(qid)
            ground_truth_reward = 1.0 if tgt_action.value == label else 0.0
            item_feature = dataset.features[qid]
            samples.append(
                LogSample(
                    context=qid,
                    log_action=log_action,
                    log_reward=log_reward,
                    log_action_probabilities=log_action_probabilities,
                    tgt_action_probabilities=tgt_action_probabilities,
                    tgt_action=tgt_action,
                    ground_truth_reward=ground_truth_reward,
                    item_feature=item_feature,
                ))
        tasks.append(
            (estimators, BanditsEstimatorInput(action_space, samples, False)))

    logging.info("start evaluating...")
    st = time.perf_counter()
    evaluator = Evaluator(tasks, max_num_workers)
    results = evaluator.evaluate()
    Evaluator.report_results(results)
    logging.info(f"evaluating done in {time.perf_counter() - st}s")
    return results
def evaluate(
    experiments: Iterable[Tuple[Iterable[SlateEstimator], int]],
    dataset: MSLRDatasets,
    slate_size: int,
    item_size: int,
    metric_func: str,
    log_trainer: Trainer,
    log_distribution: RewardDistribution,
    log_features: str,
    tgt_trainer: Trainer,
    tgt_distribution: RewardDistribution,
    tgt_features: str,
    dm_features: str,
    max_num_workers: int,
    device=None,
):
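    """Slate off-policy evaluation on an MSLR learning-to-rank dataset.

    Logging and target rankers are loaded from pickled models, their scores
    are turned into item distributions, and sampled slates are scored with
    the requested metric against the dataset's ground-truth relevances.
    """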
    assert slate_size < item_size
    print(
        f"Evaluate All:"
        f" slate_size={slate_size}, item_size={item_size}, metric={metric_func}"
        f", Log=[{log_trainer.name}, {log_distribution}, {log_features}]"
        f", Target=[{tgt_trainer.name}, {tgt_distribution}, {tgt_features}]"
        f", DM=[{dm_features}]"
        f", Workers={max_num_workers}, device={device}",
        flush=True,
    )
    logging.info("Preparing models and policies...")
    st = time.perf_counter()
    log_trainer.load_model(
        os.path.join(dataset.folder,
                     log_trainer.name + "_all_" + log_features + ".pickle"))
    # calculate behavior model scores
    log_pred = log_trainer.predict(getattr(dataset, log_features))

    tgt_trainer.load_model(
        os.path.join(dataset.folder,
                     tgt_trainer.name + "_all_" + tgt_features + ".pickle"))
    # calculate target model scores
    tgt_pred = tgt_trainer.predict(getattr(dataset, tgt_features))

    dm_train_features = getattr(dataset, dm_features)

    slots = SlateSlots(slate_size)

    dt = time.perf_counter() - st
    logging.info(f"Preparing models and policies done: {dt}s")

    total_samples = 0
    for _, num_samples in experiments:
        total_samples += num_samples
    logging.info(f"Generating log: total_samples={total_samples}")
    st = time.perf_counter()
    tasks = []
    samples_generated = 0
    total_queries = dataset.queries.shape[0]
    for estimators, num_samples in experiments:
        samples = []
        for _ in range(num_samples):
            # randomly sample a query
            q = dataset.queries[random.randrange(total_queries)]
            doc_size = int(q[2])
            if doc_size < item_size:
                # skip if number of docs is less than item_size
                continue
            si = int(q[1])
            ei = si + doc_size
            # using top item_size docs for logging
            log_scores, item_choices = log_pred.scores[si:ei].sort(
                dim=0, descending=True)
            log_scores = log_scores[:item_size]
            item_choices = item_choices[:item_size]
            log_item_probs = log_distribution(SlateItemValues(log_scores))
            tgt_scores = tgt_pred.scores[si:ei][item_choices].detach().clone()
            tgt_item_probs = tgt_distribution(SlateItemValues(tgt_scores))
            tgt_slot_expectation = tgt_item_probs.slot_item_expectations(slots)
            gt_item_rewards = SlateItemValues(
                dataset.relevances[si:ei][item_choices])
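            # expected per-slot reward of the target policy, used as the
            # ground-truth reward for this context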
            gt_rewards = tgt_slot_expectation.expected_rewards(gt_item_rewards)
            if metric_func == "dcg":
                metric = DCGSlateMetric(device=device)
            elif metric_func == "err":
                metric = ERRSlateMetric(4.0, device=device)
            else:
                metric = NDCGSlateMetric(gt_item_rewards, device=device)
            query = SlateQuery((si, ei))
            context = SlateContext(query, slots, item_choices)
            slot_weights = metric.slot_weights(slots)
            gt_reward = metric.calculate_reward(slots, gt_rewards, None,
                                                slot_weights)
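            # a deterministic target policy produces its slate with
            # probability 1.0, so the logged slate and reward coincide with
            # the target's; otherwise sample the slate from the logging
            # distribution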
            if tgt_item_probs.is_deterministic:
                tgt_slate_prob = 1.0
                log_slate = tgt_item_probs.sample_slate(slots)
                log_reward = gt_reward
            else:
                tgt_slate_prob = float("nan")
                log_slate = log_item_probs.sample_slate(slots)
                log_rewards = log_slate.slot_values(gt_item_rewards)
                log_reward = metric.calculate_reward(slots, log_rewards, None,
                                                     slot_weights)
            log_slate_prob = log_item_probs.slate_probability(log_slate)
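            # per-item features for estimators with a direct-method component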
            item_features = SlateItemFeatures(
                dm_train_features[si:ei][item_choices])
            sample = LogSample(
                context,
                metric,
                log_slate,
                log_reward,
                log_slate_prob,
                None,
                log_item_probs,
                tgt_slate_prob,
                None,
                tgt_item_probs,
                gt_reward,
                slot_weights,
                None,
                item_features,
            )
            samples.append(sample)
            samples_generated += 1
            if samples_generated % 1000 == 0:
                logging.info(
                    f"  samples generated: {samples_generated}, {100 * samples_generated / total_samples:.1f}%"
                )
        tasks.append((estimators, SlateEstimatorInput(samples)))
    dt = time.perf_counter() - st
    logging.info(f"Generating log done: {samples_generated} samples in {dt}s")

    logging.info("start evaluating...")
    st = time.perf_counter()
    evaluator = Evaluator(tasks, max_num_workers)
    Evaluator.report_results(evaluator.evaluate())
    logging.info(f"evaluating done in {time.perf_counter() - st}s")