# Shared imports for the evaluation drivers below (stdlib and numpy only; the
# estimator, dataset, policy, and metric types are assumed to be imported from
# their defining modules).
import logging
import os
import random
import time
from pathlib import PurePath
from typing import Iterable, Sequence, Tuple

import numpy as np


def evaluate(
    experiments: Iterable[Tuple[Iterable[SlateEstimator], int]],
    log_dataset: TrainingDataset,
    log_distribution: RewardDistribution,
    tgt_dataset: TrainingDataset,
    tgt_distribution: RewardDistribution,
    log_queries: Sequence[TrainingQuery],
    slate_size: int,
    item_size: int,
    metric_func: str,
    max_num_workers: int,
    device=None,
):
    log_length = len(log_queries)
    slots = SlateSlots(slate_size)

    logging.info("Generating log...")
    st = time.perf_counter()
    tasks = []
    total_samples = 0
    for estimators, num_samples in experiments:
        samples = []
        if num_samples * 10 > log_length:
            logging.warning(f"not enough log data, needs {num_samples * 10}")
            continue
        query_choices = np.random.choice(log_length, num_samples, replace=False)
        for i in query_choices:
            q = log_queries[i]
            context = SlateContext(SlateQuery((q.query_id, *(q.query_terms))), slots)
            url_relevances = q.url_relevances
            if len(url_relevances) > item_size:
                # keep the item_size most relevant urls (sort descending)
                url_relevances = {
                    k: v
                    for k, v in sorted(
                        url_relevances.items(), key=lambda item: item[1], reverse=True
                    )[:item_size]
                }
            items = url_relevances.keys()
            log_item_rewards = log_dataset.item_relevances(
                q.query_id, q.query_terms, items
            )
            log_item_probs = log_distribution(log_item_rewards)
            tgt_item_rewards = tgt_dataset.item_relevances(
                q.query_id, q.query_terms, items
            )
            tgt_item_probs = tgt_distribution(tgt_item_rewards)
            tgt_slot_expectation = tgt_item_probs.slot_item_expectations(slots)
            gt_item_rewards = SlateItemValues(url_relevances)
            if metric_func == "dcg":
                metric = DCGSlateMetric(device=device)
            elif metric_func == "err":
                metric = ERRSlateMetric(4.0, device=device)
            else:
                metric = NDCGSlateMetric(gt_item_rewards, device=device)
            slot_weights = metric.slot_weights(slots)
            if tgt_item_probs.is_deterministic:
                # a deterministic target policy produces its slate with probability 1
                tgt_slate_prob = 1.0
                log_slate = tgt_item_probs.sample_slate(slots)
            else:
                tgt_slate_prob = float("nan")
                log_slate = log_item_probs.sample_slate(slots)
            log_slate_prob = log_item_probs.slate_probability(log_slate)
            log_rewards = log_slate.slot_values(gt_item_rewards)
            log_reward = metric.calculate_reward(slots, log_rewards, None, slot_weights)
            gt_slot_rewards = tgt_slot_expectation.expected_rewards(gt_item_rewards)
            gt_reward = metric.calculate_reward(slots, gt_slot_rewards, None, slot_weights)
            samples.append(
                LogSample(
                    context,
                    metric,
                    log_slate,
                    log_reward,
                    log_slate_prob,
                    None,
                    log_item_probs,
                    tgt_slate_prob,
                    None,
                    tgt_item_probs,
                    gt_reward,
                    slot_weights,
                )
            )
            total_samples += 1
        tasks.append((estimators, SlateEstimatorInput(samples)))
    dt = time.perf_counter() - st
    logging.info(f"Generating log done: {total_samples} samples in {dt}s")

    logging.info("start evaluating...")
    st = time.perf_counter()
    evaluator = Evaluator(tasks, max_num_workers)
    Evaluator.report_results(evaluator.evaluate())
    logging.info(f"evaluating done in {time.perf_counter() - st}s")
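# A minimal driver sketch for evaluate() above. Only the argument order and
# the metric_func strings come from the function itself; the experiment sizes
# and parameter values are illustrative assumptions, and the caller is assumed
# to already hold the datasets, distributions, queries, and SlateEstimator
# instances.
def run_slate_evaluation(
    estimators, log_dataset, log_distribution, tgt_dataset, tgt_distribution, log_queries
):
    # each experiment is (estimators, sample count); the log must hold at
    # least 10x the requested samples or the experiment is skipped
    experiments = [(estimators, 200), (estimators, 2000)]
    evaluate(
        experiments,
        log_dataset,
        log_distribution,
        tgt_dataset,
        tgt_distribution,
        log_queries,
        slate_size=5,
        item_size=10,
        metric_func="ndcg",  # any value other than "dcg" or "err" selects NDCG
        max_num_workers=4,
    )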
def evaluate_all(
    experiments: Iterable[Tuple[Iterable[Estimator], int]],
    dataset: UCIMultiClassDataset,
    log_trainer: Trainer,
    log_epsilon: float,
    tgt_trainer: Trainer,
    tgt_epsilon: float,
    max_num_workers: int,
    random_reward_prob: float = 0.0,
    device=None,
):
    action_space = ActionSpace(dataset.num_actions)
    config_path = PurePath(dataset.config_file)
    data_name = config_path.stem
    log_model_name = data_name + "_" + log_trainer.__class__.__name__ + ".pickle"
    log_model_file = str(config_path.with_name(log_model_name))
    tgt_model_name = data_name + "_" + tgt_trainer.__class__.__name__ + ".pickle"
    tgt_model_file = str(config_path.with_name(tgt_model_name))

    log_trainer.load_model(log_model_file)
    tgt_trainer.load_model(tgt_model_file)
    # train_choices stays empty when both models load from disk; the whole
    # dataset is then treated as held-out test queries
    train_choices = []
    if not log_trainer.is_trained or not tgt_trainer.is_trained:
        (
            train_x,
            train_y,
            train_r,
            val_x,
            val_y,
            val_r,
            test_x,
            test_y,
            test_r,
            train_choices,
        ) = dataset.train_val_test_split((0.2, 0.8))
        trainer_data = TrainingData(train_x, train_y, None, val_x, val_y, None)
        if not log_trainer.is_trained:
            log_trainer.train(trainer_data)
            log_trainer.save_model(log_model_file)
        if not tgt_trainer.is_trained:
            tgt_trainer.train(trainer_data)
            tgt_trainer.save_model(tgt_model_file)

    log_results = log_trainer.predict(dataset.features)
    assert log_results.probabilities is not None
    log_policy = MultiClassPolicy(action_space, log_results.probabilities, log_epsilon)

    tgt_results = tgt_trainer.predict(dataset.features)
    assert tgt_results.probabilities is not None
    tgt_policy = MultiClassPolicy(action_space, tgt_results.probabilities, tgt_epsilon)

    tasks = []
    test_queries = list(set(range(len(dataset))) - set(train_choices))
    for estimators, num_samples in experiments:
        samples = []
        for _ in range(num_samples):
            qid = random.sample(test_queries, 1)
            label = int(dataset.labels[qid].item())
            log_action, log_action_probabilities = log_policy(qid)
            log_reward = 1.0 if log_action.value == label else 0.0
            tgt_action, tgt_action_probabilities = tgt_policy(qid)
            ground_truth_reward = 1.0 if tgt_action.value == label else 0.0
            item_feature = dataset.features[qid]
            # optionally replace the logged reward with a coin flip to test
            # estimator robustness to reward noise
            random_reward = random.random() < random_reward_prob
            samples.append(
                LogSample(
                    context=qid,
                    log_action=log_action,
                    log_reward=random.randint(0, 1) if random_reward else log_reward,
                    log_action_probabilities=log_action_probabilities,
                    tgt_action_probabilities=tgt_action_probabilities,
                    tgt_action=tgt_action,
                    ground_truth_reward=ground_truth_reward,
                    item_feature=item_feature,
                )
            )
        tasks.append((estimators, BanditsEstimatorInput(action_space, samples, False)))

    evaluator = Evaluator(tasks, max_num_workers)
    results = evaluator.evaluate()
    Evaluator.report_results(results)
    return results
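# Standalone sketch of the reward-noise perturbation used in evaluate_all()
# above: with probability random_reward_prob the logged reward is replaced by
# a fair coin flip, which lets experiments probe estimator robustness to
# reward noise. The helper name is hypothetical.
def perturb_log_reward(log_reward: float, random_reward_prob: float) -> float:
    if random.random() < random_reward_prob:
        return float(random.randint(0, 1))  # uniform over {0, 1}
    return log_reward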
def evaluate_all(
    experiments: Iterable[Tuple[Iterable[Estimator], int]],
    dataset: UCIMultiClassDataset,
    log_trainer: Trainer,
    log_epsilon: float,
    tgt_trainer: Trainer,
    tgt_epsilon: float,
    max_num_workers: int,
    device=None,
):
    action_space = ActionSpace(dataset.num_actions)
    config_path = PurePath(dataset.config_file)
    data_name = config_path.stem
    log_model_name = data_name + "_" + log_trainer.__class__.__name__ + ".pickle"
    log_model_file = str(config_path.with_name(log_model_name))
    tgt_model_name = data_name + "_" + tgt_trainer.__class__.__name__ + ".pickle"
    tgt_model_file = str(config_path.with_name(tgt_model_name))

    log_trainer.load_model(log_model_file)
    tgt_trainer.load_model(tgt_model_file)
    if not log_trainer.is_trained or not tgt_trainer.is_trained:
        (
            train_x,
            train_y,
            train_r,
            val_x,
            val_y,
            val_r,
            test_x,
            test_y,
            test_r,
        ) = dataset.train_val_test_split((0.8, 0.8))
        trainer_data = TrainingData(train_x, train_y, None, val_x, val_y, None)
        if not log_trainer.is_trained:
            log_trainer.train(trainer_data)
            log_trainer.save_model(log_model_file)
        if not tgt_trainer.is_trained:
            tgt_trainer.train(trainer_data)
            tgt_trainer.save_model(tgt_model_file)

    log_results = log_trainer.predict(dataset.features)
    log_policy = MultiClassPolicy(action_space, log_results.probabilities, log_epsilon)

    tgt_results = tgt_trainer.predict(dataset.features)
    tgt_policy = MultiClassPolicy(action_space, tgt_results.probabilities, tgt_epsilon)

    tasks = []
    total_queries = len(dataset)
    for estimators, num_samples in experiments:
        samples = []
        for _ in range(num_samples):
            # this variant does not track the train split, so queries are
            # sampled from the full dataset
            qid = random.randrange(total_queries)
            label = int(dataset.labels[qid].item())
            log_action, log_action_probabilities = log_policy(qid)
            log_reward = 1.0 if log_action.value == label else 0.0
            tgt_action, tgt_action_probabilities = tgt_policy(qid)
            ground_truth_reward = 1.0 if tgt_action.value == label else 0.0
            item_feature = dataset.features[qid]
            samples.append(
                LogSample(
                    context=qid,
                    log_action=log_action,
                    log_reward=log_reward,
                    log_action_probabilities=log_action_probabilities,
                    tgt_action_probabilities=tgt_action_probabilities,
                    tgt_action=tgt_action,
                    ground_truth_reward=ground_truth_reward,
                    item_feature=item_feature,
                )
            )
        tasks.append((estimators, BanditsEstimatorInput(action_space, samples, False)))

    logging.info("start evaluating...")
    st = time.perf_counter()
    evaluator = Evaluator(tasks, max_num_workers)
    results = evaluator.evaluate()
    Evaluator.report_results(results)
    logging.info(f"evaluating done in {time.perf_counter() - st}s")
    return results
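# A minimal driver sketch shared by both evaluate_all() variants above (the
# first also accepts random_reward_prob). The epsilon values and sample count
# are illustrative assumptions; the dataset, trainers, and estimators are
# assumed to be constructed elsewhere.
def run_bandit_evaluation(dataset, log_trainer, tgt_trainer, estimators):
    return evaluate_all(
        [(estimators, 1000)],
        dataset,
        log_trainer,
        log_epsilon=0.2,   # more exploration mixed into the behavior policy
        tgt_trainer=tgt_trainer,
        tgt_epsilon=0.05,  # target policy stays close to greedy
        max_num_workers=8,
    )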
def evaluate(
    experiments: Iterable[Tuple[Iterable[SlateEstimator], int]],
    dataset: MSLRDatasets,
    slate_size: int,
    item_size: int,
    metric_func: str,
    log_trainer: Trainer,
    log_distribution: RewardDistribution,
    log_features: str,
    tgt_trainer: Trainer,
    tgt_distribution: RewardDistribution,
    tgt_features: str,
    dm_features: str,
    max_num_workers: int,
    device=None,
):
    assert slate_size < item_size
    print(
        f"Evaluate All:"
        f" slate_size={slate_size}, item_size={item_size}, metric={metric_func}"
        f", Log=[{log_trainer.name}, {log_distribution}, {log_features}]"
        f", Target=[{tgt_trainer.name}, {tgt_distribution}, {tgt_features}]"
        f", DM=[{dm_features}]"
        f", Workers={max_num_workers}, device={device}",
        flush=True,
    )
    logging.info("Preparing models and policies...")
    st = time.perf_counter()
    log_trainer.load_model(
        os.path.join(
            dataset.folder, log_trainer.name + "_all_" + log_features + ".pickle"
        )
    )
    # calculate behavior model scores
    log_pred = log_trainer.predict(getattr(dataset, log_features))

    tgt_trainer.load_model(
        os.path.join(
            dataset.folder, tgt_trainer.name + "_all_" + tgt_features + ".pickle"
        )
    )
    # calculate target model scores
    tgt_pred = tgt_trainer.predict(getattr(dataset, tgt_features))

    dm_train_features = getattr(dataset, dm_features)

    slots = SlateSlots(slate_size)

    dt = time.perf_counter() - st
    logging.info(f"Preparing models and policies done: {dt}s")

    total_samples = 0
    for _, num_samples in experiments:
        total_samples += num_samples
    logging.info(f"Generating log: total_samples={total_samples}")
    st = time.perf_counter()
    tasks = []
    samples_generated = 0
    total_queries = dataset.queries.shape[0]
    for estimators, num_samples in experiments:
        samples = []
        for _ in range(num_samples):
            # randomly sample a query
            q = dataset.queries[random.randrange(total_queries)]
            doc_size = int(q[2])
            if doc_size < item_size:
                # skip if number of docs is less than item_size
                continue
            si = int(q[1])
            ei = si + doc_size
            # using top item_size docs for logging
            log_scores, item_choices = log_pred.scores[si:ei].sort(
                dim=0, descending=True
            )
            log_scores = log_scores[:item_size]
            item_choices = item_choices[:item_size]
            log_item_probs = log_distribution(SlateItemValues(log_scores))
            tgt_scores = tgt_pred.scores[si:ei][item_choices].detach().clone()
            tgt_item_probs = tgt_distribution(SlateItemValues(tgt_scores))
            tgt_slot_expectation = tgt_item_probs.slot_item_expectations(slots)
            gt_item_rewards = SlateItemValues(dataset.relevances[si:ei][item_choices])
            gt_rewards = tgt_slot_expectation.expected_rewards(gt_item_rewards)
            if metric_func == "dcg":
                metric = DCGSlateMetric(device=device)
            elif metric_func == "err":
                metric = ERRSlateMetric(4.0, device=device)
            else:
                metric = NDCGSlateMetric(gt_item_rewards, device=device)
            query = SlateQuery((si, ei))
            context = SlateContext(query, slots, item_choices)
            slot_weights = metric.slot_weights(slots)
            gt_reward = metric.calculate_reward(slots, gt_rewards, None, slot_weights)
            if tgt_item_probs.is_deterministic:
                tgt_slate_prob = 1.0
                log_slate = tgt_item_probs.sample_slate(slots)
                log_reward = gt_reward
            else:
                tgt_slate_prob = float("nan")
                log_slate = log_item_probs.sample_slate(slots)
                log_rewards = log_slate.slot_values(gt_item_rewards)
                log_reward = metric.calculate_reward(
                    slots, log_rewards, None, slot_weights
                )
            log_slate_prob = log_item_probs.slate_probability(log_slate)
            item_features = SlateItemFeatures(dm_train_features[si:ei][item_choices])
            sample = LogSample(
                context,
                metric,
                log_slate,
                log_reward,
                log_slate_prob,
                None,
                log_item_probs,
                tgt_slate_prob,
                None,
                tgt_item_probs,
                gt_reward,
                slot_weights,
                None,
                item_features,
            )
            samples.append(sample)
            samples_generated += 1
            if samples_generated % 1000 == 0:
                logging.info(
                    f"  samples generated: {samples_generated},"
                    f" {100 * samples_generated / total_samples:.1f}%"
                )
        tasks.append((estimators, SlateEstimatorInput(samples)))
    dt = time.perf_counter() - st
    logging.info(f"Generating log done: {total_samples} samples in {dt}s")

    logging.info("start evaluating...")
    st = time.perf_counter()
    evaluator = Evaluator(tasks, max_num_workers)
    Evaluator.report_results(evaluator.evaluate())
    logging.info(f"evaluating done in {time.perf_counter() - st}s")
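# A minimal driver sketch for the MSLR evaluate() above. The feature-set name
# "all_features" is an assumed attribute of the MSLRDatasets instance (the
# strings are resolved via getattr), and every numeric value is illustrative.
def run_mslr_evaluation(
    dataset, log_trainer, log_dist, tgt_trainer, tgt_dist, estimators
):
    evaluate(
        [(estimators, 1000)],
        dataset,
        slate_size=5,
        item_size=10,  # must exceed slate_size per the assert above
        metric_func="err",
        log_trainer=log_trainer,
        log_distribution=log_dist,
        log_features="all_features",
        tgt_trainer=tgt_trainer,
        tgt_distribution=tgt_dist,
        tgt_features="all_features",
        dm_features="all_features",
        max_num_workers=4,
    )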