def evaluate(
    self,
    test_dataset: data.TFRecordDataset,
    inference_signature: str = None,
    additional_features: dict = None,
    group_metrics_min_queries: int = 50,
    logs_dir: Optional[str] = None,
    logging_frequency: int = 25,
    compute_intermediate_stats: bool = True,
):
    """
    Evaluate the RelevanceModel

    Parameters
    ----------
    test_dataset : `TFRecordDataset` object
        An instance of tf.data.Dataset to evaluate on
    inference_signature : str, optional
        If using a SavedModel for prediction, specify the inference signature
        to be used for computing scores
    additional_features : dict, optional
        Dictionary mapping new feature names to the functions that compute them.
        Use this to compute additional features from the scores, for example,
        converting the ranking scores for each document into ranks for the query
    group_metrics_min_queries : int, optional
        Minimum query count threshold per group to be considered for computing
        groupwise metrics
    logs_dir : str, optional
        Path to the directory to save logs
    logging_frequency : int
        Value representing how often (in batches) to log status
    compute_intermediate_stats : bool
        [Currently ignored] Determines if group metrics and other intermediate
        stats on the test set should be computed

    Returns
    -------
    df_overall_metrics : `pd.DataFrame` object
        `pd.DataFrame` containing overall metrics
    df_groupwise_metrics : `pd.DataFrame` object
        `pd.DataFrame` containing groupwise metrics if group metrics keys
        are defined in the FeatureConfig
    metrics_dict : dict
        Metrics as a dictionary of metric names mapping to values

    Notes
    -----
    You can directly do a `model.evaluate()` only if the keras model is compiled.

    Override this method to implement your own evaluation metrics.
    """
    # Avoid sharing a mutable default argument across calls
    additional_features = additional_features or {}

    metrics_dict = dict()
    group_metrics_keys = self.feature_config.get_group_metrics_keys()
    evaluation_features = (
        group_metrics_keys
        + [
            self.feature_config.get_query_key(),
            self.feature_config.get_label(),
            self.feature_config.get_rank(),
        ]
        + [
            f
            for f in self.feature_config.get_secondary_labels()
            if f.get("node_name", f["name"])
            not in self.feature_config.get_group_metrics_keys("node_name")
        ]
    )
    additional_features[RankingConstants.NEW_RANK] = prediction_helper.convert_score_to_rank

    _predict_fn = get_predict_fn(
        model=self.model,
        tfrecord_type=self.tfrecord_type,
        feature_config=self.feature_config,
        inference_signature=inference_signature,
        is_compiled=self.is_compiled,
        output_name=self.output_name,
        features_to_return=evaluation_features,
        additional_features=additional_features,
        max_sequence_size=self.max_sequence_size,
    )

    batch_count = 0
    df_grouped_stats = pd.DataFrame()
    for predictions_dict in test_dataset.map(_predict_fn).take(-1):
        predictions_df = pd.DataFrame(predictions_dict)

        df_batch_grouped_stats = metrics_helper.get_grouped_stats(
            df=predictions_df,
            query_key_col=self.feature_config.get_query_key("node_name"),
            label_col=self.feature_config.get_label("node_name"),
            old_rank_col=self.feature_config.get_rank("node_name"),
            new_rank_col=RankingConstants.NEW_RANK,
            group_keys=self.feature_config.get_group_metrics_keys("node_name"),
            secondary_labels=self.feature_config.get_secondary_labels("node_name"),
        )
        if df_grouped_stats.empty:
            df_grouped_stats = df_batch_grouped_stats
        else:
            df_grouped_stats = df_grouped_stats.add(
                df_batch_grouped_stats, fill_value=0.0
            )

        batch_count += 1
        if batch_count % logging_frequency == 0:
            self.logger.info("Finished evaluating {} batches".format(batch_count))

    # Compute overall metrics
    df_overall_metrics = metrics_helper.summarize_grouped_stats(df_grouped_stats)
    self.logger.info("Overall Metrics: \n{}".format(df_overall_metrics))

    # Log metrics to weights and biases
    metrics_dict.update(
        {"test_{}".format(k): v for k, v in df_overall_metrics.to_dict().items()}
    )

    df_group_metrics = None
    df_group_metrics_summary = None
    if group_metrics_keys:
        # Filter groups by min_query_count
        df_grouped_stats = df_grouped_stats[
            df_grouped_stats["query_count"] >= group_metrics_min_queries
        ]

        # Compute group metrics
        df_group_metrics = df_grouped_stats.apply(
            metrics_helper.summarize_grouped_stats, axis=1
        )
        if logs_dir:
            self.file_io.write_df(
                df_group_metrics,
                outfile=os.path.join(
                    logs_dir, RelevanceModelConstants.GROUP_METRICS_CSV_FILE
                ),
            )

        # Compute group metrics summary
        df_group_metrics_summary = df_group_metrics.describe()
        self.logger.info(
            "Computing group metrics using keys: {}".format(
                self.feature_config.get_group_metrics_keys("node_name")
            )
        )
        self.logger.info(
            "Groupwise Metrics: \n{}".format(df_group_metrics_summary.T)
        )

        # Log metrics to weights and biases
        metrics_dict.update(
            {
                "test_group_mean_{}".format(k): v
                for k, v in df_group_metrics_summary.T["mean"].to_dict().items()
            }
        )

    return df_overall_metrics, df_group_metrics, metrics_dict
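# Minimal, self-contained sketch (not part of the library source) of the batch-wise
# accumulation pattern used in evaluate() above: per-batch grouped stats are summed
# into a running DataFrame with `DataFrame.add(..., fill_value=0.0)` so groups that
# are missing from a batch contribute 0 instead of producing NaN. The column names
# ("query_count", "clicks") are illustrative stand-ins for whatever columns
# metrics_helper.get_grouped_stats actually returns.
import pandas as pd


def accumulate_grouped_stats(batches):
    """Sum per-batch grouped stats into a single running DataFrame."""
    running = pd.DataFrame()
    for df_batch in batches:
        if running.empty:
            running = df_batch
        else:
            # fill_value=0.0 treats groups absent from one side as zero counts
            running = running.add(df_batch, fill_value=0.0)
    return running


batch_1 = pd.DataFrame(
    {"query_count": [2, 3], "clicks": [1, 2]}, index=["domain_a", "domain_b"]
)
batch_2 = pd.DataFrame({"query_count": [4], "clicks": [3]}, index=["domain_b"])

# domain_a keeps its batch_1 counts; domain_b is summed across both batches
print(accumulate_grouped_stats([batch_1, batch_2]))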
def evaluate(
    self,
    test_dataset: data.TFRecordDataset,
    inference_signature: str = None,
    additional_features: dict = None,
    group_metrics_min_queries: int = 50,
    logs_dir: Optional[str] = None,
    logging_frequency: int = 25,
):
    """
    Evaluate the ranking model

    Args:
        test_dataset: an instance of tf.data.Dataset to evaluate on
        inference_signature: if using a SavedModel for prediction, specify the
            inference signature to be used for computing scores
        additional_features: dictionary mapping new feature names to the functions
            that compute them from the scores
        group_metrics_min_queries: minimum query count threshold per group to be
            considered for computing groupwise metrics
        logs_dir: path to the directory to save logs
        logging_frequency: integer representing how often (in batches) to log status

    Returns:
        overall metrics and groupwise metrics as pandas DataFrames
    """
    # Avoid sharing a mutable default argument across calls
    additional_features = additional_features or {}

    group_metrics_keys = self.feature_config.get_group_metrics_keys()
    evaluation_features = (
        group_metrics_keys
        + [
            self.feature_config.get_query_key(),
            self.feature_config.get_label(),
            self.feature_config.get_rank(),
        ]
        + [
            f
            for f in self.feature_config.get_secondary_labels()
            if f.get("node_name", f["name"])
            not in self.feature_config.get_group_metrics_keys("node_name")
        ]
    )
    additional_features[RankingConstants.NEW_RANK] = prediction_helper.convert_score_to_rank

    _predict_fn = get_predict_fn(
        model=self.model,
        tfrecord_type=self.tfrecord_type,
        feature_config=self.feature_config,
        inference_signature=inference_signature,
        is_compiled=self.is_compiled,
        output_name=self.output_name,
        features_to_return=evaluation_features,
        additional_features=additional_features,
        max_sequence_size=self.max_sequence_size,
    )

    batch_count = 0
    df_grouped_stats = pd.DataFrame()
    for predictions_dict in test_dataset.map(_predict_fn).take(-1):
        predictions_df = pd.DataFrame(predictions_dict)

        df_batch_grouped_stats = metrics_helper.get_grouped_stats(
            df=predictions_df,
            query_key_col=self.feature_config.get_query_key("node_name"),
            label_col=self.feature_config.get_label("node_name"),
            old_rank_col=self.feature_config.get_rank("node_name"),
            new_rank_col=RankingConstants.NEW_RANK,
            group_keys=self.feature_config.get_group_metrics_keys("node_name"),
            secondary_labels=self.feature_config.get_secondary_labels("node_name"),
        )
        if df_grouped_stats.empty:
            df_grouped_stats = df_batch_grouped_stats
        else:
            df_grouped_stats = df_grouped_stats.add(
                df_batch_grouped_stats, fill_value=0.0
            )

        batch_count += 1
        if batch_count % logging_frequency == 0:
            self.logger.info("Finished evaluating {} batches".format(batch_count))

    # Compute overall metrics
    df_overall_metrics = metrics_helper.summarize_grouped_stats(df_grouped_stats)
    self.logger.info("Overall Metrics: \n{}".format(df_overall_metrics))

    df_group_metrics = None
    df_group_metrics_summary = None
    if group_metrics_keys:
        # Filter groups by min_query_count
        df_grouped_stats = df_grouped_stats[
            df_grouped_stats["query_count"] >= group_metrics_min_queries
        ]

        # Compute group metrics
        df_group_metrics = df_grouped_stats.apply(
            metrics_helper.summarize_grouped_stats, axis=1
        )
        if logs_dir:
            self.file_io.write_df(
                df_group_metrics,
                outfile=os.path.join(
                    logs_dir, RelevanceModelConstants.GROUP_METRICS_CSV_FILE
                ),
            )

        # Compute group metrics summary
        df_group_metrics_summary = df_group_metrics.describe()
        self.logger.info(
            "Computing group metrics using keys: {}".format(
                self.feature_config.get_group_metrics_keys("node_name")
            )
        )
        self.logger.info(
            "Groupwise Metrics: \n{}".format(df_group_metrics_summary.T)
        )

    return df_overall_metrics, df_group_metrics
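# Self-contained sketch (not library code) of the groupwise-metrics step in the
# methods above: drop groups with too few queries, summarize each remaining group
# row-wise with DataFrame.apply(..., axis=1), then call describe() to get the
# summary that gets logged. The summarize function and column names below are
# simplified, hypothetical stand-ins for metrics_helper.summarize_grouped_stats.
import pandas as pd


def summarize_row(row):
    # Hypothetical per-group summary: a click-through-style ratio
    return pd.Series({"ctr": row["clicks"] / row["query_count"]})


df_grouped_stats = pd.DataFrame(
    {"query_count": [120, 15, 300], "clicks": [30, 5, 90]},
    index=["group_a", "group_b", "group_c"],
)

group_metrics_min_queries = 50

# Filter groups by minimum query count, then compute one summary row per group
df_filtered = df_grouped_stats[
    df_grouped_stats["query_count"] >= group_metrics_min_queries
]
df_group_metrics = df_filtered.apply(summarize_row, axis=1)

print(df_group_metrics)
# Transposed describe() gives count/mean/std/min/max per metric, as logged above
print(df_group_metrics.describe().T)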