def produce_shap_values(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> CallResult[container.DataFrame]:
    """Produce SHAP feature attributions for the supplied inputs."""
    if self._needs_fit:
        self.fit()

    # don't produce SHAP values on the training set - it is too computationally intensive
    check_rows = min(self._input_hash.shape[0], inputs.shape[0])
    if (
        pd.util.hash_pandas_object(inputs.head(check_rows))
        == self._input_hash.head(check_rows)
    ).all():
        logger.info(
            "Not producing SHAP interpretations on train set because of computational considerations"
        )
        return CallResult(container.DataFrame([]))

    # drop any non-numeric columns
    num_cols = inputs.shape[1]
    inputs = inputs.select_dtypes(include="number")
    col_diff = num_cols - inputs.shape[1]
    if col_diff > 0:
        logger.warning(f"Removed {col_diff} unencoded columns.")

    explainer = shap.TreeExplainer(self._model._models[0].model)
    max_size = self.hyperparams["shap_max_dataset_size"]
    if inputs.shape[0] > max_size:
        logger.warning(
            f"There are more than {max_size} rows in the dataset - sub-sampling "
            + f"~{max_size} representative rows on which to produce interpretations"
        )
        df = self._shap_sub_sample(inputs)
        shap_values = explainer.shap_values(df)
    else:
        shap_values = explainer.shap_values(pd.DataFrame(inputs))

    if self._model.mode == "classification":
        logger.info(
            "Returning interpretability values offset from most frequent class in dataset"
        )
        shap_values = shap_values[np.argmax(explainer.expected_value)]

    output_df = container.DataFrame(shap_values, generate_metadata=True)
    for i, col in enumerate(inputs.columns):
        output_df.metadata = output_df.metadata.update_column(i, {"name": col})

    # group encoded component columns by their source column
    component_cols: Dict[str, List[int]] = {}
    for c in range(0, len(output_df.columns)):
        col_dict = dict(inputs.metadata.query((metadata_base.ALL_ELEMENTS, c)))
        if "source_column" in col_dict:
            src = col_dict["source_column"]
            if src not in component_cols:
                component_cols[src] = []
            component_cols[src].append(c)

    # build the source column values and add them to the output
    for s, cc in component_cols.items():
        src_col = output_df.iloc[:, cc].apply(lambda x: sum(x), axis=1)
        src_col_index = len(output_df.columns)
        output_df.insert(src_col_index, s, src_col)
        output_df.metadata = output_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, src_col_index),
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )

    # refresh the column dimension metadata to reflect the added columns
    df_dict = dict(output_df.metadata.query((metadata_base.ALL_ELEMENTS,)))
    df_dict_1 = dict(output_df.metadata.query((metadata_base.ALL_ELEMENTS,)))
    df_dict["dimension"] = df_dict_1
    df_dict_1["name"] = "columns"
    df_dict_1["semantic_types"] = (
        "https://metadata.datadrivendiscovery.org/types/TabularColumn",
    )
    df_dict_1["length"] = len(output_df.columns)
    output_df.metadata = output_df.metadata.update(
        (metadata_base.ALL_ELEMENTS,), df_dict
    )

    return CallResult(output_df)
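# Illustrative sketch (not part of the primitive): with the legacy shap API used
# here, TreeExplainer on a multi-class forest returns one array of SHAP values per
# class, and `expected_value` holds one base value per class, so
# `np.argmax(explainer.expected_value)` selects the class with the largest base
# rate, i.e. the most frequent class. `fitted_forest` and `X` below are hypothetical:
#
#     explainer = shap.TreeExplainer(fitted_forest)
#     per_class_values = explainer.shap_values(X)   # list of (n_rows, n_cols) arrays
#     values = per_class_values[np.argmax(explainer.expected_value)]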
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> CallResult[container.DataFrame]:
    """Produce predictions (and optionally confidences) for the supplied inputs."""
    logger.debug(f"Producing {__name__}")

    # force a fit if it hasn't yet been done
    if self._needs_fit:
        self.fit()

    # drop any non-numeric columns
    num_cols = inputs.shape[1]
    inputs = inputs.select_dtypes(include="number")
    col_diff = num_cols - inputs.shape[1]
    if col_diff > 0:
        logger.warning(f"Removed {col_diff} unencoded columns from produce data.")

    # create dataframe to hold the result
    result = self._model.predict(inputs.values)
    if len(self._target_cols) > 1:
        result_df = container.DataFrame()
        for i, c in enumerate(self._target_cols):
            col = container.DataFrame({c: result[:, i]})
            result_df = pd.concat([result_df, col], axis=1)
        for c in range(result_df.shape[1]):
            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float"
            )
    else:
        result_df = container.DataFrame(
            {self._target_cols[0]: result}, generate_metadata=True
        )

    # if we mapped values earlier, map them back
    if len(self._label_map) > 0:
        # TODO label map will not work if there are multiple output columns.
        result_df[self._target_cols[0]] = result_df[self._target_cols[0]].map(
            self._label_map
        )

    # mark the semantic types on the dataframe
    for i, _ in enumerate(result_df.columns):
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, i),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

    if (
        self._model.mode == "classification"
        and self.hyperparams["compute_confidences"]
    ):
        confidence = self._model.predict_proba(inputs.values)
        if self._binary:
            pos_column = (
                0 if self.hyperparams["pos_label"] == self._label_map[0] else 1
            )
            result_df.insert(
                result_df.shape[1], "confidence", confidence[:, pos_column]
            )
            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                "http://schema.org/Float",
            )
        else:
            # add confidence scores as some metrics require them.
            confidence = pd.Series(confidence.tolist(), name="confidence")
            result_df = pd.concat([result_df, confidence], axis=1)

            # explode the per-row confidence lists into one row per (row, class) pair
            confidences = [
                item
                for sublist in result_df["confidence"].values.tolist()
                for item in sublist
            ]
            labels = np.array(list(self._label_map.values()) * len(result_df))
            index = [
                item
                for sublist in [
                    [i] * len(np.unique(labels)) for i in result_df.index
                ]
                for item in sublist
            ]
            result_df_temp = container.DataFrame()
            result_df_temp["Class"] = labels
            result_df_temp["confidence"] = confidences
            result_df_temp.metadata = result_df.metadata
            result_df_temp["index_temp"] = index
            result_df_temp = result_df_temp.set_index("index_temp")
            result_df = result_df_temp
            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
            )

        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
            "https://metadata.datadrivendiscovery.org/types/Score",
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

    logger.debug(f"\n{result_df}")

    return base.CallResult(result_df)
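# Minimal usage sketch (hypothetical names, not part of this module): assuming the
# enclosing primitive class is `ForestPrimitive`, `hyperparams_class` is its
# hyperparams class, and `train_df`, `targets_df`, and `test_df` are
# container.DataFrames prepared upstream:
#
#     primitive = ForestPrimitive(hyperparams=hyperparams_class.defaults())
#     primitive.set_training_data(inputs=train_df, outputs=targets_df)
#     primitive.fit()
#     predictions = primitive.produce(inputs=test_df).value
#     shap_df = primitive.produce_shap_values(inputs=test_df).value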