Example 1
    def produce_shap_values(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> CallResult[container.DataFrame]:

        if self._needs_fit:
            self.fit()

        # avoid producing SHAP values on the training set (too computationally
        # intensive) - compare a hash of the incoming rows against the stored
        # hash of the training inputs
        check_rows = min(self._input_hash.shape[0], inputs.shape[0])
        if (pd.util.hash_pandas_object(
                inputs.head(check_rows)) == self._input_hash.head(check_rows)
            ).all():
            logger.info(
                "Not producing SHAP interpretations on train set because of computational considerations"
            )
            return CallResult(container.DataFrame([]))

        # drop any non-numeric columns
        num_cols = inputs.shape[1]
        inputs = inputs.select_dtypes(include="number")
        col_diff = num_cols - inputs.shape[1]
        if col_diff > 0:
            logger.warn(f"Removed {col_diff} unencoded columns.")

        # build the explainer from the first underlying fitted tree model
        explainer = shap.TreeExplainer(self._model._models[0].model)
        max_size = self.hyperparams["shap_max_dataset_size"]
        if inputs.shape[0] > max_size:
            logger.warning(
                f"There are more than {max_size} rows in the dataset, sub-sampling approximately {max_size} "
                + "representative rows on which to produce interpretations")
            df = self._shap_sub_sample(inputs)
            shap_values = explainer.shap_values(df)
        else:
            shap_values = explainer.shap_values(pd.DataFrame(inputs))

        if self._model.mode == "classification":
            # shap_values is a list with one array of values per class - report the
            # values for the class with the highest expected (base) value, i.e. the
            # most frequent class in the dataset
            logger.info(
                "Returning interpretability values offset from the most frequent class in the dataset"
            )
            shap_values = shap_values[np.argmax(explainer.expected_value)]

        output_df = container.DataFrame(shap_values, generate_metadata=True)
        for i, col in enumerate(inputs.columns):
            output_df.metadata = output_df.metadata.update_column(
                i, {"name": col})

        # group encoded columns back to their original source columns using the
        # source_column metadata entry so their contributions can be summed
        component_cols: Dict[str, List[int]] = {}
        for c in range(len(output_df.columns)):
            col_dict = dict(
                inputs.metadata.query((metadata_base.ALL_ELEMENTS, c)))
            if "source_column" in col_dict:
                src = col_dict["source_column"]
                if src not in component_cols:
                    component_cols[src] = []
                component_cols[src].append(c)

        # build the source column values and add them to the output
        for s, cc in component_cols.items():
            src_col = output_df.iloc[:, cc].sum(axis=1)
            src_col_index = len(output_df.columns)
            output_df.insert(src_col_index, s, src_col)
            output_df.metadata = output_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, src_col_index),
                "https://metadata.datadrivendiscovery.org/types/Attribute",
            )

        # update the top-level metadata so the column dimension reflects the
        # newly added source columns
        df_dict = dict(output_df.metadata.query(
            (metadata_base.ALL_ELEMENTS, )))
        dimension_dict = dict(
            output_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        dimension_dict["name"] = "columns"
        dimension_dict["semantic_types"] = (
            "https://metadata.datadrivendiscovery.org/types/TabularColumn", )
        dimension_dict["length"] = len(output_df.columns)
        df_dict["dimension"] = dimension_dict
        output_df.metadata = output_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(output_df)
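
The sub-sampling helper `_shap_sub_sample` referenced above is not shown here. A minimal sketch of what it might look like, assuming a plain random sample capped at the `shap_max_dataset_size` hyperparameter (the actual primitive may instead cluster rows to keep the sample representative):

    def _shap_sub_sample(self, inputs: container.DataFrame) -> pd.DataFrame:
        # hypothetical sketch: cap the rows used for SHAP interpretation by drawing
        # a random sample of at most shap_max_dataset_size rows
        max_size = self.hyperparams["shap_max_dataset_size"]
        df = pd.DataFrame(inputs)
        return df.sample(n=min(max_size, df.shape[0]), random_state=0)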
Example 2
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> CallResult[container.DataFrame]:

        logger.debug(f"Producing {__name__}")

        # force a fit if it hasn't yet been done
        if self._needs_fit:
            self.fit()

        # drop any non-numeric columns
        num_cols = inputs.shape[1]
        inputs = inputs.select_dtypes(include="number")
        col_diff = num_cols - inputs.shape[1]
        if col_diff > 0:
            logger.warning(
                f"Removed {col_diff} unencoded columns from produce data.")

        # create dataframe to hold the result
        result = self._model.predict(inputs.values)
        if len(self._target_cols) > 1:
            result_df = container.DataFrame()
            for i, c in enumerate(self._target_cols):
                col = container.DataFrame({c: result[:, i]})
                result_df = pd.concat([result_df, col], axis=1)
            for c in range(result_df.shape[1]):
                result_df.metadata = result_df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float")
        else:
            result_df = container.DataFrame({self._target_cols[0]: result},
                                            generate_metadata=True)
        # if we mapped label values earlier, map them back to the originals
        if len(self._label_map) > 0:
            # TODO label map will not work if there are multiple output columns.
            result_df[self._target_cols[0]] = result_df[
                self._target_cols[0]].map(self._label_map)
        # mark the semantic types on the dataframe
        for i, _ in enumerate(result_df.columns):
            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, i),
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            )
        if (self._model.mode == "classification"
                and self.hyperparams["compute_confidences"]):
            confidence = self._model.predict_proba(inputs.values)
            if self._binary:
                pos_column = (0 if self.hyperparams["pos_label"]
                              == self._label_map[0] else 1)
                result_df.insert(result_df.shape[1], "confidence",
                                 confidence[:, pos_column])
                result_df.metadata = result_df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                    "http://schema.org/Float",
                )
            else:
                # some metrics require per-class confidence scores, so reshape the
                # probabilities into long format: one (Class, confidence) row per
                # class per input row, indexed by the original row id
                confidence = pd.Series(confidence.tolist(), name="confidence")
                result_df = pd.concat([result_df, confidence], axis=1)

                confidences = [
                    item
                    for sublist in result_df["confidence"].values.tolist()
                    for item in sublist
                ]
                labels = np.array(
                    list(self._label_map.values()) * len(result_df))

                index = [
                    item for sublist in [[i] * len(np.unique(labels))
                                         for i in result_df.index]
                    for item in sublist
                ]
                result_df_temp = container.DataFrame()
                result_df_temp["Class"] = labels
                result_df_temp["confidence"] = confidences
                result_df_temp.metadata = result_df.metadata
                result_df_temp["index_temp"] = index
                result_df_temp = result_df_temp.set_index("index_temp")
                result_df = result_df_temp
                result_df.metadata = result_df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                    "https://metadata.datadrivendiscovery.org/types/FloatVector",
                )

            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                "https://metadata.datadrivendiscovery.org/types/Score",
            )
            result_df.metadata = result_df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, len(result_df.columns) - 1),
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            )

        logger.debug(f"\n{result_df}")
        return base.CallResult(result_df)
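
The multi-class confidence handling above flattens the `predict_proba` matrix into long format before returning it. A small self-contained sketch of that reshaping, with made-up class labels and probabilities (all names and values here are illustrative, not taken from the primitive):

import numpy as np
import pandas as pd

# hypothetical 3-class probabilities for two input rows, shaped as predict_proba returns them
confidence = np.array([[0.7, 0.2, 0.1],
                       [0.1, 0.3, 0.6]])
label_map = {0: "cat", 1: "dog", 2: "fish"}  # assumed encoded-label -> original-label mapping

# one (Class, confidence) row per class per input row, indexed by the original row id
labels = np.array(list(label_map.values()) * confidence.shape[0])
index = [i for i in range(confidence.shape[0]) for _ in label_map]
long_df = pd.DataFrame({"Class": labels, "confidence": confidence.ravel()}, index=index)
print(long_df)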