def execute_inspections(op_id, caller_filename, lineno,
                        optional_code_reference, optional_source_code):
            """Run the backend hooks around ``add_constant`` and register its DAG node.

            ``args``, ``kwargs`` and ``original`` are not parameters of this
            function; they are read from the enclosing scope (not visible here,
            presumably the patch function that wraps this closure).

            The first positional argument is swapped for its annotated
            counterpart before ``original`` is called. Returns the
            ``result_data`` of the annotated object produced by
            ``SklearnBackend.after_call``.
            """
            func_info = FunctionInfo('statsmodel.api', 'add_constant')
            source = get_input_info(args[0], caller_filename, lineno, func_info,
                                    optional_code_reference, optional_source_code)
            context = OperatorContext(OperatorType.PROJECTION_MODIFY, func_info)

            annotated_inputs = SklearnBackend.before_call(
                context, [source.annotated_dfobject])
            raw_output = original(annotated_inputs[0].result_data, *args[1:], **kwargs)
            outcome = SklearnBackend.after_call(context, annotated_inputs, raw_output)
            new_return_value = outcome.annotated_dfobject.result_data

            location = BasicCodeLocation(caller_filename, lineno)
            details = DagNodeDetails("Adds const column", ["array"])
            code_info = get_optional_code_info_or_none(optional_code_reference,
                                                       optional_source_code)
            node = DagNode(op_id, location, context, details, code_info)
            # The only parent is the node that produced the first argument.
            add_dag_node(node, [source.dag_node], outcome)

            return new_return_value
Exemple #2
0
    def patched_fit_transform(self, *args, **kwargs):
        """ Patch for ('example_pipelines.healthcare.healthcare_utils.MyW2VTransformer', 'fit_transform')

        Calls the original ``fit_transform`` on the annotated version of the
        first positional argument, between ``SklearnBackend.before_call`` and
        ``SklearnBackend.after_call``, registers a TRANSFORMER DAG node and
        returns the ``result_data`` of the backend's annotated result.

        ``self.mlinspect_fit_transform_active`` is set to True while the call
        is running and is always reset to False afterwards, including when the
        wrapped call or one of the hooks raises. Without the reset a failed
        call would leave the flag stuck at True (presumably it is consulted by
        the sibling fit/transform patches -- verify against those patches).

        Raises:
            AssertionError: if the backend result is not an ``MlinspectNdarray``.
        """
        # pylint: disable=no-method-argument
        self.mlinspect_fit_transform_active = True  # pylint: disable=attribute-defined-outside-init
        try:
            original = gorilla.get_original_attribute(
                healthcare_utils.MyW2VTransformer, 'fit_transform')
            function_info = FunctionInfo(
                'example_pipelines.healthcare.healthcare_utils',
                'MyW2VTransformer')
            input_info = get_input_info(args[0], self.mlinspect_caller_filename,
                                        self.mlinspect_lineno, function_info,
                                        self.mlinspect_optional_code_reference,
                                        self.mlinspect_optional_source_code)

            operator_context = OperatorContext(OperatorType.TRANSFORMER,
                                               function_info)
            input_infos = SklearnBackend.before_call(
                operator_context, [input_info.annotated_dfobject])
            # The annotated input replaces args[0]; remaining arguments pass through.
            result = original(self, input_infos[0].result_data, *args[1:],
                              **kwargs)
            backend_result = SklearnBackend.after_call(operator_context,
                                                       input_infos, result)
            new_return_value = backend_result.annotated_dfobject.result_data
            assert isinstance(new_return_value, MlinspectNdarray)
            dag_node = DagNode(
                singleton.get_next_op_id(),
                BasicCodeLocation(self.mlinspect_caller_filename,
                                  self.mlinspect_lineno), operator_context,
                DagNodeDetails("Word2Vec: fit_transform", ['array']),
                get_optional_code_info_or_none(
                    self.mlinspect_optional_code_reference,
                    self.mlinspect_optional_source_code))
            add_dag_node(dag_node, [input_info.dag_node], backend_result)
            return new_return_value
        finally:
            # Reset on every exit path so an exception cannot leave the flag set.
            self.mlinspect_fit_transform_active = False  # pylint: disable=attribute-defined-outside-init
Exemple #3
0
        def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
            """ Execute inspections, add DAG node

            Wraps ``DataFrame.merge``: ``self`` (left frame) and ``args[0]``
            (right frame) are read from the enclosing scope, replaced by their
            annotated counterparts, merged with ``original`` and the result is
            registered as a JOIN DAG node with both inputs as parents.

            The node description is derived from the join key arguments. The
            merge has already run at that point, so a missing ``on`` keyword
            must not raise: ``on`` may also be given positionally, or the join
            may use ``left_on``/``right_on``; when no key can be determined the
            description is None.

            Returns the ``result_data`` of the backend's annotated result.
            """
            function_info = FunctionInfo('pandas.core.frame', 'merge')

            input_info_a = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference,
                                          optional_source_code)
            input_info_b = get_input_info(args[0], caller_filename, lineno, function_info, optional_code_reference,
                                          optional_source_code)
            operator_context = OperatorContext(OperatorType.JOIN, function_info)
            input_infos = PandasBackend.before_call(operator_context, [input_info_a.annotated_dfobject,
                                                                       input_info_b.annotated_dfobject])
            # Both frames are swapped for their annotated versions; args[0] is the right frame,
            # so only args[1:] are forwarded.
            result = original(input_infos[0].result_data, input_infos[1].result_data, *args[1:], **kwargs)
            backend_result = PandasBackend.after_call(operator_context,
                                                      input_infos,
                                                      result)
            result = backend_result.annotated_dfobject.result_data
            if 'on' in kwargs:
                description = "on '{}'".format(kwargs['on'])
            elif len(args) > 2:
                # merge(right, how, on, ...): `on` is the third positional argument
                description = "on '{}'".format(args[2])
            elif 'left_on' in kwargs or 'right_on' in kwargs:
                description = "on '{}' == '{}'".format(kwargs.get('left_on'), kwargs.get('right_on'))
            else:
                # e.g. index joins or pandas' default of joining on the common columns
                description = None
            dag_node = DagNode(op_id,
                               BasicCodeLocation(caller_filename, lineno),
                               operator_context,
                               DagNodeDetails(description, list(result.columns)),
                               get_optional_code_info_or_none(optional_code_reference, optional_source_code))
            add_dag_node(dag_node, [input_info_a.dag_node, input_info_b.dag_node], backend_result)

            return result
Exemple #4
0
        def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
            """Run the backend hooks around a groupby ``agg`` and register its DAG node.

            ``self`` (the groupby object), ``args``, ``kwargs`` and ``original``
            are read from the enclosing scope. The parent DAG node is looked up
            through the ``_mlinspect_dag_node`` id stored on ``self``.

            Raises:
                NotImplementedError: if ``self`` carries no ``_mlinspect_dag_node``.

            Returns the ``result_data`` of the backend's annotated result.
            """
            function_info = FunctionInfo('pandas.core.groupby.generic', 'agg')
            if not hasattr(self, '_mlinspect_dag_node'):
                raise NotImplementedError("TODO: Support agg if groupby happened in external code")
            parent_node = get_dag_node_for_id(self._mlinspect_dag_node)  # pylint: disable=no-member

            context = OperatorContext(OperatorType.GROUP_BY_AGG, function_info)
            # The backend receives no annotated inputs here; the call runs on `self` directly.
            annotated_inputs = PandasBackend.before_call(context, [])
            aggregated = original(self, *args, **kwargs)
            outcome = PandasBackend.after_call(context, annotated_inputs, aggregated)

            group_key = aggregated.index.name
            # Describe the aggregation by whichever argument form the caller used.
            aggregate_spec = args if args else kwargs
            description = "Groupby '{}', Aggregate: '{}'".format(group_key, aggregate_spec)
            columns = [group_key, *aggregated.columns]

            location = BasicCodeLocation(caller_filename, lineno)
            details = DagNodeDetails(description, columns)
            code_info = get_optional_code_info_or_none(optional_code_reference, optional_source_code)
            add_dag_node(DagNode(op_id, location, context, details, code_info), [parent_node], outcome)

            return outcome.annotated_dfobject.result_data
Exemple #5
0
        def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
            """Run the backend hooks around ``DataFrame.replace`` and register its DAG node.

            ``self``, ``args``, ``kwargs`` and ``original`` are read from the
            enclosing scope. The call is executed on the annotated version of
            ``self``; the node description is built from the first two
            positional arguments.

            Raises:
                NotImplementedError: if ``args[0]`` is a dict.

            Returns the ``result_data`` of the backend's annotated result.
            """
            function_info = FunctionInfo('pandas.core.frame', 'replace')

            source = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference,
                                    optional_source_code)
            context = OperatorContext(OperatorType.PROJECTION_MODIFY, function_info)
            annotated_inputs = PandasBackend.before_call(context, [source.annotated_dfobject])
            replaced = original(annotated_inputs[0].result_data, *args, **kwargs)
            outcome = PandasBackend.after_call(context, annotated_inputs, replaced)
            result = outcome.annotated_dfobject.result_data

            # NOTE(review): the dict check runs only after the replace and the backend hooks
            # have already executed -- confirm whether it should be rejected up front.
            to_replace = args[0]
            if isinstance(to_replace, dict):
                raise NotImplementedError("TODO: Add support for replace with dicts")
            description = "Replace '{}' with '{}'".format(to_replace, args[1])

            location = BasicCodeLocation(caller_filename, lineno)
            details = DagNodeDetails(description, list(result.columns))
            code_info = get_optional_code_info_or_none(optional_code_reference, optional_source_code)
            add_dag_node(DagNode(op_id, location, context, details, code_info), [source.dag_node], outcome)

            return result
Exemple #6
0
        def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
            """Run the backend hooks around ``DataFrame.__setitem__`` and register its DAG node.

            ``self``, ``args``, ``kwargs`` and ``original`` are read from the
            enclosing scope. Only string keys are handled.

            Raises:
                NotImplementedError: if ``args[0]`` is not a ``str``.

            Returns whatever ``original`` returned.
            """
            # pylint: disable=too-many-locals
            function_info = FunctionInfo('pandas.core.frame', '__setitem__')
            context = OperatorContext(OperatorType.PROJECTION_MODIFY, function_info)

            source = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference,
                                    optional_source_code)

            key = args[0]
            if not isinstance(key, str):
                raise NotImplementedError("TODO: Handling __setitem__ for key type {}".format(type(key)))

            # after_call receives a deep copy of the inputs taken before `original` runs
            # (presumably because the assignment mutates `self` in place -- confirm).
            annotated_inputs = copy.deepcopy(PandasBackend.before_call(context, [source.annotated_dfobject]))
            result = original(self, *args, **kwargs)
            # `self` is passed as the operator output rather than `result`.
            outcome = PandasBackend.after_call(context, annotated_inputs, self)
            columns = list(self.columns)  # pylint: disable=no-member
            description = "modifies {}".format([key])

            location = BasicCodeLocation(caller_filename, lineno)
            details = DagNodeDetails(description, columns)
            code_info = get_optional_code_info_or_none(optional_code_reference, optional_source_code)
            add_dag_node(DagNode(op_id, location, context, details, code_info), [source.dag_node], outcome)
            assert hasattr(self, "_mlinspect_annotation")
            return result
Exemple #7
0
        def execute_inspections(_, caller_filename, lineno, optional_code_reference, optional_source_code):
            """Run ``DataFrame.groupby`` and tag the result with the input's DAG node id.

            No DAG node is added for the groupby itself and the op id argument
            is ignored. ``self``, ``args``, ``kwargs`` and ``original`` are read
            from the enclosing scope.

            Returns the object produced by ``original`` with
            ``_mlinspect_dag_node`` set to the node id of the input frame.
            """
            source = get_input_info(self, caller_filename, lineno,
                                    FunctionInfo('pandas.core.frame', 'groupby'),
                                    optional_code_reference, optional_source_code)
            grouped = original(self, *args, **kwargs)
            # Remember which node produced the grouped frame so a later step can link to it.
            grouped._mlinspect_dag_node = source.dag_node.node_id  # pylint: disable=protected-access

            return grouped
Exemple #8
0
 def get_relevant_nodes_and_histograms(self, inspection_result):
     """Collect the per-node histograms and the DAG nodes this check looks at.

     A node is relevant when its operator is a JOIN or SELECTION, or when it
     is a ``SimpleImputer`` whose columns overlap ``self.sensitive_columns``.

     Returns:
         A ``(histograms, relevant_nodes)`` tuple: a dict mapping every DAG
         node to its ``HistogramForColumns`` inspection result, and the list
         of relevant nodes.
     """
     dag = inspection_result.dag
     histograms = {
         dag_node: node_results[HistogramForColumns(self.sensitive_columns)]
         for dag_node, node_results in inspection_result.dag_node_to_inspection_results.items()
     }

     relevant_nodes = []
     for candidate in dag.nodes:
         operator_info = candidate.operator_info
         if operator_info.operator in {OperatorType.JOIN, OperatorType.SELECTION} or (
                 operator_info.function_info == FunctionInfo('sklearn.impute._base', 'SimpleImputer')
                 and set(candidate.details.columns).intersection(self.sensitive_columns)):
             relevant_nodes.append(candidate)
     return histograms, relevant_nodes
Exemple #9
0
        def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
            """Run the backend hooks around the ``DataFrame`` constructor and register a data-source node.

            ``self``, ``args``, ``kwargs`` and ``original`` are read from the
            enclosing scope. ``original`` initialises ``self`` in place, so
            ``self`` is what gets handed to the backend as the result. The DAG
            node has no parents and no description. Returns None.
            """
            context = OperatorContext(OperatorType.DATA_SOURCE,
                                      FunctionInfo('pandas.core.frame', 'DataFrame'))
            no_inputs = PandasBackend.before_call(context, [])
            original(self, *args, **kwargs)
            outcome = PandasBackend.after_call(context, no_inputs, self)

            columns = list(self.columns)  # pylint: disable=no-member
            location = BasicCodeLocation(caller_filename, lineno)
            details = DagNodeDetails(None, columns)
            code_info = get_optional_code_info_or_none(optional_code_reference, optional_source_code)
            add_dag_node(DagNode(op_id, location, context, details, code_info), [], outcome)
Exemple #10
0
        def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
            """Run the backend hooks around ``DataFrame.__getitem__`` and register its DAG node.

            ``self``, ``args``, ``kwargs`` and ``original`` are read from the
            enclosing scope. The key ``args[0]`` decides the operator type:

            * ``str``: PROJECTION to that single column
            * ``list`` whose first element is a ``str``: PROJECTION to those columns
            * ``pandas.Series``: SELECTION, keeping all columns of ``self``

            Raises:
                NotImplementedError: for any other key type.

            Returns the ``result_data`` of the backend's annotated result.
            """
            function_info = FunctionInfo('pandas.core.frame', '__getitem__')
            source = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference,
                                    optional_source_code)

            key = args[0]
            if isinstance(key, str):  # Projection to Series
                operator_type = OperatorType.PROJECTION
                columns = [key]
                description = "to {}".format(columns)
            elif isinstance(key, list) and isinstance(key[0], str):  # Projection to DF
                operator_type = OperatorType.PROJECTION
                columns = key
                description = "to {}".format(columns)
            elif isinstance(key, pandas.Series):  # Selection
                operator_type = OperatorType.SELECTION
                columns = list(self.columns)  # pylint: disable=no-member
                description = ("Select by Series: {}".format(optional_source_code)
                               if optional_source_code else "Select by Series")
            else:
                raise NotImplementedError()

            # The node is built before the call but only added once the backend result exists.
            context = OperatorContext(operator_type, function_info)
            dag_node = DagNode(op_id,
                               BasicCodeLocation(caller_filename, lineno),
                               context,
                               DagNodeDetails(description, columns),
                               get_optional_code_info_or_none(optional_code_reference, optional_source_code))

            annotated_inputs = PandasBackend.before_call(context, [source.annotated_dfobject])
            selected = original(annotated_inputs[0].result_data, *args, **kwargs)
            outcome = PandasBackend.after_call(context, annotated_inputs, selected)
            add_dag_node(dag_node, [source.dag_node], outcome)

            return outcome.annotated_dfobject.result_data
Exemple #11
0
        def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
            """ Execute inspections, add DAG node

            Wraps ``pandas.read_csv``: ``args``, ``kwargs`` and ``original`` are
            read from the enclosing scope. The result is registered as a
            DATA_SOURCE DAG node without parents.

            The node description is the last path component of the CSV source.
            The source is taken from the first positional argument or, if none
            was given, from the ``filepath_or_buffer`` keyword; ``os.PathLike``
            objects are converted to their string path. For anything that is
            not a string path (e.g. an open buffer) the description is None.
            The file has already been read at that point, so building the
            description must not raise.

            Returns the object returned by ``original``.
            """
            function_info = FunctionInfo('pandas.io.parsers', 'read_csv')

            operator_context = OperatorContext(OperatorType.DATA_SOURCE, function_info)
            input_infos = PandasBackend.before_call(operator_context, [])
            result = original(*args, **kwargs)
            backend_result = PandasBackend.after_call(operator_context,
                                                      input_infos,
                                                      result)

            source = args[0] if args else kwargs.get('filepath_or_buffer')
            if isinstance(source, os.PathLike):
                source = os.fspath(source)
            if isinstance(source, str):
                description = "{}".format(source.split(os.path.sep)[-1])
            else:
                # Buffers and other non-string sources have no file name to show.
                description = None
            dag_node = DagNode(op_id,
                               BasicCodeLocation(caller_filename, lineno),
                               operator_context,
                               DagNodeDetails(description, list(result.columns)),
                               get_optional_code_info_or_none(optional_code_reference, optional_source_code))
            add_dag_node(dag_node, [], backend_result)
            return result
Exemple #12
0
        def execute_inspections(op_id, caller_filename, lineno,
                                optional_code_reference, optional_source_code):
            """Run the backend hooks around ``numpy.random.random`` and register a data-source node.

            ``args``, ``kwargs`` and ``original`` are read from the enclosing
            scope. The DAG node has no parents. Returns the ``result_data`` of
            the backend's annotated result.
            """
            context = OperatorContext(OperatorType.DATA_SOURCE,
                                      FunctionInfo('numpy.random', 'random'))
            no_inputs = SklearnBackend.before_call(context, [])
            generated = original(*args, **kwargs)
            outcome = SklearnBackend.after_call(context, no_inputs, generated)

            location = BasicCodeLocation(caller_filename, lineno)
            details = DagNodeDetails("random", ['array'])
            code_info = get_optional_code_info_or_none(optional_code_reference,
                                                       optional_source_code)
            add_dag_node(DagNode(op_id, location, context, details, code_info),
                         [], outcome)
            return outcome.annotated_dfobject.result_data
Exemple #13
0
    def evaluate(self, inspection_result: InspectionResult) -> CheckResult:
        """Evaluate the check

        Looks at every JOIN and SELECTION node, plus every ``SimpleImputer``
        node whose columns overlap ``self.sensitive_columns``. For each such
        node and each sensitive column the result of
        ``get_histograms_for_node_and_column`` is recorded; a result whose
        ``acceptable_change`` is falsy adds an issue message and turns the
        status into FAILURE.

        Returns a ``NoBiasIntroducedForResult`` with the status, the issue
        messages joined by spaces (None when there are none) and the
        per-node, per-column results.
        """
        # pylint: disable=too-many-locals
        dag = inspection_result.dag
        histograms = {
            dag_node: node_results[HistogramForColumns(self.sensitive_columns)]
            for dag_node, node_results in inspection_result.dag_node_to_inspection_results.items()
        }

        relevant_nodes = []
        for candidate in dag.nodes:
            operator_info = candidate.operator_info
            if operator_info.operator in {OperatorType.JOIN, OperatorType.SELECTION} or (
                    operator_info.function_info == FunctionInfo('sklearn.impute._base', 'SimpleImputer')
                    and set(candidate.details.columns).intersection(self.sensitive_columns)):
                relevant_nodes.append(candidate)

        check_status = CheckStatus.SUCCESS
        bias_distribution_change = collections.OrderedDict()
        issue_list = []
        for node in relevant_nodes:
            parents = list(dag.predecessors(node))
            column_results = collections.OrderedDict()
            for column in self.sensitive_columns:
                column_result = self.get_histograms_for_node_and_column(column, histograms, node, parents)
                column_results[column] = column_result
                if column_result.acceptable_change:
                    continue
                issue_list.append(
                    "A {} causes a min_relative_ratio_change of '{}' by {}, a value below the "
                    "configured minimum threshold {}!".format(
                        node.operator_info.operator.value, column,
                        column_result.min_relative_ratio_change,
                        self.min_allowed_relative_ratio_change))
                check_status = CheckStatus.FAILURE
            bias_distribution_change[node] = column_results

        description = " ".join(issue_list) if issue_list else None
        return NoBiasIntroducedForResult(self, check_status, description,
                                         bias_distribution_change)
        def execute_inspections(op_id, caller_filename, lineno,
                                optional_code_reference, optional_source_code):
            """Run the backend hooks around ``get_rdataset`` and register a data-source node.

            ``args``, ``kwargs`` and ``original`` are read from the enclosing
            scope. Only the ``data`` attribute of the returned dataset goes
            through the backend; it is replaced in place by the annotated
            result data. The node description is the dataset's ``title``.

            Returns the dataset object returned by ``original``.
            """
            context = OperatorContext(
                OperatorType.DATA_SOURCE,
                FunctionInfo('statsmodels.datasets', 'get_rdataset'))
            no_inputs = PandasBackend.before_call(context, [])
            dataset = original(*args, **kwargs)
            outcome = PandasBackend.after_call(context, no_inputs, dataset.data)
            dataset.data = outcome.annotated_dfobject.result_data

            location = BasicCodeLocation(caller_filename, lineno)
            details = DagNodeDetails(dataset.title, list(dataset.data.columns))
            code_info = get_optional_code_info_or_none(optional_code_reference,
                                                       optional_source_code)
            add_dag_node(DagNode(op_id, location, context, details, code_info),
                         [], outcome)
            return dataset
    def patched_fit(self, *args, **kwargs):
        """ Patch for ('statsmodel.api.OLS', 'fit')

        Registers train-data and train-label nodes for ``self.data.exog`` and
        ``self.data.endog`` (replacing both attributes with the values returned
        by the helper functions), runs the original ``fit`` between the
        ``SklearnBackend`` hooks and adds an ESTIMATOR DAG node whose parents
        are the two train nodes.

        Returns whatever the original ``fit`` returned.
        """
        # pylint: disable=no-method-argument, too-many-locals
        original = gorilla.get_original_attribute(api.OLS, 'fit')
        function_info = FunctionInfo('statsmodel.api.OLS', 'fit')

        # Train data
        # pylint: disable=no-member
        data_outcome, data_node, annotated_exog = add_train_data_node(
            self, self.data.exog, function_info)
        self.data.exog = annotated_exog
        # Train labels
        label_outcome, label_node, annotated_endog = add_train_label_node(
            self, self.data.endog, function_info)
        self.data.endog = annotated_endog

        # Estimator
        context = OperatorContext(OperatorType.ESTIMATOR, function_info)
        annotated_inputs = SklearnBackend.before_call(
            context,
            [data_outcome.annotated_dfobject, label_outcome.annotated_dfobject])
        result = original(self, *args, **kwargs)
        # The estimator hook gets None as its output, not the fit result.
        estimator_outcome = SklearnBackend.after_call(context, annotated_inputs, None)

        location = BasicCodeLocation(self.mlinspect_caller_filename,
                                     self.mlinspect_lineno)
        # NOTE(review): the description says "Decision Tree" although this patches OLS.fit;
        # looks copy-pasted from another estimator patch -- confirm the intended label.
        details = DagNodeDetails("Decision Tree", [])
        code_info = get_optional_code_info_or_none(
            self.mlinspect_optional_code_reference,
            self.mlinspect_optional_source_code)
        estimator_node = DagNode(singleton.get_next_op_id(), location, context,
                                 details, code_info)
        add_dag_node(estimator_node, [data_node, label_node], estimator_outcome)
        return result
Exemple #16
0
        def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
            """ Execute inspections, add DAG node

            Wraps ``DataFrame.dropna``: ``self``, ``args``, ``kwargs`` and
            ``original`` are read from the enclosing scope. The call runs on the
            annotated version of ``self`` and is registered as a SELECTION DAG
            node whose parent is the node of the input frame.

            All positional arguments are forwarded unchanged. ``self`` is the
            frame and is passed separately, so ``args`` holds only the dropna
            arguments; forwarding ``args[1:]`` would silently discard the first
            of them.

            Raises:
                NotImplementedError: if ``original`` returns None (inplace dropna).

            Returns the ``result_data`` of the backend's annotated result.
            """
            function_info = FunctionInfo('pandas.core.frame', 'dropna')

            input_info = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference,
                                        optional_source_code)
            operator_context = OperatorContext(OperatorType.SELECTION, function_info)
            input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject])
            # No input_infos copy needed because it's only a selection and the rows not being removed don't change
            result = original(input_infos[0].result_data, *args, **kwargs)
            if result is None:
                raise NotImplementedError("TODO: Support inplace dropna")
            backend_result = PandasBackend.after_call(operator_context,
                                                      input_infos,
                                                      result)
            result = backend_result.annotated_dfobject.result_data
            dag_node = DagNode(op_id,
                               BasicCodeLocation(caller_filename, lineno),
                               operator_context,
                               DagNodeDetails("dropna", list(result.columns)),
                               get_optional_code_info_or_none(optional_code_reference, optional_source_code))
            add_dag_node(dag_node, [input_info.dag_node], backend_result)

            return result
    def visit_operator(self, inspection_input) -> Iterable[any]:
        """
        Visit an operator

        Generator that yields one item per row of
        ``inspection_input.row_iterator`` and counts, per sensitive column,
        how often each value occurs.

        * Unary operator: the value is read from ``row.input`` (from
          ``row.output[0]`` for a ``SimpleImputer``) when the column is among
          the input columns, otherwise from ``row.annotation``; every value is
          counted.
        * Data source or n-ary operator: the value is read from ``row.output``
          when the column is among the output columns and counted; otherwise
          None is yielded for that column and nothing is counted.
        * Any other input type: None is yielded for every row.

        The operator type is stored in ``self._operator_type`` when iteration
        starts; the histograms are stored in ``self._histogram_op_output``
        (column name -> {value: count}) only once all rows were consumed.
        """
        # pylint: disable=too-many-branches, too-many-statements, too-many-locals
        histogram_maps = [{} for _ in self.sensitive_columns]

        self._operator_type = inspection_input.operator_context.operator

        if isinstance(inspection_input, InspectionInputUnaryOperator):
            schema = inspection_input.input_columns
            present_flags = []
            column_positions = []
            for column in self.sensitive_columns:
                present_flags.append(column in schema.fields)
                column_positions.append(schema.get_index_of_column(column))
            # For the imputer the values are taken from the operator output instead of its input.
            reads_output = inspection_input.operator_context.function_info == FunctionInfo(
                'sklearn.impute._base', 'SimpleImputer')
            for row in inspection_input.row_iterator:
                column_values = []
                for check_index, _ in enumerate(self.sensitive_columns):
                    if not present_flags[check_index]:
                        value = row.annotation[check_index]
                    elif reads_output:
                        value = row.output[0][column_positions[check_index]]
                    else:
                        value = row.input[column_positions[check_index]]
                    column_values.append(value)
                    counts = histogram_maps[check_index]
                    counts[value] = counts.get(value, 0) + 1
                yield column_values
        elif isinstance(inspection_input, (InspectionInputDataSource, InspectionInputNAryOperator)):
            schema = inspection_input.output_columns
            present_flags = []
            column_positions = []
            for column in self.sensitive_columns:
                present_flags.append(column in schema.fields)
                column_positions.append(schema.get_index_of_column(column))
            for row in inspection_input.row_iterator:
                column_values = []
                for check_index, _ in enumerate(self.sensitive_columns):
                    if present_flags[check_index]:
                        value = row.output[column_positions[check_index]]
                        column_values.append(value)
                        counts = histogram_maps[check_index]
                        counts[value] = counts.get(value, 0) + 1
                    else:
                        # Column not produced by this operator: placeholder, not counted.
                        column_values.append(None)
                yield column_values
        else:
            for _ in inspection_input.row_iterator:
                yield None

        # Reached only after the row iterator is exhausted.
        self._histogram_op_output = {
            column: histogram_maps[check_index]
            for check_index, column in enumerate(self.sensitive_columns)
        }