def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Run the inspections for statsmodels add_constant and record a DAG node."""
    # NOTE(review): 'statsmodel.api' (no trailing 's') -- presumably the key used
    # by the matching patch registration elsewhere; confirm before normalizing.
    function_info = FunctionInfo('statsmodel.api', 'add_constant')
    input_info = get_input_info(args[0], caller_filename, lineno, function_info,
                                optional_code_reference, optional_source_code)
    operator_context = OperatorContext(OperatorType.PROJECTION_MODIFY, function_info)
    annotated_inputs = SklearnBackend.before_call(operator_context,
                                                  [input_info.annotated_dfobject])
    raw_result = original(annotated_inputs[0].result_data, *args[1:], **kwargs)
    backend_result = SklearnBackend.after_call(operator_context, annotated_inputs, raw_result)
    node = DagNode(op_id,
                   BasicCodeLocation(caller_filename, lineno),
                   operator_context,
                   DagNodeDetails("Adds const column", ["array"]),
                   get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(node, [input_info.dag_node], backend_result)
    return backend_result.annotated_dfobject.result_data
def patched_fit_transform(self, *args, **kwargs):
    """Patch for ('example_pipelines.healthcare.healthcare_utils.MyW2VTransformer', 'fit_transform')"""
    # pylint: disable=no-method-argument
    self.mlinspect_fit_transform_active = True  # pylint: disable=attribute-defined-outside-init
    original = gorilla.get_original_attribute(healthcare_utils.MyW2VTransformer, 'fit_transform')
    function_info = FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')
    input_info = get_input_info(args[0], self.mlinspect_caller_filename, self.mlinspect_lineno,
                                function_info, self.mlinspect_optional_code_reference,
                                self.mlinspect_optional_source_code)
    operator_context = OperatorContext(OperatorType.TRANSFORMER, function_info)
    annotated_inputs = SklearnBackend.before_call(operator_context,
                                                  [input_info.annotated_dfobject])
    transformed = original(self, annotated_inputs[0].result_data, *args[1:], **kwargs)
    backend_result = SklearnBackend.after_call(operator_context, annotated_inputs, transformed)
    new_return_value = backend_result.annotated_dfobject.result_data
    assert isinstance(new_return_value, MlinspectNdarray)
    node = DagNode(singleton.get_next_op_id(),
                   BasicCodeLocation(self.mlinspect_caller_filename, self.mlinspect_lineno),
                   operator_context,
                   DagNodeDetails("Word2Vec: fit_transform", ['array']),
                   get_optional_code_info_or_none(self.mlinspect_optional_code_reference,
                                                  self.mlinspect_optional_source_code))
    add_dag_node(node, [input_info.dag_node], backend_result)
    self.mlinspect_fit_transform_active = False  # pylint: disable=attribute-defined-outside-init
    return new_return_value
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for DataFrame.merge and add a JOIN DAG node.

    Builds lineage for both join inputs, runs the original merge on the
    backend-annotated frames, and records the merged result as a JOIN node.
    """
    function_info = FunctionInfo('pandas.core.frame', 'merge')
    input_info_a = get_input_info(self, caller_filename, lineno, function_info,
                                  optional_code_reference, optional_source_code)
    input_info_b = get_input_info(args[0], caller_filename, lineno, function_info,
                                  optional_code_reference, optional_source_code)
    operator_context = OperatorContext(OperatorType.JOIN, function_info)
    input_infos = PandasBackend.before_call(operator_context,
                                            [input_info_a.annotated_dfobject,
                                             input_info_b.annotated_dfobject])
    # No defensive copy of input_infos needed: merge does not mutate its inputs.
    result = original(input_infos[0].result_data, input_infos[1].result_data,
                      *args[1:], **kwargs)
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    result = backend_result.annotated_dfobject.result_data
    # BUGFIX: kwargs['on'] raised KeyError whenever merge was called without an
    # explicit on= keyword (positional on, or left_on/right_on).
    description = "on '{}'".format(kwargs.get('on'))
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(description, list(result.columns)),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [input_info_a.dag_node, input_info_b.dag_node], backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for groupby-agg and add a GROUP_BY_AGG DAG node."""
    function_info = FunctionInfo('pandas.core.groupby.generic', 'agg')
    if not hasattr(self, '_mlinspect_dag_node'):
        raise NotImplementedError("TODO: Support agg if groupby happened in external code")
    # The preceding groupby stashed its input DAG node id on the groupby object.
    input_dag_node = get_dag_node_for_id(self._mlinspect_dag_node)  # pylint: disable=no-member
    operator_context = OperatorContext(OperatorType.GROUP_BY_AGG, function_info)
    input_infos = PandasBackend.before_call(operator_context, [])
    result = original(self, *args, **kwargs)
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    # Describe the aggregation by whichever way it was specified.
    agg_spec = args if len(args) > 0 else kwargs
    description = "Groupby '{}', Aggregate: '{}'".format(result.index.name, agg_spec)
    columns = [result.index.name] + list(result.columns)
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(description, columns),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [input_dag_node], backend_result)
    return backend_result.annotated_dfobject.result_data
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for DataFrame.replace and add a PROJECTION_MODIFY DAG node."""
    function_info = FunctionInfo('pandas.core.frame', 'replace')
    # BUGFIX: reject unsupported dict arguments *before* executing the replace.
    # Previously this check ran after the original call and the backend
    # bookkeeping had already been performed.
    if isinstance(args[0], dict):
        raise NotImplementedError("TODO: Add support for replace with dicts")
    input_info = get_input_info(self, caller_filename, lineno, function_info,
                                optional_code_reference, optional_source_code)
    operator_context = OperatorContext(OperatorType.PROJECTION_MODIFY, function_info)
    input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject])
    # No input_infos copy needed: replace returns a new frame, inputs are untouched.
    result = original(input_infos[0].result_data, *args, **kwargs)
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    result = backend_result.annotated_dfobject.result_data
    description = "Replace '{}' with '{}'".format(args[0], args[1])
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(description, list(result.columns)),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [input_info.dag_node], backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code): """ Execute inspections, add DAG node """ # pylint: disable=too-many-locals function_info = FunctionInfo('pandas.core.frame', '__setitem__') operator_context = OperatorContext(OperatorType.PROJECTION_MODIFY, function_info) input_info = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference, optional_source_code) if isinstance(args[0], str): input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject]) input_infos = copy.deepcopy(input_infos) result = original(self, *args, **kwargs) backend_result = PandasBackend.after_call(operator_context, input_infos, self) columns = list(self.columns) # pylint: disable=no-member description = "modifies {}".format([args[0]]) else: raise NotImplementedError("TODO: Handling __setitem__ for key type {}".format(type(args[0]))) dag_node = DagNode(op_id, BasicCodeLocation(caller_filename, lineno), operator_context, DagNodeDetails(description, columns), get_optional_code_info_or_none(optional_code_reference, optional_source_code)) add_dag_node(dag_node, [input_info.dag_node], backend_result) assert hasattr(self, "_mlinspect_annotation") return result
def execute_inspections(_, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Record lineage for groupby; a DAG node is only materialized by a later agg."""
    function_info = FunctionInfo('pandas.core.frame', 'groupby')
    # Groupbys are ignored on their own -- the resulting object just remembers
    # its input DAG node so that a subsequent agg can pick it up.
    input_info = get_input_info(self, caller_filename, lineno, function_info,
                                optional_code_reference, optional_source_code)
    grouped = original(self, *args, **kwargs)
    grouped._mlinspect_dag_node = input_info.dag_node.node_id  # pylint: disable=protected-access
    return grouped
def get_relevant_nodes_and_histograms(self, inspection_result):
    """Get all DAG nodes relevant for this inspection and their histograms"""
    dag = inspection_result.dag
    histograms = {
        dag_node: node_results[HistogramForColumns(self.sensitive_columns)]
        for dag_node, node_results in inspection_result.dag_node_to_inspection_results.items()
    }

    def _is_relevant(node):
        # Joins and selections can shift group distributions; SimpleImputer is
        # relevant only when it touches at least one sensitive column.
        if node.operator_info.operator in {OperatorType.JOIN, OperatorType.SELECTION}:
            return True
        is_imputer = node.operator_info.function_info == FunctionInfo(
            'sklearn.impute._base', 'SimpleImputer')
        return is_imputer and bool(set(node.details.columns).intersection(self.sensitive_columns))

    relevant_nodes = [node for node in dag.nodes if _is_relevant(node)]
    return histograms, relevant_nodes
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for DataFrame construction and add a DATA_SOURCE DAG node."""
    function_info = FunctionInfo('pandas.core.frame', 'DataFrame')
    operator_context = OperatorContext(OperatorType.DATA_SOURCE, function_info)
    empty_inputs = PandasBackend.before_call(operator_context, [])
    original(self, *args, **kwargs)
    # __init__ works in place: the freshly constructed frame is `self` itself.
    backend_result = PandasBackend.after_call(operator_context, empty_inputs, self)
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(None, list(self.columns)),  # pylint: disable=no-member
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [], backend_result)
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for DataFrame.__getitem__ and add the matching DAG node.

    Supported access patterns:
      * df['col']           -> PROJECTION to a Series
      * df[['a', 'b']]      -> PROJECTION to a DataFrame
      * df[boolean_series]  -> SELECTION (row filter)
    Anything else raises NotImplementedError.
    """
    function_info = FunctionInfo('pandas.core.frame', '__getitem__')
    input_info = get_input_info(self, caller_filename, lineno, function_info,
                                optional_code_reference, optional_source_code)
    # Determine operator type, affected columns, and description per key type.
    # (Consolidated: the original built an identical DagNode in every branch.)
    if isinstance(args[0], str):  # Projection to Series
        operator_context = OperatorContext(OperatorType.PROJECTION, function_info)
        columns = [args[0]]
        description = "to {}".format(columns)
    elif isinstance(args[0], list) and isinstance(args[0][0], str):  # Projection to DF
        operator_context = OperatorContext(OperatorType.PROJECTION, function_info)
        columns = args[0]
        description = "to {}".format(columns)
    elif isinstance(args[0], pandas.Series):  # Selection
        operator_context = OperatorContext(OperatorType.SELECTION, function_info)
        columns = list(self.columns)  # pylint: disable=no-member
        if optional_source_code:
            description = "Select by Series: {}".format(optional_source_code)
        else:
            description = "Select by Series"
    else:
        raise NotImplementedError()
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(description, columns),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject])
    result = original(input_infos[0].result_data, *args, **kwargs)
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    result = backend_result.annotated_dfobject.result_data
    add_dag_node(dag_node, [input_info.dag_node], backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for pandas.read_csv and add a DATA_SOURCE DAG node."""
    function_info = FunctionInfo('pandas.io.parsers', 'read_csv')
    operator_context = OperatorContext(OperatorType.DATA_SOURCE, function_info)
    empty_inputs = PandasBackend.before_call(operator_context, [])
    result = original(*args, **kwargs)
    backend_result = PandasBackend.after_call(operator_context, empty_inputs, result)
    # Use the bare file name as the node description.
    # NOTE(review): assumes the path arrives positionally as a string -- a
    # keyword path or file-like object would break here; confirm callers.
    file_name = args[0].split(os.path.sep)[-1]
    description = "{}".format(file_name)
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(description, list(result.columns)),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [], backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for numpy.random.random and add a DATA_SOURCE DAG node."""
    function_info = FunctionInfo('numpy.random', 'random')
    operator_context = OperatorContext(OperatorType.DATA_SOURCE, function_info)
    empty_inputs = SklearnBackend.before_call(operator_context, [])
    raw_result = original(*args, **kwargs)
    backend_result = SklearnBackend.after_call(operator_context, empty_inputs, raw_result)
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails("random", ['array']),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [], backend_result)
    return backend_result.annotated_dfobject.result_data
def evaluate(self, inspection_result: InspectionResult) -> CheckResult:
    """Evaluate the check"""
    # pylint: disable=too-many-locals
    dag = inspection_result.dag
    histograms = {
        dag_node: node_results[HistogramForColumns(self.sensitive_columns)]
        for dag_node, node_results in inspection_result.dag_node_to_inspection_results.items()
    }
    # Joins/selections and SimpleImputers on sensitive columns are the
    # operators that can introduce bias.
    relevant_nodes = [
        node for node in dag.nodes
        if node.operator_info.operator in {OperatorType.JOIN, OperatorType.SELECTION}
        or (node.operator_info.function_info == FunctionInfo('sklearn.impute._base', 'SimpleImputer')
            and set(node.details.columns).intersection(self.sensitive_columns))
    ]
    check_status = CheckStatus.SUCCESS
    bias_distribution_change = collections.OrderedDict()
    issue_list = []
    for node in relevant_nodes:
        parents = list(dag.predecessors(node))
        column_results = collections.OrderedDict()
        for column in self.sensitive_columns:
            column_result = self.get_histograms_for_node_and_column(column, histograms, node, parents)
            column_results[column] = column_result
            if not column_result.acceptable_change:
                check_status = CheckStatus.FAILURE
                issue_list.append(
                    "A {} causes a min_relative_ratio_change of '{}' by {}, a value below the "
                    "configured minimum threshold {}!".format(
                        node.operator_info.operator.value, column,
                        column_result.min_relative_ratio_change,
                        self.min_allowed_relative_ratio_change))
        bias_distribution_change[node] = column_results
    description = " ".join(issue_list) if issue_list else None
    return NoBiasIntroducedForResult(self, check_status, description, bias_distribution_change)
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for statsmodels get_rdataset and add a DATA_SOURCE DAG node."""
    function_info = FunctionInfo('statsmodels.datasets', 'get_rdataset')
    operator_context = OperatorContext(OperatorType.DATA_SOURCE, function_info)
    empty_inputs = PandasBackend.before_call(operator_context, [])
    result = original(*args, **kwargs)
    # Only the .data frame of the returned dataset object is tracked; it is
    # swapped for the annotated version before the object is handed back.
    backend_result = PandasBackend.after_call(operator_context, empty_inputs, result.data)
    result.data = backend_result.annotated_dfobject.result_data
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(result.title, list(result.data.columns)),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [], backend_result)
    return result
def patched_fit(self, *args, **kwargs):
    """ Patch for ('statsmodel.api.OLS', 'fit') """
    # pylint: disable=no-method-argument, too-many-locals
    original = gorilla.get_original_attribute(api.OLS, 'fit')
    function_info = FunctionInfo('statsmodel.api.OLS', 'fit')
    # Train data
    # pylint: disable=no-member
    data_backend_result, train_data_node, train_data_result = add_train_data_node(
        self, self.data.exog, function_info)
    self.data.exog = train_data_result  # pylint: disable=no-member
    # Train labels
    label_backend_result, train_labels_node, train_labels_result = add_train_label_node(
        self, self.data.endog, function_info)
    self.data.endog = train_labels_result
    # Estimator
    operator_context = OperatorContext(OperatorType.ESTIMATOR, function_info)
    input_dfs = [data_backend_result.annotated_dfobject,
                 label_backend_result.annotated_dfobject]
    input_infos = SklearnBackend.before_call(operator_context, input_dfs)
    result = original(self, *args, **kwargs)
    estimator_backend_result = SklearnBackend.after_call(operator_context, input_infos, None)
    # BUGFIX: the node description previously said "Decision Tree" -- a
    # copy-paste leftover from an sklearn patch; this estimator is statsmodels OLS.
    dag_node = DagNode(singleton.get_next_op_id(),
                       BasicCodeLocation(self.mlinspect_caller_filename, self.mlinspect_lineno),
                       operator_context,
                       DagNodeDetails("OLS", []),
                       get_optional_code_info_or_none(self.mlinspect_optional_code_reference,
                                                      self.mlinspect_optional_source_code))
    add_dag_node(dag_node, [train_data_node, train_labels_node], estimator_backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """Execute inspections for DataFrame.dropna and add a SELECTION DAG node."""
    function_info = FunctionInfo('pandas.core.frame', 'dropna')
    input_info = get_input_info(self, caller_filename, lineno, function_info,
                                optional_code_reference, optional_source_code)
    operator_context = OperatorContext(OperatorType.SELECTION, function_info)
    input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject])
    # No input_infos copy needed because it's only a selection and the rows not
    # being removed don't change.
    # BUGFIX: previously forwarded *args[1:], silently dropping the first
    # positional dropna argument. `self` is not part of `args` in this patch
    # (the input comes from `self` above), matching the replace patch which
    # forwards *args -- confirm against the patch wrapper's calling convention.
    result = original(input_infos[0].result_data, *args, **kwargs)
    if result is None:
        raise NotImplementedError("TODO: Support inplace dropna")
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    result = backend_result.annotated_dfobject.result_data
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails("dropna", list(result.columns)),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [input_info.dag_node], backend_result)
    return result
def visit_operator(self, inspection_input) -> Iterable[any]:
    """ Visit an operator """
    # pylint: disable=too-many-branches, too-many-statements, too-many-locals
    # One histogram dict per sensitive column; current_count is advanced per
    # row but not otherwise read here.
    current_count = -1
    histogram_maps = []
    for _ in self.sensitive_columns:
        histogram_maps.append({})
    self._operator_type = inspection_input.operator_context.operator
    if isinstance(inspection_input, InspectionInputUnaryOperator):
        # Pre-compute, per sensitive column, whether it is present in the
        # operator's *input* schema and at which index.
        sensitive_columns_present = []
        sensitive_columns_index = []
        for column in self.sensitive_columns:
            column_present = column in inspection_input.input_columns.fields
            sensitive_columns_present.append(column_present)
            column_index = inspection_input.input_columns.get_index_of_column(column)
            sensitive_columns_index.append(column_index)
        if inspection_input.operator_context.function_info == FunctionInfo(
                'sklearn.impute._base', 'SimpleImputer'):
            # SimpleImputer may fill in sensitive values, so read them from the
            # operator *output*; fall back to the row annotation when the
            # column is not present.
            for row in inspection_input.row_iterator:
                current_count += 1
                column_values = []
                for check_index, _ in enumerate(self.sensitive_columns):
                    if sensitive_columns_present[check_index]:
                        column_value = row.output[0][sensitive_columns_index[check_index]]
                    else:
                        column_value = row.annotation[check_index]
                    column_values.append(column_value)
                    group_count = histogram_maps[check_index].get(column_value, 0)
                    group_count += 1
                    histogram_maps[check_index][column_value] = group_count
                # Yield the per-row sensitive values as the new annotation.
                yield column_values
        else:
            # Other unary operators: read sensitive values from the *input* row.
            for row in inspection_input.row_iterator:
                current_count += 1
                column_values = []
                for check_index, _ in enumerate(self.sensitive_columns):
                    if sensitive_columns_present[check_index]:
                        column_value = row.input[sensitive_columns_index[check_index]]
                    else:
                        column_value = row.annotation[check_index]
                    column_values.append(column_value)
                    group_count = histogram_maps[check_index].get(column_value, 0)
                    group_count += 1
                    histogram_maps[check_index][column_value] = group_count
                yield column_values
    elif isinstance(inspection_input, InspectionInputDataSource):
        # Data sources have no input/annotation; look at the output schema.
        sensitive_columns_present = []
        sensitive_columns_index = []
        for column in self.sensitive_columns:
            column_present = column in inspection_input.output_columns.fields
            sensitive_columns_present.append(column_present)
            column_index = inspection_input.output_columns.get_index_of_column(column)
            sensitive_columns_index.append(column_index)
        for row in inspection_input.row_iterator:
            current_count += 1
            column_values = []
            for check_index, _ in enumerate(self.sensitive_columns):
                if sensitive_columns_present[check_index]:
                    column_value = row.output[sensitive_columns_index[check_index]]
                    column_values.append(column_value)
                    group_count = histogram_maps[check_index].get(column_value, 0)
                    group_count += 1
                    histogram_maps[check_index][column_value] = group_count
                else:
                    # Absent columns are annotated as None and not counted.
                    column_values.append(None)
            yield column_values
    elif isinstance(inspection_input, InspectionInputNAryOperator):
        # N-ary operators (e.g. joins): same handling as data sources --
        # sensitive values are taken from the output row.
        sensitive_columns_present = []
        sensitive_columns_index = []
        for column in self.sensitive_columns:
            column_present = column in inspection_input.output_columns.fields
            sensitive_columns_present.append(column_present)
            column_index = inspection_input.output_columns.get_index_of_column(column)
            sensitive_columns_index.append(column_index)
        for row in inspection_input.row_iterator:
            current_count += 1
            column_values = []
            for check_index, _ in enumerate(self.sensitive_columns):
                if sensitive_columns_present[check_index]:
                    column_value = row.output[sensitive_columns_index[check_index]]
                    column_values.append(column_value)
                    group_count = histogram_maps[check_index].get(column_value, 0)
                    group_count += 1
                    histogram_maps[check_index][column_value] = group_count
                else:
                    column_values.append(None)
            yield column_values
    else:
        # Unknown input kinds: pass rows through without annotations.
        for _ in inspection_input.row_iterator:
            yield None
    # Publish the per-column histograms as this operator's inspection output.
    self._histogram_op_output = {}
    for check_index, column in enumerate(self.sensitive_columns):
        self._histogram_op_output[column] = histogram_maps[check_index]