def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    # NOTE(review): 'statsmodel.api' (no trailing 's') is kept as-is because other code
    # matches on this exact FunctionInfo string — confirm before renaming.
    func_info = FunctionInfo('statsmodel.api', 'add_constant')
    # args[0] is the array the constant column is added to; trace where it came from.
    source_info = get_input_info(args[0], caller_filename, lineno, func_info,
                                 optional_code_reference, optional_source_code)
    op_context = OperatorContext(OperatorType.PROJECTION_MODIFY, func_info)
    annotated_inputs = SklearnBackend.before_call(op_context, [source_info.annotated_dfobject])
    raw_result = original(annotated_inputs[0].result_data, *args[1:], **kwargs)
    backend_result = SklearnBackend.after_call(op_context, annotated_inputs, raw_result)
    node = DagNode(op_id,
                   BasicCodeLocation(caller_filename, lineno),
                   op_context,
                   DagNodeDetails("Adds const column", ["array"]),
                   get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(node, [source_info.dag_node], backend_result)
    return backend_result.annotated_dfobject.result_data
def get_expected_check_result_simple_imputer():
    """ Expected result for the code snippet in test_no_bias_introduced_for_simple_imputer"""
    imputer_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 6),
        OperatorContext(OperatorType.TRANSFORMER, FunctionInfo('sklearn.impute._base', 'SimpleImputer')),
        DagNodeDetails('Simple Imputer: fit_transform', ['A']),
        OptionalCodeInfo(CodeReference(6, 10, 6, 72),
                         "SimpleImputer(missing_values=np.nan, strategy='most_frequent')"))
    # Per-value counts before/after imputation; the nan row gains its values back (removed -1).
    removal_df = DataFrame({'sensitive_column_value': ['cat_a', 'cat_c', math.nan],
                            'count_before': [2, 1, 1],
                            'count_after': [3, 1, 0],
                            'removed_records': [-1, 0, 1],
                            'removal_probability': [0., 0., 1.],
                            'normalized_removal_probability': [0., 0., 1.]})
    probabilities = RemovalProbabilities(imputer_node, True, 0., removal_df)
    node_to_change = {imputer_node: {'A': probabilities}}
    # Success case: no failure message.
    return SimilarRemovalProbabilitiesForResult(SimilarRemovalProbabilitiesFor(['A']),
                                                CheckStatus.SUCCESS, None, node_to_change)
def get_expected_check_result_merge():
    """ Expected result for the code snippet in test_no_bias_introduced_for_merge"""
    merge_node = DagNode(
        2,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(5, 12, 5, 36), "df_a.merge(df_b, on='B')"))
    # cat_b loses one of two records (probability 0.5, normalized to 1.0).
    removal_df = DataFrame({'sensitive_column_value': ['cat_a', 'cat_b', 'cat_c'],
                            'count_before': [2, 2, 1],
                            'count_after': [2, 1, 1],
                            'removed_records': [0, 1, 0],
                            'removal_probability': [0., 0.5, 0.],
                            'normalized_removal_probability': [0., 1., 0.]})
    probabilities = RemovalProbabilities(merge_node, True, 1., removal_df)
    node_to_change = {merge_node: {'A': probabilities}}
    # Success case: no failure message.
    return SimilarRemovalProbabilitiesForResult(SimilarRemovalProbabilitiesFor(['A']),
                                                CheckStatus.SUCCESS, None, node_to_change)
def get_expected_check_result_simple_imputer():
    """ Expected result for the code snippet in test_no_bias_introduced_for_simple_imputer"""
    imputer_dag_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 6),
        OperatorContext(OperatorType.TRANSFORMER, FunctionInfo('sklearn.impute._base', 'SimpleImputer')),
        DagNodeDetails('Simple Imputer', ['A']),
        OptionalCodeInfo(CodeReference(6, 10, 6, 72),
                         "SimpleImputer(missing_values=np.nan, strategy='most_frequent')"))
    # Imputation moves the nan records into cat_a: its ratio rises from 0.5 to 0.75.
    distribution_df = DataFrame({'sensitive_column_value': ['cat_a', 'cat_c', math.nan],
                                 'count_before': [2, 1, 1],
                                 'count_after': [3, 1, 0],
                                 'ratio_before': [0.5, 0.25, 0.25],
                                 'ratio_after': [0.75, 0.25, 0.],
                                 'relative_ratio_change': [0.5, 0., -1.]})
    distribution_change = BiasDistributionChange(imputer_dag_node, True, 0., distribution_df)
    node_to_change = {imputer_dag_node: {'A': distribution_change}}
    return NoBiasIntroducedForResult(NoBiasIntroducedFor(['A']), CheckStatus.SUCCESS, None,
                                     node_to_change)
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    func_info = FunctionInfo('pandas.core.groupby.generic', 'agg')
    # The groupby DAG node id is stored on the groupby object by the groupby patch;
    # without it we cannot link this agg to its parent.
    if not hasattr(self, '_mlinspect_dag_node'):
        raise NotImplementedError("TODO: Support agg if groupby happened in external code")
    parent_node = get_dag_node_for_id(self._mlinspect_dag_node)  # pylint: disable=no-member
    op_context = OperatorContext(OperatorType.GROUP_BY_AGG, func_info)
    annotated_inputs = PandasBackend.before_call(op_context, [])
    agg_result = original(self, *args, **kwargs)
    backend_result = PandasBackend.after_call(op_context, annotated_inputs, agg_result)
    # Describe the aggregation by whichever of args/kwargs actually carried it.
    agg_spec = args if len(args) > 0 else kwargs
    description = "Groupby '{}', Aggregate: '{}'".format(agg_result.index.name, agg_spec)
    result_columns = [agg_result.index.name] + list(agg_result.columns)
    node = DagNode(op_id,
                   BasicCodeLocation(caller_filename, lineno),
                   op_context,
                   DagNodeDetails(description, result_columns),
                   get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(node, [parent_node], backend_result)
    return backend_result.annotated_dfobject.result_data
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    function_info = FunctionInfo('pandas.core.frame', 'replace')
    input_info = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference,
                                optional_source_code)
    operator_context = OperatorContext(OperatorType.PROJECTION_MODIFY, function_info)
    # FIX: 'to_replace'/'value' may be passed positionally or as keywords; the old code
    # indexed args[0]/args[1] unconditionally, which raised IndexError for keyword calls.
    to_replace = args[0] if len(args) > 0 else kwargs.get('to_replace')
    value = args[1] if len(args) > 1 else kwargs.get('value')
    # FIX: reject the unsupported dict form BEFORE running the (potentially expensive)
    # replace; previously the check only ran after the work was already done.
    if isinstance(to_replace, dict):
        raise NotImplementedError("TODO: Add support for replace with dicts")
    input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject])
    # No deepcopy of input_infos is taken here; the backend sees the data as-is.
    result = original(input_infos[0].result_data, *args, **kwargs)
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    result = backend_result.annotated_dfobject.result_data
    description = "Replace '{}' with '{}'".format(to_replace, value)
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(description, list(result.columns)),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [input_info.dag_node], backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    func_info = FunctionInfo('pandas.core.frame', 'merge')
    # Trace both join inputs: 'self' is the left frame, args[0] the right frame.
    left_info = get_input_info(self, caller_filename, lineno, func_info,
                               optional_code_reference, optional_source_code)
    right_info = get_input_info(args[0], caller_filename, lineno, func_info,
                                optional_code_reference, optional_source_code)
    op_context = OperatorContext(OperatorType.JOIN, func_info)
    annotated_inputs = PandasBackend.before_call(
        op_context, [left_info.annotated_dfobject, right_info.annotated_dfobject])
    # No deepcopy of annotated_inputs is taken here; the backend sees the data as-is.
    merged = original(annotated_inputs[0].result_data, annotated_inputs[1].result_data,
                      *args[1:], **kwargs)
    backend_result = PandasBackend.after_call(op_context, annotated_inputs, merged)
    result = backend_result.annotated_dfobject.result_data
    # NOTE(review): assumes the join key is always given via the 'on' keyword — a
    # positional or on-less merge would raise KeyError here; confirm callers.
    node = DagNode(op_id,
                   BasicCodeLocation(caller_filename, lineno),
                   op_context,
                   DagNodeDetails("on '{}'".format(kwargs['on']), list(result.columns)),
                   get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(node, [left_info.dag_node, right_info.dag_node], backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    # pylint: disable=too-many-locals
    func_info = FunctionInfo('pandas.core.frame', '__setitem__')
    op_context = OperatorContext(OperatorType.PROJECTION_MODIFY, func_info)
    source_info = get_input_info(self, caller_filename, lineno, func_info,
                                 optional_code_reference, optional_source_code)
    # Only plain string column keys are supported so far.
    if not isinstance(args[0], str):
        raise NotImplementedError("TODO: Handling __setitem__ for key type {}".format(type(args[0])))
    annotated_inputs = PandasBackend.before_call(op_context, [source_info.annotated_dfobject])
    # __setitem__ mutates self in place, so snapshot the annotated inputs first.
    annotated_inputs = copy.deepcopy(annotated_inputs)
    result = original(self, *args, **kwargs)
    backend_result = PandasBackend.after_call(op_context, annotated_inputs, self)
    modified_columns = list(self.columns)  # pylint: disable=no-member
    description = "modifies {}".format([args[0]])
    node = DagNode(op_id,
                   BasicCodeLocation(caller_filename, lineno),
                   op_context,
                   DagNodeDetails(description, modified_columns),
                   get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(node, [source_info.dag_node], backend_result)
    assert hasattr(self, "_mlinspect_annotation")
    return result
def get_expected_check_result_merge():
    """ Expected result for the code snippet in test_no_bias_introduced_for_merge"""
    failing_dag_node = DagNode(
        2,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(5, 12, 5, 36), "df_a.merge(df_b, on='B')"))
    # cat_b's ratio drops from 0.4 to 0.25, a relative change below the -0.3 threshold.
    distribution_df = DataFrame({'sensitive_column_value': ['cat_a', 'cat_b', 'cat_c'],
                                 'count_before': [2, 2, 1],
                                 'count_after': [2, 1, 1],
                                 'ratio_before': [0.4, 0.4, 0.2],
                                 'ratio_after': [0.5, 0.25, 0.25],
                                 'relative_ratio_change': [(0.5 - 0.4) / 0.4, (.25 - 0.4) / 0.4,
                                                           (0.25 - 0.2) / 0.2]})
    distribution_change = BiasDistributionChange(failing_dag_node, False, (.25 - 0.4) / 0.4,
                                                 distribution_df)
    node_to_change = {failing_dag_node: {'A': distribution_change}}
    failure_message = ('A Join causes a min_relative_ratio_change of \'A\' by -0.37500000000000006, '
                       'a value below the configured minimum threshold -0.3!')
    return NoBiasIntroducedForResult(NoBiasIntroducedFor(['A']), CheckStatus.FAILURE,
                                     failure_message, node_to_change)
def test_no_missing_embeddings():
    """ Tests whether NoMissingEmbeddings reports words without embeddings for a W2V transformer """
    test_code = cleandoc("""
        import pandas as pd
        from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer

        df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
        word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
        encoded_data = word_to_vec.fit_transform(df)
        """)
    inspector_result = PipelineInspector \
        .on_pipeline_from_string(test_code) \
        .add_check(NoMissingEmbeddings()) \
        .add_custom_monkey_patching_module(custom_monkeypatching) \
        .execute()
    actual_result = inspector_result.check_to_check_results[NoMissingEmbeddings()]
    # min_count=2 means 'cat_b' and 'cat_c' (one occurrence each) get no embedding.
    w2v_node = DagNode(1,
                       BasicCodeLocation('<string-source>', 5),
                       OperatorContext(OperatorType.TRANSFORMER,
                                       FunctionInfo('example_pipelines.healthcare.healthcare_utils',
                                                    'MyW2VTransformer')),
                       DagNodeDetails('Word2Vec: fit_transform', ['array']),
                       OptionalCodeInfo(CodeReference(5, 14, 5, 62),
                                        'MyW2VTransformer(min_count=2, size=2, workers=1)'))
    failed_nodes = {w2v_node: MissingEmbeddingsInfo(2, ['cat_b', 'cat_c'])}
    expected_result = NoMissingEmbeddingsResult(NoMissingEmbeddings(10), CheckStatus.FAILURE,
                                                'Missing embeddings were found!', failed_nodes)
    compare(actual_result, expected_result)
def get_expected_check_result_dropna():
    """ Expected result for the code snippet in test_no_bias_introduced_for_dropna"""
    dropna_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails("dropna", ['A', 'B']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), "df.dropna()"))
    # cat_a is dropped entirely (probability 1.0, normalized 3.0) — above the 2.0 threshold.
    removal_df = DataFrame({'sensitive_column_value': ['cat_a', 'cat_c'],
                            'count_before': [2, 3],
                            'count_after': [0, 2],
                            'removed_records': [2, 1],
                            'removal_probability': [1., 1. / 3.],
                            'normalized_removal_probability': [3., 1.]})
    probabilities = RemovalProbabilities(dropna_node, False, 3., removal_df)
    node_to_change = {dropna_node: {'A': probabilities}}
    failure_message = ("A Selection causes a max_probability_difference of 'A' by 3.0, "
                      "a value above the configured maximum threshold 2.0!")
    return SimilarRemovalProbabilitiesForResult(SimilarRemovalProbabilitiesFor(['A']),
                                                CheckStatus.FAILURE, failure_message,
                                                node_to_change)
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    function_info = FunctionInfo('pandas.core.frame', '__getitem__')
    input_info = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference,
                                optional_source_code)
    key = args[0]
    # FIX: the two projection branches previously duplicated the whole DagNode construction;
    # build the details per case and create the node once below.
    if isinstance(key, str):
        # Projection to a single column, result is a Series.
        columns = [key]
        operator_context = OperatorContext(OperatorType.PROJECTION, function_info)
        details = DagNodeDetails("to {}".format(columns), columns)
    # FIX: guard against an empty list — key[0] used to raise IndexError instead of the
    # intended NotImplementedError.
    elif isinstance(key, list) and len(key) > 0 and isinstance(key[0], str):
        # Projection to one or more columns, result is a DataFrame.
        columns = key
        operator_context = OperatorContext(OperatorType.PROJECTION, function_info)
        details = DagNodeDetails("to {}".format(columns), columns)
    elif isinstance(key, pandas.Series):
        # Boolean-mask selection; all columns are kept.
        operator_context = OperatorContext(OperatorType.SELECTION, function_info)
        columns = list(self.columns)  # pylint: disable=no-member
        if optional_source_code:
            description = "Select by Series: {}".format(optional_source_code)
        else:
            description = "Select by Series"
        details = DagNodeDetails(description, columns)
    else:
        raise NotImplementedError()
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       details,
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject])
    result = original(input_infos[0].result_data, *args, **kwargs)
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    result = backend_result.annotated_dfobject.result_data
    add_dag_node(dag_node, [input_info.dag_node], backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    func_info = FunctionInfo('pandas.core.frame', 'DataFrame')
    op_context = OperatorContext(OperatorType.DATA_SOURCE, func_info)
    annotated_inputs = PandasBackend.before_call(op_context, [])
    # Run the real constructor; it mutates self in place and returns nothing.
    original(self, *args, **kwargs)
    backend_result = PandasBackend.after_call(op_context, annotated_inputs, self)
    source_columns = list(self.columns)  # pylint: disable=no-member
    node = DagNode(op_id,
                   BasicCodeLocation(caller_filename, lineno),
                   op_context,
                   DagNodeDetails(None, source_columns),
                   get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(node, [], backend_result)
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    function_info = FunctionInfo('pandas.io.parsers', 'read_csv')
    operator_context = OperatorContext(OperatorType.DATA_SOURCE, function_info)
    input_infos = PandasBackend.before_call(operator_context, [])
    result = original(*args, **kwargs)
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    # FIX: the path may be passed positionally or via the 'filepath_or_buffer' keyword, and may
    # be a pathlib.Path; args[0].split(...) crashed in both cases. str() makes the basename
    # extraction work for plain strings and Path objects alike.
    path_or_buffer = args[0] if args else kwargs.get('filepath_or_buffer', '')
    description = "{}".format(str(path_or_buffer).split(os.path.sep)[-1])
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails(description, list(result.columns)),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [], backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    func_info = FunctionInfo('numpy.random', 'random')
    op_context = OperatorContext(OperatorType.DATA_SOURCE, func_info)
    annotated_inputs = SklearnBackend.before_call(op_context, [])
    raw_result = original(*args, **kwargs)
    backend_result = SklearnBackend.after_call(op_context, annotated_inputs, raw_result)
    node = DagNode(op_id,
                   BasicCodeLocation(caller_filename, lineno),
                   op_context,
                   DagNodeDetails("random", ['array']),
                   get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(node, [], backend_result)
    return backend_result.annotated_dfobject.result_data
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    func_info = FunctionInfo('statsmodels.datasets', 'get_rdataset')
    op_context = OperatorContext(OperatorType.DATA_SOURCE, func_info)
    annotated_inputs = PandasBackend.before_call(op_context, [])
    dataset = original(*args, **kwargs)
    # The inspections run on the DataFrame inside the Dataset wrapper, not on the
    # wrapper itself; write the annotated frame back before returning.
    backend_result = PandasBackend.after_call(op_context, annotated_inputs, dataset.data)
    dataset.data = backend_result.annotated_dfobject.result_data
    node = DagNode(op_id,
                   BasicCodeLocation(caller_filename, lineno),
                   op_context,
                   DagNodeDetails(dataset.title, list(dataset.data.columns)),
                   get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(node, [], backend_result)
    return dataset
def patched__getitem__(self, *args, **kwargs):
    """ Patch for ('pandas.core.indexing', '_LocIndexer'), '__getitem__' """
    original = gorilla.get_original_attribute(
        pandas.core.indexing._LocIndexer, '__getitem__')  # pylint: disable=protected-access
    # Only instrument loc-indexing that happens inside a ColumnTransformer call.
    if not call_info_singleton.column_transformer_active:
        return original(self, *args, **kwargs)
    op_id = singleton.get_next_op_id()
    caller_filename = call_info_singleton.transformer_filename
    lineno = call_info_singleton.transformer_lineno
    function_info = call_info_singleton.transformer_function_info
    optional_code_reference = call_info_singleton.transformer_optional_code_reference
    optional_source_code = call_info_singleton.transformer_optional_source_code
    loc_key = args[0]
    # Only df.loc[:, ['col', ...]] is supported: a full row slice plus a list of column names.
    if isinstance(loc_key, tuple) and not loc_key[0].start and not loc_key[0].stop \
            and isinstance(loc_key[1], list) and isinstance(loc_key[1][0], str):
        columns = loc_key[1]
    else:
        raise NotImplementedError()
    operator_context = OperatorContext(OperatorType.PROJECTION, function_info)
    input_info = get_input_info(self.obj, caller_filename,  # pylint: disable=no-member
                                lineno, function_info, optional_code_reference,
                                optional_source_code)
    input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject])
    result = original(self, *args, **kwargs)
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    result = backend_result.annotated_dfobject.result_data
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails("to {}".format(columns), columns),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [input_info.dag_node], backend_result)
    return result
def patched_fit(self, *args, **kwargs):
    """ Patch for ('statsmodel.api.OLS', 'fit') """
    # pylint: disable=no-method-argument, too-many-locals
    original = gorilla.get_original_attribute(api.OLS, 'fit')
    function_info = FunctionInfo('statsmodel.api.OLS', 'fit')
    # Wrap exog as the train-data node and endog as the train-label node, writing the
    # annotated arrays back onto the model before fitting.
    # pylint: disable=no-member
    data_backend_result, train_data_node, train_data_result = add_train_data_node(
        self, self.data.exog, function_info)
    self.data.exog = train_data_result  # pylint: disable=no-member
    label_backend_result, train_labels_node, train_labels_result = add_train_label_node(
        self, self.data.endog, function_info)
    self.data.endog = train_labels_result
    # Estimator node
    operator_context = OperatorContext(OperatorType.ESTIMATOR, function_info)
    input_dfs = [data_backend_result.annotated_dfobject, label_backend_result.annotated_dfobject]
    input_infos = SklearnBackend.before_call(operator_context, input_dfs)
    result = original(self, *args, **kwargs)
    estimator_backend_result = SklearnBackend.after_call(operator_context, input_infos, None)
    # NOTE(review): the "Decision Tree" label looks copy-pasted from the sklearn tree patch;
    # confirm whether this should describe the OLS estimator before changing it, since
    # expected-DAG comparisons elsewhere may match on this exact string.
    dag_node = DagNode(singleton.get_next_op_id(),
                       BasicCodeLocation(self.mlinspect_caller_filename, self.mlinspect_lineno),
                       operator_context,
                       DagNodeDetails("Decision Tree", []),
                       get_optional_code_info_or_none(self.mlinspect_optional_code_reference,
                                                      self.mlinspect_optional_source_code))
    add_dag_node(dag_node, [train_data_node, train_labels_node], estimator_backend_result)
    return result
def execute_inspections(op_id, caller_filename, lineno, optional_code_reference, optional_source_code):
    """ Execute inspections, add DAG node """
    function_info = FunctionInfo('pandas.core.frame', 'dropna')
    input_info = get_input_info(self, caller_filename, lineno, function_info, optional_code_reference,
                                optional_source_code)
    operator_context = OperatorContext(OperatorType.SELECTION, function_info)
    input_infos = PandasBackend.before_call(operator_context, [input_info.annotated_dfobject])
    # No input_infos copy needed because it's only a selection and the rows not being removed
    # don't change.
    # FIX: 'self' is bound separately in this patch, so 'args' holds only dropna's own
    # arguments (cf. the replace patch, which forwards *args); the previous *args[1:]
    # silently dropped the first positional argument (e.g. axis).
    result = original(input_infos[0].result_data, *args, **kwargs)
    if result is None:
        raise NotImplementedError("TODO: Support inplace dropna")
    backend_result = PandasBackend.after_call(operator_context, input_infos, result)
    result = backend_result.annotated_dfobject.result_data
    dag_node = DagNode(op_id,
                       BasicCodeLocation(caller_filename, lineno),
                       operator_context,
                       DagNodeDetails("dropna", list(result.columns)),
                       get_optional_code_info_or_none(optional_code_reference, optional_source_code))
    add_dag_node(dag_node, [input_info.dag_node], backend_result)
    return result