def test_func_defs_and_loops():
    """
    Tests whether the monkey patching of pandas function works

    The pipeline source is provided by get_test_code_with_function_def_and_for_loop().
    The extracted DAG is expected to consist of a DataFrame data source followed
    by two chained dropna selections that both point at code line 8.
    """
    def dropna_selection(node_id):
        # Both selections carry the same code location and reference; only the id differs.
        return DagNode(
            node_id,
            BasicCodeLocation("<string-source>", 8),
            OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
            DagNodeDetails('dropna', ['A']),
            OptionalCodeInfo(CodeReference(8, 9, 8, 20), 'df.dropna()'))

    pipeline_source = get_test_code_with_function_def_and_for_loop()
    run_result = _pipeline_executor.singleton.run(
        python_code=pipeline_source, track_code_references=True)
    extracted_dag = run_result.dag

    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(4, 9, 4, 44), "pd.DataFrame([0, 1], columns=['A'])"))
    first_dropna = dropna_selection(1)
    second_dropna = dropna_selection(2)

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, first_dropna)
    expected_dag.add_edge(first_dropna, second_dropna)

    compare(networkx.to_dict_of_dicts(extracted_dag),
            networkx.to_dict_of_dicts(expected_dag))
def test_frame_merge_sorted():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'merge') works if the sort option is set to True
    """
    # The line layout of the embedded pipeline is significant: the CodeReference
    # values below address df_a, df_b and the merge on lines 3, 4 and 5.
    test_code = cleandoc("""
        import pandas as pd

        df_a = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})
        df_b = pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})
        df_merged = df_a.merge(df_b, on='B', sort=True)
        df_expected = pd.DataFrame({'A': [5, 8, 4, 2], 'B': [1, 2, 4, 5], 'C': [1, 11, 5, None]})
        pd.testing.assert_frame_equal(df_merged, df_expected)
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(5)])
    # Drop the fourth extracted node; only the two sources and the join are modelled below.
    # NOTE(review): presumably this is the df_expected DataFrame built by the pipeline's own check.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[3])

    expected_dag = networkx.DiGraph()
    expected_a = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B']),
        OptionalCodeInfo(
            CodeReference(3, 7, 3, 65),
            "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})"))
    expected_b = DagNode(
        1,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['B', 'C']),
        OptionalCodeInfo(
            CodeReference(4, 7, 4, 69),
            "pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})"))
    expected_join = DagNode(
        2,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(5, 12, 5, 47), "df_a.merge(df_b, on='B', sort=True)"))
    expected_dag.add_edge(expected_a, expected_join)
    expected_dag.add_edge(expected_b, expected_join)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    # Row lineage of the join output: each row carries one id from each input.
    inspection_results_join = inspector_result.dag_node_to_inspection_results[expected_join]
    lineage_output = inspection_results_join[RowLineage(5)]
    expected_lineage_df = DataFrame(
        [[5, 1, 1., {LineageId(0, 4), LineageId(1, 0)}],
         [8, 2, 11., {LineageId(0, 3), LineageId(1, 3)}],
         [4, 4, 5., {LineageId(0, 2), LineageId(1, 1)}],
         [2, 5, math.nan, {LineageId(0, 1), LineageId(1, 4)}]],
        columns=['A', 'B', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True))
def test_statsmodels_add_constant():
    """
    Tests whether the monkey patching of ('statsmodel.api', 'add_constant') works
    """
    # The line layout is significant: the CodeReference values below expect the
    # random call on line 4 and add_constant on line 5 (no blank line after the imports).
    test_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        test = np.random.random(100)
        test = sm.add_constant(test)
        assert len(test) == 100
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])

    expected_dag = networkx.DiGraph()
    expected_random = DagNode(
        0,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
        DagNodeDetails('random', ['array']),
        OptionalCodeInfo(CodeReference(4, 7, 4, 28), "np.random.random(100)"))
    # NOTE(review): 'statsmodel.api' (without the trailing 's') is kept verbatim; it
    # presumably mirrors the FunctionInfo emitted by the patch - confirm before changing.
    expected_constant = DagNode(
        1,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.PROJECTION_MODIFY,
                        FunctionInfo('statsmodel.api', 'add_constant')),
        DagNodeDetails('Adds const column', ['array']),
        OptionalCodeInfo(CodeReference(5, 7, 5, 28), "sm.add_constant(test)"))
    expected_dag.add_edge(expected_random, expected_constant)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    # The data values are random, so they are compared against 0.5 with atol=1;
    # the lineage ids are what is effectively being checked.
    inspection_results_random = inspector_result.dag_node_to_inspection_results[expected_random]
    lineage_output = inspection_results_random[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0.5, {LineageId(0, 0)}],
         [0.5, {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True),
        atol=1)

    inspection_results_constant = inspector_result.dag_node_to_inspection_results[
        expected_constant]
    lineage_output = inspection_results_constant[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[numpy.array([0.5, 1.]), {LineageId(0, 0)}],
         [numpy.array([0.5, 1.]), {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True),
        atol=1)
def test_frame__setitem__():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__setitem__') works
    """
    # The layout of the embedded pipeline is significant: the DataFrame literal must
    # span lines 3-6 with a 14-space continuation indent (CodeReference(3, 12, 6, 53)),
    # and the assignment must sit on line 7.
    test_code = cleandoc("""
        import pandas as pd

        pandas_df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [1, 2, 3, 4, 5, 6],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        pandas_df['baz'] = pandas_df['baz'] + 1
        df_expected = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [2, 3, 4, 5, 6, 7],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        pd.testing.assert_frame_equal(pandas_df, df_expected)
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])
    # Drop the fourth extracted node; it is not part of the expected DAG built below.
    # NOTE(review): presumably the df_expected DataFrame of the pipeline's own check.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[3])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(
            CodeReference(3, 12, 6, 53),
            "pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],\n"
            "              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],\n"
            "              'baz': [1, 2, 3, 4, 5, 6],\n"
            "              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})"))
    expected_project = DagNode(
        1,
        BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['baz']", ['baz']),
        OptionalCodeInfo(CodeReference(7, 19, 7, 35), "pandas_df['baz']"))
    expected_dag.add_edge(expected_data_source, expected_project)
    expected_project_modify = DagNode(
        2,
        BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION_MODIFY,
                        FunctionInfo('pandas.core.frame', '__setitem__')),
        DagNodeDetails("modifies ['baz']", ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(CodeReference(7, 0, 7, 39), "pandas_df['baz'] = pandas_df['baz'] + 1"))
    expected_dag.add_edge(expected_data_source, expected_project_modify)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    # Lineage of the modified frame: first two rows, with 'baz' already incremented.
    inspection_results_modify = inspector_result.dag_node_to_inspection_results[
        expected_project_modify]
    lineage_output = inspection_results_modify[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [['one', 'A', 2, 'x', {LineageId(0, 0)}],
         ['one', 'B', 3, 'y', {LineageId(0, 1)}]],
        columns=['foo', 'bar', 'baz', 'zoo', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True))
def test_frame__getitem__selection():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works for filtering
    """
    # The line layout is significant: the CodeReference values below expect the
    # DataFrame on line 3 and the selection on line 4.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})
        df_selection = df[df['A'] > 3]
        df_expected = pd.DataFrame({'A': [4, 8, 5], 'B': [4, 11, None]})
        pd.testing.assert_frame_equal(df_selection.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])
    # Drop the fourth extracted node; it is not part of the expected DAG built below.
    # NOTE(review): presumably the df_expected DataFrame of the pipeline's own check.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[3])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B']),
        OptionalCodeInfo(
            CodeReference(3, 5, 3, 67),
            "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})"))
    # The inner df['A'] used to build the boolean mask is a projection of its own.
    expected_projection = DagNode(
        1,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['A']", ['A']),
        OptionalCodeInfo(CodeReference(4, 18, 4, 25), "df['A']"))
    expected_dag.add_edge(expected_data_source, expected_projection)
    expected_selection = DagNode(
        2,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("Select by Series: df[df['A'] > 3]", ['A', 'B']),
        OptionalCodeInfo(CodeReference(4, 15, 4, 30), "df[df['A'] > 3]"))
    expected_dag.add_edge(expected_data_source, expected_selection)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    # The surviving rows keep the lineage ids of their source rows (2 and 3).
    inspection_results_selection = inspector_result.dag_node_to_inspection_results[
        expected_selection]
    lineage_output = inspection_results_selection[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[4, 4., {LineageId(0, 2)}],
         [8, 11., {LineageId(0, 3)}]],
        columns=['A', 'B', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True))
def test_my_word_to_vec_transformer():
    """
    Tests whether the monkey patching of ('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer') works
    """
    # The line layout is significant: the CodeReference values below expect the
    # DataFrame on line 5 and the transformer construction on line 6.
    test_code = cleandoc("""
        import pandas as pd
        from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer
        import numpy as np

        df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
        word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
        encoded_data = word_to_vec.fit_transform(df)
        assert encoded_data.shape == (4, 2)
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code,
        track_code_references=True,
        inspections=[RowLineage(3)],
        custom_monkey_patching=[custom_monkeypatching])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(
            CodeReference(5, 5, 5, 62),
            "pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})"))
    expected_estimator = DagNode(
        1,
        BasicCodeLocation("<string-source>", 6),
        OperatorContext(
            OperatorType.TRANSFORMER,
            FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')),
        DagNodeDetails('Word2Vec', ['array']),
        OptionalCodeInfo(CodeReference(6, 14, 6, 62),
                         'MyW2VTransformer(min_count=2, size=2, workers=1)'))
    expected_dag.add_edge(expected_data_source, expected_estimator)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    inspection_results_estimator = inspector_result.dag_node_to_inspection_results[
        expected_estimator]
    lineage_output = inspection_results_estimator[RowLineage(3)]
    # Only the lineage column is compared below; the zero vectors are placeholders.
    expected_lineage_df = DataFrame(
        [[numpy.array([0.0, 0.0, 0.0]), {LineageId(0, 0)}],
         [numpy.array([0.0, 0.0, 0.0]), {LineageId(0, 1)}],
         [numpy.array([0.0, 0.0, 0.0]), {LineageId(0, 2)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_series_equal(
        lineage_output["mlinspect_lineage"],
        expected_lineage_df["mlinspect_lineage"])
    # NOTE(review): this inspects the *expected* frame built just above, so it can never
    # fail; it was presumably meant to check lineage_output (whose vectors would have
    # size=2 elements) - confirm the intended shape before changing it.
    assert expected_lineage_df.iloc[0, 0].shape == (3, )
def test_groupby_agg():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'groupby') and ('pandas.core.groupby.generic', 'agg')
    works.
    """
    # The line layout is significant: the CodeReference values below expect the
    # DataFrame on line 3 and the groupby/agg chain on line 4.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})
        df_groupby_agg = df.groupby('group').agg(mean_value=('value', 'mean'))
        df_expected = pd.DataFrame({'group': ['A', 'B', 'C'], 'mean_value': [1, 3, 3]})
        pd.testing.assert_frame_equal(df_groupby_agg.reset_index(drop=False), df_expected.reset_index(drop=True))
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])
    # Drop the third extracted node; it is not part of the expected DAG built below.
    # NOTE(review): presumably the df_expected DataFrame of the pipeline's own check.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[2])

    expected_dag = networkx.DiGraph()
    expected_data = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['group', 'value']),
        OptionalCodeInfo(
            CodeReference(3, 5, 3, 81),
            "pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], "
            "'value': [1, 2, 1, 3, 4]})"))
    expected_groupby_agg = DagNode(
        1,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.GROUP_BY_AGG,
                        FunctionInfo('pandas.core.groupby.generic', 'agg')),
        DagNodeDetails(
            "Groupby 'group', Aggregate: '{'mean_value': ('value', 'mean')}'",
            ['group', 'mean_value']),
        OptionalCodeInfo(
            CodeReference(4, 17, 4, 70),
            "df.groupby('group').agg(mean_value=('value', 'mean'))"))
    expected_dag.add_edge(expected_data, expected_groupby_agg)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    # The expected lineage ids carry operator id 1 (the aggregation node), not 0.
    inspection_results_groupby_agg = inspector_result.dag_node_to_inspection_results[
        expected_groupby_agg]
    lineage_output = inspection_results_groupby_agg[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [["A", 1, {LineageId(1, 0)}],
         ['B', 3, {LineageId(1, 1)}]],
        columns=['group', 'mean_value', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True))
def test_frame__getitem__frame():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works for multiple string arguments
    """
    # The layout of the embedded pipeline is significant: the DataFrame call spans
    # lines 3-4 (CodeReference(3, 5, 4, 28)). Line 3 ends with a space before the
    # newline and line 4 is indented by four spaces, matching the expected source
    # string below. The trailing space is written as \x20 so that editors which
    # strip trailing whitespace cannot silently remove it.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]],\x20
            columns=['A', 'B', 'C'])
        df_projection = df[['A', 'C']]
        df_expected = pd.DataFrame([[0, 2], [1, 3], [4, 2], [9, 3], [6, 2], [1, 3]], columns=['A', 'C'])
        pd.testing.assert_frame_equal(df_projection, df_expected)
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])
    # Drop the third extracted node; it is not part of the expected DAG built below.
    # NOTE(review): presumably the df_expected DataFrame of the pipeline's own check.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[2])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B', 'C']),
        OptionalCodeInfo(
            CodeReference(3, 5, 4, 28),
            "pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], "
            "[9, 2, 3], [6, 1, 2], [1, 2, 3]], \n"
            "    columns=['A', 'B', 'C'])"))
    expected_project = DagNode(
        1,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['A', 'C']", ['A', 'C']),
        OptionalCodeInfo(CodeReference(5, 16, 5, 30), "df[['A', 'C']]"))
    expected_dag.add_edge(expected_data_source, expected_project)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    inspection_results_project = inspector_result.dag_node_to_inspection_results[
        expected_project]
    lineage_output = inspection_results_project[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0, 2, {LineageId(0, 0)}],
         [1, 3, {LineageId(0, 1)}]],
        columns=['A', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True))
def test_frame_replace():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'replace') works
    """
    # The line layout is significant: the CodeReference values below expect the
    # DataFrame on line 3 and the replace call on line 4.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])
        df_replace = df.replace('Medium', 'Low')
        df_expected = pd.DataFrame(['Low', 'Low', 'Low', 'High', None], columns=['A'])
        pd.testing.assert_frame_equal(df_replace.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])
    # Drop the third extracted node; it is not part of the expected DAG built below.
    # NOTE(review): presumably the df_expected DataFrame of the pipeline's own check.
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[2])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(
            CodeReference(3, 5, 3, 72),
            "pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], "
            "columns=['A'])"))
    expected_modify = DagNode(
        1,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.PROJECTION_MODIFY,
                        FunctionInfo('pandas.core.frame', 'replace')),
        DagNodeDetails("Replace 'Medium' with 'Low'", ['A']),
        OptionalCodeInfo(CodeReference(4, 13, 4, 40), "df.replace('Medium', 'Low')"))
    expected_dag.add_edge(expected_data_source, expected_modify)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    # Row 1 originally held 'Medium'; after the replace both sampled rows read 'Low'.
    inspection_results_modify = inspector_result.dag_node_to_inspection_results[
        expected_modify]
    lineage_output = inspection_results_modify[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [['Low', {LineageId(0, 0)}],
         ['Low', {LineageId(0, 1)}]],
        columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True))
def get_expected_check_result_merge():
    """
    Expected result for the code snippet in test_no_bias_introduced_for_merge

    Builds the failing NoBiasIntroducedForResult for column 'A', keyed by the
    join DAG node.
    """
    # NOTE(review): a second function with this exact name is defined later in this
    # file (it builds a SimilarRemovalProbabilitiesForResult). If both live in the same
    # module, the later definition replaces this one at import time.
    join_node = DagNode(
        2,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(5, 12, 5, 36), "df_a.merge(df_b, on='B')"))

    # The 'cat_b' ratio falls from 0.4 to 0.25; this value is also reported as the
    # overall change of the node.
    cat_b_relative_change = (.25 - 0.4) / 0.4
    distribution_table = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_b', 'cat_c'],
        'count_before': [2, 2, 1],
        'count_after': [2, 1, 1],
        'ratio_before': [0.4, 0.4, 0.2],
        'ratio_after': [0.5, 0.25, 0.25],
        'relative_ratio_change': [(0.5 - 0.4) / 0.4, cat_b_relative_change, (0.25 - 0.2) / 0.2],
    })
    distribution_change = BiasDistributionChange(
        join_node, False, cat_b_relative_change, distribution_table)
    changes_by_node = {join_node: {'A': distribution_change}}

    message = ('A Join causes a min_relative_ratio_change of \'A\' by -0.37500000000000006, a value below the '
               'configured minimum threshold -0.3!')
    return NoBiasIntroducedForResult(
        NoBiasIntroducedFor(['A']), CheckStatus.FAILURE, message, changes_by_node)
def test_black_box_operation():
    """
    Tests whether the monkey patching of pandas function works

    The input DataFrame comes from black_box_df_op, so the extracted DAG is
    expected to contain a MISSING_OP warning node (id -1) feeding the dropna selection.
    """
    # The line layout is significant: the CodeReference values below expect
    # the dropna call on line 5.
    test_code = cleandoc("""
        import pandas
        from mlinspect.testing._testing_helper_utils import black_box_df_op

        df = black_box_df_op()
        df = df.dropna()
        print("df")
        """)

    extracted_dag = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True).dag

    expected_dag = networkx.DiGraph()
    # The placeholder node shares the code location and reference of the first
    # supported operator that consumed the untracked DataFrame.
    expected_missing_op = DagNode(
        -1,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.MISSING_OP, None),
        DagNodeDetails(
            'Warning! Operator <string-source>:5 (df.dropna()) encountered a '
            'DataFrame resulting from an operation without mlinspect support!',
            ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    expected_select = DagNode(
        0,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails('dropna', ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    expected_dag.add_edge(expected_missing_op, expected_select)
    compare(networkx.to_dict_of_dicts(extracted_dag),
            networkx.to_dict_of_dicts(expected_dag))
def test_no_missing_embeddings():
    """
    Tests whether NoMissingEmbeddings works for a Word2Vec transformer (MyW2VTransformer)
    """
    # The line layout is significant: the CodeReference values below expect
    # the transformer construction on line 5.
    test_code = cleandoc("""
        import pandas as pd
        from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer

        df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
        word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
        encoded_data = word_to_vec.fit_transform(df)
        """)

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_string(test_code)
        .add_check(NoMissingEmbeddings())
        .add_custom_monkey_patching_module(custom_monkeypatching)
        .execute())

    check_result = inspector_result.check_to_check_results[NoMissingEmbeddings()]

    transformer_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(
            OperatorType.TRANSFORMER,
            FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')),
        DagNodeDetails('Word2Vec: fit_transform', ['array']),
        OptionalCodeInfo(CodeReference(5, 14, 5, 62),
                         'MyW2VTransformer(min_count=2, size=2, workers=1)'))
    # 'cat_b' and 'cat_c' each occur once in the input while min_count=2.
    expected_failed_dag_node_with_result = {
        transformer_node: MissingEmbeddingsInfo(2, ['cat_b', 'cat_c'])}
    # NOTE(review): the check is registered as NoMissingEmbeddings() but expected as
    # NoMissingEmbeddings(10); presumably 10 is the constructor default - confirm.
    expected_result = NoMissingEmbeddingsResult(
        NoMissingEmbeddings(10),
        CheckStatus.FAILURE,
        'Missing embeddings were found!',
        expected_failed_dag_node_with_result)
    compare(check_result, expected_result)
def test_numpy_random():
    """
    Tests whether the monkey patching of ('numpy.random', 'random') works
    """
    # The line layout is significant: the CodeReference values below expect
    # the random call on line 3.
    test_code = cleandoc("""
        import numpy as np
        np.random.seed(42)
        test = np.random.random(100)
        assert len(test) == 100
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])

    extracted_node: DagNode = list(inspector_result.dag.nodes)[0]
    expected_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
        DagNodeDetails('random', ['array']),
        OptionalCodeInfo(CodeReference(3, 7, 3, 28), "np.random.random(100)"))
    compare(extracted_node, expected_node)

    # The data values are random, so they are compared against 0.5 with atol=1;
    # the lineage ids are what is effectively being checked.
    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[
        extracted_node]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0.5, {LineageId(0, 0)}],
         [0.5, {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True),
        atol=1)
def get_expected_check_result_simple_imputer():
    """
    Expected result for the code snippet in test_no_bias_introduced_for_simple_imputer

    Builds the passing NoBiasIntroducedForResult for column 'A', keyed by the
    SimpleImputer DAG node.
    """
    # NOTE(review): a second function with this exact name is defined later in this
    # file (it builds a SimilarRemovalProbabilitiesForResult). If both live in the same
    # module, the later definition replaces this one at import time.
    transformer_context = OperatorContext(
        OperatorType.TRANSFORMER, FunctionInfo('sklearn.impute._base', 'SimpleImputer'))
    code_info = OptionalCodeInfo(
        CodeReference(6, 10, 6, 72),
        "SimpleImputer(missing_values=np.nan, strategy='most_frequent')")
    imputer_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 6),
        transformer_context,
        DagNodeDetails('Simple Imputer', ['A']),
        code_info)

    # The single NaN row is counted under 'cat_a' after the imputer runs.
    distribution_table = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_c', math.nan],
        'count_before': [2, 1, 1],
        'count_after': [3, 1, 0],
        'ratio_before': [0.5, 0.25, 0.25],
        'ratio_after': [0.75, 0.25, 0.],
        'relative_ratio_change': [0.5, 0., -1.],
    })
    distribution_change = BiasDistributionChange(imputer_node, True, 0., distribution_table)
    changes_by_node = {imputer_node: {'A': distribution_change}}

    return NoBiasIntroducedForResult(
        NoBiasIntroducedFor(['A']), CheckStatus.SUCCESS, None, changes_by_node)
def get_expected_check_result_simple_imputer():
    """
    Expected result for the code snippet in test_no_bias_introduced_for_simple_imputer

    Builds the passing SimilarRemovalProbabilitiesForResult for column 'A',
    keyed by the SimpleImputer DAG node.
    """
    # NOTE(review): an earlier function with this exact name exists in this file (it
    # builds a NoBiasIntroducedForResult). If both live in the same module, this
    # definition shadows the earlier one.
    transformer_context = OperatorContext(
        OperatorType.TRANSFORMER, FunctionInfo('sklearn.impute._base', 'SimpleImputer'))
    code_info = OptionalCodeInfo(
        CodeReference(6, 10, 6, 72),
        "SimpleImputer(missing_values=np.nan, strategy='most_frequent')")
    imputer_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 6),
        transformer_context,
        DagNodeDetails('Simple Imputer: fit_transform', ['A']),
        code_info)

    # -1 removed records for 'cat_a': its count rises from 2 to 3.
    removal_table = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_c', math.nan],
        'count_before': [2, 1, 1],
        'count_after': [3, 1, 0],
        'removed_records': [-1, 0, 1],
        'removal_probability': [0., 0., 1.],
        'normalized_removal_probability': [0., 0., 1.],
    })
    removal_probabilities = RemovalProbabilities(imputer_node, True, 0., removal_table)
    changes_by_node = {imputer_node: {'A': removal_probabilities}}

    # A passing check carries no failure message.
    return SimilarRemovalProbabilitiesForResult(
        SimilarRemovalProbabilitiesFor(['A']), CheckStatus.SUCCESS, None, changes_by_node)
def get_expected_check_result_dropna():
    """
    Expected result for the code snippet in test_no_bias_introduced_for_dropna

    Builds the failing SimilarRemovalProbabilitiesForResult for column 'A',
    keyed by the dropna selection DAG node.
    """
    selection_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails("dropna", ['A', 'B']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), "df.dropna()"))

    # Both 'cat_a' rows are removed (probability 1.0) versus one of three 'cat_c' rows.
    removal_table = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_c'],
        'count_before': [2, 3],
        'count_after': [0, 2],
        'removed_records': [2, 1],
        'removal_probability': [1., 1. / 3.],
        'normalized_removal_probability': [3., 1.],
    })
    removal_probabilities = RemovalProbabilities(selection_node, False, 3., removal_table)
    changes_by_node = {selection_node: {'A': removal_probabilities}}

    message = ("A Selection causes a max_probability_difference of 'A' by 3.0, a value above the configured "
               "maximum threshold 2.0!")
    return SimilarRemovalProbabilitiesForResult(
        SimilarRemovalProbabilitiesFor(['A']), CheckStatus.FAILURE, message, changes_by_node)
def test_frame__init__():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'DataFrame') works
    """
    # The line layout is significant: the CodeReference values below expect
    # the DataFrame construction on line 3.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 1, 2], columns=['A'])
        assert len(df) == 3
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])

    extracted_node: DagNode = list(inspector_result.dag.nodes)[0]
    expected_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 43), "pd.DataFrame([0, 1, 2], columns=['A'])"))
    compare(extracted_node, expected_node)

    # RowLineage(2) is expected to materialise the first two rows with their lineage ids.
    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[
        extracted_node]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[0, {LineageId(0, 0)}],
         [1, {LineageId(0, 1)}]],
        columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True))
def get_expected_check_result_merge():
    """
    Expected result for the code snippet in test_no_bias_introduced_for_merge

    Builds the passing SimilarRemovalProbabilitiesForResult for column 'A',
    keyed by the join DAG node.
    """
    # NOTE(review): an earlier function with this exact name exists in this file (it
    # builds a NoBiasIntroducedForResult). If both live in the same module, this
    # definition shadows the earlier one.
    join_node = DagNode(
        2,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(5, 12, 5, 36), "df_a.merge(df_b, on='B')"))

    # Only 'cat_b' loses a record (one of two, i.e. probability 0.5).
    removal_table = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_b', 'cat_c'],
        'count_before': [2, 2, 1],
        'count_after': [2, 1, 1],
        'removed_records': [0, 1, 0],
        'removal_probability': [0., 0.5, 0.],
        'normalized_removal_probability': [0., 1., 0.],
    })
    removal_probabilities = RemovalProbabilities(join_node, True, 1., removal_table)
    changes_by_node = {join_node: {'A': removal_probabilities}}

    # A passing check carries no failure message.
    return SimilarRemovalProbabilitiesForResult(
        SimilarRemovalProbabilitiesFor(['A']), CheckStatus.SUCCESS, None, changes_by_node)
def test_read_csv():
    """
    Tests whether the monkey patching of ('pandas.io.parsers', 'read_csv') works
    """
    # The line layout is significant: the CodeReference values below expect
    # the read_csv call on line 6.
    test_code = cleandoc("""
        import os
        import pandas as pd
        from mlinspect.utils import get_project_root

        train_file = os.path.join(str(get_project_root()), "example_pipelines", "adult_complex", "adult_train.csv")
        raw_data = pd.read_csv(train_file, na_values='?', index_col=0)
        assert len(raw_data) == 22792
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code, track_code_references=True, inspections=[RowLineage(2)])

    # Column names are shared by the expected node details and the expected lineage frame.
    data_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income-per-year'
    ]

    extracted_node: DagNode = list(inspector_result.dag.nodes)[0]
    # The description is matched by pattern (anything ending in ".csv") rather than
    # by an exact string.
    expected_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 6),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.io.parsers', 'read_csv')),
        DagNodeDetails(StringComparison(r".*\.csv"), list(data_columns)),
        OptionalCodeInfo(
            CodeReference(6, 11, 6, 62),
            "pd.read_csv(train_file, na_values='?', index_col=0)"))
    compare(extracted_node, expected_node)

    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[
        extracted_node]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame(
        [[
            46, 'Private', 128645, 'Some-college', 10, 'Divorced',
            'Prof-specialty', 'Not-in-family', 'White', 'Female', 0, 0, 40,
            'United-States', '<=50K', {LineageId(0, 0)}
        ], [
            29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married',
            'Handlers-cleaners', 'Not-in-family', 'White', 'Male', 0, 0, 50,
            'United-States', '<=50K', {LineageId(0, 1)}
        ]],
        columns=data_columns + ['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_output.reset_index(drop=True),
        expected_lineage_df.reset_index(drop=True))
def test_get_rdataset():
    """
    Runs a pipeline that loads the "Guerry" dataset via sm.datasets.get_rdataset and checks that
    the first DAG node is the expected ('statsmodels.datasets', 'get_rdataset') data source and
    that RowLineage(2) on that node yields the two expected rows.
    """
    guerry_columns = [
        'dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop', 'Literacy', 'Donations',
        'Infants', 'Suicides', 'MainCity', 'Wealth', 'Commerce', 'Clergy', 'Crime_parents',
        'Infanticide', 'Donation_clergy', 'Lottery', 'Desertion', 'Instruction', 'Prostitutes',
        'Distance', 'Area', 'Pop1831'
    ]
    test_code = cleandoc("""
        import statsmodels.api as sm

        dat = sm.datasets.get_rdataset("Guerry", "HistData").data
        assert len(dat) == 86
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code,
        track_code_references=True,
        inspections=[RowLineage(2)])

    extracted_node: DagNode = list(inspector_result.dag.nodes)[0]
    dataset_description = 'Data from A.-M. Guerry, "Essay on the Moral Statistics of France"'
    expected_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(
            OperatorType.DATA_SOURCE,
            FunctionInfo('statsmodels.datasets', 'get_rdataset')),
        DagNodeDetails(dataset_description, guerry_columns),
        OptionalCodeInfo(
            CodeReference(3, 6, 3, 52),
            'sm.datasets.get_rdataset("Guerry", "HistData")'))
    compare(extracted_node, expected_node)

    node_results = inspector_result.dag_node_to_inspection_results[extracted_node]
    lineage_output = node_results[RowLineage(2)]

    ain_row = [
        1, 'E', 'Ain', 28870, 15890, 37, 5098, 33120, 35039, '2:Med', 73, 58, 11, 71, 60, 69,
        41, 55, 46, 13, 218.372, 5762, 346.03,
        {LineageId(0, 0)}
    ]
    aisne_row = [
        2, 'N', 'Aisne', 26226, 5521, 51, 8901, 14572, 12831, '2:Med', 22, 10, 82, 4, 82, 36,
        38, 82, 24, 327, 65.945, 7369, 513.0,
        {LineageId(0, 1)}
    ]
    expected_lineage_df = DataFrame(
        [ain_row, aisne_row],
        columns=guerry_columns + ['mlinspect_lineage'])

    # Indexes are reset on both sides so that only the row contents are compared.
    actual = lineage_output.reset_index(drop=True)
    expected = expected_lineage_df.reset_index(drop=True)
    pandas.testing.assert_frame_equal(actual, expected)
def test_frame_dropna():
    """
    Runs a pipeline that builds a five-row frame and calls dropna on it, then checks that the
    extracted DAG is "DataFrame -> dropna" and that RowLineage(2) on the dropna node yields the
    two expected rows.
    """
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 2, 4, 5, None], columns=['A'])
        assert len(df) == 5
        df = df.dropna()
        assert len(df) == 4
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code,
        track_code_references=True,
        inspections=[RowLineage(2)])

    source_context = OperatorContext(
        OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame'))
    expected_data_source = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        source_context,
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(
            CodeReference(3, 5, 3, 52),
            "pd.DataFrame([0, 2, 4, 5, None], columns=['A'])"))

    selection_context = OperatorContext(
        OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna'))
    expected_select = DagNode(
        1,
        BasicCodeLocation("<string-source>", 5),
        selection_context,
        DagNodeDetails('dropna', ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(expected_data_source, expected_select)

    actual_edges = networkx.to_dict_of_dicts(inspector_result.dag)
    expected_edges = networkx.to_dict_of_dicts(expected_dag)
    compare(actual_edges, expected_edges)

    # The lineage result is looked up with the expected node as the key.
    select_results = inspector_result.dag_node_to_inspection_results[expected_select]
    lineage_output = select_results[RowLineage(2)]

    expected_rows = [
        [0., {LineageId(0, 0)}],
        [2., {LineageId(0, 1)}],
    ]
    expected_lineage_df = DataFrame(expected_rows, columns=['A', 'mlinspect_lineage'])

    actual = lineage_output.reset_index(drop=True)
    expected = expected_lineage_df.reset_index(drop=True)
    pandas.testing.assert_frame_equal(actual, expected)
def test_removal_probab_dropna():
    """
    Runs SimilarRemovalProbabilitiesFor(['A']) on a pipeline with a single dropna call and
    checks the check result, the overview data frame derived from it, and that both histogram
    plotting helpers can be called on the result.
    """
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'A': ['cat_a', 'cat_a', 'cat_c', 'cat_c', 'cat_c'],
                           'B': [None, None, 1, 2, None]})
        df = df.dropna()
        """)

    inspector_builder = PipelineInspector.on_pipeline_from_string(test_code)
    inspector_builder = inspector_builder.add_check(SimilarRemovalProbabilitiesFor(['A']))
    inspector_result = inspector_builder.execute()

    check_result = inspector_result.check_to_check_results[
        SimilarRemovalProbabilitiesFor(['A'])]
    compare(check_result, get_expected_check_result_dropna())

    overview = SimilarRemovalProbabilitiesFor.get_removal_probabilities_overview_as_df(
        check_result)
    threshold_column = "'A' probability difference below the configured maximum test threshold"
    expected_df = pandas.DataFrame({
        'operator_type': [OperatorType.SELECTION],
        'description': ['dropna'],
        'code_reference': [CodeReference(5, 5, 5, 16)],
        'source_code': ['df.dropna()'],
        'function_info': [FunctionInfo('pandas.core.frame', 'dropna')],
        threshold_column: [True]
    })
    pandas.testing.assert_frame_equal(overview, expected_df)

    # Disable plt.show when executing nb as part of this test
    matplotlib.use("template")

    # Both plots are drawn from the 'A' entry of the first recorded removal-probability change.
    first_change = list(check_result.removal_probability_change.values())[0]
    change_for_a = first_change['A']
    SimilarRemovalProbabilitiesFor.plot_removal_probability_histograms(change_for_a)
    SimilarRemovalProbabilitiesFor.plot_distribution_change_histograms(change_for_a)
def visit_operator(self, inspection_input) -> Iterable[any]:
    """
    Consume all rows of one operator and yield None for each of them.

    For a unary operator whose function info is the healthcare MyW2VTransformer, the operator
    is additionally marked as an embedding operator: each row whose first output has no truthy
    entry increments the missing-embedding counter, and its first input is kept as an example
    until example_threshold examples have been collected.
    """
    # pylint: disable=too-many-branches, too-many-statements
    is_w2v_transformer = isinstance(inspection_input, InspectionInputUnaryOperator) and \
        inspection_input.operator_context.function_info == \
        FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')

    if not is_w2v_transformer:
        # Nothing to record for other operators, but the rows still have to be consumed.
        for _ in inspection_input.row_iterator:
            yield None
        return

    # TODO: Are there existing word embedding transformers for sklearn we can use this for?
    self._is_embedding_operator = True
    for row in inspection_input.row_iterator:
        embedding = row.output[0]
        if not embedding.any():
            # An embedding without any truthy entry counts as a missing embedding.
            self._missing_embedding_count += 1
            has_room_for_example = \
                len(self._missing_embeddings_examples) < self.example_threshold
            if has_room_for_example:
                self._missing_embeddings_examples.append(row.input[0])
        yield None
def get_expected_dag_adult_easy(caller_filename: str, line_offset: int = 0, with_code_references=True):
    """
    Build the DAG that is expected for the adult_easy pipeline.

    caller_filename is used as the file of every node's code location. line_offset is added to
    every line number, because the line numbers differ slightly between the .py file and the
    .ipynb file. If with_code_references is falsy, optional_code_info of every node is set to
    None before the graph is returned.
    """
    # pylint: disable=too-many-locals

    def shifted(line_number):
        # Line number inside the pipeline file after applying the caller's offset.
        return line_number + line_offset

    def location(line_number):
        return BasicCodeLocation(caller_filename, shifted(line_number))

    def adult_columns():
        # A fresh list per node, so no two nodes share the same list object.
        return [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
            'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
            'hours-per-week', 'native-country', 'income-per-year'
        ]

    # NOTE(review): this text is compared with the pipeline source between lines 18 and 21, so
    # it has to match character for character, including the leading whitespace of each
    # continuation line - confirm against the pipeline file.
    pipeline_str = (
        "compose.ColumnTransformer(transformers=[\n"
        " ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), "
        "['education', 'workclass']),\n"
        " ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])\n"
        "])")

    def column_transformer_function():
        return FunctionInfo('sklearn.compose._column_transformer', 'ColumnTransformer')

    def column_transformer_code():
        return OptionalCodeInfo(CodeReference(shifted(18), 25, shifted(21), 2), pipeline_str)

    def classifier_function():
        return FunctionInfo('sklearn.tree._classes', 'DecisionTreeClassifier')

    def classifier_code():
        return OptionalCodeInfo(
            CodeReference(shifted(26), 19, shifted(26), 48),
            'tree.DecisionTreeClassifier()')

    expected_graph = networkx.DiGraph()

    # Data loading and row filtering.
    expected_data_source = DagNode(
        0,
        location(12),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.io.parsers', 'read_csv')),
        DagNodeDetails('adult_train.csv', adult_columns()),
        OptionalCodeInfo(
            CodeReference(shifted(12), 11, shifted(12), 62),
            "pd.read_csv(train_file, na_values='?', index_col=0)"))
    expected_graph.add_node(expected_data_source)

    expected_select = DagNode(
        1,
        location(14),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails('dropna', adult_columns()),
        OptionalCodeInfo(
            CodeReference(shifted(14), 7, shifted(14), 24),
            'raw_data.dropna()'))
    expected_graph.add_edge(expected_data_source, expected_select)

    # Feature branch: two projections feeding one transformer each, then a concatenation.
    expected_pipeline_project_one = DagNode(
        4,
        location(18),
        OperatorContext(OperatorType.PROJECTION, column_transformer_function()),
        DagNodeDetails("to ['education', 'workclass']", ['education', 'workclass']),
        column_transformer_code())
    expected_graph.add_edge(expected_select, expected_pipeline_project_one)

    expected_pipeline_project_two = DagNode(
        6,
        location(18),
        OperatorContext(OperatorType.PROJECTION, column_transformer_function()),
        DagNodeDetails("to ['age', 'hours-per-week']", ['age', 'hours-per-week']),
        column_transformer_code())
    expected_graph.add_edge(expected_select, expected_pipeline_project_two)

    expected_pipeline_transformer_one = DagNode(
        5,
        location(19),
        OperatorContext(
            OperatorType.TRANSFORMER,
            FunctionInfo('sklearn.preprocessing._encoders', 'OneHotEncoder')),
        DagNodeDetails('One-Hot Encoder: fit_transform', ['array']),
        OptionalCodeInfo(
            CodeReference(shifted(19), 20, shifted(19), 72),
            "preprocessing.OneHotEncoder(handle_unknown='ignore')"))
    expected_pipeline_transformer_two = DagNode(
        7,
        location(20),
        OperatorContext(
            OperatorType.TRANSFORMER,
            FunctionInfo('sklearn.preprocessing._data', 'StandardScaler')),
        DagNodeDetails('Standard Scaler: fit_transform', ['array']),
        OptionalCodeInfo(
            CodeReference(shifted(20), 16, shifted(20), 46),
            'preprocessing.StandardScaler()'))
    expected_graph.add_edge(expected_pipeline_project_one, expected_pipeline_transformer_one)
    expected_graph.add_edge(expected_pipeline_project_two, expected_pipeline_transformer_two)

    expected_pipeline_concatenation = DagNode(
        8,
        location(18),
        OperatorContext(OperatorType.CONCATENATION, column_transformer_function()),
        DagNodeDetails(None, ['array']),
        column_transformer_code())
    expected_graph.add_edge(expected_pipeline_transformer_one, expected_pipeline_concatenation)
    expected_graph.add_edge(expected_pipeline_transformer_two, expected_pipeline_concatenation)

    expected_train_data = DagNode(
        9,
        location(26),
        OperatorContext(OperatorType.TRAIN_DATA, classifier_function()),
        DagNodeDetails(None, ['array']),
        classifier_code())
    expected_graph.add_edge(expected_pipeline_concatenation, expected_train_data)

    # Label branch: project the target column and binarize it.
    expected_project = DagNode(
        2,
        location(16),
        OperatorContext(
            OperatorType.PROJECTION,
            FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['income-per-year']", ['income-per-year']),
        OptionalCodeInfo(
            CodeReference(shifted(16), 38, shifted(16), 61),
            "data['income-per-year']"))
    expected_graph.add_edge(expected_select, expected_project)

    expected_project_modify = DagNode(
        3,
        location(16),
        OperatorContext(
            OperatorType.PROJECTION_MODIFY,
            FunctionInfo('sklearn.preprocessing._label', 'label_binarize')),
        DagNodeDetails("label_binarize, classes: ['>50K', '<=50K']", ['array']),
        OptionalCodeInfo(
            CodeReference(shifted(16), 9, shifted(16), 89),
            "preprocessing.label_binarize(data['income-per-year'], "
            "classes=['>50K', '<=50K'])"))
    expected_graph.add_edge(expected_project, expected_project_modify)

    expected_train_labels = DagNode(
        10,
        location(26),
        OperatorContext(OperatorType.TRAIN_LABELS, classifier_function()),
        DagNodeDetails(None, ['array']),
        classifier_code())
    expected_graph.add_edge(expected_project_modify, expected_train_labels)

    # Both branches end in the estimator.
    expected_estimator = DagNode(
        11,
        location(26),
        OperatorContext(OperatorType.ESTIMATOR, classifier_function()),
        DagNodeDetails('Decision Tree', []),
        classifier_code())
    expected_graph.add_edge(expected_train_data, expected_estimator)
    expected_graph.add_edge(expected_train_labels, expected_estimator)

    if with_code_references:
        return expected_graph

    for dag_node in expected_graph.nodes:
        dag_node.optional_code_info = None
    return expected_graph
def test_ols_fit():
    """
    Runs a pipeline that fits sm.OLS(y, X) and checks the part of the DAG that remains after
    the unrelated nodes are removed (train data and train labels feeding the estimator), as well
    as the RowLineage(3) output of each of those three nodes.
    """
    test_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        nobs = 100
        X = np.random.random((nobs, 2))
        X = sm.add_constant(X)
        beta = [1, .1, .5]
        e = np.random.random(nobs)
        y = np.dot(X, beta) + e
        results = sm.OLS(y, X).fit()
        assert results.summary() is not None
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code,
        track_code_references=True,
        inspections=[RowLineage(3)])

    # Drop the first four nodes, then the node that is second among the remaining ones, so that
    # only the nodes compared below are left.
    leading_nodes = list(inspector_result.dag.nodes)[0:4]
    inspector_result.dag.remove_nodes_from(leading_nodes)
    second_remaining_node = list(inspector_result.dag.nodes)[1]
    inspector_result.dag.remove_node(second_remaining_node)

    def ols_node(node_id, operator_type, details):
        # All three expected nodes share location, function info and code reference.
        return DagNode(
            node_id,
            BasicCodeLocation("<string-source>", 10),
            OperatorContext(operator_type, FunctionInfo('statsmodel.api.OLS', 'fit')),
            details,
            OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))

    expected_train_data = ols_node(3, OperatorType.TRAIN_DATA, DagNodeDetails(None, ['array']))
    expected_train_labels = ols_node(4, OperatorType.TRAIN_LABELS, DagNodeDetails(None, ['array']))
    expected_ols = ols_node(5, OperatorType.ESTIMATOR, DagNodeDetails('Decision Tree', []))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(expected_train_data, expected_ols)
    expected_dag.add_edge(expected_train_labels, expected_ols)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    def lineage_of(dag_node):
        # RowLineage(3) output recorded for dag_node, with a fresh index for comparison.
        node_results = inspector_result.dag_node_to_inspection_results[dag_node]
        return node_results[RowLineage(3)].reset_index(drop=True)

    # Train data: the first three feature rows.
    expected_lineage_df = DataFrame(
        [
            [numpy.array([1.0, 0.3745401188473625, 0.9507143064099162]), {LineageId(3, 0)}],
            [numpy.array([1.0, 0.7319939418114051, 0.5986584841970366]), {LineageId(3, 1)}],
            [numpy.array([1.0, 0.15601864044243652, 0.15599452033620265]), {LineageId(3, 2)}],
        ],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_of(expected_train_data),
        expected_lineage_df.reset_index(drop=True),
        atol=0.1)

    # Train labels: the first three target values.
    expected_lineage_df = DataFrame(
        [
            [2.154842811243982, {LineageId(5, 0)}],
            [1.4566686012747074, {LineageId(5, 1)}],
            [1.2552278383069588, {LineageId(5, 2)}],
        ],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_of(expected_train_labels),
        expected_lineage_df.reset_index(drop=True),
        atol=0.1)

    # Estimator: each row combines the lineage ids of both inputs.
    expected_lineage_df = DataFrame(
        [
            [{LineageId(5, 0), LineageId(3, 0)}],
            [{LineageId(5, 1), LineageId(3, 1)}],
            [{LineageId(5, 2), LineageId(3, 2)}],
        ],
        columns=['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        lineage_of(expected_ols),
        expected_lineage_df.reset_index(drop=True),
        check_column_type=False)