コード例 #1
0
def test_func_defs_and_loops():
    """
    Tests whether the DAG extracted from the pipeline returned by
    get_test_code_with_function_def_and_for_loop() is a chain of one DataFrame
    data source followed by two dropna selections
    """
    pipeline_code = get_test_code_with_function_def_and_for_loop()
    run_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code, track_code_references=True)
    actual_dag = run_result.dag

    source_node = DagNode(
        0, BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE,
                        FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(4, 9, 4, 44),
                         "pd.DataFrame([0, 1], columns=['A'])"))
    # Both selections are expected at the same code location (line 8) and
    # differ only in their node id.
    first_dropna, second_dropna = [
        DagNode(
            node_id, BasicCodeLocation("<string-source>", 8),
            OperatorContext(OperatorType.SELECTION,
                            FunctionInfo('pandas.core.frame', 'dropna')),
            DagNodeDetails('dropna', ['A']),
            OptionalCodeInfo(CodeReference(8, 9, 8, 20), 'df.dropna()'))
        for node_id in (1, 2)]

    # Expected shape: data source -> dropna -> dropna.
    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, first_dropna)
    expected_dag.add_edge(first_dropna, second_dropna)

    compare(networkx.to_dict_of_dicts(actual_dag),
            networkx.to_dict_of_dicts(expected_dag))
コード例 #2
0
def test_frame_merge_sorted():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'merge') works when sort=True is passed
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df_a = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})
        df_b = pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})
        df_merged = df_a.merge(df_b, on='B', sort=True)
        df_expected = pd.DataFrame({'A': [5, 8, 4, 2], 'B': [1, 2, 4, 5], 'C': [1, 11, 5, None]})
        pd.testing.assert_frame_equal(df_merged, df_expected)
        """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(5)])
    extracted_dag = inspector_result.dag
    # The fourth extracted node (index 3) is dropped before the comparison;
    # presumably it is the df_expected data source -- TODO confirm.
    extracted_dag.remove_node(list(extracted_dag.nodes)[3])

    left_source = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B']),
        OptionalCodeInfo(CodeReference(3, 7, 3, 65),
                         "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})"))
    right_source = DagNode(
        1,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['B', 'C']),
        OptionalCodeInfo(CodeReference(4, 7, 4, 69),
                         "pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})"))
    join_node = DagNode(
        2,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(5, 12, 5, 47), "df_a.merge(df_b, on='B', sort=True)"))

    # Both data sources feed the join.
    expected_dag = networkx.DiGraph()
    expected_dag.add_edges_from([(left_source, join_node), (right_source, join_node)])
    compare(networkx.to_dict_of_dicts(extracted_dag), networkx.to_dict_of_dicts(expected_dag))

    # Row lineage recorded for the join node: each row carries one id per input.
    join_lineage = inspector_result.dag_node_to_inspection_results[join_node][RowLineage(5)]
    expected_rows = [
        [5, 1, 1., {LineageId(0, 4), LineageId(1, 0)}],
        [8, 2, 11., {LineageId(0, 3), LineageId(1, 3)}],
        [4, 4, 5., {LineageId(0, 2), LineageId(1, 1)}],
        [2, 5, math.nan, {LineageId(0, 1), LineageId(1, 4)}],
    ]
    expected_lineage = DataFrame(expected_rows, columns=['A', 'B', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(join_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
コード例 #3
0
def test_statsmodels_add_constant():
    """
    Tests whether the monkey patching of ('statsmodel.api', 'add_constant') works
    """
    pipeline_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        test = np.random.random(100)
        test = sm.add_constant(test)
        assert len(test) == 100
        """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])

    random_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
        DagNodeDetails('random', ['array']),
        OptionalCodeInfo(CodeReference(4, 7, 4, 28), "np.random.random(100)"))
    constant_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.PROJECTION_MODIFY, FunctionInfo('statsmodel.api', 'add_constant')),
        DagNodeDetails('Adds const column', ['array']),
        OptionalCodeInfo(CodeReference(5, 7, 5, 28), "sm.add_constant(test)"))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(random_node, constant_node)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    node_results = inspector_result.dag_node_to_inspection_results

    # Lineage of the random data source. atol=1 keeps the numeric comparison
    # loose: the 0.5 entries are only compared approximately.
    random_lineage = node_results[random_node][RowLineage(2)]
    expected_random_lineage = DataFrame(
        [[0.5, {LineageId(0, 0)}],
         [0.5, {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(random_lineage.reset_index(drop=True),
                                      expected_random_lineage.reset_index(drop=True),
                                      atol=1)

    # Lineage after add_constant: each 'array' cell is expected to be a
    # two-element array, again compared with atol=1.
    constant_lineage = node_results[constant_node][RowLineage(2)]
    expected_constant_lineage = DataFrame(
        [[numpy.array([0.5, 1.]), {LineageId(0, 0)}],
         [numpy.array([0.5, 1.]), {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(constant_lineage.reset_index(drop=True),
                                      expected_constant_lineage.reset_index(drop=True),
                                      atol=1)
コード例 #4
0
def test_frame__setitem__():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__setitem__') works
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                pandas_df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                              'baz': [1, 2, 3, 4, 5, 6],
                              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
                pandas_df['baz'] = pandas_df['baz'] + 1
                df_expected = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                              'baz': [2, 3, 4, 5, 6, 7],
                              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
                pd.testing.assert_frame_equal(pandas_df, df_expected)
                """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])
    extracted_dag = inspector_result.dag
    # The fourth extracted node (index 3) is dropped before the comparison;
    # presumably it is the df_expected data source -- TODO confirm.
    extracted_dag.remove_node(list(extracted_dag.nodes)[3])

    # Source text of the multi-line DataFrame construction (lines 3-6).
    source_code_text = ("pd.DataFrame({'foo': ['one', 'one', 'one', 'two', "
                        "'two', 'two'],\n"
                        "              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],\n"
                        "              'baz': [1, 2, 3, 4, 5, 6],\n"
                        "              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})")
    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(CodeReference(3, 12, 6, 53), source_code_text))
    projection_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['baz']", ['baz']),
        OptionalCodeInfo(CodeReference(7, 19, 7, 35), "pandas_df['baz']"))
    setitem_node = DagNode(
        2,
        BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION_MODIFY, FunctionInfo('pandas.core.frame', '__setitem__')),
        DagNodeDetails("modifies ['baz']", ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(CodeReference(7, 0, 7, 39), "pandas_df['baz'] = pandas_df['baz'] + 1"))

    # Both the projection and the __setitem__ node hang off the data source.
    expected_dag = networkx.DiGraph()
    expected_dag.add_edges_from([(source_node, projection_node), (source_node, setitem_node)])
    compare(networkx.to_dict_of_dicts(extracted_dag), networkx.to_dict_of_dicts(expected_dag))

    setitem_lineage = inspector_result.dag_node_to_inspection_results[setitem_node][RowLineage(2)]
    expected_lineage = DataFrame(
        [['one', 'A', 2, 'x', {LineageId(0, 0)}],
         ['one', 'B', 3, 'y', {LineageId(0, 1)}]],
        columns=['foo', 'bar', 'baz', 'zoo', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(setitem_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
コード例 #5
0
def test_frame__getitem__selection():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works when filtering rows with a
    boolean Series
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                df = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})
                df_selection = df[df['A'] > 3]
                df_expected = pd.DataFrame({'A': [4, 8, 5], 'B': [4, 11, None]})
                pd.testing.assert_frame_equal(df_selection.reset_index(drop=True), df_expected.reset_index(drop=True))
                """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])
    extracted_dag = inspector_result.dag
    # The fourth extracted node (index 3) is dropped before the comparison;
    # presumably it is the df_expected data source -- TODO confirm.
    extracted_dag.remove_node(list(extracted_dag.nodes)[3])

    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 67),
                         "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})"))
    projection_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['A']", ['A']),
        OptionalCodeInfo(CodeReference(4, 18, 4, 25), "df['A']"))
    selection_node = DagNode(
        2,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("Select by Series: df[df['A'] > 3]", ['A', 'B']),
        OptionalCodeInfo(CodeReference(4, 15, 4, 30), "df[df['A'] > 3]"))

    # Projection and selection are both children of the data source.
    expected_dag = networkx.DiGraph()
    expected_dag.add_edges_from([(source_node, projection_node), (source_node, selection_node)])
    compare(networkx.to_dict_of_dicts(extracted_dag), networkx.to_dict_of_dicts(expected_dag))

    selection_lineage = inspector_result.dag_node_to_inspection_results[selection_node][RowLineage(2)]
    expected_lineage = DataFrame(
        [[4, 4., {LineageId(0, 2)}],
         [8, 11., {LineageId(0, 3)}]],
        columns=['A', 'B', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(selection_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
コード例 #6
0
def test_my_word_to_vec_transformer():
    """
    Tests whether the monkey patching of ('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer') works
    when the patch module is supplied via custom_monkey_patching
    """
    pipeline_code = cleandoc("""
                import pandas as pd
                from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer
                import numpy as np

                df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
                word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
                encoded_data = word_to_vec.fit_transform(df)
                assert encoded_data.shape == (4, 2)
                """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(3)],
        custom_monkey_patching=[custom_monkeypatching])

    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 62),
                         "pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})"))
    transformer_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 6),
        OperatorContext(OperatorType.TRANSFORMER,
                        FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')),
        DagNodeDetails('Word2Vec', ['array']),
        OptionalCodeInfo(CodeReference(6, 14, 6, 62),
                         'MyW2VTransformer(min_count=2, size=2, workers=1)'))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, transformer_node)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    transformer_lineage = inspector_result.dag_node_to_inspection_results[transformer_node][RowLineage(3)]
    expected_lineage = DataFrame(
        [[numpy.array([0.0, 0.0, 0.0]), {LineageId(0, row_index)}] for row_index in range(3)],
        columns=['array', 'mlinspect_lineage'])
    # Only the lineage column is compared against the pipeline output; the
    # 'array' column of the expected frame holds zero placeholders.
    pandas.testing.assert_series_equal(transformer_lineage["mlinspect_lineage"],
                                       expected_lineage["mlinspect_lineage"])
    # NOTE(review): this assertion inspects the expected frame built above,
    # not the pipeline's lineage output, so it cannot fail on pipeline
    # behaviour -- confirm whether the lineage output was meant here.
    assert expected_lineage.iloc[0, 0].shape == (3, )
コード例 #7
0
def test_groupby_agg():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'groupby') and ('pandas.core.groupby.generic', 'agg')
    works.
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})
        df_groupby_agg = df.groupby('group').agg(mean_value=('value', 'mean'))
        
        df_expected = pd.DataFrame({'group': ['A', 'B', 'C'], 'mean_value': [1, 3, 3]})
        pd.testing.assert_frame_equal(df_groupby_agg.reset_index(drop=False), df_expected.reset_index(drop=True))
        """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])
    extracted_dag = inspector_result.dag
    # The third extracted node (index 2) is dropped before the comparison;
    # presumably it is the df_expected data source -- TODO confirm.
    extracted_dag.remove_node(list(extracted_dag.nodes)[2])

    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['group', 'value']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 81),
                         "pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], "
                         "'value': [1, 2, 1, 3, 4]})"))
    # groupby and agg are expected to show up as a single GROUP_BY_AGG node.
    groupby_agg_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.GROUP_BY_AGG, FunctionInfo('pandas.core.groupby.generic', 'agg')),
        DagNodeDetails("Groupby 'group', Aggregate: '{'mean_value': ('value', 'mean')}'",
                       ['group', 'mean_value']),
        OptionalCodeInfo(CodeReference(4, 17, 4, 70),
                         "df.groupby('group').agg(mean_value=('value', 'mean'))"))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, groupby_agg_node)
    compare(networkx.to_dict_of_dicts(extracted_dag), networkx.to_dict_of_dicts(expected_dag))

    groupby_lineage = inspector_result.dag_node_to_inspection_results[groupby_agg_node][RowLineage(2)]
    expected_lineage = DataFrame(
        [["A", 1, {LineageId(1, 0)}],
         ['B', 3, {LineageId(1, 1)}]],
        columns=['group', 'mean_value', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(groupby_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
コード例 #8
0
def test_frame__getitem__frame():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works when a list of column names
    is passed
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                df = pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]], 
                    columns=['A', 'B', 'C'])
                df_projection = df[['A', 'C']]
                df_expected = pd.DataFrame([[0, 2], [1, 3], [4, 2], [9, 3], [6, 2], [1, 3]], columns=['A', 'C'])
                pd.testing.assert_frame_equal(df_projection, df_expected)
                """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])
    extracted_dag = inspector_result.dag
    # The third extracted node (index 2) is dropped before the comparison;
    # presumably it is the df_expected data source -- TODO confirm.
    extracted_dag.remove_node(list(extracted_dag.nodes)[2])

    # Source text of the two-line DataFrame construction (lines 3-4),
    # including the trailing space before the line break.
    source_code_text = ("pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], "
                        "[9, 2, 3], [6, 1, 2], [1, 2, 3]], \n"
                        "    columns=['A', 'B', 'C'])")
    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(3, 5, 4, 28), source_code_text))
    projection_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['A', 'C']", ['A', 'C']),
        OptionalCodeInfo(CodeReference(5, 16, 5, 30), "df[['A', 'C']]"))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, projection_node)
    compare(networkx.to_dict_of_dicts(extracted_dag), networkx.to_dict_of_dicts(expected_dag))

    projection_lineage = inspector_result.dag_node_to_inspection_results[projection_node][RowLineage(2)]
    expected_lineage = DataFrame(
        [[0, 2, {LineageId(0, 0)}],
         [1, 3, {LineageId(0, 1)}]],
        columns=['A', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(projection_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
コード例 #9
0
def test_frame_replace():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'replace') works
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])
        df_replace = df.replace('Medium', 'Low')
        df_expected = pd.DataFrame(['Low', 'Low', 'Low', 'High', None], columns=['A'])
        pd.testing.assert_frame_equal(df_replace.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])
    extracted_dag = inspector_result.dag
    # The third extracted node (index 2) is dropped before the comparison;
    # presumably it is the df_expected data source -- TODO confirm.
    extracted_dag.remove_node(list(extracted_dag.nodes)[2])

    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(CodeReference(3, 5, 3, 72),
                         "pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], "
                         "columns=['A'])"))
    replace_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 4),
        OperatorContext(OperatorType.PROJECTION_MODIFY, FunctionInfo('pandas.core.frame', 'replace')),
        DagNodeDetails("Replace 'Medium' with 'Low'", ['A']),
        OptionalCodeInfo(CodeReference(4, 13, 4, 40), "df.replace('Medium', 'Low')"))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, replace_node)
    compare(networkx.to_dict_of_dicts(extracted_dag), networkx.to_dict_of_dicts(expected_dag))

    replace_lineage = inspector_result.dag_node_to_inspection_results[replace_node][RowLineage(2)]
    expected_lineage = DataFrame(
        [['Low', {LineageId(0, 0)}],
         ['Low', {LineageId(0, 1)}]],
        columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(replace_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
コード例 #10
0
def get_expected_check_result_merge():
    """ Builds the failing NoBiasIntroducedForResult expected for the code snippet in
    test_no_bias_introduced_for_merge"""
    join_node = DagNode(
        2,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(CodeReference(5, 12, 5, 36), "df_a.merge(df_b, on='B')"))

    # The 'cat_b' relative change is both a table entry and the value handed
    # to BiasDistributionChange below.
    cat_b_relative_change = (.25 - 0.4) / 0.4
    distribution_table = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_b', 'cat_c'],
        'count_before': [2, 2, 1],
        'count_after': [2, 1, 1],
        'ratio_before': [0.4, 0.4, 0.2],
        'ratio_after': [0.5, 0.25, 0.25],
        'relative_ratio_change': [(0.5 - 0.4) / 0.4,
                                  cat_b_relative_change,
                                  (0.25 - 0.2) / 0.2],
    })
    column_a_change = BiasDistributionChange(join_node, False, cat_b_relative_change, distribution_table)

    message = ('A Join causes a min_relative_ratio_change of \'A\' by -0.37500000000000006, a value below the '
               'configured minimum threshold -0.3!')
    return NoBiasIntroducedForResult(NoBiasIntroducedFor(['A']),
                                     CheckStatus.FAILURE,
                                     message,
                                     {join_node: {'A': column_a_change}})
コード例 #11
0
def test_black_box_operation():
    """
    Tests whether a DataFrame produced by black_box_df_op() is represented by a MISSING_OP node (id -1) in front
    of the dropna selection that consumes it
    """
    pipeline_code = cleandoc("""
        import pandas
        from mlinspect.testing._testing_helper_utils import black_box_df_op
        
        df = black_box_df_op()
        df = df.dropna()
        print("df")
        """)
    run_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code, track_code_references=True)
    actual_dag = run_result.dag

    # The placeholder node carries the warning text and points at the dropna
    # call on line 5, the same location as the selection node.
    missing_op_node = DagNode(
        -1,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.MISSING_OP, None),
        DagNodeDetails('Warning! Operator <string-source>:5 (df.dropna()) encountered a '
                       'DataFrame resulting from an operation without mlinspect support!',
                       ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    selection_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails('dropna', ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(missing_op_node, selection_node)
    compare(networkx.to_dict_of_dicts(actual_dag), networkx.to_dict_of_dicts(expected_dag))
コード例 #12
0
def test_no_missing_embeddings():
    """
    Tests whether the NoMissingEmbeddings check reports a failure for a pipeline that fits a MyW2VTransformer
    """
    pipeline_code = cleandoc("""
            import pandas as pd
            from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer

            df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
            word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
            encoded_data = word_to_vec.fit_transform(df)
            """)

    inspector_result = (PipelineInspector
                        .on_pipeline_from_string(pipeline_code)
                        .add_check(NoMissingEmbeddings())
                        .add_custom_monkey_patching_module(custom_monkeypatching)
                        .execute())
    actual_result = inspector_result.check_to_check_results[NoMissingEmbeddings()]

    transformer_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(OperatorType.TRANSFORMER,
                        FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')),
        DagNodeDetails('Word2Vec: fit_transform', ['array']),
        OptionalCodeInfo(CodeReference(5, 14, 5, 62), 'MyW2VTransformer(min_count=2, size=2, workers=1)'))
    failed_nodes = {transformer_node: MissingEmbeddingsInfo(2, ['cat_b', 'cat_c'])}
    # NOTE(review): the check is registered as NoMissingEmbeddings() but the
    # expected result uses NoMissingEmbeddings(10); presumably 10 is the
    # default argument -- confirm.
    expected_result = NoMissingEmbeddingsResult(NoMissingEmbeddings(10),
                                                CheckStatus.FAILURE,
                                                'Missing embeddings were found!',
                                                failed_nodes)
    compare(actual_result, expected_result)
コード例 #13
0
def test_numpy_random():
    """
    Tests whether the monkey patching of ('numpy.random', 'random') works
    """
    pipeline_code = cleandoc("""
        import numpy as np
        np.random.seed(42)
        test = np.random.random(100)
        assert len(test) == 100
        """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])
    # Only the first extracted node is checked.
    actual_node: DagNode = list(inspector_result.dag.nodes)[0]

    random_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
        DagNodeDetails('random', ['array']),
        OptionalCodeInfo(CodeReference(3, 7, 3, 28), "np.random.random(100)"))
    compare(actual_node, random_node)

    random_lineage = inspector_result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    expected_lineage = DataFrame(
        [[0.5, {LineageId(0, 0)}],
         [0.5, {LineageId(0, 1)}]],
        columns=['array', 'mlinspect_lineage'])
    # atol=1 keeps the numeric comparison loose: the 0.5 entries are only
    # compared approximately.
    pandas.testing.assert_frame_equal(random_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True),
                                      atol=1)
コード例 #14
0
def get_expected_check_result_simple_imputer():
    """ Builds the successful NoBiasIntroducedForResult expected for the code snippet in
    test_no_bias_introduced_for_simple_imputer"""
    # NOTE(review): a function with this same name is defined again further
    # down in this file; the later definition replaces this one at import time.
    imputer_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 6),
        OperatorContext(OperatorType.TRANSFORMER, FunctionInfo('sklearn.impute._base', 'SimpleImputer')),
        DagNodeDetails('Simple Imputer', ['A']),
        OptionalCodeInfo(CodeReference(6, 10, 6, 72),
                         "SimpleImputer(missing_values=np.nan, strategy='most_frequent')"))

    distribution_table = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_c', math.nan],
        'count_before': [2, 1, 1],
        'count_after': [3, 1, 0],
        'ratio_before': [0.5, 0.25, 0.25],
        'ratio_after': [0.75, 0.25, 0.],
        'relative_ratio_change': [0.5, 0., -1.],
    })
    column_a_change = BiasDistributionChange(imputer_node, True, 0., distribution_table)

    return NoBiasIntroducedForResult(NoBiasIntroducedFor(['A']),
                                     CheckStatus.SUCCESS,
                                     None,
                                     {imputer_node: {'A': column_a_change}})
def get_expected_check_result_simple_imputer():
    """ Builds the successful SimilarRemovalProbabilitiesForResult expected for a SimpleImputer node"""
    # NOTE(review): the previous docstring named
    # test_no_bias_introduced_for_simple_imputer as the consumer, although a
    # SimilarRemovalProbabilitiesForResult is built here -- confirm which
    # test uses this helper.
    # NOTE(review): this definition shadows an earlier function of the same
    # name in this file.
    imputer_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 6),
        OperatorContext(OperatorType.TRANSFORMER, FunctionInfo('sklearn.impute._base', 'SimpleImputer')),
        DagNodeDetails('Simple Imputer: fit_transform', ['A']),
        OptionalCodeInfo(CodeReference(6, 10, 6, 72),
                         "SimpleImputer(missing_values=np.nan, strategy='most_frequent')"))

    removal_table = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_c', math.nan],
        'count_before': [2, 1, 1],
        'count_after': [3, 1, 0],
        'removed_records': [-1, 0, 1],
        'removal_probability': [0., 0., 1.],
        'normalized_removal_probability': [0., 0., 1.],
    })
    column_a_probabilities = RemovalProbabilities(imputer_node, True, 0., removal_table)

    # No failure message is attached to the successful result.
    return SimilarRemovalProbabilitiesForResult(SimilarRemovalProbabilitiesFor(['A']),
                                                CheckStatus.SUCCESS,
                                                None,
                                                {imputer_node: {'A': column_a_probabilities}})
def get_expected_check_result_dropna():
    """
    Build the expected SimilarRemovalProbabilitiesFor result for the dropna snippet.

    Returns a FAILURE result: the expected probability difference for column
    'A' is 3.0, which the failure message reports as above the threshold 2.0.
    """
    selection_node = DagNode(
        1,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(
            OperatorType.SELECTION,
            FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails("dropna", ['A', 'B']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), "df.dropna()"))

    # Expected per-value statistics of column 'A' around the dropna.
    removal_stats = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_c'],
        'count_before': [2, 3],
        'count_after': [0, 2],
        'removed_records': [2, 1],
        'removal_probability': [1., 1. / 3.],
        'normalized_removal_probability': [3., 1.]
    })
    probabilities_per_column = {
        'A': RemovalProbabilities(selection_node, False, 3., removal_stats)
    }
    threshold_exceeded_message = (
        "A Selection causes a max_probability_difference of 'A' by 3.0, "
        "a value above the configured maximum threshold 2.0!")
    return SimilarRemovalProbabilitiesForResult(
        SimilarRemovalProbabilitiesFor(['A']),
        CheckStatus.FAILURE,
        threshold_exceeded_message,
        {selection_node: probabilities_per_column})
コード例 #17
0
def test_frame__init__():
    """
    Check the monkey patching of ('pandas.core.frame', 'DataFrame').

    Runs a pipeline that builds a three-row DataFrame, compares the first
    extracted DAG node with the expected data source node and verifies the
    rows reported by the ``RowLineage(2)`` inspection for that node.
    """
    lineage_rows = 2
    pipeline_source = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 1, 2], columns=['A'])
        assert len(df) == 3
        """)

    result = _pipeline_executor.singleton.run(
        python_code=pipeline_source,
        track_code_references=True,
        inspections=[RowLineage(lineage_rows)])
    actual_node: DagNode = list(result.dag.nodes)[0]

    data_source_context = OperatorContext(
        OperatorType.DATA_SOURCE,
        FunctionInfo('pandas.core.frame', 'DataFrame'))
    data_source_code = OptionalCodeInfo(
        CodeReference(3, 5, 3, 43),
        "pd.DataFrame([0, 1, 2], columns=['A'])")
    wanted_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        data_source_context,
        DagNodeDetails(None, ['A']),
        data_source_code)
    compare(actual_node, wanted_node)

    # Lineage annotations materialized for the data source node.
    node_results = result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(lineage_rows)]
    wanted_lineage = DataFrame(
        [
            [0, {LineageId(0, 0)}],
            [1, {LineageId(0, 1)}],
        ],
        columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        actual_lineage.reset_index(drop=True),
        wanted_lineage.reset_index(drop=True))
def get_expected_check_result_merge():
    """
    Build the expected SimilarRemovalProbabilitiesFor result for the merge snippet.

    Returns a SUCCESS result (no failure message) that maps the join DAG node
    and the sensitive column 'A' to the expected removal probabilities.
    """
    join_node = DagNode(
        2,
        BasicCodeLocation('<string-source>', 5),
        OperatorContext(
            OperatorType.JOIN,
            FunctionInfo('pandas.core.frame', 'merge')),
        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
        OptionalCodeInfo(
            CodeReference(5, 12, 5, 36),
            "df_a.merge(df_b, on='B')"))

    # Expected per-value statistics of column 'A' around the join.
    removal_stats = DataFrame({
        'sensitive_column_value': ['cat_a', 'cat_b', 'cat_c'],
        'count_before': [2, 2, 1],
        'count_after': [2, 1, 1],
        'removed_records': [0, 1, 0],
        'removal_probability': [0., 0.5, 0.],
        'normalized_removal_probability': [0., 1., 0.]
    })
    probabilities_per_column = {
        'A': RemovalProbabilities(join_node, True, 1., removal_stats)
    }
    return SimilarRemovalProbabilitiesForResult(
        SimilarRemovalProbabilitiesFor(['A']),
        CheckStatus.SUCCESS,
        None,
        {join_node: probabilities_per_column})
コード例 #19
0
def test_read_csv():
    """
    Check the monkey patching of ('pandas.io.parsers', 'read_csv').

    Runs a pipeline that loads the adult training data, compares the first
    extracted DAG node with the expected data source node and verifies the
    rows reported by the ``RowLineage(2)`` inspection for that node.
    """
    lineage_rows = 2
    adult_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income-per-year'
    ]
    pipeline_source = cleandoc("""
        import os
        import pandas as pd
        from mlinspect.utils import get_project_root
        
        train_file = os.path.join(str(get_project_root()), "example_pipelines", "adult_complex", "adult_train.csv")
        raw_data = pd.read_csv(train_file, na_values='?', index_col=0)
        assert len(raw_data) == 22792
        """)

    result = _pipeline_executor.singleton.run(
        python_code=pipeline_source,
        track_code_references=True,
        inspections=[RowLineage(lineage_rows)])
    actual_node: DagNode = list(result.dag.nodes)[0]

    wanted_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 6),
        OperatorContext(
            OperatorType.DATA_SOURCE,
            FunctionInfo('pandas.io.parsers', 'read_csv')),
        # The file path is only matched against a pattern, not compared literally.
        DagNodeDetails(StringComparison(r".*\.csv"), list(adult_columns)),
        OptionalCodeInfo(
            CodeReference(6, 11, 6, 62),
            "pd.read_csv(train_file, na_values='?', index_col=0)"))
    compare(actual_node, wanted_node)

    node_results = result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(lineage_rows)]

    first_row = [
        46, 'Private', 128645, 'Some-college', 10, 'Divorced',
        'Prof-specialty', 'Not-in-family', 'White', 'Female', 0, 0, 40,
        'United-States', '<=50K', {LineageId(0, 0)}
    ]
    second_row = [
        29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married',
        'Handlers-cleaners', 'Not-in-family', 'White', 'Male', 0, 0, 50,
        'United-States', '<=50K', {LineageId(0, 1)}
    ]
    wanted_lineage = DataFrame(
        [first_row, second_row],
        columns=adult_columns + ['mlinspect_lineage'])

    pandas.testing.assert_frame_equal(
        actual_lineage.reset_index(drop=True),
        wanted_lineage.reset_index(drop=True))
コード例 #20
0
def test_get_rdataset():
    """
    Check the monkey patching of ('statsmodels.datasets', 'get_rdataset').

    Runs a pipeline that loads the "Guerry" dataset, compares the first
    extracted DAG node with the expected data source node and verifies the
    rows reported by the ``RowLineage(2)`` inspection for that node.
    """
    lineage_rows = 2
    guerry_columns = [
        'dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop',
        'Literacy', 'Donations', 'Infants', 'Suicides', 'MainCity',
        'Wealth', 'Commerce', 'Clergy', 'Crime_parents', 'Infanticide',
        'Donation_clergy', 'Lottery', 'Desertion', 'Instruction',
        'Prostitutes', 'Distance', 'Area', 'Pop1831'
    ]
    pipeline_source = cleandoc("""
        import statsmodels.api as sm

        dat = sm.datasets.get_rdataset("Guerry", "HistData").data
        assert len(dat) == 86
        """)

    result = _pipeline_executor.singleton.run(
        python_code=pipeline_source,
        track_code_references=True,
        inspections=[RowLineage(lineage_rows)])
    actual_node: DagNode = list(result.dag.nodes)[0]

    dataset_title = 'Data from A.-M. Guerry, "Essay on the Moral Statistics of France"'
    wanted_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(
            OperatorType.DATA_SOURCE,
            FunctionInfo('statsmodels.datasets', 'get_rdataset')),
        DagNodeDetails(dataset_title, list(guerry_columns)),
        OptionalCodeInfo(
            CodeReference(3, 6, 3, 52),
            'sm.datasets.get_rdataset("Guerry", "HistData")'))
    compare(actual_node, wanted_node)

    node_results = result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(lineage_rows)]

    first_row = [
        1, 'E', 'Ain', 28870, 15890, 37, 5098, 33120, 35039, '2:Med', 73,
        58, 11, 71, 60, 69, 41, 55, 46, 13, 218.372, 5762, 346.03,
        {LineageId(0, 0)}
    ]
    second_row = [
        2, 'N', 'Aisne', 26226, 5521, 51, 8901, 14572, 12831, '2:Med', 22,
        10, 82, 4, 82, 36, 38, 82, 24, 327, 65.945, 7369, 513.0,
        {LineageId(0, 1)}
    ]
    wanted_lineage = DataFrame(
        [first_row, second_row],
        columns=guerry_columns + ['mlinspect_lineage'])

    pandas.testing.assert_frame_equal(
        actual_lineage.reset_index(drop=True),
        wanted_lineage.reset_index(drop=True))
コード例 #21
0
def test_frame_dropna():
    """
    Check the monkey patching of ('pandas.core.frame', 'dropna').

    Runs a pipeline that creates a DataFrame and drops its missing value,
    compares the extracted DAG with the expected data source -> selection
    graph and verifies the rows reported by the ``RowLineage(2)`` inspection
    for the selection node.
    """
    lineage_rows = 2
    pipeline_source = cleandoc("""
        import pandas as pd
        
        df = pd.DataFrame([0, 2, 4, 5, None], columns=['A'])
        assert len(df) == 5
        df = df.dropna()
        assert len(df) == 4
        """)
    result = _pipeline_executor.singleton.run(
        python_code=pipeline_source,
        track_code_references=True,
        inspections=[RowLineage(lineage_rows)])

    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(
            OperatorType.DATA_SOURCE,
            FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(
            CodeReference(3, 5, 3, 52),
            "pd.DataFrame([0, 2, 4, 5, None], columns=['A'])"))
    selection_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 5),
        OperatorContext(
            OperatorType.SELECTION,
            FunctionInfo('pandas.core.frame', 'dropna')),
        DagNodeDetails('dropna', ['A']),
        OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, selection_node)
    compare(networkx.to_dict_of_dicts(result.dag),
            networkx.to_dict_of_dicts(wanted_dag))

    # Lineage annotations materialized for the dropna output.
    selection_results = result.dag_node_to_inspection_results[selection_node]
    actual_lineage = selection_results[RowLineage(lineage_rows)]
    wanted_lineage = DataFrame(
        [
            [0., {LineageId(0, 0)}],
            [2., {LineageId(0, 1)}],
        ],
        columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(
        actual_lineage.reset_index(drop=True),
        wanted_lineage.reset_index(drop=True))
def test_removal_probab_dropna():
    """
    Tests whether SimilarRemovalProbabilitiesFor works for dropna

    The check result is compared with get_expected_check_result_dropna(),
    the overview DataFrame derived from it is verified, and both histogram
    plotting helpers are called on the removal probabilities of column 'A'.
    """
    test_code = cleandoc("""
            import pandas as pd

            df = pd.DataFrame({'A': ['cat_a', 'cat_a', 'cat_c', 'cat_c', 'cat_c'], 
                               'B': [None, None, 1, 2, None]})
            df = df.dropna()
            """)

    inspector_result = PipelineInspector \
        .on_pipeline_from_string(test_code) \
        .add_check(SimilarRemovalProbabilitiesFor(['A'])) \
        .execute()

    check_result = inspector_result.check_to_check_results[
        SimilarRemovalProbabilitiesFor(['A'])]
    expected_result = get_expected_check_result_dropna()
    compare(check_result, expected_result)

    overview = SimilarRemovalProbabilitiesFor.get_removal_probabilities_overview_as_df(
        check_result)
    expected_df = pandas.DataFrame({
        'operator_type': [OperatorType.SELECTION],
        'description': ['dropna'],
        'code_reference': [CodeReference(5, 5, 5, 16)],
        'source_code': ['df.dropna()'],
        'function_info': [FunctionInfo('pandas.core.frame', 'dropna')],
        # The expected check result is a FAILURE whose probability difference
        # (3.0) lies above the threshold (2.0), so the "below threshold" flag
        # reported in the overview has to be False.
        "'A' probability difference below the configured maximum test threshold":
        [False]
    })
    pandas.testing.assert_frame_equal(overview, expected_df)

    matplotlib.use(
        "template")  # Disable plt.show when executing nb as part of this test
    # Both plotting helpers receive the column 'A' entry of the first DAG node.
    removal_probabilities = list(
        check_result.removal_probability_change.values())[0]['A']
    SimilarRemovalProbabilitiesFor.plot_removal_probability_histograms(
        removal_probabilities)
    SimilarRemovalProbabilitiesFor.plot_distribution_change_histograms(
        removal_probabilities)
コード例 #23
0
 def visit_operator(self, inspection_input) -> Iterable[any]:
     """
     Visit an operator and yield one annotation (always ``None``) per row.

     For the unary ``MyW2VTransformer`` operator the rows are additionally
     scanned for missing embeddings: a row counts as missing when no element
     of ``row.output[0]`` is truthy. Such rows increment
     ``self._missing_embedding_count`` and, while fewer than
     ``self.example_threshold`` examples are stored, their ``row.input[0]``
     is appended to ``self._missing_embeddings_examples``.
     """
     # pylint: disable=too-many-branches, too-many-statements
     if isinstance(inspection_input, InspectionInputUnaryOperator) and \
             inspection_input.operator_context.function_info == \
             FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer'):
         # TODO: Are there existing word embedding transformers for sklearn we can use this for?
         self._is_embedding_operator = True
         for row in inspection_input.row_iterator:
             # Count missing embeddings
             embedding_array = row.output[0]
             is_zero_vector = not embedding_array.any()
             if is_zero_vector:
                 self._missing_embedding_count += 1
                 if len(self._missing_embeddings_examples) < self.example_threshold:
                     self._missing_embeddings_examples.append(row.input[0])
             # Emit the annotation inside the loop so that this branch, like
             # the fallback branch below, produces exactly one value per row.
             yield None
     else:
         for _ in inspection_input.row_iterator:
             yield None
コード例 #24
0
def get_expected_dag_adult_easy(caller_filename: str,
                                line_offset: int = 0,
                                with_code_references=True):
    """
    Build the DAG expected for the adult_easy pipeline.

    Args:
        caller_filename: File name stored in the code location of every node.
        line_offset: Value added to every line number; the line numbers
            differ slightly between the .py file and the .ipynb file.
        with_code_references: When false, ``optional_code_info`` of every
            node is reset to ``None`` before the graph is returned.

    Returns:
        A ``networkx.DiGraph`` containing the expected ``DagNode`` objects.
    """
    # pylint: disable=too-many-locals
    adult_columns = (
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income-per-year',
    )
    column_transformer = ('sklearn.compose._column_transformer',
                          'ColumnTransformer')
    decision_tree = ('sklearn.tree._classes', 'DecisionTreeClassifier')
    pipeline_str = "\n".join([
        "compose.ColumnTransformer(transformers=[",
        "    ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), "
        "['education', 'workclass']),",
        "    ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])",
        "])",
    ])
    column_transformer_span = (18, 25, 21, 2)
    decision_tree_span = (26, 19, 26, 48)
    decision_tree_source = 'tree.DecisionTreeClassifier()'

    def build_node(node_id, line_number, operator_type, function, details,
                   code_span, source):
        """Create a DagNode with all line numbers shifted by ``line_offset``."""
        start_line, start_col, end_line, end_col = code_span
        module_name, function_name = function
        return DagNode(
            node_id,
            BasicCodeLocation(caller_filename, line_number + line_offset),
            OperatorContext(operator_type,
                            FunctionInfo(module_name, function_name)),
            details,
            OptionalCodeInfo(
                CodeReference(start_line + line_offset, start_col,
                              end_line + line_offset, end_col),
                source))

    # Data loading and row filtering.
    data_source = build_node(
        0, 12, OperatorType.DATA_SOURCE, ('pandas.io.parsers', 'read_csv'),
        DagNodeDetails('adult_train.csv', list(adult_columns)),
        (12, 11, 12, 62),
        "pd.read_csv(train_file, na_values='?', index_col=0)")
    select = build_node(
        1, 14, OperatorType.SELECTION, ('pandas.core.frame', 'dropna'),
        DagNodeDetails('dropna', list(adult_columns)),
        (14, 7, 14, 24),
        'raw_data.dropna()')

    # Feature branch: the two ColumnTransformer projections and transformers.
    project_categorical = build_node(
        4, 18, OperatorType.PROJECTION, column_transformer,
        DagNodeDetails("to ['education', 'workclass']",
                       ['education', 'workclass']),
        column_transformer_span, pipeline_str)
    project_numeric = build_node(
        6, 18, OperatorType.PROJECTION, column_transformer,
        DagNodeDetails("to ['age', 'hours-per-week']",
                       ['age', 'hours-per-week']),
        column_transformer_span, pipeline_str)
    one_hot_encoder = build_node(
        5, 19, OperatorType.TRANSFORMER,
        ('sklearn.preprocessing._encoders', 'OneHotEncoder'),
        DagNodeDetails('One-Hot Encoder: fit_transform', ['array']),
        (19, 20, 19, 72),
        "preprocessing.OneHotEncoder(handle_unknown='ignore')")
    standard_scaler = build_node(
        7, 20, OperatorType.TRANSFORMER,
        ('sklearn.preprocessing._data', 'StandardScaler'),
        DagNodeDetails('Standard Scaler: fit_transform', ['array']),
        (20, 16, 20, 46),
        'preprocessing.StandardScaler()')
    concatenation = build_node(
        8, 18, OperatorType.CONCATENATION, column_transformer,
        DagNodeDetails(None, ['array']),
        column_transformer_span, pipeline_str)
    train_data = build_node(
        9, 26, OperatorType.TRAIN_DATA, decision_tree,
        DagNodeDetails(None, ['array']),
        decision_tree_span, decision_tree_source)

    # Label branch: projection of the label column and its binarization.
    project_label = build_node(
        2, 16, OperatorType.PROJECTION, ('pandas.core.frame', '__getitem__'),
        DagNodeDetails("to ['income-per-year']", ['income-per-year']),
        (16, 38, 16, 61),
        "data['income-per-year']")
    binarize_label = build_node(
        3, 16, OperatorType.PROJECTION_MODIFY,
        ('sklearn.preprocessing._label', 'label_binarize'),
        DagNodeDetails("label_binarize, classes: ['>50K', '<=50K']",
                       ['array']),
        (16, 9, 16, 89),
        "preprocessing.label_binarize(data['income-per-year'], "
        "classes=['>50K', '<=50K'])")
    train_labels = build_node(
        10, 26, OperatorType.TRAIN_LABELS, decision_tree,
        DagNodeDetails(None, ['array']),
        decision_tree_span, decision_tree_source)

    estimator = build_node(
        11, 26, OperatorType.ESTIMATOR, decision_tree,
        DagNodeDetails('Decision Tree', []),
        decision_tree_span, decision_tree_source)

    expected_graph = networkx.DiGraph()
    expected_graph.add_node(data_source)
    edges = [
        (data_source, select),
        (select, project_categorical),
        (select, project_numeric),
        (project_categorical, one_hot_encoder),
        (project_numeric, standard_scaler),
        (one_hot_encoder, concatenation),
        (standard_scaler, concatenation),
        (concatenation, train_data),
        (select, project_label),
        (project_label, binarize_label),
        (binarize_label, train_labels),
        (train_data, estimator),
        (train_labels, estimator),
    ]
    for parent, child in edges:
        expected_graph.add_edge(parent, child)

    if not with_code_references:
        for dag_node in expected_graph.nodes:
            dag_node.optional_code_info = None

    return expected_graph
コード例 #25
0
def test_ols_fit():
    """
    Check the monkey patching of ('statsmodels.regression.linear_model.OLS', 'fit').

    Runs a pipeline that fits an OLS model on random data, reduces the
    extracted DAG to the nodes compared here (train data, train labels and
    estimator) and verifies the rows reported by the ``RowLineage(3)``
    inspection for each of those nodes.
    """
    lineage_rows = 3
    pipeline_source = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        nobs = 100
        X = np.random.random((nobs, 2))
        X = sm.add_constant(X)
        beta = [1, .1, .5]
        e = np.random.random(nobs)
        y = np.dot(X, beta) + e
        results = sm.OLS(y, X).fit()
        assert results.summary() is not None
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_source,
        track_code_references=True,
        inspections=[RowLineage(lineage_rows)])
    extracted_dag = inspector_result.dag
    # Drop the first four nodes, then the second of the remaining ones; the
    # rest is compared against the three expected nodes below.
    extracted_dag.remove_nodes_from(list(extracted_dag.nodes)[0:4])
    extracted_dag.remove_node(list(extracted_dag.nodes)[1])

    def ols_node(node_id, operator_type, details):
        """Create an expected node located at the ``sm.OLS(y, X)`` call."""
        return DagNode(
            node_id,
            BasicCodeLocation("<string-source>", 10),
            OperatorContext(operator_type,
                            FunctionInfo('statsmodel.api.OLS', 'fit')),
            details,
            OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))

    def assert_lineage(node, wanted_rows, columns, **compare_options):
        """Compare the RowLineage output stored for ``node`` with the given rows."""
        node_results = inspector_result.dag_node_to_inspection_results[node]
        actual = node_results[RowLineage(lineage_rows)]
        wanted = DataFrame(wanted_rows, columns=columns)
        pandas.testing.assert_frame_equal(
            actual.reset_index(drop=True),
            wanted.reset_index(drop=True),
            **compare_options)

    train_data_node = ols_node(3, OperatorType.TRAIN_DATA,
                               DagNodeDetails(None, ['array']))
    train_labels_node = ols_node(4, OperatorType.TRAIN_LABELS,
                                 DagNodeDetails(None, ['array']))
    estimator_node = ols_node(5, OperatorType.ESTIMATOR,
                              DagNodeDetails('Decision Tree', []))

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(train_data_node, estimator_node)
    wanted_dag.add_edge(train_labels_node, estimator_node)
    compare(networkx.to_dict_of_dicts(extracted_dag),
            networkx.to_dict_of_dicts(wanted_dag))

    # Train data rows.
    assert_lineage(
        train_data_node,
        [
            [
                numpy.array([1.0, 0.3745401188473625, 0.9507143064099162]),
                {LineageId(3, 0)}
            ],
            [
                numpy.array([1.0, 0.7319939418114051, 0.5986584841970366]),
                {LineageId(3, 1)}
            ],
            [
                numpy.array([1.0, 0.15601864044243652, 0.15599452033620265]),
                {LineageId(3, 2)}
            ],
        ],
        ['array', 'mlinspect_lineage'],
        atol=0.1)

    # Train label rows.
    assert_lineage(
        train_labels_node,
        [
            [2.154842811243982, {LineageId(5, 0)}],
            [1.4566686012747074, {LineageId(5, 1)}],
            [1.2552278383069588, {LineageId(5, 2)}],
        ],
        ['array', 'mlinspect_lineage'],
        atol=0.1)

    # Estimator rows carry only the combined lineage of data and labels.
    assert_lineage(
        estimator_node,
        [
            [{LineageId(5, 0), LineageId(3, 0)}],
            [{LineageId(5, 1), LineageId(3, 1)}],
            [{LineageId(5, 2), LineageId(3, 2)}],
        ],
        ['mlinspect_lineage'],
        check_column_type=False)