def test_numpy_random():
    """
    Verifies that the patch for ('numpy.random', 'random') yields the expected data source node and lineage
    """
    pipeline_code = cleandoc("""
        import numpy as np
        np.random.seed(42)
        test = np.random.random(100)
        assert len(test) == 100
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    dag_nodes = list(result.dag.nodes)
    actual_node: DagNode = dag_nodes[0]

    operator_context = OperatorContext(OperatorType.DATA_SOURCE,
                                       FunctionInfo('numpy.random', 'random'))
    code_info = OptionalCodeInfo(CodeReference(3, 7, 3, 28), "np.random.random(100)")
    wanted_node = DagNode(0, BasicCodeLocation("<string-source>", 3), operator_context,
                          DagNodeDetails('random', ['array']), code_info)
    compare(actual_node, wanted_node)

    # atol=1 keeps the float comparison loose, so the 0.5 placeholders stand in for the random draws.
    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    wanted_lineage = DataFrame([[0.5, {LineageId(0, 0)}],
                                [0.5, {LineageId(0, 1)}]],
                               columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True),
                                      atol=1)
def test_frame_merge_sorted():
    """
    Verifies that the patch for ('pandas.core.frame', 'merge') captures a join executed with sort=True
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df_a = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})
        df_b = pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})
        df_merged = df_a.merge(df_b, on='B', sort=True)
        df_expected = pd.DataFrame({'A': [5, 8, 4, 2], 'B': [1, 2, 4, 5], 'C': [1, 11, 5, None]})
        pd.testing.assert_frame_equal(df_merged, df_expected)
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(5)])
    # The node at position 3 is not part of the expected DAG below (presumably the df_expected frame).
    result.dag.remove_node(list(result.dag.nodes)[3])

    left_source = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B']),
                          OptionalCodeInfo(CodeReference(3, 7, 3, 65),
                                           "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})"))
    right_source = DagNode(1,
                           BasicCodeLocation("<string-source>", 4),
                           OperatorContext(OperatorType.DATA_SOURCE,
                                           FunctionInfo('pandas.core.frame', 'DataFrame')),
                           DagNodeDetails(None, ['B', 'C']),
                           OptionalCodeInfo(CodeReference(4, 7, 4, 69),
                                            "pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})"))
    join_node = DagNode(2,
                        BasicCodeLocation("<string-source>", 5),
                        OperatorContext(OperatorType.JOIN,
                                        FunctionInfo('pandas.core.frame', 'merge')),
                        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
                        OptionalCodeInfo(CodeReference(5, 12, 5, 47),
                                         "df_a.merge(df_b, on='B', sort=True)"))

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(left_source, join_node)
    wanted_dag.add_edge(right_source, join_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    # Each joined row is expected to carry one lineage id from each input frame.
    actual_lineage = result.dag_node_to_inspection_results[join_node][RowLineage(5)]
    wanted_rows = [
        [5, 1, 1., {LineageId(0, 4), LineageId(1, 0)}],
        [8, 2, 11., {LineageId(0, 3), LineageId(1, 3)}],
        [4, 4, 5., {LineageId(0, 2), LineageId(1, 1)}],
        [2, 5, math.nan, {LineageId(0, 1), LineageId(1, 4)}],
    ]
    wanted_lineage = DataFrame(wanted_rows, columns=['A', 'B', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def test_frame__init__():
    """
    Verifies that the patch for ('pandas.core.frame', 'DataFrame') records the constructor as a data source
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 1, 2], columns=['A'])
        assert len(df) == 3
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    dag_nodes = list(result.dag.nodes)
    actual_node: DagNode = dag_nodes[0]

    operator_context = OperatorContext(OperatorType.DATA_SOURCE,
                                       FunctionInfo('pandas.core.frame', 'DataFrame'))
    code_info = OptionalCodeInfo(CodeReference(3, 5, 3, 43),
                                 "pd.DataFrame([0, 1, 2], columns=['A'])")
    wanted_node = DagNode(0, BasicCodeLocation("<string-source>", 3), operator_context,
                          DagNodeDetails(None, ['A']), code_info)
    compare(actual_node, wanted_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    wanted_lineage = DataFrame([[0, {LineageId(0, 0)}],
                                [1, {LineageId(0, 1)}]],
                               columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def test_get_rdataset():
    """
    Verifies that the patch for ('statsmodels.datasets', 'get_rdataset') records the dataset as a data source
    """
    pipeline_code = cleandoc("""
        import statsmodels.api as sm

        dat = sm.datasets.get_rdataset("Guerry", "HistData").data
        assert len(dat) == 86
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    # Column names of the dataset; shared by the expected node details and the expected lineage frame.
    dataset_columns = [
        'dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop',
        'Literacy', 'Donations', 'Infants', 'Suicides', 'MainCity',
        'Wealth', 'Commerce', 'Clergy', 'Crime_parents', 'Infanticide',
        'Donation_clergy', 'Lottery', 'Desertion', 'Instruction',
        'Prostitutes', 'Distance', 'Area', 'Pop1831',
    ]

    actual_node: DagNode = list(result.dag.nodes)[0]
    operator_context = OperatorContext(OperatorType.DATA_SOURCE,
                                       FunctionInfo('statsmodels.datasets', 'get_rdataset'))
    node_details = DagNodeDetails('Data from A.-M. Guerry, "Essay on the Moral Statistics of France"',
                                  list(dataset_columns))
    code_info = OptionalCodeInfo(CodeReference(3, 6, 3, 52),
                                 'sm.datasets.get_rdataset("Guerry", "HistData")')
    wanted_node = DagNode(0, BasicCodeLocation("<string-source>", 3), operator_context,
                          node_details, code_info)
    compare(actual_node, wanted_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    first_row = [1, 'E', 'Ain', 28870, 15890, 37, 5098, 33120, 35039, '2:Med', 73,
                 58, 11, 71, 60, 69, 41, 55, 46, 13, 218.372, 5762, 346.03,
                 {LineageId(0, 0)}]
    second_row = [2, 'N', 'Aisne', 26226, 5521, 51, 8901, 14572, 12831, '2:Med', 22,
                  10, 82, 4, 82, 36, 38, 82, 24, 327, 65.945, 7369, 513.0,
                  {LineageId(0, 1)}]
    wanted_lineage = DataFrame([first_row, second_row],
                               columns=[*dataset_columns, 'mlinspect_lineage'])

    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def test_statsmodels_add_constant():
    """
    Verifies that the patch for ('statsmodel.api', 'add_constant') adds a projection-modify node after the source
    """
    pipeline_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        test = np.random.random(100)
        test = sm.add_constant(test)
        assert len(test) == 100
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    random_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 4),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('numpy.random', 'random')),
                          DagNodeDetails('random', ['array']),
                          OptionalCodeInfo(CodeReference(4, 7, 4, 28), "np.random.random(100)"))
    constant_node = DagNode(1,
                            BasicCodeLocation("<string-source>", 5),
                            OperatorContext(OperatorType.PROJECTION_MODIFY,
                                            FunctionInfo('statsmodel.api', 'add_constant')),
                            DagNodeDetails('Adds const column', ['array']),
                            OptionalCodeInfo(CodeReference(5, 7, 5, 28), "sm.add_constant(test)"))

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(random_node, constant_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    # Lineage of the random data source; atol=1 keeps the comparison against the 0.5 placeholders loose.
    source_lineage = result.dag_node_to_inspection_results[random_node][RowLineage(2)]
    wanted_source_lineage = DataFrame([[0.5, {LineageId(0, 0)}],
                                       [0.5, {LineageId(0, 1)}]],
                                      columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(source_lineage.reset_index(drop=True),
                                      wanted_source_lineage.reset_index(drop=True),
                                      atol=1)

    # Lineage after add_constant: each expected row is a two-element array, lineage ids unchanged.
    constant_lineage = result.dag_node_to_inspection_results[constant_node][RowLineage(2)]
    wanted_constant_lineage = DataFrame([[numpy.array([0.5, 1.]), {LineageId(0, 0)}],
                                         [numpy.array([0.5, 1.]), {LineageId(0, 1)}]],
                                        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(constant_lineage.reset_index(drop=True),
                                      wanted_constant_lineage.reset_index(drop=True),
                                      atol=1)
def test_read_csv():
    """
    Verifies that the patch for ('pandas.io.parsers', 'read_csv') records the CSV read as a data source
    """
    pipeline_code = cleandoc("""
        import os
        import pandas as pd
        from mlinspect.utils import get_project_root
        
        train_file = os.path.join(str(get_project_root()), "example_pipelines", "adult_complex", "adult_train.csv")
        raw_data = pd.read_csv(train_file, na_values='?', index_col=0)
        assert len(raw_data) == 22792
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    # Column names of the CSV; shared by the expected node details and the expected lineage frame.
    csv_columns = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income-per-year',
    ]

    actual_node: DagNode = list(result.dag.nodes)[0]
    operator_context = OperatorContext(OperatorType.DATA_SOURCE,
                                       FunctionInfo('pandas.io.parsers', 'read_csv'))
    # The description is matched by pattern, so any text ending in ".csv" is accepted.
    node_details = DagNodeDetails(StringComparison(r".*\.csv"), list(csv_columns))
    code_info = OptionalCodeInfo(CodeReference(6, 11, 6, 62),
                                 "pd.read_csv(train_file, na_values='?', index_col=0)")
    wanted_node = DagNode(0, BasicCodeLocation("<string-source>", 6), operator_context,
                          node_details, code_info)
    compare(actual_node, wanted_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    first_row = [46, 'Private', 128645, 'Some-college', 10, 'Divorced',
                 'Prof-specialty', 'Not-in-family', 'White', 'Female', 0, 0, 40,
                 'United-States', '<=50K', {LineageId(0, 0)}]
    second_row = [29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married',
                  'Handlers-cleaners', 'Not-in-family', 'White', 'Male', 0, 0, 50,
                  'United-States', '<=50K', {LineageId(0, 1)}]
    wanted_lineage = DataFrame([first_row, second_row],
                               columns=[*csv_columns, 'mlinspect_lineage'])

    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
# Example no. 7
# 0
def test_frame__setitem__():
    """
    Verifies that the patch for ('pandas.core.frame', '__setitem__') records a column assignment
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                pandas_df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                              'baz': [1, 2, 3, 4, 5, 6],
                              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
                pandas_df['baz'] = pandas_df['baz'] + 1
                df_expected = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                              'baz': [2, 3, 4, 5, 6, 7],
                              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
                pd.testing.assert_frame_equal(pandas_df, df_expected)
                """)
    result = _pipeline_executor.singleton.run(
        python_code=pipeline_code,
        track_code_references=True,
        inspections=[RowLineage(2)])
    # The node at position 3 is not part of the expected DAG below (presumably the df_expected frame).
    result.dag.remove_node(list(result.dag.nodes)[3])

    source_code_text = ("pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],\n"
                        "              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],\n"
                        "              'baz': [1, 2, 3, 4, 5, 6],\n"
                        "              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})")
    source_node = DagNode(
        0,
        BasicCodeLocation("<string-source>", 3),
        OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(CodeReference(3, 12, 6, 53), source_code_text))
    projection_node = DagNode(
        1,
        BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION, FunctionInfo('pandas.core.frame', '__getitem__')),
        DagNodeDetails("to ['baz']", ['baz']),
        OptionalCodeInfo(CodeReference(7, 19, 7, 35), "pandas_df['baz']"))
    modify_node = DagNode(
        2,
        BasicCodeLocation("<string-source>", 7),
        OperatorContext(OperatorType.PROJECTION_MODIFY, FunctionInfo('pandas.core.frame', '__setitem__')),
        DagNodeDetails("modifies ['baz']", ['foo', 'bar', 'baz', 'zoo']),
        OptionalCodeInfo(CodeReference(7, 0, 7, 39), "pandas_df['baz'] = pandas_df['baz'] + 1"))

    # Both derived nodes hang directly off the data source in the expected DAG.
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, projection_node)
    wanted_dag.add_edge(source_node, modify_node)

    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    actual_lineage = result.dag_node_to_inspection_results[modify_node][RowLineage(2)]
    wanted_lineage = DataFrame(
        [['one', 'A', 2, 'x', {LineageId(0, 0)}],
         ['one', 'B', 3, 'y', {LineageId(0, 1)}]],
        columns=['foo', 'bar', 'baz', 'zoo', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def test_my_word_to_vec_transformer():
    """
    Tests whether the monkey patching of ('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer') works
    """
    test_code = cleandoc("""
                import pandas as pd
                from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer
                import numpy as np

                df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
                word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
                encoded_data = word_to_vec.fit_transform(df)
                assert encoded_data.shape == (4, 2)
                """)
    inspector_result = _pipeline_executor.singleton.run(
        python_code=test_code,
        track_code_references=True,
        inspections=[RowLineage(3)],
        custom_monkey_patching=[custom_monkeypatching])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(
        0, BasicCodeLocation("<string-source>", 5),
        OperatorContext(OperatorType.DATA_SOURCE,
                        FunctionInfo('pandas.core.frame', 'DataFrame')),
        DagNodeDetails(None, ['A']),
        OptionalCodeInfo(
            CodeReference(5, 5, 5, 62),
            "pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})"))
    expected_estimator = DagNode(
        1, BasicCodeLocation("<string-source>", 6),
        OperatorContext(
            OperatorType.TRANSFORMER,
            FunctionInfo('example_pipelines.healthcare.healthcare_utils',
                         'MyW2VTransformer')),
        DagNodeDetails('Word2Vec', ['array']),
        OptionalCodeInfo(CodeReference(6, 14, 6, 62),
                         'MyW2VTransformer(min_count=2, size=2, workers=1)'))
    expected_dag.add_edge(expected_data_source, expected_estimator)
    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    inspection_results_transformer = inspector_result.dag_node_to_inspection_results[
        expected_estimator]
    lineage_output = inspection_results_transformer[RowLineage(3)]
    # The embedding values themselves are not compared; the zero vectors are placeholders whose width
    # matches size=2 from the pipeline. Only the lineage column and the embedding width are checked.
    expected_lineage_df = DataFrame(
        [[numpy.array([0.0, 0.0]), {LineageId(0, 0)}],
         [numpy.array([0.0, 0.0]), {LineageId(0, 1)}],
         [numpy.array([0.0, 0.0]), {LineageId(0, 2)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_series_equal(
        lineage_output["mlinspect_lineage"],
        expected_lineage_df["mlinspect_lineage"])
    # Check the actual lineage output (previously the hand-built expected frame was checked against
    # itself). NOTE(review): width 2 follows from size=2 / encoded_data.shape == (4, 2) — confirm.
    assert lineage_output.iloc[0, 0].shape == (2, )
def test_frame__getitem__selection():
    """
    Verifies that the patch for ('pandas.core.frame', '__getitem__') records filtering by a boolean series
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                df = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})
                df_selection = df[df['A'] > 3]
                df_expected = pd.DataFrame({'A': [4, 8, 5], 'B': [4, 11, None]})
                pd.testing.assert_frame_equal(df_selection.reset_index(drop=True), df_expected.reset_index(drop=True))
                """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The node at position 3 is not part of the expected DAG below (presumably the df_expected frame).
    result.dag.remove_node(list(result.dag.nodes)[3])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 67),
                                           "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})"))
    projection_node = DagNode(1,
                              BasicCodeLocation("<string-source>", 4),
                              OperatorContext(OperatorType.PROJECTION,
                                              FunctionInfo('pandas.core.frame', '__getitem__')),
                              DagNodeDetails("to ['A']", ['A']),
                              OptionalCodeInfo(CodeReference(4, 18, 4, 25), "df['A']"))
    selection_node = DagNode(2,
                             BasicCodeLocation("<string-source>", 4),
                             OperatorContext(OperatorType.SELECTION,
                                             FunctionInfo('pandas.core.frame', '__getitem__')),
                             DagNodeDetails("Select by Series: df[df['A'] > 3]", ['A', 'B']),
                             OptionalCodeInfo(CodeReference(4, 15, 4, 30), "df[df['A'] > 3]"))

    # Both derived nodes hang directly off the data source in the expected DAG.
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, projection_node)
    wanted_dag.add_edge(source_node, selection_node)

    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    actual_lineage = result.dag_node_to_inspection_results[selection_node][RowLineage(2)]
    wanted_lineage = DataFrame([[4, 4., {LineageId(0, 2)}],
                                [8, 11., {LineageId(0, 3)}]],
                               columns=['A', 'B', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def run_row_index_annotation_testing_analyzer(code):
    """
    A utility function to test backends: runs the pipeline code with RowLineage(10) as a required
    inspection and returns the entry stored for it in ``inspection_to_annotations``
    """
    inspector_result = (PipelineInspector
                        .on_pipeline_from_string(code)
                        .add_required_inspection(RowLineage(10))
                        .execute())
    annotations_by_inspection = inspector_result.inspection_to_annotations
    assert RowLineage(10) in annotations_by_inspection
    return annotations_by_inspection[RowLineage(10)]
def test_groupby_agg():
    """
    Verifies that the patches for ('pandas.core.frame', 'groupby') and ('pandas.core.groupby.generic', 'agg')
    produce a single group-by-aggregate node.
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})
        df_groupby_agg = df.groupby('group').agg(mean_value=('value', 'mean'))
        
        df_expected = pd.DataFrame({'group': ['A', 'B', 'C'], 'mean_value': [1, 3, 3]})
        pd.testing.assert_frame_equal(df_groupby_agg.reset_index(drop=False), df_expected.reset_index(drop=True))
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The node at position 2 is not part of the expected DAG below (presumably the df_expected frame).
    result.dag.remove_node(list(result.dag.nodes)[2])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['group', 'value']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 81),
                                           "pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})"))
    groupby_agg_node = DagNode(1,
                               BasicCodeLocation("<string-source>", 4),
                               OperatorContext(OperatorType.GROUP_BY_AGG,
                                               FunctionInfo('pandas.core.groupby.generic', 'agg')),
                               DagNodeDetails("Groupby 'group', Aggregate: '{'mean_value': ('value', 'mean')}'",
                                              ['group', 'mean_value']),
                               OptionalCodeInfo(CodeReference(4, 17, 4, 70),
                                                "df.groupby('group').agg(mean_value=('value', 'mean'))"))

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, groupby_agg_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    # The expected lineage ids use operator id 1, i.e. the aggregate node rather than the source.
    actual_lineage = result.dag_node_to_inspection_results[groupby_agg_node][RowLineage(2)]
    wanted_lineage = DataFrame([["A", 1, {LineageId(1, 0)}],
                                ['B', 3, {LineageId(1, 1)}]],
                               columns=['group', 'mean_value', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def test_frame__getitem__frame():
    """
    Verifies that the patch for ('pandas.core.frame', '__getitem__') records a projection to several columns
    """
    pipeline_code = cleandoc("""
                import pandas as pd

                df = pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]], 
                    columns=['A', 'B', 'C'])
                df_projection = df[['A', 'C']]
                df_expected = pd.DataFrame([[0, 2], [1, 3], [4, 2], [9, 3], [6, 2], [1, 3]], columns=['A', 'C'])
                pd.testing.assert_frame_equal(df_projection, df_expected)
                """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The node at position 2 is not part of the expected DAG below (presumably the df_expected frame).
    result.dag.remove_node(list(result.dag.nodes)[2])

    # The trailing space before the newline mirrors the pipeline source above.
    source_code_text = ("pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]], \n"
                        "    columns=['A', 'B', 'C'])")
    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B', 'C']),
                          OptionalCodeInfo(CodeReference(3, 5, 4, 28), source_code_text))
    projection_node = DagNode(1,
                              BasicCodeLocation("<string-source>", 5),
                              OperatorContext(OperatorType.PROJECTION,
                                              FunctionInfo('pandas.core.frame', '__getitem__')),
                              DagNodeDetails("to ['A', 'C']", ['A', 'C']),
                              OptionalCodeInfo(CodeReference(5, 16, 5, 30), "df[['A', 'C']]"))

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, projection_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    actual_lineage = result.dag_node_to_inspection_results[projection_node][RowLineage(2)]
    wanted_lineage = DataFrame([[0, 2, {LineageId(0, 0)}],
                                [1, 3, {LineageId(0, 1)}]],
                               columns=['A', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
# Example no. 13
# 0
def run_row_index_annotation_testing_analyzer(code):
    """
    A utility function to test backends: runs the pipeline code with RowLineage(10) as a required
    inspection and returns a dict mapping every DAG node to its RowLineage(10) result
    """
    inspector_result = (PipelineInspector
                        .on_pipeline_from_string(code)
                        .add_required_inspection(RowLineage(10))
                        .execute())
    lineage_by_node = {}
    for node, results_for_node in inspector_result.dag_node_to_inspection_results.items():
        # Every DAG node must have been visited by the lineage inspection.
        assert RowLineage(10) in results_for_node
        lineage_by_node[node] = results_for_node[RowLineage(10)]
    return lineage_by_node
def test_frame_replace():
    """
    Verifies that the patch for ('pandas.core.frame', 'replace') records a projection-modify node
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])
        df_replace = df.replace('Medium', 'Low')
        df_expected = pd.DataFrame(['Low', 'Low', 'Low', 'High', None], columns=['A'])
        pd.testing.assert_frame_equal(df_replace.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The node at position 2 is not part of the expected DAG below (presumably the df_expected frame).
    result.dag.remove_node(list(result.dag.nodes)[2])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 72),
                                           "pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])"))
    modify_node = DagNode(1,
                          BasicCodeLocation("<string-source>", 4),
                          OperatorContext(OperatorType.PROJECTION_MODIFY,
                                          FunctionInfo('pandas.core.frame', 'replace')),
                          DagNodeDetails("Replace 'Medium' with 'Low'", ['A']),
                          OptionalCodeInfo(CodeReference(4, 13, 4, 40), "df.replace('Medium', 'Low')"))

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, modify_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    # The second expected row shows 'Low' because 'Medium' was replaced.
    actual_lineage = result.dag_node_to_inspection_results[modify_node][RowLineage(2)]
    wanted_lineage = DataFrame([['Low', {LineageId(0, 0)}],
                                ['Low', {LineageId(0, 1)}]],
                               columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def test_frame_dropna():
    """
    Verifies that the patch for ('pandas.core.frame', 'dropna') records a selection node
    """
    pipeline_code = cleandoc("""
        import pandas as pd
        
        df = pd.DataFrame([0, 2, 4, 5, None], columns=['A'])
        assert len(df) == 5
        df = df.dropna()
        assert len(df) == 4
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 52),
                                           "pd.DataFrame([0, 2, 4, 5, None], columns=['A'])"))
    selection_node = DagNode(1,
                             BasicCodeLocation("<string-source>", 5),
                             OperatorContext(OperatorType.SELECTION,
                                             FunctionInfo('pandas.core.frame', 'dropna')),
                             DagNodeDetails('dropna', ['A']),
                             OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, selection_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    # Expected values are floats: the source column contains None.
    actual_lineage = result.dag_node_to_inspection_results[selection_node][RowLineage(2)]
    wanted_lineage = DataFrame([[0., {LineageId(0, 0)}],
                                [2., {LineageId(0, 1)}]],
                               columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
# Example no. 16
# 0
def run_and_assert_all_op_outputs_inspected(py_file_path,
                                            sensitive_columns,
                                            dag_png_path,
                                            custom_monkey_patching=None):
    """
    Execute the pipeline with a few checks and inspections.
    Assert that mlinspect properly lets inspections inspect all DAG nodes

    :param py_file_path: path of the pipeline .py file handed to the inspector
    :param sensitive_columns: columns passed to ``NoBiasIntroducedFor`` and
        used to look up the ``HistogramForColumns`` results
    :param dag_png_path: where the DAG figure is written; must exist afterwards
    :param custom_monkey_patching: optional list of extra monkey patching
        modules; ``None`` is treated as an empty list
    :return: the extracted DAG
    """
    patching_modules = [] if custom_monkey_patching is None else custom_monkey_patching

    inspector_result = (
        PipelineInspector
        .on_pipeline_from_py_file(py_file_path)
        .add_check(NoBiasIntroducedFor(sensitive_columns))
        .add_check(NoIllegalFeatures())
        .add_required_inspection(MissingEmbeddings(20))
        .add_required_inspection(RowLineage(5))
        .add_required_inspection(MaterializeFirstOutputRows(5))
        .add_custom_monkey_patching_modules(patching_modules)
        .execute()
    )

    # Lookup keys for the per-node result dicts.
    first_rows = MaterializeFirstOutputRows(5)
    row_lineage = RowLineage(5)
    missing_embeddings = MissingEmbeddings(20)
    # Not registered explicitly above; presumably required by the
    # NoBiasIntroducedFor check.
    histograms = HistogramForColumns(sensitive_columns)

    node_results = inspector_result.dag_node_to_inspection_results
    for dag_node, inspection_result in node_results.items():
        operator = dag_node.operator_info.operator
        assert operator != OperatorType.MISSING_OP
        assert first_rows in inspection_result
        assert row_lineage in inspection_result
        assert missing_embeddings in inspection_result
        assert histograms in inspection_result
        if operator == OperatorType.ESTIMATOR:
            # Estimator does not have output
            assert inspection_result[first_rows] is None
            assert inspection_result[row_lineage] is not None
            assert inspection_result[histograms] is None
        else:
            assert inspection_result[first_rows] is not None
            assert inspection_result[row_lineage] is not None
            assert inspection_result[histograms] is not None

    save_fig_to_path(inspector_result.dag, dag_png_path)
    assert os.path.isfile(dag_png_path)

    return inspector_result.dag
def run_multiple_test_analyzers(code):
    """
    A utility function to test backends.
    Also useful to debug annotation propagation.

    :param code: pipeline source code as a string
    :return: tuple of (the result's ``inspection_to_annotations``, the list
        of inspections that were registered)
    """
    inspections = [
        RandomAnnotationTestingInspection(2),
        MaterializeFirstOutputRows(5),
        RowLineage(2),
    ]
    inspector_result = (
        PipelineInspector
        .on_pipeline_from_string(code)
        .add_required_inspections(inspections)
        .execute()
    )
    return inspector_result.inspection_to_annotations, inspections
def run_and_assert_all_op_outputs_inspected(py_file_path, sensitive_columns, dag_png_path):
    """
    Execute the pipeline with a few checks and inspections.
    Assert that mlinspect properly lets inspections inspect all DAG nodes

    :param py_file_path: path of the pipeline .py file handed to the inspector
    :param sensitive_columns: columns passed to ``NoBiasIntroducedFor``
    :param dag_png_path: where the DAG figure is written; must exist afterwards
    """
    inspector_result = (
        PipelineInspector
        .on_pipeline_from_py_file(py_file_path)
        .add_check(NoBiasIntroducedFor(sensitive_columns))
        .add_check(NoIllegalFeatures())
        .add_required_inspection(MissingEmbeddings(20))
        .add_required_inspection(RowLineage(5))
        .add_required_inspection(MaterializeFirstOutputRows(5))
        .execute()
    )

    annotations_by_inspection = inspector_result.inspection_to_annotations
    materialized_rows = annotations_by_inspection[MaterializeFirstOutputRows(5)]
    # Estimator does not have output, hence one entry fewer than DAG nodes.
    expected_output_count = len(inspector_result.dag.nodes) - 1
    assert len(materialized_rows) == expected_output_count

    save_fig_to_path(inspector_result.dag, dag_png_path)
    assert os.path.isfile(dag_png_path)
def test_ols_fit():
    """
    Tests whether the monkey patching of ('statsmodels.regression.linear_model.OLS', 'fit') works

    Runs a small OLS pipeline, reduces the extracted DAG to the train data,
    train labels and estimator nodes, and checks both the DAG structure and
    the first three lineage rows recorded for each of those nodes.
    """
    pipeline_source = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        nobs = 100
        X = np.random.random((nobs, 2))
        X = sm.add_constant(X)
        beta = [1, .1, .5]
        e = np.random.random(nobs)
        y = np.dot(X, beta) + e
        results = sm.OLS(y, X).fit()
        assert results.summary() is not None
        """)

    inspector_result = _pipeline_executor.singleton.run(
        python_code=pipeline_source,
        track_code_references=True,
        inspections=[RowLineage(3)])
    # Trim the extracted DAG down to the nodes compared below: drop the first
    # four nodes, then the second one of whatever remains. The order of these
    # two removals matters because the second index is taken after the first
    # removal.
    leading_nodes = list(inspector_result.dag.nodes)[0:4]
    inspector_result.dag.remove_nodes_from(leading_nodes)
    remaining_nodes = list(inspector_result.dag.nodes)
    inspector_result.dag.remove_node(remaining_nodes[1])

    def expected_node(node_id, operator_type, details):
        # All three expected nodes share the code location, function info and
        # code reference of the `sm.OLS(y, X)` call on line 10.
        return DagNode(
            node_id,
            BasicCodeLocation("<string-source>", 10),
            OperatorContext(operator_type,
                            FunctionInfo('statsmodel.api.OLS', 'fit')),
            details,
            OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))

    expected_train_data = expected_node(
        3, OperatorType.TRAIN_DATA, DagNodeDetails(None, ['array']))
    expected_train_labels = expected_node(
        4, OperatorType.TRAIN_LABELS, DagNodeDetails(None, ['array']))
    expected_ols = expected_node(
        5, OperatorType.ESTIMATOR, DagNodeDetails('Decision Tree', []))

    expected_dag = networkx.DiGraph()
    for input_node in (expected_train_data, expected_train_labels):
        expected_dag.add_edge(input_node, expected_ols)

    compare(networkx.to_dict_of_dicts(inspector_result.dag),
            networkx.to_dict_of_dicts(expected_dag))

    def assert_lineage(dag_node, expected_rows, columns, **frame_equal_options):
        # Compare the RowLineage(3) output stored for `dag_node` with the
        # expected rows, ignoring the original index on both sides.
        node_results = inspector_result.dag_node_to_inspection_results[dag_node]
        actual_lineage = node_results[RowLineage(3)]
        expected_lineage = DataFrame(expected_rows, columns=columns)
        pandas.testing.assert_frame_equal(
            actual_lineage.reset_index(drop=True),
            expected_lineage.reset_index(drop=True),
            **frame_equal_options)

    train_data_rows = [
        [numpy.array([1.0, 0.3745401188473625, 0.9507143064099162]),
         {LineageId(3, 0)}],
        [numpy.array([1.0, 0.7319939418114051, 0.5986584841970366]),
         {LineageId(3, 1)}],
        [numpy.array([1.0, 0.15601864044243652, 0.15599452033620265]),
         {LineageId(3, 2)}],
    ]
    assert_lineage(expected_train_data, train_data_rows,
                   ['array', 'mlinspect_lineage'], atol=0.1)

    train_label_rows = [
        [2.154842811243982, {LineageId(5, 0)}],
        [1.4566686012747074, {LineageId(5, 1)}],
        [1.2552278383069588, {LineageId(5, 2)}],
    ]
    assert_lineage(expected_train_labels, train_label_rows,
                   ['array', 'mlinspect_lineage'], atol=0.1)

    # The estimator has no data column; each row carries the union of the
    # label and data lineage ids.
    estimator_rows = [
        [{LineageId(5, 0), LineageId(3, 0)}],
        [{LineageId(5, 1), LineageId(3, 1)}],
        [{LineageId(5, 2), LineageId(3, 2)}],
    ]
    assert_lineage(expected_ols, estimator_rows,
                   ['mlinspect_lineage'], check_column_type=False)