def test_numpy_random():
    """
    Checks that the patched ('numpy.random', 'random') call shows up as a data source node
    and that row lineage is recorded for its output
    """
    pipeline_code = cleandoc("""
        import numpy as np
        np.random.seed(42)
        test = np.random.random(100)
        assert len(test) == 100
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    # The pipeline only contains one patched call, so the first DAG node is the one under test
    actual_node: DagNode = list(result.dag.nodes)[0]
    operator_context = OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random'))
    code_info = OptionalCodeInfo(CodeReference(3, 7, 3, 28), "np.random.random(100)")
    expected_node = DagNode(0,
                            BasicCodeLocation("<string-source>", 3),
                            operator_context,
                            DagNodeDetails('random', ['array']),
                            code_info)
    compare(actual_node, expected_node)

    node_results = result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(2)]
    expected_rows = [[0.5, {LineageId(0, 0)}],
                     [0.5, {LineageId(0, 1)}]]
    expected_lineage = DataFrame(expected_rows, columns=['array', 'mlinspect_lineage'])
    # atol=1 means the numeric values are only loosely checked against the 0.5 placeholder
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True),
                                      atol=1)
def test_frame_merge_sorted():
    """
    Checks the patched ('pandas.core.frame', 'merge') when the merge is called with sort=True
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df_a = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})
        df_b = pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})
        df_merged = df_a.merge(df_b, on='B', sort=True)
        df_expected = pd.DataFrame({'A': [5, 8, 4, 2], 'B': [1, 2, 4, 5], 'C': [1, 11, 5, None]})
        pd.testing.assert_frame_equal(df_merged, df_expected)
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(5)])
    # Drop the fourth extracted node before comparing; the expected DAG below has three nodes
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[3])

    left_source = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B']),
                          OptionalCodeInfo(CodeReference(3, 7, 3, 65),
                                           "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})"))
    right_source = DagNode(1,
                           BasicCodeLocation("<string-source>", 4),
                           OperatorContext(OperatorType.DATA_SOURCE,
                                           FunctionInfo('pandas.core.frame', 'DataFrame')),
                           DagNodeDetails(None, ['B', 'C']),
                           OptionalCodeInfo(CodeReference(4, 7, 4, 69),
                                            "pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})"))
    join_node = DagNode(2,
                        BasicCodeLocation("<string-source>", 5),
                        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
                        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
                        OptionalCodeInfo(CodeReference(5, 12, 5, 47), "df_a.merge(df_b, on='B', sort=True)"))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(left_source, join_node)
    expected_dag.add_edge(right_source, join_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    join_results = result.dag_node_to_inspection_results[join_node]
    actual_lineage = join_results[RowLineage(5)]
    expected_rows = [
        [5, 1, 1., {LineageId(0, 4), LineageId(1, 0)}],
        [8, 2, 11., {LineageId(0, 3), LineageId(1, 3)}],
        [4, 4, 5., {LineageId(0, 2), LineageId(1, 1)}],
        [2, 5, math.nan, {LineageId(0, 1), LineageId(1, 4)}],
    ]
    expected_lineage = DataFrame(expected_rows, columns=['A', 'B', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_frame__init__():
    """
    Checks that constructing a DataFrame is captured via the patched ('pandas.core.frame', 'DataFrame')
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 1, 2], columns=['A'])
        assert len(df) == 3
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    actual_node: DagNode = list(result.dag.nodes)[0]
    operator_context = OperatorContext(OperatorType.DATA_SOURCE,
                                       FunctionInfo('pandas.core.frame', 'DataFrame'))
    code_info = OptionalCodeInfo(CodeReference(3, 5, 3, 43), "pd.DataFrame([0, 1, 2], columns=['A'])")
    expected_node = DagNode(0,
                            BasicCodeLocation("<string-source>", 3),
                            operator_context,
                            DagNodeDetails(None, ['A']),
                            code_info)
    compare(actual_node, expected_node)

    node_results = result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(2)]
    expected_rows = [[0, {LineageId(0, 0)}],
                     [1, {LineageId(0, 1)}]]
    expected_lineage = DataFrame(expected_rows, columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_get_rdataset():
    """
    Checks that the patched ('statsmodels.datasets', 'get_rdataset') call is captured as a data source
    """
    pipeline_code = cleandoc("""
        import statsmodels.api as sm

        dat = sm.datasets.get_rdataset("Guerry", "HistData").data
        assert len(dat) == 86
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    # Column names the test expects for the dataset, shared by the node details and the lineage frame
    guerry_columns = ['dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop', 'Literacy', 'Donations',
                      'Infants', 'Suicides', 'MainCity', 'Wealth', 'Commerce', 'Clergy', 'Crime_parents',
                      'Infanticide', 'Donation_clergy', 'Lottery', 'Desertion', 'Instruction',
                      'Prostitutes', 'Distance', 'Area', 'Pop1831']

    actual_node: DagNode = list(result.dag.nodes)[0]
    expected_node = DagNode(0,
                            BasicCodeLocation("<string-source>", 3),
                            OperatorContext(OperatorType.DATA_SOURCE,
                                            FunctionInfo('statsmodels.datasets', 'get_rdataset')),
                            DagNodeDetails('Data from A.-M. Guerry, "Essay on the Moral Statistics of France"',
                                           list(guerry_columns)),
                            OptionalCodeInfo(CodeReference(3, 6, 3, 52),
                                             'sm.datasets.get_rdataset("Guerry", "HistData")'))
    compare(actual_node, expected_node)

    node_results = result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(2)]
    first_row = [1, 'E', 'Ain', 28870, 15890, 37, 5098, 33120, 35039, '2:Med', 73, 58, 11, 71, 60, 69, 41,
                 55, 46, 13, 218.372, 5762, 346.03, {LineageId(0, 0)}]
    second_row = [2, 'N', 'Aisne', 26226, 5521, 51, 8901, 14572, 12831, '2:Med', 22, 10, 82, 4, 82, 36, 38,
                  82, 24, 327, 65.945, 7369, 513.0, {LineageId(0, 1)}]
    expected_lineage = DataFrame([first_row, second_row], columns=guerry_columns + ['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_statsmodels_add_constant():
    """
    Checks that the patched ('statsmodel.api', 'add_constant') call is captured as a
    projection-modify node fed by the random data source
    """
    pipeline_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        test = np.random.random(100)
        test = sm.add_constant(test)
        assert len(test) == 100
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    random_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 4),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
                          DagNodeDetails('random', ['array']),
                          OptionalCodeInfo(CodeReference(4, 7, 4, 28), "np.random.random(100)"))
    constant_node = DagNode(1,
                            BasicCodeLocation("<string-source>", 5),
                            OperatorContext(OperatorType.PROJECTION_MODIFY,
                                            FunctionInfo('statsmodel.api', 'add_constant')),
                            DagNodeDetails('Adds const column', ['array']),
                            OptionalCodeInfo(CodeReference(5, 7, 5, 28), "sm.add_constant(test)"))
    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(random_node, constant_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    lineage_columns = ['array', 'mlinspect_lineage']

    # Lineage of the data source output; atol=1 keeps the value check loose
    random_lineage = result.dag_node_to_inspection_results[random_node][RowLineage(2)]
    expected_random_lineage = DataFrame([[0.5, {LineageId(0, 0)}],
                                         [0.5, {LineageId(0, 1)}]],
                                        columns=lineage_columns)
    pandas.testing.assert_frame_equal(random_lineage.reset_index(drop=True),
                                      expected_random_lineage.reset_index(drop=True),
                                      atol=1)

    # Lineage of the add_constant output: two values per row, same lineage ids as the input
    constant_lineage = result.dag_node_to_inspection_results[constant_node][RowLineage(2)]
    expected_constant_lineage = DataFrame([[numpy.array([0.5, 1.]), {LineageId(0, 0)}],
                                           [numpy.array([0.5, 1.]), {LineageId(0, 1)}]],
                                          columns=lineage_columns)
    pandas.testing.assert_frame_equal(constant_lineage.reset_index(drop=True),
                                      expected_constant_lineage.reset_index(drop=True),
                                      atol=1)
def test_read_csv():
    """
    Checks that the patched ('pandas.io.parsers', 'read_csv') call is captured as a data source
    """
    pipeline_code = cleandoc("""
        import os
        import pandas as pd
        from mlinspect.utils import get_project_root

        train_file = os.path.join(str(get_project_root()), "example_pipelines", "adult_complex", "adult_train.csv")
        raw_data = pd.read_csv(train_file, na_values='?', index_col=0)
        assert len(raw_data) == 22792
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    # Column names the test expects, shared by the node details and the lineage frame
    adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                     'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                     'hours-per-week', 'native-country', 'income-per-year']

    actual_node: DagNode = list(result.dag.nodes)[0]
    expected_node = DagNode(0,
                            BasicCodeLocation("<string-source>", 6),
                            OperatorContext(OperatorType.DATA_SOURCE,
                                            FunctionInfo('pandas.io.parsers', 'read_csv')),
                            # The description is matched by pattern rather than by an exact string
                            DagNodeDetails(StringComparison(r".*\.csv"), list(adult_columns)),
                            OptionalCodeInfo(CodeReference(6, 11, 6, 62),
                                             "pd.read_csv(train_file, na_values='?', index_col=0)"))
    compare(actual_node, expected_node)

    node_results = result.dag_node_to_inspection_results[actual_node]
    actual_lineage = node_results[RowLineage(2)]
    first_row = [46, 'Private', 128645, 'Some-college', 10, 'Divorced', 'Prof-specialty', 'Not-in-family',
                 'White', 'Female', 0, 0, 40, 'United-States', '<=50K', {LineageId(0, 0)}]
    second_row = [29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married', 'Handlers-cleaners',
                  'Not-in-family', 'White', 'Male', 0, 0, 50, 'United-States', '<=50K', {LineageId(0, 1)}]
    expected_lineage = DataFrame([first_row, second_row], columns=adult_columns + ['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_frame__setitem__():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__setitem__') works

    The pipeline overwrites column 'baz' with 'baz' + 1; the test then compares the extracted DAG
    and the first two lineage rows of the __setitem__ node.
    """
    # NOTE(review): the continuation-line indentation inside the snippet (14 columns after cleandoc)
    # was reconstructed from CodeReference(3, 12, 6, 53) below -- confirm against the repository.
    test_code = cleandoc("""
        import pandas as pd

        pandas_df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [1, 2, 3, 4, 5, 6],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        pandas_df['baz'] = pandas_df['baz'] + 1
        df_expected = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [2, 3, 4, 5, 6, 7],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        pd.testing.assert_frame_equal(pandas_df, df_expected)
        """)
    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # The fourth extracted node is removed before comparing; the expected DAG below has three nodes
    # (presumably the removed one belongs to the df_expected construction -- verify).
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[3])

    expected_dag = networkx.DiGraph()
    # The expected source text spans snippet lines 3-6, hence the embedded newlines
    expected_data_source = DagNode(0,
                                   BasicCodeLocation("<string-source>", 3),
                                   OperatorContext(OperatorType.DATA_SOURCE,
                                                   FunctionInfo('pandas.core.frame', 'DataFrame')),
                                   DagNodeDetails(None, ['foo', 'bar', 'baz', 'zoo']),
                                   OptionalCodeInfo(CodeReference(3, 12, 6, 53),
                                                    "pd.DataFrame({'foo': ['one', 'one', 'one', 'two', "
                                                    "'two', 'two'],\n"
                                                    "              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],\n"
                                                    "              'baz': [1, 2, 3, 4, 5, 6],\n"
                                                    "              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})"))
    # Right-hand side read of 'baz' on snippet line 7
    expected_project = DagNode(1,
                               BasicCodeLocation("<string-source>", 7),
                               OperatorContext(OperatorType.PROJECTION,
                                               FunctionInfo('pandas.core.frame', '__getitem__')),
                               DagNodeDetails("to ['baz']", ['baz']),
                               OptionalCodeInfo(CodeReference(7, 19, 7, 35), "pandas_df['baz']"))
    expected_dag.add_edge(expected_data_source, expected_project)
    # The assignment itself; its code reference covers the whole statement on line 7
    expected_project_modify = DagNode(2,
                                      BasicCodeLocation("<string-source>", 7),
                                      OperatorContext(OperatorType.PROJECTION_MODIFY,
                                                      FunctionInfo('pandas.core.frame', '__setitem__')),
                                      DagNodeDetails("modifies ['baz']", ['foo', 'bar', 'baz', 'zoo']),
                                      OptionalCodeInfo(CodeReference(7, 0, 7, 39),
                                                       "pandas_df['baz'] = pandas_df['baz'] + 1"))
    # Both derived nodes are expected to hang off the data source
    expected_dag.add_edge(expected_data_source, expected_project_modify)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Lineage is checked on the __setitem__ node: 'baz' carries the incremented values
    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[expected_project_modify]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame([['one', 'A', 2, 'x', {LineageId(0, 0)}],
                                     ['one', 'B', 3, 'y', {LineageId(0, 1)}]],
                                    columns=['foo', 'bar', 'baz', 'zoo', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def test_my_word_to_vec_transformer():
    """
    Tests whether the monkey patching of ('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer') works

    Runs a small pipeline that fits the transformer on a one-column frame, compares the extracted DAG,
    and checks the recorded lineage ids. The embedding values themselves are not compared; only the
    lineage column and the width of the materialized vectors are asserted.
    """
    test_code = cleandoc("""
        import pandas as pd
        from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer
        import numpy as np

        df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
        word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
        encoded_data = word_to_vec.fit_transform(df)
        assert encoded_data.shape == (4, 2)
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(3)],
                                                        custom_monkey_patching=[custom_monkeypatching])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(0,
                                   BasicCodeLocation("<string-source>", 5),
                                   OperatorContext(OperatorType.DATA_SOURCE,
                                                   FunctionInfo('pandas.core.frame', 'DataFrame')),
                                   DagNodeDetails(None, ['A']),
                                   OptionalCodeInfo(CodeReference(5, 5, 5, 62),
                                                    "pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})"))
    expected_estimator = DagNode(1,
                                 BasicCodeLocation("<string-source>", 6),
                                 OperatorContext(OperatorType.TRANSFORMER,
                                                 FunctionInfo('example_pipelines.healthcare.healthcare_utils',
                                                              'MyW2VTransformer')),
                                 DagNodeDetails('Word2Vec', ['array']),
                                 OptionalCodeInfo(CodeReference(6, 14, 6, 62),
                                                  'MyW2VTransformer(min_count=2, size=2, workers=1)'))
    expected_dag.add_edge(expected_data_source, expected_estimator)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[expected_estimator]
    lineage_output = inspection_results_data_source[RowLineage(3)]
    # The array values are placeholders: only the lineage column of this frame is compared below
    expected_lineage_df = DataFrame([[numpy.array([0.0, 0.0]), {LineageId(0, 0)}],
                                     [numpy.array([0.0, 0.0]), {LineageId(0, 1)}],
                                     [numpy.array([0.0, 0.0]), {LineageId(0, 2)}]],
                                    columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_series_equal(lineage_output["mlinspect_lineage"],
                                       expected_lineage_df["mlinspect_lineage"])
    # Check the ACTUAL output, not the fixture built above (the previous assertion inspected
    # expected_lineage_df and could never fail). The transformer is configured with size=2 and the
    # pipeline asserts encoded_data.shape == (4, 2), so each materialized row should hold a 2-vector.
    # NOTE(review): assumes each 'array' cell holds one row of the transformer output -- confirm.
    assert lineage_output.iloc[0, 0].shape == (2,)
def test_frame__getitem__selection():
    """
    Checks the patched ('pandas.core.frame', '__getitem__') when it is used with a boolean
    series to filter rows
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})
        df_selection = df[df['A'] > 3]
        df_expected = pd.DataFrame({'A': [4, 8, 5], 'B': [4, 11, None]})
        pd.testing.assert_frame_equal(df_selection.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    # Drop the fourth extracted node before comparing; the expected DAG below has three nodes
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[3])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 67),
                                           "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})"))
    # Inner df['A'] read that produces the series used in the comparison
    projection_node = DagNode(1,
                              BasicCodeLocation("<string-source>", 4),
                              OperatorContext(OperatorType.PROJECTION,
                                              FunctionInfo('pandas.core.frame', '__getitem__')),
                              DagNodeDetails("to ['A']", ['A']),
                              OptionalCodeInfo(CodeReference(4, 18, 4, 25), "df['A']"))
    # Outer df[...] call that performs the filtering
    selection_node = DagNode(2,
                             BasicCodeLocation("<string-source>", 4),
                             OperatorContext(OperatorType.SELECTION,
                                             FunctionInfo('pandas.core.frame', '__getitem__')),
                             DagNodeDetails("Select by Series: df[df['A'] > 3]", ['A', 'B']),
                             OptionalCodeInfo(CodeReference(4, 15, 4, 30), "df[df['A'] > 3]"))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, projection_node)
    expected_dag.add_edge(source_node, selection_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    selection_results = result.dag_node_to_inspection_results[selection_node]
    actual_lineage = selection_results[RowLineage(2)]
    # The surviving rows keep the lineage ids of source rows 2 and 3
    expected_rows = [[4, 4., {LineageId(0, 2)}],
                     [8, 11., {LineageId(0, 3)}]]
    expected_lineage = DataFrame(expected_rows, columns=['A', 'B', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def run_row_index_annotation_testing_analyzer(code):
    """
    A utility function to test backends: runs the pipeline given as a string with a
    RowLineage(10) inspection and returns that inspection's annotations
    """
    inspector = PipelineInspector.on_pipeline_from_string(code)
    inspector = inspector.add_required_inspection(RowLineage(10))
    inspector_result = inspector.execute()

    annotations_per_inspection = inspector_result.inspection_to_annotations
    assert RowLineage(10) in annotations_per_inspection
    return annotations_per_inspection[RowLineage(10)]
def test_groupby_agg():
    """
    Checks the patched ('pandas.core.frame', 'groupby') and ('pandas.core.groupby.generic', 'agg'),
    which are expected to show up as a single group-by-aggregate node
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})
        df_groupby_agg = df.groupby('group').agg(mean_value=('value', 'mean'))
        df_expected = pd.DataFrame({'group': ['A', 'B', 'C'], 'mean_value': [1, 3, 3]})
        pd.testing.assert_frame_equal(df_groupby_agg.reset_index(drop=False), df_expected.reset_index(drop=True))
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    # Drop the third extracted node before comparing; the expected DAG below has two nodes
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[2])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['group', 'value']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 81),
                                           "pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], "
                                           "'value': [1, 2, 1, 3, 4]})"))
    groupby_agg_node = DagNode(1,
                               BasicCodeLocation("<string-source>", 4),
                               OperatorContext(OperatorType.GROUP_BY_AGG,
                                               FunctionInfo('pandas.core.groupby.generic', 'agg')),
                               DagNodeDetails("Groupby 'group', Aggregate: '{'mean_value': ('value', 'mean')}'",
                                              ['group', 'mean_value']),
                               OptionalCodeInfo(CodeReference(4, 17, 4, 70),
                                                "df.groupby('group').agg(mean_value=('value', 'mean'))"))
    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, groupby_agg_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    groupby_results = result.dag_node_to_inspection_results[groupby_agg_node]
    actual_lineage = groupby_results[RowLineage(2)]
    # The expected lineage ids carry operator id 1, i.e. the id of the group-by node itself
    expected_rows = [["A", 1, {LineageId(1, 0)}],
                     ['B', 3, {LineageId(1, 1)}]]
    expected_lineage = DataFrame(expected_rows, columns=['group', 'mean_value', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_frame__getitem__frame():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works for multiple string arguments

    The pipeline projects columns 'A' and 'C' out of a three-column frame; the test compares the
    extracted DAG and the first two lineage rows of the projection node.
    """
    # NOTE(review): whitespace inside the snippet is significant. The first DataFrame line ends with a
    # trailing space and the continuation line is indented by 4 columns after cleandoc; both were
    # reconstructed from the expected source text and CodeReference(3, 5, 4, 28) below -- confirm.
    test_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]], 
            columns=['A', 'B', 'C'])
        df_projection = df[['A', 'C']]
        df_expected = pd.DataFrame([[0, 2], [1, 3], [4, 2], [9, 3], [6, 2], [1, 3]], columns=['A', 'C'])
        pd.testing.assert_frame_equal(df_projection, df_expected)
        """)
    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(2)])
    # The third extracted node is removed before comparing; the expected DAG below has two nodes
    # (presumably the removed one belongs to the df_expected construction -- verify).
    inspector_result.dag.remove_node(list(inspector_result.dag.nodes)[2])

    expected_dag = networkx.DiGraph()
    # The expected source text spans snippet lines 3-4, hence the embedded newline
    expected_data_source = DagNode(0,
                                   BasicCodeLocation("<string-source>", 3),
                                   OperatorContext(OperatorType.DATA_SOURCE,
                                                   FunctionInfo('pandas.core.frame', 'DataFrame')),
                                   DagNodeDetails(None, ['A', 'B', 'C']),
                                   OptionalCodeInfo(CodeReference(3, 5, 4, 28),
                                                    "pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], "
                                                    "[9, 2, 3], [6, 1, 2], [1, 2, 3]], \n"
                                                    "    columns=['A', 'B', 'C'])"))
    expected_project = DagNode(1,
                               BasicCodeLocation("<string-source>", 5),
                               OperatorContext(OperatorType.PROJECTION,
                                               FunctionInfo('pandas.core.frame', '__getitem__')),
                               DagNodeDetails("to ['A', 'C']", ['A', 'C']),
                               OptionalCodeInfo(CodeReference(5, 16, 5, 30), "df[['A', 'C']]"))
    expected_dag.add_edge(expected_data_source, expected_project)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Lineage is checked on the projection node: only columns 'A' and 'C' plus the lineage ids
    inspection_results_data_source = inspector_result.dag_node_to_inspection_results[expected_project]
    lineage_output = inspection_results_data_source[RowLineage(2)]
    expected_lineage_df = DataFrame([[0, 2, {LineageId(0, 0)}],
                                     [1, 3, {LineageId(0, 1)}]],
                                    columns=['A', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(lineage_output.reset_index(drop=True),
                                      expected_lineage_df.reset_index(drop=True))
def run_row_index_annotation_testing_analyzer(code):
    """
    A utility function to test backends: runs the pipeline given as a string with a
    RowLineage(10) inspection and returns a dict from each DAG node to its lineage result

    NOTE(review): a function with this name is defined more than once in this file; the
    definition that appears last is the one in effect after import.
    """
    inspector = PipelineInspector.on_pipeline_from_string(code)
    inspector = inspector.add_required_inspection(RowLineage(10))
    inspector_result = inspector.execute()

    lineage_per_node = {}
    for node, node_results in inspector_result.dag_node_to_inspection_results.items():
        # Every DAG node must have been visited by the lineage inspection
        assert RowLineage(10) in node_results
        lineage_per_node[node] = node_results[RowLineage(10)]
    return lineage_per_node
def test_frame_replace():
    """
    Checks that the patched ('pandas.core.frame', 'replace') call is captured as a projection-modify node
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])
        df_replace = df.replace('Medium', 'Low')
        df_expected = pd.DataFrame(['Low', 'Low', 'Low', 'High', None], columns=['A'])
        pd.testing.assert_frame_equal(df_replace.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])
    # Drop the third extracted node before comparing; the expected DAG below has two nodes
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[2])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 72),
                                           "pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], "
                                           "columns=['A'])"))
    replace_node = DagNode(1,
                           BasicCodeLocation("<string-source>", 4),
                           OperatorContext(OperatorType.PROJECTION_MODIFY,
                                           FunctionInfo('pandas.core.frame', 'replace')),
                           DagNodeDetails("Replace 'Medium' with 'Low'", ['A']),
                           OptionalCodeInfo(CodeReference(4, 13, 4, 40), "df.replace('Medium', 'Low')"))
    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, replace_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    replace_results = result.dag_node_to_inspection_results[replace_node]
    actual_lineage = replace_results[RowLineage(2)]
    # Row 1 was 'Medium' in the input and is expected to read 'Low' after the replacement
    expected_rows = [['Low', {LineageId(0, 0)}],
                     ['Low', {LineageId(0, 1)}]]
    expected_lineage = DataFrame(expected_rows, columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def test_frame_dropna():
    """
    Checks that the patched ('pandas.core.frame', 'dropna') call is captured as a selection node
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 2, 4, 5, None], columns=['A'])
        assert len(df) == 5
        df = df.dropna()
        assert len(df) == 4
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(2)])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 52),
                                           "pd.DataFrame([0, 2, 4, 5, None], columns=['A'])"))
    dropna_node = DagNode(1,
                          BasicCodeLocation("<string-source>", 5),
                          OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
                          DagNodeDetails('dropna', ['A']),
                          OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(source_node, dropna_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    dropna_results = result.dag_node_to_inspection_results[dropna_node]
    actual_lineage = dropna_results[RowLineage(2)]
    expected_rows = [[0., {LineageId(0, 0)}],
                     [2., {LineageId(0, 1)}]]
    expected_lineage = DataFrame(expected_rows, columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      expected_lineage.reset_index(drop=True))
def run_and_assert_all_op_outputs_inspected(py_file_path, sensitive_columns, dag_png_path, custom_monkey_patching=None):
    """
    Runs the pipeline from a .py file with a few checks and inspections, asserts that every
    DAG node was seen by each inspection, saves the DAG as an image and returns the DAG
    """
    patch_modules = [] if custom_monkey_patching is None else custom_monkey_patching

    inspector = PipelineInspector.on_pipeline_from_py_file(py_file_path)
    inspector = inspector.add_check(NoBiasIntroducedFor(sensitive_columns))
    inspector = inspector.add_check(NoIllegalFeatures())
    inspector = inspector.add_required_inspection(MissingEmbeddings(20))
    inspector = inspector.add_required_inspection(RowLineage(5))
    inspector = inspector.add_required_inspection(MaterializeFirstOutputRows(5))
    inspector = inspector.add_custom_monkey_patching_modules(patch_modules)
    inspector_result = inspector.execute()

    for dag_node, inspection_result in inspector_result.dag_node_to_inspection_results.items():
        operator_type = dag_node.operator_info.operator
        assert operator_type != OperatorType.MISSING_OP

        # Each inspection must have an entry for the node (the histogram one is not added explicitly above)
        assert MaterializeFirstOutputRows(5) in inspection_result
        assert RowLineage(5) in inspection_result
        assert MissingEmbeddings(20) in inspection_result
        assert HistogramForColumns(sensitive_columns) in inspection_result

        if operator_type == OperatorType.ESTIMATOR:
            # Estimator does not have output
            assert inspection_result[MaterializeFirstOutputRows(5)] is None
            assert inspection_result[RowLineage(5)] is not None
            assert inspection_result[HistogramForColumns(sensitive_columns)] is None
        else:
            assert inspection_result[MaterializeFirstOutputRows(5)] is not None
            assert inspection_result[RowLineage(5)] is not None
            assert inspection_result[HistogramForColumns(sensitive_columns)] is not None

    extracted_dag = inspector_result.dag
    save_fig_to_path(extracted_dag, dag_png_path)
    assert os.path.isfile(dag_png_path)
    return extracted_dag
def run_multiple_test_analyzers(code):
    """
    A utility function to test backends. Also useful to debug annotation propagation.
    Runs the pipeline given as a string with three inspections and returns the annotations
    per inspection together with the list of inspections that was used.
    """
    analyzers = [
        RandomAnnotationTestingInspection(2),
        MaterializeFirstOutputRows(5),
        RowLineage(2),
    ]
    inspector = PipelineInspector.on_pipeline_from_string(code)
    inspector = inspector.add_required_inspections(analyzers)
    inspector_result = inspector.execute()
    return inspector_result.inspection_to_annotations, analyzers
def run_and_assert_all_op_outputs_inspected(py_file_path, sensitive_columns, dag_png_path):
    """
    Runs the pipeline from a .py file with a few checks and inspections, asserts that the
    materialization inspection covered all DAG nodes but one, and saves the DAG as an image

    NOTE(review): a function with this name is defined more than once in this file; the
    definition that appears last is the one in effect after import.
    """
    inspector = PipelineInspector.on_pipeline_from_py_file(py_file_path)
    inspector = inspector.add_check(NoBiasIntroducedFor(sensitive_columns))
    inspector = inspector.add_check(NoIllegalFeatures())
    inspector = inspector.add_required_inspection(MissingEmbeddings(20))
    inspector = inspector.add_required_inspection(RowLineage(5))
    inspector = inspector.add_required_inspection(MaterializeFirstOutputRows(5))
    inspector_result = inspector.execute()

    materialized_outputs = inspector_result.inspection_to_annotations[MaterializeFirstOutputRows(5)]
    # Estimator does not have output
    node_count_with_output = len(inspector_result.dag.nodes) - 1
    assert len(materialized_outputs) == node_count_with_output

    save_fig_to_path(inspector_result.dag, dag_png_path)
    assert os.path.isfile(dag_png_path)
def test_ols_fit():
    """
    Checks that fitting a statsmodels OLS model is captured as train data, train labels
    and estimator nodes (function info ('statsmodel.api.OLS', 'fit')) with matching row lineage
    """
    pipeline_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        nobs = 100
        X = np.random.random((nobs, 2))
        X = sm.add_constant(X)
        beta = [1, .1, .5]
        e = np.random.random(nobs)
        y = np.dot(X, beta) + e
        results = sm.OLS(y, X).fit()
        assert results.summary() is not None
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(3)])
    # Trim the extracted DAG down to the three nodes compared below: first the leading four nodes,
    # then the node at index 1 of what remains
    leading_nodes = list(result.dag.nodes)[0:4]
    result.dag.remove_nodes_from(leading_nodes)
    remaining_nodes = list(result.dag.nodes)
    result.dag.remove_node(remaining_nodes[1])

    # All three expected nodes point at the same code span on snippet line 10
    train_data_node = DagNode(3,
                              BasicCodeLocation("<string-source>", 10),
                              OperatorContext(OperatorType.TRAIN_DATA, FunctionInfo('statsmodel.api.OLS', 'fit')),
                              DagNodeDetails(None, ['array']),
                              OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))
    train_labels_node = DagNode(4,
                                BasicCodeLocation("<string-source>", 10),
                                OperatorContext(OperatorType.TRAIN_LABELS,
                                                FunctionInfo('statsmodel.api.OLS', 'fit')),
                                DagNodeDetails(None, ['array']),
                                OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))
    ols_node = DagNode(5,
                       BasicCodeLocation("<string-source>", 10),
                       OperatorContext(OperatorType.ESTIMATOR, FunctionInfo('statsmodel.api.OLS', 'fit')),
                       DagNodeDetails('Decision Tree', []),
                       OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(train_data_node, ols_node)
    expected_dag.add_edge(train_labels_node, ols_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    # Train data: constant column plus the two random features, lineage ids from operator 3
    train_data_lineage = result.dag_node_to_inspection_results[train_data_node][RowLineage(3)]
    expected_train_data_lineage = DataFrame(
        [[numpy.array([1.0, 0.3745401188473625, 0.9507143064099162]), {LineageId(3, 0)}],
         [numpy.array([1.0, 0.7319939418114051, 0.5986584841970366]), {LineageId(3, 1)}],
         [numpy.array([1.0, 0.15601864044243652, 0.15599452033620265]), {LineageId(3, 2)}]],
        columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(train_data_lineage.reset_index(drop=True),
                                      expected_train_data_lineage.reset_index(drop=True),
                                      atol=0.1)

    # Train labels: lineage ids from operator 5
    train_labels_lineage = result.dag_node_to_inspection_results[train_labels_node][RowLineage(3)]
    expected_train_labels_lineage = DataFrame([[2.154842811243982, {LineageId(5, 0)}],
                                               [1.4566686012747074, {LineageId(5, 1)}],
                                               [1.2552278383069588, {LineageId(5, 2)}]],
                                              columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(train_labels_lineage.reset_index(drop=True),
                                      expected_train_labels_lineage.reset_index(drop=True),
                                      atol=0.1)

    # Estimator: no data columns, each row combines the label and feature lineage ids
    ols_lineage = result.dag_node_to_inspection_results[ols_node][RowLineage(3)]
    expected_ols_lineage = DataFrame([[{LineageId(5, 0), LineageId(3, 0)}],
                                      [{LineageId(5, 1), LineageId(3, 1)}],
                                      [{LineageId(5, 2), LineageId(3, 2)}]],
                                     columns=['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(ols_lineage.reset_index(drop=True),
                                      expected_ols_lineage.reset_index(drop=True),
                                      check_column_type=False)