def test_frame_merge_sorted():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'merge') works if the sort option is set to True.

    Checks the extracted DAG (two data sources feeding one join) and the first rows of the
    RowLineage(5) output materialised for the join node.
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df_a = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})
        df_b = pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})
        df_merged = df_a.merge(df_b, on='B', sort=True)
        df_expected = pd.DataFrame({'A': [5, 8, 4, 2], 'B': [1, 2, 4, 5], 'C': [1, 11, 5, None]})
        pd.testing.assert_frame_equal(df_merged, df_expected)
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(5)])
    # The fourth extracted node is dropped before comparing
    # (presumably the df_expected frame that only serves the in-pipeline assertion - TODO confirm).
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[3])

    left_source = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B']),
                          OptionalCodeInfo(CodeReference(3, 7, 3, 65),
                                           "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [7, 5, 4, 2, 1]})"))
    right_source = DagNode(1,
                           BasicCodeLocation("<string-source>", 4),
                           OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                           DagNodeDetails(None, ['B', 'C']),
                           OptionalCodeInfo(CodeReference(4, 7, 4, 69),
                                            "pd.DataFrame({'B': [1, 4, 3, 2, 5], 'C': [1, 5, 4, 11, None]})"))
    join_node = DagNode(2,
                        BasicCodeLocation("<string-source>", 5),
                        OperatorContext(OperatorType.JOIN, FunctionInfo('pandas.core.frame', 'merge')),
                        DagNodeDetails("on 'B'", ['A', 'B', 'C']),
                        OptionalCodeInfo(CodeReference(5, 12, 5, 47), "df_a.merge(df_b, on='B', sort=True)"))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(left_source, join_node)
    wanted_dag.add_edge(right_source, join_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    join_lineage = result.dag_node_to_inspection_results[join_node][RowLineage(5)]
    wanted_rows = [
        [5, 1, 1., {LineageId(0, 4), LineageId(1, 0)}],
        [8, 2, 11., {LineageId(0, 3), LineageId(1, 3)}],
        [4, 4, 5., {LineageId(0, 2), LineageId(1, 1)}],
        [2, 5, math.nan, {LineageId(0, 1), LineageId(1, 4)}],
    ]
    wanted_lineage = DataFrame(wanted_rows, columns=['A', 'B', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(join_lineage.reset_index(drop=True), wanted_lineage.reset_index(drop=True))
def test_black_box_operation():
    """
    Tests whether a MISSING_OP warning node is inserted when a patched pandas function (dropna) receives a
    DataFrame produced by black_box_df_op, and that this node is the parent of the dropna selection.
    """
    pipeline_code = cleandoc("""
        import pandas
        from mlinspect.testing._testing_helper_utils import black_box_df_op

        df = black_box_df_op()
        df = df.dropna()
        print("df")
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True)
    actual_dag = result.dag

    warning_text = ('Warning! Operator <string-source>:5 (df.dropna()) encountered a '
                    'DataFrame resulting from an operation without mlinspect support!')
    # Both nodes are attributed to the df.dropna() call on line 5 of the pipeline.
    missing_op_node = DagNode(-1,
                              BasicCodeLocation("<string-source>", 5),
                              OperatorContext(OperatorType.MISSING_OP, None),
                              DagNodeDetails(warning_text, ['A']),
                              OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    dropna_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 5),
                          OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
                          DagNodeDetails('dropna', ['A']),
                          OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(missing_op_node, dropna_node)
    compare(networkx.to_dict_of_dicts(actual_dag), networkx.to_dict_of_dicts(wanted_dag))
def test_func_defs_and_loops():
    """
    Tests DAG extraction for the pipeline returned by get_test_code_with_function_def_and_for_loop():
    one DataFrame data source followed by two chained dropna selections that share the same code location.
    """
    pipeline_code = get_test_code_with_function_def_and_for_loop()
    actual_dag = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True).dag

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 4),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(4, 9, 4, 44), "pd.DataFrame([0, 1], columns=['A'])"))
    # The same df.dropna() call site is expected twice; the nodes differ only in their operator id.
    first_dropna, second_dropna = [
        DagNode(op_id,
                BasicCodeLocation("<string-source>", 8),
                OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
                DagNodeDetails('dropna', ['A']),
                OptionalCodeInfo(CodeReference(8, 9, 8, 20), 'df.dropna()'))
        for op_id in (1, 2)
    ]

    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, first_dropna)
    wanted_dag.add_edge(first_dropna, second_dropna)
    compare(networkx.to_dict_of_dicts(actual_dag), networkx.to_dict_of_dicts(wanted_dag))
def test_statsmodels_add_constant():
    """
    Tests whether the monkey patching of ('statsmodel.api', 'add_constant') works.

    Checks the two-node DAG (numpy random source -> add_constant) and the RowLineage(2) output of both nodes.
    The lineage frames are compared with atol=1, so the numeric values are only checked very loosely.
    """
    pipeline_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        test = np.random.random(100)
        test = sm.add_constant(test)
        assert len(test) == 100
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])

    random_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 4),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
                          DagNodeDetails('random', ['array']),
                          OptionalCodeInfo(CodeReference(4, 7, 4, 28), "np.random.random(100)"))
    constant_node = DagNode(1,
                            BasicCodeLocation("<string-source>", 5),
                            OperatorContext(OperatorType.PROJECTION_MODIFY,
                                            FunctionInfo('statsmodel.api', 'add_constant')),
                            DagNodeDetails('Adds const column', ['array']),
                            OptionalCodeInfo(CodeReference(5, 7, 5, 28), "sm.add_constant(test)"))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(random_node, constant_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    lineage_columns = ['array', 'mlinspect_lineage']

    # Lineage of the random data source
    random_lineage = result.dag_node_to_inspection_results[random_node][RowLineage(2)]
    wanted_random_lineage = DataFrame([[0.5, {LineageId(0, 0)}],
                                       [0.5, {LineageId(0, 1)}]],
                                      columns=lineage_columns)
    pandas.testing.assert_frame_equal(random_lineage.reset_index(drop=True),
                                      wanted_random_lineage.reset_index(drop=True),
                                      atol=1)

    # Lineage of the add_constant output
    constant_lineage = result.dag_node_to_inspection_results[constant_node][RowLineage(2)]
    wanted_constant_lineage = DataFrame([[numpy.array([0.5, 1.]), {LineageId(0, 0)}],
                                         [numpy.array([0.5, 1.]), {LineageId(0, 1)}]],
                                        columns=lineage_columns)
    pandas.testing.assert_frame_equal(constant_lineage.reset_index(drop=True),
                                      wanted_constant_lineage.reset_index(drop=True),
                                      atol=1)
def test_frame__setitem__():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__setitem__') works.

    Checks the DAG (data source -> projection, data source -> projection-modify) and the RowLineage(2)
    output of the projection-modify node.
    """
    # The continuation lines of the first DataFrame literal are indented by 14 spaces; the expected
    # CodeReference end column and source text below depend on that exact layout.
    pipeline_code = cleandoc("""
        import pandas as pd

        pandas_df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [1, 2, 3, 4, 5, 6],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        pandas_df['baz'] = pandas_df['baz'] + 1
        df_expected = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [2, 3, 4, 5, 6, 7],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        pd.testing.assert_frame_equal(pandas_df, df_expected)
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The fourth extracted node is dropped before comparing
    # (presumably the df_expected frame that only serves the in-pipeline assertion - TODO confirm).
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[3])

    all_columns = ['foo', 'bar', 'baz', 'zoo']
    source_text = ("pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],\n"
                   "              'bar': ['A', 'B', 'C', 'A', 'B', 'C'],\n"
                   "              'baz': [1, 2, 3, 4, 5, 6],\n"
                   "              'zoo': ['x', 'y', 'z', 'q', 'w', 't']})")
    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['foo', 'bar', 'baz', 'zoo']),
                          OptionalCodeInfo(CodeReference(3, 12, 6, 53), source_text))
    projection_node = DagNode(1,
                              BasicCodeLocation("<string-source>", 7),
                              OperatorContext(OperatorType.PROJECTION,
                                              FunctionInfo('pandas.core.frame', '__getitem__')),
                              DagNodeDetails("to ['baz']", ['baz']),
                              OptionalCodeInfo(CodeReference(7, 19, 7, 35), "pandas_df['baz']"))
    modify_node = DagNode(2,
                          BasicCodeLocation("<string-source>", 7),
                          OperatorContext(OperatorType.PROJECTION_MODIFY,
                                          FunctionInfo('pandas.core.frame', '__setitem__')),
                          DagNodeDetails("modifies ['baz']", all_columns),
                          OptionalCodeInfo(CodeReference(7, 0, 7, 39), "pandas_df['baz'] = pandas_df['baz'] + 1"))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, projection_node)
    wanted_dag.add_edge(source_node, modify_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    modify_lineage = result.dag_node_to_inspection_results[modify_node][RowLineage(2)]
    wanted_lineage = DataFrame([['one', 'A', 2, 'x', {LineageId(0, 0)}],
                                ['one', 'B', 3, 'y', {LineageId(0, 1)}]],
                               columns=['foo', 'bar', 'baz', 'zoo', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(modify_lineage.reset_index(drop=True), wanted_lineage.reset_index(drop=True))
def test_my_word_to_vec_transformer():
    """
    Tests whether the monkey patching of ('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer') works.

    Runs the pipeline with the custom monkey patching module, compares the extracted DAG
    (data source -> transformer) and checks the RowLineage(3) output of the transformer node:
    the lineage ids must match exactly, and each materialised row must have the embedding size
    requested in the pipeline (size=2).
    """
    test_code = cleandoc("""
        import pandas as pd
        from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer
        import numpy as np

        df = pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})
        word_to_vec = MyW2VTransformer(min_count=2, size=2, workers=1)
        encoded_data = word_to_vec.fit_transform(df)
        assert encoded_data.shape == (4, 2)
        """)

    inspector_result = _pipeline_executor.singleton.run(python_code=test_code, track_code_references=True,
                                                        inspections=[RowLineage(3)],
                                                        custom_monkey_patching=[custom_monkeypatching])

    expected_dag = networkx.DiGraph()
    expected_data_source = DagNode(0,
                                   BasicCodeLocation("<string-source>", 5),
                                   OperatorContext(OperatorType.DATA_SOURCE,
                                                   FunctionInfo('pandas.core.frame', 'DataFrame')),
                                   DagNodeDetails(None, ['A']),
                                   OptionalCodeInfo(CodeReference(5, 5, 5, 62),
                                                    "pd.DataFrame({'A': ['cat_a', 'cat_b', 'cat_a', 'cat_c']})"))
    expected_estimator = DagNode(1,
                                 BasicCodeLocation("<string-source>", 6),
                                 OperatorContext(OperatorType.TRANSFORMER,
                                                 FunctionInfo('example_pipelines.healthcare.healthcare_utils',
                                                              'MyW2VTransformer')),
                                 DagNodeDetails('Word2Vec', ['array']),
                                 OptionalCodeInfo(CodeReference(6, 14, 6, 62),
                                                  'MyW2VTransformer(min_count=2, size=2, workers=1)'))
    expected_dag.add_edge(expected_data_source, expected_estimator)
    compare(networkx.to_dict_of_dicts(inspector_result.dag), networkx.to_dict_of_dicts(expected_dag))

    inspection_results_estimator = inspector_result.dag_node_to_inspection_results[expected_estimator]
    lineage_output = inspection_results_estimator[RowLineage(3)]
    # Only the lineage column of this frame is compared; the zero vectors are placeholders with the
    # embedding size requested in the pipeline (size=2).
    expected_lineage_df = DataFrame([[numpy.array([0.0, 0.0]), {LineageId(0, 0)}],
                                     [numpy.array([0.0, 0.0]), {LineageId(0, 1)}],
                                     [numpy.array([0.0, 0.0]), {LineageId(0, 2)}]],
                                    columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_series_equal(lineage_output["mlinspect_lineage"], expected_lineage_df["mlinspect_lineage"])
    # Check the shape of the row actually produced by the pipeline. The previous assertion inspected
    # expected_lineage_df itself and therefore could never fail. The embedding values themselves are
    # not compared (presumably they are not stable enough across runs - TODO confirm).
    assert lineage_output.iloc[0, 0].shape == (2, )
def test_frame__getitem__selection():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works for filtering.

    Checks the DAG (data source -> projection, data source -> selection) and the RowLineage(2)
    output of the selection node.
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})
        df_selection = df[df['A'] > 3]
        df_expected = pd.DataFrame({'A': [4, 8, 5], 'B': [4, 11, None]})
        pd.testing.assert_frame_equal(df_selection.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The fourth extracted node is dropped before comparing
    # (presumably the df_expected frame that only serves the in-pipeline assertion - TODO confirm).
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[3])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 67),
                                           "pd.DataFrame({'A': [0, 2, 4, 8, 5], 'B': [1, 5, 4, 11, None]})"))
    projection_node = DagNode(1,
                              BasicCodeLocation("<string-source>", 4),
                              OperatorContext(OperatorType.PROJECTION,
                                              FunctionInfo('pandas.core.frame', '__getitem__')),
                              DagNodeDetails("to ['A']", ['A']),
                              OptionalCodeInfo(CodeReference(4, 18, 4, 25), "df['A']"))
    selection_node = DagNode(2,
                             BasicCodeLocation("<string-source>", 4),
                             OperatorContext(OperatorType.SELECTION,
                                             FunctionInfo('pandas.core.frame', '__getitem__')),
                             DagNodeDetails("Select by Series: df[df['A'] > 3]", ['A', 'B']),
                             OptionalCodeInfo(CodeReference(4, 15, 4, 30), "df[df['A'] > 3]"))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, projection_node)
    wanted_dag.add_edge(source_node, selection_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    selection_lineage = result.dag_node_to_inspection_results[selection_node][RowLineage(2)]
    wanted_lineage = DataFrame([[4, 4., {LineageId(0, 2)}],
                                [8, 11., {LineageId(0, 3)}]],
                               columns=['A', 'B', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(selection_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def test_groupby_agg():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'groupby') and
    ('pandas.core.groupby.generic', 'agg') works.

    Checks the DAG (data source -> group-by-aggregate) and the RowLineage(2) output of the
    group-by-aggregate node.
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})
        df_groupby_agg = df.groupby('group').agg(mean_value=('value', 'mean'))

        df_expected = pd.DataFrame({'group': ['A', 'B', 'C'], 'mean_value': [1, 3, 3]})
        pd.testing.assert_frame_equal(df_groupby_agg.reset_index(drop=False), df_expected.reset_index(drop=True))
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The third extracted node is dropped before comparing
    # (presumably the df_expected frame that only serves the in-pipeline assertion - TODO confirm).
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[2])

    source_text = "pd.DataFrame({'group': ['A', 'B', 'A', 'C', 'B'], 'value': [1, 2, 1, 3, 4]})"
    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['group', 'value']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 81), source_text))
    groupby_agg_node = DagNode(1,
                               BasicCodeLocation("<string-source>", 4),
                               OperatorContext(OperatorType.GROUP_BY_AGG,
                                               FunctionInfo('pandas.core.groupby.generic', 'agg')),
                               DagNodeDetails("Groupby 'group', Aggregate: '{'mean_value': ('value', 'mean')}'",
                                              ['group', 'mean_value']),
                               OptionalCodeInfo(CodeReference(4, 17, 4, 70),
                                                "df.groupby('group').agg(mean_value=('value', 'mean'))"))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, groupby_agg_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    groupby_lineage = result.dag_node_to_inspection_results[groupby_agg_node][RowLineage(2)]
    wanted_lineage = DataFrame([["A", 1, {LineageId(1, 0)}],
                                ['B', 3, {LineageId(1, 1)}]],
                               columns=['group', 'mean_value', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(groupby_lineage.reset_index(drop=True), wanted_lineage.reset_index(drop=True))
def test_frame__getitem__frame():
    """
    Tests whether the monkey patching of ('pandas.core.frame', '__getitem__') works for multiple string arguments.

    Checks the DAG (data source -> projection) and the RowLineage(2) output of the projection node.
    """
    # The pipeline is assembled line by line so that its whitespace is explicit: the third line ends
    # with a trailing space and the fourth is indented by four spaces. The expected CodeReference and
    # source text of the data source below depend on exactly that layout.
    pipeline_code = "\n".join([
        "import pandas as pd",
        "",
        "df = pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]], ",
        "    columns=['A', 'B', 'C'])",
        "df_projection = df[['A', 'C']]",
        "df_expected = pd.DataFrame([[0, 2], [1, 3], [4, 2], [9, 3], [6, 2], [1, 3]], columns=['A', 'C'])",
        "pd.testing.assert_frame_equal(df_projection, df_expected)",
    ])
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The third extracted node is dropped before comparing
    # (presumably the df_expected frame that only serves the in-pipeline assertion - TODO confirm).
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[2])

    source_text = ("pd.DataFrame([[0, None, 2], [1, 2, 3], [4, None, 2], [9, 2, 3], [6, 1, 2], [1, 2, 3]], \n"
                   "    columns=['A', 'B', 'C'])")
    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A', 'B', 'C']),
                          OptionalCodeInfo(CodeReference(3, 5, 4, 28), source_text))
    projection_node = DagNode(1,
                              BasicCodeLocation("<string-source>", 5),
                              OperatorContext(OperatorType.PROJECTION,
                                              FunctionInfo('pandas.core.frame', '__getitem__')),
                              DagNodeDetails("to ['A', 'C']", ['A', 'C']),
                              OptionalCodeInfo(CodeReference(5, 16, 5, 30), "df[['A', 'C']]"))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, projection_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    projection_lineage = result.dag_node_to_inspection_results[projection_node][RowLineage(2)]
    wanted_lineage = DataFrame([[0, 2, {LineageId(0, 0)}],
                                [1, 3, {LineageId(0, 1)}]],
                               columns=['A', 'C', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(projection_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True))
def test_frame_replace():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'replace') works.

    Checks the DAG (data source -> projection-modify) and the RowLineage(2) output of the replace node.
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])
        df_replace = df.replace('Medium', 'Low')
        df_expected = pd.DataFrame(['Low', 'Low', 'Low', 'High', None], columns=['A'])
        pd.testing.assert_frame_equal(df_replace.reset_index(drop=True), df_expected.reset_index(drop=True))
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    # The third extracted node is dropped before comparing
    # (presumably the df_expected frame that only serves the in-pipeline assertion - TODO confirm).
    extracted_nodes = list(result.dag.nodes)
    result.dag.remove_node(extracted_nodes[2])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 72),
                                           "pd.DataFrame(['Low', 'Medium', 'Low', 'High', None], columns=['A'])"))
    replace_node = DagNode(1,
                           BasicCodeLocation("<string-source>", 4),
                           OperatorContext(OperatorType.PROJECTION_MODIFY,
                                           FunctionInfo('pandas.core.frame', 'replace')),
                           DagNodeDetails("Replace 'Medium' with 'Low'", ['A']),
                           OptionalCodeInfo(CodeReference(4, 13, 4, 40), "df.replace('Medium', 'Low')"))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, replace_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    replace_lineage = result.dag_node_to_inspection_results[replace_node][RowLineage(2)]
    wanted_lineage = DataFrame([['Low', {LineageId(0, 0)}],
                                ['Low', {LineageId(0, 1)}]],
                               columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(replace_lineage.reset_index(drop=True), wanted_lineage.reset_index(drop=True))
def add_test_label_node(test_label_arg, caller_filename, function_info, lineno,
                        optional_code_reference, optional_source_code):
    """
    Add a Test Label DAG Node for an estimator.score call.

    Resolves the parent DAG node of the label object, creates a TEST_LABELS node for it, runs the
    label object through the SklearnBackend and registers the new node in the DAG.

    Returns a tuple of (backend result, the new DAG node, the result data of the backend result).
    """
    # pylint: disable=too-many-arguments
    labels_context = OperatorContext(OperatorType.TEST_LABELS, function_info)
    # The parent lookup happens before the op id is requested; it may itself add a node for an
    # input without mlinspect annotation.
    label_input = get_input_info(test_label_arg, caller_filename, lineno, function_info,
                                 optional_code_reference, optional_source_code)

    op_id = _pipeline_executor.singleton.get_next_op_id()
    location = BasicCodeLocation(caller_filename, lineno)
    details = DagNodeDetails(None, get_column_names(test_label_arg))
    code_info = get_optional_code_info_or_none(optional_code_reference, optional_source_code)
    label_node = DagNode(op_id, location, labels_context, details, code_info)

    annotated_inputs = SklearnBackend.before_call(labels_context, [label_input.annotated_dfobject])
    backend_result = SklearnBackend.after_call(labels_context, annotated_inputs, test_label_arg)
    add_dag_node(label_node, [label_input.dag_node], backend_result)

    return backend_result, label_node, backend_result.annotated_dfobject.result_data
def test_numpy_random():
    """
    Tests whether the monkey patching of ('numpy.random', 'random') works.

    Checks the first extracted DAG node and its RowLineage(2) output. The lineage frame is compared
    with atol=1, so the random values are only checked very loosely.
    """
    pipeline_code = cleandoc("""
        import numpy as np
        np.random.seed(42)
        test = np.random.random(100)
        assert len(test) == 100
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    all_nodes = list(result.dag.nodes)
    actual_node: DagNode = all_nodes[0]

    wanted_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('numpy.random', 'random')),
                          DagNodeDetails('random', ['array']),
                          OptionalCodeInfo(CodeReference(3, 7, 3, 28), "np.random.random(100)"))
    compare(actual_node, wanted_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    wanted_lineage = DataFrame([[0.5, {LineageId(0, 0)}],
                                [0.5, {LineageId(0, 1)}]],
                               columns=['array', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True),
                                      wanted_lineage.reset_index(drop=True),
                                      atol=1)
def add_train_data_node(estimator, train_data_arg, function_info):
    """
    Add a Train Data DAG Node for an estimator.fit call.

    The code location and optional code reference are read from the mlinspect_* attributes stored on
    the estimator. The node's column list is always ["array"].

    Returns a tuple of (backend result, the new DAG node, the result data of the backend result).
    """
    caller_filename = estimator.mlinspect_caller_filename
    lineno = estimator.mlinspect_lineno
    # The parent lookup happens before the op id is requested; it may itself add a node for an
    # input without mlinspect annotation.
    data_input = get_input_info(train_data_arg, caller_filename, lineno, function_info,
                                estimator.mlinspect_optional_code_reference,
                                estimator.mlinspect_optional_source_code)

    op_id = _pipeline_executor.singleton.get_next_op_id()
    data_context = OperatorContext(OperatorType.TRAIN_DATA, function_info)
    location = BasicCodeLocation(estimator.mlinspect_caller_filename, estimator.mlinspect_lineno)
    code_info = get_optional_code_info_or_none(estimator.mlinspect_optional_code_reference,
                                               estimator.mlinspect_optional_source_code)
    data_node = DagNode(op_id, location, data_context, DagNodeDetails(None, ["array"]), code_info)

    annotated_inputs = SklearnBackend.before_call(data_context, [data_input.annotated_dfobject])
    backend_result = SklearnBackend.after_call(data_context, annotated_inputs, train_data_arg)
    add_dag_node(data_node, [data_input.dag_node], backend_result)

    return backend_result, data_node, backend_result.annotated_dfobject.result_data
def patched_fit_transform(self, *args, **kwargs):
    """
    Patch for ('example_pipelines.healthcare.healthcare_utils.MyW2VTransformer', 'fit_transform').

    Looks up the parent DAG node of the first positional argument, calls the original fit_transform
    on the backend-prepared input, wraps the result via the SklearnBackend and registers a
    TRANSFORMER node in the DAG.

    While the call is running, self.mlinspect_fit_transform_active is True. The flag is reset to
    False when the call finishes, including when the original method or the backend raises.

    Returns the annotated result data produced by the backend.
    """
    # pylint: disable=no-method-argument
    self.mlinspect_fit_transform_active = True  # pylint: disable=attribute-defined-outside-init
    try:
        original = gorilla.get_original_attribute(healthcare_utils.MyW2VTransformer, 'fit_transform')
        function_info = FunctionInfo('example_pipelines.healthcare.healthcare_utils', 'MyW2VTransformer')
        input_info = get_input_info(args[0], self.mlinspect_caller_filename, self.mlinspect_lineno, function_info,
                                    self.mlinspect_optional_code_reference, self.mlinspect_optional_source_code)

        operator_context = OperatorContext(OperatorType.TRANSFORMER, function_info)
        input_infos = SklearnBackend.before_call(operator_context, [input_info.annotated_dfobject])
        # The original method receives the backend-prepared data instead of the raw first argument.
        result = original(self, input_infos[0].result_data, *args[1:], **kwargs)
        backend_result = SklearnBackend.after_call(operator_context, input_infos, result)
        new_return_value = backend_result.annotated_dfobject.result_data
        assert isinstance(new_return_value, MlinspectNdarray)

        dag_node = DagNode(singleton.get_next_op_id(),
                           BasicCodeLocation(self.mlinspect_caller_filename, self.mlinspect_lineno),
                           operator_context,
                           DagNodeDetails("Word2Vec: fit_transform", ['array']),
                           get_optional_code_info_or_none(self.mlinspect_optional_code_reference,
                                                          self.mlinspect_optional_source_code))
        add_dag_node(dag_node, [input_info.dag_node], backend_result)
    finally:
        # Always clear the flag so a failed call cannot leave the transformer marked as active.
        self.mlinspect_fit_transform_active = False  # pylint: disable=attribute-defined-outside-init
    return new_return_value
def test_frame__init__():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'DataFrame') works.

    Checks the first extracted DAG node and its RowLineage(2) output.
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 1, 2], columns=['A'])
        assert len(df) == 3
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    all_nodes = list(result.dag.nodes)
    actual_node: DagNode = all_nodes[0]

    wanted_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 43), "pd.DataFrame([0, 1, 2], columns=['A'])"))
    compare(actual_node, wanted_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    wanted_lineage = DataFrame([[0, {LineageId(0, 0)}],
                                [1, {LineageId(0, 1)}]],
                               columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True), wanted_lineage.reset_index(drop=True))
def test_get_rdataset():
    """
    Tests whether the monkey patching of ('statsmodels.datasets', 'get_rdataset') works.

    Checks the first extracted DAG node and its RowLineage(2) output for the "Guerry" dataset.
    """
    pipeline_code = cleandoc("""
        import statsmodels.api as sm

        dat = sm.datasets.get_rdataset("Guerry", "HistData").data
        assert len(dat) == 86
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    all_nodes = list(result.dag.nodes)
    actual_node: DagNode = all_nodes[0]

    # Column names of the dataset; used for the node details and, extended by the lineage column,
    # for the expected lineage frame.
    guerry_columns = ['dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop', 'Literacy', 'Donations',
                      'Infants', 'Suicides', 'MainCity', 'Wealth', 'Commerce', 'Clergy', 'Crime_parents',
                      'Infanticide', 'Donation_clergy', 'Lottery', 'Desertion', 'Instruction', 'Prostitutes',
                      'Distance', 'Area', 'Pop1831']
    wanted_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE,
                                          FunctionInfo('statsmodels.datasets', 'get_rdataset')),
                          DagNodeDetails('Data from A.-M. Guerry, "Essay on the Moral Statistics of France"',
                                         guerry_columns),
                          OptionalCodeInfo(CodeReference(3, 6, 3, 52),
                                           'sm.datasets.get_rdataset("Guerry", "HistData")'))
    compare(actual_node, wanted_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    first_row = [1, 'E', 'Ain', 28870, 15890, 37, 5098, 33120, 35039, '2:Med', 73, 58, 11, 71, 60, 69, 41, 55,
                 46, 13, 218.372, 5762, 346.03, {LineageId(0, 0)}]
    second_row = [2, 'N', 'Aisne', 26226, 5521, 51, 8901, 14572, 12831, '2:Med', 22, 10, 82, 4, 82, 36, 38, 82,
                  24, 327, 65.945, 7369, 513.0, {LineageId(0, 1)}]
    wanted_lineage = DataFrame([first_row, second_row], columns=guerry_columns + ['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True), wanted_lineage.reset_index(drop=True))
def test_read_csv():
    """
    Tests whether the monkey patching of ('pandas.io.parsers', 'read_csv') works.

    Checks the first extracted DAG node (its description only has to match the pattern ".*\\.csv")
    and its RowLineage(2) output for the adult_train.csv example file.
    """
    pipeline_code = cleandoc("""
        import os
        import pandas as pd
        from mlinspect.utils import get_project_root

        train_file = os.path.join(str(get_project_root()), "example_pipelines", "adult_complex", "adult_train.csv")
        raw_data = pd.read_csv(train_file, na_values='?', index_col=0)
        assert len(raw_data) == 22792
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])
    all_nodes = list(result.dag.nodes)
    actual_node: DagNode = all_nodes[0]

    # Column names of the csv file; used for the node details and, extended by the lineage column,
    # for the expected lineage frame.
    adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                     'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                     'native-country', 'income-per-year']
    wanted_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 6),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.io.parsers', 'read_csv')),
                          DagNodeDetails(StringComparison(r".*\.csv"), adult_columns),
                          OptionalCodeInfo(CodeReference(6, 11, 6, 62),
                                           "pd.read_csv(train_file, na_values='?', index_col=0)"))
    compare(actual_node, wanted_node)

    actual_lineage = result.dag_node_to_inspection_results[actual_node][RowLineage(2)]
    first_row = [46, 'Private', 128645, 'Some-college', 10, 'Divorced', 'Prof-specialty', 'Not-in-family',
                 'White', 'Female', 0, 0, 40, 'United-States', '<=50K', {LineageId(0, 0)}]
    second_row = [29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married', 'Handlers-cleaners',
                  'Not-in-family', 'White', 'Male', 0, 0, 50, 'United-States', '<=50K', {LineageId(0, 1)}]
    wanted_lineage = DataFrame([first_row, second_row], columns=adult_columns + ['mlinspect_lineage'])
    pandas.testing.assert_frame_equal(actual_lineage.reset_index(drop=True), wanted_lineage.reset_index(drop=True))
def test_frame_dropna():
    """
    Tests whether the monkey patching of ('pandas.core.frame', 'dropna') works.

    Checks the DAG (data source -> selection) and the RowLineage(2) output of the dropna node.
    """
    pipeline_code = cleandoc("""
        import pandas as pd

        df = pd.DataFrame([0, 2, 4, 5, None], columns=['A'])
        assert len(df) == 5
        df = df.dropna()
        assert len(df) == 4
        """)
    result = _pipeline_executor.singleton.run(python_code=pipeline_code, track_code_references=True,
                                              inspections=[RowLineage(2)])

    source_node = DagNode(0,
                          BasicCodeLocation("<string-source>", 3),
                          OperatorContext(OperatorType.DATA_SOURCE, FunctionInfo('pandas.core.frame', 'DataFrame')),
                          DagNodeDetails(None, ['A']),
                          OptionalCodeInfo(CodeReference(3, 5, 3, 52),
                                           "pd.DataFrame([0, 2, 4, 5, None], columns=['A'])"))
    dropna_node = DagNode(1,
                          BasicCodeLocation("<string-source>", 5),
                          OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
                          DagNodeDetails('dropna', ['A']),
                          OptionalCodeInfo(CodeReference(5, 5, 5, 16), 'df.dropna()'))
    wanted_dag = networkx.DiGraph()
    wanted_dag.add_edge(source_node, dropna_node)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(wanted_dag))

    dropna_lineage = result.dag_node_to_inspection_results[dropna_node][RowLineage(2)]
    wanted_lineage = DataFrame([[0., {LineageId(0, 0)}],
                                [2., {LineageId(0, 1)}]],
                               columns=['A', 'mlinspect_lineage'])
    pandas.testing.assert_frame_equal(dropna_lineage.reset_index(drop=True), wanted_lineage.reset_index(drop=True))
def get_input_info(df_object, caller_filename, lineno, function_info, optional_code_reference, optional_source_code) \
        -> InputInfo:
    """
    Find the parent DAG node of the operator that is about to be inserted.

    If df_object carries the patched _mlinspect_annotation attribute, its _mlinspect_dag_node id is
    resolved through singleton.op_id_to_dag_node. Otherwise the data-source inspection visits are run
    on df_object and a MISSING_OP placeholder node with a warning description is added to the DAG.
    Raises NotImplementedError for objects that are not a DataFrame, Series, csr_matrix or ndarray.
    """
    # pylint: disable=too-many-arguments, unused-argument, protected-access, unused-variable, too-many-locals
    # The type check runs first, so unsupported objects are rejected even when they are annotated
    if isinstance(df_object, DataFrame):
        column_names = list(df_object.columns)  # TODO: Update this for numpy arrays etc. later
    elif isinstance(df_object, Series):
        column_names = [df_object.name]
    elif isinstance(df_object, (csr_matrix, numpy.ndarray)):
        column_names = ['array']
    else:
        raise NotImplementedError("TODO: Mlinspect info storage for type: '{}'".format(type(df_object)))

    if hasattr(df_object, "_mlinspect_annotation"):
        # Tracked object: reuse the node and annotations that were attached by an earlier patched call
        parent_node = singleton.op_id_to_dag_node[df_object._mlinspect_dag_node]
        return InputInfo(parent_node, AnnotatedDfObject(df_object, df_object._mlinspect_annotation))

    # Untracked object: treat it like a data source for the inspections and insert a placeholder node
    backend_result = execute_inspection_visits_data_source(
        OperatorContext(OperatorType.DATA_SOURCE, function_info), df_object)
    source_hint = "({})".format(optional_source_code) if optional_code_reference else ""
    warning_text = "Warning! Operator {}:{} {} encountered a DataFrame resulting from an operation " \
                   "without mlinspect support!".format(caller_filename, lineno, source_hint)
    placeholder_node = DagNode(singleton.get_next_missing_op_id(),
                               BasicCodeLocation(caller_filename, lineno),
                               OperatorContext(OperatorType.MISSING_OP, None),
                               DagNodeDetails(warning_text, column_names),
                               OptionalCodeInfo(optional_code_reference, optional_source_code))
    add_dag_node(placeholder_node, [], backend_result)
    fresh_annotation = backend_result.annotated_dfobject.result_annotation
    return InputInfo(placeholder_node, AnnotatedDfObject(df_object, fresh_annotation))
def get_expected_dag_adult_easy_py():
    """
    Get the expected DAG for the adult_easy pipeline

    Builds the graph node by node with the positional DagNode form
    (id, operator type, code reference, module tuple, description, columns, source code).

    Returns:
        networkx.DiGraph: the expected operator graph, with DagNode instances as nodes.
    """
    # pylint: disable=too-many-locals
    adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                     'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                     'hours-per-week', 'native-country', 'income-per-year']
    # The continuation lines are indented by four spaces. The nested transformer references below
    # start at columns 20 and 16 of source lines 19 and 20, which only fits a four-space indent
    # in front of "('categorical', " and "('numeric', ".
    pipeline_str = "compose.ColumnTransformer(transformers=[\n" \
                   "    ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), " \
                   "['education', 'workclass']),\n" \
                   "    ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])\n" \
                   "])"

    expected_graph = networkx.DiGraph()

    # pandas part: read the data and drop rows with missing values
    expected_data_source = DagNode(18, OperatorType.DATA_SOURCE, CodeReference(12, 11, 12, 62),
                                   ('pandas.io.parsers', 'read_csv'),
                                   "adult_train.csv",
                                   list(adult_columns),
                                   "pd.read_csv(train_file, na_values='?', index_col=0)")
    expected_graph.add_node(expected_data_source)

    expected_select = DagNode(20, OperatorType.SELECTION, CodeReference(14, 7, 14, 24),
                              ('pandas.core.frame', 'dropna'),
                              "dropna",
                              list(adult_columns),
                              'raw_data.dropna()')
    expected_graph.add_edge(expected_data_source, expected_select)

    # sklearn part: training data enters the pipeline and is split into one projection per column
    expected_train_data = DagNode(56, OperatorType.TRAIN_DATA, CodeReference(24, 18, 26, 51),
                                  ('sklearn.pipeline', 'fit', 'Train Data'),
                                  None,
                                  list(adult_columns),
                                  'income_pipeline.fit(data, labels)')
    expected_graph.add_edge(expected_select, expected_train_data)

    expected_pipeline_project_one = DagNode(34, OperatorType.PROJECTION, CodeReference(18, 25, 21, 2),
                                            ('sklearn.compose._column_transformer', 'ColumnTransformer',
                                             'Projection'),
                                            "to ['education'] (ColumnTransformer)", ['education'], pipeline_str)
    expected_graph.add_edge(expected_train_data, expected_pipeline_project_one)
    expected_pipeline_project_two = DagNode(35, OperatorType.PROJECTION, CodeReference(18, 25, 21, 2),
                                            ('sklearn.compose._column_transformer', 'ColumnTransformer',
                                             'Projection'),
                                            "to ['workclass'] (ColumnTransformer)", ['workclass'], pipeline_str)
    expected_graph.add_edge(expected_train_data, expected_pipeline_project_two)
    expected_pipeline_project_three = DagNode(40, OperatorType.PROJECTION, CodeReference(18, 25, 21, 2),
                                              ('sklearn.compose._column_transformer', 'ColumnTransformer',
                                               'Projection'),
                                              "to ['age'] (ColumnTransformer)", ['age'], pipeline_str)
    expected_graph.add_edge(expected_train_data, expected_pipeline_project_three)
    expected_pipeline_project_four = DagNode(41, OperatorType.PROJECTION, CodeReference(18, 25, 21, 2),
                                             ('sklearn.compose._column_transformer', 'ColumnTransformer',
                                              'Projection'),
                                             "to ['hours-per-week'] (ColumnTransformer)", ['hours-per-week'],
                                             pipeline_str)
    expected_graph.add_edge(expected_train_data, expected_pipeline_project_four)

    # One transformer node per projected column
    expected_pipeline_transformer_one = DagNode(34, OperatorType.TRANSFORMER, CodeReference(19, 20, 19, 72),
                                                ('sklearn.preprocessing._encoders', 'OneHotEncoder', 'Pipeline'),
                                                "Categorical Encoder (OneHotEncoder), Column: 'education'",
                                                ['education'],
                                                "preprocessing.OneHotEncoder(handle_unknown='ignore')")
    expected_graph.add_edge(expected_pipeline_project_one, expected_pipeline_transformer_one)
    expected_pipeline_transformer_two = DagNode(35, OperatorType.TRANSFORMER, CodeReference(19, 20, 19, 72),
                                                ('sklearn.preprocessing._encoders', 'OneHotEncoder', 'Pipeline'),
                                                "Categorical Encoder (OneHotEncoder), Column: 'workclass'",
                                                ['workclass'],
                                                "preprocessing.OneHotEncoder(handle_unknown='ignore')")
    expected_graph.add_edge(expected_pipeline_project_two, expected_pipeline_transformer_two)
    expected_pipeline_transformer_three = DagNode(40, OperatorType.TRANSFORMER, CodeReference(20, 16, 20, 46),
                                                  ('sklearn.preprocessing._data', 'StandardScaler', 'Pipeline'),
                                                  "Numerical Encoder (StandardScaler), Column: 'age'",
                                                  ['age'],
                                                  'preprocessing.StandardScaler()')
    expected_graph.add_edge(expected_pipeline_project_three, expected_pipeline_transformer_three)
    expected_pipeline_transformer_four = DagNode(41, OperatorType.TRANSFORMER, CodeReference(20, 16, 20, 46),
                                                 ('sklearn.preprocessing._data', 'StandardScaler', 'Pipeline'),
                                                 "Numerical Encoder (StandardScaler), Column: 'hours-per-week'",
                                                 ['hours-per-week'],
                                                 'preprocessing.StandardScaler()')
    expected_graph.add_edge(expected_pipeline_project_four, expected_pipeline_transformer_four)

    # All transformer outputs are concatenated into a single array that feeds the estimator
    expected_pipeline_concatenation = DagNode(46, OperatorType.CONCATENATION, CodeReference(18, 25, 21, 2),
                                              ('sklearn.compose._column_transformer', 'ColumnTransformer',
                                               'Concatenation'),
                                              None, ['array'], pipeline_str)
    expected_graph.add_edge(expected_pipeline_transformer_one, expected_pipeline_concatenation)
    expected_graph.add_edge(expected_pipeline_transformer_two, expected_pipeline_concatenation)
    expected_graph.add_edge(expected_pipeline_transformer_three, expected_pipeline_concatenation)
    expected_graph.add_edge(expected_pipeline_transformer_four, expected_pipeline_concatenation)

    expected_estimator = DagNode(51, OperatorType.ESTIMATOR, CodeReference(26, 19, 26, 48),
                                 ('sklearn.tree._classes', 'DecisionTreeClassifier', 'Pipeline'),
                                 "Decision Tree",
                                 source_code='tree.DecisionTreeClassifier()')
    expected_graph.add_edge(expected_pipeline_concatenation, expected_estimator)

    expected_pipeline_fit = DagNode(56, OperatorType.FIT, CodeReference(24, 18, 26, 51),
                                    ('sklearn.pipeline', 'fit', 'Pipeline'),
                                    source_code='income_pipeline.fit(data, labels)')
    expected_graph.add_edge(expected_estimator, expected_pipeline_fit)

    # Label branch: project the target column, binarize it and feed it into the fit node
    expected_project = DagNode(23, OperatorType.PROJECTION, CodeReference(16, 38, 16, 61),
                               ('pandas.core.frame', '__getitem__', 'Projection'),
                               "to ['income-per-year']", ['income-per-year'],
                               "data['income-per-year']")
    expected_graph.add_edge(expected_select, expected_project)

    expected_project_modify = DagNode(28, OperatorType.PROJECTION_MODIFY, CodeReference(16, 9, 16, 89),
                                      ('sklearn.preprocessing._label', 'label_binarize'),
                                      "label_binarize, classes: ['>50K', '<=50K']", ['array'],
                                      "preprocessing.label_binarize(data['income-per-year'], "
                                      "classes=['>50K', '<=50K'])")
    expected_graph.add_edge(expected_project, expected_project_modify)

    expected_train_labels = DagNode(56, OperatorType.TRAIN_LABELS, CodeReference(24, 18, 26, 51),
                                    ('sklearn.pipeline', 'fit', 'Train Labels'),
                                    None, ['array'],
                                    'income_pipeline.fit(data, labels)')
    expected_graph.add_edge(expected_project_modify, expected_train_labels)
    expected_graph.add_edge(expected_train_labels, expected_pipeline_fit)

    return expected_graph
def get_expected_dag_adult_easy(caller_filename: str, line_offset: int = 0, with_code_references=True):
    """
    Get the expected DAG for the adult_easy pipeline

    Args:
        caller_filename: file name stored in the BasicCodeLocation of every expected node.
        line_offset: value added to every expected line number.
        with_code_references: if False, optional_code_info is set to None on every node before returning.

    Returns:
        networkx.DiGraph: the expected operator graph, with DagNode instances as nodes.
    """
    # pylint: disable=too-many-locals
    # The line numbers differ slightly between the .py file and the .ipynb file
    adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                     'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                     'hours-per-week', 'native-country', 'income-per-year']
    # The continuation lines are indented by four spaces. The nested transformer references below
    # start at columns 20 and 16 of source lines 19 and 20, which only fits a four-space indent
    # in front of "('categorical', " and "('numeric', ".
    pipeline_str = "compose.ColumnTransformer(transformers=[\n" \
                   "    ('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), " \
                   "['education', 'workclass']),\n" \
                   "    ('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])\n" \
                   "])"

    expected_graph = networkx.DiGraph()

    # pandas part: read the data and drop rows with missing values
    expected_data_source = DagNode(0,
                                   BasicCodeLocation(caller_filename, 12 + line_offset),
                                   OperatorContext(OperatorType.DATA_SOURCE,
                                                   FunctionInfo('pandas.io.parsers', 'read_csv')),
                                   DagNodeDetails('adult_train.csv', list(adult_columns)),
                                   OptionalCodeInfo(CodeReference(12 + line_offset, 11, 12 + line_offset, 62),
                                                    "pd.read_csv(train_file, na_values='?', index_col=0)"))
    expected_graph.add_node(expected_data_source)

    expected_select = DagNode(1,
                              BasicCodeLocation(caller_filename, 14 + line_offset),
                              OperatorContext(OperatorType.SELECTION, FunctionInfo('pandas.core.frame', 'dropna')),
                              DagNodeDetails('dropna', list(adult_columns)),
                              OptionalCodeInfo(CodeReference(14 + line_offset, 7, 14 + line_offset, 24),
                                               'raw_data.dropna()'))
    expected_graph.add_edge(expected_data_source, expected_select)

    # ColumnTransformer part: one projection and one transformer per transformer entry
    expected_pipeline_project_one = DagNode(4,
                                            BasicCodeLocation(caller_filename, 18 + line_offset),
                                            OperatorContext(OperatorType.PROJECTION,
                                                            FunctionInfo('sklearn.compose._column_transformer',
                                                                         'ColumnTransformer')),
                                            DagNodeDetails("to ['education', 'workclass']",
                                                           ['education', 'workclass']),
                                            OptionalCodeInfo(CodeReference(18 + line_offset, 25,
                                                                           21 + line_offset, 2),
                                                             pipeline_str))
    expected_graph.add_edge(expected_select, expected_pipeline_project_one)
    expected_pipeline_project_two = DagNode(6,
                                            BasicCodeLocation(caller_filename, 18 + line_offset),
                                            OperatorContext(OperatorType.PROJECTION,
                                                            FunctionInfo('sklearn.compose._column_transformer',
                                                                         'ColumnTransformer')),
                                            DagNodeDetails("to ['age', 'hours-per-week']",
                                                           ['age', 'hours-per-week']),
                                            OptionalCodeInfo(CodeReference(18 + line_offset, 25,
                                                                           21 + line_offset, 2),
                                                             pipeline_str))
    expected_graph.add_edge(expected_select, expected_pipeline_project_two)

    expected_pipeline_transformer_one = DagNode(5,
                                                BasicCodeLocation(caller_filename, 19 + line_offset),
                                                OperatorContext(OperatorType.TRANSFORMER,
                                                                FunctionInfo('sklearn.preprocessing._encoders',
                                                                             'OneHotEncoder')),
                                                DagNodeDetails('One-Hot Encoder: fit_transform', ['array']),
                                                OptionalCodeInfo(CodeReference(19 + line_offset, 20,
                                                                               19 + line_offset, 72),
                                                                 "preprocessing.OneHotEncoder("
                                                                 "handle_unknown='ignore')"))
    expected_pipeline_transformer_two = DagNode(7,
                                                BasicCodeLocation(caller_filename, 20 + line_offset),
                                                OperatorContext(OperatorType.TRANSFORMER,
                                                                FunctionInfo('sklearn.preprocessing._data',
                                                                             'StandardScaler')),
                                                DagNodeDetails('Standard Scaler: fit_transform', ['array']),
                                                OptionalCodeInfo(CodeReference(20 + line_offset, 16,
                                                                               20 + line_offset, 46),
                                                                 'preprocessing.StandardScaler()'))
    expected_graph.add_edge(expected_pipeline_project_one, expected_pipeline_transformer_one)
    expected_graph.add_edge(expected_pipeline_project_two, expected_pipeline_transformer_two)

    expected_pipeline_concatenation = DagNode(8,
                                              BasicCodeLocation(caller_filename, 18 + line_offset),
                                              OperatorContext(OperatorType.CONCATENATION,
                                                              FunctionInfo('sklearn.compose._column_transformer',
                                                                           'ColumnTransformer')),
                                              DagNodeDetails(None, ['array']),
                                              OptionalCodeInfo(CodeReference(18 + line_offset, 25,
                                                                             21 + line_offset, 2),
                                                               pipeline_str))
    expected_graph.add_edge(expected_pipeline_transformer_one, expected_pipeline_concatenation)
    expected_graph.add_edge(expected_pipeline_transformer_two, expected_pipeline_concatenation)

    # Estimator part: train data, train labels and the classifier all point at the classifier call
    expected_train_data = DagNode(9,
                                  BasicCodeLocation(caller_filename, 26 + line_offset),
                                  OperatorContext(OperatorType.TRAIN_DATA,
                                                  FunctionInfo('sklearn.tree._classes', 'DecisionTreeClassifier')),
                                  DagNodeDetails(None, ['array']),
                                  OptionalCodeInfo(CodeReference(26 + line_offset, 19, 26 + line_offset, 48),
                                                   'tree.DecisionTreeClassifier()'))
    expected_graph.add_edge(expected_pipeline_concatenation, expected_train_data)

    expected_project = DagNode(2,
                               BasicCodeLocation(caller_filename, 16 + line_offset),
                               OperatorContext(OperatorType.PROJECTION,
                                               FunctionInfo('pandas.core.frame', '__getitem__')),
                               DagNodeDetails("to ['income-per-year']", ['income-per-year']),
                               OptionalCodeInfo(CodeReference(16 + line_offset, 38, 16 + line_offset, 61),
                                                "data['income-per-year']"))
    expected_graph.add_edge(expected_select, expected_project)

    expected_project_modify = DagNode(3,
                                      BasicCodeLocation(caller_filename, 16 + line_offset),
                                      OperatorContext(OperatorType.PROJECTION_MODIFY,
                                                      FunctionInfo('sklearn.preprocessing._label',
                                                                   'label_binarize')),
                                      DagNodeDetails("label_binarize, classes: ['>50K', '<=50K']", ['array']),
                                      OptionalCodeInfo(CodeReference(16 + line_offset, 9, 16 + line_offset, 89),
                                                       "preprocessing.label_binarize(data['income-per-year'], "
                                                       "classes=['>50K', '<=50K'])"))
    expected_graph.add_edge(expected_project, expected_project_modify)

    expected_train_labels = DagNode(10,
                                    BasicCodeLocation(caller_filename, 26 + line_offset),
                                    OperatorContext(OperatorType.TRAIN_LABELS,
                                                    FunctionInfo('sklearn.tree._classes',
                                                                 'DecisionTreeClassifier')),
                                    DagNodeDetails(None, ['array']),
                                    OptionalCodeInfo(CodeReference(26 + line_offset, 19, 26 + line_offset, 48),
                                                     'tree.DecisionTreeClassifier()'))
    expected_graph.add_edge(expected_project_modify, expected_train_labels)

    expected_estimator = DagNode(11,
                                 BasicCodeLocation(caller_filename, 26 + line_offset),
                                 OperatorContext(OperatorType.ESTIMATOR,
                                                 FunctionInfo('sklearn.tree._classes', 'DecisionTreeClassifier')),
                                 DagNodeDetails('Decision Tree', []),
                                 OptionalCodeInfo(CodeReference(26 + line_offset, 19, 26 + line_offset, 48),
                                                  'tree.DecisionTreeClassifier()'))
    expected_graph.add_edge(expected_train_data, expected_estimator)
    expected_graph.add_edge(expected_train_labels, expected_estimator)

    if not with_code_references:
        # NOTE(review): this assigns to nodes that are already stored in the graph; it assumes DagNode
        # allows attribute assignment and that optional_code_info does not affect hashing - confirm.
        for dag_node in expected_graph.nodes:
            dag_node.optional_code_info = None

    return expected_graph
def test_ols_fit():
    """
    Checks that a patched ('statsmodels.regression.linear_model.OLS', 'fit') call produces the train data,
    train labels and estimator nodes, and that RowLineage(3) reports the expected rows for each of them
    """
    pipeline_code = cleandoc("""
        import numpy as np
        import statsmodels.api as sm
        np.random.seed(42)
        nobs = 100
        X = np.random.random((nobs, 2))
        X = sm.add_constant(X)
        beta = [1, .1, .5]
        e = np.random.random(nobs)
        y = np.dot(X, beta) + e
        results = sm.OLS(y, X).fit()
        assert results.summary() is not None
        """)

    result = _pipeline_executor.singleton.run(python_code=pipeline_code,
                                              track_code_references=True,
                                              inspections=[RowLineage(3)])
    # Only the nodes created by the OLS fit are compared; the other nodes are removed by position.
    # The second removal indexes into the node list as it is after the first removal.
    result.dag.remove_nodes_from(list(result.dag.nodes)[0:4])
    result.dag.remove_node(list(result.dag.nodes)[1])

    def ols_node(node_id, operator_type, details):
        # All three expected nodes share location, function info and code reference
        return DagNode(node_id,
                       BasicCodeLocation("<string-source>", 10),
                       OperatorContext(operator_type, FunctionInfo('statsmodel.api.OLS', 'fit')),
                       details,
                       OptionalCodeInfo(CodeReference(10, 10, 10, 22), 'sm.OLS(y, X)'))

    def check_lineage(dag_node, expected_df, **compare_options):
        # Compares the materialized lineage rows of one node, ignoring the original index
        actual_df = result.dag_node_to_inspection_results[dag_node][RowLineage(3)]
        pandas.testing.assert_frame_equal(actual_df.reset_index(drop=True),
                                          expected_df.reset_index(drop=True),
                                          **compare_options)

    expected_train_data = ols_node(3, OperatorType.TRAIN_DATA, DagNodeDetails(None, ['array']))
    expected_train_labels = ols_node(4, OperatorType.TRAIN_LABELS, DagNodeDetails(None, ['array']))
    # NOTE(review): 'Decision Tree' looks odd for an OLS estimator, but the value has to match the
    # description the patch emits - confirm there before changing it here.
    expected_ols = ols_node(5, OperatorType.ESTIMATOR, DagNodeDetails('Decision Tree', []))

    expected_dag = networkx.DiGraph()
    expected_dag.add_edge(expected_train_data, expected_ols)
    expected_dag.add_edge(expected_train_labels, expected_ols)
    compare(networkx.to_dict_of_dicts(result.dag), networkx.to_dict_of_dicts(expected_dag))

    train_data_lineage = DataFrame(
        [[numpy.array([1.0, 0.3745401188473625, 0.9507143064099162]), {LineageId(3, 0)}],
         [numpy.array([1.0, 0.7319939418114051, 0.5986584841970366]), {LineageId(3, 1)}],
         [numpy.array([1.0, 0.15601864044243652, 0.15599452033620265]), {LineageId(3, 2)}]],
        columns=['array', 'mlinspect_lineage'])
    check_lineage(expected_train_data, train_data_lineage, atol=0.1)

    train_labels_lineage = DataFrame(
        [[2.154842811243982, {LineageId(5, 0)}],
         [1.4566686012747074, {LineageId(5, 1)}],
         [1.2552278383069588, {LineageId(5, 2)}]],
        columns=['array', 'mlinspect_lineage'])
    check_lineage(expected_train_labels, train_labels_lineage, atol=0.1)

    estimator_lineage = DataFrame(
        [[{LineageId(5, 0), LineageId(3, 0)}],
         [{LineageId(5, 1), LineageId(3, 1)}],
         [{LineageId(5, 2), LineageId(3, 2)}]],
        columns=['mlinspect_lineage'])
    check_lineage(expected_ols, estimator_lineage, check_column_type=False)
def get_expected_result():
    """
    Get the expected PrintFirstRowsAnalyzer(2) result for the adult_easy example

    Returns:
        dict: maps each expected DagNode to a DataFrame with the first two rows expected for that
        operator; the estimator node maps to None.
    """
    adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                     'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                     'hours-per-week', 'native-country', 'income-per-year']
    # The same two input rows are expected after read_csv, after dropna and as pipeline train data
    adult_rows = [[46, 'Private', 128645, 'Some-college', 10, 'Divorced', 'Prof-specialty',
                   'Not-in-family', 'White', 'Female', 0, 0, 40, 'United-States', '<=50K'],
                  [29, 'Local-gov', 115585, 'Some-college', 10, 'Never-married', 'Handlers-cleaners',
                   'Not-in-family', 'White', 'Male', 0, 0, 50, 'United-States', '<=50K']]
    # The continuation lines are indented by four spaces. The nested transformer references below
    # start at columns 20 and 16 of source lines 19 and 20, which only fits a four-space indent
    # in front of "('categorical', " and "('numeric', ".
    pipeline_str = "compose.ColumnTransformer(transformers=[\n    " \
                   "('categorical', preprocessing.OneHotEncoder(handle_unknown='ignore'), " \
                   "['education', 'workclass']),\n    " \
                   "('numeric', preprocessing.StandardScaler(), ['age', 'hours-per-week'])\n])"

    expected_result = {
        DagNode(node_id=18, operator_type=OperatorType.DATA_SOURCE, module=('pandas.io.parsers', 'read_csv'),
                code_reference=CodeReference(12, 11, 12, 62), description='adult_train.csv',
                columns=list(adult_columns),
                source_code="pd.read_csv(train_file, na_values='?', index_col=0)"):
            DataFrame(adult_rows, columns=adult_columns),
        DagNode(node_id=20, operator_type=OperatorType.SELECTION, module=('pandas.core.frame', 'dropna'),
                code_reference=CodeReference(14, 7, 14, 24), description='dropna',
                columns=list(adult_columns),
                source_code='raw_data.dropna()'):
            DataFrame(adult_rows, columns=adult_columns),
        DagNode(node_id=23, operator_type=OperatorType.PROJECTION,
                module=('pandas.core.frame', '__getitem__', 'Projection'),
                code_reference=CodeReference(16, 38, 16, 61), description="to ['income-per-year']",
                columns=['income-per-year'], source_code="data['income-per-year']"):
            DataFrame([['<=50K'], ['<=50K']], columns=['array']),
        DagNode(node_id=28, operator_type=OperatorType.PROJECTION_MODIFY,
                module=('sklearn.preprocessing._label', 'label_binarize'),
                code_reference=CodeReference(16, 9, 16, 89),
                description="label_binarize, classes: ['>50K', '<=50K']",
                columns=['array'],
                source_code="preprocessing.label_binarize(data['income-per-year'], "
                            "classes=['>50K', '<=50K'])"):
            DataFrame([[array(1)], [array(1)]], columns=['array']),
        DagNode(node_id=56, operator_type=OperatorType.TRAIN_DATA,
                module=('sklearn.pipeline', 'fit', 'Train Data'),
                code_reference=CodeReference(24, 18, 26, 51), description=None,
                columns=list(adult_columns),
                source_code='income_pipeline.fit(data, labels)'):
            DataFrame(adult_rows, columns=adult_columns),
        DagNode(node_id=56, operator_type=OperatorType.TRAIN_LABELS,
                module=('sklearn.pipeline', 'fit', 'Train Labels'),
                code_reference=CodeReference(24, 18, 26, 51), description=None, columns=['array'],
                source_code='income_pipeline.fit(data, labels)'):
            DataFrame([[array(1)], [array(1)]], columns=['array']),
        DagNode(node_id=40, operator_type=OperatorType.PROJECTION, code_reference=CodeReference(18, 25, 21, 2),
                module=('sklearn.compose._column_transformer', 'ColumnTransformer', 'Projection'),
                description="to ['age'] (ColumnTransformer)", columns=['age'], source_code=pipeline_str):
            DataFrame([[46], [29]], columns=['age']),
        DagNode(node_id=34, operator_type=OperatorType.PROJECTION, code_reference=CodeReference(18, 25, 21, 2),
                module=('sklearn.compose._column_transformer', 'ColumnTransformer', 'Projection'),
                description="to ['education'] (ColumnTransformer)", columns=['education'],
                source_code=pipeline_str):
            DataFrame([['Some-college'], ['Some-college']], columns=['education']),
        DagNode(node_id=41, operator_type=OperatorType.PROJECTION, code_reference=CodeReference(18, 25, 21, 2),
                module=('sklearn.compose._column_transformer', 'ColumnTransformer', 'Projection'),
                description="to ['hours-per-week'] (ColumnTransformer)", columns=['hours-per-week'],
                source_code=pipeline_str):
            DataFrame([[40], [50]], columns=['hours-per-week']),
        DagNode(node_id=35, operator_type=OperatorType.PROJECTION, code_reference=CodeReference(18, 25, 21, 2),
                module=('sklearn.compose._column_transformer', 'ColumnTransformer', 'Projection'),
                description="to ['workclass'] (ColumnTransformer)", columns=['workclass'],
                source_code=pipeline_str):
            DataFrame([['Private'], ['Local-gov']], columns=['workclass']),
        # Scaled values are only pinned to a range via RangeComparison
        DagNode(node_id=40, operator_type=OperatorType.TRANSFORMER, code_reference=CodeReference(20, 16, 20, 46),
                module=('sklearn.preprocessing._data', 'StandardScaler', 'Pipeline'),
                description="Numerical Encoder (StandardScaler), Column: 'age'", columns=['age'],
                source_code='preprocessing.StandardScaler()'):
            DataFrame([[array(RangeComparison(0.5, 0.6))], [array(RangeComparison(-0.8, -0.7))]],
                      columns=['age']),
        DagNode(node_id=41, operator_type=OperatorType.TRANSFORMER, code_reference=CodeReference(20, 16, 20, 46),
                module=('sklearn.preprocessing._data', 'StandardScaler', 'Pipeline'),
                description="Numerical Encoder (StandardScaler), Column: 'hours-per-week'",
                columns=['hours-per-week'], source_code='preprocessing.StandardScaler()'):
            DataFrame([[array(RangeComparison(-0.09, -0.08))], [array(RangeComparison(0.7, 0.8))]],
                      columns=['hours-per-week']),
        DagNode(node_id=34, operator_type=OperatorType.TRANSFORMER, code_reference=CodeReference(19, 20, 19, 72),
                module=('sklearn.preprocessing._encoders', 'OneHotEncoder', 'Pipeline'),
                description="Categorical Encoder (OneHotEncoder), Column: 'education'", columns=['education'],
                source_code="preprocessing.OneHotEncoder(handle_unknown='ignore')"):
            DataFrame([[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])],
                       [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])]],
                      columns=['education']),
        DagNode(node_id=35, operator_type=OperatorType.TRANSFORMER, code_reference=CodeReference(19, 20, 19, 72),
                module=('sklearn.preprocessing._encoders', 'OneHotEncoder', 'Pipeline'),
                description="Categorical Encoder (OneHotEncoder), Column: 'workclass'", columns=['workclass'],
                source_code="preprocessing.OneHotEncoder(handle_unknown='ignore')"):
            DataFrame([[array([0., 0., 1., 0., 0., 0., 0.])],
                       [array([0., 1., 0., 0., 0., 0., 0.])]],
                      columns=['workclass']),
        DagNode(node_id=46, operator_type=OperatorType.CONCATENATION, code_reference=CodeReference(18, 25, 21, 2),
                module=('sklearn.compose._column_transformer', 'ColumnTransformer', 'Concatenation'),
                description=None, columns=['array'], source_code=pipeline_str):
            DataFrame([[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
                               0., 0., 1., 0., 0., 0., 0.,
                               RangeComparison(0.5, 0.6), RangeComparison(-0.09, -0.08)])],
                       [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
                               0., 1., 0., 0., 0., 0., 0.,
                               RangeComparison(-0.8, -0.7), RangeComparison(0.7, 0.8)])]],
                      columns=['array']),
        DagNode(node_id=51, operator_type=OperatorType.ESTIMATOR, code_reference=CodeReference(26, 19, 26, 48),
                module=('sklearn.tree._classes', 'DecisionTreeClassifier', 'Pipeline'),
                description='Decision Tree', source_code='tree.DecisionTreeClassifier()'):
            None
    }
    return expected_result