def target_plot_inter(X,Y,features,label,grid_ranges=None): from pdpbox import info_plots df = pd.concat([X, Y], axis=1, join_axes=[X.index]) if(grid_ranges is None): fig, ax, summary_df = info_plots.target_plot_interact(df,features=features,feature_names=features,target=label,grid_types=['equal','equal']) else: fig, ax, summary_df = info_plots.target_plot_interact(df,features=features,feature_names=features,target=label,grid_types=['equal','equal'], show_outliers='True',grid_ranges=grid_ranges) return fig, ax
def test_onehot_numeric_gridpoints_outliers(titanic_data, titanic_target): """ 'show_outliers' implies 'custom_grid_points' or another custom grid definition TODO: again, this should be explicit to the user """ fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, cust_grid_points=[range(0, 100, 10), None], show_outliers=True) expected = pd.DataFrame({ 'x1': { 0: 0, 15: 5, 29: 9 }, 'x2': { 0: 0, 15: 0, 29: 2 }, 'display_column_1': { 0: '[0, 10)', 15: '[50, 60)', 29: '> 90' }, 'display_column_2': { 0: 'Embarked_C', 15: 'Embarked_C', 29: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 15: 50.0, 29: 90.0 }, 'value_upper_1': { 0: 10.0, 15: 60.0, 29: nan }, 'count': { 0: 37.0, 15: 6.0, 29: 26.0 }, 'Survived': { 0: 0.24324324324324326, 15: 1.0, 29: 0.7692307692307693 } }) assert_frame_equal(expected, summary_df.loc[[0, 15, 29], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data)
def interactionPlotReal(data, featuresToExamine, pr): fig, axes, summary_df = info_plots.target_plot_interact( df=data, features=featuresToExamine, feature_names=featuresToExamine, target=pr.resultColumn) save("interactionPlotReal", plt=plt, fig=fig)
def test_onehot_numeric_gridtype_equal(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, grid_types=['equal', 'equal']) expected = pd.DataFrame({ 'x1': { 0: 0, 13: 4, 26: 8 }, 'x2': { 0: 0, 13: 1, 26: 2 }, 'display_column_1': { 0: '[0, 56.93)', 13: '[227.7, 284.63)', 26: '[455.4, 512.33]' }, 'display_column_2': { 0: 'Embarked_C', 13: 'Embarked_Q', 26: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 13: 227.70186666666666, 26: 455.4037333333333 }, 'value_upper_1': { 0: 56.925466666666665, 13: 284.62733333333335, 26: 512.3292 }, 'count': { 0: 108.0, 13: 0.0, 26: 0.0 }, 'Survived': { 0: 0.42592592592592593, 13: 0.0, 26: 0.0 } }) assert_frame_equal(expected, summary_df.loc[[0, 13, 26], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data) # all bins should have the same width (equal grid type) assert (summary_df.value_upper_1 - summary_df.value_lower_1).diff().sum() < 1e-9
def test_onehot_numeric_gridranges_outliers_endpoint(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, grid_types=['equal', 'equal'], grid_ranges=[(0, 100), None], show_outliers=True, endpoint=False) expected = pd.DataFrame({ 'x1': { 0: 0, 15: 5, 29: 9 }, 'x2': { 0: 0, 15: 0, 29: 2 }, 'display_column_1': { 0: '[0, 11.11)', 15: '[55.56, 66.67)', 29: '>= 100' }, 'display_column_2': { 0: 'Embarked_C', 15: 'Embarked_C', 29: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 15: 55.55555555555556, 29: 100.0 }, 'value_upper_1': { 0: 11.11111111111111, 15: 66.66666666666666, 29: nan }, 'count': { 0: 37.0, 15: 8.0, 29: 24.0 }, 'Survived': { 0: 0.24324324324324326, 15: 0.75, 29: 0.75 } }) assert_frame_equal(expected, summary_df.loc[[0, 15, 29], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data)
def test_onehot_numeric_percentile(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, percentile_ranges=[(5, 95), None]) expected = pd.DataFrame({ 'x1': { 0: 0, 13: 4, 26: 8 }, 'x2': { 0: 0, 13: 1, 26: 2 }, 'display_column_1': { 0: '[7.22, 7.75)', 13: '[13, 16.1)', 26: '[56.5, 112.08]' }, 'display_column_2': { 0: 'Embarked_C', 13: 'Embarked_Q', 26: 'Embarked_S' }, 'value_lower_1': { 0: 7.225, 13: 13.0, 26: 56.4958 }, 'value_upper_1': { 0: 7.75, 13: 16.1, 26: 112.07915 }, 'count': { 0: 27.0, 13: 8.0, 26: 50.0 }, 'Survived': { 0: 0.25925925925925924, 13: 0.375, 26: 0.56 } }) assert_frame_equal(expected, summary_df.loc[[0, 13, 26], :], check_like=True) fare = titanic_data['Fare'] inside_percentile = ((fare >= fare.quantile(0.05)) & (fare <= fare.quantile(0.95))) assert summary_df['count'].sum() == inside_percentile.sum()
def test_binary_onehot(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Sex", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Sex", "Embarked"], target=titanic_target, ) assert summary_df["count"].sum() == len(titanic_data)
def test_onehot_numeric_gridpoints_outliers_endpoint(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, cust_grid_points=[range(0, 100, 10), None], show_outliers=True, endpoint=False) expected = pd.DataFrame({ 'x1': { 0: 0, 15: 5, 29: 9 }, 'x2': { 0: 0, 15: 0, 29: 2 }, 'display_column_1': { 0: '[0, 10)', 15: '[50, 60)', 29: '>= 90' }, 'display_column_2': { 0: 'Embarked_C', 15: 'Embarked_C', 29: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 15: 50.0, 29: 90.0 }, 'value_upper_1': { 0: 10.0, 15: 60.0, 29: nan }, 'count': { 0: 37.0, 15: 6.0, 29: 28.0 }, 'Survived': { 0: 0.24324324324324326, 15: 1.0, 29: 0.7857142857142857 } }) assert_frame_equal(expected, summary_df.loc[[0, 15, 29], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data)
def test_onehot_numeric_endpoint(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, endpoint=False, ) assert summary_df["count"].sum() == len(titanic_data)
def test_onehot_numeric_num_grid_points(titanic_data, titanic_target): fare_grid_points = 15 fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, num_grid_points=[fare_grid_points, None]) expected = pd.DataFrame({ 'x1': { 0: 0, 21: 7, 41: 13 }, 'x2': { 0: 0, 21: 0, 41: 2 }, 'display_column_1': { 0: '[0, 7.23)', 21: '[14.45, 19.26)', 41: '[86.5, 512.33]' }, 'display_column_2': { 0: 'Embarked_C', 21: 'Embarked_C', 41: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 21: 14.4542, 41: 86.5 }, 'value_upper_1': { 0: 7.2292000000000005, 21: 19.2583, 41: 512.3292 }, 'count': { 0: 29.0, 21: 21.0, 41: 31.0 }, 'Survived': { 0: 0.2413793103448276, 21: 0.3333333333333333, 41: 0.8064516129032258 } }) assert_frame_equal(expected, summary_df.loc[[0, 21, 41], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data) assert len(summary_df) == (fare_grid_points - 1) * 3
def test_onehot_numeric_endpoint(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, endpoint=False) expected = pd.DataFrame({ 'x1': { 0: 0, 15: 5, 29: 9 }, 'x2': { 0: 0, 15: 0, 29: 2 }, 'display_column_1': { 0: '[0, 7.73)', 15: '[16.7, 26)', 29: '>= 512.33' }, 'display_column_2': { 0: 'Embarked_C', 15: 'Embarked_C', 29: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 15: 16.7, 29: 512.3292 }, 'value_upper_1': { 0: 7.732844444444444, 15: 26.0, 29: nan }, 'count': { 0: 29.0, 15: 11.0, 29: 0.0 }, 'Survived': { 0: 0.2413793103448276, 15: 0.7272727272727273, 29: 0.0 } }) assert_frame_equal(expected, summary_df.loc[[0, 15, 29], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data)
def test_onehot_numeric_num_grid_points(titanic_data, titanic_target): fare_grid_points = 15 fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, num_grid_points=[fare_grid_points, None], ) assert summary_df["count"].sum() == len(titanic_data) assert len(summary_df) == (fare_grid_points - 1) * 3
def test_binary_numeric(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", "Sex"], feature_names=["Fare", "Sex"], target=titanic_target, show_percentile=True, percentile_ranges=[(5, 95), None], show_outliers=True, ) assert summary_df["count"].sum() == len(titanic_data)
def test_onehot_numeric(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target) expected = pd.DataFrame({ 'x1': { 0: 0, 13: 4, 26: 8 }, 'x2': { 0: 0, 13: 1, 26: 2 }, 'display_column_1': { 0: '[0, 7.73)', 13: '[13, 16.7)', 26: '[73.5, 512.33]' }, 'display_column_2': { 0: 'Embarked_C', 13: 'Embarked_Q', 26: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 13: 13.0, 26: 73.5 }, 'value_upper_1': { 0: 7.732844444444444, 13: 16.7, 26: 512.3292 }, 'count': { 0: 29.0, 13: 8.0, 26: 51.0 }, 'Survived': { 0: 0.2413793103448276, 13: 0.375, 26: 0.6862745098039216 } }) assert_frame_equal(expected, summary_df.loc[[0, 13, 26], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data)
def test_onehot_numeric_gridpoints_outliers_endpoint(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, cust_grid_points=[range(0, 100, 10), None], show_outliers=True, endpoint=False, ) assert summary_df["count"].sum() == len(titanic_data)
def test_onehot_numeric_show_outliers_endpoint(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, percentile_ranges=[(5, 95), None], show_percentile=True, show_outliers=True, endpoint=False, ) assert summary_df["count"].sum() == len(titanic_data)
def test_onehot_numeric_percentile(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, percentile_ranges=[(5, 95), None], ) fare = titanic_data["Fare"] inside_percentile = (fare >= fare.quantile(0.05)) & (fare <= fare.quantile(0.95)) assert summary_df["count"].sum() == inside_percentile.sum()
def test_onehot_numeric_gridtype_equal(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, grid_types=["equal", "equal"], ) assert summary_df["count"].sum() == len(titanic_data) # all bins should have the same width (equal grid type) assert (summary_df.value_upper_1 - summary_df.value_lower_1).diff().sum() < 1e-9
def test_onehot_numeric_gridpoints_outliers(titanic_data, titanic_target): """ 'show_outliers' implies 'custom_grid_points' or another custom grid definition TODO: again, this should be explicit to the user """ fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, cust_grid_points=[range(0, 100, 10), None], show_outliers=True, ) assert summary_df["count"].sum() == len(titanic_data)
def test_onehot_numeric_cust_grid_points(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, cust_grid_points=[range(0, 100, 10), None], ) # counts must total to the total observations inside the defined range inside_range = (titanic_data["Fare"] >= 0) & (titanic_data["Fare"] <= 90) assert summary_df["count"].sum() == inside_range.sum() # lower and upper values must follow the prescribed grid assert (summary_df.groupby("x1").value_lower_1.mean().values == np.arange( 0.0, 90, 10)).all() assert (summary_df.groupby("x1").value_upper_1.mean().values == np.arange( 10.0, 100, 10)).all()
def test_binary_onehot(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Sex', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Sex', 'Embarked'], target=titanic_target) expected = pd.DataFrame({ 'x1': { 0: 0, 2: 0, 5: 1 }, 'x2': { 0: 0, 2: 2, 5: 2 }, 'display_column_1': { 0: 'Sex_0', 2: 'Sex_0', 5: 'Sex_1' }, 'display_column_2': { 0: 'Embarked_C', 2: 'Embarked_S', 5: 'Embarked_S' }, 'count': { 0: 73, 2: 205, 5: 441 }, 'Survived': { 0: 0.8767123287671232, 2: 0.6926829268292682, 5: 0.1746031746031746 } }) assert_frame_equal(expected, summary_df.loc[[0, 2, 5], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data)
def test_onehot_numeric_gridranges(titanic_data, titanic_target): """ Grid type must be 'equal' for grid ranges to work TODO: maybe this should be automatic or at least warn the user when grid types not 'equal' """ fig, axes, summary_df = target_plot_interact( df=titanic_data, features=["Fare", ["Embarked_C", "Embarked_Q", "Embarked_S"]], feature_names=["Fare", "Embarked"], target=titanic_target, grid_types=["equal", "equal"], grid_ranges=[(0, 100), None], ) # counts must total to the total observations inside the defined range inside_range = (titanic_data["Fare"] >= 0) & (titanic_data["Fare"] <= 100) assert summary_df["count"].sum() == inside_range.sum() # first and last values must be equal to the defined ranges assert summary_df.groupby("x2").first().value_lower_1.unique()[0] == 0.0 assert summary_df.groupby("x2").last().value_upper_1.unique()[0] == 100.0
#Now let's see the prediction of our model, how many survive for each age category fig, axes, summary_df = info_plots.actual_plot( model=titanic_model, X=titanic_data[titanic_features], feature='Age', feature_name='Age', show_percentile=True ) display(summary_df) #And finally let's obtain the PDP for the feature Age pdp_fare = pdp.pdp_isolate( model=titanic_model, dataset=titanic_data, model_features=titanic_features, feature='Age' ) fig, axes = pdp.pdp_plot(pdp_fare, 'Age', plot_pts_dist=True) #Let's study the link between Age and Pclass #Statistics of survivors based on Age and Pclass fig, axes, summary_df = info_plots.target_plot_interact( df=titanic_data, features=['Age', 'Pclass'], feature_names=['Age', 'Pclass'], target=titanic_target ) display(summary_df.head()) #Prediction of our model, impact if Age and Pclass fig, axes, summary_df = info_plots.actual_plot_interact( model=titanic_model, X=titanic_data[titanic_features], features=['Age', 'Pclass'], feature_names=['Age', 'Pclass'] ) display(summary_df.head()) #PDP for the interaction between Age and Pclass inter1 = pdp.pdp_interact( model=titanic_model, dataset=titanic_data, model_features=titanic_features, features=['Age', 'Pclass'] ) fig, axes = pdp.pdp_interact_plot( pdp_interact_out=inter1, feature_names=['age', 'Pclass'], plot_type='contour', x_quantile=True, plot_pdp=True )
def test_binary_numeric(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact(df=titanic_data, features=['Fare', 'Sex'], feature_names=['Fare', 'Sex'], target=titanic_target, show_percentile=True, percentile_ranges=[(5, 95), None], show_outliers=True) expected = pd.DataFrame({ 'x1': { 0: 0, 11: 5, 21: 10 }, 'x2': { 0: 0, 11: 1, 21: 1 }, 'display_column_1': { 0: '< 7.22', 11: '[13, 16.1)', 21: '> 112.08' }, 'display_column_2': { 0: 'Sex_0', 11: 'Sex_1', 21: 'Sex_1' }, 'value_lower_1': { 0: nan, 11: 13.0, 21: 112.07915 }, 'value_upper_1': { 0: 7.225, 11: 16.1, 21: nan }, 'percentile_column_1': { 0: '< 5', 11: '[45, 55)', 21: '> 95' }, 'percentile_lower_1': { 0: 0.0, 11: 45.0, 21: 95.0 }, 'percentile_upper_1': { 0: 5.0, 11: 55.0, 21: 100.0 }, 'count': { 0: 1, 11: 59, 21: 15 }, 'Survived': { 0: 0.0, 11: 0.1864406779661017, 21: 0.4 } }) assert_frame_equal(expected, summary_df.loc[[0, 11, 21], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data)
def test_onehot_numeric_show_outliers_endpoint(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, percentile_ranges=[(5, 95), None], show_percentile=True, show_outliers=True, endpoint=False) expected = pd.DataFrame({ 'x1': { 0: 0, 16: 5, 32: 10 }, 'x2': { 0: 0, 16: 1, 32: 2 }, 'display_column_1': { 0: '< 7.22', 16: '[13, 16.1)', 32: '>= 112.08' }, 'display_column_2': { 0: 'Embarked_C', 16: 'Embarked_Q', 32: 'Embarked_S' }, 'value_lower_1': { 0: nan, 16: 13.0, 32: 112.07915 }, 'value_upper_1': { 0: 7.225, 16: 16.1, 32: nan }, 'percentile_column_1': { 0: '< 5', 16: '[45, 55)', 32: '>= 95' }, 'percentile_lower_1': { 0: 0.0, 16: 45.0, 32: 95.0 }, 'percentile_upper_1': { 0: 5.0, 16: 55.0, 32: 100.0 }, 'count': { 0: 2.0, 16: 8.0, 32: 24.0 }, 'Survived': { 0: 0.0, 16: 0.375, 32: 0.75 } }) assert_frame_equal(expected, summary_df.loc[[0, 16, 32], :], check_like=True) assert summary_df['count'].sum() == len(titanic_data)
def test_onehot_numeric_cust_grid_points(titanic_data, titanic_target): fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, cust_grid_points=[range(0, 100, 10), None]) expected = pd.DataFrame({ 'x1': { 0: 0, 13: 4, 26: 8 }, 'x2': { 0: 0, 13: 1, 26: 2 }, 'display_column_1': { 0: '[0, 10)', 13: '[40, 50)', 26: '[80, 90]' }, 'display_column_2': { 0: 'Embarked_C', 13: 'Embarked_Q', 26: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 13: 40.0, 26: 80.0 }, 'value_upper_1': { 0: 10.0, 13: 50.0, 26: 90.0 }, 'count': { 0: 37.0, 13: 0.0, 26: 10.0 }, 'Survived': { 0: 0.24324324324324326, 13: 0.0, 26: 0.9 } }) assert_frame_equal(expected, summary_df.loc[[0, 13, 26], :], check_like=True) # counts must total to the total observations inside the defined range inside_range = (titanic_data['Fare'] >= 0) & (titanic_data['Fare'] <= 90) assert summary_df['count'].sum() == inside_range.sum() # lower and upper values must follow the prescribed grid assert (summary_df.groupby('x1').value_lower_1.mean().values == np.arange( 0., 90, 10)).all() assert (summary_df.groupby('x1').value_upper_1.mean().values == np.arange( 10., 100, 10)).all()
def test_onehot_numeric_gridranges(titanic_data, titanic_target): """ Grid type must be 'equal' for grid ranges to work TODO: maybe this should be automatic or at least warn the user when grid types not 'equal' """ fig, axes, summary_df = target_plot_interact( df=titanic_data, features=['Fare', ['Embarked_C', 'Embarked_Q', 'Embarked_S']], feature_names=['Fare', 'Embarked'], target=titanic_target, grid_types=['equal', 'equal'], grid_ranges=[(0, 100), None]) expected = pd.DataFrame({ 'x1': { 0: 0, 13: 4, 26: 8 }, 'x2': { 0: 0, 13: 1, 26: 2 }, 'display_column_1': { 0: '[0, 11.11)', 13: '[44.44, 55.56)', 26: '[88.89, 100]' }, 'display_column_2': { 0: 'Embarked_C', 13: 'Embarked_Q', 26: 'Embarked_S' }, 'value_lower_1': { 0: 0.0, 13: 44.44444444444444, 26: 88.88888888888889 }, 'value_upper_1': { 0: 11.11111111111111, 13: 55.55555555555556, 26: 100.0 }, 'count': { 0: 37.0, 13: 0.0, 26: 4.0 }, 'Survived': { 0: 0.24324324324324326, 13: 0.0, 26: 1.0 } }) assert_frame_equal(expected, summary_df.loc[[0, 13, 26], :], check_like=True) # counts must total to the total observations inside the defined range inside_range = (titanic_data['Fare'] >= 0) & (titanic_data['Fare'] <= 100) assert summary_df['count'].sum() == inside_range.sum() # first and last values must be equal to the defined ranges assert summary_df.groupby('x2').first().value_lower_1.unique()[0] == 0.0 assert summary_df.groupby('x2').last().value_upper_1.unique()[0] == 100.0
def target_interact_plot(self, feature, var_name = None, target=None, sample = 10000, show_outliers=True, **kargs): fig, axes, self.summary['target_interact'] = info_plots.target_plot_interact( df=self.sample(sample), target= target or self.target, features= feature, feature_names = var_name or feature, show_outliers=show_outliers, **kargs) plt.show()
# %% pdp_plot fig, axes = pdp.pdp_plot( pdp_isolate_out=pdp_isolated_tmp, feature_name=x_cols[:2], center=True, x_quantile=True, ncols=3, plot_lines=True, frac_to_plot=100, plot_pts_dist=True, ) # %% target_plot_interact ---------------------------------------------------- fig, axes, summary_df = info_plots.target_plot_interact( df=XY, features=x_cols[2:], feature_names=x_cols[2:], target=y_cols[0], ) # %% actual_plot_interact ---------------------------------------------------- fig, axes, summary_df = info_plots.actual_plot_interact( model=model, X=X, features=x_cols[3:], feature_names=x_cols[3:], which_classes=[2, 5], ) # %% pdp_interact: Preset ----------------------------------------------------
df.insured_education_level.unique() fig, axes, summary_df = info_plots.target_plot(\ df=df, feature=['MD', 'PhD', 'Associate', 'Masters', 'High School', 'College','JD'], feature_name='insured_education_level', target='fraud_reported') fig, axes, summary_df = info_plots.actual_plot(\ model, X_test, feature='insured_education_level', feature_name='InsuredEducation',predict_kwds={}) fig, axes, summary_df = info_plots.target_plot_interact(\ df=data, features=['insured_education_level', 'age'], feature_names=['insured_education_level', 'age'], target='fraud_reported') fig, axes, summary_df = info_plots.target_plot_interact(\ df=data, features=['vehicle_age', 'months_as_customer'], feature_names=['vehicle_age', 'months_as_customer'], target='fraud_reported') pdp_limit = pdp.pdp_isolate(\ model, dataset=X_test, model_features=X_test.columns, feature='months_as_customer') fig, axes = pdp.pdp_plot(\ pdp_limit, 'months_as_customer', frac_to_plot=0.2, plot_lines=True, x_quantile=True, show_percentile=True,