def test__infer_task():
    # each check is in the same order as in the original implementation
    from ppscore.calculation import _infer_task

    df = pd.read_csv("examples/titanic.csv")

    assert _infer_task(df, "Age", "Age") == "predict_itself"

    df["constant"] = 1
    assert _infer_task(df, "Age", "constant") == "predict_constant"

    assert _infer_task(df, "Age", "Survived") == "classification"

    df = df.reset_index()
    df["id"] = df["index"].astype(str)
    assert _infer_task(df, "Age", "id") == "predict_id"

    # classification because numeric but few categories
    assert _infer_task(df, "Age", "SibSp") == "classification"

    df["Pclass_category"] = df["Pclass"].astype("category")
    assert _infer_task(df, "Age", "Pclass_category") == "classification"

    df["Pclass_datetime"] = pd.to_datetime(df["Pclass"], infer_datetime_format=True)
    with pytest.raises(Exception):
        pps.score(df, "Age", "Pclass_datetime")

    assert _infer_task(df, "Survived", "Age") == "regression"
def get_pps_array(df):
    ps = []
    for i, col1 in enumerate(df.columns):
        ps.append([])
        for col2 in df.columns:
            ps[i].append(pps.score(df, col1, col2)['ppscore'])
    return np.array(ps)
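# A sketch of an alternative to get_pps_array: newer ppscore releases provide
# pps.matrix, which computes all pairwise scores in one call. The reshaping
# below assumes the 1.x API, where matrix returns a long-format DataFrame with
# 'x', 'y', and 'ppscore' columns (0.x releases returned a square DataFrame
# directly); note that pivot orders rows and columns alphabetically rather than
# in the original column order.
import numpy as np
import pandas as pd
import ppscore as pps

df = pd.DataFrame({'a': np.random.uniform(-2, 2, 200)})
df['b'] = df['a'] ** 2  # 'b' is predictable from 'a'

matrix_df = pps.matrix(df)
pps_array = matrix_df.pivot(index='x', columns='y', values='ppscore').to_numpy()
print(pps_array)  # same square layout that get_pps_array builds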
def run_predictive_power_score(df, target, save_path):
    """
    Calculates the predictive power score (pps) for each feature.
    If the score is 0, the feature is no better than a naive baseline model.
    If it is 1, the feature is a perfect predictor.
    The model_score is the weighted F1 score of a univariate model predicting the target.

    :param df: pandas dataframe
    :param target: name of the target column
    :param save_path: directory in which to save the output
    """
    df = pd.get_dummies(df, dummy_na=True)
    df[target] = df[target].astype(str)
    score_rows = []
    for feature in tqdm(list(df)):
        if feature != target:
            temp_score_dict = pps.score(df, feature, target)
            score_rows.append(pd.DataFrame({
                'feature': [feature],
                'pps': [temp_score_dict.get('ppscore')],
                'model_score': [temp_score_dict.get('model_score')],
            }))
    # DataFrame.append was removed in pandas 2.0, so collect rows and concat once
    pps_df = pd.concat(score_rows, ignore_index=True)
    pps_df.to_csv(os.path.join(save_path, 'predictive_power_score.csv'), index=False)
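# A minimal usage sketch for run_predictive_power_score. 'train.csv' and the
# 'Survived' target are hypothetical placeholders. Note that the target should
# be a numeric column: pd.get_dummies inside the function would otherwise
# one-hot encode (and thereby remove) an object-typed target column before it
# is looked up again.
import os
import pandas as pd

df = pd.read_csv('train.csv')  # hypothetical input file
os.makedirs('output', exist_ok=True)
run_predictive_power_score(df, target='Survived', save_path='output')
# writes output/predictive_power_score.csv with one row per feature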
def feature_score(df):
    score_check = st.sidebar.checkbox('View PPScore between features')
    if score_check:
        score_var1 = st.sidebar.selectbox('Select feature', df.columns)
        score_var2 = st.sidebar.selectbox('Select target', df.columns)
        st.markdown('**Prediction of {} using {}**'.format(score_var2, score_var1))
        score = pps.score(df, score_var1, score_var2)
        st.write(score['ppscore'])
def drop_corr(df, target_column, correlation_percent=0.8, return_cols=False):
    '''
    Use the predictive power score to decide which column of each highly
    correlated pair to drop: the one with the lower score against the target.
    '''
    temp_df = df.copy()
    temp_df['target'] = target_column
    pos, _ = get_corr(df, correlation_percent)
    correlated_columns = pos['columns']
    cols_to_drop = []
    for pair in correlated_columns:
        col_1, col_2 = pair
        score_1 = pps.score(temp_df, col_1, 'target')['ppscore']
        score_2 = pps.score(temp_df, col_2, 'target')['ppscore']
        # keep the column with the higher ppscore, drop the other
        if score_1 > score_2:
            cols_to_drop.append(col_2)
        else:
            cols_to_drop.append(col_1)
    if return_cols:
        return df.drop(cols_to_drop, axis=1), cols_to_drop
    return df.drop(cols_to_drop, axis=1)
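# A minimal usage sketch for drop_corr. get_corr is not shown above, so a
# stand-in with the assumed return shape (a dict whose 'columns' entry lists
# correlated column-name pairs, plus a second ignored value) is defined here
# purely for illustration.
import numpy as np
import pandas as pd
import ppscore as pps

def get_corr(df, threshold):  # hypothetical stand-in for the project's helper
    corr = df.corr().abs()
    pairs = [
        (c1, c2)
        for i, c1 in enumerate(corr.columns)
        for c2 in corr.columns[i + 1:]
        if corr.loc[c1, c2] > threshold
    ]
    return {'columns': pairs}, None

rng = np.random.default_rng(0)
df = pd.DataFrame({'a': rng.normal(size=200)})
df['b'] = df['a'] + rng.normal(scale=0.01, size=200)  # near-duplicate of 'a'
target = (df['a'] > 0).astype(int)

reduced_df, dropped = drop_corr(df, target, correlation_percent=0.8, return_cols=True)
print(dropped)  # the member of each pair with the lower ppscore against the target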
# - Changing some data types
# - Renaming the columns to be clearer

# %%
df = df[["Survived", "Pclass", "Sex", "Age", "Ticket", "Fare", "Embarked"]]
df = df.rename(columns={"Pclass": "Class"})
df = df.rename(columns={"Ticket": "TicketID"})
df = df.rename(columns={"Fare": "TicketPrice"})
df = df.rename(columns={"Embarked": "Port"})

# %% [markdown]
# ## Single Predictive Power Score
# - Answering the question: how well can Sex predict the Survival probability?

# %%
pps.score(df, "Sex", "Survived")

# %% [markdown]
# ## PPS matrix
# - Answering the question: which predictive patterns exist between the columns?

# %%
matrix = pps.matrix(df)

# %%
matrix

# %%
heatmap(matrix)

# %% [markdown]
def test_score():
    df = pd.DataFrame()
    df["x"] = np.random.uniform(-2, 2, 1_000)
    df["error"] = np.random.uniform(-0.5, 0.5, 1_000)
    df["y"] = df["x"] * df["x"] + df["error"]
    df["constant"] = 1
    df = df.reset_index()
    df["id"] = df["index"].astype(str)
    df["x_greater_0_boolean"] = df["x"] > 0
    # df["x_greater_0_string"] = df["x_greater_0_boolean"].astype(str)
    df["x_greater_0_string"] = pd.Series(
        df["x_greater_0_boolean"].apply(str), dtype="string"
    )
    df["x_greater_0_string_object"] = df["x_greater_0_string"].astype("object")
    df["x_greater_0_string_category"] = df["x_greater_0_string"].astype("category")
    df["x_greater_0_boolean_object"] = df["x_greater_0_boolean"].astype("object")
    df["x_greater_0_boolean_category"] = df["x_greater_0_boolean"].astype("category")
    df["nan"] = np.nan

    duplicate_column_names_df = pd.DataFrame()
    duplicate_column_names_df["x1"] = np.random.uniform(-2, 2, 10)
    duplicate_column_names_df["x2"] = np.random.uniform(-2, 2, 10)
    duplicate_column_names_df["unique_column_name"] = np.random.uniform(-2, 2, 10)
    duplicate_column_names_df.columns = [
        "duplicate_column_name",
        "duplicate_column_name",
        "unique_column_name",
    ]

    # check input types
    with pytest.raises(TypeError):
        numpy_array = np.random.randn(10, 10)  # not a DataFrame
        pps.score(numpy_array, "x", "y")

    with pytest.raises(ValueError):
        pps.score(df, "x_column_that_does_not_exist", "y")
    with pytest.raises(ValueError):
        pps.score(df, "x", "y_column_that_does_not_exist")
    with pytest.raises(AttributeError):
        # the task argument is not supported any more
        pps.score(df, "x", "y", task="classification")
    with pytest.raises(AssertionError):
        # df shall not have duplicate column names
        pps.score(
            duplicate_column_names_df, "duplicate_column_name", "unique_column_name"
        )
    with pytest.raises(AssertionError):
        # df shall not have duplicate column names
        pps.score(
            duplicate_column_names_df, "unique_column_name", "duplicate_column_name"
        )

    # check cross_validation
    # if there are more folds than data points, there is an error
    with pytest.raises(ValueError):
        assert pps.score(df, "x", "y", cross_validation=2000, catch_errors=False)

    # check random_seed
    assert pps.score(df, "x", "y", random_seed=1) == pps.score(
        df, "x", "y", random_seed=1
    )
    assert pps.score(df, "x", "y", random_seed=1) != pps.score(
        df, "x", "y", random_seed=2
    )
    # the random seed that is drawn automatically is smaller than 1000
    assert pps.score(df, "x", "y") != pps.score(df, "x", "y", random_seed=123_456)

    # check invalid_score
    invalid_score = -99
    assert (
        pps.score(df, "nan", "y", invalid_score=invalid_score)["ppscore"]
        == invalid_score
    )
    # check catch_errors using the cross_validation error from above
    assert (
        pps.score(
            df,
            "x",
            "y",
            cross_validation=2000,
            invalid_score=invalid_score,
            catch_errors=True,
        )["ppscore"]
        == invalid_score
    )

    # check case discrimination
    assert pps.score(df, "x", "y")["case"] == "regression"
    assert pps.score(df, "x", "x_greater_0_string")["case"] == "classification"
    assert pps.score(df, "x", "constant")["case"] == "target_is_constant"
    assert pps.score(df, "x", "x")["case"] == "predict_itself"
    assert pps.score(df, "x", "id")["case"] == "target_is_id"
    assert pps.score(df, "nan", "y")["case"] == "empty_dataframe_after_dropping_na"

    # check scores
    # feature is id
    assert pps.score(df, "id", "y")["ppscore"] == 0

    # numeric feature and target
    assert pps.score(df, "x", "y")["ppscore"] > 0.5
    assert pps.score(df, "y", "x")["ppscore"] < 0.05

    # boolean feature or target
    assert pps.score(df, "x", "x_greater_0_boolean")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_boolean", "x")["ppscore"] < 0.6

    # string feature or target
    assert pps.score(df, "x", "x_greater_0_string")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_string", "x")["ppscore"] < 0.6

    # object feature or target
    assert pps.score(df, "x", "x_greater_0_string_object")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_string_object", "x")["ppscore"] < 0.6

    # category feature or target
    assert pps.score(df, "x", "x_greater_0_string_category")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_string_category", "x")["ppscore"] < 0.6

    # object feature or target
    assert pps.score(df, "x", "x_greater_0_boolean_object")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_boolean_object", "x")["ppscore"] < 0.6

    # category feature or target
    assert pps.score(df, "x", "x_greater_0_boolean_category")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_boolean_category", "x")["ppscore"] < 0.6
def heatmap(df):
    return sns.heatmap(df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)

# %%
df = pd.DataFrame()
df["x"] = np.random.uniform(-2, 2, 1_000_000)
df["error"] = np.random.uniform(-0.5, 0.5, 1_000_000)
df["y"] = df["x"] * df["x"] + df["error"]

# %%
sns.scatterplot(x="x", y="y", data=df.sample(10_000))

# %%
matrix = pps.matrix(df)

# %%
matrix

# %%
heatmap(matrix)

# %%
pps.score(df, "x", "y")

# %%
import ppscore as pps
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.simplefilter(action='ignore')

data = pd.read_csv('dataset_challenge_one.tsv', delimiter='\t')
data = data.fillna(0)

predictors = data.columns[:-1]
score = []
var = []
for i in predictors:
    score.append(pps.score(data, i, 'class')['ppscore'])
    var.append(i)

ppscore_df = pd.DataFrame({'predictor': var, 'ppscore': score})
ppscore_df = ppscore_df.sort_values(['ppscore'], ascending=False).reset_index()

plt.figure(figsize=(6, 3))
plt.plot(ppscore_df['predictor'].index, ppscore_df['ppscore'])
plt.grid()
plt.xticks(np.arange(0, 1600, 50), rotation='vertical', fontsize=10)
plt.ylabel('ppscore', fontsize=16)
plt.xlabel('Predictor', fontsize=10)
plt.yticks(np.arange(0, 0.3, 0.05), fontsize=10)
plt.title('Predictive Power Score Plot', fontsize=20)
plt.show()

ppscore_df['ppscore'].value_counts()

bins = [
    -1,
plinear_data = np.hstack((dat, dat)) + 1
df = pd.DataFrame(data=plinear_data, columns=["x", "y"])
enc = KBinsDiscretizer(n_bins=10, encode='ordinal')
XP_data_nmi = np.squeeze(
    enc.fit_transform(np.atleast_2d(plinear_data[:, 0]).T))
enc = KBinsDiscretizer(n_bins=10, encode='ordinal')
YP_data_nmi = np.squeeze(
    enc.fit_transform(np.atleast_2d(plinear_data[:, 1]).T))
plinear_pc = np.round(
    pearsonr(plinear_data[:, 0], plinear_data[:, 1])[0], 2)
plinear_nmi = np.round(
    normalized_mutual_info_score(XP_data_nmi, YP_data_nmi), 2)
plinear_hsic = np.round(ℍ(plinear_data[:, 0], plinear_data[:, 1]), 2)
plinear_pps = np.round(pps.score(df, "x", "y")['ppscore'], 2)

print('Linear Relationship:')
print('\tCorrelation : ', plinear_pc)
print('\tNMI         : ', plinear_nmi)
print('\tpps         : ', plinear_pps)
print('\tHSIC        : ', plinear_hsic)

# Linear Data
dat = np.random.rand(n, 1)
linear_data = np.hstack((dat, dat)) + 0.04 * np.random.randn(n, 2)
df = pd.DataFrame(data=linear_data, columns=["x", "y"])
enc = KBinsDiscretizer(n_bins=10, encode='ordinal')
XL_data_nmi = np.squeeze(
    enc.fit_transform(np.atleast_2d(linear_data[:, 0]).T))
def predictive_power_score(self, df, feature_col_name, target_col_name):
    df = df.dropna(subset=[feature_col_name, target_col_name])
    p = pps.score(df, feature_col_name, target_col_name)
    return p
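# A minimal usage sketch for predictive_power_score. The enclosing class is not
# shown above; since the method never touches `self`, it is invoked here with a
# placeholder in that slot, purely for illustration (this assumes the def above
# is reachable as a module-level name).
import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.random.uniform(-2, 2, 500)})
df['y'] = df['x'] ** 2
df.loc[::10, 'x'] = np.nan  # rows with missing values are dropped inside the method

result = predictive_power_score(None, df, 'x', 'y')
print(result['ppscore'])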
def test_score():
    df = pd.DataFrame()
    df["x"] = np.random.uniform(-2, 2, 1_000)
    df["error"] = np.random.uniform(-0.5, 0.5, 1_000)
    df["y"] = df["x"] * df["x"] + df["error"]
    df["constant"] = 1
    df = df.reset_index()
    df["id"] = df["index"].astype(str)
    df["x_greater_0"] = df["x"] > 0
    df["x_greater_0"] = df["x_greater_0"].astype(str)
    df["nan"] = np.nan

    with pytest.raises(Exception):
        pps.score(df, "nan", "y")

    assert pps.score(df, "x", "y", "regression")["task"] == "regression"
    assert pps.score(df, "x", "constant")["task"] == "predict_constant"
    assert pps.score(df, "x", "x")["task"] == "predict_itself"
    assert pps.score(df, "x", "id")["task"] == "predict_id"

    # feature is id
    assert pps.score(df, "id", "y")["ppscore"] == 0

    # numeric feature and target
    assert pps.score(df, "x", "y")["ppscore"] > 0.5
    assert pps.score(df, "y", "x")["ppscore"] < 0.05

    # object feature or target
    assert pps.score(df, "x", "x_greater_0")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0", "x")["ppscore"] < 0.6