def test_matrix():
    """Check the return types of ``pps.matrix`` and its handling of a failing score.

    The test loads ``examples/titanic.csv``, keeps the ``Age`` and ``Survived``
    columns, and verifies that:

    * the default call returns a ``pd.DataFrame``;
    * ``output="dict"`` returns a ``dict``;
    * the entry addressed as ``["Survived"]["Age_datetime"]`` equals 0 once a
      datetime column has been added.
    """
    df = pd.read_csv("examples/titanic.csv")
    # Take an explicit copy: adding a column to a column-subset of another
    # frame is chained assignment and triggers SettingWithCopyWarning.
    df = df[["Age", "Survived"]].copy()

    assert isinstance(pps.matrix(df), pd.DataFrame)
    assert isinstance(pps.matrix(df, output="dict"), dict)

    # matrix catches single score errors under the hood
    # ``infer_datetime_format`` is deprecated in pandas 2.x and has no effect
    # on numeric input, so it is no longer passed.
    df["Age_datetime"] = pd.to_datetime(df["Age"])
    result = pps.matrix(df[["Survived", "Age_datetime"]])
    assert result["Survived"]["Age_datetime"] == 0
def Graph_Plots_For_Individual_Instrument(instrument_lst, flag):
    """Produce and save the per-instrument plots for every instrument in a list.

    For each instrument the function calls the individual plotting helpers,
    then reads ``<instrument>.csv`` (indexed by its ``Date`` column) and saves
    two figures: a correlation plot (``BetterCorr_<instrument>.png``) and a
    PPS heatmap (``PPS_<instrument>.png``). After all instruments are
    processed, ``Divergence_Plots()`` is called once.

    Parameters:
        instrument_lst: iterable of instrument names; each name is used both
            as the CSV file stem and in the output file names.
        flag: when equal to ``True`` the figures are written to the
            ``man_select_inst`` directory, otherwise to ``rule_select_inst``.
            The value is also forwarded unchanged to the plotting helpers.
    """
    # Local import so the block does not rely on a module-level ``import os``.
    import os

    # Same comparison the original used for both figures, evaluated once.
    output_dir = "man_select_inst" if flag == True else "rule_select_inst"  # noqa: E712

    for instrument in instrument_lst:
        Plot_Items(instrument, flag)
        decompose_plot(instrument, flag)
        Divergence_Plots_For_Single_Instrument(instrument, flag)
        plot_peaks_troughs(instrument, flag)
        Plot_Multi_Axes(instrument, flag)

        # better corr graph
        data = pd.read_csv(instrument + ".csv", index_col="Date")
        plt.figure(figsize=(15, 10))
        corrplot(data.corr(), size_scale=500, instrument_name=instrument)
        # os.path.join instead of a hard-coded "\\": identical on Windows, and
        # on other platforms the file now lands inside the directory instead
        # of getting a literal backslash in its name.
        plt.savefig(os.path.join(output_dir, "BetterCorr_" + instrument + ".png"))

        # PPS
        matrix = pps.matrix(data)
        plt.figure(figsize=(18, 15))
        heatmap(matrix, instrument)
        plt.savefig(os.path.join(output_dir, "PPS_" + instrument + ".png"))

    Divergence_Plots()
def get_predictive_power_score(self) -> DataFrame:
    """Compute the PPS table for ``self.data`` and return it styled.

    The long-format result of ``pps.matrix`` is reduced to its ``x``, ``y``
    and ``ppscore`` columns and pivoted so that rows are targets (``y``) and
    columns are features (``x``). The pivoted table is stored on
    ``self.predictive_power_score``.

    Returns:
        The object produced by ``.style.apply(...).highlight_null('white')``
        on the pivoted table, i.e. the styled view rather than the raw table.
        NOTE(review): the ``DataFrame`` annotation looks inaccurate for that
        return value - confirm before relying on it.
    """
    long_scores = pps.matrix(self.data)[['x', 'y', 'ppscore']]
    self.predictive_power_score = long_scores.pivot(
        index="y", columns="x", values="ppscore")

    # Gradient between m=0 and M=1, then paint missing cells white.
    styled = self.predictive_power_score.style.apply(
        Utils.background_gradients, cmap='Blues', m=0, M=1)
    return styled.highlight_null('white')
def test_matrix():
    """Check input validation, return types and error handling of ``pps.matrix``.

    Verifies that:

    * a non-DataFrame input raises ``TypeError``;
    * an unknown ``output`` value raises ``ValueError``;
    * the default call returns a ``pd.DataFrame`` and ``output="list"``
      returns a ``list``;
    * the score with ``x == "Survived"`` and ``y == "Age_datetime"`` is
      reported with ``ppscore == 0``.
    """
    df = pd.read_csv("examples/titanic.csv")
    # Take an explicit copy: adding a column to a column-subset of another
    # frame is chained assignment and triggers SettingWithCopyWarning.
    df = df[["Age", "Survived"]].copy()
    # ``infer_datetime_format`` is deprecated in pandas 2.x and has no effect
    # on numeric input, so it is no longer passed.
    df["Age_datetime"] = pd.to_datetime(df["Age"])
    subset_df = df[["Survived", "Age_datetime"]]

    # check input types
    numpy_array = np.random.randn(10, 10)  # not a DataFrame
    # Only the call under test sits inside the ``raises`` context.
    with pytest.raises(TypeError):
        pps.matrix(numpy_array)

    with pytest.raises(ValueError):
        pps.matrix(df, output="invalid_output_type")

    # check return types
    assert isinstance(pps.matrix(df), pd.DataFrame)
    assert isinstance(pps.matrix(df, output="list"), list)

    # matrix catches single score errors under the hood
    invalid_score = [
        score
        for score in pps.matrix(subset_df, output="list")
        if (score["x"] == "Survived" and score["y"] == "Age_datetime")
    ][0]
    assert invalid_score["ppscore"] == 0
def get_connectivity_matrices(time_series, subjects, kinds=DEFAULT_KINDS):
    """Compute one connectivity matrix per subject for each requested kind.

    Arguments:
        time_series {list} -- Extracted time series, indexed like ``subjects``
        subjects {list} -- Subject identifiers matching ``time_series``

    Keyword Arguments:
        kinds {list} -- Connectivity measures to compute
            (default: {DEFAULT_KINDS})

    Returns:
        {dict} -- ``{kind: {subject: matrix}}``. Kinds listed in
        ``NILEARN_KINDS`` are computed with ``ConnectivityMeasure``; the
        ``'pps'`` kind is computed per subject with ``pps.matrix``. The
        ``'tangent'`` kind is skipped when fewer than two distinct subjects
        are given.
    """
    matrices = {}
    n_subjects = len(set(subjects))

    for kind in kinds:
        print(
            f'{bcolors.OKBLUE}Computing {kind} of {n_subjects} subjects{bcolors.ENDC}'
        )

        if kind in NILEARN_KINDS:
            if kind == 'tangent' and n_subjects < 2:
                print(
                    f'{bcolors.FAIL}Tangent space parametrization can only be applied to a group of subjects, as it returns deviations to the mean. Skipping{bcolors.ENDC}'
                )
                continue

            estimator = ConnectivityMeasure(kind=kind)
            stacked = estimator.fit_transform(time_series)
            n_matrices = stacked.shape[0]
            # Entry ``idx`` of the fitted result belongs to ``subjects[idx]``.
            matrices[kind] = {
                subjects[idx]: stacked[idx] for idx in range(n_matrices)
            }

        if kind == 'pps':
            for idx, subject in enumerate(subjects):
                frame = pd.DataFrame(time_series[idx])
                long_scores = pps.matrix(frame)
                # Wide layout: rows are targets (y), columns are features (x).
                wide_scores = long_scores[['x', 'y', 'ppscore']].pivot(
                    columns='x', index='y', values='ppscore')
                matrices.setdefault(kind, {})[subject] = wide_scores.values

    return matrices
def create_heatmap_pp_score(df):
    """Build a 600x600 annotated heatmap figure from the PPS matrix of ``df``.

    The values of ``pps.matrix(df)`` are rounded to two decimals and handed to
    ``ff.create_annotated_heatmap`` with the column names of ``df`` as labels
    on both axes.

    Returns:
        The figure object created by ``ff.create_annotated_heatmap``.
    """
    scores = pps.matrix(df).round(2).values
    x_labels = df.columns.to_list()
    y_labels = df.columns.to_list()

    figure = ff.create_annotated_heatmap(scores, x_labels, y_labels)
    # Fixed canvas size; automatic sizing is switched off.
    figure.update_layout(autosize=False, width=600, height=600)
    return figure
def calculate_pps_matrix(data, agg_method):
    """Return the PPS matrix of ``data`` in wide form, optionally symmetrised.

    Parameters:
        data: frame passed to ``ppscore.matrix`` (called with ``sample=None``).
        agg_method: ``"max"``, ``"min"`` or ``"mean"`` to combine the matrix
            with its transpose element-wise. Any other value leaves the
            matrix as computed.

    Returns:
        The pivoted score table (rows = ``y``, columns = ``x``) with both axis
        names removed, aggregated according to ``agg_method``.
    """
    _global_checks.global_check_import('ppscore', 'depend_matrix with PPS calculation')
    import ppscore as pps

    long_scores = pps.matrix(data, sample=None)
    wide_scores = long_scores[["x", "y", "ppscore"]].pivot(
        columns="x", index="y", values="ppscore"
    )
    # Remove the "x" / "y" axis names left behind by the pivot.
    pps_matrix = wide_scores.rename_axis(None, axis=1).rename_axis(None, axis=0)

    # aggregate values to make symmetric matrix
    mirrored = pps_matrix.transpose()
    if agg_method == "max":
        return np.maximum(pps_matrix, mirrored)
    if agg_method == "min":
        return np.minimum(pps_matrix, mirrored)
    if agg_method == "mean":
        return (pps_matrix + mirrored) / 2
    return pps_matrix
def pps_heatmap(df):
    """Compute the Predictive Power Score matrix of ``df`` and draw it.

    Args:
        df: input passed straight to ``pps.matrix``.

    Returns:
        The axes object returned by ``sb.heatmap``, titled "PPS matrix" with
        the x axis labelled "feature" and the y axis labelled "target".
    """
    long_scores = pps.matrix(df)
    # Wide layout: one row per target (y), one column per feature (x).
    score_table = long_scores[['x', 'y', 'ppscore']].pivot(
        columns='x', index='y', values='ppscore')

    plt.figure(figsize=(24, 8))
    axes = sb.heatmap(
        score_table,
        vmin=0,
        vmax=1,
        cmap="afmhot_r",
        linewidths=0.5,
        annot=True,
    )
    axes.set_title("PPS matrix")
    axes.set_xlabel("feature")
    axes.set_ylabel("target")
    return axes
# Heatmap of the previously computed correlation table.
sns.heatmap(correlation, annot=True, square=True, cmap='coolwarm')
plt.show()

"""From the analysis it was evident that **area_mean, perimeter_mean, perimeter_worst, area_worst** have a very high correlation hence they were removed from features. After applying **PPS (Predictive Power Score)** """

# Feature columns kept after removing the highly correlated ones named above.
new_cols = [
    'radius_mean',
    'texture_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
    'radius_se',
    'texture_se',
    'perimeter_se',
    'area_se',
    'smoothness_se',
    'compactness_se',
    'concavity_se',
    'concave points_se',
    'symmetry_se',
    'fractal_dimension_se',
    'radius_worst',
    'texture_worst',
    'smoothness_worst',
    'compactness_worst',
    'concavity_worst',
    'concave points_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
]

# PPS over the retained features, drawn on a large canvas with a bigger font.
pp_mean = pps.matrix(df[new_cols])
plt.figure(figsize=(30, 30))
plt.rcParams.update({'font.size': 15})
sns.heatmap(pp_mean, annot=True, square=True, cmap='coolwarm')
plt.show()

"""# Creation of X and Y for training the machine learning models. **Label encoder converts the 2 text classification label (M & B) into numeric 1 and 0.** M label is translated to 1 and B label is translated to 0. Splitting data into **train and test with 75:25** ratio """

# dataframe X contains feature selected
X = df[new_cols]
# dataframe Y contains the corresponding labels
# Sidebar controls: the year to inspect and the variable for the histogram.
year_options = severity_historical["year"].unique().tolist()
d = st.sidebar.selectbox(label="Select a year:", options=year_options, index=0)
l = st.sidebar.selectbox(
    label="Select a variable (for univariate distributions):",
    options=selectcols,
    index=0,
)

# Rows of the selected year, without the identifier and year columns.
year_mask = severity_historical["year"] == d
subsetdf = severity_historical.loc[year_mask].drop(labels=["FIPS", "year"], axis=1)

col1, col2, col3 = st.beta_columns(3)

# PPS in wide layout: rows are targets (y), columns are features (x).
pps_long = pps.matrix(subsetdf)[['x', 'y', 'ppscore']]
matrix_df = pps_long.pivot(columns='x', index='y', values='ppscore')

fig1 = plt.figure(1)
histogram_axes = sns.histplot(subsetdf, x=l)
histogram_axes.set_title("Univariate Distribution")

fig2 = plt.figure(2)
correlation_axes = sns.heatmap(subsetdf.corr(), cmap="flare", annot=True)
correlation_axes.set_title("Correlation Matrix")

fig3 = plt.figure(3)
pps_axes = sns.heatmap(matrix_df, cmap="cubehelix_r", annot=True)
pps_axes.set_title("Predictive Power Score")

# NOTE(review): the keyword is spelled ``use_container_widths`` here; kept
# as-is - confirm whether ``use_container_width`` was intended.
col1.pyplot(fig1, use_container_widths=True)
col2.pyplot(fig2, use_container_widths=True)
'''data source initialization'''
# The CSV is expected to sit next to this script.
this_dir = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(this_dir, 'winequality-red.csv')
quality_data = pd.read_csv(data_path)
# quality_data.drop_duplicates(inplace=True)
# Duplicates seem to be relevant and may exist due to process stability, so
# they are left in place.

'''predictive score matrix'''
# Wide layout: rows are targets (y), columns are features (x).
pps_long = pps.matrix(quality_data)[['x', 'y', 'ppscore']]
matrix_pp = pps_long.pivot(columns='x', index='y', values='ppscore')
fig0, ax0 = plt.subplots()
plot0 = sns.heatmap(matrix_pp, annot=True, ax=ax0)
# plt.show()
# most of features have no predictive power, so let's drop them :)

'''dependent and independet variables separation and cleaning'''


def robust_zscore(d: 'DataFrame'):
    """Return median/MAD based z-scores of ``d``, computed along axis 0.

    Each value has the median (along axis 0) subtracted and is divided by the
    median absolute deviation along the same axis. The result is the array
    produced by that arithmetic on ``np.asanyarray(d)``.
    """
    values = np.asanyarray(d)
    center = np.median(values, axis=0)
    spread = median_abs_deviation(values, axis=0)
    return (values - center) / spread
# Order the mean results by both names, then lay the values out as a grid.
mean_rslts.sort_values(["Name_1", "Name_2"], inplace=True)
rslts = mean_rslts.BattleResult.values.copy()
rslts.resize([144, 144])

# Normalization
rslts_min = np.abs(np.nanmin(rslts))
rslts_max = np.abs(np.nanmax(rslts))
rslts_norm = (rslts + rslts_min) / (rslts_min + rslts_max)
# Missing cells become 0 after scaling.
rslts_norm = np.nan_to_num(rslts_norm, nan=0)

# Plot
results_heatmap = go.Heatmap(
    z=rslts_norm,
    x=mean_rslts.Name_1.unique(),
    y=mean_rslts.Name_2.unique(),
    colorscale='Viridis',
)
fig = go.Figure(data=results_heatmap)
fig.update_layout(title='Mean Battle Results', xaxis_nticks=50)
fig.show()

# Predictive Power Score
pps_matrix = pps.matrix(battle, sample=5000)
pps_heatmap_trace = go.Heatmap(
    z=pps_matrix.values,
    x=pps_matrix.index,
    y=pps_matrix.columns,
    colorscale='Viridis',
)
fig = go.Figure(data=pps_heatmap_trace)
fig.update_layout(title='Predictive Power Score', xaxis_nticks=50)
fig.show()
# there are 1863 population samples, for whom genetics is available - len(set(total_df.index).intersection(samples))
# of them, 198 have olink data for proteins
print(
    "from here, use code below if you want only pop analysis, otherwise scroll to the bottom"
)
# Deliberate stop: execution halts here unless this line is removed.
assert False

# Selecting non-obese - only healthy, have no olink data (to avoid an overfit)
healthy_ids = total_df.index.intersection(samples).difference(olinked_samples)
total_df = total_df.loc[healthy_ids]  # 1665 samples
total_df_sample = total_df
print(total_df_sample.shape)

# Using PPS
# Wide layout: rows are targets (y), columns are features (x).
pps_long = pps.matrix(total_df_sample)[['x', 'y', 'ppscore']]
matrix_df = pps_long.pivot(columns='x', index='y', values='ppscore')

fig = plt.figure(figsize=(16, 16))
ax = sns.heatmap(
    matrix_df,
    vmin=0,
    vmax=1,
    cmap="Blues",
    linewidths=0.5,
    annot=True,
)
axlim = total_df_sample.shape[1]
# a patch for the heatmap in sns - it was broken in matplotlib 3.1.1
ax.set_ylim(0, axlim)
plt.savefig("pps.png")
plt.close()
# No findings
print("----- Z Test Results -----")
print(f"T stat. = {t_stat_3}")
print(f"P value = {p_value_3}")  # P-value is less than 0.05
print("")

# T-test: Checking if the distribution means (fares of survivors vs fares of
# non-survivors) are statistically different
t_stat_4, p_value_4 = stats.ttest_ind(dist_c, dist_d)
print("----- T Test Results -----")
print(f"T stat. = {t_stat_4}")
print(f"P value = {p_value_4}")  # P-value is less than 0.05

# PP - Score
# Wide layout: rows are targets (y), columns are features (x).
matrix_data = pps.matrix(data)[['x', 'y', 'ppscore']].pivot(
    columns='x', index='y', values='ppscore')
matrix_data = matrix_data.apply(
    lambda x: round(x, 2))  # Rounding matrix_data's values to 0,XX
sns.heatmap(matrix_data,
            vmin=0,
            vmax=1,
            cmap="Blues",
            linewidths=0.75,
            annot=True)

#%% Feature Engineering
data['AgeCat'] = ''
# One ``.loc`` call with both row mask and column label. The previous
# ``data['AgeCat'].loc[mask] = ...`` form is chained assignment, which may
# write to a temporary object and leave ``data`` unchanged.
data.loc[data['Age'] < 18, 'AgeCat'] = 'young'
def feature_predictive_power(data, feature, target, id_column, top=5):
    '''
    Rank the columns of ``data`` by their predictive power score against one
    feature of interest, as an aid for exploratory analysis and feature
    engineering.

    Parameter:
    -----------------------------------------
    data: DataFrame
        Data which contains the feature. The caller's frame is not modified;
        the ID column is dropped on a derived frame.
    feature: str
        Name of the column of interest.
    target: str
        Name of the target column in the data. Currently not used by the
        computation; kept so existing callers keep working.
    id_column: str
        Name of the ID column, removed before scoring.
    top(default = 5): int
        Number of rows to return. Should be less than the number of features.

    Returns:
        DataFrame with a single ``predictive_power`` column holding rows
        ``1..top`` of the descending score ranking for ``feature``. The first
        row of the ranking is skipped - presumably the feature's score
        against itself; confirm against the ``ps.matrix`` output layout.

    Raises:
        KeyError: if ``feature`` is the ID column.
        Exception: if every score for ``feature`` is identical, or if a
            non-numeric ``feature`` has more distinct values than 75% of the
            row count.
    '''
    data = data.drop(columns=[id_column])
    features = list(data.columns)

    # Validate before the (potentially expensive) matrix computation.
    if feature == id_column:
        raise KeyError(
            f"{feature} is not a predictive feature, try any of {features}")

    rt = ps.matrix(data)
    feature_scores = rt.loc[feature]
    predictors = pd.DataFrame(feature_scores.sort_values(ascending=False))

    # A column counts as numerical when it can be cast to float.
    numerical_feats = []
    for f in features:
        try:
            data[f] = data[f].astype(float)
        except (TypeError, ValueError):
            # Not castable to float: treated as categorical.
            continue
        numerical_feats.append(f)

    #constant feature check
    if len(feature_scores.unique()) == 1:
        raise Exception(
            f"{feature} is a constant difference feature, all other features are perfect predictors. \nConsideer dropping {feature}"
        )

    #nominal feature check (categorical cases only)
    if feature not in numerical_feats:
        if len(data[feature].unique()) > data.shape[0] * .75:
            raise Exception(
                f"{feature} is a nominal feature, consider dropping it")

    # Numerical and categorical features are ranked the same way.
    return predictors[1:top + 1].rename(columns={feature: 'predictive_power'})
# Friendlier column names for the two renamed fields.
df = df.rename(columns={"Fare": "TicketPrice", "Embarked": "Port"})

# %% [markdown]
# ## Single Predictive Power Score
# - Answering the question: how well can Sex predict the Survival probability?

# %%
pps.score(df, "Sex", "Survived")

# %% [markdown]
# ## PPS matrix
# - Answering the question: which predictive patterns exist between the columns?

# %%
matrix = pps.matrix(df)

# %%
matrix

# %%
heatmap(matrix)

# %% [markdown]
# ## Correlation matrix
# - As a comparison to the PPS matrix

# %%
corr_heatmap(df.corr())

# %%
def cria_matriz_pps(df):
    """Draw an annotated heatmap of the PPS matrix of ``df``.

    The result of ``pps.matrix`` is pivoted so rows are targets (``y``) and
    columns are features (``x``), then plotted on a new 20x20 figure.
    Nothing is returned.
    """
    pps_long = pps.matrix(df)
    correlations = pps_long.pivot(index='y', columns='x', values='ppscore')

    plt.figure(figsize=(20, 20))
    sns.heatmap(correlations, annot=True)