コード例 #1
0
def test_matrix():
    """Check the return types of ``pps.matrix`` and its handling of an invalid score."""
    df = pd.read_csv("examples/titanic.csv")
    # Copy the column subset so that adding a column below writes to an
    # independent frame rather than to a slice of the frame that was read.
    df = df[["Age", "Survived"]].copy()

    assert isinstance(pps.matrix(df), pd.DataFrame)
    assert isinstance(pps.matrix(df, output="dict"), dict)

    # matrix catches single score errors under the hood
    # ``infer_datetime_format`` is deprecated in pandas 2.x and only concerns
    # the parsing of datetime strings, so it is no longer passed here.
    df["Age_datetime"] = pd.to_datetime(df["Age"])
    scores = pps.matrix(df[["Survived", "Age_datetime"]])
    assert scores["Survived"]["Age_datetime"] == 0
コード例 #2
0
def Graph_Plots_For_Individual_Instrument(instrument_lst, flag):
    """Produce every per-instrument plot and save the correlation and PPS heatmaps.

    For each instrument the individual plotting helpers are called, then the
    instrument's CSV file (``<instrument>.csv``, indexed by its ``Date``
    column) is read and two figures are saved as PNG files:
    ``BetterCorr_<instrument>.png`` and ``PPS_<instrument>.png``.

    Parameters:
        instrument_lst: iterable of instrument names (used as file stems).
        flag: when equal to ``True`` the figures are written to
            ``man_select_inst``; otherwise to ``rule_select_inst``.

    After all instruments are processed ``Divergence_Plots()`` is called once.
    """
    # Local import keeps this block self-contained; os.path.join replaces the
    # hard-coded "\\" separator so the files land inside the output directory
    # on every platform instead of only on Windows.
    import os

    # Equality with True (not plain truthiness) is the original selection rule.
    out_dir = "man_select_inst" if flag == True else "rule_select_inst"  # noqa: E712

    for instrument in instrument_lst:
        Plot_Items(instrument, flag)
        decompose_plot(instrument, flag)
        Divergence_Plots_For_Single_Instrument(instrument, flag)
        plot_peaks_troughs(instrument, flag)
        Plot_Multi_Axes(instrument, flag)

        # better corr graph
        data = pd.read_csv(instrument + ".csv", index_col="Date")
        plt.figure(figsize=(15, 10))
        corrplot(data.corr(), size_scale=500, instrument_name=instrument)
        plt.savefig(os.path.join(out_dir, "BetterCorr_" + instrument + ".png"))

        # PPS
        matrix = pps.matrix(data)
        plt.figure(figsize=(18, 15))
        heatmap(matrix, instrument)
        plt.savefig(os.path.join(out_dir, "PPS_" + instrument + ".png"))

    Divergence_Plots()
コード例 #3
0
 def get_predictive_power_score(self) -> DataFrame:
     """Compute the predictive power scores of ``self.data`` as a styled grid.

     The ``x``, ``y`` and ``ppscore`` columns of ``pps.matrix(self.data)`` are
     pivoted into a grid with one row per ``y`` and one column per ``x``; that
     grid is stored on ``self.predictive_power_score``.

     Returns:
         The ``.style`` view of the stored grid with
         ``Utils.background_gradients`` applied (``cmap='Blues'``, ``m=0``,
         ``M=1``) and null cells highlighted in white.
     """
     long_scores = pps.matrix(self.data)
     score_columns = long_scores[['x', 'y', 'ppscore']]
     self.predictive_power_score = score_columns.pivot(index="y",
                                                       columns="x",
                                                       values="ppscore")
     styled = self.predictive_power_score.style.apply(
         Utils.background_gradients, cmap='Blues', m=0, M=1)
     return styled.highlight_null('white')
コード例 #4
0
def test_matrix():
    """Check input validation, return types and invalid-score handling of ``pps.matrix``."""
    df = pd.read_csv("examples/titanic.csv")
    # Copy the column subset so the new column is added to an independent
    # frame rather than to a slice of the frame that was read.
    df = df[["Age", "Survived"]].copy()
    # ``infer_datetime_format`` is deprecated in pandas 2.x and only concerns
    # the parsing of datetime strings, so it is no longer passed here.
    df["Age_datetime"] = pd.to_datetime(df["Age"])
    subset_df = df[["Survived", "Age_datetime"]]

    # check input types
    # The fixture is built outside the ``raises`` block so that only the call
    # under test can satisfy the expected exception.
    numpy_array = np.random.randn(10, 10)  # not a DataFrame
    with pytest.raises(TypeError):
        pps.matrix(numpy_array)

    with pytest.raises(ValueError):
        pps.matrix(df, output="invalid_output_type")

    # check return types
    assert isinstance(pps.matrix(df), pd.DataFrame)
    assert isinstance(pps.matrix(df, output="list"), list)

    # matrix catches single score errors under the hood
    invalid_score = [
        score
        for score in pps.matrix(subset_df, output="list")
        if (score["x"] == "Survived" and score["y"] == "Age_datetime")
    ][0]
    assert invalid_score["ppscore"] == 0
コード例 #5
0
def get_connectivity_matrices(time_series, subjects, kinds=DEFAULT_KINDS):
    """Compute connectivity matrices for every requested measure.

    Arguments:
        time_series {list} -- List of extracted time series
        subjects {list} -- List of corresponding subjects

    Keyword Arguments:
        kinds {list} -- List of connectivity measures (default: {DEFAULT_KINDS})

    Returns:
        {dict} -- Maps each computed kind to a dict of subject -> matrix.
        A kind that is skipped (``'tangent'`` with fewer than two distinct
        subjects) has no entry.
    """
    matrices = {}
    n_subjects = len(set(subjects))

    for kind in kinds:
        print(
            f'{bcolors.OKBLUE}Computing {kind} of {n_subjects} subjects{bcolors.ENDC}'
        )

        if kind in NILEARN_KINDS:
            if kind == 'tangent' and n_subjects < 2:
                print(
                    f'{bcolors.FAIL}Tangent space parametrization can only be applied to a group of subjects, as it returns deviations to the mean. Skipping{bcolors.ENDC}'
                )
                continue

            estimator = ConnectivityMeasure(kind=kind)
            stacked = estimator.fit_transform(time_series)

            # One entry per row of the stacked result, keyed by the subject
            # found at the same position.
            per_subject = {}
            for idx in range(stacked.shape[0]):
                per_subject[subjects[idx]] = stacked[idx]
            matrices[kind] = per_subject

        if kind == 'pps':
            for idx, subject in enumerate(subjects):
                frame = pd.DataFrame(time_series[idx])
                long_scores = pps.matrix(frame)
                grid = long_scores[['x', 'y', 'ppscore']].pivot(
                    columns='x', index='y', values='ppscore')

                # The per-kind dict is created lazily, on the first subject.
                matrices.setdefault(kind, {})[subject] = grid.values

    return matrices
コード例 #6
0
def create_heatmap_pp_score(df):
    """Build an annotated Plotly heatmap of the predictive power scores of ``df``.

    Parameters:
        df: DataFrame whose columns are scored against each other; its column
            names label both heatmap axes.

    Returns:
        The figure created by ``ff.create_annotated_heatmap``, sized 600 x 600.
    """
    labels = df.columns.to_list()

    scores = pps.matrix(df)
    # Newer ppscore releases return one row per (x, y) pair instead of a
    # square grid. Pivot that long format back into a grid and restore the
    # column order of ``df`` so the values line up with the axis labels.
    if {"x", "y", "ppscore"}.issubset(scores.columns):
        scores = scores.pivot(columns="x", index="y", values="ppscore")
        scores = scores.reindex(index=labels, columns=labels)

    z = scores.round(2).values

    x = list(labels)
    y = list(labels)

    fig2 = ff.create_annotated_heatmap(z, x, y)

    fig2.update_layout(
        autosize=False,
        width=600,
        height=600, )

    return fig2
コード例 #7
0
def calculate_pps_matrix(data, agg_method):
    """Return the PPS grid of ``data``, optionally made symmetric.

    ``pps.matrix`` is run without sampling (``sample=None``) and its ``x``,
    ``y`` and ``ppscore`` columns are pivoted into a grid (rows ``y``, columns
    ``x``) whose axis names are cleared.

    Parameters:
        data: passed unchanged to ``pps.matrix``.
        agg_method: ``"max"``, ``"min"`` or ``"mean"`` combines the grid with
            its transpose element-wise; any other value leaves the grid as is.

    Returns:
        The (possibly aggregated) grid.
    """
    _global_checks.global_check_import('ppscore', 'depend_matrix with PPS calculation')
    import ppscore as pps

    long_scores = pps.matrix(data, sample=None)
    grid = long_scores[["x", "y", "ppscore"]].pivot(
        columns="x", index="y", values="ppscore"
    )
    grid = grid.rename_axis(None, axis=1).rename_axis(None, axis=0)

    # aggregate values to make symmetric matrix
    if agg_method == "max":
        grid = np.maximum(grid, grid.transpose())
    elif agg_method == "min":
        grid = np.minimum(grid, grid.transpose())
    elif agg_method == "mean":
        grid = (grid + grid.transpose()) / 2
    return grid
コード例 #8
0
 def pps_heatmap(df):
     """Compute the Predictive Power Score matrix of ``df`` and draw it as a heatmap.

     Args:
         df: data passed unchanged to ``pps.matrix``.

     Returns:
         The object returned by ``sb.heatmap``, after its title and axis
         labels have been set.
     """
     long_scores = pps.matrix(df)
     grid = long_scores[['x', 'y', 'ppscore']].pivot(columns='x',
                                                     index='y',
                                                     values='ppscore')
     plt.figure(figsize=(24, 8))
     heatmap_options = dict(vmin=0,
                            vmax=1,
                            cmap="afmhot_r",
                            linewidths=0.5,
                            annot=True)
     ax = sb.heatmap(grid, **heatmap_options)
     ax.set_title("PPS matrix")
     ax.set_xlabel("feature")
     ax.set_ylabel("target")
     return ax
コード例 #9
0
# Annotated heatmap of `correlation`, which is computed before this excerpt.
sns.heatmap(correlation, annot=True, square=True, cmap='coolwarm')
plt.show()

"""From the analysis it was evident that **area_mean, perimeter_mean, perimeter_worst, area_worst** have a very high correlation hence they were removed from features.

After applying **PPS (Predictive Power Score)**
"""

# Feature columns kept for the PPS analysis and, further below, for training.
new_cols = ['radius_mean','texture_mean','smoothness_mean','compactness_mean',
             'concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean',
            'radius_se','texture_se','perimeter_se','area_se','smoothness_se',
             'compactness_se','concavity_se','concave points_se','symmetry_se',
             'fractal_dimension_se','radius_worst','texture_worst',
             'smoothness_worst','compactness_worst','concavity_worst','concave points_worst',
             'symmetry_worst','fractal_dimension_worst']
# Pairwise predictive power scores of the selected columns.
# NOTE(review): the result is passed straight to sns.heatmap, which presumably
# expects a square grid; newer ppscore releases return long-format rows
# (x, y, ppscore, ...) that would need pivoting first — confirm the version.
pp_mean = pps.matrix(df[new_cols])
plt.figure(figsize=(30,30))
# Changes the global matplotlib font size for this and all later figures.
plt.rcParams.update({'font.size': 15})
sns.heatmap(pp_mean, annot=True, square=True, cmap='coolwarm')
plt.show()

"""# Creation of X and Y for training the machine learning models.

**Label encoder converts the 2 text classification label (M & B) into numeric 1 and 0.** M label is translated to 1 and B label is translated to 0.

Splitting data into **train and test with 75:25** ratio
"""

# dataframe X contains feature selected
X = df[new_cols]
# dataframe Y contains the corresponding labels
コード例 #10
0
# Sidebar selector for the year; the options are the distinct values of the
# "year" column and the first one is preselected.
d = st.sidebar.selectbox(label="Select a year:",
                         options=severity_historical["year"].unique().tolist(),
                         index=0)

# Sidebar selector for the variable shown in the univariate histogram.
l = st.sidebar.selectbox(
    label="Select a variable (for univariate distributions):",
    options=selectcols,
    index=0)

# Rows of the selected year, without the "FIPS" and "year" columns.
subsetdf = severity_historical.loc[severity_historical["year"] == d].drop(
    labels=["FIPS", "year"], axis=1)

col1, col2, col3 = st.beta_columns(3)

# Predictive power scores pivoted into a grid: rows are y, columns are x.
matrix_df = pps.matrix(subsetdf)[['x', 'y', 'ppscore']].pivot(columns='x',
                                                              index='y',
                                                              values='ppscore')

# Figure 1: histogram of the selected variable.
fig1 = plt.figure(1)
sns.histplot(subsetdf, x=l).set_title("Univariate Distribution")

# Figure 2: correlation heatmap of the yearly subset.
fig2 = plt.figure(2)
sns.heatmap(subsetdf.corr(), cmap="flare",
            annot=True).set_title("Correlation Matrix")

# Figure 3: heatmap of the predictive power score grid.
fig3 = plt.figure(3)
sns.heatmap(matrix_df, cmap="cubehelix_r",
            annot=True).set_title("Predictive Power Score")

# NOTE(review): the keyword is spelled `use_container_widths` (plural) here;
# confirm against the installed Streamlit version whether that is intended.
# fig3 is not rendered within this excerpt.
col1.pyplot(fig1, use_container_widths=True)
col2.pyplot(fig2, use_container_widths=True)
コード例 #11
0

'''data source initialization'''


# The CSV file is read from the directory that contains this script.
this_dir = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(this_dir, 'winequality-red.csv')
quality_data = pd.read_csv(data_path)
# quality_data.drop_duplicates(inplace=True)
# Duplicates seem to be relevant and may exist due to process stability, so they are kept.


'''predictive score matrix'''


# Predictive power scores pivoted into a grid (rows are y, columns are x) and
# drawn as an annotated heatmap on a dedicated axes.
matrix_pp = pps.matrix(quality_data)[['x', 'y', 'ppscore']].pivot(
    columns='x', index='y', values='ppscore')
fig0, ax0 = plt.subplots()
plot0 = sns.heatmap(matrix_pp, annot=True, ax=ax0)
# plt.show()
# most of features have no predictive power, so let's drop them :)


'''dependent and independet variables separation and cleaning'''


def robust_zscore(d: 'DataFrame'):
    """Return robust z-scores of ``d`` computed along axis 0.

    Each value is centred on the median of its column and divided by that
    column's median absolute deviation.
    """
    values = np.asanyarray(d)
    center = np.median(values, axis=0)
    spread = median_abs_deviation(values, axis=0)
    return (values - center) / spread
コード例 #12
0
ファイル: eda.py プロジェクト: franperic/pokehackathon
# Sort in place by both names so the flattened results follow a fixed order.
mean_rslts.sort_values(["Name_1", "Name_2"], inplace=True)

# Copy the BattleResult values and resize the copy in place to 144 x 144.
# NOTE(review): 144 is hard-coded, presumably the number of distinct names — confirm.
rslts = mean_rslts.BattleResult.values.copy()
rslts.resize([144, 144])

# Normalization
# Shift by |min| and divide by |min| + |max| (NaNs ignored), then replace the
# NaNs with 0. This maps onto [0, 1] only when min <= 0 <= max.
rslts_min, rslts_max = np.abs(np.nanmin(rslts)), np.abs(np.nanmax(rslts))
rslts_norm = (rslts + rslts_min) / (rslts_min + rslts_max)
rslts_norm = np.nan_to_num(rslts_norm, nan=0)

# Plot
fig = go.Figure(data=go.Heatmap(z=rslts_norm,
                                x=mean_rslts.Name_1.unique(),
                                y=mean_rslts.Name_2.unique(),
                                colorscale='Viridis'))

fig.update_layout(title='Mean Battle Results', xaxis_nticks=50)

fig.show()

# Predictive Power Score
# `battle` is defined before this excerpt; sample=5000 is forwarded to ppscore.
pps_matrix = pps.matrix(battle, sample=5000)

# NOTE(review): the values are plotted directly with the index on x and the
# columns on y, which presumably assumes a square grid with identical labels
# on both axes — confirm against the installed ppscore version.
fig = go.Figure(data=go.Heatmap(z=pps_matrix.values,
                                x=pps_matrix.index,
                                y=pps_matrix.columns,
                                colorscale='Viridis'))

fig.update_layout(title='Predictive Power Score', xaxis_nticks=50)

fig.show()
コード例 #13
0
# there are 1863 population samples, for whom genetics is available - len(set(total_df.index).intersection(samples))
# of them, 198 have olink data for proteins
print(
    "from here, use code below if you want only pop analysis, otherwise scroll to the bottom"
)
# Deliberate stop: halts a top-to-bottom run here (assert statements are
# skipped when Python runs with -O).
assert False

# Selecting non-obese - only healthy, have no olink data (to avoid an overfit)
# Keep the rows whose index is in `samples` but not in `olinked_samples`.
total_df = total_df.loc[total_df.index.intersection(samples).difference(
    olinked_samples)]  # 1665 samples
total_df_sample = total_df
print(total_df_sample.shape)

# Using PPS
# Predictive power scores pivoted into a grid: rows are y, columns are x.
matrix_df = pps.matrix(total_df_sample)[['x', 'y',
                                         'ppscore']].pivot(columns='x',
                                                           index='y',
                                                           values='ppscore')
fig = plt.figure(figsize=(16, 16))
ax = sns.heatmap(matrix_df,
                 vmin=0,
                 vmax=1,
                 cmap="Blues",
                 linewidths=0.5,
                 annot=True)
# The y-limits are set explicitly to span one unit per column of the data.
axlim = total_df_sample.shape[1]
ax.set_ylim(
    0, axlim
)  # a patch for the heatmap in sns - it was broken in matplotlib 3.1.1
plt.savefig("pps.png")
plt.close()
# No findings
コード例 #14
0
ファイル: titanic.py プロジェクト: akertek/ML-Projects
print("----- Z Test Results -----")
print("T stat. = " + str(t_stat_3))
print("P value = " + str(p_value_3))  # P-value is less than 0.05

print("")

# T-test: Checking if the distribution means (fares of survivors vs fares of non-survivors) are statistically different
t_stat_4, p_value_4 = stats.ttest_ind(dist_c, dist_d)
print("----- T Test Results -----")
print("T stat. = " + str(t_stat_4))
print("P value = " + str(p_value_4))  # P-value is less than 0.05

# PP - Score

# Predictive power scores pivoted into a grid: rows are y, columns are x.
matrix_data = pps.matrix(data)[['x', 'y', 'ppscore']].pivot(columns='x',
                                                            index='y',
                                                            values='ppscore')
matrix_data = matrix_data.round(2)  # Rounding matrix_data's values to 0,XX

sns.heatmap(matrix_data,
            vmin=0,
            vmax=1,
            cmap="Blues",
            linewidths=0.75,
            annot=True)

#%% Feature Engineering

data['AgeCat'] = ''
# A single .loc call with row mask and column label writes into `data`
# itself. The chained form data['AgeCat'].loc[mask] = ... assigns through an
# intermediate Series and is not guaranteed to update the frame.
data.loc[data['Age'] < 18, 'AgeCat'] = 'young'
コード例 #15
0
def feature_predictive_power(data, feature, target, id_column, top=5):
    '''
    Calculate the predictive power of features on a feature of interest for effective Exploratory data analysis and feature engineering

    Parameter:
    -----------------------------------------
    data: DataFrame

        Data which contain the feature. The caller's frame is not modified.

    feature: str

        Name of the Column of interest on which EDA or feature engineering is to be performed

    target: str

        Name of the target column in the data. Currently not used by the computation.

    id_column: str

        Name of the ID column; it is dropped before scoring.

    top(default = 5): int

        Top predictive columns for the feature
        Top must be less than number of features

    Returns:
        DataFrame with a single ``predictive_power`` column holding rows
        2 .. top + 1 of the feature's scores sorted in descending order
        (the first row is skipped).

    Raises:
        KeyError: if ``feature`` is the ID column.
        Exception: if all scores of ``feature`` are identical, or if
            ``feature`` is not convertible to float and more than 75% of its
            values are distinct.
    '''
    data = data.drop(columns=[id_column])
    features = list(data.columns)

    # Validate before the expensive score computation.
    if feature == id_column:
        raise KeyError(
            f"{feature} is not a predictive feature, try any of {features}")

    # NOTE(review): `.loc[feature]` assumes ps.matrix returns a grid indexed by
    # feature name — confirm against the installed ppscore version.
    rt = ps.matrix(data)
    feature_scores = rt.loc[feature]

    #constant feature check
    if len(feature_scores.unique()) == 1:
        raise Exception(
            f"{feature} is a constant difference feature, all other features are perfect predictors. \nConsideer dropping {feature}"
        )

    # A feature counts as numerical when its values convert to float; only
    # conversion failures are caught instead of every possible exception.
    try:
        data[feature].astype(float)
        is_numerical = True
    except (TypeError, ValueError):
        is_numerical = False

    #nominal feature check (categorical cases only)
    if not is_numerical and len(data[feature].unique()) > data.shape[0] * .75:
        raise Exception(
            f"{feature} is a nominal feature, consider dropping it")

    predictors = pd.DataFrame(feature_scores.sort_values(ascending=False))
    return predictors[1:top + 1].rename(
        columns={feature: 'predictive_power'})
コード例 #16
0
# Rename two columns to more descriptive names before scoring.
df = df.rename(columns={"Fare": "TicketPrice"})
df = df.rename(columns={"Embarked": "Port"})

# %% [markdown]
# ## Single Predictive Power Score
# - Answering the question: how well can Sex predict the Survival probability?

# %%
# Score of a single feature/target pair: "Sex" as feature, "Survived" as target.
pps.score(df, "Sex", "Survived")

# %% [markdown]
# ## PPS matrix
# - Answering the question: which predictive patterns exist between the columns?

# %%
# Scores for the column pairs of df.
matrix = pps.matrix(df)

# %%
# Bare expression so the notebook cell displays the result.
matrix

# %%
# `heatmap` is a helper defined outside this excerpt.
heatmap(matrix)

# %% [markdown]
# ## Correlation matrix
# - As a comparison to the PPS matrix

# %%
# NOTE(review): df presumably still contains non-numeric columns (e.g. "Sex");
# confirm that df.corr() handles them under the installed pandas version.
corr_heatmap(df.corr())

# %%
コード例 #17
0
def cria_matriz_pps(df):
    """Draw an annotated heatmap of the predictive power scores of ``df``.

    The result of ``pps.matrix`` is pivoted into a grid (rows are ``y``,
    columns are ``x``) and plotted on a new 20 x 20 figure.
    """
    pps_long = pps.matrix(df)
    correlations = pps_long.pivot(columns='x', index='y', values='ppscore')
    plt.figure(figsize=(20, 20))
    sns.heatmap(correlations, annot=True)