Exemple #1
0
def column_correlations(dataset_a,
                        dataset_b,
                        categorical_columns,
                        theil_u=True):
    """
    Column-wise correlation calculation between ``dataset_a`` and ``dataset_b``.

    Each column is sorted independently in both frames before it is compared,
    so the score describes how alike the two sorted value sequences are, not
    how well the rows line up.

    :param dataset_a: First DataFrame
    :param dataset_b: Second DataFrame; must have the same columns in the same
        order as ``dataset_a``
    :param categorical_columns: The columns containing categorical values.
        ``None`` means no column is categorical, the string ``'all'`` means
        every column is. Any other container is used for membership tests.
    :param theil_u: Whether to use Theil's U. If False, use Cramer's V.
    :return: Mean correlation between all columns.
    :raises AssertionError: if the two frames do not have identical columns.
    """
    if categorical_columns is None:
        categorical_columns = []
    elif isinstance(categorical_columns, str) and categorical_columns == 'all':
        # The isinstance guard keeps array-like inputs (numpy arrays, pandas
        # Index) from being compared element-wise against 'all', which would
        # raise "truth value is ambiguous".
        categorical_columns = dataset_a.columns
    assert dataset_a.columns.tolist() == dataset_b.columns.tolist()

    scores = []
    for column in dataset_a.columns.tolist():
        sorted_a = dataset_a[column].sort_values()
        sorted_b = dataset_b[column].sort_values()
        if column in categorical_columns:
            if theil_u:
                score = theils_u(sorted_a, sorted_b)
            else:
                score = cramers_v(sorted_a, sorted_b)
        else:
            # pearsonr returns (statistic, p-value); only the statistic is used.
            score, _ = ss.pearsonr(sorted_a, sorted_b)
        scores.append(score)
    # NaN scores are deliberately not filtered: a single undefined column
    # correlation makes the mean NaN as well.
    return np.mean(scores)
Exemple #2
0
def corr_categories(df):
    """Rank every ordered pair of distinct columns of ``df`` by Theil's U.

    ``theils_u`` is evaluated for each ordered pair ``(Var1, Var2)`` with
    ``Var1 != Var2``, so both directions of every pair are reported.

    :param df: DataFrame whose columns are compared with each other.
    :return: DataFrame with columns ``Var1``, ``Var2`` and ``Corr_Cat``,
        sorted by ``Corr_Cat`` in descending order. It is empty when ``df``
        has fewer than two distinct column names.
    """
    rows = []
    for first in df.columns:
        for second in df.columns:
            if first != second:
                rows.append({
                    'Var1': first,
                    'Var2': second,
                    'Corr_Cat': theils_u(df[first], df[second]),
                })
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    pairs = pd.DataFrame(rows, columns=['Var1', 'Var2', 'Corr_Cat'])
    return pairs.sort_values(by=['Corr_Cat'], ascending=False)
Exemple #3
0
def correlation(df, y):
    """Print one association score per column of ``df`` against ``y``.

    Object-dtype columns are scored with ``theils_u(column, y)``; every other
    column is scored with ``correlation_ratio(y, column)``. Each result is
    printed as ``<column name> = <score>`` with three decimals. Nothing is
    returned.
    """
    for name in df.columns:
        series = df[name]
        if series.dtype == object:
            score = theils_u(series, y)
        else:
            score = correlation_ratio(y, series)
        print('{} = {:.3f}'.format(name, score))
Exemple #4
0
def calculateUncertanityCoeff(df: DataFrame,
                              labels: List[str]) -> List[List[float]]:
    """Build the matrix of Theil's U uncertainty coefficients for ``labels``.

    Implemented as in:
        https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9

        Args:
            df:             dataframe holding one column per label
                            (one-hot encoded, per the original author)
            labels (1,M):   names of the M label columns

        Returns:
            (M,M) nested list where entry ``[i][j]`` is
            ``theils_u(df[labels[i]], df[labels[j]])``
    """
    return [[theils_u(df[row_label], df[col_label]) for col_label in labels]
            for row_label in labels]
Exemple #5
0
def get_correlation_dataframe(data, **kwargs):
    """
    Score the association of every ordered pair of columns in ``data``.

    The first value of each column decides how the column is treated:

    * str / str       -> ``nominal.theils_u``
    * number / number -> Pearson's r, rescaled from [-1, 1] to [0, 1]
    * str / number    -> ``nominal.correlation_ratio`` (nominal column first)

    Any other combination is reported on stdout and recorded as NaN.

    Parameters
    ----------
    data: pandas.DataFrame
    DataFrame with nominal or metrical columns

    kwargs:
    show_progress: bool, default=False
    Prints each row if True

    Returns
    -------
    data_corr: pandas.DataFrame,
    one row per ordered column pair with the columns 'variable1',
    'variable2', 'correlation' and 'correlation_rounded' (2 decimals)
    """
    show_progress = kwargs.get('show_progress', False)

    # Positional lookup of the first value: ``data[col][0]`` is a label lookup
    # and fails when the index does not contain the label 0.
    first_values = {name: data[name].iloc[0] for name in data}

    rows = []
    for variable1 in data:
        first1 = first_values[variable1]
        for variable2 in data:
            first2 = first_values[variable2]
            # nominal-nominal -> Theil's U
            if isinstance(first1, str) and isinstance(first2, str):
                corr = nominal.theils_u(data[variable1],
                                        data[variable2],
                                        nan_replace_value='f')
            # metrical-metrical -> Pearson's r
            elif util_func.is_number(first1) and util_func.is_number(first2):
                corr = scipy.stats.pearsonr(data[variable1],
                                            data[variable2])[0]
                # change range from [-1, 1] to [0, 1] as the other metrics
                corr = (corr + 1) / 2
            # metrical-nominal -> correlation ratio (nominal column first)
            elif isinstance(first1, str) and util_func.is_number(first2):
                corr = nominal.correlation_ratio(data[variable1],
                                                 data[variable2],
                                                 nan_replace_value='f')
            elif isinstance(first2, str) and util_func.is_number(first1):
                corr = nominal.correlation_ratio(data[variable2],
                                                 data[variable1],
                                                 nan_replace_value='f')
            else:
                print('var1-type: ' + str(type(first1)) +
                      ', var2-type: ' + str(type(first2)))
                print('var1: ' + str(first1) + ', var2: ' + str(first2))
                # No measure applies. Record NaN instead of reusing the value
                # of the previous pair (or failing on an unbound name).
                corr = float('nan')
            new_row = {
                'variable1': variable1,
                'variable2': variable2,
                'correlation': corr,
                'correlation_rounded': round(corr, 2)
            }
            rows.append(new_row)
            if show_progress:
                print(new_row)
    # Build the frame once: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=[
        'variable1', 'variable2', 'correlation', 'correlation_rounded'
    ])
Exemple #6
0
def categorical_categorical(df, col1, col2):
    """Print ``theils_u(df[col1], df[col2])`` for two columns of ``df``.

    The value is only printed; the function returns nothing.
    """
    print(theils_u(df[col1], df[col2]))
Exemple #7
0
		corr = correlation_ratio(df['subscribed'], df[col])
		print('{:.3f}'.format(corr))

# Blank lines separating the previous report from the next one.
print('\n\n\n')

# For every object-dtype column, print a "<col> vs Subscribed" heading followed
# by theils_u(df['subscribed'], df[col]) with three decimals.
# No column is excluded, so an object-dtype 'subscribed' column is also
# compared with itself.
# NOTE(review): `df` and `theils_u` are defined outside this fragment.
for col in df.columns:
	if df[col].dtype == object:
		print('{} vs Subscribed'.format(col))
		corr = theils_u(df['subscribed'], df[col])
		print('{:.3f}'.format(corr))

"""
# Marital status and loans: is there a relation?
# NOTE(review): this section follows an opening triple quote whose closing
# counterpart is not visible here; confirm whether it is live code.

print('\nAnalyzing marital status vs Loans:\n')
status = df.marital.unique()

# For each marital status, print the share of rows whose 'loan' value is 'yes'.
for s in status:
    print(s)
    data = df.groupby(df.marital).get_group(s)['loan'].value_counts()
    # Both the 'yes' and 'no' counts are looked up by label; presumably every
    # group contains both values -- verify, a missing one would fail here.
    print('{:.2f}{}'.format(100 * data['yes'] / (data['yes'] + data['no']),
                            '%'))

corelation = theils_u(df['loan'], df['marital'])

# NOTE(review): the value printed below comes from theils_u although the label
# says "Correlation ratio", and the conclusion on the next line is hard-coded
# rather than derived from the value.
print('\nCorrelation ratio = {:.3f}'.format(corelation))
print('Marital status does not influence loans')

# Jobs and durations: handled by anova_test, which is defined outside this fragment.
anova_test(df)
Exemple #8
0
# Script: join exported cluster assignments with case attributes read from a
# database table, then score the association between the cluster and a few
# nominal case attributes and write the result to CSV.
# NOTE(review): input and output locations are absolute, machine-specific paths.
df_all = pd.read_csv('C:/Users/vince_000/Documents/BPI Challenge 2019/New_Exports/clusters_02_02_01.csv')

# Build database connection
# NOTE(review): credentials are embedded in the connection URL, and the
# "[email protected]" fragment looks like an e-mail obfuscation artifact from
# where this snippet was copied -- restore the real value and move the secret
# out of the source.

engine = db.create_engine('mssql+pyodbc://adminuser:Yxcvbnm@[email protected]/ProMi?driver=ODBC+Driver+17+for+SQL+Server')
con = engine.connect()
metadata = db.MetaData(schema = 'stg')

# Load the whole 'case_table_filtered' table into a DataFrame
# (presumably `db` is SQLAlchemy -- confirm against the imports).
table = db.Table('case_table_filtered',metadata,autoload = True, autoload_with=engine)
ResultProxy = con.execute(db.select([table]))
ResultSet = ResultProxy.fetchall()
df_export = pd.DataFrame(ResultSet)
# Column names are taken from the first row, so an empty result fails here.
df_export.columns = ResultSet[0].keys()

# Keep only the cases present in both sources (merge uses its default join type).
df_val = df_all.merge(df_export, left_on= 'case', right_on = '_case_concept_name_')

column_name = ['_case_Spend_area_text_', '_case_Sub_spend_area_text_', '_case_Name_','_case_Vendor_']
# NOTE(review): these two result lists carry the same names as the nl.theils_u /
# nl.cramers_v functions; any bare theils_u()/cramers_v() call later in the
# same module would hit the lists instead.
theils_u = []
cramers_v = []

# One score per attribute: Theil's U is called as (cluster, attribute),
# Cramér's V as (attribute, cluster).
for c in column_name:
    theils_u.append(nl.theils_u(df_val['cluster'] , df_val[c]))
    cramers_v.append(nl.cramers_v(df_val[c], df_val['cluster'] ))
    
# NOTE(review): only the Theil's U values are exported; the cramers_v list is
# computed but never used in this fragment.
df_nominal_corr = pd.DataFrame({'column' : column_name, 'uncertainty coefficient': theils_u})

df_nominal_corr.to_csv('C:/Users/vince_000/Documents/GitHub/BPI_Challenge_2019/Python/Clustering/Correlation_to_Clustering_in_02_02_01/Correlation_to_Clustering_in_02_02_01.csv', index = False)

#group_analysis = df_val.filter(['_case_Vendor_', 'cluster', '_case_concept_name_']).groupby(['_case_Vendor_', 'cluster']).count()

#
# Let's verify that Cramér's V is a symmetric function
# NOTE(review): this is a bare expression, so its result is only shown when a
# cell runner or REPL echoes it; the check is an exact float equality.
# `df`, `cramers_v`, `theils_u`, `mosaic` and `violinPlot` are defined outside
# this fragment.
cramers_v(df['Survived'], df['Pclass']) == cramers_v(df['Pclass'],
                                                     df['Survived'])

# %%
# You can also draw a mosaic plot for these variables
mosaic(data=df,
       index=['Survived', 'Pclass'],
       statistic=True,
       axes_label=True,
       gap=[0.01, 0.02])

# %%
# Take advantage of the asymmetry of Theil's U by calculating it for the same variables.
# This is U(Survived|Pclass), that is, "U for Survived given Pclass"
theils_u(df['Survived'], df['Pclass'])

# %%
# Just check that the opposite direction gives you a different result
theils_u(df['Pclass'], df['Survived'])

# %%
# Let's draw a violin plot of Age and Pclass
violinPlot(data=df,
           varx='Pclass',
           vary='Age',
           title='Passenger age VS Passenger class',
           xlab='Pclass',
           ylab='Age')
# In case you're not using a Jupyter notebook run also the following:
# plt.show()
def theils_u(x, y):
    """Return Theil's U (the uncertainty coefficient) of ``x`` given ``y``.

    ``U(x|y) = (H(x) - H(x|y)) / H(x)``, where ``H`` is Shannon entropy. The
    result lies in [0, 1]: 0 when ``y`` carries no information about ``x``,
    1 when ``y`` determines ``x`` completely. The measure is asymmetric, so
    ``theils_u(x, y)`` and ``theils_u(y, x)`` generally differ.

    The previous body called ``theils_u`` itself unconditionally, so every
    call ended in ``RecursionError``.

    :param x: iterable of hashable category values (the variable explained)
    :param y: iterable of hashable category values (the variable given),
        with the same number of observations as ``x``
    :return: the coefficient as a float; ``1.0`` when ``x`` has at most one
        distinct value (zero entropy, nothing left to explain)
    :raises ValueError: if ``x`` and ``y`` differ in length

    Values are grouped by equality and hash, so missing values get no special
    treatment.
    """
    from collections import Counter
    from math import log

    x_values = list(x)
    y_values = list(y)
    if len(x_values) != len(y_values):
        raise ValueError('x and y must have the same number of observations')
    total = len(x_values)

    # H(x) = -sum p(x) * log p(x)
    entropy_x = -sum((count / total) * log(count / total)
                     for count in Counter(x_values).values())
    if entropy_x == 0:
        return 1.0

    y_counts = Counter(y_values)
    joint_counts = Counter(zip(x_values, y_values))
    # H(x|y) = sum p(x, y) * log(p(y) / p(x, y)); the totals cancel inside the log.
    conditional_entropy = sum(
        (joint / total) * log(y_counts[y_value] / joint)
        for (_, y_value), joint in joint_counts.items())
    return (entropy_x - conditional_entropy) / entropy_x