def conditional_entropy(x, y, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE):
    """
    Compute the conditional entropy of x given y, S(x|y), using the
    natural logarithm.

    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy

    **Returns:** float

    Parameters
    ----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of measurements, paired element-wise with x
    nan_strategy : string, default = 'replace'
        How missing values are handled: 'drop' removes every sample that
        has a missing value, 'replace' substitutes nan_replace_value for
        each missing value. Missing values are None and np.nan. Any other
        strategy leaves the data untouched.
    nan_replace_value : any, default = 0.0
        Substitute for missing values. Only used when nan_strategy is
        'replace'.
    """
    if nan_strategy == REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == DROP:
        x, y = remove_incomplete_samples(x, y)

    # Frequency tables: marginal counts of y and joint counts of (x, y).
    y_counts = Counter(y)
    joint_counts = Counter(zip(x, y))
    n_samples = sum(y_counts.values())

    # S(x|y) = sum over observed (x, y) pairs of p(x,y) * log(p(y) / p(x,y)).
    # Accumulated term by term, in insertion order of the joint table.
    result = 0.0
    for (_, y_value), joint_count in joint_counts.items():
        p_joint = joint_count / n_samples
        p_marginal = y_counts[y_value] / n_samples
        result += p_joint * math.log(p_marginal / p_joint)
    return result
def correlation_ratio(categories, measurements, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE):
    """
    Compute the Correlation Ratio (often written with the greek letter Eta)
    for categorical-continuous association.

    It answers the question: given a continuous measurement, can the
    category it belongs to be determined? A value of 0 means the category
    cannot be determined from the measurement at all, a value of 1 means it
    can be determined with absolute certainty.

    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio

    **Returns:** float in the range of [0,1]

    Parameters
    ----------
    categories : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    measurements : list / NumPy ndarray / Pandas Series
        A sequence of continuous measurements, paired element-wise with
        categories
    nan_strategy : string, default = 'replace'
        How missing values are handled: 'drop' removes every sample that
        has a missing value, 'replace' substitutes nan_replace_value for
        each missing value. Missing values are None and np.nan. Any other
        strategy leaves the data untouched.
    nan_replace_value : any, default = 0.0
        Substitute for missing values. Only used when nan_strategy is
        'replace'.
    """
    if nan_strategy == REPLACE:
        categories, measurements = replace_nan_with_value(
            categories, measurements, nan_replace_value)
    elif nan_strategy == DROP:
        categories, measurements = remove_incomplete_samples(
            categories, measurements)
    categories = convert(categories, 'array')
    measurements = convert(measurements, 'array')

    # Encode every distinct category as an integer code 0..n_groups-1.
    codes, _ = pd.factorize(categories)
    n_groups = np.max(codes) + 1

    # Per-category sample count and mean of the measurements.
    group_means = np.zeros(n_groups)
    group_sizes = np.zeros(n_groups)
    for code in range(n_groups):
        members = measurements[np.flatnonzero(codes == code)]
        group_sizes[code] = len(members)
        group_means[code] = np.average(members)

    # Grand mean, rebuilt from the per-category means weighted by size.
    grand_mean = np.sum(group_means * group_sizes) / np.sum(group_sizes)

    # eta^2 = (between-category sum of squares) / (total sum of squares).
    between_ss = np.sum(group_sizes * np.power(group_means - grand_mean, 2))
    total_ss = np.sum(np.power(measurements - grand_mean, 2))

    # A zero numerator means every category shares the same mean; returning
    # early also avoids evaluating 0 / 0 when all measurements are equal.
    if between_ss == 0:
        return 0.0
    return np.sqrt(between_ss / total_ss)
def theils_u(x, y, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE):
    """
    Calculates Theil's U statistic (Uncertainty coefficient) for
    categorical-categorical association.

    This is the uncertainty of x given y: the value is in the range [0,1],
    where 0 means y provides no information about x, and 1 means y provides
    full information about x.
    This is an asymmetric coefficient: U(x,y) != U(y,x)

    Wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient

    **Returns:** float in the range of [0,1]

    Parameters
    ----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop' to remove samples
        with missing values, or 'replace' to replace all missing values with
        the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'.
    """
    if nan_strategy == REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == DROP:
        x, y = remove_incomplete_samples(x, y)

    # S(x|y): entropy of x that remains once y is known.
    s_xy = conditional_entropy(x, y)

    # S(x): entropy of the marginal distribution of x.
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = [count / total_occurrences for count in x_counter.values()]
    s_x = ss.entropy(p_x)

    # x has no uncertainty at all (at most one distinct value), so it is
    # trivially fully determined. Return a float, as documented.
    if s_x == 0:
        return 1.0

    u = (s_x - s_xy) / s_x
    # S(x) and S(x|y) are computed along different floating-point paths, so
    # the ratio can land marginally outside [0, 1] (e.g. about -1e-17 for
    # independent variables). Clip it back into the documented range.
    return np.clip(u, 0.0, 1.0)
def cramers_v(x, y, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE):
    """
    Compute Cramer's V statistic for categorical-categorical association,
    with the bias correction from Bergsma and Wicher, Journal of the Korean
    Statistical Society 42 (2013): 323-328.

    This is a symmetric coefficient: V(x,y) = V(y,x)

    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    NOTE(review): a second function named `cramers_v`, taking additional
    parameters, is defined further down in this file; that later definition
    rebinds the name at import time.

    **Returns:** float in the range of [0,1]

    Parameters
    ----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    nan_strategy : string, default = 'replace'
        How missing values are handled: 'drop' removes every sample that
        has a missing value, 'replace' substitutes nan_replace_value for
        each missing value. Missing values are None and np.nan. Any other
        strategy leaves the data untouched.
    nan_replace_value : any, default = 0.0
        Substitute for missing values. Only used when nan_strategy is
        'replace'.
    """
    if nan_strategy == REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == DROP:
        x, y = remove_incomplete_samples(x, y)

    # Contingency table of x (rows) against y (columns).
    contingency = pd.crosstab(x, y)
    chi2_stat = ss.chi2_contingency(contingency)[0]
    n_obs = contingency.sum().sum()
    n_rows, n_cols = contingency.shape

    # Bias-corrected phi^2, floored at zero.
    phi2 = chi2_stat / n_obs
    phi2_corrected = max(
        0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1))

    # Bias-corrected table dimensions.
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)

    # No guard against a zero denominator here (e.g. when one of the
    # variables has a single category).
    smaller_dim = min((cols_corrected - 1), (rows_corrected - 1))
    return np.sqrt(phi2_corrected / smaller_dim)
def cramers_v(x, y, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE, min_bin_size=5, min_set_size=20):
    """
    Compute Cramer's V statistic for categorical-categorical association,
    with the bias correction from Bergsma and Wicher, Journal of the Korean
    Statistical Society 42 (2013): 323-328.

    Small bins are removed first (via remove_small_bins), then missing
    values are handled according to nan_strategy. If too few samples or too
    few categories remain, a sentinel is returned in place of the statistic.

    This is a symmetric coefficient: V(x,y) = V(y,x)

    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    **Returns:** a pair: a float in the range of [0,1] and an integer, the
    considered sample size. The first element is NOT_ENOUGH_SAMPLES when
    fewer than min_set_size samples remain, and NOT_ENOUGH_CATEGORIES when
    either variable has at most one category left.

    Parameters
    ----------
    x : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    y : list / NumPy ndarray / Pandas Series
        A sequence of categorical measurements
    nan_strategy : string, default = 'replace'
        How missing values are handled: 'drop' removes every sample that
        has a missing value, 'replace' substitutes nan_replace_value for
        each missing value. Missing values are None and np.nan. Any other
        strategy leaves the data untouched.
    nan_replace_value : any, default = 0.0
        Substitute for missing values. Only used when nan_strategy is
        'replace'.
    min_bin_size : int, default = 5
        The minimal number of elements a bin should contain
    min_set_size : int, default = 20
        The minimal number of elements a whole set should contain
    """
    x, y = remove_small_bins(x, y, min_bin_size)
    if nan_strategy == REPLACE:
        x, y = replace_nan_with_value(x, y, nan_replace_value)
    elif nan_strategy == DROP:
        x, y = remove_incomplete_samples(x, y)

    size = len(x)
    if size < min_set_size:
        # Sample size too small.
        return NOT_ENOUGH_SAMPLES, size

    # Contingency table of x (rows) against y (columns), built on
    # object-dtype arrays.
    contingency = pd.crosstab(
        np.array(x, dtype=object), np.array(y, dtype=object))
    n_rows, n_cols = contingency.shape
    if n_rows <= 1 or n_cols <= 1:
        # Number of bins too small.
        return NOT_ENOUGH_CATEGORIES, size

    chi2_stat = ss.chi2_contingency(contingency)[0]
    n_obs = contingency.sum().sum()

    # Bias-corrected phi^2, floored at zero.
    phi2 = chi2_stat / n_obs
    phi2_corrected = max(
        0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1))

    # Bias-corrected table dimensions.
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)

    # NOTE: a zero denominator (not enough data) is not guarded against
    # here; it should not happen after the checks above.
    smaller_dim = min((cols_corrected - 1), (rows_corrected - 1))
    return np.sqrt(phi2_corrected / smaller_dim), size