def test_entropy_one_column(self, idadf):
    """Entropy of a single column is returned as a plain float."""
    if len(idadf.columns) >= 1:
        first_column = idadf.columns[0]
        outcome = entropy(idadf, target=first_column)
        assert isinstance(outcome, float)
def test_entropy_default(self, idadf):
    """Without a target, entropy yields a Series with one entry per column."""
    if len(idadf.columns) > 1:
        per_column = entropy(idadf)
        assert isinstance(per_column, pandas.core.series.Series)
        assert len(per_column.index) == len(idadf.columns)
def test_entropy_multiple_columns(self, idadf):
    """Joint entropy over a pair of columns is returned as a plain float."""
    if len(idadf.columns) > 1:
        column_pair = [idadf.columns[0], idadf.columns[1]]
        outcome = entropy(idadf, target=column_pair)
        assert isinstance(outcome, float)
def su(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the symmetric uncertainty coefficients between a set of features
    and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame
    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns
    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.
    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> su(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    # Cache per-column raw entropies so each is computed at most once.
    entropy_dict = dict()
    length = len(idadf)
    # Normalization term for "raw" entropies (which are not divided by N*log N).
    corrector = np.log(length) * length
    values = OrderedDict()

    for t in target:
        if t not in values:
            values[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]
        for feature in features_notarget:
            if feature not in values:
                values[feature] = OrderedDict()
            if t not in values[feature]:
                if t not in entropy_dict:
                    entropy_dict[t] = entropy(idadf, t, mode="raw")
                if feature not in entropy_dict:
                    entropy_dict[feature] = entropy(idadf, feature, mode="raw")
                join_entropy = entropy(idadf, [t] + [feature], mode="raw")
                disjoin_entropy = entropy_dict[t] + entropy_dict[feature]
                # Symmetric uncertainty: 2 * I(t; feature) / (H(t) + H(feature)),
                # expressed in terms of raw entropies plus the corrector.
                value = (2.0 * (disjoin_entropy - join_entropy + corrector) /
                         (disjoin_entropy + corrector * 2))
                values[t][feature] = value
                # The matrix is symmetric, so mirror the value when possible.
                if feature in target:
                    values[feature][t] = value

    result = pd.DataFrame(values).fillna(np.nan)
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        # Keep the caller-visible feature ordering.
        order = ([x for x in result.columns if x in features] +
                 [x for x in features if x not in result.columns])
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            # BUGFIX: Series.sort() was removed in pandas 0.20; use sort_values.
            result = result.sort_values(ascending=True)
    else:
        # Self-correlation cells: symmetric uncertainty of a column with itself is 1.
        result = result.fillna(1)
    return result
def su(idadf, target = None, features = None, ignore_indexer=True):
    """
    Compute the symmetric uncertainty coefficients between a set of features
    and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame
    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns
    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.
    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> su(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    # Cache per-column raw entropies so each is computed at most once.
    entropy_dict = dict()
    length = len(idadf)
    # Normalization term for "raw" entropies (which are not divided by N*log N).
    corrector = np.log(length)*length
    values = OrderedDict()

    for t in target:
        if t not in values:
            values[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]
        for feature in features_notarget:
            if feature not in values:
                values[feature] = OrderedDict()
            if t not in values[feature]:
                if t not in entropy_dict:
                    entropy_dict[t] = entropy(idadf, t, mode = "raw")
                if feature not in entropy_dict:
                    entropy_dict[feature] = entropy(idadf, feature, mode = "raw")
                join_entropy = entropy(idadf, [t] + [feature], mode = "raw")
                disjoin_entropy = entropy_dict[t] + entropy_dict[feature]
                # Symmetric uncertainty: 2 * I(t; feature) / (H(t) + H(feature)),
                # expressed in terms of raw entropies plus the corrector.
                value = (2.0*(disjoin_entropy - join_entropy + corrector)/(disjoin_entropy + corrector*2))
                values[t][feature] = value
                # The matrix is symmetric, so mirror the value when possible.
                if feature in target:
                    values[feature][t] = value

    result = pd.DataFrame(values).fillna(np.nan)
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        # Keep the caller-visible feature ordering.
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0,0]
        else:
            result = result[result.columns[0]].copy()
            # BUGFIX: Series.sort() was removed in pandas 0.20; use sort_values.
            result = result.sort_values(ascending = True)
    else:
        # Self-correlation cells: symmetric uncertainty of a column with itself is 1.
        result = result.fillna(1)
    return result
def test_entropy_one_column(self, idadf):
    """The entropy of one column must come back as a float scalar."""
    if not len(idadf.columns) >= 1:
        return
    value = entropy(idadf, target=idadf.columns[0])
    assert isinstance(value, float)
def test_entropy_multiple_columns(self, idadf):
    """The joint entropy of two columns must come back as a float scalar."""
    if not len(idadf.columns) > 1:
        return
    value = entropy(idadf, target=list(idadf.columns[:2]))
    assert isinstance(value, float)
def test_entropy_default(self, idadf):
    """Default call must return a pandas Series covering every column."""
    if not len(idadf.columns) > 1:
        return
    series = entropy(idadf)
    assert isinstance(series, pandas.core.series.Series)
    assert len(series.index) == len(idadf.columns)