Beispiel #1
0
def info_gain(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the information gain / mutual information coefficients between a
    set of features and a set of targets in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns.

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf.

    Returns
    -------
    pandas.DataFrame, pandas.Series or scalar
        A DataFrame with one column per target when several target columns
        remain; a Series sorted in descending order when a single target
        column remains; the bare coefficient when the result is one cell.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    The coefficients are divided by ``log(2)``, i.e. expressed in bits.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> info_gain(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    # Cache of single-column raw entropies so each column is queried once.
    entropy_dict = OrderedDict()
    length = len(idadf)
    loglength = log(length)
    log2 = log(2)  # loop invariant, used to convert the result to base 2

    # values[column][row] -> coefficient; becomes the result matrix.
    values = OrderedDict()

    for t in target:
        if t not in values:
            values[t] = OrderedDict()

        for feature in features:
            if feature == t:
                continue  # a column is never paired with itself
            if feature not in values:
                values[feature] = OrderedDict()
            if t in values[feature]:
                continue  # already computed through the symmetric pair

            if t not in entropy_dict:
                entropy_dict[t] = entropy(idadf, t, mode="raw")
            if feature not in entropy_dict:
                entropy_dict[feature] = entropy(idadf, feature, mode="raw")
            join_entropy = entropy(idadf, [t, feature], mode="raw")

            # NOTE(review): the "/ length + loglength" correction presumes
            # that raw-mode entropies are un-normalised sums -- confirm
            # against the implementation of entropy().
            raw_gain = entropy_dict[t] + entropy_dict[feature] - join_entropy
            value = (raw_gain / length + loglength) / log2

            values[t][feature] = value
            if feature in target:
                # The measure is symmetric: mirror it to avoid recomputing.
                values[feature][t] = value

    result = pd.DataFrame(values).fillna(np.nan)
    # Columns created for features that are not targets stay empty: drop them.
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        # Rows: first the features that are also columns, then the others.
        order = [x for x in result.columns if x in features]
        order += [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            # Series.sort() was removed from pandas; sort_values() is the
            # supported way to order a Series by its values.
            result = result[result.columns[0]].sort_values(ascending=False)

    return result
Beispiel #2
0
def info_gain(idadf, target=None, features=None, ignore_indexer=True):
    """
    Return the information gain (mutual information) between feature
    columns and target columns of an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        Column(s) to use as target. Per default, all columns are considered.

    features : str or list of str, optional
        Column(s) to use as features. Per default, all columns are
        considered.

    ignore_indexer : bool, default: True
        Per default, the column declared as indexer in idadf is ignored.

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target
        When the result reduces to a single cell, that value is returned
        directly. A Series result is sorted in descending order.

    Notes
    -----
    Target and feature columns should be categorical, otherwise this
    measure does not make much sense.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> info_gain(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    n_rows = len(idadf)
    log_n_rows = log(n_rows)
    raw_entropies = OrderedDict()  # per-column cache of raw entropies
    gains = OrderedDict()          # gains[column][row] -> coefficient

    def cached_entropy(column):
        # Compute the raw entropy of a column at most once.
        if column not in raw_entropies:
            raw_entropies[column] = entropy(idadf, column, mode="raw")
        return raw_entropies[column]

    for tgt in target:
        tgt_gains = gains.setdefault(tgt, OrderedDict())

        for feat in features:
            if feat == tgt:
                continue
            feat_gains = gains.setdefault(feat, OrderedDict())
            if tgt in feat_gains:
                # Already filled in through the symmetric pair.
                continue

            h_target = cached_entropy(tgt)
            h_feature = cached_entropy(feat)
            h_joint = entropy(idadf, [tgt, feat], mode="raw")

            raw_gain = h_target + h_feature - h_joint
            gain = (raw_gain / n_rows + log_n_rows) / log(2)

            tgt_gains[feat] = gain
            if feat in target:
                feat_gains[tgt] = gain

    matrix = pd.DataFrame(gains).fillna(np.nan).dropna(axis=1, how="all")
    n_columns = len(matrix.columns)

    if n_columns > 1:
        # Order rows: features that are also columns first, then the rest.
        shared = [name for name in matrix.columns if name in features]
        remaining = [name for name in features if name not in matrix.columns]
        return matrix.reindex(shared + remaining)

    if n_columns != 1:
        return matrix

    if len(matrix) == 1:
        return matrix.iloc[0, 0]

    only_column = matrix[matrix.columns[0]].copy()
    only_column.sort_values(inplace=True, ascending=False)
    return only_column
Beispiel #3
0
def gain_ratio(idadf,
               target=None,
               features=None,
               symmetry=True,
               ignore_indexer=True):
    """
    Compute the gain ratio coefficients between a set of features and a
    set of targets in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns.

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    symmetry : bool, default: True
        If True, compute the symmetric gain ratio as defined by
        [Lopez de Mantaras 1991]. Otherwise, the asymmetric gain ratio.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf.

    Returns
    -------
    pandas.DataFrame, pandas.Series or scalar
        A DataFrame with one column per target when several target columns
        remain (cells that were not computed are filled with 1); a Series
        sorted in ascending order when a single target column remains; the
        bare coefficient when the result is one cell.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> gain_ratio(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    # Cache of single-column raw entropies so each column is queried once.
    entropy_dict = dict()
    length = len(idadf)
    # values[column][row] -> coefficient; becomes the result matrix.
    values = OrderedDict()
    # NOTE(review): this term presumes raw-mode entropies are un-normalised
    # sums that need a length * log(length) offset -- confirm against the
    # implementation of entropy().
    corrector = length * np.log(length)

    for t in target:
        if t not in values:
            values[t] = OrderedDict()

        for feature in features:
            if feature == t:
                continue  # a column is never paired with itself
            if feature not in values:
                values[feature] = OrderedDict()
            if t in values[feature]:
                continue  # already computed through the mirrored pair

            if t not in entropy_dict:
                entropy_dict[t] = entropy(idadf, t, mode="raw")
            if feature not in entropy_dict:
                entropy_dict[feature] = entropy(idadf, feature, mode="raw")

            join_entropy = entropy(idadf, [t, feature], mode="raw")
            disjoin_entropy = entropy_dict[t] + entropy_dict[feature]
            # Local names deliberately differ from the module-level
            # info_gain / gain_ratio functions to avoid shadowing them.
            gain = disjoin_entropy - join_entropy

            if symmetry:
                # 2 * corrector because both entropies are in the divisor.
                ratio = (gain + corrector) / (disjoin_entropy + 2 * corrector)
                values[t][feature] = ratio
                if feature in target:
                    values[feature][t] = ratio
            else:
                values[t][feature] = (
                    (gain + corrector) / (entropy_dict[t] + corrector))
                if feature in target:
                    # Asymmetric: the mirrored cell is normalised by the
                    # entropy of the other column.
                    values[feature][t] = (
                        (gain + corrector) /
                        (entropy_dict[feature] + corrector))

    # Fill the matrix
    result = pd.DataFrame(values).fillna(np.nan)
    # Columns created for features that are not targets stay empty: drop them.
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        # Rows: first the features that are also columns, then the others.
        order = [x for x in result.columns if x in features]
        order += [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            # Series.sort() was removed from pandas; sort_values() is the
            # supported way to order a Series by its values.
            result = result[result.columns[0]].sort_values(ascending=True)
    else:
        # Cells never computed in the loop (a column against itself) are
        # set to 1.
        result = result.fillna(1)

    return result
Beispiel #4
0
def gain_ratio(idadf, target=None, features=None, symmetry=True, ignore_indexer=True):
    """
    Compute the gain ratio coefficients between a set of features and a
    set of targets in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns.

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    symmetry : bool, default: True
        If True, compute the symmetric gain ratio as defined by
        [Lopez de Mantaras 1991]. Otherwise, the asymmetric gain ratio.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf.

    Returns
    -------
    pandas.DataFrame, pandas.Series or scalar
        A DataFrame with one column per target when several target columns
        remain (cells that were not computed are filled with 1); a Series
        sorted in ascending order when a single target column remains; the
        bare coefficient when the result is one cell.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> gain_ratio(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    entropy_dict = dict()  # raw entropy of each single column, computed once
    length = len(idadf)
    values = OrderedDict()  # values[column][row] -> coefficient
    # NOTE(review): this offset presumes raw-mode entropies are
    # un-normalised sums -- confirm against the implementation of entropy().
    corrector = length * np.log(length)

    for t in target:
        if t not in values:
            values[t] = OrderedDict()
        features_notarget = [x for x in features if x != t]

        for feature in features_notarget:
            if feature not in values:
                values[feature] = OrderedDict()

            if t not in values[feature]:  # i.e. it was not already computed
                if t not in entropy_dict:
                    entropy_dict[t] = entropy(idadf, t, mode="raw")
                if feature not in entropy_dict:
                    entropy_dict[feature] = entropy(idadf, feature, mode="raw")

                join_entropy = entropy(idadf, [t, feature], mode="raw")
                disjoin_entropy = entropy_dict[t] + entropy_dict[feature]
                # Renamed from info_gain / gain_ratio so that the locals do
                # not shadow the module-level functions of the same name.
                corrected_gain = disjoin_entropy - join_entropy + corrector

                if symmetry:
                    # 2 * corrector because both entropies are in the divisor
                    ratio = corrected_gain / (disjoin_entropy + 2 * corrector)
                    values[t][feature] = ratio
                    if feature in target:
                        values[feature][t] = ratio
                else:
                    values[t][feature] = corrected_gain / (entropy_dict[t] + corrector)
                    if feature in target:
                        # The mirrored cell is normalised by the other column.
                        values[feature][t] = corrected_gain / (entropy_dict[feature] + corrector)

    # Fill the matrix
    result = pd.DataFrame(values).fillna(np.nan)
    # Feature-only columns were created empty in the loop: remove them.
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            # Series.sort() no longer exists in pandas; use sort_values().
            result = result[result.columns[0]].sort_values(ascending=True)
    else:
        # Cells never computed in the loop (a column against itself) get 1.
        result = result.fillna(1)

    return result