Ejemplo n.º 1
0
def processContinuousFeatures(algorithm, df, column_name, entropy, config):
    """Replace a numeric column by a binary "<=t" / ">t" label column.

    Candidate thresholds are scored with the split metric of ``algorithm``;
    the best one, ``t``, is used to overwrite ``df[column_name]`` in place
    with the strings ``"<=" + str(t)`` and ``">" + str(t)``.

    Candidate thresholds:
        * at most 20 distinct values: every distinct value of the column;
        * otherwise: the column minimum, maximum and mean, plus every
          ``mean + k * std`` (k in -3..3, population std) that lies
          strictly between the minimum and the maximum.
    The largest candidate is never scored, because no row would fall on the
    ">" side of it.

    Args:
        algorithm: 'ID3', 'C4.5', 'CART', 'CHAID' or 'Regression'.
        df: pandas DataFrame containing ``column_name``. The 'CART', 'CHAID'
            and 'Regression' metrics also read its 'Decision' column.
            Modified in place.
        column_name: Name of the numeric column to discretize.
        entropy: Base value the information gain is subtracted from; only
            read for 'ID3' and 'C4.5'.
        config: Passed through unchanged to ``Training.calculateEntropy``.

    Returns:
        The same ``df`` object. It is returned untouched when the column has
        no values at all.

    Raises:
        ValueError: If more than one candidate threshold exists and
            ``algorithm`` is not one of the supported names.
    """
    supported_algorithms = ('ID3', 'C4.5', 'CART', 'CHAID', 'Regression')
    column = df[column_name]

    def binarize(threshold):
        # Overwrite the numeric column with its two string labels.
        df[column_name] = np.where(column <= threshold,
                                   "<=" + str(threshold),
                                   ">" + str(threshold))
        return df

    if column.nunique() <= 20:
        candidates = sorted(column.unique())
    else:
        col_mean = column.mean()
        col_std = column.std(ddof=0)
        col_min = column.min()
        col_max = column.max()

        # A set drops the duplicate mean (scale 0 reproduces it), so the same
        # threshold is not scored twice.
        points = {col_min, col_max, col_mean}
        for scale in range(-3, 4):
            point = col_mean + scale * col_std
            if col_min < point < col_max:
                points.add(point)
        candidates = sorted(points)

    if not candidates:
        # Empty column: there is nothing to split on.
        return df

    if len(candidates) == 1:
        return binarize(candidates[0])

    if algorithm not in supported_algorithms:
        # Fail before the scoring loop instead of with an UnboundLocalError
        # after it.
        raise ValueError("Unsupported algorithm: {!r}".format(algorithm))

    # Values that do not depend on the threshold are computed once.
    total_instances = df.shape[0]
    if algorithm == 'CHAID':
        unique_decisions = df['Decision'].unique()
        num_of_decisions = len(unique_decisions)
    elif algorithm == 'Regression':
        superset_stdev = df['Decision'].std(ddof=0)

    scores = []
    for threshold in candidates[:-1]:
        subset1 = df[column <= threshold]
        subset2 = df[column > threshold]

        subset1_rows = subset1.shape[0]
        subset2_rows = subset2.shape[0]

        subset1_probability = subset1_rows / total_instances
        subset2_probability = subset2_rows / total_instances

        if algorithm in ('ID3', 'C4.5'):
            # Information gain of the split.
            score = (entropy
                     - subset1_probability
                     * Training.calculateEntropy(subset1, config)
                     - subset2_probability
                     * Training.calculateEntropy(subset2, config))

            if algorithm == 'C4.5':
                # C4.5 ranks by gain ratio: gain divided by split info.
                split_info = (
                    -subset1_probability * math.log(subset1_probability, 2)
                    - subset2_probability * math.log(subset2_probability, 2))
                score = score / split_info

        elif algorithm == 'CART':
            # Weighted gini impurity of the two subsets.
            gini_subset1 = 1
            for count in subset1['Decision'].value_counts().tolist():
                gini_subset1 = gini_subset1 - math.pow(
                    count / subset1_rows, 2)

            gini_subset2 = 1
            for count in subset2['Decision'].value_counts().tolist():
                gini_subset2 = gini_subset2 - math.pow(
                    count / subset2_rows, 2)

            score = (subset1_probability * gini_subset1
                     + subset2_probability * gini_subset2)

        elif algorithm == 'CHAID':
            # Expected count per class assumes the classes are evenly spread
            # inside each subset.
            subset1_expected = subset1_rows / num_of_decisions
            subset2_expected = subset2_rows / num_of_decisions

            score = 0
            for decision in unique_decisions:
                subset1_observed = subset1[
                    subset1['Decision'] == decision].shape[0]
                subset2_observed = subset2[
                    subset2['Decision'] == decision].shape[0]

                subset1_term = math.sqrt(
                    ((subset1_observed - subset1_expected) *
                     (subset1_observed - subset1_expected)) /
                    subset1_expected)
                subset2_term = math.sqrt(
                    ((subset2_observed - subset2_expected) *
                     (subset2_observed - subset2_expected)) /
                    subset2_expected)

                score = score + subset1_term + subset2_term

        else:  # 'Regression'
            # Reduction of the standard deviation achieved by the split.
            weighted_stdev = (
                subset1_probability * subset1['Decision'].std(ddof=0)
                + subset2_probability * subset2['Decision'].std(ddof=0))
            score = superset_stdev - weighted_stdev

        scores.append(score)

    # Gini is an impurity (lower is better); every other metric is maximized.
    best_score = min(scores) if algorithm == 'CART' else max(scores)
    return binarize(candidates[scores.index(best_score)])
Ejemplo n.º 2
0
def processContinuousFeatures(algorithm, df, column_name, entropy, config):
    """Replace a numeric column by a binary "<=t" / ">t" label column.

    Every distinct value of the column except the largest is scored as a
    threshold with the split metric of ``algorithm``. The best one, ``t``,
    is used to overwrite ``df[column_name]`` in place with the strings
    ``"<=" + str(t)`` and ``">" + str(t)``. The largest value is skipped
    because no row would fall on the ">" side of it.

    Args:
        algorithm: 'ID3', 'C4.5', 'CART' or 'Regression'.
        df: pandas DataFrame containing ``column_name``. The 'CART' and
            'Regression' metrics also read its 'Decision' column. Modified
            in place.
        column_name: Name of the numeric column to discretize.
        entropy: Base value the information gain is subtracted from; only
            read for 'ID3' and 'C4.5'.
        config: Passed through unchanged to ``Training.calculateEntropy``.

    Returns:
        The same ``df`` object. It is returned untouched when the column has
        no values at all.

    Raises:
        ValueError: If the column has more than one distinct value and
            ``algorithm`` is not one of the supported names.
    """
    supported_algorithms = ('ID3', 'C4.5', 'CART', 'Regression')
    column = df[column_name]

    def binarize(threshold):
        # Overwrite the numeric column with its two string labels.
        df[column_name] = np.where(column <= threshold,
                                   "<=" + str(threshold),
                                   ">" + str(threshold))
        return df

    unique_values = sorted(column.unique())

    if not unique_values:
        # Empty column: there is nothing to split on.
        return df

    if len(unique_values) == 1:
        return binarize(unique_values[0])

    if algorithm not in supported_algorithms:
        # Fail before the scoring loop instead of with an UnboundLocalError
        # after it.
        raise ValueError("Unsupported algorithm: {!r}".format(algorithm))

    # Values that do not depend on the threshold are computed once.
    total_instances = df.shape[0]
    if algorithm == 'Regression':
        superset_stdev = df['Decision'].std(ddof=0)

    scores = []
    for threshold in unique_values[:-1]:
        subset1 = df[column <= threshold]
        subset2 = df[column > threshold]

        subset1_rows = subset1.shape[0]
        subset2_rows = subset2.shape[0]

        subset1_probability = subset1_rows / total_instances
        subset2_probability = subset2_rows / total_instances

        if algorithm in ('ID3', 'C4.5'):
            # Information gain of the split.
            score = (entropy
                     - subset1_probability
                     * Training.calculateEntropy(subset1, config)
                     - subset2_probability
                     * Training.calculateEntropy(subset2, config))

            if algorithm == 'C4.5':
                # C4.5 ranks by gain ratio: gain divided by split info.
                split_info = (
                    -subset1_probability * math.log(subset1_probability, 2)
                    - subset2_probability * math.log(subset2_probability, 2))
                score = score / split_info

        elif algorithm == 'CART':
            # Weighted gini impurity of the two subsets.
            gini_subset1 = 1
            for count in subset1['Decision'].value_counts().tolist():
                gini_subset1 = gini_subset1 - math.pow(
                    count / subset1_rows, 2)

            gini_subset2 = 1
            for count in subset2['Decision'].value_counts().tolist():
                gini_subset2 = gini_subset2 - math.pow(
                    count / subset2_rows, 2)

            score = (subset1_probability * gini_subset1
                     + subset2_probability * gini_subset2)

        else:  # 'Regression'
            # Reduction of the standard deviation achieved by the split.
            weighted_stdev = (
                subset1_probability * subset1['Decision'].std(ddof=0)
                + subset2_probability * subset2['Decision'].std(ddof=0))
            score = superset_stdev - weighted_stdev

        scores.append(score)

    # Gini is an impurity (lower is better); every other metric is maximized.
    best_score = min(scores) if algorithm == 'CART' else max(scores)
    return binarize(unique_values[scores.index(best_score)])