def findDecision(df, config):
    """Find the most dominant feature to split the data set on.

    Args:
        df: pandas DataFrame whose last column is named "Decision".
        config: dict holding at least 'algorithm' — one of 'ID3', 'C4.5',
            'CART', 'CHAID' or 'Regression'.

    Returns:
        Tuple of (winner_column_name, number_of_instances,
        metric_value, metric_name).
    """
    algorithm = config['algorithm']
    decision_classes = df["Decision"].unique()

    # -----------------------------
    # Regression trees split on standard-deviation reduction.
    if algorithm == 'Regression':
        stdev = df['Decision'].std(ddof=0)

    entropy = 0
    if algorithm == "ID3" or algorithm == "C4.5":
        entropy = calculateEntropy(df, config)
        # print("entropy: ", entropy)

    columns = df.shape[1]
    instances = df.shape[0]

    gains = []
    gainratios = []
    ginis = []
    reducted_stdevs = []
    chi_squared_values = []

    for i in range(0, columns - 1):
        # Re-read df.columns each iteration: df may be rebuilt below by
        # Preprocess.processContinuousFeatures.
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        # print(column_name, "->", column_type)

        if column_type != 'object':
            # Discretize continuous features before they can be evaluated.
            df = Preprocess.processContinuousFeatures(algorithm, df, column_name, entropy, config)

        classes = df[column_name].value_counts()

        gain = entropy * 1  # only meaningful for ID3 / C4.5
        splitinfo = 0
        gini = 0
        weighted_stdev = 0
        chi_squared_value = 0

        # Iterate the class labels directly instead of re-materializing
        # classes.keys().tolist() on every pass (was O(k^2) per feature).
        for current_class in classes.index:
            # print(column_name, "->", current_class)
            subdataset = df[df[column_name] == current_class]
            # print(subdataset)

            subset_instances = subdataset.shape[0]
            class_probability = subset_instances / instances

            if algorithm == 'ID3' or algorithm == 'C4.5':
                subset_entropy = calculateEntropy(subdataset, config)
                # print("entropy for this sub dataset is ", subset_entropy)
                gain = gain - class_probability * subset_entropy

            if algorithm == 'C4.5':
                splitinfo = splitinfo - class_probability * math.log(class_probability, 2)
            elif algorithm == 'CART':  # GINI index
                decision_list = subdataset['Decision'].value_counts().tolist()

                subgini = 1
                for decision_count in decision_list:
                    subgini = subgini - math.pow(decision_count / subset_instances, 2)

                gini = gini + (subset_instances / instances) * subgini
            elif algorithm == 'CHAID':
                num_of_decisions = len(decision_classes)
                expected = subset_instances / num_of_decisions

                for d in decision_classes:
                    num_of_d = subdataset[subdataset["Decision"] == d].shape[0]
                    chi_square_of_d = math.sqrt(((num_of_d - expected) * (num_of_d - expected)) / expected)
                    chi_squared_value += chi_square_of_d
            elif algorithm == 'Regression':
                subset_stdev = subdataset['Decision'].std(ddof=0)
                weighted_stdev = weighted_stdev + (subset_instances / instances) * subset_stdev

        # iterating over classes for loop end
        # -------------------------------

        if algorithm == "ID3":
            gains.append(gain)
        elif algorithm == "C4.5":
            if splitinfo == 0:
                # This can be if data set consists of 2 rows and current column
                # consists of 1 class. Still decision can be made (decisions for
                # these 2 rows same). Set splitinfo to very large value to make
                # gain ratio very small. In this way, we won't find this column
                # as the most dominant one.
                splitinfo = 100

            gainratio = gain / splitinfo
            gainratios.append(gainratio)
        elif algorithm == "CART":
            ginis.append(gini)
        elif algorithm == "CHAID":
            chi_squared_values.append(chi_squared_value)
        elif algorithm == 'Regression':
            reducted_stdev = stdev - weighted_stdev
            reducted_stdevs.append(reducted_stdev)

    # print(df)

    if algorithm == "ID3":
        winner_index = gains.index(max(gains))
        metric_value = entropy
        metric_name = "Entropy"
    elif algorithm == "C4.5":
        winner_index = gainratios.index(max(gainratios))
        metric_value = entropy
        metric_name = "Entropy"
    elif algorithm == "CART":
        # CART prefers the purest (minimum Gini) split.
        winner_index = ginis.index(min(ginis))
        metric_value = min(ginis)
        metric_name = "Gini"
    elif algorithm == "CHAID":
        winner_index = chi_squared_values.index(max(chi_squared_values))
        metric_value = max(chi_squared_values)
        metric_name = "ChiSquared"
    elif algorithm == "Regression":
        winner_index = reducted_stdevs.index(max(reducted_stdevs))
        metric_value = max(reducted_stdevs)
        metric_name = "Std"

    winner_name = df.columns[winner_index]

    return winner_name, df.shape[0], metric_value, metric_name
def findGains(df, config):
    """Compute the split metric of every feature column of the data set.

    Args:
        df: pandas DataFrame whose last column is named "Decision".
        config: dict holding at least 'algorithm' — one of 'ID3', 'C4.5',
            'CART', 'CHAID' or 'Regression'.

    Returns:
        dict of the shape {"gains": {feature_name: metric_value, ...},
        "entropy": data_set_entropy} where entropy is 0 unless the
        algorithm is ID3 or C4.5.
    """
    algorithm = config['algorithm']
    decision_classes = df["Decision"].unique()

    # -----------------------------
    entropy = 0
    if algorithm == "ID3" or algorithm == "C4.5":
        entropy = calculateEntropy(df, config)

    columns = df.shape[1]
    instances = df.shape[0]

    gains = []

    for i in range(0, columns - 1):
        # Re-read df.columns each iteration: df may be rebuilt below by
        # Preprocess.processContinuousFeatures.
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        # print(column_name, "->", column_type)

        if column_type != 'object':
            # Discretize continuous features before they can be evaluated.
            df = Preprocess.processContinuousFeatures(algorithm, df, column_name, entropy, config)

        classes = df[column_name].value_counts()

        splitinfo = 0
        if algorithm == 'ID3' or algorithm == 'C4.5':
            gain = entropy * 1
        else:
            gain = 0

        # Iterate the class labels directly instead of re-materializing
        # classes.keys().tolist() on every pass (was O(k^2) per feature).
        for current_class in classes.index:
            # print(column_name, "->", current_class)
            subdataset = df[df[column_name] == current_class]
            # print(subdataset)

            subset_instances = subdataset.shape[0]
            class_probability = subset_instances / instances

            if algorithm == 'ID3' or algorithm == 'C4.5':
                subset_entropy = calculateEntropy(subdataset, config)
                gain = gain - class_probability * subset_entropy

            if algorithm == 'C4.5':
                splitinfo = splitinfo - class_probability * math.log(class_probability, 2)
            elif algorithm == 'CART':  # GINI index
                decision_list = subdataset['Decision'].value_counts().tolist()

                subgini = 1
                for decision_count in decision_list:
                    subgini = subgini - math.pow(decision_count / subset_instances, 2)

                gain = gain + (subset_instances / instances) * subgini
            elif algorithm == 'CHAID':
                num_of_decisions = len(decision_classes)
                expected = subset_instances / num_of_decisions

                for d in decision_classes:
                    num_of_d = subdataset[subdataset["Decision"] == d].shape[0]
                    chi_square_of_d = math.sqrt(((num_of_d - expected) * (num_of_d - expected)) / expected)
                    gain += chi_square_of_d
            elif algorithm == 'Regression':
                subset_stdev = subdataset['Decision'].std(ddof=0)
                gain = gain + (subset_instances / instances) * subset_stdev

        # iterating over classes for loop end
        # -------------------------------

        if algorithm == 'Regression':
            # Standard-deviation reduction: global stdev minus weighted stdev.
            stdev = df['Decision'].std(ddof=0)
            gain = stdev - gain

        if algorithm == 'C4.5':
            if splitinfo == 0:
                # This can be if data set consists of 2 rows and current column
                # consists of 1 class. Still decision can be made (decisions for
                # these 2 rows same). Set splitinfo to very large value to make
                # gain ratio very small. In this way, we won't find this column
                # as the most dominant one.
                splitinfo = 100

            gain = gain / splitinfo

        # ----------------------------------
        gains.append(gain)

    # -------------------------------------------------
    resp_obj = {}
    resp_obj["gains"] = {}

    for idx, feature in enumerate(df.columns[0:-1]):  # Decision is always the last column
        # print(idx, feature)
        resp_obj["gains"][feature] = gains[idx]

    resp_obj["entropy"] = entropy

    return resp_obj