Example 1
0
def findDecision(df, config):
    """Find the most dominant feature to split on for the configured algorithm.

    Args:
        df: pandas DataFrame whose last column is named "Decision" (the target);
            all preceding columns are candidate split features.
        config: dict holding at least the key 'algorithm', one of
            'ID3', 'C4.5', 'CART', 'CHAID' or 'Regression'.

    Returns:
        Tuple of (winner_column_name, num_instances, metric_value, metric_name)
        where metric_name is "Entropy", "Gini", "ChiSquared" or "Std".

    Raises:
        ValueError: if config['algorithm'] is not one of the supported names.
            (Previously an unsupported name fell through every branch and
            crashed with an opaque UnboundLocalError on winner_index.)

    Note: df may be re-assigned by Preprocess.processContinuousFeatures, which
    discretizes continuous (non-object) columns before evaluation.
    """
    algorithm = config['algorithm']
    decision_classes = df["Decision"].unique()

    # Regression trees split on standard-deviation reduction of the target.
    if algorithm == 'Regression':
        stdev = df['Decision'].std(ddof=0)

    # Entropy of the whole data set — only meaningful for ID3 / C4.5.
    entropy = 0
    if algorithm == "ID3" or algorithm == "C4.5":
        entropy = calculateEntropy(df, config)

    columns = df.shape[1]; instances = df.shape[0]

    gains = []; gainratios = []; ginis = []; reducted_stdevs = []; chi_squared_values = []

    # Evaluate every feature column; the last column is the Decision target.
    for i in range(0, columns - 1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes

        # Continuous features are discretized first; note df is re-assigned.
        if column_type != 'object':
            df = Preprocess.processContinuousFeatures(algorithm, df, column_name, entropy, config)

        classes = df[column_name].value_counts()

        gain = entropy * 1; splitinfo = 0; gini = 0; weighted_stdev = 0; chi_squared_value = 0

        for j in range(0, len(classes)):
            current_class = classes.keys().tolist()[j]

            subdataset = df[df[column_name] == current_class]

            subset_instances = subdataset.shape[0]
            class_probability = subset_instances / instances

            if algorithm == 'ID3' or algorithm == 'C4.5':
                # Information gain: global entropy minus weighted subset entropies.
                subset_entropy = calculateEntropy(subdataset, config)
                gain = gain - class_probability * subset_entropy

            if algorithm == 'C4.5':
                splitinfo = splitinfo - class_probability * math.log(class_probability, 2)

            elif algorithm == 'CART':  # GINI index
                decision_list = subdataset['Decision'].value_counts().tolist()

                subgini = 1
                for k in range(0, len(decision_list)):
                    subgini = subgini - math.pow((decision_list[k] / subset_instances), 2)

                # Weighted impurity of this branch.
                gini = gini + (subset_instances / instances) * subgini

            elif algorithm == 'CHAID':
                num_of_decisions = len(decision_classes)

                # Expected count under a uniform split across decision classes.
                expected = subset_instances / num_of_decisions

                for d in decision_classes:
                    num_of_d = subdataset[subdataset["Decision"] == d].shape[0]

                    # NOTE(review): this is the square root of each chi-square
                    # term, i.e. |observed - expected| / sqrt(expected), summed
                    # without squaring back — kept as-is to preserve behavior.
                    chi_square_of_d = math.sqrt(((num_of_d - expected) * (num_of_d - expected)) / expected)

                    chi_squared_value += chi_square_of_d

            elif algorithm == 'Regression':
                subset_stdev = subdataset['Decision'].std(ddof=0)
                weighted_stdev = weighted_stdev + (subset_instances / instances) * subset_stdev

        # -- per-feature metric accumulation -----------------------------

        if algorithm == "ID3":
            gains.append(gain)

        elif algorithm == "C4.5":
            if splitinfo == 0:
                # Can happen when the data set consists of 2 rows and the
                # current column has a single class; decisions may still agree.
                # A very large splitinfo makes the gain ratio tiny so this
                # column is never chosen as the most dominant one.
                splitinfo = 100

            gainratio = gain / splitinfo
            gainratios.append(gainratio)

        elif algorithm == "CART":
            ginis.append(gini)

        elif algorithm == "CHAID":
            chi_squared_values.append(chi_squared_value)

        elif algorithm == 'Regression':
            reducted_stdev = stdev - weighted_stdev
            reducted_stdevs.append(reducted_stdev)

    # -- pick the winning feature per algorithm's metric -----------------

    if algorithm == "ID3":
        winner_index = gains.index(max(gains))
        metric_value = entropy
        metric_name = "Entropy"
    elif algorithm == "C4.5":
        winner_index = gainratios.index(max(gainratios))
        metric_value = entropy
        metric_name = "Entropy"
    elif algorithm == "CART":
        winner_index = ginis.index(min(ginis))
        metric_value = min(ginis)
        metric_name = "Gini"
    elif algorithm == "CHAID":
        winner_index = chi_squared_values.index(max(chi_squared_values))
        metric_value = max(chi_squared_values)
        metric_name = "ChiSquared"
    elif algorithm == "Regression":
        winner_index = reducted_stdevs.index(max(reducted_stdevs))
        metric_value = max(reducted_stdevs)
        metric_name = "Std"
    else:
        # Fail fast instead of the original UnboundLocalError fall-through.
        raise ValueError("unsupported algorithm: " + str(algorithm))
    winner_name = df.columns[winner_index]

    return winner_name, df.shape[0], metric_value, metric_name
Example 2
0
def findGains(df, config):
    """Compute the split metric of every feature column for the configured algorithm.

    Args:
        df: pandas DataFrame whose last column is named "Decision" (the target);
            every preceding column is treated as a candidate split feature.
        config: dict holding at least the key 'algorithm' ('ID3', 'C4.5',
            'CART', 'CHAID' or 'Regression').

    Returns:
        Dict of the form {"gains": {feature_name: metric, ...}, "entropy": e}
        where e is the whole-set entropy (0 unless ID3/C4.5).
    """
    algorithm = config['algorithm']
    decision_classes = df["Decision"].unique()

    # Whole-set entropy is only meaningful for the information-gain family.
    entropy = calculateEntropy(df, config) if algorithm in ("ID3", "C4.5") else 0

    num_instances = df.shape[0]
    num_columns = df.shape[1]

    gains = []

    for col_idx in range(num_columns - 1):  # last column is the Decision target
        feature = df.columns[col_idx]

        # Discretize continuous (non-object) features; df is re-assigned here.
        if df[feature].dtypes != 'object':
            df = Preprocess.processContinuousFeatures(algorithm, df, feature,
                                                      entropy, config)

        branch_counts = df[feature].value_counts()

        split_info = 0
        # ID3/C4.5 subtract weighted branch entropies from the global entropy;
        # the other algorithms accumulate their metric from zero.
        gain = entropy * 1 if algorithm in ('ID3', 'C4.5') else 0

        for branch_value in branch_counts.keys().tolist():
            branch = df[df[feature] == branch_value]

            branch_size = branch.shape[0]
            branch_prob = branch_size / num_instances

            if algorithm in ('ID3', 'C4.5'):
                gain = gain - branch_prob * calculateEntropy(branch, config)

            if algorithm == 'C4.5':
                split_info = split_info - branch_prob * math.log(branch_prob, 2)

            elif algorithm == 'CART':  # GINI index
                decision_counts = branch['Decision'].value_counts().tolist()
                branch_gini = 1
                for count in decision_counts:
                    branch_gini = branch_gini - math.pow(count / branch_size, 2)
                gain = gain + (branch_size / num_instances) * branch_gini

            elif algorithm == 'CHAID':
                # Expected count if decisions were spread uniformly.
                expected = branch_size / len(decision_classes)
                for decision in decision_classes:
                    observed = branch[branch["Decision"] == decision].shape[0]
                    gain += math.sqrt(
                        ((observed - expected) * (observed - expected)) / expected)

            elif algorithm == 'Regression':
                gain = gain + (branch_size / num_instances) * \
                    branch['Decision'].std(ddof=0)

        # -- per-feature post-processing ---------------------------------

        if algorithm == 'Regression':
            # Standard-deviation reduction: whole-set stdev minus weighted sum.
            gain = df['Decision'].std(ddof=0) - gain
        if algorithm == 'C4.5':
            if split_info == 0:
                # Single-class column (e.g. a 2-row data set): force a huge
                # split info so the gain ratio becomes negligible and this
                # column never wins.
                split_info = 100
            gain = gain / split_info

        gains.append(gain)

    # -----------------------------------------------------------------

    # Decision is always the last column, hence [0:-1].
    resp_obj = {
        "gains": {feature: gains[idx]
                  for idx, feature in enumerate(df.columns[0:-1])},
        "entropy": entropy,
    }

    return resp_obj