Example #1
def addEpochAlpha(epoch, alpha):
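    #emit an "if epoch == N: return alpha" branch into the generated alphas rule file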
    file = "outputs/rules/alphas.py"
    content = "   if epoch == " + str(epoch) + ":\n"
    content += "      return " + str(alpha)
    functions.storeRule(file, content)
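Repeated calls append one branch per epoch, so outputs/rules/alphas.py plausibly accumulates into a simple lookup function. A minimal sketch of the expected file, assuming functions.storeRule appends its content and that a surrounding def header is written elsewhere; the function name and alpha values below are invented:

#outputs/rules/alphas.py (illustrative)
def alpha(epoch):
   if epoch == 0:
      return 0.3944
   if epoch == 1:
      return 0.2107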
Example #2
def buildDecisionTree(df, root, file, config, dataset_features):

	if root == 1:
		if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
			raw_df = df.copy()
	
	algorithm = config['algorithm']
	enableAdaboost = config['enableAdaboost']
	
	#--------------------------------------
	
	#print(df.shape)
	charForResp = "'"
	if algorithm == 'Regression':
		charForResp = ""

	tmp_root = root * 1 #remember the current depth; it is restored after each branch recursion
	
	df_copy = df.copy()
	
	winner_name = findDecision(df, config)
	
	#find the winner's index; findDecision cannot return it because columns were dropped in earlier steps
	j = 0 
	for i in dataset_features:
		if i == winner_name:
			winner_index = j
		j = j + 1
	
	numericColumn = False
	if dataset_features[winner_name] != 'object':
		numericColumn = True
	
	#restoration: findDecision may have altered numeric columns while evaluating splits; restore non-winner numeric columns from the untouched copy
	columns = df.shape[1]
	for i in range(0, columns-1):
		column_name = df.columns[i]
		column_type = df[column_name].dtypes
		if column_type != 'object' and column_name != winner_name:
			df[column_name] = df_copy[column_name]
	
	classes = df[winner_name].value_counts().keys().tolist()

	for i in range(0,len(classes)):
		current_class = classes[i]
		subdataset = df[df[winner_name] == current_class]
		subdataset = subdataset.drop(columns=[winner_name])
		
		if numericColumn == True:
			compareTo = current_class #current class might be <=x or >x in this case
		else:
			compareTo = " == '"+str(current_class)+"'"
		
		#print(subdataset)
		
		terminateBuilding = False
		
		#-----------------------------------------------
		#can decision be made?
		
		if enableAdaboost == True:
			#final_decision = subdataset['Decision'].value_counts().idxmax()
			final_decision = subdataset['Decision'].mean() #get average
			terminateBuilding = True
		elif len(subdataset['Decision'].value_counts().tolist()) == 1:
			final_decision = subdataset['Decision'].value_counts().keys().tolist()[0] #all items are equal in this case
			terminateBuilding = True
		elif subdataset.shape[1] == 1: #no features left to split on; only the Decision column remains
			final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one
			terminateBuilding = True
		elif algorithm == 'Regression' and subdataset.shape[0] < 5: #pruning condition
		#elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
			final_decision = subdataset['Decision'].mean() #get average
			terminateBuilding = True
		#-----------------------------------------------
		
		if i == 0:
			check_condition = "if"
		else:
			check_condition = "elif"
		
		functions.storeRule(file,(functions.formatRule(root),"",check_condition," obj[",str(winner_index),"]",compareTo,":"))
		
		#-----------------------------------------------
		
		if terminateBuilding == True: #a decision was made; emit the leaf
			functions.storeRule(file,(functions.formatRule(root+1),"return ",charForResp+str(final_decision)+charForResp))
			
		else: #no decision yet; continue creating branches and leaves
			root = root + 1 #the next rules will be nested under this one; increase depth
			buildDecisionTree(subdataset, root, file, config, dataset_features)
		
		root = tmp_root * 1
	
	#---------------------------------------------
	
	#calculate accuracy metrics
	if root == 1:
		if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
			#this is a regular decision tree; compute accuracy here
			moduleName = "outputs/rules/rules"
			fp, pathname, description = imp.find_module(moduleName)
			myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
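			#note: the imp module is deprecated since Python 3.4; an importlib-based
			#equivalent would be roughly (sketch, untested against this layout):
			#  import importlib.util
			#  spec = importlib.util.spec_from_file_location(moduleName, moduleName + ".py")
			#  myrules = importlib.util.module_from_spec(spec)
			#  spec.loader.exec_module(myrules)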
			
			num_of_features = df.shape[1] - 1
			instances = df.shape[0]
			classified = 0
			mae = 0
			mse = 0
			
			#pandas operations handle this more efficiently than explicit for loops
			raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
			if algorithm != 'Regression':
				idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index
				
				#raw_df['Classified'] = 0
				#raw_df.loc[idx, 'Classified'] = 1
				#print(raw_df)
				
				accuracy = 100*len(idx)/instances
				print("Accuracy: ", accuracy,"% on ",instances," instances")
			else:
				raw_df['Absolute_Error'] = abs(raw_df['Prediction'] - raw_df['Decision'])
				raw_df['Absolute_Error_Squared'] = raw_df['Absolute_Error'] * raw_df['Absolute_Error']
				
				#print(raw_df)
				
				mae = raw_df['Absolute_Error'].sum()/instances
				print("MAE: ",mae)
				
				mse = raw_df['Absolute_Error_Squared'].sum()/instances
				rmse = math.sqrt(mse)
				print("RMSE: ",rmse)
				
				mean = raw_df['Decision'].mean()
				print("Mean: ", mean)
				
				if mean > 0:
					print("MAE / Mean: ",100*mae/mean,"%")
					print("RMSE / Mean: ",100*rmse/mean,"%")
Example #3
def buildDecisionTree(df, root, file, config, dataset_features):
	
	algorithm = config['algorithm']
	enableAdaboost = config['enableAdaboost']
	debug = config['debug'] 
	
	#--------------------------------------
	
	#print(df.shape)
	charForResp = "'"
	if algorithm == 'Regression':
		charForResp = ""

	tmp_root = root * 1
	
	df_copy = df.copy()
	
	winner_name = findDecision(df, config)
	
	#find the winner's index; findDecision cannot return it because columns were dropped in earlier steps
	j = 0 
	for i in dataset_features:
		if i == winner_name:
			winner_index = j
		j = j + 1
	
	numericColumn = False
	if dataset_features[winner_name] != 'object':
		numericColumn = True
	
	#restoration: findDecision may have altered numeric columns while evaluating splits; restore non-winner numeric columns from the untouched copy
	columns = df.shape[1]
	for i in range(0, columns-1):
		column_name = df.columns[i]
		column_type = df[column_name].dtypes
		if column_type != 'object' and column_name != winner_name:
			df[column_name] = df_copy[column_name]
	
	classes = df[winner_name].value_counts().keys().tolist()

	for i in range(0,len(classes)):
		current_class = classes[i]
		subdataset = df[df[winner_name] == current_class]
		subdataset = subdataset.drop(columns=[winner_name])
		
		if numericColumn == True:
			compareTo = current_class #current class might be <=x or >x in this case
		else:
			compareTo = " == '"+str(current_class)+"'"
		
		#print(subdataset)
		
		terminateBuilding = False
		
		#-----------------------------------------------
		#can decision be made?
		
		if enableAdaboost == True:
			#final_decision = subdataset['Decision'].value_counts().idxmax()
			final_decision = subdataset['Decision'].mean() #get average
			terminateBuilding = True
		elif len(subdataset['Decision'].value_counts().tolist()) == 1:
			final_decision = subdataset['Decision'].value_counts().keys().tolist()[0] #all items are equal in this case
			terminateBuilding = True
		elif subdataset.shape[1] == 1: #no features left to split on; only the Decision column remains
			final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one
			terminateBuilding = True
		elif algorithm == 'Regression' and subdataset.shape[0] < 5: #pruning condition
		#elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
			final_decision = subdataset['Decision'].mean() #get average
			terminateBuilding = True
		#-----------------------------------------------
		
		if debug == True:
			print(functions.formatRule(root),"if ",winner_name,compareTo,":")
		else:
			functions.storeRule(file,(functions.formatRule(root),"if obj[",str(winner_index),"]",compareTo,":"))
		
		#-----------------------------------------------
		
		if terminateBuilding == True: #a decision was made; emit (or print) the leaf
			if debug == True:
				print(functions.formatRule(root+1),"return ",charForResp+str(final_decision)+charForResp)
			else:
				functions.storeRule(file,(functions.formatRule(root+1),"return ",charForResp+str(final_decision)+charForResp))
		else: #no decision yet; continue creating branches and leaves
			root = root + 1 #the next rules will be nested under this one; increase depth
			buildDecisionTree(subdataset, root, file, config, dataset_features)
		
		root = tmp_root * 1
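With debug enabled this variant prints each rule instead of storing it; either way every node becomes a plain Python if-statement over the raw feature vector obj, indented by functions.formatRule according to depth. A hypothetical fragment of the resulting rules module for a categorical tree (the function name, feature indices, and values are all invented):

def findDecision(obj):
   if obj[0] == 'Sunny':
      if obj[2] == 'High':
         return 'No'
      if obj[2] == 'Normal':
         return 'Yes'
   if obj[0] == 'Rain':
      return 'Yes'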
Example #4
def buildDecisionTree(df, root, file, config, dataset_features):

    models = []

    if root == 1:
        if config['RandomForest'] != True:
            raw_df = df.copy()

    algorithm = config['algorithm']

    #--------------------------------------

    #print(df.shape)
    charForResp = "'"
    if algorithm == 'Regression':
        charForResp = ""

    tmp_root = root * 1

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration: findDecision may have altered numeric columns while evaluating splits; restore non-winner numeric columns from the untouched copy
    columns = df.shape[1]
    for i in range(0, columns - 1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])

        if numericColumn == True:
            compareTo = current_class
        else:
            compareTo = " == '" + str(current_class) + "'"

        #print(subdataset)

        terminateBuilding = False

        #-----------------------------------------------
        if len(subdataset['Decision'].value_counts().tolist()) == 1:
            final_decision = subdataset['Decision'].value_counts().keys().tolist()[0]
            terminateBuilding = True
        elif subdataset.shape[1] == 1:
            final_decision = subdataset['Decision'].value_counts().idxmax()
            terminateBuilding = True
        elif algorithm == 'Regression' and subdataset.shape[0] < 5: #pruning condition
            final_decision = subdataset['Decision'].mean()
            terminateBuilding = True
        #-----------------------------------------------

        if i == 0:
            check_condition = "if"
        else:
            check_condition = "elif"

        functions.storeRule(file,
                            (functions.formatRule(root), "", check_condition,
                             " obj[", str(winner_index), "]", compareTo, ":"))

        #-----------------------------------------------

        if terminateBuilding == True:
            functions.storeRule(
                file, (functions.formatRule(root + 1), "return ",
                       charForResp + str(final_decision) + charForResp))

        else:
            root = root + 1
            buildDecisionTree(subdataset, root, file, config, dataset_features)

        root = tmp_root * 1

    #---------------------------------------------

    #calculate accuracy metrics
    if root == 1:
        if config['RandomForest'] != True:

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description)
            models.append(myrules)

            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            #print(raw_df['Prediction'])

            #exit()
            if algorithm != 'Regression':
                #binary classification metrics; the loop below assumes 'Yes'/'No' labels
                True_positive = 0
                True_negative = 0
                False_positive = 0
                False_negative = 0
                for i in range(instances):
                    if (raw_df['Prediction'][i] == raw_df['Decision'][i]
                            and raw_df['Prediction'][i] == 'Yes'):
                        True_positive = True_positive + 1

                    if ((raw_df['Prediction'][i] != raw_df['Decision'][i]
                         and raw_df['Prediction'][i] == 'No')):
                        False_negative = False_negative + 1

                    if ((raw_df['Prediction'][i] != raw_df['Decision'][i]
                         and raw_df['Prediction'][i] == 'Yes')):
                        False_positive = False_positive + 1

                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                accuracy = 100 * len(idx) / instances
                precision = 100 * True_positive / (True_positive +
                                                   False_positive)
                recall = 100 * True_positive / (True_positive + False_negative)

                F1_score = 2 * (precision * recall) / (precision + recall)
                print("accuracy: ", accuracy, "% trên tổng số ", instances,
                      "phép thử")
                print("precision: ", precision, "% ")
                print("recall: ", recall, "% ")
                print("F1 score: ", F1_score)

            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] -
                                               raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df[
                    'Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models
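The per-row metric loop above hard-codes binary 'Yes'/'No' labels, never updates True_negative, indexes raw_df positionally (so it assumes a default RangeIndex), and divides by zero when there are no positive predictions. A vectorized sketch of the same counts with guarded denominators; this is a hypothetical refactoring, not part of the original:

pred, true = raw_df['Prediction'], raw_df['Decision']
tp = int(((pred == true) & (pred == 'Yes')).sum())  #true positives
fp = int(((pred != true) & (pred == 'Yes')).sum())  #false positives
fn = int(((pred != true) & (pred == 'No')).sum())   #false negatives

precision = 100 * tp / (tp + fp) if tp + fp else 0.0
recall = 100 * tp / (tp + fn) if tp + fn else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0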