Example #1
import imp  # note: the imp module is deprecated; importlib is the modern replacement

from chefboost.commons import functions  # assumed import path for the sign() helper


def findPrediction(row):
    epoch = row['Epoch']
    row = row.drop(labels=['Epoch'])
    columns = row.shape[0]

    # collect the feature values; the last entry is the 'Decision' label, so it is skipped
    params = []
    for j in range(0, columns - 1):
        params.append(row[j])  # positional access; newer pandas prefers row.iloc[j]

    # dynamically import the rule module generated for this boosting epoch
    moduleName = "outputs/rules/rules_%d" % (epoch)
    fp, pathname, description = imp.find_module(moduleName)
    myrules = imp.load_module(moduleName, fp, pathname, description)

    prediction = functions.sign(myrules.findDecision(params))

    return prediction
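
A minimal driver sketch, assuming a frame raw_df that carries the 'Epoch' column plus the feature and 'Decision' columns, and that the per-epoch rule modules already exist under outputs/rules/ (this mirrors the raw_df.apply call in Example #6):

# apply the row-wise predictor across the whole frame
raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)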
Example #2
import copy
import json
import uuid

from chefboost.commons import functions  # assumed import path for the rule-writing helpers
# buildDecisionTree is assumed to be defined in this same module (see Example #6)


def createBranch(config, current_class, subdataset, numericColumn, branch_index
	, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric):
	
	algorithm = config['algorithm']
	enableAdaboost = config['enableAdaboost']
	enableGBM = config['enableGBM']
	max_depth = config['max_depth']
	enableParallelism = config['enableParallelism']
	
	charForResp = "'"
	if algorithm == 'Regression':
		charForResp = ""
	
	#---------------------------
	
	json_file = file.split(".")[0]+".json"
	
	tmp_root = root * 1 #copy the current depth by value
	parents_raw = copy.copy(parents) #remember the parent chain so it can be restored after recursion
	
	#---------------------------
	
	if numericColumn == True:
		compareTo = current_class #current class might be <=x or >x in this case
	else:
		compareTo = " == '"+str(current_class)+"'"
	
	#print(subdataset)
	
	terminateBuilding = False
	
	#-----------------------------------------------
	#can decision be made?
	
	if enableGBM == True and root >= max_depth: #max depth
		final_decision = subdataset['Decision'].mean()
		terminateBuilding = True
	elif enableAdaboost == True:
		#final_decision = subdataset['Decision'].value_counts().idxmax()
		final_decision = functions.sign(subdataset['Decision'].mean()) #get average
		terminateBuilding = True
		enableParallelism = False
	elif len(subdataset['Decision'].value_counts().tolist()) == 1:
		final_decision = subdataset['Decision'].value_counts().keys().tolist()[0] #all items are equal in this case
		terminateBuilding = True
	elif subdataset.shape[1] == 1: #if decision cannot be made even though all columns dropped
		final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one
		terminateBuilding = True
	elif algorithm == 'Regression' and subdataset.shape[0] < 5: #pruning condition
	#elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
		final_decision = subdataset['Decision'].mean() #get average
		terminateBuilding = True
	
	#-----------------------------------------------
	
	if enableParallelism == True:
		check_condition = "if" #TODO: elif checks might be above than if statements in parallel
	else:	
		if branch_index == 0:
			check_condition = "if"
		else:
			check_condition = "elif"
	
	check_rule = check_condition+" obj["+str(winner_index)+"]"+compareTo+":"
	
	leaf_id = str(uuid.uuid1())
	custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
		
	if enableParallelism != True:
		
		#check_rule += " # feature: "+winner_name+", instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
		
		functions.storeRule(file,(functions.formatRule(root),"",check_rule))
	else:
		
		sample_rule = {}
		sample_rule["current_level"] = root
		sample_rule["leaf_id"] = leaf_id
		sample_rule["parents"] = parents
		sample_rule["rule"] = check_rule
		sample_rule["feature_idx"] = winner_index
		sample_rule["feature_name"] = winner_name
		sample_rule["instances"] = num_of_instances
		sample_rule["metric"] = metric
		sample_rule["return_statement"] = 0
		
		#json to string
		sample_rule = json.dumps(sample_rule)
	
		functions.createFile(custom_rule_file, "")
		functions.storeRule(custom_rule_file, sample_rule)
	
	#-----------------------------------------------
	
	if terminateBuilding == True: #a decision has been made
		
		parents = copy.copy(leaf_id)
		leaf_id = str(uuid.uuid1())
		
		decision_rule = "return "+charForResp+str(final_decision)+charForResp
		
		if enableParallelism != True:
			#serial
			functions.storeRule(file,(functions.formatRule(root+1),decision_rule))
		else:
			#parallel			
			sample_rule = {}
			sample_rule["current_level"] = root+1
			sample_rule["leaf_id"] = leaf_id
			sample_rule["parents"] = parents
			sample_rule["rule"] = decision_rule
			sample_rule["feature_idx"] = winner_index
			sample_rule["feature_name"] = winner_name
			sample_rule["instances"] = num_of_instances
			sample_rule["metric"] = 0
			sample_rule["return_statement"] = 1
			
			#json to string
			sample_rule = ", "+json.dumps(sample_rule)
			
			functions.storeRule(custom_rule_file, sample_rule)
	
	else: #decision is not made, continue to create branches and leaves
		root = root + 1 #the next rule will be nested under this one; increase the depth
		parents = copy.copy(leaf_id)
		
		buildDecisionTree(subdataset, root, file, config, dataset_features
			, root-1, leaf_id, parents)
					
		root = tmp_root * 1
		parents = copy.copy(parents_raw)
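
In parallel mode the branch is not written straight into the rules file; each rule is staged as a one-line JSON record in its own leaf file and merged later. For illustration, a staged record could look like this (all field values hypothetical):

{"current_level": 1, "leaf_id": "f3a9", "parents": "root", "rule": "if obj[0] == 'Sunny':", "feature_idx": 0, "feature_name": "Outlook", "instances": 14, "metric": 0.94, "return_statement": 0}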
Example #3
import pandas as pd
import numpy as np

from chefboost.commons import functions  # assumed import path for the sign() helper


def predict(model, param):
	
	trees = model["trees"]
	config = model["config"]
	alphas = model["alphas"]
	nan_values = model["nan_values"]
	
	#-----------------------
	#handle missing values
	
	column_index = 0
	for column in nan_values:
		column_name = column[0]
		missing_value = column[1]
		
		if pd.isna(missing_value) != True:
			#print("missing values will be replaced with ",missing_value," in ",column_name," column")
			
			if pd.isna(param[column_index]):
				param[column_index] = missing_value
			
		column_index = column_index + 1
			
	#print("instance: ", param)
	#-----------------------
	
	enableGBM = config['enableGBM']
	adaboost = config['enableAdaboost']
	
	#-----------------------
	
	classification = False
	prediction = 0
	prediction_classes = []
	
	#-----------------------
	
	if enableGBM == True:
		
		if len(trees) == config['epochs']:
			classification = False
		else:
			classification = True
			prediction_classes = [0 for i in alphas]
		
	#-----------------------
	
	if len(trees) > 1: #boosting
		index = 0
		for tree in trees:
			if adaboost != True:
				
				custom_prediction = tree.findDecision(param)
				
				if custom_prediction != None:
					if type(custom_prediction) != str: #regression
						
						if enableGBM == True and classification == True:
							prediction_classes[index % len(alphas)] += custom_prediction
						else:
							prediction += custom_prediction
					else:
						classification = True
						prediction_classes.append(custom_prediction)
			else:
				prediction += alphas[index] * tree.findDecision(param)
			index = index + 1
		
		if adaboost == True:
			prediction = functions.sign(prediction)
	else: #regular decision tree
		tree = trees[0]
		prediction = tree.findDecision(param)
	
	if classification == False:
		return prediction
	else:
		if enableGBM == True and classification == True:
			return alphas[np.argmax(prediction_classes)]
		else:
			unique_labels = np.unique(prediction_classes)
			prediction_counts = []
			
			for i in range(0, len(unique_labels)):
				count = 0
				for j in prediction_classes:
					if j == unique_labels[i]:
						count = count + 1
				prediction_counts.append(count)
			
			return unique_labels[np.argmax(prediction_counts)]
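
The counting loop above is a plain majority vote; an equivalent sketch using the standard library (tie-breaking order may differ from the np.unique version):

from collections import Counter

# most frequent label among the trees' votes
prediction = Counter(prediction_classes).most_common(1)[0][0]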
Example #4
import numpy as np

from chefboost.commons import functions  # assumed import path, as in the earlier examples


def predict(model, param):

    trees = model["trees"]
    config = model["config"]
    alphas = model["alphas"]

    #-----------------------

    enableGBM = config['enableGBM']
    adaboost = config['enableAdaboost']

    #-----------------------

    classification = False
    prediction = 0
    prediction_classes = []

    #-----------------------

    if enableGBM == True:

        if len(trees) == config['epochs']:
            classification = False
        else:
            classification = True
            prediction_classes = [0 for i in alphas]

    #-----------------------

    if len(trees) > 1:  #boosting
        index = 0
        for tree in trees:
            if adaboost != True:

                custom_prediction = tree.findDecision(param)

                if custom_prediction != None:
                    if type(custom_prediction) != str:  #regression

                        if enableGBM == True and classification == True:
                            prediction_classes[index % len(alphas)] += custom_prediction
                        else:
                            prediction += custom_prediction
                    else:
                        classification = True
                        prediction_classes.append(custom_prediction)
            else:
                prediction += alphas[index] * tree.findDecision(param)
            index = index + 1

        if adaboost == True:
            prediction = functions.sign(prediction)
    else:  #regular decision tree
        tree = trees[0]
        prediction = tree.findDecision(param)

    if classification == False:
        return prediction
    else:
        if enableGBM == True and classification == True:
            return alphas[np.argmax(prediction_classes)]
        else:
            unique_labels = np.unique(prediction_classes)
            prediction_counts = []

            for i in range(0, len(unique_labels)):
                count = 0
                for j in prediction_classes:
                    if j == unique_labels[i]:
                        count = count + 1
                prediction_counts.append(count)

            return unique_labels[np.argmax(prediction_counts)]
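
For the adaboost path, the loop computes the classic weighted vote sign(sum_i alpha_i * h_i(x)); the same aggregation as a compact sketch (assuming one alpha per tree):

# weighted AdaBoost vote over all weak learners
prediction = functions.sign(sum(alpha * tree.findDecision(param)
                                for alpha, tree in zip(alphas, trees)))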
Example #5
import pandas as pd
import numpy as np

from chefboost.commons import functions  # assumed import path for the sign() helper


def predict(model, param):
    """
    Parameters:
        model (built chefboost model): the model returned by the fit function
        param (list): the input features as a python list

        e.g. chef.predict(model, param = ['Sunny', 'Hot', 'High', 'Weak'])
    Returns:
        prediction
    """

    trees = model["trees"]
    config = model["config"]

    alphas = []
    if "alphas" in model:
        alphas = model["alphas"]

    nan_values = []
    if "nan_values" in model:
        nan_values = model["nan_values"]

    #-----------------------
    #handle missing values

    column_index = 0
    for column in nan_values:
        column_name = column[0]
        missing_value = column[1]

        if pd.isna(missing_value) != True:
            #print("missing values will be replaced with ",missing_value," in ",column_name," column")

            if pd.isna(param[column_index]):
                param[column_index] = missing_value

        column_index = column_index + 1

    #print("instance: ", param)
    #-----------------------

    enableGBM = config['enableGBM']
    adaboost = config['enableAdaboost']
    enableRandomForest = config['enableRandomForest']

    #-----------------------

    classification = False
    prediction = 0
    prediction_classes = []

    #-----------------------

    if enableGBM == True:

        if len(trees) == config['epochs']:
            classification = False
        else:
            classification = True
            prediction_classes = [0 for i in alphas]

    #-----------------------

    if len(trees) > 1:  #bagging or boosting
        index = 0
        for tree in trees:
            if adaboost != True:

                custom_prediction = tree.findDecision(param)

                if custom_prediction != None:
                    if type(custom_prediction) != str:  #regression

                        if enableGBM == True and classification == True:
                            prediction_classes[index % len(alphas)] += custom_prediction
                        else:
                            prediction += custom_prediction
                    else:
                        classification = True
                        prediction_classes.append(custom_prediction)
            else:  #adaboost
                prediction += alphas[index] * tree.findDecision(param)
            index = index + 1

        if enableRandomForest == True:
            #note that GBM needs the cumulative sum, whereas random forest needs the mean over the trees
            prediction = prediction / len(trees)

        if adaboost == True:
            prediction = functions.sign(prediction)
    else:  #regular decision tree
        tree = trees[0]
        prediction = tree.findDecision(param)

    if classification == False:
        return prediction
    else:
        if enableGBM == True and classification == True:
            return alphas[np.argmax(prediction_classes)]
        else:  #classification
            #e.g. random forest
            #get predictions made by different trees
            predictions = np.array(prediction_classes)

            #find the most frequent prediction
            (values, counts) = np.unique(predictions, return_counts=True)
            idx = np.argmax(counts)
            prediction = values[idx]

            return prediction
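
A typical end-to-end call following the docstring above, as a sketch (assuming the usual chefboost entry point and a training frame df whose target column is named 'Decision'):

from chefboost import Chefboost as chef

model = chef.fit(df, config = {'algorithm': 'C4.5'})
prediction = chef.predict(model, param = ['Sunny', 'Hot', 'High', 'Weak'])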
Example #6
import imp  # note: the imp module is deprecated; importlib is the modern replacement
import math

from chefboost.commons import functions  # assumed import path for the rule-writing helpers
# findDecision and findPrediction are assumed to be defined in this same module


def buildDecisionTree(df, root, file, config, dataset_features):

    models = []

    if root == 1:
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()  #keep a pristine copy for the accuracy pass at the end

    algorithm = config['algorithm']
    enableAdaboost = config['enableAdaboost']

    #--------------------------------------

    #print(df.shape)
    charForResp = "'"
    if algorithm == 'Regression':
        charForResp = ""

    tmp_root = root * 1  #copy the current depth by value

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find the winner's index; findDecision cannot return it because columns were dropped in earlier steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])

        if numericColumn == True:
            compareTo = current_class  #current class might be <=x or >x in this case
        else:
            compareTo = " == '" + str(current_class) + "'"

        #print(subdataset)

        terminateBuilding = False

        #-----------------------------------------------
        #can decision be made?

        if enableAdaboost == True:
            #final_decision = subdataset['Decision'].value_counts().idxmax()
            final_decision = functions.sign(subdataset['Decision'].mean())  #get average
            terminateBuilding = True
        elif len(subdataset['Decision'].value_counts().tolist()) == 1:
            final_decision = subdataset['Decision'].value_counts().keys().tolist()[0]  #all items are equal in this case
            terminateBuilding = True
        elif subdataset.shape[1] == 1:  #a decision cannot be made even though all columns were dropped
            final_decision = subdataset['Decision'].value_counts().idxmax()  #get the most frequent one
            terminateBuilding = True
        elif algorithm == 'Regression' and subdataset.shape[0] < 5:  #pruning condition
            #elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
            final_decision = subdataset['Decision'].mean()  #get average
            terminateBuilding = True
        #-----------------------------------------------

        if i == 0:
            check_condition = "if"
        else:
            check_condition = "elif"

        functions.storeRule(file,
                            (functions.formatRule(root), "", check_condition,
                             " obj[", str(winner_index), "]", compareTo, ":"))

        #-----------------------------------------------

        if terminateBuilding == True:  #check decision is made
            functions.storeRule(
                file, (functions.formatRule(root + 1), "return ",
                       charForResp + str(final_decision) + charForResp))

        else:  #decision is not made, continue to create branches and leaves
            root = root + 1  #the next rule will be nested under this one; increase the depth
            buildDecisionTree(subdataset, root, file, config, dataset_features)

        root = tmp_root * 1

    #---------------------------------------------

    #calculate accuracy metrics
    if root == 1:
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            #this is a regular decision tree. find its accuracy here.
            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description)  #rules0
            models.append(myrules)

            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            #pandas' vectorized apply is preferred over an explicit for loop here
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            if algorithm != 'Regression':
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                #raw_df['Classified'] = 0
                #raw_df.loc[idx, 'Classified'] = 1
                #print(raw_df)

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances, " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] - raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df['Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models
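
The regression metrics printed above boil down to two one-liners over the same columns (a sketch; raw_df must already hold the Prediction column):

errors = raw_df['Prediction'] - raw_df['Decision']
mae = errors.abs().mean()               # mean absolute error
rmse = math.sqrt((errors ** 2).mean())  # root mean squared error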