Example #1
0
def reconstructRules(source):
    """Rebuild a Python rule module from the rules stored in a json file.

    The output path is ``source`` truncated at its first ".json" with ".py"
    appended. The file is created with a header comment, a
    ``def findDecision(obj):`` line is stored, and then every rule is stored
    depth-first, tab-indented by its nesting level.

    Args:
        source: path of the json file holding a list of rule dicts. Each
            non-empty dict must provide "current_level", "leaf_id",
            "parents" and "rule".
    """
    target = source.split(".json")[0] + ".py"

    functions.createFile(target,
                         "#This rule was reconstructed from " + source + "\n")

    with open(source, 'r') as handle:
        stored_rules = json.load(handle)

    def padleft(rule, level):
        # prefix the rule with one tab per nesting level
        return "\t" * level + rule

    # The json file might not store the rules in order, so collect them all
    # first; empty placeholder objects are skipped.
    rows = [[
        entry["current_level"], entry["leaf_id"], entry["parents"],
        entry["rule"]
    ] for entry in stored_rules if len(entry) > 0]

    table = np.array(rows)

    def extractRules(df, parent='root', level=1):
        # Emit every rule whose parent is `parent`, then descend into the
        # rules hanging below it with one more level of indentation.
        for row in df:
            if row[2] == parent:
                functions.storeRule(target, padleft(row[3], level))
                extractRules(df, copy.copy(row[1]), level + 1)

    functions.storeRule(target, "def findDecision(obj):")
    extractRules(table)
Example #2
0
def apply(df, config, header, dataset_features, validation_df = None):
    """Train ``config['num_of_trees']`` sub decision trees on random samples.

    For every tree a rule file "outputs/rules/rule_<i>.py" is created with
    ``header`` and a tree is built on ``df.sample(frac=1/num_of_trees)``.
    When ``config["enableParallelism"]`` is truthy the builds are queued and
    run through ``Training.MyPool(config["num_cores"])``; otherwise each tree
    is built immediately. Afterwards every rule module is loaded with ``imp``.

    Args:
        df: data frame to sample from.
        config: dict read for 'num_of_trees', "enableParallelism" and, in
            the parallel case, "num_cores".
        header: initial content passed to ``functions.createFile``.
        dataset_features: forwarded unchanged to the tree builder.
        validation_df: not used by this function.

    Returns:
        list of the loaded rule modules, one per tree, in tree order.
    """
    num_of_trees = config['num_of_trees']
    parallelism_on = config["enableParallelism"]

    #TODO: is this logical for 48x2 cores?
    #TODO: reconstruct for parallel run is problematic. you should reconstruct based on tree id.

    queued_builds = []

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for tree_id in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (tree_id+1))
        subset = df.sample(frac=1/num_of_trees)

        rule_file = "outputs/rules/rule_"+str(tree_id)+".py"
        functions.createFile(rule_file, header)

        if not parallelism_on:
            # serial run: build this tree right away
            Training.buildDecisionTree(subset, 1, rule_file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', tree_id = tree_id)
        else:
            # parallel run: remember the positional arguments for the pool
            queued_builds.append((subset, 1, rule_file, config, dataset_features, 0, 0, 'root', tree_id))

    if parallelism_on:
        pool = Training.MyPool(config["num_cores"])
        # NOTE(review): the pool target is the bare name buildDecisionTree while
        # the serial path calls Training.buildDecisionTree - confirm the bare
        # name is defined in this module.
        pool.starmap(buildDecisionTree, queued_builds)
        pool.close()
        pool.join()

    # collect models for both serial and parallel runs here
    models = []
    for tree_id in range(0, num_of_trees):
        moduleName = "outputs/rules/rule_"+str(tree_id)
        fp, pathname, description = imp.find_module(moduleName)
        models.append(imp.load_module(moduleName, fp, pathname, description))

    return models
Example #3
0
def apply(df, config, header, dataset_features, validation_df=None):
    """Train ``config['num_of_trees']`` sub decision trees one after another.

    For tree ``i`` the files "outputs/rules/rule_<i>.py" (initialised with
    ``header``) and "outputs/rules/rule_<i>.json" (initialised with "[\\n")
    are created, a tree is built on ``df.sample(frac=1 / num_of_trees)``,
    the json file is terminated with "{}]" and the rule module is loaded
    with ``imp``.

    Args:
        df: data frame to sample from.
        config: dict read for 'num_of_trees' and forwarded to the builder.
        header: initial content of every rule file.
        dataset_features: forwarded unchanged to the tree builder.
        validation_df: not used by this function.

    Returns:
        list of the loaded rule modules in tree order.
    """
    num_of_trees = config['num_of_trees']
    trained = []

    progress = tqdm(range(0, num_of_trees), desc='Bagging')

    for tree_no in progress:
        progress.set_description("Sub decision tree %d is processing" %
                                 (tree_no + 1))
        sample = df.sample(frac=1 / num_of_trees)

        moduleName = "outputs/rules/rule_" + str(tree_no)
        rule_path = moduleName + ".py"
        json_path = moduleName + ".json"

        functions.createFile(rule_path, header)
        functions.createFile(json_path, "[\n")

        Training.buildDecisionTree(sample,
                                   1,
                                   rule_path,
                                   config,
                                   dataset_features,
                                   parent_level=0,
                                   leaf_id=0,
                                   parents='root')

        # close the json list with an empty placeholder object
        functions.storeRule(json_path, "{}]")

        fp, pathname, description = imp.find_module(moduleName)
        trained.append(imp.load_module(moduleName, fp, pathname, description))

    return trained
Example #4
0
def reconstructRules(source, feature_names):
    """Rebuild a Python rule module from the rules stored in a json file.

    The output path is ``source`` truncated at its first ".json" with ".py"
    appended. Its first line is ``def findDecision(obj): #`` followed by an
    "obj[i]: name" legend of ``feature_names``. The rules are then stored
    depth-first and tab-indented; among siblings the second and later "if"
    rules become "elif", and an "else:" rule is stored after its siblings.
    Before the first sibling that is not a return statement, a "# {json}"
    explainer line (feature, instances, metric_value, depth) is stored.

    Args:
        source: path of the json file holding a list of rule dicts. Each
            non-empty dict must provide "current_level", "leaf_id",
            "parents", "rule", "feature_name", "instances", "metric" and
            "return_statement".
        feature_names: sized iterable of feature name strings.
    """
    file_name = source.split(".json")[0]+".py"

    legend = ", ".join(
        "obj["+str(position)+"]: "+name
        for position, name in enumerate(feature_names)
    )
    functions.createFile(file_name, "def findDecision(obj): #"+legend+"\n")

    with open(source, 'r') as handle:
        stored_rules = json.load(handle)

    def padleft(rule, level):
        # prefix the rule with one tab per nesting level
        return "\t"*level + rule

    # The json file might not store the rules in order, so collect them all
    # first; empty placeholder objects are skipped.
    rows = [
        [
            entry["current_level"],
            entry["leaf_id"],
            entry["parents"],
            entry["rule"],
            entry["feature_name"],
            entry["instances"],
            entry["metric"],
            entry["return_statement"],
        ]
        for entry in stored_rules if len(entry) > 0
    ]

    table = np.array(rows)

    def extractRules(df, parent = 'root', level = 1):
        deferred_else = ""
        written = 0  # number of sibling rules already stored at this level

        for row in df:
            # every row is converted before the parent check, so a malformed
            # row raises no matter which subtree is being written
            depth = int(row[0])
            node_id = row[1]
            owner = row[2]
            text = row[3]
            feature = row[4]
            count = int(row[5])
            score = float(row[6])
            is_return = int(row[7])

            if owner != parent:
                continue

            if text[0:5] == "else:":
                # stored once all siblings have been written
                deferred_else = text
                continue

            if written > 0 and text[0:2] == "if":
                text = "el"+text

            if written == 0 and is_return == 0:
                explainer = "# "+json.dumps({
                    "feature": feature,
                    "instances": count,
                    "metric_value": round(score, 4),
                    "depth": depth,
                })
                functions.storeRule(file_name, padleft(explainer, level))

            functions.storeRule(file_name, padleft(text, level))

            # write everything nested below this rule one level deeper
            extractRules(df, copy.copy(node_id), level + 1)

            written = written + 1

        if deferred_else != "":
            functions.storeRule(file_name, padleft(deferred_else, level))

    extractRules(table)
Example #5
0
def buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', validation_df = None):
    """Build one level of the decision tree and recurse through createBranch.

    The winning feature is picked with ``findDecision(df, config)``. For
    every distinct value of that feature a sub data set without the feature
    column is handed to ``createBranch``: directly in serial mode, or through
    ``MyPool(config["num_cores"])`` when ``config['enableParallelism']`` is
    True. An "else" rule is added after the branches. It returns the most
    frequent ``Decision`` (object dtype) or the mean ``Decision`` (otherwise)
    of the sub data set of the LAST iterated value.

    In parallel mode every rule is written to its own
    "outputs/rules/<uuid>.txt" file. When ``root == 1`` those files are merged
    into ``<file stem>.json``, deleted, and the rule module is rebuilt with
    ``reconstructRules`` followed by ``feature_importance``.

    Args:
        df: data frame whose last column is the target; it must expose a
            ``Decision`` column.
        root: current depth, 1 for the top level call.
        file: path of the rule file (serial mode writes rules into it).
        config: dict read for 'enableParallelism', "num_cores" and, at the
            top level, 'enableRandomForest', 'enableGBM', 'enableAdaboost'.
        dataset_features: mapping of feature name -> dtype; its iteration
            order defines the ``obj[...]`` index of a feature.
        parent_level: depth of the caller, used for the descriptor comment.
        leaf_id: not read by this function.
        parents: id of the parent rule ('root' at the top level).
        validation_df: not used by this function.

    Returns:
        list holding the loaded "outputs/rules/rules" module when this is the
        top level call of a regular tree (no random forest, GBM or adaboost),
        otherwise an empty list.
    """
    models = []
    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    num_cores = config["num_cores"]

    json_file = file.split(".")[0]+".json"

    #--------------------------------------

    # Snapshot taken before findDecision; used below to restore the
    # non-winner numeric columns (presumably findDecision rewrites them in
    # place - confirm against findDecision).
    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(df, config)

    # find winner index, this cannot be returned by find decision because columns dropped in previous steps
    for position, feature in enumerate(dataset_features):
        if feature == winner_name:
            winner_index = position

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    # restoration
    for column_name in df.columns[:-1]:
        if df[column_name].dtypes != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    input_params = []

    for branch_index, current_class in enumerate(classes):
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])

        if enableParallelism != True:
            # create branches serially

            if branch_index == 0:
                # one descriptor comment per split, written before the first branch
                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# "+json.dumps(descriptor)

                functions.storeRule(file, (functions.formatRule(root), "", descriptor))

            createBranch(config, current_class, subdataset, numericColumn, branch_index
                , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric)
        else:
            input_params.append((config, current_class, subdataset, numericColumn, branch_index
                , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric))

    #---------------------------
    # add else condition in the decision tree

    if df.Decision.dtypes == 'object': #classification
        # idxmax on value_counts does not depend on the column naming of
        # value_counts().reset_index(), which changed in pandas 2.0
        majority_class = subdataset.Decision.value_counts().idxmax()
        else_decision = "return '%s'" % (majority_class,)
    else: #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

    if enableParallelism != True:
        functions.storeRule(file,(functions.formatRule(root), "else:"))
        functions.storeRule(file,(functions.formatRule(root+1), else_decision))
    else:
        leaf_id = str(uuid.uuid1())
        custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"

        # Classification and regression share one schema: reconstructRules
        # reads feature_name, instances, metric and return_statement from
        # every stored rule.
        sample_rule = {
            "current_level": root,
            "leaf_id": leaf_id,
            "parents": parents,
            "rule": "else: "+else_decision,
            "feature_idx": -1,
            "feature_name": "",
            "instances": df.shape[0],
            "metric": 0,
            "return_statement": 0,
        }

        functions.createFile(custom_rule_file, "")
        functions.storeRule(custom_rule_file, json.dumps(sample_rule))

    #---------------------------

    # create branches in parallel
    if enableParallelism == True:
        # `with Pool(...) as pool` causes trouble for recursive functions,
        # hence the explicit close/join
        pool = MyPool(num_cores)
        pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            # custom rules are stored in .txt files. merge them all in a json file
            functions.createFile(json_file, "[\n")

            rules_dir = os.getcwd()+"/outputs/rules"
            custom_rules = []

            for entry in os.listdir(rules_dir):
                if not entry.endswith(".txt"):
                    continue

                rule_path = rules_dir+"/"+entry
                with open(rule_path, "r") as f:
                    custom_rule = f.read()

                if custom_rules: # every rule but the first needs a separator
                    custom_rule = ", "+custom_rule

                custom_rules.append(rule_path)
                functions.storeRule(json_file, custom_rule)

            functions.storeRule(json_file, "]")

            #-----------------------------------

            # custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.
            for rule_path in custom_rules:
                os.remove(rule_path)

            #-----------------------------------

            reconstructRules(json_file, feature_names)

            #feature importance should be calculated by demand?
            feature_importance(json_file, dataset_features)

        # regular decision tree: load the generated rule module
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
            models.append(myrules)

    return models
Example #6
0
def createBranch(config, current_class, subdataset, numericColumn, branch_index,
                 winner_name, winner_index, root, parents, file,
                 dataset_features, num_of_instances, metric):
    """Write the rule of one branch and either finish it or grow it further.

    The condition ``obj[winner_index]<comparison>:`` is written first. In
    serial mode it goes to ``file`` ("if" for branch 0, "elif" otherwise);
    in parallel mode it goes as a json record into its own
    "outputs/rules/<uuid>.txt" file and always uses "if". When a final
    decision can be made a "return ..." rule follows, otherwise
    ``buildDecisionTree`` is called on ``subdataset`` one level deeper.

    A final decision is made, in this order, when: GBM is enabled and
    ``root >= max_depth`` (mean); adaboost is enabled (sign of the mean,
    parallelism switched off); all decisions are equal; only the Decision
    column is left (most frequent value); or the algorithm is 'Regression'
    and fewer than 5 rows remain (mean).

    Args:
        config: dict read for 'algorithm', 'enableAdaboost', 'enableGBM',
            'max_depth' and 'enableParallelism'.
        current_class: value of the winner feature for this branch; for a
            numeric column it is used verbatim as the comparison text.
        subdataset: rows of this branch; must contain a 'Decision' column.
        numericColumn: whether the winner feature is numeric.
        branch_index: position of this branch among its siblings.
        winner_name: name of the winner feature.
        winner_index: index of the winner feature inside ``obj``.
        root: current depth.
        parents: id of the parent rule.
        file: path of the rule file used in serial mode.
        dataset_features: forwarded to ``buildDecisionTree``.
        num_of_instances: stored in the json record of the rule.
        metric: stored in the json record of the condition rule.
    """
    algorithm = config['algorithm']
    enableAdaboost = config['enableAdaboost']
    enableGBM = config['enableGBM']
    max_depth = config['max_depth']
    enableParallelism = config['enableParallelism']

    # regression decisions are written bare, every other decision is quoted
    quote = "" if algorithm == 'Regression' else "'"

    if numericColumn == True:
        comparison = current_class  # current class might be <=x or >x in this case
    else:
        comparison = " == '" + str(current_class) + "'"

    #-----------------------------------------------
    # can decision be made?

    decisions = subdataset['Decision']
    is_leaf = True

    if enableGBM == True and root >= max_depth:  # max depth
        final_decision = decisions.mean()
    elif enableAdaboost == True:
        final_decision = functions.sign(decisions.mean())  # get average
        enableParallelism = False
    elif len(decisions.value_counts().tolist()) == 1:
        # all items are equal in this case
        final_decision = decisions.value_counts().keys().tolist()[0]
    elif subdataset.shape[1] == 1:
        # decision cannot be made even though all columns dropped:
        # get the most frequent one
        final_decision = decisions.value_counts().idxmax()
    elif algorithm == 'Regression' and subdataset.shape[0] < 5:
        # pruning condition: get average
        final_decision = decisions.mean()
    else:
        is_leaf = False

    #-----------------------------------------------

    #TODO: elif checks might be above than if statements in parallel
    if enableParallelism == True or branch_index == 0:
        keyword = "if"
    else:
        keyword = "elif"

    check_rule = keyword + " obj[" + str(winner_index) + "]" + comparison + ":"

    leaf_id = str(uuid.uuid1())
    custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

    if enableParallelism == True:
        condition_record = json.dumps({
            "current_level": root,
            "leaf_id": leaf_id,
            "parents": parents,
            "rule": check_rule,
            "feature_idx": winner_index,
            "feature_name": winner_name,
            "instances": num_of_instances,
            "metric": metric,
            "return_statement": 0,
        })

        functions.createFile(custom_rule_file, "")
        functions.storeRule(custom_rule_file, condition_record)
    else:
        functions.storeRule(file, (functions.formatRule(root), "", check_rule))

    #-----------------------------------------------

    if not is_leaf:
        # decision is not made: the following rules are nested inside this
        # rule, so continue one level deeper with this rule as the parent
        buildDecisionTree(subdataset, root + 1, file, config, dataset_features,
                          root, leaf_id, copy.copy(leaf_id))
        return

    # decision is made: the return statement hangs below the condition rule
    condition_id = copy.copy(leaf_id)
    leaf_id = str(uuid.uuid1())

    decision_rule = "return " + quote + str(final_decision) + quote

    if enableParallelism == True:
        return_record = ", " + json.dumps({
            "current_level": root + 1,
            "leaf_id": leaf_id,
            "parents": condition_id,
            "rule": decision_rule,
            "feature_idx": winner_index,
            "feature_name": winner_name,
            "instances": num_of_instances,
            "metric": 0,
            "return_statement": 1,
        })

        functions.storeRule(custom_rule_file, return_record)
    else:
        functions.storeRule(file, (functions.formatRule(root + 1), decision_rule))
Example #7
0
def reconstructRules(source):
    """Rebuild a Python rule module from the rules stored in a json file.

    The output path is ``source`` truncated at its first ".json" with ".py"
    appended. The file is created with a header comment, a
    ``def findDecision(obj):`` line is stored, and then every rule is stored
    depth-first, tab-indented by its nesting level. Among siblings the
    second and later "if" rules become "elif", and an "else:" rule is stored
    after its siblings.

    Args:
        source: path of the json file holding a list of rule dicts. Each
            non-empty dict must provide "current_level", "leaf_id",
            "parents" and "rule".
    """
    target = source.split(".json")[0] + ".py"

    functions.createFile(target,
                         "#This rule was reconstructed from " + source + "\n")

    with open(source, 'r') as handle:
        stored_rules = json.load(handle)

    def padleft(rule, level):
        # prefix the rule with one tab per nesting level
        return "\t" * level + rule

    # The json file might not store the rules in order, so collect them all
    # first; empty placeholder objects are skipped.
    rows = [[
        entry["current_level"], entry["leaf_id"], entry["parents"],
        entry["rule"]
    ] for entry in stored_rules if len(entry) > 0]

    table = np.array(rows)

    def extractRules(df, parent='root', level=1):
        deferred_else = ""
        written = 0  # number of sibling rules already stored at this level

        for row in df:
            node_id, owner, text = row[1], row[2], row[3]

            if owner != parent:
                continue

            if text[0:5] == "else:":
                # stored once all siblings have been written
                deferred_else = text
                continue

            if written > 0 and text[0:2] == "if":
                text = "el" + text

            functions.storeRule(target, padleft(text, level))

            # write everything nested below this rule one level deeper
            extractRules(df, copy.copy(node_id), level + 1)

            written = written + 1

        if deferred_else != "":
            functions.storeRule(target, padleft(deferred_else, level))

    #------------------------------------

    functions.storeRule(target, "def findDecision(obj):")
    extractRules(table)
Example #8
0
def buildDecisionTree(df,
                      root,
                      file,
                      config,
                      dataset_features,
                      parent_level=0,
                      leaf_id=0,
                      parents='root'):
    """Build one level of the decision tree and recurse through createBranch.

    The winning feature is picked with ``findDecision(df, config)``. For
    every distinct value of that feature a sub data set without the feature
    column is handed to ``createBranch``: directly in serial mode, or through
    ``MyPool`` (half of the available cores, at least one) when
    ``config['enableParallelism']`` is True. An "else" rule is added after
    the branches. It returns the most frequent ``Decision`` (object dtype) or
    the mean ``Decision`` (otherwise) of the sub data set of the LAST
    iterated value.

    In parallel mode every rule is written to its own
    "outputs/rules/<uuid>.txt" file. When ``root == 1`` those files are merged
    into ``<file stem>.json``, deleted, and the rule module is rebuilt with
    ``reconstructRules``. For a regular tree (no random forest, GBM or
    adaboost) the top level call also loads "outputs/rules/rules" and prints
    accuracy, or MAE / RMSE for regression, on the training data.

    Args:
        df: data frame that must expose a ``Decision`` column.
        root: current depth, 1 for the top level call.
        file: path of the rule file (serial mode writes rules into it).
        config: dict read for 'enableParallelism', 'algorithm' and, at the
            top level, 'enableRandomForest', 'enableGBM', 'enableAdaboost'.
        dataset_features: mapping of feature name -> dtype; its iteration
            order defines the ``obj[...]`` index of a feature.
        parent_level: not read by this function.
        leaf_id: not read by this function.
        parents: id of the parent rule ('root' at the top level).

    Returns:
        list holding the loaded rule module for the top level call of a
        regular tree, otherwise an empty list.
    """
    # local import: the original block did not reference json itself
    import json

    models = []

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    # the ensemble flags are only consulted for the top level call
    is_regular_tree = root == 1 and (config['enableRandomForest'] != True
                                     and config['enableGBM'] != True
                                     and config['enableAdaboost'] != True)
    if is_regular_tree:
        # untouched copy of the training data for the metrics below
        raw_df = df.copy()

    #--------------------------------------

    # Snapshot taken before findDecision; used below to restore the
    # non-winner numeric columns (presumably findDecision rewrites them in
    # place - confirm against findDecision).
    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    for position, feature in enumerate(dataset_features):
        if feature == winner_name:
            winner_index = position

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    for column_name in df.columns[:-1]:
        if df[column_name].dtypes != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    #TO-DO: you should specify the number of cores in config
    # allocate half of the total cores, but never fewer than one: a pool
    # cannot be created with zero processes (single core hosts)
    num_cores = max(1, int(multiprocessing.cpu_count() / 2))

    input_params = []

    for branch_index, current_class in enumerate(classes):
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])

        branch_args = (config, current_class, subdataset, numericColumn,
                       branch_index, winner_index, root, parents, file,
                       dataset_features)

        if enableParallelism != True:
            #create branches serially
            createBranch(*branch_args)
        else:
            input_params.append(branch_args)

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object':  #classification
        # idxmax on value_counts does not depend on the column naming of
        # value_counts().reset_index(), which changed in pandas 2.0
        majority_class = subdataset.Decision.value_counts().idxmax()
        else_decision = "return '%s'" % (majority_class, )
    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

    if enableParallelism != True:
        functions.storeRule(file, (functions.formatRule(root), "else:"))
        functions.storeRule(file,
                            (functions.formatRule(root + 1), else_decision))
    else:  #parallelism
        leaf_id = str(uuid.uuid1())
        custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

        # json.dumps escapes quotes / backslashes inside class labels,
        # which a hand-built json string would not
        sample_rule = json.dumps({
            "current_level": root,
            "leaf_id": leaf_id,
            "parents": parents,
            "rule": "else: " + else_decision,
        })

        functions.createFile(custom_rule_file, "")
        functions.storeRule(custom_rule_file, sample_rule)

    #---------------------------

    #create branches in parallel
    if enableParallelism == True:
        # `with Pool(...) as pool` causes trouble for recursive functions,
        # hence the explicit close/join
        pool = MyPool(num_cores)
        pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in .txt files. merge them all in a json file
            functions.createFile(json_file, "[\n")

            rules_dir = os.getcwd() + "/outputs/rules"
            custom_rules = []

            for entry in os.listdir(rules_dir):
                if not entry.endswith(".txt"):
                    continue

                rule_path = rules_dir + "/" + entry
                with open(rule_path, "r") as f:
                    custom_rule = f.read()

                if custom_rules:  #every rule but the first needs a separator
                    custom_rule = ", " + custom_rule

                custom_rules.append(rule_path)
                functions.storeRule(json_file, custom_rule)

            functions.storeRule(json_file, "]")

            #-----------------------------------

            #custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.
            for rule_path in custom_rules:
                os.remove(rule_path)

            #-----------------------------------

            reconstructRules(json_file)

        #calculate accuracy metrics
        if is_regular_tree:
            #this is regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

            instances = df.shape[0]

            #instead of for loops, pandas functions perform well
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            if algorithm != 'Regression':
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances, " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] -
                                               raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df[
                    'Absolute_Error'] * raw_df['Absolute_Error']

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models
def fit(df, config):
    """
    Validate the training frame, complete the config and build the requested model.

    Parameters:
        df (pandas data frame): Training data frame. The last column must be
            named 'Decision'. NaN cells of non-object columns are overwritten
            in place, so the caller's frame is modified.

        config (dictionary): handed to functions.initializeParams; afterwards
            the keys 'algorithm', 'enableRandomForest', 'enableGBM',
            'enableAdaboost' and 'enableParallelism' are read. The
            'algorithm' and 'enableParallelism' entries may be overwritten.

    Returns:
        dictionary with the keys "trees", "alphas", "config" and "nan_values".
        "nan_values" holds one [column, replacement] pair per non-object
        column; replacement is None when the column had no NaN.

    Raises:
        ValueError: the last column is not 'Decision', the algorithm is not
            one of the supported names, 'Regression' is requested for an
            object-typed target, or Adaboost is enabled while any column is
            object-typed.
    """

    target_label = df.columns[-1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ",target_label)
        raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame')

    #------------------------
    #handle NaN values: every NaN in a non-object column becomes (column minimum - 1)

    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            if idx.shape[0] > 0:
                replacement = min_value - 1
                df.loc[idx, column] = replacement
            else:
                replacement = None

            nan_values.append([column, replacement])

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        #a single formatted message; passing the pieces as separate arguments
        #makes the exception print as a tuple
        raise ValueError(
            "Invalid algorithm passed. You passed %s but valid algorithms are %s"
            % (algorithm, valid_algorithms))

    #------------------------

    enableRandomForest = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']
    enableParallelism = config['enableParallelism']

    #parallelism is switched off whenever random forest is requested
    if enableRandomForest == True:
        config['enableParallelism'] = False
        enableParallelism = False

    #------------------------

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')

    if df['Decision'].dtypes != 'object': #this must be regression tree even if it is not mentioned in algorithm
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        for column_name in df.columns:
            if df[column_name].dtypes == 'object':
                raise ValueError('Adaboost must be run on numeric data set for both features and target')

    #-------------------------

    print(algorithm," tree is going to be built...")

    #every column except the last one (the target) is a feature
    feature_names = df.columns[:-1]

    #feature name -> dtype. this is going to be used to check features numeric or nominal.
    dataset_features = {name: df[name].dtypes for name in feature_names}

    #first line of every generated rule file; the trailing comment maps obj[i] to its column name
    header = "def findDecision(obj): #" + ", ".join(
        "obj[" + str(i) + "]: " + name for i, name in enumerate(feature_names)
    ) + "\n"

    #------------------------

    begin = time.time()

    trees = []; alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features)

    elif enableGBM == True:

        if df['Decision'].dtypes == 'object': #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features)

        else: #regression
            trees = gbm.regressor(df, config, header, dataset_features)

    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else: #regular decision tree building

        root = 1; file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df,root,file, config, dataset_features
            , 0, 0, 'root')

    print("finished in ",time.time() - begin," seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    return obj
Example #10
0
def regressor(df, config, header, dataset_features, validation_df = None, process_id = None):
    """
    Gradient boosting for a numeric target: fit a tree, then repeatedly fit trees on the residuals.

    Parameters:
        df (pandas data frame): training data with a numeric 'Decision' column.
        config (dictionary): 'epochs' and 'learning_rate' are read here;
            'epochs' is overwritten with the index of the best epoch before returning.
        header (string): first line written into every generated rule file.
        dataset_features (dictionary): forwarded to Training.buildDecisionTree.
        validation_df: accepted but not used in this function.
        process_id: forwarded as main_process_id to the per-epoch tree builds.

    Returns:
        list of the rule modules loaded for epochs 1..best epoch
        (rules0 .. rules<best-1>), where "best" is the epoch with the lowest
        mean squared error of the accumulated predictions.
    """
    models = []

    epochs = config['epochs']
    learning_rate = config['learning_rate']

    boosted_from = 0

    #------------------------------

    #gbm will manipulate actuals. store its raw version.
    base_df = df.copy()
    num_of_instances = base_df['Decision'].values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"; json_file = "outputs/rules/rules0.json"
    functions.createFile(file, header)
    functions.createFile(json_file, "[\n")

    Training.buildDecisionTree(df,root,file, config, dataset_features
        , parent_level = 0, leaf_id = 0, parents = 'root') #generate rules0

    #restore the frame after the build
    df = base_df.copy()

    base_df['Boosted_Prediction'] = 0

    #------------------------------

    best_epoch_idx = 0; best_epoch_loss = 1000000

    pbar = tqdm(range(1, epochs+1), desc='Boosting')

    for index in pbar:
        #run data(i-1) against rules(i-1), then save data(i)

        #dynamic import of the rules generated in the previous round
        moduleName = "outputs/rules/rules%s" % (index-1)
        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname, description)
        finally:
            #the caller owns the file object returned by find_module
            if fp is not None:
                fp.close()

        models.append(myrules)

        #----------------------------------------

        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)

        base_df['Boosted_Prediction'] += df['Prediction']

        loss = (base_df['Boosted_Prediction'] - base_df['Decision']).pow(2).sum()
        current_loss = loss / num_of_instances #mse

        if index == 1:
            boosted_from = current_loss

        if current_loss < best_epoch_loss:
            best_epoch_loss = current_loss
            best_epoch_idx = index

        #next round is trained on the residuals.
        #NOTE(review): int() truncates the learning rate, so a fractional value such as 0.1 becomes 0 - confirm this is intended
        df['Decision'] = int(learning_rate)*(df['Decision'] - df['Prediction'])
        df = df.drop(columns = ['Epoch', 'Prediction'])

        #---------------------------------

        #data(i) created; to_csv opens and closes the file itself
        new_data_set = "outputs/data/data%s.csv" % (index)
        df.to_csv(new_data_set, index=False)

        #---------------------------------

        file = "outputs/rules/rules"+str(index)+".py"
        json_file = "outputs/rules/rules"+str(index)+".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        current_df = df.copy()
        Training.buildDecisionTree(df,root,file, config, dataset_features
            , parent_level = 0, leaf_id = 0, parents = 'root', main_process_id = process_id)

        df = current_df.copy() #numeric features require this restoration to apply findDecision function

        #rules(i) created

        pbar.set_description("Epoch %d. Loss: %d. Process: " % (index, current_loss))

        gc.collect()

    #---------------------------------

    print("The best epoch is ", best_epoch_idx," with ", best_epoch_loss," loss value")
    models = models[0:best_epoch_idx]
    config["epochs"] = best_epoch_idx

    print("MSE of ",num_of_instances," instances are boosted from ",boosted_from," to ",best_epoch_loss," in ",epochs," epochs")

    return models
Example #11
0
def regressor(df, config, header, dataset_features):
    """
    Gradient boosting for a numeric target: fit a tree, then repeatedly fit trees on the residuals.

    Parameters:
        df (pandas data frame): training data with a numeric 'Decision' column.
        config (dictionary): 'epochs' and 'learning_rate' are read here.
        header (string): first line written into every generated rule file.
        dataset_features (dictionary): forwarded to Training.buildDecisionTree.

    Returns:
        list of the rule modules loaded during boosting, one per epoch
        (rules0 .. rules<epochs-1>).
    """
    models = []

    epochs = config['epochs']
    learning_rate = config['learning_rate']

    #mean squared error after the first and after the last epoch
    boosted_from = 0
    boosted_to = 0

    #------------------------------

    #gbm will manipulate actuals. store its raw version.
    base_df = df.copy()
    num_of_instances = base_df['Decision'].values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"
    functions.createFile(file, header)

    Training.buildDecisionTree(df, root, file, config,
                               dataset_features)  #generate rules0

    #restore the frame after the build
    df = base_df.copy()

    base_df['Boosted_Prediction'] = 0

    #------------------------------

    pbar = tqdm(range(1, epochs + 1), desc='Boosting')

    for index in pbar:
        #run data(i-1) against rules(i-1), then save data(i)

        #dynamic import of the rules generated in the previous round
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname, description)
        finally:
            #the caller owns the file object returned by find_module
            if fp is not None:
                fp.close()

        models.append(myrules)

        #----------------------------------------

        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)

        base_df['Boosted_Prediction'] += df['Prediction']

        loss = (base_df['Boosted_Prediction'] -
                base_df['Decision']).pow(2).sum()
        loss = loss / num_of_instances  #mse

        #two independent checks: with a single epoch the first round is also the last one
        if index == 1:
            boosted_from = loss
        if index == epochs:
            boosted_to = loss

        #next round is trained on the residuals.
        #NOTE(review): int() truncates the learning rate, so a fractional value such as 0.1 becomes 0 - confirm this is intended
        df['Decision'] = int(learning_rate) * (df['Decision'] -
                                               df['Prediction'])
        df = df.drop(columns=['Epoch', 'Prediction'])

        #---------------------------------

        #data(i) created; to_csv opens and closes the file itself
        new_data_set = "outputs/data/data%s.csv" % (index)
        df.to_csv(new_data_set, index=False)

        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"

        functions.createFile(file, header)

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features)
        df = current_df.copy()  #numeric features require this restoration to apply findDecision function

        #rules(i) created

        pbar.set_description("Epoch %d. Loss: %d. Process: " % (index, loss))

    #---------------------------------

    print(num_of_instances, " instances are boosted from ", boosted_from,
          " to ", boosted_to, " in ", epochs, " epochs")

    return models
Example #12
0
def classifier(df, config, header, dataset_features):
    """
    Gradient boosting for a nominal target: one regression tree per class and per epoch.

    In epoch 0 every class gets a tree trained on its 0/1 indicator column.
    In later epochs the tree of class i is trained on worksheet['Y-P_i'],
    the indicator minus the softmax probability computed in the previous epoch.

    Parameters:
        df (pandas data frame): training data; the last column is 'Decision'
            and its values are concatenated into file names, so they must be strings.
        config (dictionary): 'epochs' is read here.
        header (string): first line written into every generated rule file.
        dataset_features (dictionary): forwarded to Training.buildDecisionTree.

    Returns:
        tuple (models, classes): the loaded rule modules in build order
        (epoch by epoch, class by class) and the unique 'Decision' values.
    """

    models = []

    print("gradient boosting for classification")

    epochs = config['epochs']

    temp_df = df.copy()
    worksheet = df.copy()

    classes = df['Decision'].unique()
    num_of_columns = df.shape[1]

    #running sum of the per-class scores over all epochs
    boosted_predictions = np.zeros([df.shape[0], len(classes)])

    pbar = tqdm(range(0, epochs), desc='Boosting')

    #store actual set, we will use this to calculate accuracy
    actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]),
                              columns=classes)
    for current_class in classes:
        actual_set[current_class] = np.where(df['Decision'] == current_class,
                                             1, 0)
    #column position of the true class for every row
    actual_labels = np.argmax(actual_set.values, axis=1)

    for epoch in pbar:
        for i, current_class in enumerate(classes):

            if epoch == 0:
                temp_df['Decision'] = np.where(df['Decision'] == current_class,
                                               1, 0)
                worksheet['Y_' + str(i)] = temp_df['Decision']
            else:
                temp_df['Decision'] = worksheet['Y-P_' + str(i)]

            root = 1
            moduleName = "outputs/rules/rules-for-" + current_class + "-round-" + str(
                epoch)
            file = moduleName + ".py"

            functions.createFile(file, header)

            Training.buildDecisionTree(temp_df, root, file, config,
                                       dataset_features)
            #decision rules created
            #----------------------------

            #dynamic import
            fp, pathname, description = imp.find_module(moduleName)
            try:
                myrules = imp.load_module(moduleName, fp, pathname,
                                          description)
            finally:
                #the caller owns the file object returned by find_module
                if fp is not None:
                    fp.close()

            models.append(myrules)

            predictions = []
            for _, instance in df.iterrows():
                #explicit positional access; every column except the last one is a feature
                features = [
                    instance.iloc[j] for j in range(num_of_columns - 1)
                ]
                predictions.append(myrules.findDecision(features))

            #----------------------------
            #the first round contributes a zero score; its predictions are not used
            if epoch == 0:
                worksheet['F_' + str(i)] = 0
            else:
                worksheet['F_' + str(i)] = pd.Series(predictions).values

            boosted_predictions[:, i] = boosted_predictions[:, i] + worksheet[
                'F_' + str(i)].values.astype(np.float32)

            worksheet['P_' + str(i)] = 0

            #----------------------------
            temp_df = df.copy()  #restoration

        #turn the per-class scores of every row into probabilities
        for row, instance in worksheet.iterrows():
            f_scores = []
            for i in range(0, len(classes)):
                f_scores.append(instance['F_' + str(i)])

            probabilities = functions.softmax(f_scores)

            for j in range(0, len(probabilities)):
                instance['P_' + str(j)] = probabilities[j]

            worksheet.loc[row] = instance

        #targets of the next epoch
        for i in range(0, len(classes)):
            worksheet['Y-P_' +
                      str(i)] = worksheet['Y_' + str(i)] - worksheet['P_' +
                                                                     str(i)]

        #----------------------------
        #accuracy of this epoch: class with the highest boosted score vs actual class
        predicted_labels = np.argmax(boosted_predictions, axis=1)
        classified = int((predicted_labels == actual_labels).sum())

        accuracy = str(100 * classified / actual_set.shape[0]) + "%"

        #----------------------------

        pbar.set_description("Epoch %d. Accuracy: %s. Process: " %
                             (epoch + 1, accuracy))

    return models, classes
Example #13
0
def apply(df, config, header, dataset_features, validation_df = None, process_id = None):
    """
    Build num_of_trees decision trees, each on a random sample of df, and load them as modules.

    Parameters:
        df (pandas data frame): training data; every tree is built on
            df.sample(frac=1/num_of_trees).
        config (dictionary): 'num_of_trees' and 'enableParallelism' are read;
            'num_cores' is read only when parallelism is on.
        header (string): first line written into every generated rule file.
        dataset_features (dictionary): forwarded to the tree builder.
        validation_df: accepted but not used in this function.
        process_id: forwarded to the tree builder as main_process_id.

    Returns:
        list of the loaded rule modules outputs/rules/rule_0 .. rule_<num_of_trees-1>.
    """

    models = []

    num_of_trees = config['num_of_trees']

    parallelism_on = config["enableParallelism"]

    #TODO: is this logical for 48x2 cores?
    #config["enableParallelism"] = False #run each tree in parallel but each branch in serial

    #TODO: reconstruct for parallel run is problematic. you should reconstruct based on tree id.

    #argument tuples of the deferred builds; filled only in the parallel case
    input_params = []

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i+1))
        subset = df.sample(frac=1/num_of_trees)

        root = 1

        file = "outputs/rules/rule_"+str(i)+".py"

        functions.createFile(file, header)

        if parallelism_on: #parallel run
            #positional order follows the builder signature: ..., parent_level, leaf_id, parents, tree_id, validation_df, main_process_id
            input_params.append((subset, root, file, config, dataset_features, 0, 0, 'root', i, None, process_id))

        else: #serial run
            Training.buildDecisionTree(subset,root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', tree_id = i, main_process_id = process_id)

    #-------------------------------

    if parallelism_on:
        num_cores = config["num_cores"]

        #never start more workers than there are trees
        POOL_SIZE = min(num_of_trees, num_cores)

        with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            #NOTE(review): the serial path calls Training.buildDecisionTree while this one uses the bare name - confirm it is imported in this module
            funclist = [
                pool.apply_async(buildDecisionTree, [*input_param])
                for input_param in input_params
            ]

            #all functions registered here; wait for every build to finish
            for f in tqdm(funclist):
                f.get(timeout = 100000)

            pool.close()
            pool.terminate()

    #-------------------------------
    #collect models for both serial and parallel here
    for i in range(0, num_of_trees):
        moduleName = "outputs/rules/rule_"+str(i)
        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname, description)
        finally:
            #the caller owns the file object returned by find_module
            if fp is not None:
                fp.close()
        models.append(myrules)

    #-------------------------------

    return models
Example #14
0
def buildDecisionTree(df,
                      root,
                      file,
                      config,
                      dataset_features,
                      parent_level=0,
                      leaf_id=0,
                      parents='root',
                      tree_id=0,
                      validation_df=None,
                      main_process_id=None):
    """
    Pick the winner feature for df, create one branch per value of it and append an else rule.

    Parameters:
        df (pandas data frame): data of the current node; the last column is 'Decision'.
        root (int): level of the current node; 1 marks the top-level call.
        file (string): path of the rule file; the json file name is derived from it.
        config (dictionary): 'enableParallelism', 'algorithm', 'enableRandomForest',
            'enableGBM', 'enableAdaboost' and 'num_cores' are read.
        dataset_features (dictionary): feature name -> dtype of the original data set.
        parent_level (int): only used for the "depth" value of the descriptor comment.
        leaf_id: overwritten with a fresh uuid in the parallel path, otherwise unused.
        parents: identifier of the parent node, stored in the rules and passed to createBranch.
        tree_id: stored in the rules and passed to createBranch.
        validation_df: accepted but not used in this function.
        main_process_id: process whose children are counted to decide whether
            branches may run in a pool; also passed to createBranch.

    Returns:
        In parallel mode below the top level: the collected rules as a list of json strings.
        Otherwise a list that holds the loaded "outputs/rules/rules" module when
        this is the top-level call of a regular (non-ensemble) tree, and is empty
        in every other case.
    """

    models = []

    decision_rules = []

    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    random_forest_enabled = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']

    #--------------------------------------

    #findDecision may transform numeric columns; keep the raw values for the restoration below
    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(
        df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    for position, feature_name in enumerate(dataset_features):
        if feature_name == winner_name:
            winner_index = position

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #numeric field already transformed to object. you cannot check it with df itself, you should check df_copy
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()
    #-----------------------------------------------------

    num_cores = config["num_cores"]

    #argument tuples of the deferred branches; filled only in the parallel case
    input_params = []

    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i

        #create branches serially
        if enableParallelism != True:

            if i == 0:
                #one descriptor comment per node, written before its first branch
                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# " + json.dumps(descriptor)

                functions.storeRule(
                    file, (functions.formatRule(root), "", descriptor))

            results = createBranch(config,
                                   current_class,
                                   subdataset,
                                   numericColumn,
                                   branch_index,
                                   winner_name,
                                   winner_index,
                                   root,
                                   parents,
                                   file,
                                   dataset_features,
                                   num_of_instances,
                                   metric,
                                   tree_id=tree_id,
                                   main_process_id=main_process_id)

            decision_rules = decision_rules + results

        else:
            input_params.append(
                (config, current_class, subdataset, numericColumn,
                 branch_index, winner_name, winner_index, root, parents, file,
                 dataset_features, num_of_instances, metric, tree_id,
                 main_process_id))

    #---------------------------
    #add else condition in the decision tree
    #NOTE(review): subdataset is the subset of the last class handled by the loop above, not the whole node - confirm this is intended

    if df.Decision.dtypes == 'object':  #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())

            sample_rule = {
                "current_level": root,
                "leaf_id": leaf_id,
                "parents": parents,
                "rule": "else: " + else_decision,
                "feature_idx": -1,
                "feature_name": "",
                "instances": df.shape[0],
                "metric": 0,
                "return_statement": 0,
                "tree_id": tree_id
            }

            #json to string
            decision_rules.append(json.dumps(sample_rule))

    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())

            sample_rule = {
                "current_level": root,
                "leaf_id": leaf_id,
                "parents": parents,
                "rule": "else: " + else_decision,
                "tree_id": tree_id,
                "feature_name": "",
                "instances": 0,
                "metric": 0,
                "return_statement": 1
            }

            #json to string
            decision_rules.append(json.dumps(sample_rule))

    #---------------------------

    #count the processes that are already running under the main process
    try:
        main_process = psutil.Process(main_process_id)
        children = main_process.children(recursive=True)
        active_processes = len(children) + 1  #plus parent
    except Exception:
        #best effort: a large value makes the check below fall back to the serial path
        active_processes = 100

    results = []
    #create branches in parallel
    if enableParallelism == True:

        required_threads = active_processes + len(classes)

        if main_process_id != None and num_cores >= required_threads:  #len(classes) branches will be run in parallel

            POOL_SIZE = len(classes)

            with closing(MyPool(POOL_SIZE)) as pool:
                funclist = []

                for input_param in input_params:
                    f = pool.apply_async(createBranchWrapper,
                                         [createBranch, input_param])
                    funclist.append(f)

                #all functions registered here

                for f in funclist:
                    branch_results = f.get(timeout=100000)

                    for branch_result in branch_results:
                        results.append(branch_result)

                pool.close()
                pool.terminate()

            #--------------------------------

        else:  #serial
            for input_param in input_params:
                sub_results = createBranchWrapper(createBranch, input_param)
                for sub_result in sub_results:
                    results.append(sub_result)

        #--------------------------------

        decision_rules = decision_rules + results

        #--------------------------------

        if root != 1:  #return children results until the root node
            return decision_rules

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in decision_rules. merge them all in a json file first

            json_rules = "[\n"  #initialize

            last_index = len(decision_rules) - 1
            for file_index, custom_rule in enumerate(decision_rules):

                json_rules += custom_rule

                #no separator after the last rule
                if file_index < last_index:
                    json_rules += ", "

                json_rules += "\n"

            #-----------------------------------

            json_rules += "]"
            functions.createFile(json_file, json_rules)

            #-----------------------------------
            #reconstruct rules from json to py

            reconstructRules(json_file, feature_names)

            #-----------------------------------

        #is regular decision tree
        if random_forest_enabled != True and enableGBM != True and enableAdaboost != True:
            #this is regular decision tree. load its rules here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            try:
                myrules = imp.load_module(moduleName, fp, pathname,
                                          description)
            finally:
                #the caller owns the file object returned by find_module
                if fp is not None:
                    fp.close()
            models.append(myrules)

    return models
Example #15
0
def initializeAlphaFile():
    """Hand the alphas file path and the ``findAlpha`` header line to functions.createFile."""
    functions.createFile("outputs/rules/alphas.py", "def findAlpha(epoch):\n")
Example #16
0
def fit(df, config=None, validation_df=None):
    """
    Build a chefboost model from a training data frame and evaluate it.

    Parameters:
        df (pandas data frame): Training data frame. The target column must
            be named as 'Decision' and it has to be in the last column. The
            frame handed in by the caller is left untouched; NaN replacement
            is applied to an internal working copy.

        config (dictionary): Optional. It is passed through
            functions.initializeParams before any key is read.

            config = {
                'algorithm' (string): ID3, C4.5, CART, CHAID or Regression
                'enableParallelism' (boolean): False

                'enableGBM' (boolean): True,
                'epochs' (int): 7,
                'learning_rate' (int): 1,

                'enableRandomForest' (boolean): True,
                'num_of_trees' (int): 5,

                'enableAdaboost' (boolean): True,
                'num_of_weak_classifier' (int): 4
            }

        validation_df (pandas data frame): if nothing is passed to validation
            data frame, then only the training data frame is evaluated

    Returns:
        chefboost model: dictionary with the keys "trees", "alphas",
        "config" and "nan_values". Each item of "nan_values" is a
        [column, replacement] pair for a non-object column; replacement is
        None when the column had no NaN.

    Raises:
        ValueError: if the last column is not named 'Decision', if the
            algorithm is not one of the valid ones, if Regression is asked
            for a nominal target, or if Adaboost is enabled on a data set
            with object typed columns.
    """

    process_id = os.getpid()

    #a fresh dictionary per call. a shared mutable default could carry the
    #'algorithm' override made below from one call over to the next one.
    if config is None:
        config = {}

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError(
            'Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame'
        )

    base_df = df.copy()  #untouched snapshot. train set accuracy is found on this.
    df = base_df.copy()  #working copy. NaN replacement must not leak to the caller.

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            if idx.shape[0] > 0:
                #a value below the observed minimum stands for "missing"
                replacement = min_value - 1
                df.loc[idx, column] = replacement
                nan_values.append([column, replacement])
            else:
                nan_values.append([column, None])

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        #single message string. passing several arguments would render as a tuple.
        raise ValueError('Invalid algorithm passed. You passed ' +
                         str(algorithm) + ' but valid algorithms are ' +
                         str(valid_algorithms))

    #------------------------

    enableRandomForest = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']
    enableParallelism = config['enableParallelism']

    #------------------------

    if enableParallelism == True:
        print("[INFO]: ", config["num_cores"],
              "CPU cores will be allocated in parallel running")

    #------------------------

    target_is_nominal = df['Decision'].dtypes == 'object'

    if algorithm == 'Regression' and target_is_nominal:
        raise ValueError(
            'Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.'
        )

    if not target_is_nominal:  #this must be regression tree even if it is not mentioned in algorithm

        if algorithm != 'Regression':
            print(
                "WARNING: You set the algorithm to ", algorithm,
                " but the Decision column of your data set has non-object type."
            )
            print(
                "That's why, the algorithm is set to Regression to handle the data set."
            )

        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        #every column is checked, the target included
        for column_name in df.columns:
            if df[column_name].dtypes == 'object':
                raise ValueError(
                    'Adaboost must be run on numeric data set for both features and target'
                )

    #-------------------------

    print(algorithm, " tree is going to be built...")

    #feature name -> dtype. this is going to be used to check features numeric or nominal.
    dataset_features = dict()

    #the header documents which obj index maps to which feature. Decision is excluded.
    header_items = []
    for i, column_name in enumerate(df.columns[:-1]):
        dataset_features[column_name] = df[column_name].dtypes
        header_items.append("obj[" + str(i) + "]: " + column_name)

    header = "def findDecision(obj): #" + ", ".join(header_items) + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df,
                                       config,
                                       header,
                                       dataset_features,
                                       validation_df=validation_df)

    elif enableGBM == True:

        if target_is_nominal:  #transform classification problem to regression
            trees, alphas = gbm.classifier(df,
                                           config,
                                           header,
                                           dataset_features,
                                           validation_df=validation_df)

        else:  #regression
            trees = gbm.regressor(df,
                                  config,
                                  header,
                                  dataset_features,
                                  validation_df=validation_df)

    elif enableRandomForest == True:
        trees = randomforest.apply(df,
                                   config,
                                   header,
                                   dataset_features,
                                   validation_df=validation_df,
                                   process_id=process_id)
    else:  #regular decision tree building

        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df,
                                           root=root,
                                           file=file,
                                           config=config,
                                           dataset_features=dataset_features,
                                           parent_level=0,
                                           leaf_id=0,
                                           parents='root',
                                           validation_df=validation_df,
                                           main_process_id=process_id)

    print("-------------------------")
    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    #-----------------------------------------

    #train set accuracy. found on the snapshot taken before NaN replacement.
    evaluate(obj, base_df.copy(), task='train')

    #validation set accuracy
    if isinstance(validation_df, pd.DataFrame):
        evaluate(obj, validation_df, task='validation')

    #-----------------------------------------

    return obj
Example #17
0
def apply(df, config, header, dataset_features):
    """
    Build the weak classifiers of an Adaboost ensemble, one per epoch.

    Parameters:
        df (pandas data frame): training data with a numeric 'Decision'
            column. The loss is computed as abs(actual - prediction) / 2,
            so labels are presumably -1 / +1 -- confirm against the caller.
            Column assignments below align on the index, so df is
            presumably expected to carry a default 0..n-1 index.
            The frame itself is not modified.

        config (dictionary): 'num_of_weak_classifier' is read here; the
            dictionary is also handed to Training.buildDecisionTree.

        header (string): first content of every generated rules file.

        dataset_features (dictionary): handed to Training.buildDecisionTree
            as is.

    Returns:
        (models, alphas): the rules modules loaded for each epoch and the
        alpha coefficient computed for each epoch, both in epoch order.
    """

    models = []
    alphas = []

    initializeAlphaFile()

    num_of_weak_classifier = config['num_of_weak_classifier']

    #------------------------

    rows = df.shape[0]

    worksheet = df.copy()
    worksheet['Weight'] = 1 / rows  #uniform distribution initially

    #running weighted sum of the weak predictions next to the true labels
    final_predictions = pd.DataFrame(np.zeros((rows, 2)),
                                     columns=['Prediction', 'Actual'])
    final_predictions['Actual'] = df['Decision']

    pbar = tqdm(range(0, num_of_weak_classifier), desc='Adaboosting')
    for i in pbar:
        #the tree of this epoch is trained on the weighted target
        worksheet['Decision'] = worksheet['Weight'] * worksheet['Decision']

        root = 1
        file = "outputs/rules/rules_" + str(i) + ".py"

        functions.createFile(file, header)

        Training.buildDecisionTree(worksheet.drop(columns=['Weight']),
                                   root,
                                   file,
                                   config,
                                   dataset_features,
                                   parent_level=0,
                                   leaf_id=0,
                                   parents='root')

        #---------------------------------------

        #NOTE(review): imp is deprecated and was removed in Python 3.12.
        moduleName = "outputs/rules/rules_" + str(i)
        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname, description)
        finally:
            #the caller of find_module is responsible for closing the file
            if fp is not None:
                fp.close()
        models.append(myrules)

        #---------------------------------------

        #findPrediction receives the epoch as an extra column. assign works
        #on a copy, so the frame of the caller does not get an Epoch column.
        worksheet['Prediction'] = df.assign(Epoch=i).apply(findPrediction,
                                                           axis=1)

        #---------------------------------------
        worksheet['Actual'] = df['Decision']
        worksheet['Loss'] = abs(worksheet['Actual'] -
                                worksheet['Prediction']) / 2
        worksheet[
            'Weight_Times_Loss'] = worksheet['Loss'] * worksheet['Weight']

        epsilon = worksheet['Weight_Times_Loss'].sum()
        alpha = math.log(
            (1 - epsilon) /
            epsilon) / 2  #use alpha to update weights in the next round
        alphas.append(alpha)

        #-----------------------------

        #store alpha
        addEpochAlpha(i, alpha)

        #-----------------------------

        worksheet['Alpha'] = alpha
        worksheet['New_Weights'] = worksheet['Weight'] * (
            -alpha * worksheet['Actual'] * worksheet['Prediction']).apply(
                math.exp)

        #normalize
        worksheet['New_Weights'] = worksheet['New_Weights'] / worksheet[
            'New_Weights'].sum()
        worksheet['Weight'] = worksheet['New_Weights']
        worksheet['Decision'] = df['Decision']  #restore the raw target

        final_predictions['Prediction'] = final_predictions[
            'Prediction'] + worksheet['Alpha'] * worksheet['Prediction']

        #drop the helper columns of this epoch
        worksheet = worksheet.drop(columns=[
            'New_Weights', 'Prediction', 'Actual', 'Loss', 'Weight_Times_Loss',
            'Alpha'
        ])

        mae = (np.abs(final_predictions['Prediction'].apply(functions.sign) -
                      final_predictions['Actual']) /
               2).sum() / final_predictions.shape[0]
        #the loss is a fraction. %d would truncate it to 0 or 1.
        pbar.set_description("Epoch %d. Loss: %.4f. Process: " % (i + 1, mae))

    #------------------------------
    final_predictions['Prediction'] = final_predictions['Prediction'].apply(
        functions.sign)
    final_predictions['Absolute_Error'] = np.abs(
        final_predictions['Actual'] - final_predictions['Prediction']) / 2
    mae = final_predictions['Absolute_Error'].sum(
    ) / final_predictions.shape[0]
    print("Loss (MAE) found ", mae, " with ", num_of_weak_classifier,
          ' weak classifiers')

    return models, alphas
Example #18
0
def apply(df, config, header, dataset_features):
    """
    Build a random forest and, for classification, report its accuracy.

    One decision tree is built per sub data set; each sub data set is a
    random sample holding 1 / num_of_trees of the rows of df. If the
    'Decision' column has object type, every row of df is then classified
    by majority vote of the trees and the accuracy is printed.

    Parameters:
        df (pandas data frame): training data, 'Decision' is the last column

        config (dictionary): 'num_of_trees' is read here; the dictionary is
            also handed to Training.buildDecisionTree.

        header (string): first content of every generated rules file.

        dataset_features (dictionary): handed to Training.buildDecisionTree
            as is.

    Returns:
        list of the loaded rules modules, one per tree
    """

    models = []

    num_of_trees = config['num_of_trees']

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')

    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"
        json_file = moduleName + ".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        Training.buildDecisionTree(subset,
                                   root,
                                   file,
                                   config,
                                   dataset_features,
                                   parent_level=0,
                                   leaf_id=0,
                                   parents='root')

        functions.storeRule(json_file, "{}]")

        #--------------------------------

        #NOTE(review): imp is deprecated and was removed in Python 3.12.
        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname, description)
        finally:
            #the caller of find_module is responsible for closing the file
            if fp is not None:
                fp.close()
        models.append(myrules)

    #-------------------------------
    #check regression or classification. nothing is evaluated for regression.
    if df['Decision'].dtypes != 'object':
        return models

    actual_values = df['Decision'].values
    num_of_features = df.shape[1] - 1  #discard Decision
    number_of_instances = df.shape[0]

    #feature values of every row, read by position and built only once
    feature_rows = [[instance.iloc[j] for j in range(num_of_features)]
                    for _, instance in df.iterrows()]

    #one list of predictions per tree. the modules loaded above are reused.
    global_predictions = [[myrules.findDecision(params)
                           for params in feature_rows]
                          for myrules in models]

    #-------------------------------
    classified = 0
    for index, actual in enumerate(actual_values):

        #trees might return None for an instance. those do not vote.
        votes = [
            tree_predictions[index] for tree_predictions in global_predictions
            if tree_predictions[index] is not None
        ]

        prediction = None
        if votes:
            #unique values come sorted and argmax returns the first maximum,
            #so a tie goes to the smallest label
            unique_values, counts = np.unique(np.array(votes),
                                              return_counts=True)
            prediction = unique_values[np.argmax(counts)]

        if actual == prediction:
            classified = classified + 1

    print("Accuracy: ", 100 * classified / number_of_instances, "% on ",
          number_of_instances, " instances")

    return models