Python createFile Examples

Programming Language: Python

Namespace/Package Name: chefboost.commons.functions

Method/Function: createFile

Examples at hotexamples.com: 18

Python createFile - 18 examples found. These are the top-rated real-world Python examples of chefboost.commons.functions.createFile, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: Training.py Project: iketutg/chefboost

def reconstructRules(source):
    """Rebuild a Python rule module from the JSON rule dump at ``source``.

    The output path is ``source`` with everything from the first ".json"
    onwards replaced by ".py". The file is (re)created with a one-line header
    comment, followed by ``def findDecision(obj):`` and then every stored rule,
    indented with one tab per nesting level.

    Each non-empty JSON entry must provide "current_level", "leaf_id",
    "parents" and "rule". Nesting is recovered by following "parents" links
    starting from the literal parent id 'root'.
    """
    target = source.split(".json")[0] + ".py"

    functions.createFile(
        target, "#This rule was reconstructed from " + source + "\n")

    with open(source, 'r') as handle:
        entries = json.load(handle)

    def padleft(rule, level):
        # prefix the rule with one tab per nesting level
        return "\t" * level + rule

    # the json file might not store the rules in order, so gather them all
    # first; empty placeholder entries are skipped
    rows = [
        [entry["current_level"], entry["leaf_id"],
         entry["parents"], entry["rule"]]
        for entry in entries
        if len(entry) > 0
    ]
    table = np.array(rows)

    def extractRules(df, parent='root', level=1):
        # depth-first walk: write each child of `parent`, then its subtree
        for row in range(df.shape[0]):
            leaf_id = df[row][1]
            parent_id = df[row][2]
            rule = df[row][3]

            if not (parent_id == parent):
                continue

            functions.storeRule(target, padleft(rule, level))
            extractRules(df, copy.copy(leaf_id), level + 1)

    functions.storeRule(target, "def findDecision(obj):")
    extractRules(table)

Example #2

Show file

File: randomforest.py Project: LeeEnsub/chefboost

def apply(df, config, header, dataset_features, validation_df = None):
	"""Train a bagged ensemble of decision trees and return their rule modules.

	``config['num_of_trees']`` trees are built. Each one is trained on a random
	``1/num_of_trees`` fraction of ``df`` (drawn with ``df.sample``) and written
	to ``outputs/rules/rule_<i>.py``, which is first created with ``header``.

	When ``config["enableParallelism"]`` is truthy the per-tree arguments are
	collected and dispatched through ``Training.MyPool(config["num_cores"])``;
	otherwise every tree is built immediately inside the loop.

	``validation_df`` is accepted but not used by this function.

	Returns the list of modules loaded from the generated rule files, in tree
	order.
	"""
	num_of_trees = config['num_of_trees']
	parallelism_on = config["enableParallelism"]
	
	#TODO: is this logical for 48x2 cores?
	#TODO: reconstruct for parallel run is problematic. you should reconstruct based on tree id.
	
	input_params = []
	
	pbar = tqdm(range(0, num_of_trees), desc='Bagging')
	for i in pbar:
		pbar.set_description("Sub decision tree %d is processing" % (i+1))
		subset = df.sample(frac=1/num_of_trees)
		
		root = 1
		
		rule_file = "outputs/rules/rule_"+str(i)+".py"
		functions.createFile(rule_file, header)
		
		if parallelism_on: #parallel run
			#same values as the serial call below, passed positionally
			#NOTE(review): assumes parent_level, leaf_id, parents, tree_id are the next positional parameters - confirm
			input_params.append((subset, root, rule_file, config, dataset_features, 0, 0, 'root', i))
		
		else: #serial run
			Training.buildDecisionTree(subset, root, rule_file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', tree_id = i)
	
	#-------------------------------
	
	if parallelism_on:
		pool = Training.MyPool(config["num_cores"])
		#reference the builder through the Training module, exactly like the serial path;
		#the bare name buildDecisionTree is not otherwise used in this function
		pool.starmap(Training.buildDecisionTree, input_params)
		pool.close()
		pool.join()
	
	#-------------------------------
	#collect models for both serial and parallel here
	#NOTE(review): the imp module is deprecated and was removed in Python 3.12
	models = []
	for i in range(0, num_of_trees):
		moduleName = "outputs/rules/rule_"+str(i)
		fp, pathname, description = imp.find_module(moduleName)
		models.append(imp.load_module(moduleName, fp, pathname, description))
	
	return models

Example #3

Show file

File: randomforest.py Project: Sandy4321/chefboost

def apply(df, config, header, dataset_features, validation_df=None):
    """Build ``config['num_of_trees']`` trees serially and return their modules.

    For each tree a random ``1/num_of_trees`` fraction of ``df`` is sampled,
    ``outputs/rules/rule_<i>.py`` is created with ``header`` and
    ``outputs/rules/rule_<i>.json`` with an opening bracket. The tree is then
    trained with ``Training.buildDecisionTree``, the JSON file is terminated
    with "{}]", and the generated module is loaded through ``imp``.

    ``validation_df`` is accepted but not used by this function.
    """
    tree_count = config['num_of_trees']
    loaded = []

    progress = tqdm(range(0, tree_count), desc='Bagging')

    for tree_no in progress:
        progress.set_description(
            "Sub decision tree %d is processing" % (tree_no + 1))
        sample = df.sample(frac=1 / tree_count)

        root = 1

        module_path = "outputs/rules/rule_" + str(tree_no)
        rule_file = module_path + ".py"
        json_file = module_path + ".json"

        # start both output files before the builder appends to them
        functions.createFile(rule_file, header)
        functions.createFile(json_file, "[\n")

        Training.buildDecisionTree(sample, root, rule_file, config,
                                   dataset_features,
                                   parent_level=0, leaf_id=0, parents='root')

        # close the JSON array; the empty object absorbs a trailing separator
        functions.storeRule(json_file, "{}]")

        # load the freshly written rule file as a module
        located = imp.find_module(module_path)
        loaded.append(imp.load_module(module_path, *located))

    return loaded

Example #4

Show file

def reconstructRules(source, feature_names):
	"""Rebuild a Python rule module from the JSON rule dump at ``source``.

	The output path is ``source`` with everything from the first ".json"
	onwards replaced by ".py". The file is (re)created with a single
	``def findDecision(obj):`` line whose trailing comment maps every
	``obj[i]`` to the i-th entry of ``feature_names``; the stored rules follow,
	indented with one tab per nesting level.

	Each non-empty JSON entry must provide "current_level", "leaf_id",
	"parents", "rule", "feature_name", "instances", "metric" and
	"return_statement". Nesting is recovered by following "parents" links
	starting from the literal parent id 'root'.
	"""
	target = source.split(".json")[0]+".py"
	
	#-----------------------------------
	#function header, annotated with the index -> feature name mapping
	
	mapping = ", ".join(
		"obj["+str(position)+"]: "+feature
		for position, feature in enumerate(feature_names)
	)
	functions.createFile(target, "def findDecision(obj): #"+mapping+"\n")
	
	#-----------------------------------
	
	with open(source, 'r') as handle:
		entries = json.load(handle)
	
	def padleft(rule, level):
		#prefix the rule with one tab per nesting level
		return "\t" * level + rule
	
	#json file might not store rules in order; empty placeholder entries are skipped
	rows = [
		[
			entry["current_level"],
			entry["leaf_id"],
			entry["parents"],
			entry["rule"],
			entry["feature_name"],
			entry["instances"],
			entry["metric"],
			entry["return_statement"],
		]
		for entry in entries
		if len(entry) > 0
	]
	
	table = np.array(rows)
	
	def extractRules(df, parent = 'root', level=1):
		#depth-first walk over the children of `parent`; an "else:" child is
		#held back and written after all of its siblings
		pending_else = ""
		written = 0
		
		for row in range(0, df.shape[0]):
			record = df[row]
			#every row is converted up front, matching rows or not
			current_level = int(record[0])
			leaf_id = record[1]
			parent_id = record[2]
			rule = record[3]
			feature_name = record[4]
			instances = int(record[5])
			metric = float(record[6])
			return_statement = int(record[7])
			
			if not (parent_id == parent):
				continue
			
			if rule[0:5] == "else:":
				pending_else = rule
				continue
			
			#only the first sibling stays a plain "if"; later ones become "elif"
			if rule[0:2] == "if" and written > 0:
				rule = "el"+rule
			
			#describe the split once, above the first branch of a non-leaf node
			if written == 0 and return_statement == 0:
				explainer = {
					"feature": feature_name,
					"instances": instances,
					"metric_value": round(metric, 4),
					"depth": current_level,
				}
				functions.storeRule(target, padleft("# "+json.dumps(explainer), level))
			
			functions.storeRule(target, padleft(rule, level))
			
			extractRules(df, copy.copy(leaf_id), level + 1)
			
			written = written + 1
		
		#add else statement
		if pending_else != "":
			functions.storeRule(target, padleft(pending_else, level))
	
	#------------------------------------
	
	extractRules(table)

Example #5

Show file

def buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', validation_df = None):
	"""Write the rules for one tree level of ``df`` and recurse via createBranch.

	Steps performed:
	  1. findDecision(df, config) picks the splitting column (``winner_name``).
	  2. For every distinct value of that column a sub-dataset without the
	     column is created and handed to createBranch, either directly
	     (serial) or queued and run through MyPool.starmap (parallel).
	  3. A fallback "else" rule is emitted: appended to ``file`` in serial
	     mode, or stored in its own outputs/rules/<uuid>.txt in parallel mode.
	  4. Only when ``root == 1``: in parallel mode the .txt rule files are
	     merged into the json file, deleted, and reconstructRules /
	     feature_importance are called; for a plain decision tree (no random
	     forest, GBM or Adaboost) the module outputs/rules/rules is loaded
	     with ``imp`` and appended to the returned list.

	Parameters:
	  df: pandas DataFrame; must contain a ``Decision`` column (read below).
	  root: current depth, 1 for the top-level call.
	  file: path of the rule file; the json file name is derived from it.
	  config: dict read for 'enableParallelism', 'algorithm', 'num_cores',
	    'enableRandomForest', 'enableGBM' and 'enableAdaboost'.
	  dataset_features: mapping indexed by column name that yields the column
	    dtype; its iteration order defines the feature index.
	  parent_level, leaf_id, validation_df: not read in this function
	    (``leaf_id`` is overwritten in the parallel else-branch).
	  parents: id of the parent rule, stored in the generated else-rule.

	Returns:
	  list: empty unless the plain-decision-tree branch at root == 1 ran.
	"""
	
	models = []
	feature_names = df.columns[0:-1]
	
	enableParallelism = config['enableParallelism']
	algorithm = config['algorithm']
	
	#the json file shares the rule file name up to its first dot
	json_file = file.split(".")[0]+".json"
	
	if root == 1:
		if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
			#NOTE(review): raw_df is not read again anywhere in this function
			raw_df = df.copy()
	
	#--------------------------------------
	
	#snapshot taken before findDecision, used by the restoration loop below
	df_copy = df.copy()
	
	winner_name, num_of_instances, metric, metric_name = findDecision(df, config)
	
	#find winner index, this cannot be returned by find decision because columns dropped in previous steps
	#NOTE(review): winner_index stays unbound if winner_name is not among dataset_features
	j = 0 
	for i in dataset_features:
		if i == winner_name:
			winner_index = j
		j = j + 1
	
	numericColumn = False
	if dataset_features[winner_name] != 'object':
		numericColumn = True
	
	#restoration
	#every non-object column except the winner gets its values back from the snapshot
	#(presumably findDecision rewrites numeric columns in place - TODO confirm)
	columns = df.shape[1]
	for i in range(0, columns-1):
		column_name = df.columns[i]; column_type = df[column_name].dtypes
		if column_type != 'object' and column_name != winner_name:
			df[column_name] = df_copy[column_name]
	
	#distinct values of the winner column, in value_counts() order
	classes = df[winner_name].value_counts().keys().tolist()
		
	#-----------------------------------------------------
	
	num_cores = config["num_cores"]
	
	#argument tuples for createBranch, only filled in parallel mode
	input_params = []
	
	#serial approach
	for i in range(0,len(classes)):
		current_class = classes[i]
		subdataset = df[df[winner_name] == current_class]
		subdataset = subdataset.drop(columns=[winner_name])
		branch_index = i * 1
		
		#create branches serially
		if enableParallelism != True:
			
			if i == 0:
				#descriptor = "# Feature: "+winner_name+", Instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
				
				#one json comment describing this split, written before its first branch
				descriptor = {
					"feature": winner_name,
					"instances": num_of_instances,
					#"metric_name": metric_name,
					"metric_value": round(metric, 4),
					"depth": parent_level + 1
				}
				descriptor = "# "+json.dumps(descriptor)
				
				functions.storeRule(file, (functions.formatRule(root), "", descriptor))
			
			createBranch(config, current_class, subdataset, numericColumn, branch_index
				, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric)
		else:
			input_params.append((config, current_class, subdataset, numericColumn, branch_index
				, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric))
	
	#---------------------------
	#add else condition in the decision tree
	#NOTE(review): subdataset here is whatever the last loop iteration left behind,
	#so the fallback is computed from the last class only - confirm this is intended
	
	if df.Decision.dtypes == 'object': #classification
		#most frequent Decision value becomes the fallback answer
		pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
		pivot = pivot.rename(columns = {"Decision": "Instances","index": "Decision"})
		pivot = pivot.sort_values(by = ["Instances"], ascending = False).reset_index()
		
		else_decision = "return '%s'" % (pivot.iloc[0].Decision)
		
		if enableParallelism != True:
			functions.storeRule(file,(functions.formatRule(root), "else:"))
			functions.storeRule(file,(functions.formatRule(root+1), else_decision))
		else: #parallelism
			#each rule goes to its own uuid-named file so workers never share a file
			leaf_id = str(uuid.uuid1())
			custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
			
			check_rule = "else: "+else_decision
			
			sample_rule = {}
			sample_rule["current_level"] = root
			sample_rule["leaf_id"] = leaf_id
			sample_rule["parents"] = parents
			sample_rule["rule"] = check_rule
			sample_rule["feature_idx"] = -1
			sample_rule["feature_name"] = ""
			sample_rule["instances"] = df.shape[0]
			sample_rule["metric"] = 0
			sample_rule["return_statement"] = 0
			
			#json to string
			sample_rule = json.dumps(sample_rule)
			
			functions.createFile(custom_rule_file, "")
			functions.storeRule(custom_rule_file, sample_rule)
			
	else: #regression
		#mean Decision value becomes the fallback answer
		else_decision = "return %s" % (subdataset.Decision.mean())
				
		if enableParallelism != True:
			functions.storeRule(file,(functions.formatRule(root), "else:"))
			functions.storeRule(file,(functions.formatRule(root+1), else_decision))
		else:
			leaf_id = str(uuid.uuid1())
			custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
			
			check_rule = "else: "+else_decision
			
			#NOTE(review): unlike the classification branch this hand-built json has only
			#four keys (no feature_name / instances / metric / return_statement)
			sample_rule = "   {\n"
			sample_rule += "      \"current_level\": "+str(root)+",\n"
			sample_rule += "      \"leaf_id\": \""+str(leaf_id)+"\",\n"
			sample_rule += "      \"parents\": \""+parents+"\",\n"
			sample_rule += "      \"rule\": \""+check_rule+"\"\n"
			sample_rule += "   }"
			
			functions.createFile(custom_rule_file, "")
			functions.storeRule(custom_rule_file, sample_rule)
	
	#---------------------------
	
	#create branches in parallel
	if enableParallelism == True:
		"""
		#this usage causes trouble for recursive functions
		with Pool(number_of_cpus) as pool:
			pool.starmap(createBranch, input_params)
		"""
		
		pool = MyPool(num_cores)
		results = pool.starmap(createBranch, input_params)
		pool.close()
		pool.join()
	
	#---------------------------------------------
	
	#post-processing runs once, on the top-level call only
	if root == 1:
		
		if enableParallelism == True:

			#custom rules are stored in .txt files. merge them all in a json file
			
			functions.createFile(json_file, "[\n")
			
			custom_rules = []
			
			file_index = 0
			#NOTE(review): this loop variable rebinds the `file` parameter
			for file in os.listdir(os.getcwd()+"/outputs/rules"):
				if file.endswith(".txt"):
					custom_rules.append(os.getcwd()+"/outputs/rules/"+file)
					#print(file) #this file stores a custom rule
					f = open(os.getcwd()+"/outputs/rules/"+file, "r")
					custom_rule = f.read()
					
					#comma-separate every entry after the first
					if file_index > 0:
						custom_rule = ", "+custom_rule
					
					functions.storeRule(json_file, custom_rule)
					f.close()
					file_index = file_index + 1
					
			functions.storeRule(json_file, "]")
			
			#-----------------------------------
			
			#custom rules are already merged in a json file. clear messy custom rules
			#TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.
			
			for file in custom_rules:
				os.remove(file)
			
			#-----------------------------------
			
			reconstructRules(json_file, feature_names)

			#feature importance should be calculated by demand?
			feature_importance(json_file, dataset_features)
			
			#-----------------------------------
		
		#is regular decision tree
		if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
		#this is a regular decision tree: load the generated rules module and return it
			
			moduleName = "outputs/rules/rules"
			fp, pathname, description = imp.find_module(moduleName)
			myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
			models.append(myrules)
			
	return models

Example #6

Show file

def createBranch(config, current_class, subdataset, numericColumn, branch_index
	, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric):
	"""Emit the rule for one branch of a split, then either close it or recurse.

	The branch condition is ``obj[winner_index]`` compared against
	``current_class``. In serial mode rules are appended to ``file`` through
	functions.storeRule; in parallel mode every rule is stored as json in
	outputs/rules/<uuid>.txt instead.

	If one of the stopping conditions below holds, a ``return`` rule with the
	final decision is written; otherwise buildDecisionTree is called on
	``subdataset`` one level deeper.

	Parameters:
	  config: dict read for 'algorithm', 'enableAdaboost', 'enableGBM',
	    'max_depth' and 'enableParallelism'.
	  current_class: value of the winner column selecting this branch; for a
	    numeric column it is used verbatim as the comparison text.
	  subdataset: rows of this branch; must contain a ``Decision`` column.
	  numericColumn: whether the winner column is non-object typed.
	  branch_index: position among sibling branches (0 yields "if", others
	    "elif" in serial mode).
	  winner_name, winner_index: name and feature index of the split column.
	  root: current depth.
	  parents: id of the parent rule.
	  file: rule file path.
	  dataset_features: forwarded unchanged to buildDecisionTree.
	  num_of_instances, metric: stored in the rule metadata (parallel mode).

	Returns:
	  None.
	"""
	
	algorithm = config['algorithm']
	enableAdaboost = config['enableAdaboost']
	enableGBM = config['enableGBM']
	max_depth = config['max_depth']
	enableParallelism = config['enableParallelism']
	
	#classification answers are quoted in the generated code, regression answers are not
	charForResp = "'"
	if algorithm == 'Regression':
		charForResp = ""
	
	#---------------------------
	
	#NOTE(review): json_file is not read again in this function
	json_file = file.split(".")[0]+".json"
	
	#saved so root / parents can be restored after the recursive call
	tmp_root = root * 1
	parents_raw = copy.copy(parents)
	
	#---------------------------
	
	if numericColumn == True:
		compareTo = current_class #current class might be <=x or >x in this case
	else:
		#NOTE(review): a class value containing a single quote would break the generated rule
		compareTo = " == '"+str(current_class)+"'"
	
	#print(subdataset)
	
	terminateBuilding = False
	
	#-----------------------------------------------
	#can decision be made?
	
	if enableGBM == True and root >= max_depth: #max depth
		final_decision = subdataset['Decision'].mean()
		terminateBuilding = True
	elif enableAdaboost == True:
		#adaboost always terminates here and is forced onto the serial code path
		#final_decision = subdataset['Decision'].value_counts().idxmax()
		final_decision = functions.sign(subdataset['Decision'].mean()) #get average
		terminateBuilding = True
		enableParallelism = False
	elif len(subdataset['Decision'].value_counts().tolist()) == 1:
		final_decision = subdataset['Decision'].value_counts().keys().tolist()[0] #all items are equal in this case
		terminateBuilding = True
	elif subdataset.shape[1] == 1: #if decision cannot be made even though all columns dropped
		final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one
		terminateBuilding = True
	elif algorithm == 'Regression' and subdataset.shape[0] < 5: #pruning condition
	#elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
		final_decision = subdataset['Decision'].mean() #get average
		terminateBuilding = True
	
	#-----------------------------------------------
	
	if enableParallelism == True:
		check_condition = "if" #TODO: elif checks might be above than if statements in parallel
	else:	
		if branch_index == 0:
			check_condition = "if"
		else:
			check_condition = "elif"
	
	check_rule = check_condition+" obj["+str(winner_index)+"]"+compareTo+":"
	
	#a fresh id for this rule; it becomes the parent id of whatever is nested under it
	leaf_id = str(uuid.uuid1())
	custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
		
	if enableParallelism != True:
		
		#check_rule += " # feature: "+winner_name+", instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
		
		functions.storeRule(file,(functions.formatRule(root),"",check_rule))
	else:
		
		sample_rule = {}
		sample_rule["current_level"] = root
		sample_rule["leaf_id"] = leaf_id
		sample_rule["parents"] = parents
		sample_rule["rule"] = check_rule
		sample_rule["feature_idx"] = winner_index
		sample_rule["feature_name"] = winner_name
		sample_rule["instances"] = num_of_instances
		sample_rule["metric"] = metric
		sample_rule["return_statement"] = 0
		
		#json to string
		sample_rule = json.dumps(sample_rule)
	
		functions.createFile(custom_rule_file, "")
		functions.storeRule(custom_rule_file, sample_rule)
	
	#-----------------------------------------------
	
	if terminateBuilding == True: #check decision is made
		
		#the return rule is a child of the condition rule written above
		parents = copy.copy(leaf_id)
		leaf_id = str(uuid.uuid1())
		
		decision_rule = "return "+charForResp+str(final_decision)+charForResp
		
		if enableParallelism != True:
			#serial
			functions.storeRule(file,(functions.formatRule(root+1),decision_rule))
		else:
			#parallel
			sample_rule = {}
			sample_rule["current_level"] = root+1
			sample_rule["leaf_id"] = leaf_id
			sample_rule["parents"] = parents
			sample_rule["rule"] = decision_rule
			sample_rule["feature_idx"] = winner_index
			sample_rule["feature_name"] = winner_name
			sample_rule["instances"] = num_of_instances
			sample_rule["metric"] = 0
			sample_rule["return_statement"] = 1
			
			#json to string
			#leading comma: this is the second json object stored in the same .txt file
			sample_rule = ", "+json.dumps(sample_rule)
			
			functions.storeRule(custom_rule_file, sample_rule)
	
	else: #decision is not made, continue to create branch and leafs
		root = root + 1 #the following rule will be included by this rule. increase root
		parents = copy.copy(leaf_id)
		
		#positional tail: parent_level = root-1, leaf_id, parents
		buildDecisionTree(subdataset, root, file, config, dataset_features
			, root-1, leaf_id, parents)
					
		root = tmp_root * 1
		parents = copy.copy(parents_raw)

Example #7

Show file

def reconstructRules(source):
    """Rebuild a Python rule module from the JSON rule dump at ``source``.

    The output path is ``source`` with everything from the first ".json"
    onwards replaced by ".py". The file is (re)created with a one-line header
    comment, followed by ``def findDecision(obj):`` and then every stored rule,
    indented with one tab per nesting level. Among siblings, every "if" after
    the first is rewritten to "elif" and an "else:" rule is always written
    last.

    Each non-empty JSON entry must provide "current_level", "leaf_id",
    "parents" and "rule". Nesting is recovered by following "parents" links
    starting from the literal parent id 'root'.
    """
    target = source.split(".json")[0] + ".py"

    functions.createFile(
        target, "#This rule was reconstructed from " + source + "\n")

    with open(source, 'r') as handle:
        entries = json.load(handle)

    def padleft(rule, level):
        # prefix the rule with one tab per nesting level
        return "\t" * level + rule

    # the json file might not store the rules in order, so gather them all
    # first; empty placeholder entries are skipped
    rows = [
        [entry["current_level"], entry["leaf_id"],
         entry["parents"], entry["rule"]]
        for entry in entries
        if len(entry) > 0
    ]
    table = np.array(rows)

    def extractRules(df, parent='root', level=1):
        # depth-first walk over the children of `parent`; an "else:" child is
        # held back and written after all of its siblings
        pending_else = ""
        written = 0

        for row in range(df.shape[0]):
            leaf_id = df[row][1]
            parent_id = df[row][2]
            rule = df[row][3]

            if not (parent_id == parent):
                continue

            if rule[0:5] == "else:":
                pending_else = rule
                continue

            # only the first sibling stays a plain "if"
            if rule[0:2] == "if" and written > 0:
                rule = "el" + rule

            functions.storeRule(target, padleft(rule, level))
            extractRules(df, copy.copy(leaf_id), level + 1)

            written = written + 1

        # add else statement
        if pending_else != "":
            functions.storeRule(target, padleft(pending_else, level))

    #------------------------------------

    functions.storeRule(target, "def findDecision(obj):")
    extractRules(table)

Example #8

Show file

def buildDecisionTree(df,
                      root,
                      file,
                      config,
                      dataset_features,
                      parent_level=0,
                      leaf_id=0,
                      parents='root'):
    """Write the rules for one tree level of ``df`` and recurse via createBranch.

    Steps performed:
      1. findDecision(df, config) picks the splitting column (``winner_name``).
      2. For every distinct value of that column a sub-dataset without the
         column is created and handed to createBranch, either directly
         (serial) or queued and run through MyPool.starmap (parallel).
      3. A fallback "else" rule is emitted: appended to ``file`` in serial
         mode, or stored in its own outputs/rules/<uuid>.txt in parallel mode.
      4. Only when ``root == 1``: in parallel mode the .txt rule files are
         merged into the json file, deleted, and reconstructRules is called;
         for a plain decision tree (no random forest, GBM or Adaboost) the
         module outputs/rules/rules is loaded with ``imp`` and accuracy
         (classification) or MAE / RMSE (regression) on the training data is
         printed.

    Parameters:
      df: pandas DataFrame; must contain a ``Decision`` column (read below).
      root: current depth, 1 for the top-level call.
      file: path of the rule file; the json file name is derived from it.
      config: dict read for 'enableParallelism', 'algorithm',
        'enableRandomForest', 'enableGBM' and 'enableAdaboost'.
      dataset_features: mapping indexed by column name that yields the column
        dtype; its iteration order defines the feature index.
      parent_level, leaf_id: not read in this function (``leaf_id`` is
        overwritten in the parallel else-branch).
      parents: id of the parent rule, stored in the generated else-rule.

    Returns:
      list: empty unless the plain-decision-tree branch at root == 1 ran.
    """

    models = []

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    #the json file shares the rule file name up to its first dot
    json_file = file.split(".")[0] + ".json"

    if root == 1:
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #untouched copy of the training data, scored at the end of this call
            raw_df = df.copy()

    #--------------------------------------

    #snapshot taken before findDecision, used by the restoration loop below
    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    #NOTE(review): winner_index stays unbound if winner_name is not among dataset_features
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    #every non-object column except the winner gets its values back from the snapshot
    #(presumably findDecision rewrites numeric columns in place - TODO confirm)
    columns = df.shape[1]
    for i in range(0, columns - 1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    #distinct values of the winner column, in value_counts() order
    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    #TO-DO: you should specify the number of cores in config
    num_cores = int(multiprocessing.cpu_count() /
                    2)  #allocate half of your total cores

    #argument tuples for createBranch, only filled in parallel mode
    input_params = []

    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:
            createBranch(config, current_class, subdataset, numericColumn,
                         branch_index, winner_index, root, parents, file,
                         dataset_features)
        else:
            input_params.append((config, current_class, subdataset,
                                 numericColumn, branch_index, winner_index,
                                 root, parents, file, dataset_features))

    #---------------------------
    #add else condition in the decision tree
    #NOTE(review): subdataset here is whatever the last loop iteration left behind,
    #so the fallback is computed from the last class only - confirm this is intended

    if df.Decision.dtypes == 'object':  #classification
        #most frequent Decision value becomes the fallback answer
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            #each rule goes to its own uuid-named file so workers never share a file
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

            check_rule = "else: " + else_decision

            #hand-built json object describing the else rule
            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    else:  #regression
        #mean Decision value becomes the fallback answer
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    #---------------------------

    #create branches in parallel
    if enableParallelism == True:
        """
		#this usage causes trouble for recursive functions
		with Pool(number_of_cpus) as pool:
			pool.starmap(createBranch, input_params)
		"""

        pool = MyPool(num_cores)
        results = pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------

    #calculate accuracy metrics
    #post-processing runs once, on the top-level call only
    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in .txt files. merge them all in a json file

            functions.createFile(json_file, "[\n")

            custom_rules = []

            file_index = 0
            #NOTE(review): this loop variable rebinds the `file` parameter
            for file in os.listdir(os.getcwd() + "/outputs/rules"):
                if file.endswith(".txt"):
                    custom_rules.append(os.getcwd() + "/outputs/rules/" + file)
                    #print(file) #this file stores a custom rule
                    f = open(os.getcwd() + "/outputs/rules/" + file, "r")
                    custom_rule = f.read()

                    #comma-separate every entry after the first
                    if file_index > 0:
                        custom_rule = ", " + custom_rule

                    functions.storeRule(json_file, custom_rule)
                    f.close()
                    file_index = file_index + 1

            functions.storeRule(json_file, "]")

            #-----------------------------------

            #custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.

            for file in custom_rules:
                os.remove(file)

            #-----------------------------------

            reconstructRules(json_file)

            #-----------------------------------

        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #this is a regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

            #NOTE(review): num_of_features and classified are not read again below
            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            #instead of for loops, pandas functions perform well
            #findPrediction is applied to every row of the untouched training copy
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            if algorithm != 'Regression':
                #rows whose prediction equals the label
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                #raw_df['Classified'] = 0
                #raw_df.loc[idx, 'Classified'] = 1
                #print(raw_df)

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances, " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] -
                                               raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df[
                    'Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                #relative errors are only printed for a positive mean
                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models

Example #9

Show file

File: Chefboost.py Project: diana-dr/Artificial-Intelligence

def fit(df, config):
	"""Train a chefboost model on a pandas data frame.

	Parameters:
		df (pandas data frame): training set. The last column must be named
			'Decision'. NaN values of numeric columns are replaced in place.
		config (dictionary): training configuration. It is passed through
			functions.initializeParams before any key is read.

	Returns:
		dictionary with the keys "trees", "alphas", "config" and "nan_values".
		"nan_values" holds one [column, replacement] pair per numeric column;
		replacement is None when the column had no NaN.

	Raises:
		ValueError: if the last column is not 'Decision', if the algorithm is
			not a valid one, if a regression tree is requested for a nominal
			target, or if adaboost is requested on a data set with nominal columns.
	"""

	target_label = df.columns[len(df.columns)-1]
	if target_label != 'Decision':
		print("Expected: Decision, Existing: ",target_label)
		raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame')

	#------------------------
	#handle NaN values

	nan_values = []

	for column in df.columns:
		if df[column].dtypes != 'object':
			min_value = df[column].min()
			idx = df[df[column].isna()].index

			if idx.shape[0] > 0:
				#missing entries are replaced with a value lower than the column minimum
				replacement = min_value - 1
				df.loc[idx, column] = replacement
			else:
				replacement = None

			nan_values.append([column, replacement])

	#------------------------

	#initialize params and folders
	config = functions.initializeParams(config)
	functions.initializeFolders()

	#------------------------

	algorithm = config['algorithm']

	valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

	if algorithm not in valid_algorithms:
		#single formatted message; passing several arguments to ValueError renders as a tuple
		raise ValueError('Invalid algorithm passed. You passed {} but valid algorithms are {}'.format(algorithm, valid_algorithms))

	#------------------------

	enableRandomForest = config['enableRandomForest']
	enableGBM = config['enableGBM']
	enableAdaboost = config['enableAdaboost']
	enableParallelism = config['enableParallelism']

	#parallelism is switched off whenever random forest is enabled
	if enableRandomForest == True:
		config['enableParallelism'] = False
		enableParallelism = False

	#------------------------

	if algorithm == 'Regression':
		if df['Decision'].dtypes == 'object':
			raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')

	if df['Decision'].dtypes != 'object': #this must be regression tree even if it is not mentioned in algorithm
		algorithm = 'Regression'
		config['algorithm'] = 'Regression'

	if enableGBM == True:
		print("Gradient Boosting Machines...")
		algorithm = 'Regression'
		config['algorithm'] = 'Regression'

	if enableAdaboost == True:
		#every column, including the target, has to be numeric
		for column_name in df.columns:
			if df[column_name].dtypes == 'object':
				raise ValueError('Adaboost must be run on numeric data set for both features and target')

	#-------------------------

	print(algorithm," tree is going to be built...")

	#feature name -> dtype. this is going to be used to check features numeric or nominal.
	dataset_features = dict()

	#header of the generated rule module; the trailing comment maps obj indexes to feature names
	feature_descriptions = []
	for i, column_name in enumerate(df.columns[:-1]):
		dataset_features[column_name] = df[column_name].dtypes
		feature_descriptions.append("obj[" + str(i) + "]: " + column_name)

	header = "def findDecision(obj): #" + ", ".join(feature_descriptions) + "\n"

	#------------------------

	begin = time.time()

	trees = []; alphas = []

	if enableAdaboost == True:
		trees, alphas = adaboost.apply(df, config, header, dataset_features)

	elif enableGBM == True:

		if df['Decision'].dtypes == 'object': #transform classification problem to regression
			trees, alphas = gbm.classifier(df, config, header, dataset_features)
		else: #regression
			trees = gbm.regressor(df, config, header, dataset_features)

	elif enableRandomForest == True:
		trees = randomforest.apply(df, config, header, dataset_features)
	else: #regular decision tree building

		root = 1; file = "outputs/rules/rules.py"
		functions.createFile(file, header)

		if enableParallelism == True:
			json_file = "outputs/rules/rules.json"
			functions.createFile(json_file, "[\n")

		trees = Training.buildDecisionTree(df,root,file, config, dataset_features
			, 0, 0, 'root')

	print("finished in ",time.time() - begin," seconds")

	obj = {
		"trees": trees,
		"alphas": alphas,
		"config": config,
		"nan_values": nan_values
	}

	return obj

Example #10

Show file

File: gbm.py Project: ssgantayat/chefboost

def regressor(df, config, header, dataset_features, validation_df = None, process_id = None):
	"""Build gradient boosted regression trees and return the rule modules up to the best epoch.

	For every epoch the rules of the previous round are loaded, their
	predictions are added to the boosted prediction, the target is replaced by
	the residual, the data set is stored under outputs/data and a new rule
	file is built under outputs/rules.

	Parameters:
		df (pandas data frame): training set with a 'Decision' column
		config (dictionary): 'epochs' and 'learning_rate' are read here;
			config["epochs"] is overwritten with the best epoch index
		header (string): first line written into each generated rule file
		dataset_features (dictionary): forwarded to Training.buildDecisionTree
		validation_df: not used in this function
		process_id: forwarded as main_process_id when building the trees of each epoch

	Returns:
		list of loaded rule modules, truncated to the epoch with the lowest mse
	"""
	models = []

	epochs = config['epochs']
	learning_rate = config['learning_rate']

	#------------------------------

	boosted_from = 0

	#------------------------------

	#gbm will manipulate actuals. store its raw version.
	base_df = df.copy()
	num_of_instances = base_df['Decision'].values.shape[0]

	root = 1
	file = "outputs/rules/rules0.py"; json_file = "outputs/rules/rules0.json"
	functions.createFile(file, header)
	functions.createFile(json_file, "[\n")

	Training.buildDecisionTree(df,root,file, config, dataset_features
		, parent_level = 0, leaf_id = 0, parents = 'root') #generate rules0

	df = base_df.copy()

	base_df['Boosted_Prediction'] = 0

	#------------------------------

	#start from infinity so that the first epoch always becomes the best one so far,
	#no matter how large the loss is
	best_epoch_idx = 0; best_epoch_loss = float('inf')

	pbar = tqdm(range(1, epochs+1), desc='Boosting')

	for index in pbar:

		#run data(i-1) and rules(i-1), save data(i)

		#dynamic import
		moduleName = "outputs/rules/rules%s" % (index-1)
		fp, pathname, description = imp.find_module(moduleName)
		try:
			myrules = imp.load_module(moduleName, fp, pathname, description) #rules(i-1)
		finally:
			#the caller of find_module is responsible for closing the returned file
			if fp is not None:
				fp.close()

		models.append(myrules)

		new_data_set = "outputs/data/data%s.csv" % (index)

		#----------------------------------------

		df['Epoch'] = index
		df['Prediction'] = df.apply(findPrediction, axis=1)

		base_df['Boosted_Prediction'] += df['Prediction']

		loss = (base_df['Boosted_Prediction'] - base_df['Decision']).pow(2).sum()
		current_loss = loss / num_of_instances #mse

		if index == 1:
			boosted_from = current_loss

		if current_loss < best_epoch_loss:
			best_epoch_loss = current_loss
			best_epoch_idx = index

		#NOTE(review): int() truncates a fractional learning rate (e.g. 0.5 becomes 0) - confirm this is intended
		df['Decision'] = int(learning_rate)*(df['Decision'] - df['Prediction'])
		df = df.drop(columns = ['Epoch', 'Prediction'])

		#---------------------------------

		df.to_csv(new_data_set, index=False)
		#data(i) created

		#---------------------------------

		file = "outputs/rules/rules"+str(index)+".py"
		json_file = "outputs/rules/rules"+str(index)+".json"

		functions.createFile(file, header)
		functions.createFile(json_file, "[\n")

		current_df = df.copy()
		Training.buildDecisionTree(df,root,file, config, dataset_features
			, parent_level = 0, leaf_id = 0, parents = 'root', main_process_id = process_id)

		df = current_df.copy() #numeric features require this restoration to apply findDecision function

		#rules(i) created

		pbar.set_description("Epoch %d. Loss: %d. Process: " % (index, current_loss))

		gc.collect()

	#---------------------------------

	print("The best epoch is ", best_epoch_idx," with ", best_epoch_loss," loss value")
	models = models[0:best_epoch_idx]
	config["epochs"] = best_epoch_idx

	print("MSE of ",num_of_instances," instances are boosted from ",boosted_from," to ",best_epoch_loss," in ",epochs," epochs")

	return models

Example #11

Show file

def regressor(df, config, header, dataset_features):
    """Build gradient boosted regression trees and return the loaded rule modules.

    For every epoch the rules of the previous round are loaded, their
    predictions are added to the boosted prediction, the target is replaced by
    the residual, the data set is stored under outputs/data and a new rule
    file is built under outputs/rules.

    Parameters:
        df (pandas data frame): training set with a 'Decision' column
        config (dictionary): 'epochs' and 'learning_rate' are read here
        header (string): first line written into each generated rule file
        dataset_features (dictionary): forwarded to Training.buildDecisionTree

    Returns:
        list of loaded rule modules, one per epoch
    """
    models = []

    epochs = config['epochs']
    learning_rate = config['learning_rate']

    #------------------------------

    boosted_from = 0
    boosted_to = 0

    #------------------------------

    #gbm will manipulate actuals. store its raw version.
    base_df = df.copy()
    num_of_instances = base_df['Decision'].values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"
    functions.createFile(file, header)

    Training.buildDecisionTree(df, root, file, config,
                               dataset_features)  #generate rules0

    df = base_df.copy()

    base_df['Boosted_Prediction'] = 0

    #------------------------------

    pbar = tqdm(range(1, epochs + 1), desc='Boosting')

    for index in pbar:

        #run data(i-1) and rules(i-1), save data(i)

        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules(i-1)
        finally:
            #the caller of find_module is responsible for closing the returned file
            if fp is not None:
                fp.close()

        models.append(myrules)

        new_data_set = "outputs/data/data%s.csv" % (index)

        #----------------------------------------

        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)

        base_df['Boosted_Prediction'] += df['Prediction']

        squared_error = (base_df['Boosted_Prediction'] -
                         base_df['Decision']).pow(2).sum()
        loss = squared_error / num_of_instances  #mse

        #two independent checks: with a single epoch the first round is also the last one
        if index == 1:
            boosted_from = loss
        if index == epochs:
            boosted_to = loss

        #NOTE(review): int() truncates a fractional learning rate (e.g. 0.5 becomes 0) - confirm this is intended
        df['Decision'] = int(learning_rate) * (df['Decision'] -
                                               df['Prediction'])
        df = df.drop(columns=['Epoch', 'Prediction'])

        #---------------------------------

        df.to_csv(new_data_set, index=False)
        #data(i) created

        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"

        functions.createFile(file, header)

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features)
        #numeric features require this restoration to apply findDecision function
        df = current_df.copy()

        #rules(i) created

        pbar.set_description("Epoch %d. Loss: %d. Process: " % (index, loss))

        #---------------------------------

    print(num_of_instances, " instances are boosted from ", boosted_from,
          " to ", boosted_to, " in ", epochs, " epochs")

    return models

Example #12

Show file

def classifier(df, config, header, dataset_features):
    """Gradient boosting for classification: one tree per class is built in every epoch.

    In the first epoch each class gets a 0/1 target; in later epochs the
    target is the difference between that 0/1 target and the softmax
    probability computed in the previous epoch. Rule files are written under
    outputs/rules and loaded back as modules.

    Parameters:
        df (pandas data frame): training set with a 'Decision' column
        config (dictionary): 'epochs' is read here
        header (string): first line written into each generated rule file
        dataset_features (dictionary): forwarded to Training.buildDecisionTree

    Returns:
        tuple (models, classes): the loaded rule modules in build order and
        the unique values of the 'Decision' column
    """

    models = []

    print("gradient boosting for classification")

    epochs = config['epochs']

    temp_df = df.copy()
    worksheet = df.copy()

    classes = df['Decision'].unique()

    boosted_predictions = np.zeros([df.shape[0], len(classes)])

    pbar = tqdm(range(0, epochs), desc='Boosting')

    #store actual set, we will use this to calculate loss
    actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]),
                              columns=classes)
    for i in range(0, len(classes)):
        current_class = classes[i]
        actual_set[current_class] = np.where(df['Decision'] == current_class,
                                             1, 0)
    actual_set = actual_set.values  #transform it to numpy array

    num_of_columns = df.shape[1]

    for epoch in pbar:
        for i in range(0, len(classes)):
            current_class = classes[i]

            if epoch == 0:
                temp_df['Decision'] = np.where(df['Decision'] == current_class,
                                               1, 0)
                worksheet['Y_' + str(i)] = temp_df['Decision']
            else:
                temp_df['Decision'] = worksheet['Y-P_' + str(i)]

            predictions = []

            root = 1
            file = "outputs/rules/rules-for-" + current_class + "-round-" + str(
                epoch) + ".py"

            functions.createFile(file, header)

            Training.buildDecisionTree(temp_df, root, file, config,
                                       dataset_features)
            #decision rules created
            #----------------------------

            #dynamic import
            moduleName = "outputs/rules/rules-for-" + current_class + "-round-" + str(
                epoch)
            fp, pathname, description = imp.find_module(moduleName)
            try:
                myrules = imp.load_module(moduleName, fp, pathname,
                                          description)
            finally:
                #the caller of find_module is responsible for closing the returned file
                if fp is not None:
                    fp.close()

            models.append(myrules)

            for row, instance in df.iterrows():
                #every column but the last one (Decision) is a feature; read them by position
                features = [
                    instance.iloc[j] for j in range(0, num_of_columns - 1)
                ]
                predictions.append(myrules.findDecision(features))

            #----------------------------
            if epoch == 0:
                worksheet['F_' + str(i)] = 0
            else:
                worksheet['F_' + str(i)] = pd.Series(predictions).values

            boosted_predictions[:, i] = boosted_predictions[:, i] + worksheet[
                'F_' + str(i)].values.astype(np.float32)

            worksheet['P_' + str(i)] = 0

            #----------------------------
            temp_df = df.copy()  #restoration

        #turn the F scores of this epoch into probabilities, row by row
        for row, instance in worksheet.iterrows():
            f_scores = []
            for i in range(0, len(classes)):
                f_scores.append(instance['F_' + str(i)])

            probabilities = functions.softmax(f_scores)

            for j in range(0, len(probabilities)):
                instance['P_' + str(j)] = probabilities[j]

            worksheet.loc[row] = instance

        #targets of the next epoch
        for i in range(0, len(classes)):
            worksheet['Y-P_' +
                      str(i)] = worksheet['Y_' + str(i)] - worksheet['P_' +
                                                                     str(i)]

        prediction_set = np.zeros([df.shape[0], len(classes)])
        for i in range(0, boosted_predictions.shape[0]):
            predicted_index = np.argmax(boosted_predictions[i])
            prediction_set[i][predicted_index] = 1

        #----------------------------
        #find accuracy for this epoch: prediction_set vs actual_set
        classified = 0
        for i in range(0, actual_set.shape[0]):
            actual = np.argmax(actual_set[i])
            prediction = np.argmax(prediction_set[i])

            if actual == prediction:
                classified = classified + 1

        accuracy = str(100 * classified / actual_set.shape[0]) + "%"

        #----------------------------

        pbar.set_description("Epoch %d. Accuracy: %s. Process: " %
                             (epoch + 1, accuracy))

    return models, classes

Example #13

Show file

def apply(df, config, header, dataset_features, validation_df = None, process_id = None):
	"""Build config['num_of_trees'] decision trees on random samples of df and load them.

	Each tree is built on df.sample(frac=1/num_of_trees) and written to
	outputs/rules/rule_<i>.py. When config["enableParallelism"] is set, the
	trees are built in a multiprocessing pool; otherwise they are built one by one.

	Parameters:
		df (pandas data frame): training set
		config (dictionary): 'num_of_trees' and 'enableParallelism' are read;
			'num_cores' is read only for the parallel run
		header (string): first line written into each generated rule file
		dataset_features (dictionary): forwarded to Training.buildDecisionTree
		validation_df: not used in this function
		process_id: forwarded to Training.buildDecisionTree as main_process_id

	Returns:
		list of loaded rule modules, one per tree
	"""

	models = []

	num_of_trees = config['num_of_trees']

	parallelism_on = config["enableParallelism"]

	#TODO: is this logical for 48x2 cores?
	#config["enableParallelism"] = False #run each tree in parallel but each branch in serial

	#TODO: reconstruct for parallel run is problematic. you should reconstruct based on tree id.

	input_params = []

	pbar = tqdm(range(0, num_of_trees), desc='Bagging')
	for i in pbar:
		pbar.set_description("Sub decision tree %d is processing" % (i+1))
		subset = df.sample(frac=1/num_of_trees)

		root = 1

		file = "outputs/rules/rule_"+str(i)+".py"

		functions.createFile(file, header)

		if parallelism_on: #parallel run: only collect the arguments, trees are built below
			#positional order follows the serial call: parent_level, leaf_id, parents, tree_id, validation_df, main_process_id
			input_params.append((subset, root, file, config, dataset_features, 0, 0, 'root', i, None, process_id))

		else: #serial run
			Training.buildDecisionTree(subset,root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', tree_id = i, main_process_id = process_id)

	#-------------------------------

	if parallelism_on:
		#never start more workers than there are trees to build
		POOL_SIZE = min(num_of_trees, config["num_cores"])

		with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
			#same entry point as the serial run
			funclist = [pool.apply_async(Training.buildDecisionTree, input_param) for input_param in input_params]

			#all functions registered here. wait for each of them; rules are read back from files below
			for f in tqdm(funclist):
				f.get(timeout = 100000)

			pool.close()
			pool.terminate()

	#-------------------------------
	#collect models for both serial and parallel here
	for i in range(0, num_of_trees):
		moduleName = "outputs/rules/rule_"+str(i)
		fp, pathname, description = imp.find_module(moduleName)
		try:
			myrules = imp.load_module(moduleName, fp, pathname, description)
		finally:
			#the caller of find_module is responsible for closing the returned file
			if fp is not None:
				fp.close()
		models.append(myrules)

	#-------------------------------

	return models

Example #14

Show file

File: Training.py Project: serengil/chefboost

def buildDecisionTree(df,
                      root,
                      file,
                      config,
                      dataset_features,
                      parent_level=0,
                      leaf_id=0,
                      parents='root',
                      tree_id=0,
                      validation_df=None,
                      main_process_id=None):
    """Build one level of the decision tree and delegate its branches to createBranch.

    The winner feature is found with findDecision, one branch is created per
    distinct value of that feature and a trailing else rule is added. In serial
    mode rules are written to file with functions.storeRule; in parallel mode
    they are collected as json strings and, at the root call, written to a json
    file and reconstructed into the rule file.

    Parameters:
        df (pandas data frame): data set of this node, must have a Decision column
        root (int): passed to functions.formatRule and stored as current_level;
            the value 1 marks the top-most call
        file (string): path of the rule file. The json file name is the text
            before the first "." in this path plus ".json"
        config (dictionary): enableParallelism, algorithm, enableRandomForest,
            enableGBM, enableAdaboost and num_cores are read here
        dataset_features (dictionary): feature name -> dtype of the raw data set
        parent_level (int): used for the "depth" value of the descriptor comment
        leaf_id: overwritten with a uuid in parallel mode; not read otherwise in this function
        parents: forwarded to createBranch and stored in the else rule
        tree_id: forwarded to createBranch and stored in the else rule
        validation_df: not used in this function
        main_process_id: process id used to count active child processes

    Returns:
        For non-root calls in parallel mode: the list of collected rules (json strings).
        Otherwise: the models list, which holds the loaded rules module only for
        the root call of a regular decision tree and is empty in any other case.
    """

    models = []

    decision_rules = []

    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    #NOTE(review): takes the text before the first "." of the path, so a path with a dot in a folder name is cut short
    json_file = file.split(".")[0] + ".json"

    random_forest_enabled = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']

    #NOTE(review): raw_df is not read anywhere else in this function
    if root == 1:
        if random_forest_enabled != True and enableGBM != True and enableAdaboost != True:
            raw_df = df.copy()

    #--------------------------------------

    #keep the original values; they are used for the restoration step below
    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(
        df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration: non-winner numeric columns get their original values back from df_copy
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #numeric field already transformed to object. you cannot check it with df itself, you should check df_copy
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    #distinct values of the winner feature; one branch is created for each of them
    classes = df[winner_name].value_counts().keys().tolist()
    #print("classes: ",classes," in ", winner_name)
    #-----------------------------------------------------

    num_cores = config["num_cores"]

    input_params = []

    #serial approach: branches are created right away. in parallel mode only their arguments are collected.
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:

            #the descriptor comment of this node is written once, before its first branch
            if i == 0:

                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# " + json.dumps(descriptor)

                functions.storeRule(
                    file, (functions.formatRule(root), "", descriptor))

            results = createBranch(config,
                                   current_class,
                                   subdataset,
                                   numericColumn,
                                   branch_index,
                                   winner_name,
                                   winner_index,
                                   root,
                                   parents,
                                   file,
                                   dataset_features,
                                   num_of_instances,
                                   metric,
                                   tree_id=tree_id,
                                   main_process_id=main_process_id)

            decision_rules = decision_rules + results

        else:
            input_params.append(
                (config, current_class, subdataset, numericColumn,
                 branch_index, winner_name, winner_index, root, parents, file,
                 dataset_features, num_of_instances, metric, tree_id,
                 main_process_id))

    #---------------------------
    #add else condition in the decision tree
    #NOTE(review): subdataset is the one left over from the last iteration of the loop above,
    #so the else decision is derived from the last branch only - confirm this is intended

    if df.Decision.dtypes == 'object':  #classification
        #most frequent Decision value of subdataset becomes the else decision
        #NOTE(review): the rename below assumes reset_index() names the columns "index" and "Decision" - verify against the pandas version in use
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["feature_idx"] = -1
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = df.shape[0]
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 0
            sample_rule["tree_id"] = tree_id

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    else:  #regression
        #mean Decision value of subdataset becomes the else decision
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            #NOTE(review): unlike the classification rule, this one has no "feature_idx" key and sets "instances" to 0
            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["tree_id"] = tree_id
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = 0
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 1

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    #---------------------------

    #count the processes that are already running under the main process.
    #any exception here falls back to a large value, which makes the check below choose the serial path.
    try:
        main_process = psutil.Process(main_process_id)
        children = main_process.children(recursive=True)
        active_processes = len(children) + 1  #plus parent
        #active_processes = len(children)
    except:
        active_processes = 100  #set a large initial value

    results = []
    #create branches in parallel
    if enableParallelism == True:

        required_threads = active_processes + len(classes)

        #if parent_level == 0 and random_forest_enabled != True:
        if main_process_id != None and num_cores >= required_threads:  #len(classes) branches will be run in parallel

            #POOL_SIZE = num_cores
            POOL_SIZE = len(classes)

            #with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            with closing(MyPool(POOL_SIZE)) as pool:
                funclist = []

                for input_param in input_params:
                    f = pool.apply_async(createBranchWrapper,
                                         [createBranch, input_param])
                    funclist.append(f)

                #all functions registered here

                #wait for every branch and flatten its rules into results
                for f in funclist:
                    branch_results = f.get(timeout=100000)

                    for branch_result in branch_results:
                        results.append(branch_result)

                pool.close()
                pool.terminate()

            #--------------------------------

        else:  #serial: not enough cores left (or no main process id), run the collected branches one by one
            for input_param in input_params:
                sub_results = createBranchWrapper(createBranch, input_param)
                for sub_result in sub_results:
                    results.append(sub_result)

        #--------------------------------

        decision_rules = decision_rules + results

        #--------------------------------

        if root != 1:  #return children results until the root node
            return decision_rules

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in decision_rules. merge them all in a json file first

            json_rules = "[\n"  #initialize

            #rules are separated with ", " except after the last one
            file_index = 0
            for custom_rule in decision_rules:

                json_rules += custom_rule

                if file_index < len(decision_rules) - 1:
                    json_rules += ", "

                json_rules += "\n"

                file_index = file_index + 1

            #-----------------------------------

            json_rules += "]"
            functions.createFile(json_file, json_rules)

            #-----------------------------------
            #reconstruct rules from json to py

            reconstructRules(json_file, feature_names)

            #-----------------------------------

        #is regular decision tree
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #this is a regular decision tree. load the generated rules module and return it in models.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

    return models

Example #15

Show file

def initializeAlphaFile():
    """Create outputs/rules/alphas.py holding only the findAlpha function header."""
    functions.createFile("outputs/rules/alphas.py", "def findAlpha(epoch):\n")

Example #16

Show file

File: Chefboost.py Project: Neha-jaist/chefboost

def fit(df, config=None, validation_df=None):
    """
    Build a decision tree based model (single tree, random forest, GBM or
    Adaboost, depending on config) and evaluate it.

    Parameters:
        df (pandas data frame): Training data frame. The target column must
            be named 'Decision' and it has to be the last column.
            NOTE: NaN values in non-object columns are replaced in place, so
            the frame passed by the caller is modified.

        config (dictionary): Optional. It is passed through
            functions.initializeParams before any key is read.

            config = {
                'algorithm' (string): ID3, C4.5, CART, CHAID or Regression
                'enableParallelism' (boolean): False

                'enableGBM' (boolean): True,
                'epochs' (int): 7,
                'learning_rate' (int): 1,

                'enableRandomForest' (boolean): True,
                'num_of_trees' (int): 5,

                'enableAdaboost' (boolean): True,
                'num_of_weak_classifier' (int): 4
            }

        validation_df (pandas data frame): if nothing is passed to validation
            data frame, then the function validates built trees for training
            data frame only

    Returns:
        dictionary with the keys "trees", "alphas", "config" and "nan_values"

    Raises:
        ValueError: if the last column is not named 'Decision', if the
            algorithm is not one of the valid ones, if Regression is requested
            for a nominal target, or if Adaboost is requested for a data set
            that has an object typed column.
    """

    # config is modified further down (the algorithm may be switched to
    # 'Regression'), so every call needs its own dictionary instead of a
    # default object shared between calls.
    if config is None:
        config = {}

    process_id = os.getpid()

    # untouched copy of the input; used for the train set evaluation at the end
    base_df = df.copy()

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError(
            'Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame'
        )

    #------------------------
    #handle NaN values

    # one [column, replacement] pair per non-object column;
    # replacement is None when the column had no NaN value
    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            if idx.shape[0] > 0:
                # a value below the column minimum marks the missing entries
                replacement = min_value - 1
                df.loc[idx, column] = replacement
            else:
                replacement = None

            nan_values.append([column, replacement])

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        # single message string; passing the parts as separate arguments made
        # the exception text render as a tuple
        raise ValueError('Invalid algorithm passed. You passed ' +
                         str(algorithm) + " but valid algorithms are " +
                         str(valid_algorithms))

    #------------------------

    enableRandomForest = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']
    enableParallelism = config['enableParallelism']

    #------------------------

    if enableParallelism == True:
        print("[INFO]: ", config["num_cores"],
              "CPU cores will be allocated in parallel running")

    #------------------------

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError(
                'Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.'
            )

    if df['Decision'].dtypes != 'object':  #this must be regression tree even if it is not mentioned in algorithm

        if algorithm != 'Regression':
            print(
                "WARNING: You set the algorithm to ", algorithm,
                " but the Decision column of your data set has non-object type."
            )
            print(
                "That's why, the algorithm is set to Regression to handle the data set."
            )

        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        for column_name in df.columns:
            if df[column_name].dtypes == 'object':
                raise ValueError(
                    'Adaboost must be run on numeric data set for both features and target'
                )

    #-------------------------

    print(algorithm, " tree is going to be built...")

    # feature name -> dtype; handed to the tree builders together with the
    # header line that documents which obj[i] is which feature
    dataset_features = dict()
    feature_labels = []

    num_of_features = df.shape[1] - 1  #discard Decision
    for i in range(0, num_of_features):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        feature_labels.append("obj[" + str(i) + "]: " + column_name)

    header = "def findDecision(obj): #" + ", ".join(feature_labels) + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df,
                                       config,
                                       header,
                                       dataset_features,
                                       validation_df=validation_df)

    elif enableGBM == True:

        if df['Decision'].dtypes == 'object':  #transform classification problem to regression
            trees, alphas = gbm.classifier(df,
                                           config,
                                           header,
                                           dataset_features,
                                           validation_df=validation_df)

        else:  #regression
            trees = gbm.regressor(df,
                                  config,
                                  header,
                                  dataset_features,
                                  validation_df=validation_df)

    elif enableRandomForest == True:
        trees = randomforest.apply(df,
                                   config,
                                   header,
                                   dataset_features,
                                   validation_df=validation_df,
                                   process_id=process_id)
    else:  #regular decision tree building

        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df,
                                           root=root,
                                           file=file,
                                           config=config,
                                           dataset_features=dataset_features,
                                           parent_level=0,
                                           leaf_id=0,
                                           parents='root',
                                           validation_df=validation_df,
                                           main_process_id=process_id)

    print("-------------------------")
    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    #-----------------------------------------

    #train set accuracy
    df = base_df.copy()
    evaluate(obj, df, task='train')

    #validation set accuracy
    if isinstance(validation_df, pd.DataFrame):
        evaluate(obj, validation_df, task='validation')

    #-----------------------------------------

    return obj

Example #17

Show file

def apply(df, config, header, dataset_features):
    """
    Train config['num_of_weak_classifier'] weak learners in an Adaboost loop.

    For every round a rules file "outputs/rules/rules_<i>.py" is created,
    a tree is built on the weight scaled 'Decision' column, the generated
    module is loaded, its alpha is computed and stored and the instance
    weights are updated.

    Parameters:
        df (pandas data frame): training data with a 'Decision' column.
            Presumably the target is encoded as -1 / +1 (the loss divides
            the absolute error by 2) -- TODO confirm against the caller.
        config (dictionary): 'num_of_weak_classifier' is read here; the
            dictionary is also forwarded to Training.buildDecisionTree.
        header (string): first line written to every generated rules file.
        dataset_features (dictionary): forwarded to
            Training.buildDecisionTree.

    Returns:
        (models, alphas): the loaded rules modules and the alpha found for
        each of them, in training order.
    """

    models = []
    alphas = []

    initializeAlphaFile()

    num_of_weak_classifier = config['num_of_weak_classifier']

    #------------------------

    rows = df.shape[0]

    worksheet = df.copy()
    worksheet['Weight'] = 1 / rows  #uniform distribution initially

    final_predictions = pd.DataFrame(np.zeros((rows, 2)),
                                     columns=['Prediction', 'Actual'])
    final_predictions['Actual'] = df['Decision']

    # epsilon of exactly 0 or 1 makes the log-odds used for alpha infinite or
    # undefined, which in turn ruins the weight update; keep it inside (0, 1)
    epsilon_floor = 1e-10

    pbar = tqdm(range(0, num_of_weak_classifier), desc='Adaboosting')
    for i in pbar:
        worksheet['Decision'] = worksheet['Weight'] * worksheet['Decision']

        root = 1
        file = "outputs/rules/rules_" + str(i) + ".py"

        functions.createFile(file, header)

        Training.buildDecisionTree(worksheet.drop(columns=['Weight']),
                                   root,
                                   file,
                                   config,
                                   dataset_features,
                                   parent_level=0,
                                   leaf_id=0,
                                   parents='root')

        #---------------------------------------

        moduleName = "outputs/rules/rules_" + str(i)
        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname, description)
        finally:
            # find_module hands over an open file; closing it is up to us
            if fp is not None:
                fp.close()
        models.append(myrules)

        #---------------------------------------

        # the epoch is attached to a temporary frame only, so the data frame
        # of the caller does not end up with an extra 'Epoch' column
        worksheet['Prediction'] = df.assign(Epoch=i).apply(findPrediction,
                                                           axis=1)

        #---------------------------------------
        worksheet['Actual'] = df['Decision']
        worksheet['Loss'] = abs(worksheet['Actual'] -
                                worksheet['Prediction']) / 2
        worksheet[
            'Weight_Times_Loss'] = worksheet['Loss'] * worksheet['Weight']

        epsilon = worksheet['Weight_Times_Loss'].sum()
        epsilon = min(max(epsilon, epsilon_floor), 1 - epsilon_floor)
        alpha = math.log(
            (1 - epsilon) /
            epsilon) / 2  #use alpha to update weights in the next round
        alphas.append(alpha)

        #-----------------------------

        #store alpha
        addEpochAlpha(i, alpha)

        #-----------------------------

        worksheet['New_Weights'] = worksheet['Weight'] * (
            -alpha * worksheet['Actual'] * worksheet['Prediction']).apply(
                math.exp)

        #normalize
        worksheet['New_Weights'] = worksheet['New_Weights'] / worksheet[
            'New_Weights'].sum()
        worksheet['Weight'] = worksheet['New_Weights']
        worksheet['Decision'] = df['Decision']

        final_predictions['Prediction'] = final_predictions[
            'Prediction'] + alpha * worksheet['Prediction']

        #helper columns of this round are not needed in the next one
        worksheet = worksheet.drop(columns=[
            'New_Weights', 'Prediction', 'Actual', 'Loss', 'Weight_Times_Loss'
        ])

        mae = (np.abs(final_predictions['Prediction'].apply(functions.sign) -
                      final_predictions['Actual']) /
               2).sum() / final_predictions.shape[0]

        # the loss is a fraction; %d would always truncate it to an integer
        pbar.set_description("Epoch %d. Loss: %.4f. Process: " % (i + 1, mae))

    #------------------------------
    final_predictions['Prediction'] = final_predictions['Prediction'].apply(
        functions.sign)
    final_predictions['Absolute_Error'] = np.abs(
        final_predictions['Actual'] - final_predictions['Prediction']) / 2

    mae = final_predictions['Absolute_Error'].sum(
    ) / final_predictions.shape[0]
    print("Loss (MAE) found ", mae, " with ", num_of_weak_classifier,
          ' weak classifiers')

    return models, alphas

Example #18

Show file

def apply(df, config, header, dataset_features):
    """
    Train config['num_of_trees'] sub decision trees (bagging) and, for a
    nominal target, print the majority vote accuracy on df.

    Every tree is built on a random sample holding 1 / num_of_trees of the
    rows and written to "outputs/rules/rule_<i>.py" (plus a .json file).

    Parameters:
        df (pandas data frame): training data; 'Decision' is expected to be
            the last column because the first df.shape[1] - 1 values of a row
            are passed to findDecision as features.
        config (dictionary): 'num_of_trees' is read here; the dictionary is
            also forwarded to Training.buildDecisionTree.
        header (string): first line written to every generated rules file.
        dataset_features (dictionary): forwarded to
            Training.buildDecisionTree.

    Returns:
        list of the loaded rules modules, one per tree.
    """

    models = []

    num_of_trees = config['num_of_trees']

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')

    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"
        json_file = moduleName + ".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        Training.buildDecisionTree(subset,
                                   root,
                                   file,
                                   config,
                                   dataset_features,
                                   parent_level=0,
                                   leaf_id=0,
                                   parents='root')

        functions.storeRule(json_file, "{}]")

        #--------------------------------

        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname, description)
        finally:
            # find_module hands over an open file; closing it is up to us
            if fp is not None:
                fp.close()
        models.append(myrules)

    #-------------------------------
    #check regression or classification
    if df['Decision'].dtypes != 'object':
        #regression: nothing is evaluated here
        return models

    actual_values = df['Decision'].values
    num_of_features = df.shape[1] - 1  #discard Decision
    number_of_instances = df.shape[0]

    # feature values of every row, picked by position; they do not depend on
    # the tree, so they are collected once instead of once per tree
    instances = [[instance.iloc[j] for j in range(0, num_of_features)]
                 for _, instance in df.iterrows()]

    # global_predictions[i][index]: decision of the i th tree for a row.
    # the modules loaded during training are reused instead of being
    # imported from disk a second time
    global_predictions = [[myrules.findDecision(params)
                           for params in instances] for myrules in models]

    #-------------------------------
    #the most frequent decision among the trees is the prediction
    classified = 0
    for index, actual in enumerate(actual_values):

        votes = []
        for tree_predictions in global_predictions:
            vote = tree_predictions[index]
            if vote is not None:  #why None exists in some cases?
                votes.append(vote)

        unique_values, counts = np.unique(np.array(votes), return_counts=True)

        prediction = None  #stays None if no tree returned a decision
        if counts.shape[0] > 0:
            # argmax returns the first maximum, so ties go to the smallest
            # value in the sorted unique values
            prediction = unique_values[np.argmax(counts)]

        if actual == prediction:
            classified = classified + 1

    print("Accuracy: ", 100 * classified / number_of_instances, "% on ",
          number_of_instances, " instances")

    return models