def fit(df, config):
    """Train a decision tree, or an ensemble of trees, on a pandas data frame.

    Missing values in every non-object column are replaced IN PLACE (the
    caller's ``df`` is modified) by ``column minimum - 1``; the replacement
    used for each column is reported back in the returned dictionary.

    Parameters:
        df (pandas data frame): training data. The target column must be
            named 'Decision' and it has to be the last column.

        config (dictionary): handed to functions.initializeParams. The keys
            'algorithm', 'enableRandomForest', 'enableGBM', 'enableAdaboost'
            and 'enableParallelism' of the resulting configuration are read
            here; 'algorithm' and 'enableParallelism' may be overwritten.

    Returns:
        dictionary with the keys
            "trees": whatever the selected builder returned,
            "alphas": returned by adaboost and by the GBM classifier,
                otherwise an empty list,
            "config": the configuration that was used for training,
            "nan_values": list of [column, replacement] pairs for every
                non-object column; replacement is None when the column had
                no missing value.

    Raises:
        ValueError: the last column is not named 'Decision' (or there is no
            column at all), the algorithm is not supported, 'Regression' is
            requested for an object typed target, or adaboost is enabled for
            a data set that has an object typed column.
    """

    # a frame without columns has no target either; report it the same way
    target_label = df.columns[-1] if len(df.columns) > 0 else None
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame')

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes == 'object':
            continue

        # a boolean mask touches exactly the missing cells, even when the
        # index of the data frame has repeated labels
        missing = df[column].isna()

        if missing.any():
            fill_value = df[column].min() - 1
            df.loc[missing, column] = fill_value
            nan_values.append([column, fill_value])
        else:
            nan_values.append([column, None])

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError(
            'Invalid algorithm passed. You passed {} but valid algorithms are {}'.format(
                algorithm, valid_algorithms
            )
        )

    #------------------------

    # the flags stay equality tests against True so that non-boolean
    # configuration values are interpreted exactly as before
    use_random_forest = config['enableRandomForest'] == True
    use_gbm = config['enableGBM'] == True
    use_adaboost = config['enableAdaboost'] == True
    use_parallelism = config['enableParallelism'] == True

    #parallel building is switched off whenever random forest is enabled
    if use_random_forest:
        config['enableParallelism'] = False
        use_parallelism = False

    #------------------------

    target_is_nominal = df['Decision'].dtypes == 'object'

    if algorithm == 'Regression' and target_is_nominal:
        raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')

    if not target_is_nominal: #this must be regression tree even if it is not mentioned in algorithm
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if use_gbm:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if use_adaboost:
        if any(df[name].dtypes == 'object' for name in df.columns):
            raise ValueError('Adaboost must be run on numeric data set for both features and target')

    #-------------------------

    print(algorithm," tree is going to be built...")

    #every column except the last one (the target) is a feature
    feature_names = df.columns[:-1]

    #this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales.
    dataset_features = {name: df[name].dtypes for name in feature_names}

    #str() keeps the header buildable for non-string column labels
    header = (
        "def findDecision(obj): #"
        + ", ".join(
            "obj[" + str(position) + "]: " + str(name)
            for position, name in enumerate(feature_names)
        )
        + "\n"
    )

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if use_adaboost:
        trees, alphas = adaboost.apply(df, config, header, dataset_features)

    elif use_gbm:

        if target_is_nominal: #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features)
        else: #regression
            trees = gbm.regressor(df, config, header, dataset_features)

    elif use_random_forest:
        trees = randomforest.apply(df, config, header, dataset_features)

    else: #regular decision tree building

        root = 1
        rules_file = "outputs/rules/rules.py"
        functions.createFile(rules_file, header)

        if use_parallelism:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df, root, rules_file, config, dataset_features
            , 0, 0, 'root')

    print("finished in ",time.time() - begin," seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    return obj
# Example no. 2
# 0
def fit(df, config=None, validation_df=None):
    """
    Train a decision tree, or an ensemble of trees, and evaluate the result.

    Missing values in every non-object column are replaced IN PLACE (the
    caller's df is modified) by "column minimum - 1". A copy of df taken
    before that replacement is the one passed to evaluate for the train set.

    Parameters:
        df (pandas data frame): Training data frame. The target column must be named as 'Decision' and it has to be in the last column

        config (dictionary): when omitted, an empty dictionary is handed to functions.initializeParams

            config = {
                'algorithm' (string): ID3, C4.5, CART, CHAID or Regression
                'enableParallelism' (boolean): False

                'enableGBM' (boolean): True,
                'epochs' (int): 7,
                'learning_rate' (int): 1,

                'enableRandomForest' (boolean): True,
                'num_of_trees' (int): 5,

                'enableAdaboost' (boolean): True,
                'num_of_weak_classifier' (int): 4
            }

        validation_df (pandas data frame): forwarded to the tree builders. When it is a pandas data frame, evaluate is also run on it with task='validation'; the training data frame is always evaluated with task='train'

    Returns:
        chefboost model: dictionary with the keys
            "trees": whatever the selected builder returned,
            "alphas": returned by adaboost and by the GBM classifier, otherwise an empty list,
            "config": the configuration that was used for training,
            "nan_values": list of [column, replacement] pairs for every non-object column; replacement is None when the column had no missing value

    Raises:
        ValueError: the last column is not named 'Decision' (or there is no column at all), the algorithm is not supported, 'Regression' is requested for an object typed target, or adaboost is enabled for a data set that has an object typed column

    """

    # a fresh dictionary per call: the configuration is written to below, so
    # a shared default object would carry settings over to the next call
    if config is None:
        config = {}

    process_id = os.getpid()

    # kept untouched by the NaN replacement below; used for train evaluation
    base_df = df.copy()

    # a frame without columns has no target either; report it the same way
    target_label = df.columns[-1] if len(df.columns) > 0 else None
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError(
            'Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame'
        )

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes == 'object':
            continue

        # a boolean mask touches exactly the missing cells, even when the
        # index of the data frame has repeated labels
        missing = df[column].isna()

        if missing.any():
            fill_value = df[column].min() - 1
            df.loc[missing, column] = fill_value
            nan_values.append([column, fill_value])
        else:
            nan_values.append([column, None])

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError(
            'Invalid algorithm passed. You passed {} but valid algorithms are {}'.format(
                algorithm, valid_algorithms
            )
        )

    #------------------------

    # the flags stay equality tests against True so that non-boolean
    # configuration values are interpreted exactly as before
    use_random_forest = config['enableRandomForest'] == True
    use_gbm = config['enableGBM'] == True
    use_adaboost = config['enableAdaboost'] == True
    use_parallelism = config['enableParallelism'] == True

    #------------------------

    if use_parallelism:
        print("[INFO]: ", config["num_cores"],
              "CPU cores will be allocated in parallel running")

    #------------------------

    target_is_nominal = df['Decision'].dtypes == 'object'

    if algorithm == 'Regression' and target_is_nominal:
        raise ValueError(
            'Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.'
        )

    if not target_is_nominal:  #this must be regression tree even if it is not mentioned in algorithm

        if algorithm != 'Regression':
            print(
                "WARNING: You set the algorithm to ", algorithm,
                " but the Decision column of your data set has non-object type."
            )
            print(
                "That's why, the algorithm is set to Regression to handle the data set."
            )

        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if use_gbm:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if use_adaboost:
        if any(df[name].dtypes == 'object' for name in df.columns):
            raise ValueError(
                'Adaboost must be run on numeric data set for both features and target'
            )

    #-------------------------

    print(algorithm, " tree is going to be built...")

    #every column except the last one (the target) is a feature
    feature_names = df.columns[:-1]

    #this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales.
    dataset_features = {name: df[name].dtypes for name in feature_names}

    #str() keeps the header buildable for non-string column labels
    header = (
        "def findDecision(obj): #"
        + ", ".join(
            "obj[" + str(position) + "]: " + str(name)
            for position, name in enumerate(feature_names)
        )
        + "\n"
    )

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if use_adaboost:
        trees, alphas = adaboost.apply(df,
                                       config,
                                       header,
                                       dataset_features,
                                       validation_df=validation_df)

    elif use_gbm:

        if target_is_nominal:  #transform classification problem to regression
            trees, alphas = gbm.classifier(df,
                                           config,
                                           header,
                                           dataset_features,
                                           validation_df=validation_df)

        else:  #regression
            trees = gbm.regressor(df,
                                  config,
                                  header,
                                  dataset_features,
                                  validation_df=validation_df)

    elif use_random_forest:
        trees = randomforest.apply(df,
                                   config,
                                   header,
                                   dataset_features,
                                   validation_df=validation_df,
                                   process_id=process_id)

    else:  #regular decision tree building

        root = 1
        rules_file = "outputs/rules/rules.py"
        functions.createFile(rules_file, header)

        if use_parallelism:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df,
                                           root=root,
                                           file=rules_file,
                                           config=config,
                                           dataset_features=dataset_features,
                                           parent_level=0,
                                           leaf_id=0,
                                           parents='root',
                                           validation_df=validation_df,
                                           main_process_id=process_id)

    print("-------------------------")
    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    #-----------------------------------------

    #train set accuracy
    #base_df is already a private copy that is not used afterwards
    evaluate(obj, base_df, task='train')

    #validation set accuracy
    if isinstance(validation_df, pd.DataFrame):
        evaluate(obj, validation_df, task='validation')

    #-----------------------------------------

    return obj