import os
import time

import pandas as pd

# chefboost-internal helpers referenced below; these module paths assume the package's
# usual commons / training / tuning layout. evaluate() is expected to be defined
# elsewhere in this module.
from chefboost.commons import functions
from chefboost.training import Training
from chefboost.tuning import adaboost, gbm, randomforest


def fit(df, config):

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame')

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            nan_value = []
            nan_value.append(column)

            if idx.shape[0] > 0:
                df.loc[idx, column] = min_value - 1
                nan_value.append(min_value - 1)
                #print("NaN values are replaced to ", min_value - 1, " in column ", column)
            else:
                nan_value.append(None)

            nan_values.append(nan_value)

    #------------------------
    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError("Invalid algorithm passed. You passed " + str(algorithm) + " but valid algorithms are " + str(valid_algorithms))

    #------------------------

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking'] #no longer used. check to remove this variable.

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    enableParallelism = config['enableParallelism']

    #this will handle basic decision stumps. parallelism is not required.
    if enableRandomForest == True:
        config['enableParallelism'] = False
        enableParallelism = False

    #------------------------

    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')

    if df['Decision'].dtypes != 'object': #this must be regression tree even if it is not mentioned in algorithm
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        #enableParallelism = False
        for j in range(0, num_of_columns):
            column_name = df.columns[j]
            if df[column_name].dtypes == 'object':
                raise ValueError('Adaboost must be run on numeric data set for both features and target')

    #-------------------------

    print(algorithm, " tree is going to be built...")

    #dataset_features stores the dtype of every feature. it is used to check whether a
    #feature is numeric or nominal; numeric features are transformed to nominal values based on scales.
    dataset_features = dict()

    header = "def findDecision(obj): #"

    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name

        if i != num_of_columns - 1:
            header = header + ", "

    header = header + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features)
    elif enableGBM == True:
        if df['Decision'].dtypes == 'object': #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features)
            classification = True
        else: #regression
            trees = gbm.regressor(df, config, header, dataset_features)
            classification = False
    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else: #regular decision tree building
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df, root, file, config, dataset_features, 0, 0, 'root')

    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    return obj
# Extended fit() variant: accepts an optional validation data frame and evaluates the
# built model on the train (and, if given, validation) set after building.
def fit(df, config={}, validation_df=None):
    """
    Parameters:
        df (pandas data frame): training data frame. The target column must be named
            'Decision' and it has to be the last column of the data frame.

        config (dictionary):
            config = {
                'algorithm' (string): ID3, C4.5, CART, CHAID or Regression
                'enableParallelism' (boolean): False,
                'enableGBM' (boolean): True,
                'epochs' (int): 7,
                'learning_rate' (int): 1,
                'enableRandomForest' (boolean): True,
                'num_of_trees' (int): 5,
                'enableAdaboost' (boolean): True,
                'num_of_weak_classifier' (int): 4
            }

        validation_df (pandas data frame): if no validation data frame is passed,
            the function validates the built trees on the training data frame.

    Returns:
        chefboost model
    """

    process_id = os.getpid()

    base_df = df.copy()

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame')

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            nan_value = []
            nan_value.append(column)

            if idx.shape[0] > 0:
                df.loc[idx, column] = min_value - 1
                nan_value.append(min_value - 1)
                #print("NaN values are replaced to ", min_value - 1, " in column ", column)
            else:
                nan_value.append(None)

            nan_values.append(nan_value)

    #------------------------
    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError("Invalid algorithm passed. You passed " + str(algorithm) + " but valid algorithms are " + str(valid_algorithms))

    #------------------------

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking'] #no longer used. check to remove this variable.

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    enableParallelism = config['enableParallelism']

    #------------------------

    if enableParallelism == True:
        print("[INFO]: ", config["num_cores"], "CPU cores will be allocated in parallel running")

    #------------------------

    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')

    if df['Decision'].dtypes != 'object': #this must be regression tree even if it is not mentioned in algorithm
        if algorithm != 'Regression':
            print("WARNING: You set the algorithm to ", algorithm, " but the Decision column of your data set has non-object type.")
            print("That's why, the algorithm is set to Regression to handle the data set.")

        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        #enableParallelism = False
        for j in range(0, num_of_columns):
            column_name = df.columns[j]
            if df[column_name].dtypes == 'object':
                raise ValueError('Adaboost must be run on numeric data set for both features and target')

    #-------------------------

    print(algorithm, " tree is going to be built...")

    #dataset_features stores the dtype of every feature. it is used to check whether a
    #feature is numeric or nominal; numeric features are transformed to nominal values based on scales.
    dataset_features = dict()

    header = "def findDecision(obj): #"

    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name

        if i != num_of_columns - 1:
            header = header + ", "

    header = header + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features, validation_df=validation_df)
    elif enableGBM == True:
        if df['Decision'].dtypes == 'object': #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features, validation_df=validation_df)
            classification = True
        else: #regression
            trees = gbm.regressor(df, config, header, dataset_features, validation_df=validation_df)
            classification = False
    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features, validation_df=validation_df, process_id=process_id)
    else: #regular decision tree building
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df, root=root, file=file, config=config,
                                           dataset_features=dataset_features,
                                           parent_level=0, leaf_id=0, parents='root',
                                           validation_df=validation_df,
                                           main_process_id=process_id)

    print("-------------------------")
    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    #-----------------------------------------
    #train set accuracy
    df = base_df.copy()
    evaluate(obj, df, task='train')

    #validation set accuracy
    if isinstance(validation_df, pd.DataFrame):
        evaluate(obj, validation_df, task='validation')

    #-----------------------------------------

    return obj
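# Minimal usage sketch (illustrative only). It assumes the chefboost helper modules
# imported at the top of this file are importable, that the working directory is writable
# (fit() creates outputs/rules/rules.py), and that the evaluate() helper referenced by
# fit() is defined elsewhere in this module. The toy data frame and the chosen algorithm
# are hypothetical examples; the only hard requirement is that the target column is named
# 'Decision' and placed last.
if __name__ == '__main__':
    demo_df = pd.DataFrame({
        'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Overcast'],
        'Humidity': [85, 90, 78, 96, 70, 65],
        'Decision': ['No', 'No', 'Yes', 'Yes', 'Yes', 'Yes'],
    })

    demo_model = fit(demo_df, config={'algorithm': 'C4.5'})
    print("trained algorithm:", demo_model['config']['algorithm'])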