#NOTE: these functions are module excerpts; they rely on module-level imports
#(math, imp, gc, os, time, multiprocessing, numpy as np, pandas as pd,
#from contextlib import closing, from tqdm import tqdm) and on the package-local
#modules: functions, Training, adaboost, gbm, randomforest.

def apply(df, config, header, dataset_features, validation_df=None):
    models = []

    num_of_trees = config['num_of_trees']

    parallelism_on = config["enableParallelism"]

    #TODO: is this logical for 48x2 cores?
    #config["enableParallelism"] = False #run each tree in parallel but each branch in serial

    #TODO: reconstruction for parallel runs is problematic; reconstruct based on tree id.

    input_params = []

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"

        functions.createFile(file, header)

        if parallelism_on:  #parallel run
            input_params.append((subset, root, file, config, dataset_features, 0, 0, 'root', i))
        else:  #serial run
            Training.buildDecisionTree(subset, root, file, config, dataset_features,
                                       parent_level=0, leaf_id=0, parents='root', tree_id=i)

    #-------------------------------

    if parallelism_on:
        num_cores = config["num_cores"]
        pool = Training.MyPool(num_cores)
        results = pool.starmap(buildDecisionTree, input_params)
        pool.close()
        pool.join()

    #-------------------------------
    #collect models for both serial and parallel runs here

    for i in range(0, num_of_trees):
        moduleName = "outputs/rules/rule_" + str(i)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------

    return models
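# --- illustrative sketch (not part of the library) ---
# Minimal demo of the sampling strategy above: df.sample(frac=1/num_of_trees)
# draws a 1/N fraction *without* replacement for each tree; classic bagging
# instead resamples the full set with replacement. The toy frame is hypothetical.
import pandas as pd

toy = pd.DataFrame({'x': range(10), 'Decision': [0, 1] * 5})
num_of_trees = 3
for i in range(num_of_trees):
    subset = toy.sample(frac=1 / num_of_trees, random_state=i)
    print(i, subset.index.tolist())  #3 distinct rows per tree; trees may overlap across draws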
def apply(df, config, header, dataset_features, validation_df=None):
    models = []

    num_of_trees = config['num_of_trees']

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"
        json_file = moduleName + ".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        Training.buildDecisionTree(subset, root, file, config, dataset_features,
                                   parent_level=0, leaf_id=0, parents='root')

        functions.storeRule(json_file, "{}]")

        #--------------------------------

        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------

    return models
def buildDecisionTree(df, root, file, config, dataset_features, parent_level,
                      leaf_id, parents, tree_id, validation_df=None, process_id=None):
    #module-level wrapper so that multiprocessing can pickle the call
    Training.buildDecisionTree(df, root, file, config, dataset_features,
                               parent_level=parent_level, leaf_id=leaf_id,
                               parents=parents, tree_id=tree_id,
                               main_process_id=process_id)
def processContinuousFeatures(algorithm, df, column_name, entropy, config):

    #if True:
    if df[column_name].nunique() <= 20:
        unique_values = sorted(df[column_name].unique())
    else:
        unique_values = []

        df_mean = df[column_name].mean()
        df_std = df[column_name].std(ddof=0)
        df_min = df[column_name].min()
        df_max = df[column_name].max()

        unique_values.append(df_min)
        unique_values.append(df_max)
        unique_values.append(df_mean)

        scales = list(range(-3, +4, 1))
        for scale in scales:
            if df_min < df_mean + scale * df_std < df_max:
                unique_values.append(df_mean + scale * df_std)

        unique_values.sort()

    #print(column_name, "->", unique_values)

    subset_gainratios = []
    subset_gains = []
    subset_ginis = []
    subset_red_stdevs = []
    subset_chi_squares = []

    if len(unique_values) == 1:
        winner_threshold = unique_values[0]
        df[column_name] = np.where(df[column_name] <= winner_threshold,
                                   "<=" + str(winner_threshold),
                                   ">" + str(winner_threshold))
        return df

    for i in range(0, len(unique_values) - 1):
        threshold = unique_values[i]

        subset1 = df[df[column_name] <= threshold]
        subset2 = df[df[column_name] > threshold]

        subset1_rows = subset1.shape[0]
        subset2_rows = subset2.shape[0]
        total_instances = df.shape[0]  #subset1_rows + subset2_rows

        subset1_probability = subset1_rows / total_instances
        subset2_probability = subset2_rows / total_instances

        if algorithm == 'ID3' or algorithm == 'C4.5':
            threshold_gain = entropy \
                - subset1_probability * Training.calculateEntropy(subset1, config) \
                - subset2_probability * Training.calculateEntropy(subset2, config)
            subset_gains.append(threshold_gain)

        if algorithm == 'C4.5':
            #C4.5 also needs the gain computed in the block above; that's why
            #this is a plain if instead of an elif
            threshold_splitinfo = -subset1_probability * math.log(subset1_probability, 2) \
                - subset2_probability * math.log(subset2_probability, 2)
            gainratio = threshold_gain / threshold_splitinfo
            subset_gainratios.append(gainratio)

        elif algorithm == 'CART':
            decision_for_subset1 = subset1['Decision'].value_counts().tolist()
            decision_for_subset2 = subset2['Decision'].value_counts().tolist()

            gini_subset1 = 1
            gini_subset2 = 1

            for j in range(0, len(decision_for_subset1)):
                gini_subset1 = gini_subset1 - math.pow((decision_for_subset1[j] / subset1_rows), 2)

            for j in range(0, len(decision_for_subset2)):
                gini_subset2 = gini_subset2 - math.pow((decision_for_subset2[j] / subset2_rows), 2)

            gini = (subset1_rows / total_instances) * gini_subset1 \
                 + (subset2_rows / total_instances) * gini_subset2

            subset_ginis.append(gini)

        elif algorithm == "CHAID":
            #subset1 = high, subset2 = normal
            unique_decisions = df['Decision'].unique()  #e.g. Yes, No
            num_of_decisions = len(unique_decisions)    #e.g. 2

            subset1_expected = subset1.shape[0] / num_of_decisions
            subset2_expected = subset2.shape[0] / num_of_decisions

            chi_square = 0
            for d in unique_decisions:  #e.g. Yes, No
                subset1_d = subset1[subset1["Decision"] == d]  #e.g. high, yes
                subset2_d = subset2[subset2["Decision"] == d]  #e.g. normal, yes

                subset1_d_chi_square = math.sqrt(
                    ((subset1_d.shape[0] - subset1_expected) *
                     (subset1_d.shape[0] - subset1_expected)) / subset1_expected)

                subset2_d_chi_square = math.sqrt(
                    ((subset2_d.shape[0] - subset2_expected) *
                     (subset2_d.shape[0] - subset2_expected)) / subset2_expected)

                chi_square = chi_square + subset1_d_chi_square + subset2_d_chi_square

            subset_chi_squares.append(chi_square)

        #----------------------------------
        elif algorithm == 'Regression':
            superset_stdev = df['Decision'].std(ddof=0)
            subset1_stdev = subset1['Decision'].std(ddof=0)
            subset2_stdev = subset2['Decision'].std(ddof=0)

            threshold_weighted_stdev = (subset1_rows / total_instances) * subset1_stdev \
                + (subset2_rows / total_instances) * subset2_stdev
            threshold_reducted_stdev = superset_stdev - threshold_weighted_stdev
            subset_red_stdevs.append(threshold_reducted_stdev)

        #----------------------------------

    if algorithm == "C4.5":
        winner_one = subset_gainratios.index(max(subset_gainratios))
    elif algorithm == "ID3":
        #ID3 does not actually support continuous features, but we can still handle them
        winner_one = subset_gains.index(max(subset_gains))
    elif algorithm == "CART":
        winner_one = subset_ginis.index(min(subset_ginis))
    elif algorithm == "CHAID":
        winner_one = subset_chi_squares.index(max(subset_chi_squares))
    elif algorithm == "Regression":
        winner_one = subset_red_stdevs.index(max(subset_red_stdevs))

    winner_threshold = unique_values[winner_one]
    #print(column_name, ": ", winner_threshold, " in ", unique_values)
    #print("threshold is ", winner_threshold, " for ", column_name)

    df[column_name] = np.where(df[column_name] <= winner_threshold,
                               "<=" + str(winner_threshold),
                               ">" + str(winner_threshold))

    return df
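# --- illustrative sketch (not part of the library) ---
# Worked example of the candidate-threshold reduction above: when a continuous
# column has more than 20 unique values, only min, max, mean and mean + k*std
# for k in [-3, 3] (clipped to the observed range) are tried. Toy numbers below.
import numpy as np

values = np.array([1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0])
mean, std = values.mean(), values.std(ddof=0)  #~18.14, ~21.20
candidates = [values.min(), values.max(), mean]
for k in range(-3, 4):
    c = mean + k * std
    if values.min() < c < values.max():
        candidates.append(c)  #only k = 0, 1, 2 survive the range check here
print(sorted(candidates))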
def apply(df, config, header, dataset_features):

    models = []
    alphas = []

    initializeAlphaFile()

    num_of_weak_classifier = config['num_of_weak_classifier']

    #------------------------

    rows = df.shape[0]

    worksheet = df.copy()
    worksheet['Weight'] = 1 / rows  #uniform distribution initially

    final_predictions = pd.DataFrame(np.zeros((rows, 2)), columns=['Prediction', 'Actual'])
    final_predictions['Actual'] = df['Decision']

    pbar = tqdm(range(0, num_of_weak_classifier), desc='Adaboosting')
    for i in pbar:
        worksheet['Decision'] = worksheet['Weight'] * worksheet['Decision']

        root = 1
        file = "outputs/rules/rules_" + str(i) + ".py"

        functions.createFile(file, header)

        #print(worksheet)
        Training.buildDecisionTree(worksheet.drop(columns=['Weight']), root, file,
                                   config, dataset_features,
                                   parent_level=0, leaf_id=0, parents='root')

        #---------------------------------------

        moduleName = "outputs/rules/rules_" + str(i)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

        #---------------------------------------

        df['Epoch'] = i
        worksheet['Prediction'] = df.apply(findPrediction, axis=1)
        df = df.drop(columns=['Epoch'])

        #---------------------------------------

        worksheet['Actual'] = df['Decision']
        worksheet['Loss'] = abs(worksheet['Actual'] - worksheet['Prediction']) / 2
        worksheet['Weight_Times_Loss'] = worksheet['Loss'] * worksheet['Weight']

        epsilon = worksheet['Weight_Times_Loss'].sum()

        alpha = math.log((1 - epsilon) / epsilon) / 2  #use alpha to update weights in the next round
        alphas.append(alpha)

        #-----------------------------
        #store alpha
        addEpochAlpha(i, alpha)

        #-----------------------------

        worksheet['Alpha'] = alpha
        worksheet['New_Weights'] = worksheet['Weight'] * (
            -alpha * worksheet['Actual'] * worksheet['Prediction']).apply(math.exp)

        #normalize
        worksheet['New_Weights'] = worksheet['New_Weights'] / worksheet['New_Weights'].sum()
        worksheet['Weight'] = worksheet['New_Weights']

        worksheet['Decision'] = df['Decision']

        final_predictions['Prediction'] = final_predictions['Prediction'] \
            + worksheet['Alpha'] * worksheet['Prediction']
        #print(final_predictions)

        worksheet = worksheet.drop(columns=[
            'New_Weights', 'Prediction', 'Actual', 'Loss', 'Weight_Times_Loss', 'Alpha'
        ])

        mae = (np.abs(final_predictions['Prediction'].apply(functions.sign)
                      - final_predictions['Actual']) / 2).sum() / final_predictions.shape[0]
        #print(mae)
        #mae is in [0, 1], so format it as a float ("%d" would always show 0)
        pbar.set_description("Epoch %d. Loss: %.4f. Process: " % (i + 1, mae))

    #------------------------------

    final_predictions['Prediction'] = final_predictions['Prediction'].apply(functions.sign)
    final_predictions['Absolute_Error'] = np.abs(
        final_predictions['Actual'] - final_predictions['Prediction']) / 2
    #print(final_predictions)

    mae = final_predictions['Absolute_Error'].sum() / final_predictions.shape[0]
    print("Loss (MAE) found ", mae, " with ", num_of_weak_classifier, ' weak classifiers')

    return models, alphas
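# --- illustrative sketch (not part of the library) ---
# The core AdaBoost update used above, in isolation: alpha = 0.5*ln((1-eps)/eps)
# and w_i <- w_i * exp(-alpha * y_i * h(x_i)), then renormalize. Labels are
# assumed to be in {-1, +1}; the toy arrays below are hypothetical.
import math
import numpy as np

y = np.array([1, 1, -1, -1])   #actuals
h = np.array([1, -1, -1, -1])  #weak learner predictions (one mistake)
w = np.full(4, 0.25)           #uniform initial weights

epsilon = w[y != h].sum()      #weighted error = 0.25
alpha = 0.5 * math.log((1 - epsilon) / epsilon)
w = w * np.exp(-alpha * y * h)
w = w / w.sum()                #the misclassified sample now carries weight 0.5
print(alpha, w)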
def processContinuousFeatures(algorithm, df, column_name, entropy, config):
    unique_values = sorted(df[column_name].unique())
    #print(column_name, "->", unique_values)

    subset_gainratios = []
    subset_gains = []
    subset_ginis = []
    subset_red_stdevs = []

    if len(unique_values) == 1:
        winner_threshold = unique_values[0]
        df[column_name] = np.where(df[column_name] <= winner_threshold,
                                   "<=" + str(winner_threshold),
                                   ">" + str(winner_threshold))
        return df

    for i in range(0, len(unique_values) - 1):
        threshold = unique_values[i]

        subset1 = df[df[column_name] <= threshold]
        subset2 = df[df[column_name] > threshold]

        subset1_rows = subset1.shape[0]
        subset2_rows = subset2.shape[0]
        total_instances = df.shape[0]  #subset1_rows + subset2_rows

        subset1_probability = subset1_rows / total_instances
        subset2_probability = subset2_rows / total_instances

        if algorithm == 'ID3' or algorithm == 'C4.5':
            threshold_gain = entropy \
                - subset1_probability * Training.calculateEntropy(subset1, config) \
                - subset2_probability * Training.calculateEntropy(subset2, config)
            subset_gains.append(threshold_gain)

        if algorithm == 'C4.5':
            #C4.5 also needs the gain computed in the block above; that's why
            #this is a plain if instead of an elif
            threshold_splitinfo = -subset1_probability * math.log(subset1_probability, 2) \
                - subset2_probability * math.log(subset2_probability, 2)
            gainratio = threshold_gain / threshold_splitinfo
            subset_gainratios.append(gainratio)

        elif algorithm == 'CART':
            decision_for_subset1 = subset1['Decision'].value_counts().tolist()
            decision_for_subset2 = subset2['Decision'].value_counts().tolist()

            gini_subset1 = 1
            gini_subset2 = 1

            for j in range(0, len(decision_for_subset1)):
                gini_subset1 = gini_subset1 - math.pow((decision_for_subset1[j] / subset1_rows), 2)

            for j in range(0, len(decision_for_subset2)):
                gini_subset2 = gini_subset2 - math.pow((decision_for_subset2[j] / subset2_rows), 2)

            gini = (subset1_rows / total_instances) * gini_subset1 \
                 + (subset2_rows / total_instances) * gini_subset2

            subset_ginis.append(gini)

        #----------------------------------
        elif algorithm == 'Regression':
            superset_stdev = df['Decision'].std(ddof=0)
            subset1_stdev = subset1['Decision'].std(ddof=0)
            subset2_stdev = subset2['Decision'].std(ddof=0)

            threshold_weighted_stdev = (subset1_rows / total_instances) * subset1_stdev \
                + (subset2_rows / total_instances) * subset2_stdev
            threshold_reducted_stdev = superset_stdev - threshold_weighted_stdev
            subset_red_stdevs.append(threshold_reducted_stdev)

        #----------------------------------

    if algorithm == "C4.5":
        winner_one = subset_gainratios.index(max(subset_gainratios))
    elif algorithm == "ID3":
        #ID3 does not actually support continuous features, but we can still handle them
        winner_one = subset_gains.index(max(subset_gains))
    elif algorithm == "CART":
        winner_one = subset_ginis.index(min(subset_ginis))
    elif algorithm == "Regression":
        winner_one = subset_red_stdevs.index(max(subset_red_stdevs))

    winner_threshold = unique_values[winner_one]
    #print("threshold is ", winner_threshold, " for ", column_name)

    df[column_name] = np.where(df[column_name] <= winner_threshold,
                               "<=" + str(winner_threshold),
                               ">" + str(winner_threshold))

    return df
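# --- illustrative sketch (not part of the library) ---
# Worked example of the weighted Gini computed in the CART branch above, on a
# hypothetical 10-row split: subset1 = 4 Yes / 1 No, subset2 = 1 Yes / 4 No.
gini_subset1 = 1 - (4 / 5) ** 2 - (1 / 5) ** 2  #0.32
gini_subset2 = 1 - (1 / 5) ** 2 - (4 / 5) ** 2  #0.32
gini = (5 / 10) * gini_subset1 + (5 / 10) * gini_subset2
print(gini)  #0.32 -- lower is better, so this threshold competes well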
def fit(df, config):

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Please confirm that the target column is named "Decision" and is the right-most column of the pandas data frame')

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            nan_value = []
            nan_value.append(column)

            if idx.shape[0] > 0:
                df.loc[idx, column] = min_value - 1
                nan_value.append(min_value - 1)
                #print("NaN values are replaced with ", min_value - 1, " in column ", column)
            else:
                nan_value.append(None)

            nan_values.append(nan_value)

    #------------------------
    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError('Invalid algorithm passed. You passed ', algorithm,
                         ' but valid algorithms are ', valid_algorithms)

    #------------------------

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']  #no longer used; check whether this variable can be removed

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']
    enableParallelism = config['enableParallelism']

    #random forest builds basic decision stumps; parallelism is not required
    if enableRandomForest == True:
        config['enableParallelism'] = False
        enableParallelism = False

    #------------------------

    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Regression trees cannot be applied to nominal target values! Change either the algorithm or the data set.')

    if df['Decision'].dtypes != 'object':  #this must be a regression tree even if it is not mentioned in the config
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        #enableParallelism = False
        for j in range(0, num_of_columns):
            column_name = df.columns[j]
            if df[column_name].dtypes == 'object':
                raise ValueError('Adaboost must be run on a numeric data set for both features and target')

    #-------------------------

    print(algorithm, " tree is going to be built...")

    #this dictionary stores whether each feature is numeric or nominal;
    #numeric features will be transformed to nominal values based on scales
    dataset_features = dict()

    header = "def findDecision(obj): #"

    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name
        if i != num_of_columns - 1:
            header = header + ", "

    header = header + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features)

    elif enableGBM == True:
        if df['Decision'].dtypes == 'object':  #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features)
            classification = True
        else:  #regression
            trees = gbm.regressor(df, config, header, dataset_features)
            classification = False

    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)

    else:  #regular decision tree building
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df, root, file, config, dataset_features, 0, 0, 'root')

    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    return obj
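# --- illustrative sketch (not part of the library) ---
# What the generated header looks like: every produced rules*.py file starts
# with this line, and findDecision receives features positionally. The column
# names below are hypothetical.
import pandas as pd

toy = pd.DataFrame(columns=['Outlook', 'Temperature', 'Humidity', 'Wind', 'Decision'])
header = "def findDecision(obj): #"
header += ", ".join("obj[%d]: %s" % (i, name) for i, name in enumerate(toy.columns[:-1]))
print(header)
#def findDecision(obj): #obj[0]: Outlook, obj[1]: Temperature, obj[2]: Humidity, obj[3]: Wind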
def fit(df, config={}, validation_df=None):
    """
    Parameters:
        df (pandas data frame): training data frame. The target column must be
            named 'Decision' and it has to be the last column.

        config (dictionary):

            config = {
                'algorithm' (string): 'ID3', 'C4.5', 'CART', 'CHAID' or 'Regression',
                'enableParallelism' (boolean): False,
                'enableGBM' (boolean): True,
                'epochs' (int): 7,
                'learning_rate' (int): 1,
                'enableRandomForest' (boolean): True,
                'num_of_trees' (int): 5,
                'enableAdaboost' (boolean): True,
                'num_of_weak_classifier' (int): 4
            }

        validation_df (pandas data frame): if no validation data frame is passed,
            the built trees are validated on the training data frame.

    Returns:
        chefboost model
    """

    process_id = os.getpid()

    base_df = df.copy()

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError(
            'Please confirm that the target column is named "Decision" and is the right-most column of the pandas data frame'
        )

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            nan_value = []
            nan_value.append(column)

            if idx.shape[0] > 0:
                df.loc[idx, column] = min_value - 1
                nan_value.append(min_value - 1)
                #print("NaN values are replaced with ", min_value - 1, " in column ", column)
            else:
                nan_value.append(None)

            nan_values.append(nan_value)

    #------------------------
    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError('Invalid algorithm passed. You passed ', algorithm,
                         ' but valid algorithms are ', valid_algorithms)

    #------------------------

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']  #no longer used; check whether this variable can be removed

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']
    enableParallelism = config['enableParallelism']

    #------------------------

    if enableParallelism == True:
        print("[INFO]: ", config["num_cores"], "CPU cores will be allocated in parallel running")

    #------------------------

    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError(
                'Regression trees cannot be applied to nominal target values! Change either the algorithm or the data set.'
            )

    if df['Decision'].dtypes != 'object':  #this must be a regression tree even if it is not mentioned in the config
        if algorithm != 'Regression':
            print("WARNING: You set the algorithm to ", algorithm,
                  " but the Decision column of your data set has non-object type.")
            print("That's why the algorithm is set to Regression to handle the data set.")

        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        #enableParallelism = False
        for j in range(0, num_of_columns):
            column_name = df.columns[j]
            if df[column_name].dtypes == 'object':
                raise ValueError('Adaboost must be run on a numeric data set for both features and target')

    #-------------------------

    print(algorithm, " tree is going to be built...")

    #this dictionary stores whether each feature is numeric or nominal;
    #numeric features will be transformed to nominal values based on scales
    dataset_features = dict()

    header = "def findDecision(obj): #"

    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name
        if i != num_of_columns - 1:
            header = header + ", "

    header = header + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features,
                                       validation_df=validation_df)

    elif enableGBM == True:
        if df['Decision'].dtypes == 'object':  #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features,
                                           validation_df=validation_df)
            classification = True
        else:  #regression
            trees = gbm.regressor(df, config, header, dataset_features,
                                  validation_df=validation_df)
            classification = False

    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features,
                                   validation_df=validation_df, process_id=process_id)

    else:  #regular decision tree building
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df, root=root, file=file, config=config,
                                           dataset_features=dataset_features,
                                           parent_level=0, leaf_id=0, parents='root',
                                           validation_df=validation_df,
                                           main_process_id=process_id)

    print("-------------------------")
    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    #-----------------------------------------
    #train set accuracy
    df = base_df.copy()
    evaluate(obj, df, task='train')

    #validation set accuracy
    if isinstance(validation_df, pd.DataFrame):
        evaluate(obj, validation_df, task='validation')

    #-----------------------------------------

    return obj
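# --- illustrative sketch (not part of the library) ---
# Hypothetical end-to-end call of fit above. The file name, its columns and the
# config values are examples only; any data frame with a trailing 'Decision'
# column works.
import pandas as pd

df = pd.read_csv("golf.txt")  #hypothetical data set
config = {'algorithm': 'C4.5', 'enableParallelism': False}
model = fit(df, config)
#prediction for one instance, features passed positionally:
#model["trees"][0].findDecision(['Sunny', 85, 85, 'Weak'])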
def regressor(df, config, header, dataset_features):
    models = []

    algorithm = config['algorithm']

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    #------------------------------

    boosted_from = 0
    boosted_to = 0

    #------------------------------

    base_df = df.copy()  #gbm will manipulate actuals; store the raw version

    target_values = base_df['Decision'].values
    num_of_instances = target_values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"
    functions.createFile(file, header)

    Training.buildDecisionTree(df, root, file, config, dataset_features)  #generate rules0

    df = base_df.copy()
    base_df['Boosted_Prediction'] = 0

    #------------------------------

    pbar = tqdm(range(1, epochs + 1), desc='Boosting')
    for index in pbar:
        #print("epoch ", index, " - ", end='')
        loss = 0

        #run data(i-1) and rules(i-1), save data(i)

        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)  #rules0

        models.append(myrules)

        new_data_set = "outputs/data/data%s.csv" % (index)

        #----------------------------------------

        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)

        base_df['Boosted_Prediction'] += df['Prediction']

        loss = (base_df['Boosted_Prediction'] - base_df['Decision']).pow(2).sum()

        if index == 1:
            boosted_from = loss / num_of_instances
        elif index == epochs:
            boosted_to = loss / num_of_instances

        df['Decision'] = int(learning_rate) * (df['Decision'] - df['Prediction'])
        df = df.drop(columns=['Epoch', 'Prediction'])

        #---------------------------------

        df.to_csv(new_data_set, index=False)  #data(i) created, header included

        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"
        functions.createFile(file, header)

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features)
        #numeric features require this restoration to apply the findDecision function
        df = current_df.copy()

        #rules(i) created

        loss = loss / num_of_instances
        #print("epoch ", index, " - loss: ", loss)
        pbar.set_description("Epoch %d. Loss: %.4f. Process: " % (index, loss))

    #---------------------------------

    print(num_of_instances, " instances are boosted from ", boosted_from,
          " to ", boosted_to, " in ", epochs, " epochs")

    return models
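# --- illustrative sketch (not part of the library) ---
# The residual bookkeeping performed above, stripped to one dimension: each
# round fits the *residual* of the boosted prediction so far, and predictions
# accumulate additively. The weak learner is faked here as "recover half the
# residual"; everything below is hypothetical.
import numpy as np

actual = np.array([10.0, 20.0, 30.0])
boosted = np.zeros(3)
target = actual.copy()           #round 0 fits the raw actuals
for epoch in range(3):
    prediction = 0.5 * target    #stand-in for a weak regression tree
    boosted += prediction
    target = target - prediction #next round's 'Decision' column
    print(epoch, ((boosted - actual) ** 2).mean())  #MSE shrinks each round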
def classifier(df, config, header, dataset_features):
    models = []

    print("gradient boosting for classification")

    epochs = config['epochs']

    temp_df = df.copy()
    original_dataset = df.copy()
    worksheet = df.copy()

    classes = df['Decision'].unique()

    boosted_predictions = np.zeros([df.shape[0], len(classes)])

    pbar = tqdm(range(0, epochs), desc='Boosting')

    #store the actual set; we will use this to calculate loss
    actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]), columns=classes)
    for i in range(0, len(classes)):
        current_class = classes[i]
        actual_set[current_class] = np.where(df['Decision'] == current_class, 1, 0)
    actual_set = actual_set.values  #transform it to a numpy array

    for epoch in pbar:
        for i in range(0, len(classes)):
            current_class = classes[i]

            if epoch == 0:
                temp_df['Decision'] = np.where(df['Decision'] == current_class, 1, 0)
                worksheet['Y_' + str(i)] = temp_df['Decision']
            else:
                temp_df['Decision'] = worksheet['Y-P_' + str(i)]

            predictions = []

            #change data type for decision column
            #note: this astype call returns a copy and does not modify temp_df in place
            temp_df[['Decision']].astype('int64')

            root = 1
            file = "outputs/rules/rules-for-" + current_class + "-round-" + str(epoch) + ".py"

            functions.createFile(file, header)

            Training.buildDecisionTree(temp_df, root, file, config, dataset_features)
            #decision rules created

            #----------------------------

            #dynamic import
            moduleName = "outputs/rules/rules-for-" + current_class + "-round-" + str(epoch)
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description)

            models.append(myrules)

            num_of_columns = df.shape[1]

            for row, instance in df.iterrows():
                features = []
                for j in range(0, num_of_columns - 1):  #iterate on features
                    features.append(instance[j])

                actual = temp_df.loc[row]['Decision']
                prediction = myrules.findDecision(features)

                predictions.append(prediction)

            #----------------------------

            if epoch == 0:
                worksheet['F_' + str(i)] = 0
            else:
                worksheet['F_' + str(i)] = pd.Series(predictions).values

            boosted_predictions[:, i] = boosted_predictions[:, i] \
                + worksheet['F_' + str(i)].values.astype(np.float32)
            #print(boosted_predictions[0:5, :])

            worksheet['P_' + str(i)] = 0

            #----------------------------

            temp_df = df.copy()  #restoration

        for row, instance in worksheet.iterrows():
            f_scores = []
            for i in range(0, len(classes)):
                f_scores.append(instance['F_' + str(i)])

            probabilities = functions.softmax(f_scores)

            for j in range(0, len(probabilities)):
                instance['P_' + str(j)] = probabilities[j]

            worksheet.loc[row] = instance

        for i in range(0, len(classes)):
            worksheet['Y-P_' + str(i)] = worksheet['Y_' + str(i)] - worksheet['P_' + str(i)]

        prediction_set = np.zeros([df.shape[0], len(classes)])
        for i in range(0, boosted_predictions.shape[0]):
            predicted_index = np.argmax(boosted_predictions[i])
            prediction_set[i][predicted_index] = 1

        #----------------------------
        #find loss for this epoch: prediction_set vs actual_set

        classified = 0
        for i in range(0, actual_set.shape[0]):
            actual = np.argmax(actual_set[i])
            prediction = np.argmax(prediction_set[i])
            #print("actual: ", actual, " - prediction: ", prediction)
            if actual == prediction:
                classified = classified + 1

        accuracy = str(100 * classified / actual_set.shape[0]) + "%"

        #----------------------------

        #print(worksheet.head())
        #print("round ", epoch + 1)
        pbar.set_description("Epoch %d. Accuracy: %s. Process: " % (epoch + 1, accuracy))

    return models, classes
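# --- illustrative sketch (not part of the library) ---
# The per-class bookkeeping above in miniature: raw F scores are squashed with
# softmax into probabilities P, and the next round fits Y - P, the negative
# gradient of the cross-entropy loss. The scores below are hypothetical.
import numpy as np

def softmax(scores):
    e = np.exp(scores - np.max(scores))  #shifted for numerical stability
    return e / e.sum()

f_scores = np.array([2.0, 0.5, 0.1])     #one instance, three classes
y = np.array([1.0, 0.0, 0.0])            #one-hot actual
p = softmax(f_scores)
print(p)                                 #~[0.73, 0.16, 0.11]
print(y - p)                             #residual target for the next round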
def apply(df, config, header, dataset_features, validation_df=None, process_id=None):
    models = []

    num_of_trees = config['num_of_trees']

    parallelism_on = config["enableParallelism"]

    #TODO: is this logical for 48x2 cores?
    #config["enableParallelism"] = False #run each tree in parallel but each branch in serial

    #TODO: reconstruction for parallel runs is problematic; reconstruct based on tree id.

    input_params = []

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"

        functions.createFile(file, header)

        if parallelism_on:  #parallel run
            input_params.append((subset, root, file, config, dataset_features,
                                 0, 0, 'root', i, None, process_id))
        else:  #serial run
            Training.buildDecisionTree(subset, root, file, config, dataset_features,
                                       parent_level=0, leaf_id=0, parents='root',
                                       tree_id=i, main_process_id=process_id)

    #-------------------------------

    if parallelism_on:
        num_cores = config["num_cores"]

        #---------------------------------

        if num_of_trees <= num_cores:
            POOL_SIZE = num_of_trees
        else:
            POOL_SIZE = num_cores

        with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            funclist = []
            for input_param in input_params:
                f = pool.apply_async(buildDecisionTree, [*input_param])
                funclist.append(f)

            #all functions are registered here

            for f in tqdm(funclist):
                branch_results = f.get(timeout=100000)

            pool.close()
            pool.terminate()

    #-------------------------------
    #collect models for both serial and parallel runs here

    for i in range(0, num_of_trees):
        moduleName = "outputs/rules/rule_" + str(i)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------

    return models
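# --- illustrative sketch (not part of the library) ---
# The pool pattern used above, reduced to a toy task: register async jobs with
# apply_async, then block on .get(). 'square' is a hypothetical stand-in for
# buildDecisionTree.
import multiprocessing
from contextlib import closing

def square(x):
    return x * x

if __name__ == '__main__':
    with closing(multiprocessing.Pool(2)) as pool:
        funclist = [pool.apply_async(square, [i]) for i in range(4)]
        results = [f.get(timeout=10) for f in funclist]
    print(results)  #[0, 1, 4, 9]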
def regressor(df, config, header, dataset_features, validation_df=None, process_id=None):
    models = []

    #we will update decisions in every epoch; this will be used for restoration
    base_actuals = df.Decision.values

    algorithm = config['algorithm']

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    #------------------------------

    boosted_from = 0
    boosted_to = 0

    #------------------------------

    base_df = df.copy()  #gbm will manipulate actuals; store the raw version

    target_values = base_df['Decision'].values
    num_of_instances = target_values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"
    json_file = "outputs/rules/rules0.json"
    functions.createFile(file, header)
    functions.createFile(json_file, "[\n")

    Training.buildDecisionTree(df, root, file, config, dataset_features,
                               parent_level=0, leaf_id=0, parents='root')  #generate rules0

    #functions.storeRule(json_file, " {}]")

    df = base_df.copy()
    base_df['Boosted_Prediction'] = 0

    #------------------------------

    best_epoch_idx = 0
    best_epoch_loss = 1000000

    pbar = tqdm(range(1, epochs + 1), desc='Boosting')
    for index in pbar:
        #print("epoch ", index, " - ", end='')
        loss = 0

        #run data(i-1) and rules(i-1), save data(i)

        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)  #rules0

        models.append(myrules)

        new_data_set = "outputs/data/data%s.csv" % (index)

        #----------------------------------------

        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)

        base_df['Boosted_Prediction'] += df['Prediction']

        loss = (base_df['Boosted_Prediction'] - base_df['Decision']).pow(2).sum()
        current_loss = loss / num_of_instances  #mse

        if index == 1:
            boosted_from = current_loss
        elif index == epochs:
            boosted_to = current_loss

        if current_loss < best_epoch_loss:
            best_epoch_loss = current_loss
            best_epoch_idx = index

        df['Decision'] = int(learning_rate) * (df['Decision'] - df['Prediction'])
        df = df.drop(columns=['Epoch', 'Prediction'])

        #---------------------------------

        df.to_csv(new_data_set, index=False)  #data(i) created, header included

        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"
        json_file = "outputs/rules/rules" + str(index) + ".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features,
                                   parent_level=0, leaf_id=0, parents='root',
                                   main_process_id=process_id)
        #functions.storeRule(json_file, " {}]")

        #numeric features require this restoration to apply the findDecision function
        df = current_df.copy()

        #rules(i) created

        loss = loss / num_of_instances
        #print("epoch ", index, " - loss: ", loss)
        pbar.set_description("Epoch %d. Loss: %.4f. Process: " % (index, loss))

        gc.collect()

    #---------------------------------

    print("The best epoch is ", best_epoch_idx, " with ", best_epoch_loss, " loss value")
    models = models[0:best_epoch_idx]
    config["epochs"] = best_epoch_idx

    print("MSE of ", num_of_instances, " instances are boosted from ", boosted_from,
          " to ", best_epoch_loss, " in ", epochs, " epochs")

    return models
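# --- illustrative sketch (not part of the library) ---
# Why models[0:best_epoch_idx] works above: rules0 fits the raw target and each
# later file fits a residual, so the ensemble prediction is a running sum that
# can simply be cut off at the best epoch. The loss trace below is hypothetical.
losses = [4.2, 2.9, 2.5, 2.6, 2.8]  #MSE per epoch; overfits after epoch 3
best_epoch_idx = losses.index(min(losses)) + 1
print(best_epoch_idx)               #3 -> keep only the first three trees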
def apply(df, config, header, dataset_features):
    models = []

    num_of_trees = config['num_of_trees']

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"
        json_file = moduleName + ".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        Training.buildDecisionTree(subset, root, file, config, dataset_features,
                                   parent_level=0, leaf_id=0, parents='root')

        functions.storeRule(json_file, "{}]")

        #--------------------------------

        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------
    #check regression or classification

    if df['Decision'].dtypes == 'object':
        problem_type = 'classification'
    else:
        problem_type = 'regression'

    actual_values = df['Decision'].values
    num_of_features = df.shape[1] - 1  #discard Decision
    number_of_instances = df.shape[0]

    global_predictions = []

    #if classification, take the majority vote over the trees' predictions
    if problem_type == 'classification':
        for i in range(0, num_of_trees):
            moduleName = "outputs/rules/rule_" + str(i)
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description)

            predictions = []

            for index, instance in df.iterrows():
                params = []
                for j in range(0, num_of_features):
                    params.append(instance[j])  #index-th row, j-th column

                prediction = myrules.findDecision(params)
                predictions.append(prediction)
                #print(i, "th tree prediction: ", prediction)

            #print(predictions)
            global_predictions.append(predictions)

        #-------------------------------

        classified = 0
        for index in range(0, len(actual_values)):
            actual = actual_values[index]

            predictions = []
            for i in range(0, num_of_trees):
                prediction = global_predictions[i][index]
                if prediction != None:  #why does None exist in some cases?
                    predictions.append(prediction)

            predictions = np.array(predictions)
            unique_values = np.unique(predictions)

            if unique_values.shape[0] == 1:
                prediction = unique_values[0]
            else:
                counts = []
                for unique in unique_values:
                    count = 0
                    for j in predictions:
                        if unique == j:
                            count = count + 1
                    counts.append(count)

                #print("unique: ", unique_values)
                #print("counts: ", counts)

                prediction = None
                if len(counts) > 0:
                    max_index = np.argmax(np.array(counts))
                    prediction = unique_values[max_index]

            #print(index, ". actual: ", actual, " - prediction: ", prediction)
            if actual == prediction:
                classified = classified + 1

        print("Accuracy: ", 100 * classified / number_of_instances, "% on ",
              number_of_instances, " instances")

    return models
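# --- illustrative sketch (not part of the library) ---
# The majority vote above, rewritten with numpy's return_counts for one
# instance's predictions across trees (the values are hypothetical):
import numpy as np

predictions = np.array(['Yes', 'No', 'Yes', 'Yes'])
unique_values, counts = np.unique(predictions, return_counts=True)
prediction = unique_values[np.argmax(counts)]
print(prediction)  #'Yes'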