def rf(self):
    # Train a random forest on the training file and, if the prediction flag
    # is set, run predictions and bin the results.
    clf = rfc.RandomForest("train.csv")
    tree = clf.train()
    print(tree)
    if self.p_flag == "p":
        print(clf.predict)
        predictions = tree.predict(clf.predict)
        self.binning(clf.predict_original, predictions)
def check_feature_rate():
    import math
    import randomForest as rf

    missing_input = 'none'  # 'mean'
    transform = False
    scale = True
    use_text = False
    dummy = False
    use_feature_selection = False
    data_path = 'DorCirurgiaCategNA.csv'
    class_questionnaire = 'Q92510'
    class_name = 'Q92510_snDorPos'

    data, original_attributes, categories = read.readData(
        data_path=data_path,
        class_name=class_name,
        class_questionnaire=class_questionnaire,
        missing_input=missing_input,
        dummy=dummy,
        transform_numeric=transform,
        use_text=use_text,
        skip_class_questionnaire=True)  # skip_class_questionnaire=False

    X = data[:, 0:-1]
    y = np.array(data[:, -1])

    ntrees = 5001
    replace = False
    mtry = math.sqrt
    max_depth = None
    missing_branch = True
    seed = np.random.randint(0, 10000)

    clf1 = rf.RandomForest(ntrees=ntrees,
                           oob_error=True,
                           random_state=seed,
                           mtry=mtry,
                           missing_branch=missing_branch,
                           prob_answer=False,
                           max_depth=max_depth,
                           replace=replace,
                           balance=True)
    clf1.fit(X, y)

    # Count how many trees in the forest use each feature.
    attributes_used = {}
    for tree in clf1.forest:
        for attribute in tree.feature_indices:
            if attribute not in attributes_used:
                attributes_used[attribute] = 1
            else:
                attributes_used[attribute] += 1

    # Report if some features were never selected by any tree.
    if len(attributes_used) != X.shape[1]:
        print(len(attributes_used))
        print(X.shape[1])
        print('not equal! %r of features unused' %
              (1 - len(attributes_used) / X.shape[1]))
    print({original_attributes[a]: b for a, b in attributes_used.items()})
    print(1 - clf1.oob_error_)
def select_params(X, y):
    # Grid search over random forest hyperparameters, scored by OOB error.
    times = 10
    final_scores = []
    parameters = []
    ntrees = 10  # [700, 600, 500, 400, 300, 200, 100, 50, 25]
    mtry = [math.sqrt, None, math.log2]
    max_depth = [2, 3, 4, None]
    missing_branch = [True]  # [True, False]
    replace = [False]  # [True, False]
    for md in max_depth:
        for mb in missing_branch:
            for mt in mtry:
                for r in replace:
                    parameters.append({
                        'max_depth': md,
                        'missing_branch': mb,
                        'mtry': mt,
                        'replace': r
                    })
    cont = 0
    print(len(parameters))
    for params in parameters:
        cont += 1
        print('Choice %r of parameters' % cont)
        seed = np.random.randint(0, 100000)
        # for seed in np.random.choice(range(1000), times):
        clf = rf.RandomForest(random_state=seed,
                              ntrees=ntrees,
                              oob_error=True,
                              mtry=params['mtry'],
                              missing_branch=params['missing_branch'],
                              max_depth=params['max_depth'],
                              replace=params['replace'],
                              balance=True)
        clf.fit(X, y)
        final_scores.append(clf.oob_error_)

    # OOB error: lower is better.
    min_score = min(final_scores)
    std = np.std(final_scores)
    print('Best set of parameters:')
    indexes = np.where(np.array(final_scores) == min_score)[0]
    for index in indexes:
        print(parameters[index])

    # One-standard-error rule: pick the parameter set whose error is closest
    # to the best error plus one standard deviation.
    print('Best 1-s.e. set of parameters:')
    index = (np.abs(np.array(final_scores) - (min_score + std))).argmin()
    print('%r: %r' % (parameters[index], final_scores[index]))

    print('10 best parameters:')
    for a, b in zip(np.array(parameters)[np.array(final_scores).argsort()[:10]],
                    np.array(sorted(final_scores)[:10])):
        print('%r : %r' % (a, b))
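# Usage sketch (illustrative only, assuming data loaded as in
# check_feature_rate above; rf, read, np and math are the module-level
# imports used throughout this file):
#
#     data, original_attributes, categories = read.readData(...)
#     X = data[:, 0:-1]
#     y = np.array(data[:, -1])
#     select_params(X, y)
#
# Since the scores are OOB errors, the smallest entries are the best.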
def plot_randomforest_accuracy_threshold(X, y, original_attributes,
                                         variable_importances, ntrees,
                                         replace, mtry, max_depth,
                                         missing_branch):
    # Plot OOB accuracy against the importance threshold as features are
    # eliminated from least to most important.
    thrs = []
    accuracy = []
    nfeatures = len(variable_importances)
    features = [a[0] for a in variable_importances]
    thresholds = [a[1] for a in variable_importances]

    seed = np.random.randint(0, 10000)
    clf = rf.RandomForest(ntrees=ntrees,
                          mtry=mtry,
                          missing_branch=missing_branch,
                          prob_answer=False,
                          max_depth=max_depth,
                          replace=replace,
                          random_state=seed)
    clf.fit(X, y)
    thrs.append(thresholds[-1])
    accuracy.append(1 - clf.oob_error_)

    for i in range(1, nfeatures):
        print('eliminating feature %r...' % original_attributes[features[-i]])
        seed = np.random.randint(0, 10000)
        clf = rf.RandomForest(ntrees=ntrees,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              random_state=seed)
        clf.fit(X[:, features[:-i]], y)
        thrs.append(thresholds[-i - 1])
        accuracy.append(1 - clf.oob_error_)

    plt.plot(thrs, accuracy, 'bo')
    plt.xlabel('threshold')
    plt.ylabel('accuracy')
    plt.show()
def plot_randomforest_seed(X, y, attributes):
    missing_branch = []
    missing_c45 = []
    seeds = []
    for seed in range(0, 1000, 10):
        print('seed:')
        print(seed)
        clf = rf.RandomForest(ntrees=300,
                              mtry=math.sqrt,
                              missing_branch=True,
                              prob_answer=False,
                              max_depth=4,
                              replace=False,
                              random_state=seed)
        clf.fit(X, y)
        missing_branch.append(1 - clf.oob_error_)
        print(1 - clf.oob_error_)

        clf2 = rf.RandomForest(ntrees=300,
                               mtry=math.sqrt,
                               missing_branch=False,
                               prob_answer=False,
                               max_depth=4,
                               replace=False,
                               random_state=seed)
        clf2.fit(X, y)
        missing_c45.append(1 - clf2.oob_error_)
        print(1 - clf2.oob_error_)
        seeds.append(seed)

    plt.plot(missing_c45, missing_branch, 'x', color='blue')
    plt.show()
def plot_randomforest_accuracy_nfeatures(X, y, original_attributes, features,
                                         ntrees, replace, mtry, max_depth,
                                         missing_branch):
    nf = []
    accuracy = []
    nfeatures = len(features)

    seed = np.random.randint(0, 10000)
    clf = rf.RandomForest(ntrees=ntrees,
                          mtry=mtry,
                          missing_branch=missing_branch,
                          prob_answer=False,
                          max_depth=max_depth,
                          replace=replace,
                          random_state=seed)
    clf.fit(X, y)
    nf.append(0)
    accuracy.append(1 - clf.oob_error_)

    for i in range(1, nfeatures):
        print('eliminating feature %r...' % original_attributes[features[-i]])
        seed = np.random.randint(0, 10000)
        clf = rf.RandomForest(ntrees=ntrees,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              random_state=seed)
        clf.fit(X[:, features[:-i]], y)
        nf.append(i)
        accuracy.append(1 - clf.oob_error_)

    plt.plot(nf, accuracy, 'bo')
    plt.xlabel('number of features not being considered')
    plt.ylabel('accuracy')
    plt.show()
def plot_randomforest_accuracy(X, y, attributes, ntrees, replace, mtry,
                               max_depth, missing_branch, ntimes, title=None):
    # Train ntimes forests with random seeds and plot a histogram of the
    # rounded OOB accuracies.
    missing_branch_dict = {}
    for i in range(ntimes):
        seed = np.random.randint(100000)
        clf = rf.RandomForest(ntrees=ntrees,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              random_state=seed)
        clf.fit(X, y)
        acc = round(1 - clf.oob_error_, 2)
        if acc not in missing_branch_dict:
            missing_branch_dict[acc] = 1
        else:
            missing_branch_dict[acc] += 1

    k = sorted(missing_branch_dict.items(), key=lambda x: x[0])
    plt.bar(range(len(k)), [i[1] for i in k])
    pos = np.arange(len(k))
    width = 1.0  # gives the bar chart a histogram aspect
    ax = plt.axes()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels([round(i[0], 2) for i in k])
    ax.set_yticks(range(0, 50, 5))
    plt.xlabel('accuracy with missing branch = ' + str(missing_branch))
    if title is not None:
        plt.title(title)
    plt.ylabel('frequency')
    plt.show()
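# Usage sketch (parameter values are illustrative only): histogram the OOB
# accuracy of 50 forests trained with different random seeds.
#
#     plot_randomforest_accuracy(X, y, original_attributes, ntrees=300,
#                                replace=False, mtry=math.sqrt, max_depth=4,
#                                missing_branch=True, ntimes=50,
#                                title='OOB accuracy over 50 seeds')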
# Shuffle the training data, then split off the first 2000 rows as a
# validation set.
test_X = data['test_data']
train_y = (data['training_labels'].T)[:, 0]
train_X = data['training_data']

x_y = list(zip(train_X, train_y))
random.shuffle(x_y)
train_X = np.array([e[0] for e in x_y])
train_y = np.ravel([e[1] for e in x_y])

validation_X = train_X[:2000, :]
validation_y = train_y[:2000]
train_X = train_X[2000:, :]
train_y = train_y[2000:]
print(train_X.shape)

# random forest
rf = randomForest.RandomForest(10, 10, train_X.shape[0], train_X.shape[1])
rf.train(train_X, train_y)
res = rf.predict(validation_X)
score = 0
for i in range(len(res)):
    if res[i] == validation_y[i]:
        score += 1
score /= len(res)
print(score)

# decision tree
tree = decisionTree.DecisionTree(10, train_X.shape[1])
tree.train(train_X, train_y)
res = tree.predict(validation_X)
def feature_selection_threshold(X, y, ntrees, replace, mtry, max_depth,
                                missing_branch, balance, cutoff, ntimes=25,
                                title=None, missing_rate=False, vitype='err',
                                vimissing=True, backwards=False,
                                save_models=False, random_subspace=False):
    # Get the average importance of each feature over ntimes forests.
    vis = average_varimp(X, y, ntrees, replace, mtry, max_depth,
                         missing_branch, balance=balance,
                         missing_rate=missing_rate, ntimes=ntimes,
                         select=False, mean=False, vitype=vitype,
                         vimissing=vimissing, printvi=False,
                         random_subspace=random_subspace)

    # If backwards is True, feature selection starts with all features and
    # eliminates the least important ones at each step.
    if backwards:
        reverse = False
        comp_threshold = lambda x, y: x <= y
        get_slice = lambda x, index: x[index:]
        stop_index = -1
        chosen_model = -1
    # Otherwise it starts with only the most important feature and adds the
    # next most important ones at each step.
    else:
        reverse = True
        comp_threshold = lambda x, y: x > y
        get_slice = lambda x, index: x[0:index]
        stop_index = 0
        chosen_model = 0

    ordered_features = [
        a[0] for a in sorted(vis, key=lambda x: np.mean(x[1]), reverse=reverse)
    ]
    thresholds = [np.mean(vis[a][1]) for a in ordered_features]
    threshold_values = sorted([round(a, 10) for a in set(thresholds)],
                              reverse=reverse)
    stop_indexes = []
    scores = []
    nn = 0

    # For each threshold value (feature importance value), grow a forest
    # using only: (a) features whose importance is <= the threshold, if
    # backwards is True (starting from the least important); or (b) features
    # whose importance is > the threshold, if backwards is False (starting
    # from the most important one).
    for threshold in threshold_values:
        s_index = stop_index + 1
        while s_index < len(thresholds):
            if comp_threshold(threshold, thresholds[s_index]):
                break
            s_index += 1
        stop_index = s_index
        features = get_slice(ordered_features, stop_index)

        seed = np.random.randint(0, 10000)
        clf = rf.RandomForest(ntrees=ntrees,
                              oob_error=True,
                              random_state=seed,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              balance=balance,
                              cutoff=cutoff,
                              random_subspace=random_subspace)
        clf.fit(X[X.columns[features]], y)
        clf.threshold = threshold
        if 'participant code' not in clf.X.columns:
            clf.X['participant code'] = X['participant code']
        if 'Q44071_snCplexoAt' not in clf.X.columns:
            clf.X['Q44071_snCplexoAt'] = X['Q44071_snCplexoAt']
        scores.append(1 - clf.oob_error_)

        if save_models:
            with open('prognostic_model_' + title + str(nn) + '.pickle',
                      'wb') as handle:
                pickle.dump(clf, handle)
            nn += 1
        stop_indexes.append(stop_index)

    # The forest with the best score (closest to the maximum score minus the
    # standard error of the scores) and the biggest threshold value is chosen
    # as the suggested model.
    stdm = sem(scores)
    indexes = np.where(
        np.array(scores) == scores[(np.abs(
            np.array([a for a in scores if a != max(scores)]) -
            (max(scores) - stdm))).argmin()])[0]
    index = indexes[chosen_model]

    clf = rf.RandomForest(ntrees=ntrees,
                          oob_error=True,
                          random_state=seed,
                          mtry=mtry,
                          missing_branch=missing_branch,
                          prob_answer=False,
                          max_depth=max_depth,
                          replace=replace,
                          balance=balance,
                          cutoff=cutoff,
                          random_subspace=random_subspace)
    clf.fit(X[X.columns[get_slice(ordered_features, stop_indexes[index])]], y)

    if save_models:
        plot.plot_feature_importance_vs_accuracy(threshold_values,
                                                 scores,
                                                 xlabel='threshold',
                                                 title=title,
                                                 special=index)
    return clf
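# Usage sketch (hypothetical values; the right cutoff depends on the class
# distribution): backwards feature selection starting from the full feature
# set, assuming X is a pandas DataFrame that includes the 'participant code'
# and 'Q44071_snCplexoAt' columns re-attached above.
#
#     model = feature_selection_threshold(X, y, ntrees=500, replace=False,
#                                         mtry=math.sqrt, max_depth=None,
#                                         missing_branch=True, balance=True,
#                                         cutoff=0.5, backwards=True)
#     print(1 - model.oob_error_)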
def average_varimp(X, y, ntrees, replace, mtry, max_depth, missing_branch,
                   balance, vitype='err', vimissing=True, ntimes=25,
                   select=True, printvi=False, plotvi=False, cutpoint=0.0,
                   mean=False, title=None, missing_rate=False,
                   random_subspace=False):
    # Average each feature's importance over ntimes forests grown with
    # different random seeds.
    vi = {a: [] for a in range(X.shape[1])}
    for i in range(min(ntimes, X.shape[0])):
        seed = np.random.randint(0, 10000)
        clf = rf.RandomForest(ntrees=ntrees,
                              oob_error=True,
                              random_state=seed,
                              mtry=mtry,
                              missing_branch=missing_branch,
                              prob_answer=False,
                              max_depth=max_depth,
                              replace=replace,
                              balance=balance,
                              random_subspace=random_subspace)
        clf.fit(X, y)
        varimps = clf.variable_importance(vitype=vitype, vimissing=vimissing)
        for var in varimps.keys():
            if missing_rate:
                # Weight the importance by the feature's non-missing rate.
                vi[var].append(varimps[var] *
                               utils.notNanProportion(X[X.columns[var]]))
            else:
                vi[var].append(varimps[var])

    vimean = {var: np.mean(values) for var, values in vi.items()}

    if printvi:
        vis = sorted(vimean.items(), key=lambda x: x[1], reverse=True)
        for v, i in vis:
            print('feature: %r importance: %r' % (X.columns[v], i))

    if plotvi:
        print(cutpoint)
        importance_values = []
        features = []
        vis = sorted(vi.items(), key=lambda x: x[0])
        for v, i in vis:
            if vimean[v] >= cutpoint:
                importance_values.append(i)
                features.append(X.columns[v])
        import plot
        plot.boxplot(importance_values, features, title)

    if select:
        # Return the indices of the features whose mean importance reaches
        # the cutpoint.
        vis = sorted(vimean.items(), key=lambda x: x[1], reverse=True)
        return sorted([var[0] for var in vis if vimean[var[0]] >= cutpoint])
    if mean:
        return sorted(vimean.items(), key=lambda x: x[1], reverse=True)
    return sorted(vi.items(), key=lambda x: x[0])
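# Usage sketch (illustrative values): select the features whose mean
# importance over 25 forests reaches the default cutpoint, mirroring the
# select=True branch above.
#
#     selected = average_varimp(X, y, ntrees=500, replace=False,
#                               mtry=math.sqrt, max_depth=None,
#                               missing_branch=True, balance=True,
#                               select=True, cutpoint=0.0)
#     print([X.columns[i] for i in selected])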
print('Testing Decision Tree score method...')
test_samples = [['RAIN', 63, 50, 'T'],
                ['SUNNY', 66, 90, 'F'],
                ['SUNNY', 50, 50, 'T'],
                ['OVERCAST', 70, 50, 'F']]
test_labels = ['PLAY', 'PLAY', 'PLAY', 'PLAY']
assert (m.score(test_samples, test_labels) ==
        accuracy_score(m.predict(test_samples), test_labels))

print('Testing Random Forest...')
clf = rf.RandomForest(ntrees=8,
                      mtry=math.sqrt,
                      oob_error=True,
                      random_state=9,
                      missing_branch=False,
                      prob_answer=False,
                      max_depth=3,
                      replace=False)
clf.fit(X, y)
fcs = clf.feature_contribution()
# clf.forest[-1].to_dot(original_attributes, out='out.dot')
clf.forest[-1].to_pdf(original_attributes, out='out2.pdf')

print('Testing Random Forest with missing data...')
# data = read.readData(data_path='../Dados/Test_with_nan2.csv',
#                      class_name='Class', dummy=dummy,
#                      transform_numeric=transform, use_text=use_text,
#                      missing_input='none')
X = data[data.columns[:-1]].values
y = data['Class'].values