import copy
import time

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

import evaluation
import models.dtree.prune
import models.dtree.prune_faster


def decision_tree(X_train, X_test, y_train, y_test, *, max_depth=None,
                  random_state=None):
    # Train and evaluate an unpruned decision tree regressor.
    dtree = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    model = str(dtree) + '\n\nwithout Pruning'

    fit_start = time.time()
    dtree.fit(X_train, y_train)
    fit_time = time.time() - fit_start

    pred_start = time.time()
    y_prediction = dtree.predict(X_test)
    pred_time = time.time() - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
    evaluation.print_errors(y_test, y_prediction, model, fit_time, pred_time)
def random_forest(X_train, X_test, y_train, y_test, *, n_estimators=10,
                  criterion='squared_error', max_depth=None,
                  min_samples_split=2, min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0, max_features=1.0,
                  max_leaf_nodes=None, min_impurity_decrease=0.0,
                  bootstrap=True, oob_score=False, n_jobs=1, random_state=None,
                  verbose=0, warm_start=False):
    # Train and evaluate a random forest regressor. Parameter names follow
    # current scikit-learn: criterion='squared_error' and max_features=1.0
    # replace the legacy 'mse' and 'auto' defaults, and min_impurity_split
    # no longer exists.
    regr = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start)
    model = str(regr)

    fit_start = time.time()
    regr.fit(X_train, y_train)
    fit_time = time.time() - fit_start

    pred_start = time.time()
    y_prediction = regr.predict(X_test)
    pred_time = time.time() - pred_start

    evaluation.save_errors(y_test, y_prediction, model, fit_time, pred_time)
    evaluation.print_errors(y_test, y_prediction, model, fit_time, pred_time)
def dtree_with_pruning(X_train, X_test, y_train, y_test, *, max_depth=None,
                       random_state=None):
    # Build and train the initial (unpruned) tree
    dtree = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    model = str(dtree) + '\n\nwith Pruning (Legacy)'

    fit_start = time.time()
    dtree.fit(X_train, y_train)
    fit_time = time.time() - fit_start

    pred_start = time.time()

    # Collect the sequence of pruned trees, starting from the full tree
    tree_array = [dtree]
    num_nodes = dtree.tree_.capacity

    # Prune the weakest link at each step and append the resulting tree
    k = 1
    while num_nodes > 1:
        tree_array.append(copy.deepcopy(tree_array[k - 1]))
        min_node_idx, min_gk = models.dtree.prune.determine_alpha(
            tree_array[k].tree_)
        models.dtree.prune.prune(tree_array[k].tree_, min_node_idx)
        num_nodes = sum(1 * (tree_array[k].tree_.n_node_samples != 0))
        k += 1

    # Pick the best tree in the sequence based on test-set MSE
    predictlist = []
    for tree in tree_array:
        pred = tree.predict(X_test)
        # predictlist.append(tree.score(X_test, y_test))
        predictlist.append(mean_squared_error(y_test, pred))
    tree_scores = np.array(predictlist)
    index = tree_scores.argmin()

    pred = tree_array[index].predict(X_test)
    pred_time = time.time() - pred_start

    evaluation.save_errors(y_test, pred, model, fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model, fit_time, pred_time)
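# The loop above relies on the project's own models.dtree.prune helpers.
# Below is a minimal sketch of the same cost-complexity pruning idea using
# scikit-learn's built-in cost_complexity_pruning_path (available since
# scikit-learn 0.22). It is an illustrative alternative, not part of the
# original module; the function name is ours.
def dtree_with_pruning_sklearn(X_train, X_test, y_train, y_test, *,
                               random_state=None):
    # Compute the effective alphas of the pruning path on the training data
    base = DecisionTreeRegressor(random_state=random_state)
    path = base.cost_complexity_pruning_path(X_train, y_train)

    # Fit one tree per alpha and keep the one with the lowest test MSE
    best_tree, best_mse = None, float('inf')
    for alpha in path.ccp_alphas:
        tree = DecisionTreeRegressor(random_state=random_state,
                                     ccp_alpha=alpha).fit(X_train, y_train)
        mse = mean_squared_error(y_test, tree.predict(X_test))
        if mse < best_mse:
            best_tree, best_mse = tree, mse
    return best_tree, best_mse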
def svm_regression(X_train, X_test, y_train, y_test, *, kernel='rbf', degree=3,
                   gamma='auto', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1,
                   shrinking=True, cache_size=200, verbose=False, max_iter=-1):
    # Train and evaluate a support vector regressor.
    svmr = SVR(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol,
               C=C, epsilon=epsilon, shrinking=shrinking, cache_size=cache_size,
               verbose=verbose, max_iter=max_iter)
    model = str(svmr)

    fit_start = time.time()
    svmr.fit(X_train, y_train)
    fit_time = time.time() - fit_start

    pred_start = time.time()
    pred = svmr.predict(X_test)
    pred_time = time.time() - pred_start

    evaluation.save_errors(y_test, pred, model, fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model, fit_time, pred_time)
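# SVR is sensitive to feature scale, so inputs are usually standardized before
# fitting. A minimal sketch using scikit-learn's Pipeline; this is an
# illustrative addition (the function name is ours, not part of the module):
def svm_regression_scaled(X_train, X_test, y_train, y_test, **svr_params):
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # The scaler is fit on the training data only, then applied to both sets
    pipe = make_pipeline(StandardScaler(), SVR(**svr_params))
    pipe.fit(X_train, y_train)
    return pipe.predict(X_test)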
def dtree_with_pruning_faster(X_train, X_test, y_train, y_test, *,
                              max_depth=None, random_state=None):
    # Initiate model
    dtree = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    model = str(dtree) + '\n\nwith Pruning (Faster)'

    # Fit model
    fit_start = time.time()
    dtree.fit(X_train, y_train)
    fit_time = time.time() - fit_start

    pred_start = time.time()

    # Prune trees
    tree_pruner = models.dtree.prune_faster.TreePruner(dtree)
    tree_pruner.run()

    # Calculate train and test errors for every pruned tree
    test_errors = []
    train_errors = []
    for tree in tree_pruner.trees:
        y_pred_test = tree.predict(X_test)
        test_errors.append(mean_squared_error(y_test, y_pred_test))
        y_pred_train = tree.predict(X_train)
        train_errors.append(mean_squared_error(y_train, y_pred_train))

    # Find the best tree based on test data
    index = np.array(test_errors).argmin()
    pred = tree_pruner.trees[index].predict(X_test)
    pred_time = time.time() - pred_start

    evaluation.save_errors(y_test, pred, model, fit_time, pred_time)
    evaluation.print_errors(y_test, pred, model, fit_time, pred_time)
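# Minimal usage sketch for the functions above. The diabetes dataset and the
# 80/20 split are illustrative stand-ins; any regression X/y works. Assumes
# the local `evaluation` module used by the functions is importable.
if __name__ == '__main__':
    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split

    X, y = load_diabetes(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    decision_tree(X_train, X_test, y_train, y_test, random_state=0)
    random_forest(X_train, X_test, y_train, y_test, n_estimators=100,
                  random_state=0)
    svm_regression(X_train, X_test, y_train, y_test)
    dtree_with_pruning_faster(X_train, X_test, y_train, y_test, random_state=0)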
plt.tight_layout()
plt.show()

train_X, test_X, train_y, test_y = features.heart.split_train_test(
    cleaned_data, "AHD")
train_X, test_X, train_y, test_y = features.heart.scale_to_train(
    [train_X, test_X, train_y, test_y], [0, 2, 3, 6, 8, 9, 10, 18], "minmax")
print(train_X)

# Logistic regression with the simple feature set
print("Evaluating simple feature set")
# log_reg = lm.SGDClassifier(n_jobs=10, loss="log_loss", max_iter=50)
log_reg = lm.LogisticRegression()
log_reg.fit(train_X, train_y)
pred = log_reg.predict(test_X)
pred_proba = log_reg.predict_proba(test_X)
evaluation.print_errors(test_y, pred)
print("")
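# scale_to_train above presumably fits the scaler on the training split only
# and reuses it on the test split, which avoids test-set leakage. A minimal,
# self-contained sketch of that idea (the function name and the NumPy-array
# assumption are illustrative, not the project's actual helper):
def minmax_scale_to_train(train_X, test_X, columns):
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler().fit(train_X[:, columns])  # fit on train only
    train_X[:, columns] = scaler.transform(train_X[:, columns])
    test_X[:, columns] = scaler.transform(test_X[:, columns])
    return train_X, test_X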
if __name__ == '__main__':
    data_train = pd.read_csv("data/zip.train", header=None, sep=" ")
    cleaned_train_data = data_train.dropna(axis=1, thresh=2)

    input_data = cleaned_train_data.iloc[:, 1:].values
    targets = cleaned_train_data[0].values

    input_data2 = features.zip_codes.multires(input_data)

    # Logistic regression with the simple feature set
    print("Evaluating simple feature set")
    # 'log_loss' is the current scikit-learn name of the logistic loss
    log_reg = lm.SGDClassifier(n_jobs=1, loss="log_loss", max_iter=50)
    classifier.fit(log_reg, input_data, targets)
    pred, pred_proba = classifier.predict(log_reg, input_data)
    evaluation.print_errors(targets, pred)
    print("")

    # Logistic regression with the advanced feature set
    print("Evaluating modified feature set")
    log_reg2 = lm.SGDClassifier(n_jobs=1, loss="log_loss", max_iter=50)
    classifier.fit(log_reg2, input_data2, targets)
    pred, pred_proba = classifier.predict(log_reg2, input_data2)
    evaluation.print_errors(targets, pred)
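# features.zip_codes.multires is the project's feature builder; a plausible
# reading is that it augments the 16x16 ZIP-code digit images with a coarser
# resolution. A minimal sketch of that idea via 2x2 block averaging (the
# shapes and the concatenation are assumptions, not the actual helper):
def multires_sketch(input_data):
    import numpy as np

    imgs = input_data.reshape(-1, 16, 16)
    # Average non-overlapping 2x2 blocks -> 8x8 images
    coarse = imgs.reshape(-1, 8, 2, 8, 2).mean(axis=(2, 4))
    # Concatenate the original and the coarse pixels per sample
    return np.hstack([imgs.reshape(-1, 256), coarse.reshape(-1, 64)])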
clean_data = features.encode_binary(clean_data)
clean_data = features.encode_category(clean_data, 'ChestPain')
clean_data = features.encode_category(clean_data, 'Thal')
print(clean_data.head())

data_train, data_test = features.split(clean_data, 0.2)
X_train, y_train, X_test, y_test = features.set_target(
    data_train, data_test, 'AHD')

log_reg = lm.LogisticRegression()
classifier.fit(log_reg, X_train, y_train)
pred, pred_proba = classifier.predict(log_reg, X_test)
evaluation.print_errors(y_test, pred)
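# features.encode_category is the project's categorical encoder; a minimal
# sketch of the usual one-hot approach with pandas (the column handling is an
# assumption, not the project's actual helper):
def encode_category_sketch(df, column):
    import pandas as pd

    # One-hot encode the column and drop the original
    dummies = pd.get_dummies(df[column], prefix=column)
    return pd.concat([df.drop(columns=column), dummies], axis=1)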