def train_and_predict(X_train, Y_train, X_test, n_trees=3):
    """Standardize the features, build a forest on the training set and predict the test set.

    The scaler is fitted on ``X_train`` only and then applied to both
    matrices, so no statistics of the test set leak into preprocessing.

    :param X_train: training feature matrix
    :type X_train: numpy.ndarray
    :param Y_train: training targets, forwarded unchanged to ``build_forest``
    :type Y_train: numpy.ndarray
    :param X_test: feature matrix to predict
    :type X_test: numpy.ndarray
    :param n_trees: number of trees to build; defaults to 3, the value that
        used to be hard-coded
    :type n_trees: int
    :return: the result of ``predict`` on the scaled test matrix
    :rtype: numpy.ndarray
    """
    sc = StandardScaler()
    # Learn mean/variance from the training rows only, then reuse the same
    # transformation for the test rows.
    X_scaled = sc.fit_transform(X_train)
    X_test_s = sc.transform(X_test)

    print('number of trees to be built in forest ', n_trees)
    trees = build_forest(
        X_scaled,
        Y_train,
        n_trees,
        min_sample_split=12,
        max_depth=10,
        min_gain_threshold=0.001,
        feature_subset=False,
    )
    return predict(X_test_s, trees)
def calculate_accuracy(model, validate_data):
    """Return the share of rows in ``validate_data`` that ``model`` labels correctly.

    For every row, all elements but the last are passed to ``predict`` and
    the last element is taken as the expected category. The result is the
    number of matches divided by the number of rows, so an empty
    ``validate_data`` raises ``ZeroDivisionError``.
    """
    # All predictions are computed first, then the expected labels are read.
    guesses = [predict(model, sample[:-1]) for sample in validate_data]
    expected = [sample[-1] for sample in validate_data]

    pairs = list(zip(guesses, expected))
    hits = sum(1 for guess, truth in pairs if guess == truth)
    return hits / len(pairs)
def get_predict(trees_result, trees_feature, data_train):
    """Sum the per-tree predictions of a forest for every row of ``data_train``.

    For each tree, ``data_train`` is first reduced with ``split_data`` using
    that tree's entry in ``trees_feature``. Every row (without its last
    element) is then passed to ``rdf.predict`` and the first key of the
    returned mapping is used as that tree's prediction for the row.

    :param trees_result: sequence of trained trees
    :param trees_feature: per-tree feature selection, indexed like
        ``trees_result``
    :param data_train: samples to predict; its first dimension is the number
        of rows
    :return: ``np.sum`` over the tree axis of the per-tree predictions, i.e.
        one summed value per row (``0.0`` when there are no trees)
    """
    n_rows = np.shape(data_train)[0]
    per_tree = []
    for tree_idx, clf in enumerate(trees_result):
        feature = trees_feature[tree_idx]
        data = split_data(data_train, feature)
        votes = []
        for row_idx in range(n_rows):
            leaf = rdf.predict(data[row_idx][0:-1], clf)
            # dict views are not subscriptable on Python 3; materialize the
            # keys before taking the first one.
            # NOTE(review): presumably the mapping holds a single label per
            # leaf -- confirm against rdf.predict.
            votes.append(list(leaf.keys())[0])
        per_tree.append(votes)
    return np.sum(per_tree, axis=0)
def final_predict(trees, row):
    """Return the majority-vote prediction of ``trees`` for a single ``row``.

    Each tree is queried with ``predict(tree, row)``; a ``None`` result is
    counted as a vote for ``0``. The most frequent value wins. Because
    ``np.unique`` returns its values sorted and ``np.argmax`` picks the first
    maximum, ties are resolved in favour of the smallest value.

    :param trees: iterable of trained trees
    :param row: the sample handed to every tree
    :return: the most frequent prediction
    :raises ValueError: if ``trees`` yields no tree at all
    """
    predictions = []
    for tree in trees:
        prediction = predict(tree, row)
        # Identity check: ``== None`` would dispatch to a custom ``__eq__``.
        if prediction is None:
            prediction = 0
        predictions.append(prediction)

    if not predictions:
        # Previously this failed inside np.argmax with an opaque message.
        raise ValueError("final_predict requires at least one tree")

    # Count the occurrences of each label and return the most frequent one.
    vals, counts = np.unique(predictions, return_counts=True)
    return vals[np.argmax(counts)]
# Features: every column but the first; target: the second column of the labels frame.
x_train_data = x_train_dataset.iloc[:, 1:]
y_train_data = y_train_dataset.iloc[:, 1].values

# split train/validation data
X_train, X_val, y_train, y_val = train_test_split(
    x_train_data, y_train_data, test_size=0.33, random_state=123)

# scaler
min_max_scaler = MinMaxScaler()  # Default behavior is to scale to [0,1]
scaler = min_max_scaler
# NOTE(review): the scaler is fitted after the split above, so X_train / X_val
# used below are still the unscaled frames. Confirm whether the models are
# meant to see scaled features (x_test would then need the same transform).
x_train_data = scaler.fit_transform(x_train_data)

y_test_pred, y_valid_pred = random_forest.predict(
    X_train, y_train, X_val, y_val, x_test, bmac_scorer, bmac_score)

print('{}: * k-nearest neighbour classifier ...'.format(
    datetime.now().strftime("%H:%M:%S")))

# Create and fit a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier

# The hyper-parameters are passed to the estimator that is actually fitted
# (they used to sit in a separate, discarded constructor call). These values
# equal scikit-learn's defaults, so the fitted model is unchanged.
knn = KNeighborsClassifier(
    algorithm='auto', n_neighbors=5, weights='uniform')  # try different n_neighbors
knn.fit(X_train, y_train)

print('{}: Correct predications: {}/{}'.format(
    datetime.now().strftime("%H:%M:%S"),
    np.sum(knn.predict(X_val) == y_val), len(y_val)))