import numpy as np
from sklearn.preprocessing import StandardScaler


def train_and_predict(X_train, Y_train, X_test):
    """
    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray

    :rtype: numpy.ndarray
    """

    # Fit the scaler on train and test stacked together so both are scaled
    # with the same statistics (note this lets test rows influence the fit).
    X_combined = np.vstack((X_train, X_test))

    sc = StandardScaler()
    sc.fit(X_combined)

    X_scaled = sc.transform(X_train)
    X_test_s = sc.transform(X_test)

    n_trees = 3
    print('number of trees to be built in the forest:', n_trees)
    trees = build_forest(X_scaled,
                         Y_train,
                         n_trees,
                         min_sample_split=12,
                         max_depth=10,
                         min_gain_threshold=0.001,
                         feature_subset=False)

    rf_pred = predict(X_test_s, trees)

    return rf_pred
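
# A minimal usage sketch (assumed, not part of the original snippet):
# build_forest and predict are the forest helpers defined elsewhere in
# this project, and the toy arrays below are hypothetical.
rng = np.random.default_rng(0)
X_train_demo = rng.normal(size=(100, 4))
Y_train_demo = rng.integers(0, 2, size=100)
X_test_demo = rng.normal(size=(20, 4))

rf_pred_demo = train_and_predict(X_train_demo, Y_train_demo, X_test_demo)
print(rf_pred_demo.shape)  # one prediction per test row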
Example 2
def calculate_accuracy(model, validate_data):
    """Fraction of validation rows whose predicted label matches the
    true label, which is assumed to be the last column of each row."""
    predicted_categories = [predict(model, row[:-1]) for row in validate_data]
    correct_categories = [row[-1] for row in validate_data]
    n_correct = sum(p == c for p, c in zip(predicted_categories, correct_categories))
    return n_correct / len(validate_data)
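
# Hedged usage sketch: `model` is a tree or forest built elsewhere in the
# project (hypothetical here), and each validation row is assumed to end
# with its label.
demo_rows = [
    [5.1, 3.5, 0],
    [6.2, 2.9, 1],
]
acc = calculate_accuracy(model, demo_rows)  # model: built elsewhere
print('validation accuracy: {:.2%}'.format(acc))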
def get_predict(trees_result, trees_feature, data_train):
    """Aggregate the per-tree predictions for every training sample by
    summing the votes across all trees in the forest."""
    m_tree = len(trees_result)
    m = np.shape(data_train)[0]
    result = []
    for i in range(m_tree):
        clf = trees_result[i]
        feature = trees_feature[i]
        # restrict the data to the feature subset this tree was trained on
        data = split_data(data_train, feature)
        result_i = []
        for j in range(m):  # separate index so the tree index is not shadowed
            result_i.append(list(rdf.predict(data[j][0:-1], clf).keys())[0])
        result.append(result_i)
    # sum the votes of all trees for each sample
    final_predict = np.sum(result, axis=0)
    return final_predict
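
# The summed votes above are not yet class labels. Assuming the trees vote
# with labels in {-1, +1} (an assumption, not stated in the snippet), a
# minimal thresholding sketch:
votes = np.array([1, -3, 3, -1])  # hypothetical per-sample vote sums
labels = np.sign(votes)           # majority label per sample
labels[labels == 0] = 1           # break ties toward +1 (arbitrary choice)
print(labels)                     # [ 1 -1  1 -1]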
def final_predict(trees, row):
    # return np.mean([predict(t, row) for t in trees], axis=0)
    predictions = []
    for tree in trees:
        prediction = predict(tree, row)
        if prediction is None:
            prediction = 0
        predictions.append(prediction)
    # np function for counting occurrences of each label
    vals, counts = np.unique(predictions, return_counts=True)
    # find the index of the most frequent one
    max_index = np.argmax(counts)
    # return the most frequent value
    return vals[max_index]
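
# Self-contained toy run of the majority-vote mechanics used above
# (np.unique plus np.argmax), independent of any tree code:
demo_predictions = [1, 0, 1, 1, 0]
vals, counts = np.unique(demo_predictions, return_counts=True)
print(vals, counts)             # [0 1] [2 3]
print(vals[np.argmax(counts)])  # 1 -- the majority label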
Example 5
x_train_data = x_train_dataset.iloc[:, 1:]
y_train_data = y_train_dataset.iloc[:, 1].values

# split train/validation data
X_train, X_val, y_train, y_val = train_test_split(x_train_data,
                                                  y_train_data,
                                                  test_size=0.33,
                                                  random_state=123)

# scaler
min_max_scaler = MinMaxScaler()  # Default behavior is to scale to [0,1]
scaler = min_max_scaler
# fit on the training split only, then apply the same scaling to the validation split
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

y_test_pred, y_valid_pred = random_forest.predict(X_train, y_train, X_val,
                                                  y_val, x_test, bmac_scorer,
                                                  bmac_score)
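
# bmac_scorer and bmac_score are defined elsewhere in the project; assuming
# "bmac" stands for balanced multiclass accuracy (a guess, not confirmed
# here), equivalent objects could be built from scikit-learn primitives:
from sklearn.metrics import balanced_accuracy_score, make_scorer
bmac_score_sketch = balanced_accuracy_score                # hypothetical equivalent
bmac_scorer_sketch = make_scorer(balanced_accuracy_score)  # hypothetical equivalent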

print('{}: * k-nearest neighbour classifier ...'.format(
    datetime.now().strftime("%H:%M:%S")))
# Create and fit a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier
# algorithm='auto', n_neighbors=5 and weights='uniform' are the defaults;
# try different n_neighbors
knn = KNeighborsClassifier(algorithm='auto', n_neighbors=5, weights='uniform')
knn.fit(X_train, y_train)
print('{}: Correct predictions: {}/{}'.format(
    datetime.now().strftime("%H:%M:%S"), np.sum(knn.predict(X_val) == y_val),
    len(y_val)))
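
# The comment above suggests trying different n_neighbors; a minimal sweep
# sketch over the validation split (candidate values are arbitrary):
for k in (3, 5, 7, 9, 15):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    acc = np.mean(knn_k.predict(X_val) == y_val)
    print('n_neighbors={}: validation accuracy {:.3f}'.format(k, acc))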