# Explore the Boston housing data: report its features, compare the covariance
# matrix before and after normalization, and draw a scatterplot matrix.
boston_data = load_boston()
train_data = np.array(boston_data.data)
train_labels = np.array(boston_data.target)

# Column count of the feature matrix, and the distinct target values.
num_features = train_data.shape[1]
unique_labels = np.unique(train_labels)
num_classes = unique_labels.size

print("The boston dataset has " + str(num_features) + " features")
print(boston_data.feature_names)

# Wrap the feature matrix in a DataFrame so each column carries its name.
data = pd.DataFrame(np.c_[train_data], columns=boston_data.feature_names)
# Optional debug dump of the table:
# print(tabulate(data, headers='keys', tablefmt='psql'))

# Covariance of the raw features (np.cov expects one variable per row,
# hence the transpose).
cov_mat_boston = np.cov(train_data.T)
print("Covariance matrix")
print(cov_mat_boston)

# Same computation on the normalized features.
normalized_train_data = helpers.normalize_data(train_data)
normalized_cov_mat_boston = np.cov(normalized_train_data.T)
print("Normalized data covariance matrix")
print(normalized_cov_mat_boston)

# Scatterplot matrix of every feature pair, coloured by the CRIM column.
fig = sns.pairplot(data, hue='CRIM')
plt.show()
# NOTE(review): the line below is a whitespace-collapsed fragment - the tail of a method that uses self._majority_vote and returns np.array(y_pred), followed by a script that loads Iris, normalizes it, projects it to 2 PCA components, splits it 50/50 and runs KNN(k=5). The original line breaks and indentation are not recoverable from here - TODO restore before running.
# Do a majority vote among the k neighbors and set prediction as the class receiving the most votes label = self._majority_vote(k_nearest_neighbors, classes) y_pred.append(label) return np.array(y_pred) # Get the training data # Import the Iris flower dataset iris = datasets.load_iris() train_data = np.array(iris.data) train_labels = np.array(iris.target) num_features = train_data.data.shape[1] # Normalize the training data train_data = ml_helpers.normalize_data(train_data) # Apply PCA to the data to reduce its dimensionality pca = decomposition.PCA(n_components=2) pca.fit(train_data) train_data = pca.transform(train_data) X_train, X_test, y_train, y_test = ml_helpers.train_test_split(train_data, train_labels, test_size=0.5) # ********************************************* # Apply the KNN Classifier MANUALLY # ********************************************* clf = KNN(k=5) predicted_labels = clf.predict(X_test, X_train, y_train)
def _testing_accuracy(y_true, y_pred):
    """Return the fraction of entries in y_pred that equal y_true.

    The count of matching positions is divided by the number of true
    labels, i.e. the size of the evaluated (test) set.
    """
    correct = sum(
        1 for actual, predicted in zip(y_true, y_pred) if actual == predicted
    )
    return correct / len(y_true)


def main():
    """Compare the hand-written decision trees with scikit-learn's.

    Runs four experiments and prints one result line for each:

    * classification accuracy on the Iris data, first with the manual
      ``ClassificationTree`` and then with ``DecisionTreeClassifier``;
    * mean squared error on the Boston housing data, first with the manual
      ``RegressionTree`` and then with ``DecisionTreeRegressor``, each
      followed by a predicted-versus-real scatter plot.

    Finally waits on ``input()`` so the figures stay open.
    """
    # **************************************************************
    # Apply the Decision Tree for Classification Manually
    # **************************************************************
    # Get the training data: the Iris flower dataset
    iris = datasets.load_iris()
    train_data = np.array(iris.data)
    train_labels = np.array(iris.target)

    # Randomly shuffle the data
    train_data, train_labels = ml_helpers.shuffle_data(train_data, train_labels)

    # Project the data onto its principal components.
    # NOTE(review): Iris presumably has 4 features, so n_components=4 would
    # keep every dimension (a rotation, not a reduction) - confirm intent.
    pca = decomposition.PCA(n_components=4)
    pca.fit(train_data)
    train_data = pca.transform(train_data)

    X_train, X_test, y_train, y_test = ml_helpers.train_test_split(
        train_data, train_labels, test_size=0.4)

    clf = ClassificationTree()
    clf.fit(X_train, y_train)
    predicted_labels = clf.predict(X_test)

    # Testing accuracy is measured against the test split only; dividing by
    # the size of the full data set would cap the result near test_size.
    accuracy = _testing_accuracy(y_test, predicted_labels)
    print("Manual Decision Tree Classification Accuracy = ", accuracy)

    # **************************************************************
    # Apply the Decision Tree for Classification using Sklearn
    # **************************************************************
    clf = DecisionTreeClassifier(criterion="gini", splitter="best")
    clf.fit(X=X_train, y=y_train)
    predicted_labels = clf.predict(X_test)

    accuracy = _testing_accuracy(y_test, predicted_labels)
    print("Sklearn Decision Tree Classification Accuracy = ", accuracy)

    # **************************************************************
    # Apply the Decision Tree for Regression Manually
    # **************************************************************
    # Load the Boston housing data set for regression training.
    # NOTE: the loaded object is dictionary-like; its features and targets
    # are read through .data and .target below.
    boston_dataset = load_boston()
    train_data = np.array(boston_dataset.data)
    train_labels = np.array(boston_dataset.target)

    # Randomly shuffle the data
    train_data, train_labels = ml_helpers.shuffle_data(train_data, train_labels)

    # Normalize the data to have zero-mean and unit variance
    train_data = ml_helpers.normalize_data(train_data)

    X_train, X_test, y_train, y_test = ml_helpers.train_test_split(
        train_data, train_labels, test_size=0.4)

    clf = RegressionTree()
    clf.fit(X_train, y_train)
    predicted_values = clf.predict(X_test)

    mse = ml_helpers.mean_squared_error(y_test, predicted_values)
    print("Manual Decision Tree Regression Mean Squared Error:", mse)

    # Plot the manual decision tree regression: predictions against the real
    # values, with the green diagonal marking a perfect prediction.
    g = plt.figure(1)
    plt.plot(y_test, predicted_values, 'ro')
    plt.plot([0, 50], [0, 50], 'g-')
    plt.xlabel('real')
    plt.ylabel('predicted')
    g.show()

    # **************************************************************
    # Apply the Decision Tree for Regression using Sklearn
    # **************************************************************
    # NOTE(review): newer scikit-learn releases spell this criterion
    # "squared_error"; "mse" is kept because load_boston above already ties
    # this script to an older release - confirm the targeted version.
    clf = DecisionTreeRegressor(criterion="mse", splitter="best")
    clf.fit(X_train, y_train)
    predicted_values = clf.predict(X_test)

    mse = ml_helpers.mean_squared_error(y_test, predicted_values)
    print("Sklearn Decision Tree Regression Mean Squared Error:", mse)

    # Plot the sklearn decision tree regression in a second figure.
    g = plt.figure(2)
    plt.plot(y_test, predicted_values, 'ro')
    plt.plot([0, 50], [0, 50], 'g-')
    plt.xlabel('real')
    plt.ylabel('predicted')
    g.show()

    # Figure.show() does not block, so keep the plots alive until we get a
    # user input.
    print("Press any key to exit")
    input()