def dtc_predict_actual(data):
    """Fit a decision tree on part of ``data`` and predict the held-out part.

    The catalogue is divided with ``splitdata_train_test`` using a training
    fraction of 0.7, and features/targets for each part are produced by
    ``generate_features_targets`` (both helpers are defined elsewhere).

    Args:
        data: The galaxy catalogue, in whatever form the two helpers accept.

    Returns:
        A ``(predictions, targets)`` pair: the classifier's predictions for
        the testing part and the corresponding actual targets.
    """
    train_fraction = 0.7
    train_rows, test_rows = splitdata_train_test(data, train_fraction)

    # Features and targets for each side of the split.
    x_train, y_train = generate_features_targets(train_rows)
    x_test, y_test = generate_features_targets(test_rows)

    # Train a default-configured decision tree on the training side only.
    classifier = DecisionTreeClassifier()
    classifier.fit(x_train, y_train)

    # Predict the held-out side and hand back the truth alongside it.
    y_predicted = classifier.predict(x_test)
    return y_predicted, y_test
def rf_predict_actual(data, n_estimators):
    """Get cross-validated random forest predictions for ``data``.

    Args:
        data: The galaxy catalogue, in whatever form
            ``generate_features_targets`` (defined elsewhere) accepts.
        n_estimators: Number of trees, forwarded to
            ``RandomForestClassifier``.

    Returns:
        A ``(predicted, targets)`` pair: the predictions produced by
        ``cross_val_predict`` with ``cv=10`` and the actual targets.
    """
    # generate the features and targets
    features, targets = generate_features_targets(data)

    # instantiate a random forest classifier
    rfc = RandomForestClassifier(n_estimators=n_estimators)

    # get predictions using 10-fold cross validation with cross_val_predict
    predicted = cross_val_predict(rfc, features, targets, cv=10)

    # return the predictions and their actual classes
    return predicted, targets


if __name__ == "__main__":
    data = np.load('galaxy_catalogue.npy')

    features, targets = generate_features_targets(data)

    # Print the shape of each array to check the arrays are the correct
    # dimensions.
    print("Features shape:", features.shape)
    print("Targets shape:", targets.shape)

    # fraction of data which should be in the training set
    fraction_training = 0.7

    # split the data using your function
    training, testing = splitdata_train_test(data, fraction_training)

    # print the key values
    print('Number data galaxies:', len(data))
    print('Train fraction:', fraction_training)
    print('Number of galaxies in training set:', len(training))
    print('Number of galaxies in testing set:', len(testing))

    predicted_class, actual_class = dtc_predict_actual(data)

    # Print some of the initial results. Only the first 10 pairs are shown;
    # slicing (rather than indexing 0..9) keeps this safe when the testing
    # set holds fewer than 10 galaxies.
    print("Some initial results...\n predicted, actual")
    preview = zip(predicted_class[:10], actual_class[:10])
    for i, (predicted_value, actual_value) in enumerate(preview):
        print("{}. {}, {}".format(i, predicted_value, actual_value))

    # get the predicted and actual classes
    number_estimators = 50  # Number of trees
    predicted, actual = rf_predict_actual(data, number_estimators)

    # calculate the model score using your function
    accuracy = calculate_accuracy(predicted, actual)
    print("Accuracy score:", accuracy)
def rf_predict_actual(data, n_estimators):
    """Return cross-validated random forest predictions and actual classes.

    Args:
        data: The galaxy catalogue; it is passed to
            ``generate_features_targets`` (defined elsewhere) and must
            support ``data['class']``.
        n_estimators: Number of trees, forwarded to
            ``RandomForestClassifier``.

    Returns:
        A ``(predictions, classes)`` pair: the output of
        ``cross_val_predict`` with ``cv=10`` and ``data['class']``.
    """
    features, targets = generate_features_targets(data)
    forest = RandomForestClassifier(n_estimators=n_estimators)

    # 10-fold cross-validated predictions over the whole catalogue.
    cv_predictions = cross_val_predict(forest, features, targets, cv=10)

    # The actual classes are read straight from the catalogue's 'class' field.
    actual_classes = data['class']
    return cv_predictions, actual_classes
def dtc_predict_actual(data):
    """Train a decision tree on a 0.7 split of ``data`` and test on the rest.

    Args:
        data: The galaxy catalogue, in whatever form ``splitdata_train_test``
            and ``generate_features_targets`` (defined elsewhere) accept.

    Returns:
        A ``(predictions, test_targets)`` tuple: the tree's predictions for
        the testing features and the matching actual targets.
    """
    # Split with a training fraction of 0.7.
    learn_part, check_part = splitdata_train_test(data, 0.7)

    # Build features/targets for both parts before any fitting happens.
    learn_features, learn_targets = generate_features_targets(learn_part)
    check_features, check_targets = generate_features_targets(check_part)

    # Fit a default decision tree on the training part.
    tree = DecisionTreeClassifier()
    tree.fit(learn_features, learn_targets)

    # Predictions for the testing part, returned next to the true targets.
    return tree.predict(check_features), check_targets
def rf_predict_actual(data, n_estimators):
    """Produce cross-validated random forest predictions for ``data``.

    Args:
        data: The galaxy catalogue, in whatever form
            ``generate_features_targets`` (defined elsewhere) accepts.
        n_estimators: Number of trees, forwarded to
            ``RandomForestClassifier``.

    Returns:
        A ``(predicted, targets)`` pair: the predictions from
        ``cross_val_predict`` with ``cv=10`` and the actual targets.
    """
    inputs, labels = generate_features_targets(data)

    # Build the forest and run 10-fold cross-validated prediction in one go;
    # the true labels are returned unchanged next to the predictions.
    return (
        cross_val_predict(
            RandomForestClassifier(n_estimators=n_estimators),
            inputs,
            labels,
            cv=10,
        ),
        labels,
    )
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier

from support_functions import plot_confusion_matrix, generate_features_targets


# Implement the following function
def calculate_accuracy(predicted, actual):
    """Return the fraction of predictions that equal the actual values.

    Both inputs are converted with ``np.asarray`` so that plain sequences
    (lists, tuples) are compared element-wise just like NumPy arrays.

    Args:
        predicted: Predicted classes (array-like).
        actual: Actual classes (array-like) with the same shape as
            ``predicted``.

    Returns:
        The number of matching elements divided by the number of
        predictions, as a float. ``0.0`` is returned for empty input
        instead of dividing by zero.

    Raises:
        ValueError: If ``predicted`` and ``actual`` differ in shape.
    """
    predicted = np.asarray(predicted)
    actual = np.asarray(actual)

    if predicted.shape != actual.shape:
        raise ValueError(
            "predicted and actual must have the same shape, got "
            "{} and {}".format(predicted.shape, actual.shape)
        )

    # No predictions means no ratio to compute; avoid ZeroDivisionError.
    if predicted.size == 0:
        return 0.0

    n_correct = np.count_nonzero(predicted == actual)
    return float(n_correct) / predicted.size


if __name__ == "__main__":
    data = np.load('galaxy_catalogue.npy')

    # split the data
    features, targets = generate_features_targets(data)

    # train the model to get predicted and actual classes
    dtc = DecisionTreeClassifier()
    predicted = cross_val_predict(dtc, features, targets, cv=10)

    # calculate the model score using your function
    model_score = calculate_accuracy(predicted, targets)
    print("Our accuracy score:", model_score)

    # calculate the models confusion matrix using sklearns confusion_matrix
    # function; the labels are sorted so the row/column order is the same on
    # every run (plain set iteration order is not stable for strings).
    class_labels = sorted(set(targets))
    model_cm = confusion_matrix(y_true=targets, y_pred=predicted,
                                labels=class_labels)

    # Plot the confusion matrix using the provided functions.