def main(date): """ Trains a random forest and extracts the feature importances. :param date: Date the training and testing data was collected (YYYY_MMDD) :return: (None) """ # Load the training data into memory trainX, trainY = FileIO.loadTrainingData(date) trainX = np.nan_to_num(trainX) # Train the random forest on the training data numCores = multiprocessing.cpu_count() forest = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=numCores) forest.fit(trainX, trainY) importances = forest.feature_importances_ std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) indices = np.argsort(importances)[::-1] # Plot the feature importances of the forest plt.figure() plt.title("Feature importances") plt.bar(range(trainX.shape[1]), importances[indices], color="r", align="center") plt.xticks(range(trainX.shape[1]), indices) plt.xlim([-1, trainX.shape[1]]) plt.show()
def main(date): """ Runs linear regression (classification) between the herbicide resistance classes based on all wavelengths. The weights associated with each wavelength are then plotted, allowing the user to see the contribution to classification by each wavelength. :param date: (string) Data collection date YYYY_MMDD :return: (None) """ # Load the training data from disk X, y = FileIO.loadTrainingData(date) X = np.nan_to_num(X) # Train the classifier on the loaded data clf = SGDClassifier() clf.fit(X, y) # Plot the feature weights to visualize feature contributions featureWeights = np.fabs(clf.coef_) for i in xrange(3): plt.plot(WAVELENGTHS, featureWeights[i]) plt.title("Linear Classifier Weights for " + RESISTANCE_STRINGS[INDEX_TO_LABEL[i]] + " vs Others") plt.xlabel("Wavelength (nm)") plt.ylabel("Absolute Weight") plt.show()
def main(date): """ Trains a random forest and extracts the feature importances. :param date: Date the training and testing data was collected (YYYY_MMDD) :return: (None) """ # Load the training data into memory trainX, trainY = FileIO.loadTrainingData(date) trainX = np.nan_to_num(trainX) # Train the random forest on the training data numCores = multiprocessing.cpu_count() forest = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=numCores) forest.fit(trainX, trainY) importances = forest.feature_importances_ std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) indices = np.argsort(importances)[::-1] # Plot the feature importances of the forest plt.figure() plt.title("Feature importances") plt.bar(range(trainX.shape[1]), importances[indices], color="r", align="center") plt.xticks(range(trainX.shape[1]), indices) plt.xlim([-1, trainX.shape[1]]) plt.show()
def main(date): """ Runs linear regression (classification) between the herbicide resistance classes based on all wavelengths. The weights associated with each wavelength are then plotted, allowing the user to see the contribution to classification by each wavelength. :param date: (string) Data collection date YYYY_MMDD :return: (None) """ # Load the training data from disk X, y = FileIO.loadTrainingData(date) X = np.nan_to_num(X) # Train the classifier on the loaded data clf = SGDClassifier() clf.fit(X, y) # Plot the feature weights to visualize feature contributions featureWeights = np.fabs(clf.coef_) for i in xrange(3): plt.plot(WAVELENGTHS, featureWeights[i]) plt.title("Linear Classifier Weights for " + RESISTANCE_STRINGS[INDEX_TO_LABEL[i]] + " vs Others") plt.xlabel("Wavelength (nm)") plt.ylabel("Absolute Weight") plt.show()
def main(date, modelType): """ Runs the training script. Trains the specified model type, saves the model to a prefined location (specified in the Constants file), and runs basic accuracy tests on the trained model. :param date: Date the training and testing data was collected (YYYY_MMDD) :param modelType: (string) type of machine learning model to train :return: (None) """ # Make sure that the model is a valid choice if (not (modelType in MODELS.keys())) and (modelType != ALL): print "Invalid model type:", modelType return # Allow for training more than one model at a time if modelType == ALL: modelsToTrain = MODELS.keys() else: modelsToTrain = [modelType] # Load the training and testing data into memory trainX, trainY = FileIO.loadTrainingData(date) testX, testY = FileIO.loadTestingData(date) trainX = np.nan_to_num(trainX) testX = np.nan_to_num(testX) for modelType in modelsToTrain: # Train the desired ML model name, clfType = MODELS[modelType] hyperparameters = HYPERPARAMETERS[modelType] print "Training the", name clf = clfType(**hyperparameters) clf.fit(trainX, trainY) # Perform some very basic accuracy testing trainResult = clf.predict(trainX) testResult = clf.predict(testX) trainingAccuracy = accuracy_score(trainY, trainResult) testingAccuracy = accuracy_score(testY, testResult) confusionMatrix = confusion_matrix(testY, testResult) print "Training Accuracy:", trainingAccuracy print "Testing Accuracy:", testingAccuracy print "Confusion Matrix:" print confusionMatrix print " " # Save the model to disk FileIO.saveModel(clf, modelType, date)
def main(date, modelType): """ Runs the training script. Trains the specified model type, saves the model to a prefined location (specified in the Constants file), and runs basic accuracy tests on the trained model. :param date: Date the training and testing data was collected (YYYY_MMDD) :param modelType: (string) type of machine learning model to train :return: (None) """ # Make sure that the model is a valid choice if (not (modelType in MODELS.keys())) and (modelType != ALL): print "Invalid model type:", modelType return # Allow for training more than one model at a time if modelType == ALL: modelsToTrain = MODELS.keys() else: modelsToTrain = [modelType] # Load the training and testing data into memory trainX, trainY = FileIO.loadTrainingData(date) testX, testY = FileIO.loadTestingData(date) trainX = np.nan_to_num(trainX) testX = np.nan_to_num(testX) for modelType in modelsToTrain: # Train the desired ML model name, clfType = MODELS[modelType] hyperparameters = HYPERPARAMETERS[modelType] print "Training the", name clf = clfType(**hyperparameters) clf.fit(trainX, trainY) # Perform some very basic accuracy testing trainResult = clf.predict(trainX) testResult = clf.predict(testX) trainingAccuracy = accuracy_score(trainY, trainResult) testingAccuracy = accuracy_score(testY, testResult) confusionMatrix = confusion_matrix(testY, testResult) print "Training Accuracy:", trainingAccuracy print "Testing Accuracy:", testingAccuracy print "Confusion Matrix:" print confusionMatrix print " " # Save the model to disk FileIO.saveModel(clf, modelType, date)
def main(date, takeSubset=False): """ Reduces the dimensionality of the training data to 3 dimensions, plots the transformed data in 3d space. The idea is to bring out separability between the resistance classes which may be hidden in the dimensionality of the data. :param date: (string) Data collection date YYYY_MMDD :param takeSubset: (boolean) Transform and plot a random subset of the trainng data? :return: (None) """ mkl.set_num_threads(8) # Load the training and testing data into memory trainX, trainY = FileIO.loadTrainingData(date) if takeSubset: indices = np.random.choice(range(0, len(trainY)), size=NUM_SAMPLES, replace=False) X = trainX[indices,:] y = trainY[indices] else: X = trainX y = trainY X = np.nan_to_num(X) # Break the data into resistance classes susIndex = Constants.LABEL_TO_INDEX[Constants.SUSCEPTIBLE] drIndex = Constants.LABEL_TO_INDEX[Constants.DR_RESISTANT] grIndex = Constants.LABEL_TO_INDEX[Constants.GR_RESISTANT] susX = X[y==susIndex, :] drX = X[y==drIndex, :] grX = X[y==grIndex, :] # Transform the data using PCA pca = IncrementalPCA(n_components=6) pointsSUS = pca.fit_transform(susX) pointsGR= pca.fit_transform(grX) pointsDR = pca.fit_transform(drX) # Plot the transformed data in 3D space traceSUS = go.Scatter3d( x=pointsSUS[:, 0], y=pointsSUS[:, 1], z=pointsSUS[:, 2], mode='markers', marker=dict( size=5, line=dict( color='rgba(255, 0, 0, 0)', width=0.1 ), opacity=0 ) ) traceDR = go.Scatter3d( x=pointsDR[:, 0], y=pointsDR[:, 1], z=pointsDR[:, 2], mode='markers', marker=dict( size=5, line=dict( color='rgba(0, 255, 0, 0)', width=0.1 ), opacity=0 ) ) traceGR = go.Scatter3d( x=pointsGR[:, 0], y=pointsGR[:, 1], z=pointsGR[:, 2], mode='markers', marker=dict( size=5, line=dict( color='rgba(0, 0, 255, 0)', width=0.1 ), opacity=0 ) ) data = [traceSUS, traceDR, traceGR] fig = go.Figure(data=data) py.iplot(fig, filename='3D PCA Wavelength Plot') # Plot the principle components eigenSpectra = pca.components_ plt.subplot(3,1,1) plt.plot(Constants.WAVELENGTHS, eigenSpectra[0, :]) plt.title("Principle Components 1 - 3") plt.subplot(3,1,2) plt.plot(Constants.WAVELENGTHS, eigenSpectra[1, :]) plt.subplot(3,1,3) plt.plot(Constants.WAVELENGTHS, eigenSpectra[2, :]) plt.xlabel("Wavelength (nm)") plt.show() plt.clf() plt.subplot(3,1,1) plt.plot(Constants.WAVELENGTHS, eigenSpectra[3, :]) plt.title("Principle Components 4 - 6") plt.subplot(3,1,2) plt.plot(Constants.WAVELENGTHS, eigenSpectra[4, :]) plt.subplot(3,1,3) plt.plot(Constants.WAVELENGTHS, eigenSpectra[5, :]) plt.xlabel("Wavelength (nm)") plt.show()
def main(date, modelType, iterations): """ Determines the optimal hyperparameters for a given machine learning model for a set of training data. :param date: Date the training and testing data was collected (YYYY_MMDD) :param modelType: (string) type of machine learning model to train :param iterations: (int) number of iterations for hyperparameter searching :return: (None) """ # Make sure that the model is a valid choice if (not (modelType in MODELS.keys())) and (modelType != ALL): print "Invalid model type:", modelType return # Allow for training more than one model at a time if modelType == ALL: modelsToTrain = MODELS.keys() else: modelsToTrain = [modelType] # Load the training and testing data into memory trainX, trainY = FileIO.loadTrainingData(date) testX, testY = FileIO.loadTestingData(date) trainX = np.nan_to_num(trainX) testX = np.nan_to_num(testX) for modelType in modelsToTrain: # Train the desired ML model name, clfType = MODELS[modelType] print "Training the", name baseClassifier = clfType() clf = RandomizedSearchCV(baseClassifier, param_distributions=PARAMETERS[modelType], n_iter=iterations, n_jobs=4) clf.fit(trainX, trainY) # Perform some very basic accuracy testing trainResult = clf.predict(trainX) testResult = clf.predict(testX) trainingAccuracy = accuracy_score(trainY, trainResult) testingAccuracy = accuracy_score(testY, testResult) confusionMatrix = confusion_matrix(testY, testResult) print "Training Accuracy:", trainingAccuracy print "Testing Accuracy:", testingAccuracy print "Confusion Matrix:" print confusionMatrix print " " print "Hyperparameters:" for param in PARAMETERS[modelType].keys(): print param + ':', clf.best_estimator_.get_params()[param] print " " # Save the model to disk FileIO.saveModel(clf.best_estimator_, modelType, date)
def main(date, takeSubset=False): """ Reduces the dimensionality of the training data to 3 dimensions, plots the transformed data in 3d space. The idea is to bring out separability between the resistance classes which may be hidden in the dimensionality of the data. :param date: (string) Data collection date YYYY_MMDD :param takeSubset: (boolean) Transform and plot a random subset of the trainng data? :return: (None) """ mkl.set_num_threads(8) # Load the training and testing data into memory trainX, trainY = FileIO.loadTrainingData(date) if takeSubset: indices = np.random.choice(range(0, len(trainY)), size=NUM_SAMPLES, replace=False) X = trainX[indices, :] y = trainY[indices] else: X = trainX y = trainY X = np.nan_to_num(X) # Break the data into resistance classes susIndex = Constants.LABEL_TO_INDEX[Constants.SUSCEPTIBLE] drIndex = Constants.LABEL_TO_INDEX[Constants.DR_RESISTANT] grIndex = Constants.LABEL_TO_INDEX[Constants.GR_RESISTANT] susX = X[y == susIndex, :] drX = X[y == drIndex, :] grX = X[y == grIndex, :] # Transform the data using PCA pca = IncrementalPCA(n_components=6) pointsSUS = pca.fit_transform(susX) pointsGR = pca.fit_transform(grX) pointsDR = pca.fit_transform(drX) # Plot the transformed data in 3D space traceSUS = go.Scatter3d(x=pointsSUS[:, 0], y=pointsSUS[:, 1], z=pointsSUS[:, 2], mode='markers', marker=dict(size=5, line=dict(color='rgba(255, 0, 0, 0)', width=0.1), opacity=0)) traceDR = go.Scatter3d(x=pointsDR[:, 0], y=pointsDR[:, 1], z=pointsDR[:, 2], mode='markers', marker=dict(size=5, line=dict(color='rgba(0, 255, 0, 0)', width=0.1), opacity=0)) traceGR = go.Scatter3d(x=pointsGR[:, 0], y=pointsGR[:, 1], z=pointsGR[:, 2], mode='markers', marker=dict(size=5, line=dict(color='rgba(0, 0, 255, 0)', width=0.1), opacity=0)) data = [traceSUS, traceDR, traceGR] fig = go.Figure(data=data) py.iplot(fig, filename='3D PCA Wavelength Plot') # Plot the principle components eigenSpectra = pca.components_ plt.subplot(3, 1, 1) plt.plot(Constants.WAVELENGTHS, eigenSpectra[0, :]) plt.title("Principle Components 1 - 3") plt.subplot(3, 1, 2) plt.plot(Constants.WAVELENGTHS, eigenSpectra[1, :]) plt.subplot(3, 1, 3) plt.plot(Constants.WAVELENGTHS, eigenSpectra[2, :]) plt.xlabel("Wavelength (nm)") plt.show() plt.clf() plt.subplot(3, 1, 1) plt.plot(Constants.WAVELENGTHS, eigenSpectra[3, :]) plt.title("Principle Components 4 - 6") plt.subplot(3, 1, 2) plt.plot(Constants.WAVELENGTHS, eigenSpectra[4, :]) plt.subplot(3, 1, 3) plt.plot(Constants.WAVELENGTHS, eigenSpectra[5, :]) plt.xlabel("Wavelength (nm)") plt.show()
def main(date, modelType, iterations): """ Determines the optimal hyperparameters for a given machine learning model for a set of training data. :param date: Date the training and testing data was collected (YYYY_MMDD) :param modelType: (string) type of machine learning model to train :param iterations: (int) number of iterations for hyperparameter searching :return: (None) """ # Make sure that the model is a valid choice if (not (modelType in MODELS.keys())) and (modelType != ALL): print "Invalid model type:", modelType return # Allow for training more than one model at a time if modelType == ALL: modelsToTrain = MODELS.keys() else: modelsToTrain = [modelType] # Load the training and testing data into memory trainX, trainY = FileIO.loadTrainingData(date) testX, testY = FileIO.loadTestingData(date) trainX = np.nan_to_num(trainX) testX = np.nan_to_num(testX) for modelType in modelsToTrain: # Train the desired ML model name, clfType = MODELS[modelType] print "Training the", name baseClassifier = clfType() clf = RandomizedSearchCV(baseClassifier, param_distributions=PARAMETERS[modelType], n_iter=iterations, n_jobs=4) clf.fit(trainX, trainY) # Perform some very basic accuracy testing trainResult = clf.predict(trainX) testResult = clf.predict(testX) trainingAccuracy = accuracy_score(trainY, trainResult) testingAccuracy = accuracy_score(testY, testResult) confusionMatrix = confusion_matrix(testY, testResult) print "Training Accuracy:", trainingAccuracy print "Testing Accuracy:", testingAccuracy print "Confusion Matrix:" print confusionMatrix print " " print "Hyperparameters:" for param in PARAMETERS[modelType].keys(): print param + ':', clf.best_estimator_.get_params()[param] print " " # Save the model to disk FileIO.saveModel(clf.best_estimator_, modelType, date)