Example #1
def main(date, modelType):
    """
    Runs the training script. Trains the specified model type, saves the
    model to a predefined location (specified in the Constants file), and
    runs basic accuracy tests on the trained model.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train

    :return: (None)
    """

    # Make sure that the model is a valid choice
    if modelType not in MODELS and modelType != ALL:
        print "Invalid model type:", modelType
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = MODELS.keys()
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    for modelType in modelsToTrain:

        # Train the desired ML model
        name, clfType = MODELS[modelType]
        hyperparameters = HYPERPARAMETERS[modelType]
        print "Training the", name

        clf = clfType(**hyperparameters)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print "Training Accuracy:", trainingAccuracy
        print "Testing Accuracy:", testingAccuracy
        print "Confusion Matrix:"
        print confusionMatrix
        print " "

        # Save the model to disk
        FileIO.saveModel(clf, modelType, date)
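
For context, a minimal sketch of the module-level setup this script assumes. The names (ALL, MODELS, HYPERPARAMETERS) come from the code above; the concrete entries are illustrative stand-ins, not the original project's values, which live in its Constants file.

# Hypothetical module-level setup assumed by main() above; the real
# MODELS and HYPERPARAMETERS live in the project's Constants file.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

ALL = "all"

# Maps a model-type keyword to a display name and a classifier class
MODELS = {
    "forest": ("Random Forest", RandomForestClassifier),
    "svm": ("Support Vector Machine", SVC),
}

# Keyword arguments passed to each classifier's constructor
HYPERPARAMETERS = {
    "forest": {"n_estimators": 500, "n_jobs": -1},
    "svm": {"kernel": "rbf", "C": 1.0},
}

# Example invocation: train every registered model on one collection date
# main("2016_0701", ALL)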
def main(date):
    """
    Trains a random forest and extracts the feature importances.

    :param date: Date the training and testing data was collected (YYYY_MMDD)

    :return: (None)
    """

    # Load the training data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    trainX = np.nan_to_num(trainX)

    # Train the random forest on the training data
    numCores = multiprocessing.cpu_count()
    forest = RandomForestClassifier(n_estimators=500,
                                    random_state=0,
                                    n_jobs=numCores)
    forest.fit(trainX, trainY)

    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(trainX.shape[1]),
            importances[indices],
            color="r",
            align="center")
    plt.xticks(range(trainX.shape[1]), indices)
    plt.xlim([-1, trainX.shape[1]])
    plt.show()
def main(date):
    """
    Trains a random forest and extracts the feature importances.

    :param date: Date the training and testing data was collected (YYYY_MMDD)

    :return: (None)
    """

    # Load the training data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    trainX = np.nan_to_num(trainX)

    # Train the random forest on the training data
    numCores = multiprocessing.cpu_count()
    forest = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=numCores)
    forest.fit(trainX, trainY)

    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(trainX.shape[1]), importances[indices],
           color="r", align="center")
    plt.xticks(range(trainX.shape[1]), indices)
    plt.xlim([-1, trainX.shape[1]])
    plt.show()
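
Both versions compute std, the per-tree spread of the importances, but never use it. A hypothetical helper sketching the common refinement of drawing it as error bars on the same chart:

import matplotlib.pyplot as plt

def plotImportancesWithErrorBars(importances, std, indices, numFeatures):
    """
    Hypothetical helper: the same bar chart as above, but drawing the
    otherwise-unused `std` array as error bars so the per-tree spread
    of each importance is visible.
    """
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(numFeatures), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(numFeatures), indices)
    plt.xlim([-1, numFeatures])
    plt.show()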
def main(date):
    """
    Trains a linear classifier to separate the herbicide resistance
    classes based on all wavelengths. The weights associated with
    each wavelength are then plotted, allowing the user to see the
    contribution to classification by each wavelength.

    :param date: (string) Data collection date YYYY_MMDD

    :return: (None)
    """
    
    # Load the training data from disk   
    X, y = FileIO.loadTrainingData(date)
    X = np.nan_to_num(X)

    # Train the classifier on the loaded data
    clf = SGDClassifier()
    clf.fit(X, y)

    # Plot the feature weights to visualize feature contributions
    featureWeights = np.fabs(clf.coef_)

    for i in xrange(3):
        plt.plot(WAVELENGTHS, featureWeights[i])
        plt.title("Linear Classifier Weights for " + RESISTANCE_STRINGS[INDEX_TO_LABEL[i]] + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
def main(date):
    """
    Trains a linear classifier to separate the herbicide resistance
    classes based on all wavelengths. The weights associated with
    each wavelength are then plotted, allowing the user to see the
    contribution to classification by each wavelength.

    :param date: (string) Data collection date YYYY_MMDD

    :return: (None)
    """

    # Load the training data from disk
    X, y = FileIO.loadTrainingData(date)
    X = np.nan_to_num(X)

    # Train the classifier on the loaded data
    clf = SGDClassifier()
    clf.fit(X, y)

    # Plot the feature weights to visualize feature contributions
    featureWeights = np.fabs(clf.coef_)

    for i in xrange(3):
        plt.plot(WAVELENGTHS, featureWeights[i])
        plt.title("Linear Classifier Weights for " +
                  RESISTANCE_STRINGS[INDEX_TO_LABEL[i]] + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
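
Both versions lean on module-level constants defined elsewhere in the project. A minimal sketch of plausible stand-ins, with illustrative values only:

import numpy as np

# Hypothetical stand-ins for the project's constants, for illustration.
# One reflectance value per wavelength sampled by the spectrometer:
WAVELENGTHS = np.linspace(400, 1000, 601)    # 400-1000 nm in 1 nm steps

# Class index -> label, and label -> human-readable name:
INDEX_TO_LABEL = {0: "sus", 1: "dr", 2: "gr"}
RESISTANCE_STRINGS = {
    "sus": "Susceptible",
    "dr": "DR Resistant",
    "gr": "GR Resistant",
}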
Example #7
def main(date, delete, keywords=[], byLeaf=True, saveProportion=0.5):
    """
    Generates ML training and testing data from extracted CSV files

    :param date: (string) Data collection date YYYY_MMDD
    :param delete: (boolean) Determines whether or not to delete the existing
                             training/testing data files
    :param keywords: (list of strings) Data filename keywords
    :param byLeaf: (boolean) Should we separate the train/test data
                             by leaf, or should we randomly separate
                             the data according to a set proportion?
    :param saveProportion: (float) Proportion of each CSV file to save as
                                   training and testing data.

    :return: (None)
    """

    # Get the data files we will be looking at
    dataPath = DATA_DIRECTORIES[date]
    dataFilenames = FileIO.getDatafileNames(dataPath, keywords)

    # If desired, remove the old training data and start fresh
    if delete:

        mlDataPath = DATA_DIRECTORIES[date+"_ML"]
        trainingDataPath = os.path.join(mlDataPath, TRAINING_DATA_PATH)
        testingDataPath = os.path.join(mlDataPath, TESTING_DATA_PATH)
        sampleCountsPath = os.path.join(mlDataPath, SAMPLE_COUNTS_PATH)

        if os.path.exists(trainingDataPath):
            os.remove(trainingDataPath)

        if os.path.exists(testingDataPath):
            os.remove(testingDataPath)

        if os.path.exists(sampleCountsPath):
            os.remove(sampleCountsPath)

    # Consolidate the CSV files into training and testing data
    (train_X, train_y, test_X, test_y) = DataManipulation.separateTrainTest(dataPath, 
                                                                            dataFilenames, 
                                                                            byLeaf=byLeaf, 
                                                                            saveProportion=saveProportion)

    # Save the training and testing data in the proper spot
    FileIO.saveTrainingData(date, train_X, train_y)
    FileIO.saveTestingData(date, test_X, test_y)
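
A sketch of the path constants and a typical invocation this generator assumes; the directory layout and filenames here are illustrative, and the real values come from the project's Constants file.

import os

# Hypothetical path setup assumed by main() above
DATA_DIRECTORIES = {
    "2016_0701": "/data/spectra/2016_0701",
    "2016_0701_ML": "/data/spectra/2016_0701/ml",
}
TRAINING_DATA_PATH = "train.csv"
TESTING_DATA_PATH = "test.csv"
SAMPLE_COUNTS_PATH = "sample_counts.csv"

# Example: rebuild the ML data from scratch for one collection date,
# splitting train/test by leaf and keeping half of each CSV file.
# main("2016_0701", delete=True, byLeaf=True, saveProportion=0.5)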
def main(date, takeSubset=False):
    """
    Reduces the dimensionality of the training data and plots the first
    three principal components in 3D space. The idea is to bring out
    separability between the resistance classes which may be hidden in
    the dimensionality of the data.

    :param date: (string) Data collection date YYYY_MMDD
    :param takeSubset: (boolean) Transform and plot a random subset of
                                 the training data?

    :return: (None)
    """

    mkl.set_num_threads(8)

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)

    if takeSubset:
        indices = np.random.choice(range(0, len(trainY)), size=NUM_SAMPLES, replace=False)
        X = trainX[indices,:]
        y = trainY[indices]
    else:
        X = trainX
        y = trainY

    X = np.nan_to_num(X)

    # Break the data into resistance classes
    susIndex = Constants.LABEL_TO_INDEX[Constants.SUSCEPTIBLE]
    drIndex = Constants.LABEL_TO_INDEX[Constants.DR_RESISTANT]
    grIndex = Constants.LABEL_TO_INDEX[Constants.GR_RESISTANT]

    susX = X[y==susIndex, :]
    drX = X[y==drIndex, :]
    grX = X[y==grIndex, :]

    # Transform the data using PCA. Fit once on the full data set so that
    # all three classes are projected into the same component space;
    # fitting per class would give each class its own, incomparable axes.
    pca = IncrementalPCA(n_components=6)
    pca.fit(X)

    pointsSUS = pca.transform(susX)
    pointsGR = pca.transform(grX)
    pointsDR = pca.transform(drX)

    # Plot the transformed data in 3D space
    traceSUS = go.Scatter3d(
        x=pointsSUS[:, 0],
        y=pointsSUS[:, 1],
        z=pointsSUS[:, 2],
        mode='markers',
        marker=dict(
            size=5,
            line=dict(
                color='rgba(255, 0, 0, 0)',
                width=0.1
            ),
            opacity=0
        )
    )

    traceDR = go.Scatter3d(
        x=pointsDR[:, 0],
        y=pointsDR[:, 1],
        z=pointsDR[:, 2],
        mode='markers',
        marker=dict(
            size=5,
            line=dict(
                color='rgba(0, 255, 0, 0)',
                width=0.1
            ),
            opacity=0
        )
    )

    traceGR = go.Scatter3d(
        x=pointsGR[:, 0],
        y=pointsGR[:, 1],
        z=pointsGR[:, 2],
        mode='markers',
        marker=dict(
            size=5,
            line=dict(
                color='rgba(0, 0, 255, 0)',
                width=0.1
            ),
            opacity=0
        )
    )

    data = [traceSUS, traceDR, traceGR]
    fig = go.Figure(data=data)
    py.iplot(fig, filename='3D PCA Wavelength Plot')

    # Plot the principal components
    eigenSpectra = pca.components_

    plt.subplot(3,1,1)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[0, :])
    plt.title("Principle Components 1 - 3")
    plt.subplot(3,1,2)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[1, :])
    plt.subplot(3,1,3)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[2, :])
    plt.xlabel("Wavelength (nm)")
    plt.show()

    plt.clf()
    plt.subplot(3,1,1)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[3, :])
    plt.title("Principle Components 4 - 6")
    plt.subplot(3,1,2)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[4, :])
    plt.subplot(3,1,3)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[5, :])
    plt.xlabel("Wavelength (nm)")
    plt.show()
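
Before trusting a 3D view of six fitted components, it is worth checking how much variance the plotted three actually capture. A few lines that could be appended to the function above, using its fitted pca object:

    # Sketch: sanity-check how much variance the plotted components carry.
    # If the first three ratios do not dominate, the 3D scatter above is
    # an unfaithful summary of the data.
    ratios = pca.explained_variance_ratio_
    print "Variance explained per component:", ratios
    print "Variance captured by the plotted three:", np.sum(ratios[:3])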
def main(date, modelType, iterations):
    """
    Determines the optimal hyperparameters for a given machine learning
    model for a set of training data.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train
    :param iterations: (int) number of iterations for hyperparameter searching

    :return: (None)
    """
    
    # Make sure that the model is a valid choice
    if modelType not in MODELS and modelType != ALL:
        print "Invalid model type:", modelType
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = MODELS.keys()
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    for modelType in modelsToTrain:

        # Train the desired ML model
        name, clfType = MODELS[modelType]
        print "Training the", name

        baseClassifier = clfType()
        clf = RandomizedSearchCV(baseClassifier,
                                 param_distributions=PARAMETERS[modelType],
                                 n_iter=iterations,
                                 n_jobs=4)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print "Training Accuracy:", trainingAccuracy
        print "Testing Accuracy:", testingAccuracy
        print "Confusion Matrix:"
        print confusionMatrix
        print " "
        print "Hyperparameters:"
        for param in PARAMETERS[modelType].keys():
            print param + ':', clf.best_estimator_.get_params()[param]
        print " "

        # Save the model to disk
        FileIO.saveModel(clf.best_estimator_, modelType, date)
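
RandomizedSearchCV draws candidate settings from the distributions stored in PARAMETERS. A sketch of what one entry might look like, using the scipy.stats distributions that scikit-learn accepts; the model key and ranges are illustrative:

from scipy.stats import randint, uniform

# Hypothetical search space for one model type; the real PARAMETERS
# dictionary lives in the project's Constants file.
PARAMETERS = {
    "forest": {
        "n_estimators": randint(100, 1000),   # uniform over ints [100, 1000)
        "max_depth": randint(2, 20),
        "max_features": uniform(0.1, 0.9),    # uniform over [0.1, 1.0)
    },
}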
def main(date, wavelengths, keywords=[], allSpectra=False):
    """
    Plot three wavelengths against each other from a specified set of data.

    :param date: (string) Data collection date YYYY_MMDD
    :param wavelengths: (3-tuple) Wavelengths to plot against one another
    :param keywords: (list of strings) Strings which should be included in the
                                       filenames of files being plotted
    :param allSpectra: (boolean) Determines whether there is one point for every
                                 spectrum collected, or one point for every leaf file

    :return: (None)
    """

    # Convert the wavelengths to indices for accessing the data
    wavelengthIndices = map(wavelengthToIndex, wavelengths)
    wavelengthIndex1 = wavelengthIndices[0]
    wavelengthIndex2 = wavelengthIndices[1]
    wavelengthIndex3 = wavelengthIndices[2]


    # Get the data files we will be looking at
    dataPath = DATA_DIRECTORIES[date]
    filesToPlot = FileIO.getDatafileNames(dataPath, keywords)

    pointsDR = []
    pointsGR = []
    pointsSUS = []

    for name in filesToPlot:

        tokens = name[0:-4].split('_')
        tokens = [token.lower() for token in tokens]

        plant = tokens[0]
        resistance = tokens[1]

        filePath = os.path.join(dataPath, name)
        data = FileIO.loadCSV(filePath)

        try:
            rows, columns = data.shape
            if columns < 2:
                continue
        except Exception:
            # Skip files that did not load as a 2D array
            continue

        if allSpectra:

            xValues = data[:, wavelengthIndex1]
            yValues = data[:, wavelengthIndex2]
            zValues = data[:, wavelengthIndex3]

            points = np.zeros((rows, 3))
            points[:, 0] = xValues
            points[:, 1] = yValues
            points[:, 2] = zValues
                
            # len() works both for the initial [] and the ndarray it becomes
            if resistance == SUSCEPTIBLE:
                if len(pointsSUS) == 0:
                    pointsSUS = points
                else:
                    pointsSUS = np.append(pointsSUS, points, axis=0)

            elif resistance == DR_RESISTANT:
                if len(pointsDR) == 0:
                    pointsDR = points
                else:
                    pointsDR = np.append(pointsDR, points, axis=0)

            elif resistance == GR_RESISTANT:
                if len(pointsGR) == 0:
                    pointsGR = points
                else:
                    pointsGR = np.append(pointsGR, points, axis=0)
            else:
                raise Exception("Unknown resistance type: " + resistance)

        else:

            mean = np.mean(data, axis=0)
            meanValue1 = mean[wavelengthIndex1]
            meanValue2 = mean[wavelengthIndex2]
            meanValue3 = mean[wavelengthIndex3]

            if resistance == SUSCEPTIBLE:
                pointsSUS.append([meanValue1, meanValue2, meanValue3])
            elif resistance == DR_RESISTANT:
                pointsDR.append([meanValue1, meanValue2, meanValue3])
            elif resistance == GR_RESISTANT:
                pointsGR.append([meanValue1, meanValue2, meanValue3])
            else:
                raise Exception("Unknown resistance type: " + resistance)

    # Plot the wavelengths
    pointsDR = np.array(pointsDR)
    pointsGR = np.array(pointsGR)
    pointsSUS = np.array(pointsSUS)

    traceSUS = plotPoints(pointsSUS, RESISTANCE_STRINGS[SUSCEPTIBLE], 'rgba(255, 0, 0, 0)')
    traceDR = plotPoints(pointsDR, RESISTANCE_STRINGS[DR_RESISTANT], 'rgba(0, 255, 0, 0)')
    traceGR = plotPoints(pointsGR, RESISTANCE_STRINGS[GR_RESISTANT], 'rgba(0, 0, 255, 0)')

    layout = go.Layout(
        title='3D Wavelength Plot',
        scene=go.Scene(
            xaxis=go.XAxis(title='Reflectance @ ' + str(wavelengths[0]) + ' nm'),
            yaxis=go.YAxis(title='Reflectance @ ' + str(wavelengths[1]) + ' nm'),
            zaxis=go.ZAxis(title='Reflectance @ ' + str(wavelengths[2]) + ' nm')
        )
    )

    data = [traceSUS, traceDR, traceGR]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename='3D Wavelength Plot')
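
This example calls two helpers, wavelengthToIndex and plotPoints, that are defined elsewhere in the module. A minimal sketch of plausible implementations, assuming a module-level WAVELENGTHS list of the sampled wavelengths in ascending order:

import numpy as np
import plotly.graph_objs as go

def wavelengthToIndex(wavelength):
    # Hypothetical: index of the sampled wavelength closest to the request
    return int(np.argmin(np.fabs(np.array(WAVELENGTHS) - wavelength)))

def plotPoints(points, name, color):
    # Hypothetical: wrap an (N, 3) point array in a labeled 3D scatter trace
    return go.Scatter3d(x=points[:, 0], y=points[:, 1], z=points[:, 2],
                        mode='markers', name=name,
                        marker=dict(size=5,
                                    line=dict(color=color, width=0.1)))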
def main(date, wavelengths, plotLeaves, binning, keywords=[]):
    """
    Plot the histogram of a specified list of wavelengths.

    :param date: (string) Data collection date YYYY_MMDD
    :param wavelengths: (list) Wavelengths to plot histograms
    :param plotLeaves: (boolean) Plot only a single point per 
                                 leaf vs. all spectra in a leaf
    :param binning: (float) Wavelength binning width (in nm)
    :param keywords: (list of strings) Strings which should be included in the
                                       filenames of files being plotted

    :return: (None)
    """

    numHistograms = len(wavelengths)

    # Get the data files we will be looking at
    dataPath = DATA_DIRECTORIES[date]
    filesToPlot = FileIO.getDatafileNames(dataPath, keywords)

    # Start with empty (0 x numHistograms) arrays so that no spurious
    # zero row ends up in the plotted data
    pointsDR = np.empty((0, numHistograms))
    pointsGR = np.empty((0, numHistograms))
    pointsSUS = np.empty((0, numHistograms))

    for name in filesToPlot:

        tokens = name[0:-4].split('_')
        tokens = [token.lower() for token in tokens]

        plant = tokens[0]
        resistance = tokens[1]
        imageType = tokens[2]
        index = int(tokens[4])

        filePath = os.path.join(dataPath, name)
        data = FileIO.loadCSV(filePath)

        # Extract the relevant data from the spectra in the data file
        try:
            if not binning:
                wavelengthIndices = map(wavelengthToIndex, wavelengths)
                histogramData = data[:, wavelengthIndices]
            else:
                
                indexRegions = map(lambda x: wavelengthRegionToIndices(x, binning), wavelengths)
                rows, columns = data.shape
                histogramData = np.zeros((rows, numHistograms))

                for i in xrange(numHistograms):

                    histogramData[:, i] = map(lambda j: np.mean(data[j,indexRegions[i]]), xrange(rows))


        except Exception, e:
            print "Error with file:", name, "-", e
            continue

        if plotLeaves:

            meanLeaf = map(lambda i: np.mean(histogramData[:,i]), xrange(numHistograms))

            if resistance == SUSCEPTIBLE:
                pointsSUS = np.append(pointsSUS, [meanLeaf], axis=0)
            elif resistance == DR_RESISTANT:
                pointsDR = np.append(pointsDR, [meanLeaf], axis=0)
            elif resistance == GR_RESISTANT:
                pointsGR = np.append(pointsGR, [meanLeaf], axis=0)
            else:
                raise Exception("Unknown resistance type: " + resistance)

        else:

            if resistance == SUSCEPTIBLE:
                pointsSUS = np.append(pointsSUS, histogramData, axis=0)
            elif resistance == DR_RESISTANT:
                pointsDR = np.append(pointsDR, histogramData, axis=0)
            elif resistance == GR_RESISTANT:
                pointsGR = np.append(pointsGR, histogramData, axis=0)
            else:
                raise Exception("Unknown resistance type: " + resistance)
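
As written, the function accumulates the points but never draws the histograms its docstring promises. A sketch of code that could close out main() above, one figure per requested wavelength with the three classes overlaid:

    # Sketch: one histogram per requested wavelength, classes overlaid
    for i in xrange(numHistograms):
        plt.hist(pointsSUS[:, i], bins=50, alpha=0.5, label="Susceptible")
        plt.hist(pointsDR[:, i], bins=50, alpha=0.5, label="DR Resistant")
        plt.hist(pointsGR[:, i], bins=50, alpha=0.5, label="GR Resistant")
        plt.title("Reflectance @ " + str(wavelengths[i]) + " nm")
        plt.xlabel("Reflectance")
        plt.ylabel("Count")
        plt.legend()
        plt.show()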
def main(date, takeSubset=False):
    """
    Reduces the dimensionality of the training data and plots the first
    three principal components in 3D space. The idea is to bring out
    separability between the resistance classes which may be hidden in
    the dimensionality of the data.

    :param date: (string) Data collection date YYYY_MMDD
    :param takeSubset: (boolean) Transform and plot a random subset of
                                 the training data?

    :return: (None)
    """

    mkl.set_num_threads(8)

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)

    if takeSubset:
        indices = np.random.choice(range(0, len(trainY)),
                                   size=NUM_SAMPLES,
                                   replace=False)
        X = trainX[indices, :]
        y = trainY[indices]
    else:
        X = trainX
        y = trainY

    X = np.nan_to_num(X)

    # Break the data into resistance classes
    susIndex = Constants.LABEL_TO_INDEX[Constants.SUSCEPTIBLE]
    drIndex = Constants.LABEL_TO_INDEX[Constants.DR_RESISTANT]
    grIndex = Constants.LABEL_TO_INDEX[Constants.GR_RESISTANT]

    susX = X[y == susIndex, :]
    drX = X[y == drIndex, :]
    grX = X[y == grIndex, :]

    # Transform the data using PCA. Fit once on the full data set so that
    # all three classes are projected into the same component space;
    # fitting per class would give each class its own, incomparable axes.
    pca = IncrementalPCA(n_components=6)
    pca.fit(X)

    pointsSUS = pca.transform(susX)
    pointsGR = pca.transform(grX)
    pointsDR = pca.transform(drX)

    # Plot the transformed data in 3D space
    traceSUS = go.Scatter3d(x=pointsSUS[:, 0],
                            y=pointsSUS[:, 1],
                            z=pointsSUS[:, 2],
                            mode='markers',
                            marker=dict(size=5,
                                        line=dict(color='rgba(255, 0, 0, 0)',
                                                  width=0.1),
                                        opacity=0))

    traceDR = go.Scatter3d(x=pointsDR[:, 0],
                           y=pointsDR[:, 1],
                           z=pointsDR[:, 2],
                           mode='markers',
                           marker=dict(size=5,
                                       line=dict(color='rgba(0, 255, 0, 0)',
                                                 width=0.1),
                                       opacity=0))

    traceGR = go.Scatter3d(x=pointsGR[:, 0],
                           y=pointsGR[:, 1],
                           z=pointsGR[:, 2],
                           mode='markers',
                           marker=dict(size=5,
                                       line=dict(color='rgba(0, 0, 255, 0)',
                                                 width=0.1),
                                       opacity=0))

    data = [traceSUS, traceDR, traceGR]
    fig = go.Figure(data=data)
    py.iplot(fig, filename='3D PCA Wavelength Plot')

    # Plot the principal components
    eigenSpectra = pca.components_

    plt.subplot(3, 1, 1)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[0, :])
    plt.title("Principle Components 1 - 3")
    plt.subplot(3, 1, 2)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[1, :])
    plt.subplot(3, 1, 3)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[2, :])
    plt.xlabel("Wavelength (nm)")
    plt.show()

    plt.clf()
    plt.subplot(3, 1, 1)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[3, :])
    plt.title("Principle Components 4 - 6")
    plt.subplot(3, 1, 2)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[4, :])
    plt.subplot(3, 1, 3)
    plt.plot(Constants.WAVELENGTHS, eigenSpectra[5, :])
    plt.xlabel("Wavelength (nm)")
    plt.show()
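
IncrementalPCA also supports out-of-core fitting, which matters once the spectra no longer fit in memory. A sketch using the variables from the function above; the batch size is an illustrative choice:

    # Sketch: feed IncrementalPCA fixed-size batches instead of the
    # whole matrix at once, then transform as before
    pca = IncrementalPCA(n_components=6)
    batchSize = 10000
    for start in xrange(0, X.shape[0], batchSize):
        pca.partial_fit(X[start:start + batchSize])
    pointsSUS = pca.transform(susX)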
def main(date, modelType, iterations):
    """
    Determines the optimal hyperparameters for a given machine learning
    model for a set of training data.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train
    :param iterations: (int) number of iterations for hyperparameter searching

    :return: (None)
    """

    # Make sure that the model is a valid choice
    if modelType not in MODELS and modelType != ALL:
        print "Invalid model type:", modelType
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = MODELS.keys()
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    for modelType in modelsToTrain:

        # Train the desired ML model
        name, clfType = MODELS[modelType]
        print "Training the", name

        baseClassifier = clfType()
        clf = RandomizedSearchCV(baseClassifier,
                                 param_distributions=PARAMETERS[modelType],
                                 n_iter=iterations,
                                 n_jobs=4)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print "Training Accuracy:", trainingAccuracy
        print "Testing Accuracy:", testingAccuracy
        print "Confusion Matrix:"
        print confusionMatrix
        print " "
        print "Hyperparameters:"
        for param in PARAMETERS[modelType].keys():
            print param + ':', clf.best_estimator_.get_params()[param]
        print " "

        # Save the model to disk
        FileIO.saveModel(clf.best_estimator_, modelType, date)
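
Beyond the held-out accuracy printed above, RandomizedSearchCV records the cross-validated score of every sampled candidate. A sketch of lines that could be added inside the loop above to report them (cv_results_ assumes scikit-learn >= 0.18):

        # Sketch: cross-validated score of the winning configuration,
        # plus the mean CV score of every sampled candidate
        print "Best CV score:", clf.best_score_
        print "Best parameters:", clf.best_params_
        for meanScore, params in zip(clf.cv_results_["mean_test_score"],
                                     clf.cv_results_["params"]):
            print meanScore, params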