def main(date):
    """
    Trains a random forest on the training data for the given date and
    shows a bar chart of the feature importances, ordered from the most
    to the least important feature.

    :param date: Date the training and testing data was collected (YYYY_MMDD)

    :return: (None)
    """

    # Load the training data into memory. nan_to_num swaps NaNs for 0 and
    # infinities for large finite values before the data reaches the forest.
    trainX, trainY = FileIO.loadTrainingData(date)
    trainX = np.nan_to_num(trainX)

    # Train the random forest on the training data, one job per CPU core.
    # random_state is pinned so repeated runs use the same seed.
    forest = RandomForestClassifier(n_estimators=500,
                                    random_state=0,
                                    n_jobs=multiprocessing.cpu_count())
    forest.fit(trainX, trainY)

    # Rank the features by importance, highest first. (The per-tree
    # standard deviation used to be computed here as well, but it was never
    # used by the plot, so that work has been dropped.)
    importances = forest.feature_importances_
    ranking = np.argsort(importances)[::-1]

    # Plot the feature importances of the forest; the x tick labels are the
    # original feature indices in ranked order
    numFeatures = trainX.shape[1]
    positions = range(numFeatures)

    plt.figure()
    plt.title("Feature importances")
    plt.bar(positions, importances[ranking], color="r", align="center")
    plt.xticks(positions, ranking)
    plt.xlim([-1, numFeatures])
    plt.show()
def main(date):
    """
    Trains a linear classifier (SGDClassifier with its default settings)
    between the herbicide resistance classes based on all wavelengths.
    The absolute weights associated with each wavelength are then
    plotted, one plot per class, allowing the user to see the
    contribution to classification by each wavelength.

    :param date: (string) Data collection date YYYY_MMDD

    :return: (None)
    """

    # Load the training data from disk
    X, y = FileIO.loadTrainingData(date)
    X = np.nan_to_num(X)

    # Train the classifier on the loaded data
    clf = SGDClassifier()
    clf.fit(X, y)

    # Only the magnitude of a weight is plotted, so drop the sign
    featureWeights = np.fabs(clf.coef_)

    # range() replaces the Python 2-only xrange() so the loop runs on both
    # Python 2 and Python 3.
    # NOTE(review): assumes coef_ has (at least) three rows whose order
    # matches INDEX_TO_LABEL -- confirm against the training labels.
    for i in range(3):
        className = RESISTANCE_STRINGS[INDEX_TO_LABEL[i]]

        plt.plot(WAVELENGTHS, featureWeights[i])
        plt.title("Linear Classifier Weights for " + className + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
def main(date):
    """
    Trains a random forest and plots the importance of every feature,
    sorted from the most to the least important.

    :param date: Date the training and testing data was collected (YYYY_MMDD)

    :return: (None)
    """

    # Load the training data into memory (NaNs are replaced with 0 and
    # infinities with large finite values by nan_to_num)
    trainX, trainY = FileIO.loadTrainingData(date)
    trainX = np.nan_to_num(trainX)

    # Train the random forest on the training data using every CPU core;
    # the fixed random_state keeps the seed the same between runs
    numCores = multiprocessing.cpu_count()
    forest = RandomForestClassifier(n_estimators=500,
                                    random_state=0,
                                    n_jobs=numCores)
    forest.fit(trainX, trainY)

    # Sort feature indices by descending importance. The unused per-tree
    # standard deviation that was previously computed here has been removed.
    importances = forest.feature_importances_
    order = np.argsort(importances)[::-1]

    # Plot the feature importances of the forest, labelling each bar with
    # the index of the feature it represents
    featureCount = trainX.shape[1]
    barPositions = range(featureCount)

    plt.figure()
    plt.title("Feature importances")
    plt.bar(barPositions, importances[order], color="r", align="center")
    plt.xticks(barPositions, order)
    plt.xlim([-1, featureCount])
    plt.show()
def main(date):
    """
    Fits a linear classifier (a default-configured SGDClassifier) to
    separate the herbicide resistance classes using all wavelengths,
    then plots the absolute weight of each wavelength for each class so
    the user can see how much each wavelength contributes to the
    classification.

    :param date: (string) Data collection date YYYY_MMDD

    :return: (None)
    """

    # Load the training data from disk
    X, y = FileIO.loadTrainingData(date)
    X = np.nan_to_num(X)

    # Train the classifier on the loaded data
    clf = SGDClassifier()
    clf.fit(X, y)

    # Plot the feature weights to visualize feature contributions; the
    # absolute value is taken because only the magnitude is of interest
    featureWeights = np.fabs(clf.coef_)

    # xrange() only exists on Python 2; range() behaves the same here and
    # also works on Python 3.
    # NOTE(review): the hard-coded 3 presumes three classes with coef_ rows
    # ordered like INDEX_TO_LABEL -- verify.
    for classIndex in range(3):
        label = RESISTANCE_STRINGS[INDEX_TO_LABEL[classIndex]]

        plt.plot(WAVELENGTHS, featureWeights[classIndex])
        plt.title("Linear Classifier Weights for " + label + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
# Example #5
# 0
def main(date, modelType):
    """
    Runs the training script. Trains the specified model type (or every
    known model type when modelType is ALL), prints basic accuracy
    figures for each trained model, and saves it with FileIO.saveModel.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train;
                      a key of MODELS, or ALL to train every model

    :return: (None)
    """

    # Make sure that the model is a valid choice
    if modelType not in MODELS and modelType != ALL:
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3, unlike the print statement
        print("Invalid model type: %s" % (modelType,))
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = list(MODELS)
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    # A separate loop variable keeps the modelType argument intact
    for currentType in modelsToTrain:

        # Train the desired ML model with its configured hyperparameters
        name, clfType = MODELS[currentType]
        hyperparameters = HYPERPARAMETERS[currentType]
        print("Training the %s" % (name,))

        clf = clfType(**hyperparameters)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print("Training Accuracy: %s" % (trainingAccuracy,))
        print("Testing Accuracy: %s" % (testingAccuracy,))
        print("Confusion Matrix:")
        print(confusionMatrix)
        print(" ")

        # Save the model to disk
        FileIO.saveModel(clf, currentType, date)
# Example #6
# 0
def main(date, modelType):
    """
    Runs the training script. Each requested model is trained on the
    training data, evaluated on both the training and testing data
    (accuracy plus a confusion matrix for the testing data), and then
    saved through FileIO.saveModel.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train;
                      either a key of MODELS or ALL for every model

    :return: (None)
    """

    # Make sure that the model is a valid choice
    isKnownModel = modelType in MODELS
    if not isKnownModel and modelType != ALL:
        # print(...) with one pre-formatted argument behaves identically on
        # Python 2 and Python 3
        print("Invalid model type: %s" % (modelType,))
        return

    # Allow for training more than one model at a time
    modelsToTrain = list(MODELS) if modelType == ALL else [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    # The loop uses its own name so the modelType argument is not rebound
    for modelKey in modelsToTrain:

        # Train the desired ML model
        name, clfType = MODELS[modelKey]
        hyperparameters = HYPERPARAMETERS[modelKey]
        print("Training the %s" % (name,))

        clf = clfType(**hyperparameters)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print("Training Accuracy: %s" % (trainingAccuracy,))
        print("Testing Accuracy: %s" % (testingAccuracy,))
        print("Confusion Matrix:")
        print(confusionMatrix)
        print(" ")

        # Save the model to disk
        FileIO.saveModel(clf, modelKey, date)
def main(date, takeSubset=False):
    """
    Projects the training data onto its first 6 principal components and
    plots the first 3 of them in 3d space, one trace per resistance
    class. The idea is to bring out separability between the resistance
    classes which may be hidden in the dimensionality of the data. The
    6 principal components themselves are then plotted against
    wavelength in two figures of three subplots each.

    :param date: (string) Data collection date YYYY_MMDD
    :param takeSubset: (boolean) Transform and plot a random subset of
                                 the training data (NUM_SAMPLES samples
                                 drawn without replacement)?

    :return: (None)
    """

    mkl.set_num_threads(8)

    # Load the training data into memory
    trainX, trainY = FileIO.loadTrainingData(date)

    if takeSubset:
        # An int population is treated by np.random.choice like
        # np.arange(len(trainY))
        indices = np.random.choice(len(trainY), size=NUM_SAMPLES,
                                   replace=False)
        X = trainX[indices, :]
        y = trainY[indices]
    else:
        X = trainX
        y = trainY

    X = np.nan_to_num(X)

    # Break the data into resistance classes
    susIndex = Constants.LABEL_TO_INDEX[Constants.SUSCEPTIBLE]
    drIndex = Constants.LABEL_TO_INDEX[Constants.DR_RESISTANT]
    grIndex = Constants.LABEL_TO_INDEX[Constants.GR_RESISTANT]

    susX = X[y == susIndex, :]
    drX = X[y == drIndex, :]
    grX = X[y == grIndex, :]

    # Transform the data using PCA. The model is fit ONCE on all samples
    # and each class is then projected with that same model. Refitting per
    # class (as was done before) gives every class its own axes, so the
    # traces are not comparable, and leaves components_ describing only
    # the class that happened to be fit last.
    pca = IncrementalPCA(n_components=6)
    pca.fit(X)

    pointsSUS = pca.transform(susX)
    pointsDR = pca.transform(drX)
    pointsGR = pca.transform(grX)

    def makeTrace(points, lineColor):
        # 3D scatter trace of the first three principal components.
        # NOTE(review): opacity=0 together with the zero alpha in the line
        # colors looks like it would render fully transparent markers --
        # confirm this is intended. Values are kept as they were.
        return go.Scatter3d(
            x=points[:, 0],
            y=points[:, 1],
            z=points[:, 2],
            mode='markers',
            marker=dict(
                size=5,
                line=dict(
                    color=lineColor,
                    width=0.1
                ),
                opacity=0
            )
        )

    # Plot the transformed data in 3D space
    traceSUS = makeTrace(pointsSUS, 'rgba(255, 0, 0, 0)')
    traceDR = makeTrace(pointsDR, 'rgba(0, 255, 0, 0)')
    traceGR = makeTrace(pointsGR, 'rgba(0, 0, 255, 0)')

    data = [traceSUS, traceDR, traceGR]
    fig = go.Figure(data=data)
    py.iplot(fig, filename='3D PCA Wavelength Plot')

    # Plot the principal components
    eigenSpectra = pca.components_

    def plotComponents(firstRow, title):
        # Three stacked subplots for rows firstRow .. firstRow + 2; the
        # title goes on the top subplot, the x label on the bottom one
        for offset in range(3):
            plt.subplot(3, 1, offset + 1)
            plt.plot(Constants.WAVELENGTHS, eigenSpectra[firstRow + offset, :])
            if offset == 0:
                plt.title(title)
        plt.xlabel("Wavelength (nm)")
        plt.show()

    plotComponents(0, "Principle Components 1 - 3")

    plt.clf()
    plotComponents(3, "Principle Components 4 - 6")
def main(date, modelType, iterations):
    """
    Determines the optimal hyperparameters for a given machine learning
    model for a set of training data using a randomized search, prints
    basic accuracy figures and the selected hyperparameters, and saves
    the best estimator with FileIO.saveModel.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train;
                      a key of MODELS, or ALL to search every model
    :param iterations: (int) number of iterations for hyperparameter searching

    :return: (None)
    """

    # Make sure that the model is a valid choice
    if modelType not in MODELS and modelType != ALL:
        # Single-argument print(...) gives the same output on Python 2 and
        # Python 3, unlike the print statement
        print("Invalid model type: %s" % (modelType,))
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = list(MODELS)
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    # A separate loop variable keeps the modelType argument intact
    for currentType in modelsToTrain:

        # Search the hyperparameter space of the desired ML model
        name, clfType = MODELS[currentType]
        searchSpace = PARAMETERS[currentType]
        print("Training the %s" % (name,))

        baseClassifier = clfType()
        clf = RandomizedSearchCV(baseClassifier,
                                 param_distributions=searchSpace,
                                 n_iter=iterations,
                                 n_jobs=4)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print("Training Accuracy: %s" % (trainingAccuracy,))
        print("Testing Accuracy: %s" % (testingAccuracy,))
        print("Confusion Matrix:")
        print(confusionMatrix)
        print(" ")

        # Report the chosen value of every searched hyperparameter; the
        # parameter dict is fetched once instead of once per parameter
        print("Hyperparameters:")
        bestParams = clf.best_estimator_.get_params()
        for param in searchSpace:
            print("%s: %s" % (param, bestParams[param]))
        print(" ")

        # Save the model to disk
        FileIO.saveModel(clf.best_estimator_, currentType, date)
def main(date, takeSubset=False):
    """
    Reduces the training data to 6 principal components and plots the
    first 3 of them in 3d space with one trace per resistance class.
    The idea is to bring out separability between the resistance classes
    which may be hidden in the dimensionality of the data. Afterwards
    the 6 principal components are plotted against wavelength.

    :param date: (string) Data collection date YYYY_MMDD
    :param takeSubset: (boolean) Transform and plot a random subset of
                                 the training data (NUM_SAMPLES samples
                                 chosen without replacement)?

    :return: (None)
    """

    mkl.set_num_threads(8)

    # Load the training data into memory
    trainX, trainY = FileIO.loadTrainingData(date)

    if takeSubset:
        # Passing an int makes np.random.choice sample from
        # np.arange(len(trainY))
        indices = np.random.choice(len(trainY),
                                   size=NUM_SAMPLES,
                                   replace=False)
        X = trainX[indices, :]
        y = trainY[indices]
    else:
        X = trainX
        y = trainY

    X = np.nan_to_num(X)

    # Break the data into resistance classes
    susIndex = Constants.LABEL_TO_INDEX[Constants.SUSCEPTIBLE]
    drIndex = Constants.LABEL_TO_INDEX[Constants.DR_RESISTANT]
    grIndex = Constants.LABEL_TO_INDEX[Constants.GR_RESISTANT]

    susX = X[y == susIndex, :]
    drX = X[y == drIndex, :]
    grX = X[y == grIndex, :]

    # Transform the data using PCA. A single fit on all samples defines
    # one shared set of axes; every class is projected onto those axes.
    # Fitting separately per class would place each trace in a different
    # coordinate system and make components_ reflect only the last fit.
    pca = IncrementalPCA(n_components=6)
    pca.fit(X)

    pointsSUS = pca.transform(susX)
    pointsGR = pca.transform(grX)
    pointsDR = pca.transform(drX)

    def buildTrace(points, outlineColor):
        # Scatter trace over the first three principal components.
        # NOTE(review): opacity=0 and the zero alpha channel of the outline
        # colors appear to make the markers invisible -- confirm intent.
        # The values are unchanged from the original.
        return go.Scatter3d(x=points[:, 0],
                            y=points[:, 1],
                            z=points[:, 2],
                            mode='markers',
                            marker=dict(size=5,
                                        line=dict(color=outlineColor,
                                                  width=0.1),
                                        opacity=0))

    # Plot the transformed data in 3D space
    traceSUS = buildTrace(pointsSUS, 'rgba(255, 0, 0, 0)')
    traceDR = buildTrace(pointsDR, 'rgba(0, 255, 0, 0)')
    traceGR = buildTrace(pointsGR, 'rgba(0, 0, 255, 0)')

    data = [traceSUS, traceDR, traceGR]
    fig = go.Figure(data=data)
    py.iplot(fig, filename='3D PCA Wavelength Plot')

    # Plot the principal components
    eigenSpectra = pca.components_

    def showComponents(startRow, title):
        # Draws components startRow .. startRow + 2 as three stacked
        # subplots, titled on the first and x-labelled on the last
        for position in range(3):
            plt.subplot(3, 1, position + 1)
            plt.plot(Constants.WAVELENGTHS, eigenSpectra[startRow + position, :])
            if position == 0:
                plt.title(title)
        plt.xlabel("Wavelength (nm)")
        plt.show()

    showComponents(0, "Principle Components 1 - 3")

    plt.clf()
    showComponents(3, "Principle Components 4 - 6")
def main(date, modelType, iterations):
    """
    Runs a randomized hyperparameter search for a given machine learning
    model on a set of training data. For each model searched, basic
    accuracy figures and the selected hyperparameter values are printed
    and the best estimator is saved through FileIO.saveModel.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train;
                      either a key of MODELS or ALL for every model
    :param iterations: (int) number of iterations for hyperparameter searching

    :return: (None)
    """

    # Make sure that the model is a valid choice
    isKnownModel = modelType in MODELS
    if not isKnownModel and modelType != ALL:
        # print(...) with one pre-formatted argument behaves identically on
        # Python 2 and Python 3
        print("Invalid model type: %s" % (modelType,))
        return

    # Allow for training more than one model at a time
    modelsToTrain = list(MODELS) if modelType == ALL else [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    # The loop uses its own name so the modelType argument is not rebound
    for modelKey in modelsToTrain:

        # Search the hyperparameter space of the desired ML model
        name, clfType = MODELS[modelKey]
        distributions = PARAMETERS[modelKey]
        print("Training the %s" % (name,))

        clf = RandomizedSearchCV(clfType(),
                                 param_distributions=distributions,
                                 n_iter=iterations,
                                 n_jobs=4)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print("Training Accuracy: %s" % (trainingAccuracy,))
        print("Testing Accuracy: %s" % (testingAccuracy,))
        print("Confusion Matrix:")
        print(confusionMatrix)
        print(" ")

        # List the value picked for each searched hyperparameter; the
        # estimator's parameters are looked up once, outside the loop
        print("Hyperparameters:")
        chosen = clf.best_estimator_.get_params()
        for param in distributions:
            print("%s: %s" % (param, chosen[param]))
        print(" ")

        # Save the model to disk
        FileIO.saveModel(clf.best_estimator_, modelKey, date)