예제 #1
0
def TabulateModelPerformanceForROC(model, xValidate, yValidate):
    """Sweep the classification threshold from 0.0 to 1.0 and tabulate the
    false positive rate and false negative rate at each point.

    Args:
        model: fitted model exposing predict(x, classificationThreshold=...).
        xValidate: validation-set features.
        yValidate: validation-set labels (parallel to xValidate).

    Returns:
        (FPRs, FNRs, thresholds) -- three parallel lists, one entry per
        evaluated threshold.

    Raises:
        UserWarning: if the model's predict does not accept the
            classificationThreshold parameter.
    """
    pointsToEvaluate = 100
    thresholds = [
        x / float(pointsToEvaluate) for x in range(pointsToEvaluate + 1)
    ]
    FPRs = []
    FNRs = []

    try:
        for threshold in thresholds:
            # Fix: predict once per threshold and reuse the result for both
            # rates (the original ran the full predict twice per threshold).
            yPredicted = model.predict(xValidate,
                                       classificationThreshold=threshold)
            FPRs.append(
                EvaluateBinaryClassification.FalsePositiveRate(
                    yValidate, yPredicted))
            FNRs.append(
                EvaluateBinaryClassification.FalseNegativeRate(
                    yValidate, yPredicted))
    except NotImplementedError:
        raise UserWarning(
            "The 'model' parameter must have a 'predict' method that supports using a 'classificationThreshold' parameter with range [ 0 - 1.0 ] to create classifications."
        )

    return (FPRs, FNRs, thresholds)
def ExecuteEvaluationRun(runSpecification, xTrainRaw, yTrain, numberOfFolds=2):
    """Featurize the raw training data and evaluate one hyperparameter
    setting, either by k-fold cross validation (numberOfFolds > 1) or on
    the held-out validation set (numberOfFolds == 1).

    Results (accuracy, error bounds, runtime) are written back into
    runSpecification, which is also returned.

    NOTE(review): xValidateRaw and yValidate are read from enclosing
    scope, not passed in -- confirm they are defined by the caller.
    """
    print("runSpecification: ", runSpecification)
    startTime = time.time()

    featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
    featurizer.CreateVocabulary(
        xTrainRaw,
        yTrain,
        numFrequentWords=runSpecification['numFrequentWords'],
        numMutualInformationWords=runSpecification['numMutualInformationWords']
    )

    xTrain = featurizer.Featurize(xTrainRaw)
    xValidate = featurizer.Featurize(xValidateRaw)

    if numberOfFolds > 1:
        crossValidationAccuracy = []
        for i in range(numberOfFolds):
            # Fold i is held out for evaluation; the rest trains.
            xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                xTrain, yTrain, numberOfFolds, i)

            model = LogisticRegression.LogisticRegression()
            model.fit(xTrainI,
                      yTrainI,
                      convergence=runSpecification['convergence'],
                      stepSize=runSpecification['stepSize'],
                      verbose=False)

            crossValidationAccuracy.append(
                EvaluateBinaryClassification.Accuracy(
                    yEvaluateI, model.predict(xEvaluateI)))

        mean = np.mean(crossValidationAccuracy)
        runSpecification['crossValidationMean'] = mean
        # Fix: reuse the mean computed above instead of recomputing it.
        # The bound is sized on one fold (len(yEvaluateI)).
        lower, _ = ErrorBounds.GetAccuracyBounds(mean, len(yEvaluateI), .5)
        runSpecification['crossValidationErrorBound'] = mean - lower
    elif numberOfFolds == 1:
        # Branches are mutually exclusive, so `elif` replaces the second `if`.
        model = LogisticRegression.LogisticRegression()
        model.fit(xTrain,
                  yTrain,
                  convergence=runSpecification['convergence'],
                  stepSize=runSpecification['stepSize'],
                  verbose=False)
        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
            yValidate, model.predict(xValidate))

        runSpecification['accuracy'] = validationSetAccuracy
        lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                 len(yValidate), .5)
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lower

    endTime = time.time()
    # NOTE(review): runtime is only recorded on the cross-validation path
    # in the original; preserved as-is -- confirm whether the single-fold
    # path should record it too.
    if numberOfFolds > 1:
        runSpecification['runtime'] = endTime - startTime

    return runSpecification
def ExecuteEvaluationRun(runSpecification, xTrainRaw, yTrain, numberOfFolds=5):
    """Featurize the raw training data, fit a logistic regression model
    with the hyperparameters in runSpecification, and record the
    validation-set accuracy and wall-clock runtime in runSpecification,
    which is returned.

    NOTE(review): xValidateRaw and yValidate come from enclosing scope --
    confirm the caller defines them.
    """
    runStart = time.time()

    # HERE upgrade this to use crossvalidation

    vocabularyFeaturizer = SMSSpamFeaturize.SMSSpamFeaturize()
    vocabularyFeaturizer.CreateVocabulary(
        xTrainRaw,
        yTrain,
        numFrequentWords=runSpecification['numFrequentWords'],
        numMutualInformationWords=runSpecification['numMutualInformationWords']
    )

    featurizedTrain = vocabularyFeaturizer.Featurize(xTrainRaw)
    featurizedValidate = vocabularyFeaturizer.Featurize(xValidateRaw)

    classifier = LogisticRegression.LogisticRegression()
    classifier.fit(featurizedTrain,
                   yTrain,
                   convergence=runSpecification['convergence'],
                   stepSize=runSpecification['stepSize'],
                   verbose=True)

    # Score on the held-out validation set.
    runSpecification['accuracy'] = EvaluateBinaryClassification.Accuracy(
        yValidate, classifier.predict(featurizedValidate))

    # HERE upgrade this to calculate and save some error bounds...

    runSpecification['runtime'] = time.time() - runStart

    return runSpecification
예제 #4
0
def ExecuteEvaluationRun(runSpecification, xTrain, yTrain, numberOfFolds=2):
    """Evaluate one logistic-regression hyperparameter setting.

    With numberOfFolds > 1 the accuracy is estimated by k-fold cross
    validation on (xTrain, yTrain); with numberOfFolds == 1 a single
    model is fit on all of xTrain and scored on the validation set.
    Results are written back into runSpecification, which is returned.

    NOTE(review): the numberOfFolds == 1 branch reads xValidate and
    yValidate from enclosing scope -- confirm the caller defines them.
    """
    print("runSpecification: ", runSpecification)
    startTime = time.time()

    if numberOfFolds > 1:
        crossValidationAccuracy = []
        for i in range(numberOfFolds):
            # Fold i is held out for evaluation; the rest trains.
            xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                xTrain, yTrain, numberOfFolds, i)

            model = LogisticRegression.LogisticRegression()
            model.fit(xTrainI,
                      yTrainI,
                      convergence=runSpecification['convergence'],
                      stepSize=runSpecification['stepSize'],
                      verbose=False)

            crossValidationAccuracy.append(
                EvaluateBinaryClassification.Accuracy(
                    yEvaluateI, model.predict(xEvaluateI)))

        mean = np.mean(crossValidationAccuracy)
        runSpecification['crossValidationMean'] = mean
        # Fix: reuse the mean computed above instead of recomputing it.
        lower, _ = ErrorBounds.GetAccuracyBounds(mean, len(yEvaluateI), .5)
        runSpecification['crossValidationErrorBound'] = mean - lower
    elif numberOfFolds == 1:
        # Branches are mutually exclusive, so `elif` replaces the second `if`.
        model = LogisticRegression.LogisticRegression()
        model.fit(xTrain,
                  yTrain,
                  convergence=runSpecification['convergence'],
                  stepSize=runSpecification['stepSize'],
                  verbose=False)
        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
            yValidate, model.predict(xValidate))

        runSpecification['accuracy'] = validationSetAccuracy
        lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                 len(yValidate), .5)
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lower

    endTime = time.time()
    # NOTE(review): runtime is only recorded on the cross-validation path
    # in the original; preserved as-is -- confirm whether the single-fold
    # path should record it too.
    if numberOfFolds > 1:
        runSpecification['runtime'] = endTime - startTime

    return runSpecification
예제 #5
0
    def ExecuteEvaluationRun(runSpecification,
                             xTrain,
                             yTrain,
                             numberOfFolds=2):
        """Evaluate one decision-tree hyperparameter setting.

        With numberOfFolds > 1 the accuracy is estimated by k-fold cross
        validation on (xTrain, yTrain); with numberOfFolds == 1 a single
        tree is fit on all of xTrain and scored on the validation set.
        Results and runtime are written back into runSpecification,
        which is also returned.

        NOTE(review): the numberOfFolds == 1 branch reads xValidate and
        yValidate from enclosing scope -- confirm the caller defines them.
        """
        print("runSpecification: ", runSpecification)
        startTime = time.time()

        if numberOfFolds > 1:
            crossValidationAccuracy = []
            for i in range(numberOfFolds):
                # Fold i is held out for evaluation; the rest trains.
                xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                    xTrain, yTrain, numberOfFolds, i)

                model = DecisionTree.DecisionTree()
                model.fit(xTrainI,
                          yTrainI,
                          maxDepth=runSpecification["maxDepth"])

                crossValidationAccuracy.append(
                    EvaluateBinaryClassification.Accuracy(
                        yEvaluateI, model.predict(xEvaluateI)))

            mean = np.mean(crossValidationAccuracy)
            runSpecification['crossValidationMean'] = mean
            # 95%-confidence bound, sized on one fold (len(yEvaluateI)).
            lower, _ = ErrorBounds.GetAccuracyBounds(
                np.mean(crossValidationAccuracy), len(yEvaluateI), .95)
            runSpecification['crossValidationErrorBound'] = mean - lower

        if numberOfFolds == 1:
            model = DecisionTree.DecisionTree()
            model.fit(xTrain, yTrain, maxDepth=runSpecification["maxDepth"])
            validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
                yValidate, model.predict(xValidate))

            runSpecification['accuracy'] = validationSetAccuracy
            lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                     len(yValidate), .95)
            runSpecification[
                'accuracyErrorBound'] = validationSetAccuracy - lower
            # Mirror the values under the cross-validation keys so
            # downstream reporting works for either mode.
            runSpecification['crossValidationMean'] = validationSetAccuracy
            runSpecification[
                'crossValidationErrorBound'] = validationSetAccuracy - lower

        endTime = time.time()
        runSpecification['runtime'] = endTime - startTime

        return runSpecification
예제 #6
0
def trainModel(m, op, maxE, patience=10, saveChartName=""):
    """Train model m with optimizer op using full-batch gradient descent,
    early-stopping when the validation loss keeps increasing.

    Args:
        m: the PyTorch module to train.
        op: optimizer bound to m's parameters.
        maxE: epoch limit (the loop runs while epoch < maxE).
        patience: consecutive epochs of increasing validation loss
            tolerated before declaring convergence.
        saveChartName: if non-empty, a loss-per-epoch chart is written
            under this name.

    Returns:
        Validation-set accuracy of the trained model at threshold 0.5.

    NOTE(review): xTrain, yTrain, xValidate, yValidate, Charting, and
    kOutputDirectory are read from enclosing scope -- confirm the caller
    defines them.
    """
    startTime = time.time()

    lossFunction = torch.nn.BCELoss(reduction='mean')

    trainLosses = []
    validationLosses = []

    converged = False
    epoch = 1
    lastValidationLoss = None

    currPatience = 0
    while not converged and epoch < maxE:
        # Reset the gradients in the network to zero
        op.zero_grad()

        # Forward pass over the full training set, then backprop the loss.
        yTrainPredicted = m(xTrain)
        trainLoss = lossFunction(yTrainPredicted, yTrain)
        trainLoss.backward()

        # Do a weight update step
        op.step()

        # Switch to evaluation mode and check the validation loss.
        m.train(mode=False)
        yValidationPredicted = m(xValidate)
        validationLoss = lossFunction(yValidationPredicted, yValidate)

        # Recompute training loss after the weight update so the recorded
        # series reflects the post-step model.
        yTrainingPredicted = m(xTrain)
        trainLoss = lossFunction(yTrainingPredicted, yTrain)
        # NOTE(review): BCELoss(reduction='mean') is already averaged;
        # dividing by len(...) again rescales the recorded series --
        # confirm this is intended.
        trainLosses.append(trainLoss.item() / len(yTrain))
        validationLosses.append(validationLoss.item() / len(yValidate))
        print("epoch {}: training loss {}, validation loss {}".format(
            epoch, trainLosses[-1], validationLosses[-1]))

        # Early stopping: allow up to `patience` consecutive epochs of
        # increasing validation loss before declaring convergence.
        if lastValidationLoss is not None and validationLoss > lastValidationLoss:
            if currPatience < patience:
                currPatience += 1
            else:
                converged = True
        else:
            lastValidationLoss = validationLoss
            currPatience = 0
        epoch = epoch + 1
        m.train(mode=True)

    endTime = time.time()
    print("Runtime: %s" % (endTime - startTime))

    ##
    # Visualize Training run
    ##
    if saveChartName != "":
        xValues = [i + 1 for i in range(len(trainLosses))]
        Charting.PlotSeries([trainLosses, validationLosses],
                            ["Train Loss", "Validate Loss"],
                            xValues,
                            useMarkers=False,
                            chartTitle="Blink LeNet Model Loss/Epoch",
                            xAxisTitle="Epoch",
                            yAxisTitle="Loss",
                            yBotLimit=0.0,
                            outputDirectory=kOutputDirectory,
                            fileName="4-" + saveChartName)

    ##
    # Get the model accuracy on validation set
    ##
    # Bug fix: the original called model.train(mode=False) on a global
    # `model` rather than the function parameter `m`.
    m.train(mode=False)
    yValidatePredicted = m(xValidate)
    return EvaluateBinaryClassification.Accuracy(
        yValidate, [1 if pred > 0.5 else 0 for pred in yValidatePredicted])
                             convergence=convergence,
                             momentum=momentum)
        if (i + 1) % 100 == 0:
            for filterNumber in range(hiddenStructure[0]):
                ## update the first parameter based on your representation
                #VisualizeWeights([model.weight0[0][filterNumber]] + list(model.layers[0][filterNumber][:]), "%s/filters/epoch%d_neuron%d.jpg" % (kOutputDirectory, i+1, filterNumber), sampleStride=sampleStride)
                VisualizeWeights([model.weight0[0][filterNumber]] +
                                 list(model.layers[0][filterNumber][:]),
                                 "%s/filters2/epoch%d_neuron%d.jpg" %
                                 (kOutputDirectory, i + 1, filterNumber),
                                 sampleStride=sampleStride)
    tLoss = model.loss(xTrain, yTrain)
    vLoss = model.loss(xValidate, yValidate)
    trainingLosses.append(tLoss)
    validationLosses.append(vLoss)

import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting
#Charting.PlotSeries([trainingLosses, validationLosses], ["training loss", "validation loss"], list(range(maxEpochs)), chartTitle="Single Layer Loss", xAxisTitle="epochs", yAxisTitle="loss", outputDirectory=kOutputDirectory+"/visualize\\", fileName="2-SingleLayerModelLoss")
# Plot training vs. validation loss per epoch for the two-layer model.
Charting.PlotSeries([trainingLosses, validationLosses],
                    ["training loss", "validation loss"],
                    list(range(maxEpochs)),
                    chartTitle="Two Layer Loss",
                    xAxisTitle="epochs",
                    yAxisTitle="loss",
                    # NOTE(review): mixes '/' and an escaped backslash in
                    # the output path -- confirm this is intended.
                    outputDirectory=kOutputDirectory + "/visualize\\",
                    fileName="2-TwoLayerModelLoss")

# Evaluate things...
# Validation-set accuracy of the previously fit model.
accuracy = EvaluateBinaryClassification.Accuracy(yValidate,
                                                 model.predict(xValidate))
print("Model Accuracy is:", accuracy)
import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting

# X-axis values: epoch numbers 1..len(trainLosses).
xValues = [i + 1 for i in range(len(trainLosses))]

Charting.PlotSeries([trainLosses, validationLosses],
                    ["Train Loss", "Validate Loss"],
                    xValues,
                    useMarkers=False,
                    chartTitle="Pytorch First Modeling Run",
                    xAxisTitle="Epoch",
                    yAxisTitle="Loss",
                    yBotLimit=0.0,
                    outputDirectory=kOutputDirectory,
                    fileName="PyTorch-Initial-TrainValidate")

##
# Evaluate the Model
##

import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

# Switch the network to evaluation mode before predicting on the test set.
model.train(mode=False)
yTestPredicted = model(xTest)

# Threshold the network outputs at 0.5 to get hard 0/1 predictions.
testAccuracy = EvaluateBinaryClassification.Accuracy(
    yTest, [1 if pred > 0.5 else 0 for pred in yTestPredicted])
print("Accuracy simple:", testAccuracy,
      ErrorBounds.GetAccuracyBounds(testAccuracy, len(yTestPredicted), 0.95))
예제 #9
0
    # Produce feature vectors for each split using the numeric featurizer.
    xTrainNumeric = featurizerNumeric.Featurize(xTrainRaw)
    xValidateNumeric = featurizerNumeric.Featurize(xValidateRaw)
    xTestNumeric = featurizerNumeric.Featurize(xTestRaw)

    ############################
    import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
    import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds
    import MachineLearningCourse.MLUtilities.Data.CrossValidation as CrossValidation
    import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting
    import time
    import numpy as np

    # Decision tree on the numeric features, scored on the validation set.
    model = DecisionTree.DecisionTree()
    model.fit(xTrainNumeric, yTrain)
    validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
        yValidate, model.predict(xValidateNumeric))
    print("numericvalidationSetAccuracy: ", validationSetAccuracy)
    #model.visualize()

    # Decision tree on the other featurization, for comparison.
    # NOTE(review): xTrain/xValidate come from enclosing scope -- confirm
    # which featurization they hold.
    model = DecisionTree.DecisionTree()
    model.fit(xTrain, yTrain)
    validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
        yValidate, model.predict(xValidate))
    print("validationSetAccuracy: ", validationSetAccuracy)
    #model.visualize()


# A helper function for calculating FN rate and FP rate across a range of thresholds
def TabulateModelPerformanceForROC(model, xValidate, yValidate):
    pointsToEvaluate = 100
    thresholds = [
예제 #10
0
        if epoch > 10 and lastLoss != None and abs(lastLoss - loss) < convergence:
            if patience >  4:
                converged = True
                pass
            else: 
                patience += 1
        else:
            lastLoss = loss
            patience = 0
            
        epoch = epoch + 1

    # Restore training mode after the epoch loop.
    # NOTE(review): the accuracies below are computed with the model in
    # training mode -- confirm eval mode (mode=False) was not intended.
    model.train(mode=True)

    # Check accuracies
    torch_accuracy = EvaluateBinaryClassification.Accuracy(yTrain,model.predict(xTrain))
    print("Training accuracy: " + str(torch_accuracy))
    torch_accuracy = EvaluateBinaryClassification.Accuracy(yValidate,model.predict(xValidate))
    print("Validation accuracy: " + str(torch_accuracy))

    # Calculate ROC curve
    (modelFPRs, modelFNRs, thresholds) = TabulateModelPerformanceForROC(model, xValidate, yValidate)

    # Accumulate this model's ROC series for a combined comparison chart.
    FNRs_series.append(modelFNRs)
    FPRs_series.append(modelFPRs)
    Label_series.append("PyTorch")

    #### Include 3x3 Grid Features
    print("Moving on to 3x3 features...")
 
    # Featureize
예제 #11
0
    def ExecuteFitting(runSpecification, xTrain, yTrain, xValidate, yValidate):
        """Train a BlinkNeuralNetwork with the hyperparameters in
        runSpecification and record runtime, epoch count, validation
        accuracy, and a 50%-confidence error bound back into
        runSpecification, which is returned.

        runSpecification must supply 'convergence' and 'learning_rate'.
        """
        startTime = time.time()

        # Create the model (fixed two-hidden-layer topology).
        model = BlinkNeuralNetwork.BlinkNeuralNetwork(hiddenNodes = 6, hiddenNodesTwo = 4)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print("Device is:", device)

        model.to(device)

        # Move the data onto whichever device was selected
        xTrain = xTrain.to(device)
        yTrain = yTrain.to(device)
        xValidate = xValidate.to(device)
        yValidate = yValidate.to(device)

        converged = False
        epoch = 1
        lastLoss = None
        convergence = runSpecification['convergence']
        optimizer = torch.optim.SGD(model.parameters(), lr=runSpecification['learning_rate'])
        lossFunction = torch.nn.MSELoss(reduction='mean')
        patience = 0

        while not converged and epoch < 5000:
            # Do the forward pass
            yTrainPredicted = model(xTrain)
            trainLoss = lossFunction(yTrainPredicted, yTrain)

            # Reset the gradients in the network to zero
            optimizer.zero_grad()

            # Backprop the errors from the loss on this iteration
            trainLoss.backward()

            # Do a weight update step
            optimizer.step()

            loss = trainLoss.item()
            # Idiom fix: compare to None with `is not None`.
            # NOTE(review): `patience >= 0` is always true, so training
            # converges on the first sub-convergence loss delta and the
            # patience counter is never used -- a sibling example uses
            # `patience > 4` here; confirm which is intended.
            if epoch > 10 and lastLoss is not None and abs(lastLoss - loss) < convergence:
                if patience >= 0:
                    converged = True
                else:
                    patience += 1
            else:
                lastLoss = loss
                patience = 0

            epoch = epoch + 1

        # NOTE(review): mode=True re-enables training mode before the
        # validation predictions below -- confirm eval mode (mode=False)
        # was not intended.
        model.train(mode=True)

        endTime = time.time()

        runSpecification['runtime'] = endTime - startTime
        runSpecification['epoch'] = epoch

        # Threshold the network outputs at 0.5 for hard 0/1 predictions.
        yValidatePredicted = model(xValidate)
        validAccuracy = EvaluateBinaryClassification.Accuracy(yValidate, [ 1 if pred > 0.5 else 0 for pred in yValidatePredicted ])
        runSpecification['accuracy'] = validAccuracy

        # Half-width of the 50%-confidence accuracy interval.
        num_samples = len(xValidate)
        (low_bound, high_bound) = ErrorBounds.GetAccuracyBounds(validAccuracy, num_samples, 0.5)
        errorBound = (high_bound - low_bound) / 2
        runSpecification['50PercentBound'] = errorBound

        return runSpecification
    print(" %d - %s" % (yTrain[i], xTrainRaw[i]))

# Now we'll do our first 'machine learning' using a very simple 'algorithm'.
import MachineLearningCourse.MLUtilities.Learners.MostCommonClassModel as MostCommonClassModel

# Baseline model: always predicts the most common class in the training set.
model = MostCommonClassModel.MostCommonClassModel()

# go read the ModelMostCommon code to see what model.fit does
model.fit(xTrainRaw, yTrain)

print("\n- Inspect the model -")
model.visualize()

# model.predict takes the 'features' (in this case the raw strings) and returns a parallel array of predictions (in this case the most common label in the training set).
yTrainPredicted = model.predict(xTrainRaw)

# look at a few of the predictions, along with the correct labels and the raw x data.
print(
    "\n- Inspect a few predictions [ <predicted> (<true label>) - <raw string> ] -"
)
for i in range(5):
    print("%d (%d) - %s" % (yTrainPredicted[i], yTrain[i], xTrainRaw[i]))

import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
# Training-set accuracy of the baseline.
trainSetAccuracy = EvaluateBinaryClassification.Accuracy(
    yTrain, yTrainPredicted)

print("\n---")
print(
    "Predicting the most common class gives: %.2f accuracy on the training set."
    % (trainSetAccuracy))
예제 #13
0
import MachineLearningCourse.MLUtilities.Learners.MostCommonClassModel as MostCommonClassModel

# Baseline model: always predicts the most common class in the training set.
model = MostCommonClassModel.MostCommonClassModel()

# go read the ModelMostCommon code to see what model.fit does
model.fit(xTrainRaw, yTrain)

print("\n- Inspect the model -")
model.visualize()

# model.predict takes the 'features' (in this case the raw strings) and returns a parallel array of predictions (in this case the most common label in the training set).
yTrainPredicted = model.predict(xTrainRaw)

# look at a few of the predictions, along with the correct labels and the raw x data.
print(
    "\n- Inspect a few predictions [ <predicted> (<true label>) - <raw string> ] -"
)
for i in range(5):
    print("%d (%d) - %s" % (yTrainPredicted[i], yTrain[i], xTrainRaw[i]))

import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
# Training-set accuracy of the baseline.
trainSetAccuracy = EvaluateBinaryClassification.Accuracy(
    yTrain, yTrainPredicted)

print("\n---")
print(
    "Predicting the most common class gives: %.2f accuracy on the training set."
    % (trainSetAccuracy))

yValidatePredicted = model.predict(xValidateRaw)
# Run all the binary-classification evaluations on the validation set.
EvaluateBinaryClassification.ExecuteAll(yValidate, yValidatePredicted)
예제 #14
0
    xTrain    = featurizer.Featurize(xTrainRaw)
    xValidate = featurizer.Featurize(xValidateRaw)
    xTest     = featurizer.Featurize(xTestRaw)

    # Tune the number of boosting rounds (k) for depth-1 boosted trees,
    # tracking train/validation accuracy and 50%-confidence error bars.
    bestModelBT = None
    kValues = [1, 10, 50, 100, 150]
    maxDepth = 1
    validationAccuracies = []
    validationAccuracyErrorBounds = []
    trainingAccuracies = []
    trainingAccuracyErrorBounds = []
    for kv in kValues:
        model = BoostedTree.BoostedTree()
        model.fit(xTrain, yTrain, maxDepth=maxDepth, k=kv)
        validationAccuracy = EvaluateBinaryClassification.Accuracy(yValidate, model.predict(xValidate))
        lower, upper = ErrorBounds.GetAccuracyBounds(validationAccuracy, len(yValidate), .5)
        trainingAccuracy = EvaluateBinaryClassification.Accuracy(yTrain, model.predict(xTrain))
        lowerTrain, upperTrain = ErrorBounds.GetAccuracyBounds(trainingAccuracy, len(yTrain), .5)

        validationAccuracies.append(validationAccuracy)
        validationAccuracyErrorBounds.append(validationAccuracy-lower)
        trainingAccuracies.append(trainingAccuracy)
        trainingAccuracyErrorBounds.append(trainingAccuracy-lowerTrain)

        print("k: ", kv, " accuracy: ", lower, "-", upper)
        # bestModelBT is (model, lower, upper, kv); a challenger replaces
        # it only when its lower bound exceeds the incumbent's upper
        # bound -- NOTE(review): confirm that comparing against [2]
        # (the upper bound) rather than [1] is intended.
        if bestModelBT is None:
            bestModelBT = (model, lower, upper, kv)
        elif lower > bestModelBT[2]:
            bestModelBT = (model, lower, upper, kv)
예제 #15
0
import MachineLearningCourse.MLUtilities.Visualizations.Visualize2D as Visualize2D

## this code outputs the true concept.
visualize = Visualize2D.Visualize2D(kOutputDirectory, "4-Generated Concept")
visualize.Plot2DDataAndBinaryConcept(xTest,yTest,concept)
visualize.Save()

# Tune the number of boosting rounds (k) for depth-1 boosted trees on the
# generated concept, tracking test accuracy with 50%-confidence error bars.
bestModel = None
kValues = [1, 10, 25, 50, 100]
maxDepth = 1
accuracies = []
errorBarsAccuracy = []
for kv in kValues:
    model = BoostedTree.BoostedTree()
    model.fit(xTrain, yTrain, maxDepth=maxDepth, k=kv)
    accuracy = EvaluateBinaryClassification.Accuracy(yTest, model.predict(xTest))
    lower, upper = ErrorBounds.GetAccuracyBounds(accuracy, len(yTest), .5)
    print(kv, ": ", accuracy)
    accuracies.append(accuracy)
    errorBarsAccuracy.append(accuracy-lower)
    # bestModel is (model, upper); a challenger replaces it when its
    # lower bound exceeds the incumbent's upper bound.
    if bestModel is None:
        bestModel = (model, upper)
    elif lower > bestModel[1]:
        bestModel = (model, upper)

Charting.PlotSeriesWithErrorBars([accuracies], [errorBarsAccuracy], ["k-round tuning accuracy"], kValues, chartTitle="Line/Circle Concept Accuracy", xAxisTitle="Boosting Rounds", yAxisTitle="Test Accuracy", yBotLimit=0.5, outputDirectory=kOutputDirectory, fileName="4-BoostingTreeRoundTuning")

## you can use this to visualize what your model is learning.
# Report the best model's 95%-confidence accuracy interval on the test set.
accuracy = EvaluateBinaryClassification.Accuracy(yTest, bestModel[0].predict(xTest))
lower, upper = ErrorBounds.GetAccuracyBounds(accuracy, len(yTest), .95)
print("accuracy: ", lower, "-", upper)
예제 #16
0
xTrain = featurizer.Featurize(xTrainRaw)
xValidate = featurizer.Featurize(xValidateRaw)
xTest = featurizer.Featurize(xTestRaw)

# Inspect the first few featurized training samples.
for i in range(10):
    print("%d - " % (yTrain[i]), xTrain[i])

############################
import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds
import MachineLearningCourse.MLUtilities.Learners.MostCommonClassModel as MostCommonClassModel

# Baseline: most-common-class model, scored on the validation set with a
# 95%-confidence accuracy interval.
model = MostCommonClassModel.MostCommonClassModel()
model.fit(xTrain, yTrain)
yValidatePredicted = model.predict(xValidate)
validateAccuracy = EvaluateBinaryClassification.Accuracy(
    yValidate, yValidatePredicted)
errorBounds = ErrorBounds.GetAccuracyBounds(validateAccuracy, len(yValidate),
                                            0.95)

print()
print(
    "### 'Most Common Class' model validate set accuracy: %.4f (95%% %.4f - %.4f)"
    % (validateAccuracy, errorBounds[0], errorBounds[1]))

import MachineLearningCourse.MLUtilities.Data.CrossValidation as CrossValidation
import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression
import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting
import time
import numpy as np

    # Featurize the validation and test splits with the existing vocabulary.
    xValidate = featurizer.Featurize(xValidateRaw)
    xTest = featurizer.Featurize(xTestRaw)

    frequentModel = LogisticRegression.LogisticRegression()
    frequentModel.fit(xTrain,
                      yTrain,
                      convergence=convergence,
                      stepSize=stepSize,
                      verbose=True)

    ######
    ### Use equation 5.1 from Mitchell to bound the validation set error and the true error
    import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

    print("Logistic regression with 25 features by mutual information:")
    validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
        yValidate, frequentModel.predict(xValidate))
    print("Validation set accuracy: %.4f." % (validationSetAccuracy))
    # Accuracy bounds at several confidence levels.
    # NOTE(review): "%.2f%%" prints the confidence as a fraction
    # (e.g. "0.50%"), not a percentage -- confirm the intended display.
    for confidence in [.5, .8, .9, .95, .99]:
        (lowerBound,
         upperBound) = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                     len(xValidate),
                                                     confidence)
        print(" %.2f%% accuracy bound: %.4f - %.4f" %
              (confidence, lowerBound, upperBound))

    ### Compare to most common class model here...
    mostCommonModel = MostCommonClassModel.MostCommonClassModel()
    mostCommonModel.fit(xTrain, yTrain)

    print("MostCommon regression model:")
    validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
                            supplementalVocabularyWords=['call', 'to', 'your'])

# Apply the featurizer to the raw data sets to produce feature vectors. In this case, each message will be converted to an array
#  with one bit per feature that is 1 if the message has the feature, and 0 if the message does not have the feature.
xTrain = featurizer.Featurize(xTrainRaw)
xValidate = featurizer.Featurize(xValidateRaw)
xTest = featurizer.Featurize(xTestRaw)

print("\n - Inspect the features -")
for i in range(len(xTrain[0])):
    print(featurizer.GetFeatureInfo(i))

print("\n - Inspect feature values for a few training samples -")
for i in range(5):
    print(yTrain[i], "-", xTrain[i], xTrainRaw[i])

# Now let's up our modeling game (as compared to predicting the most common class)
#  we'll use a heuristic (hand-tuned) linear model.
import MachineLearningCourse.MLUtilities.Learners.LinearHeuristicModel as LinearHeuristicModel
model = LinearHeuristicModel.LinearHeuristicModel()

# Fit with a hand-picked bias of -1.0 and one hand-picked weight per feature.
model.fit(xTrain, yTrain, -1.0, [.75, .75, .75, .25, .25])

print("\n - Inspect the weights on the heuristically-tuned model -")
model.visualize()

yValidatePredicted = model.predict(xValidate)

import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
# Run all the binary-classification evaluations on the validation set.
EvaluateBinaryClassification.ExecuteAll(yValidate, yValidatePredicted)
    # Learn the logistic regression model

    print("Learning the logistic regression model:")
    import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression
    logisticRegressionModel = LogisticRegression.LogisticRegression()

    logisticRegressionModel.fit(xTrain,
                                yTrain,
                                stepSize=1.0,
                                convergence=0.005)

    #############################
    # Evaluate the model

    import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification

    print("\nLogistic regression model:")
    logisticRegressionModel.visualize()
    # Evaluate on the validation set at the default 0.5 threshold.
    EvaluateBinaryClassification.ExecuteAll(
        yValidate,
        logisticRegressionModel.predict(xValidate,
                                        classificationThreshold=0.5))

    #################
    # You may find the following module helpful for making charts. You'll have to install matplotlib (see the lecture notes).
    #
    # import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting
    #
    # # trainLosses, validationLosses, and lossXLabels are parallel arrays with the losses you want to plot at the specified x coordinates
    #
    # Charting.PlotSeries([trainLosses, validationLosses], ['Train', 'Validate'], lossXLabels, chartTitle="Logistic Regression", xAxisTitle="Gradient Descent Steps", yAxisTitle="Avg. Loss", outputDirectory=kOutputDirectory, fileName="3-Logistic Regression Train vs Validate loss")
예제 #20
0
    # Featurize the validation and test splits with the existing vocabulary.
    xValidate = featurizer.Featurize(xValidateRaw)
    xTest = featurizer.Featurize(xTestRaw)

    frequentModel = LogisticRegression.LogisticRegression()
    frequentModel.fit(xTrain,
                      yTrain,
                      convergence=convergence,
                      stepSize=stepSize,
                      verbose=True)

    ######
    ### Use equation 5.1 from Mitchell to bound the validation set error and the true error
    import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

    print("Logistic regression with 25 features by mutual information:")
    validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
        yValidate, frequentModel.predict(xValidate))
    print("Validation set accuracy: %.4f." % (validationSetAccuracy))
    # Accuracy bounds at several confidence levels.
    # NOTE(review): "%.2f%%" prints the confidence as a fraction
    # (e.g. "0.50%"), not a percentage -- confirm the intended display.
    for confidence in [.5, .8, .9, .95, .99]:
        (lowerBound,
         upperBound) = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                     len(xValidate),
                                                     confidence)
        print(" %.2f%% accuracy bound: %.4f - %.4f" %
              (confidence, lowerBound, upperBound))

    ### Compare to most common class model here...

# Set this to true when you've completed the previous steps and are ready to move on...
doCrossValidation = False
if doCrossValidation:
    import MachineLearningCourse.MLUtilities.Data.CrossValidation as CrossValidation