def TabulateModelPerformanceForROC(model, xValidate, yValidate):
    """Sweep classification thresholds across [0.0, 1.0] and tabulate the
    false-positive and false-negative rates at each threshold.

    model: must expose predict(x, classificationThreshold=t) for t in [0, 1].
    xValidate / yValidate: validation features and true labels.

    Returns (FPRs, FNRs, thresholds) — three parallel lists suitable for
    plotting an ROC curve.

    Raises UserWarning if the model's predict does not support the
    classificationThreshold parameter.
    """
    pointsToEvaluate = 100
    thresholds = [
        x / float(pointsToEvaluate) for x in range(pointsToEvaluate + 1)
    ]
    FPRs = []
    FNRs = []

    try:
        for threshold in thresholds:
            # Fix: predict once per threshold and reuse the result for both
            # rates — the original called model.predict twice with identical
            # arguments, doubling the (potentially expensive) work.
            predictions = model.predict(xValidate,
                                        classificationThreshold=threshold)
            FPRs.append(
                EvaluateBinaryClassification.FalsePositiveRate(
                    yValidate, predictions))
            FNRs.append(
                EvaluateBinaryClassification.FalseNegativeRate(
                    yValidate, predictions))
    except NotImplementedError:
        raise UserWarning(
            "The 'model' parameter must have a 'predict' method that supports using a 'classificationThreshold' parameter with range [ 0 - 1.0 ] to create classifications."
        )

    return (FPRs, FNRs, thresholds)
def ExecuteEvaluationRun(runSpecification, xTrainRaw, yTrain, numberOfFolds=2):
    """Featurize the raw SMS data and evaluate one hyperparameter setting.

    runSpecification: dict with 'numFrequentWords', 'numMutualInformationWords',
        'convergence' and 'stepSize'; results are written back into this dict.
    xTrainRaw / yTrain: raw training messages and their labels.
    numberOfFolds: >1 runs k-fold cross validation on the training set;
        ==1 trains on the full training set and scores the validation set.

    Returns the (mutated) runSpecification.

    NOTE(review): xValidateRaw and — in the single-fold branch — yValidate are
    read from module scope; confirm the surrounding script defines them.
    """
    print("runSpecification: ", runSpecification)
    startTime = time.time()

    # Build the vocabulary with this run's feature-selection settings.
    featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
    featurizer.CreateVocabulary(
        xTrainRaw,
        yTrain,
        numFrequentWords=runSpecification['numFrequentWords'],
        numMutualInformationWords=runSpecification['numMutualInformationWords'])

    xTrain = featurizer.Featurize(xTrainRaw)
    xValidate = featurizer.Featurize(xValidateRaw)

    if numberOfFolds > 1:
        crossValidationAccuracy = []
        for i in range(numberOfFolds):
            xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                xTrain, yTrain, numberOfFolds, i)

            model = LogisticRegression.LogisticRegression()
            model.fit(xTrainI,
                      yTrainI,
                      convergence=runSpecification['convergence'],
                      stepSize=runSpecification['stepSize'],
                      verbose=False)
            crossValidationAccuracy.append(
                EvaluateBinaryClassification.Accuracy(
                    yEvaluateI, model.predict(xEvaluateI)))

        mean = np.mean(crossValidationAccuracy)
        runSpecification['crossValidationMean'] = mean
        # 50% confidence bound, using the size of the last evaluation fold.
        lower, _ = ErrorBounds.GetAccuracyBounds(
            np.mean(crossValidationAccuracy), len(yEvaluateI), .5)
        runSpecification['crossValidationErrorBound'] = mean - lower

    if numberOfFolds == 1:
        model = LogisticRegression.LogisticRegression()
        model.fit(xTrain,
                  yTrain,
                  convergence=runSpecification['convergence'],
                  stepSize=runSpecification['stepSize'],
                  verbose=False)

        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
            yValidate, model.predict(xValidate))
        runSpecification['accuracy'] = validationSetAccuracy
        lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                 len(yValidate), .5)
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lower

    endTime = time.time()
    # Fix: record the runtime for every run. The original only stored it when
    # numberOfFolds > 1, leaving single-fold runs without a 'runtime' key.
    runSpecification['runtime'] = endTime - startTime

    return runSpecification
def ExecuteEvaluationRun(runSpecification, xTrainRaw, yTrain, numberOfFolds=5):
    """Train and score one logistic-regression spam model for the
    hyperparameters in runSpecification, writing 'accuracy' and 'runtime'
    back into the dict before returning it.

    NOTE(review): xValidateRaw and yValidate are read from module scope;
    confirm the surrounding script defines them.
    """
    startTime = time.time()

    # HERE upgrade this to use crossvalidation

    # Build the vocabulary for this run's feature-count settings.
    featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
    featurizer.CreateVocabulary(
        xTrainRaw,
        yTrain,
        numFrequentWords=runSpecification['numFrequentWords'],
        numMutualInformationWords=runSpecification['numMutualInformationWords'])

    xTrain = featurizer.Featurize(xTrainRaw)
    xValidate = featurizer.Featurize(xValidateRaw)

    model = LogisticRegression.LogisticRegression()
    model.fit(xTrain,
              yTrain,
              convergence=runSpecification['convergence'],
              stepSize=runSpecification['stepSize'],
              verbose=True)

    # Score on the held-out validation set.
    runSpecification['accuracy'] = EvaluateBinaryClassification.Accuracy(
        yValidate, model.predict(xValidate))

    # HERE upgrade this to calculate and save some error bounds...

    runSpecification['runtime'] = time.time() - startTime

    return runSpecification
def ExecuteEvaluationRun(runSpecification, xTrain, yTrain, numberOfFolds=2):
    """Evaluate one logistic-regression hyperparameter setting on
    pre-featurized data.

    runSpecification: dict with 'convergence' and 'stepSize'; results are
        written back into this dict.
    xTrain / yTrain: featurized training data and labels.
    numberOfFolds: >1 runs k-fold cross validation; ==1 trains on the full
        training set and scores the (module-level) validation set.

    Returns the (mutated) runSpecification.

    NOTE(review): the single-fold branch reads xValidate and yValidate from
    module scope; confirm the surrounding script defines them.
    """
    print("runSpecification: ", runSpecification)
    startTime = time.time()

    if numberOfFolds > 1:
        crossValidationAccuracy = []
        for i in range(numberOfFolds):
            xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                xTrain, yTrain, numberOfFolds, i)

            model = LogisticRegression.LogisticRegression()
            model.fit(xTrainI,
                      yTrainI,
                      convergence=runSpecification['convergence'],
                      stepSize=runSpecification['stepSize'],
                      verbose=False)
            crossValidationAccuracy.append(
                EvaluateBinaryClassification.Accuracy(
                    yEvaluateI, model.predict(xEvaluateI)))

        mean = np.mean(crossValidationAccuracy)
        runSpecification['crossValidationMean'] = mean
        # 50% confidence bound, using the size of the last evaluation fold.
        lower, _ = ErrorBounds.GetAccuracyBounds(
            np.mean(crossValidationAccuracy), len(yEvaluateI), .5)
        runSpecification['crossValidationErrorBound'] = mean - lower

    if numberOfFolds == 1:
        model = LogisticRegression.LogisticRegression()
        model.fit(xTrain,
                  yTrain,
                  convergence=runSpecification['convergence'],
                  stepSize=runSpecification['stepSize'],
                  verbose=False)

        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
            yValidate, model.predict(xValidate))
        runSpecification['accuracy'] = validationSetAccuracy
        lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                 len(yValidate), .5)
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lower

    endTime = time.time()
    # Fix: record the runtime for every run. The original only stored it when
    # numberOfFolds > 1, leaving single-fold runs without a 'runtime' key.
    runSpecification['runtime'] = endTime - startTime

    return runSpecification
def ExecuteEvaluationRun(runSpecification, xTrain, yTrain, numberOfFolds=2):
    """Evaluate one decision-tree hyperparameter setting ('maxDepth'),
    either by k-fold cross validation (numberOfFolds > 1) or by a single
    train/validate split (numberOfFolds == 1). Results and the wall-clock
    runtime are written back into runSpecification, which is returned.

    NOTE(review): the single-fold branch reads xValidate and yValidate from
    module scope; confirm the surrounding script defines them.
    """
    print("runSpecification: ", runSpecification)
    startTime = time.time()

    if numberOfFolds > 1:
        foldAccuracies = []
        for foldId in range(numberOfFolds):
            (xTrainFold, yTrainFold, xEvaluateFold,
             yEvaluateFold) = CrossValidation.CrossValidation(
                 xTrain, yTrain, numberOfFolds, foldId)

            foldModel = DecisionTree.DecisionTree()
            foldModel.fit(xTrainFold,
                          yTrainFold,
                          maxDepth=runSpecification["maxDepth"])
            foldAccuracies.append(
                EvaluateBinaryClassification.Accuracy(
                    yEvaluateFold, foldModel.predict(xEvaluateFold)))

        meanAccuracy = np.mean(foldAccuracies)
        runSpecification['crossValidationMean'] = meanAccuracy
        # 95% bound computed from the size of the last evaluation fold.
        lowerBound, _ = ErrorBounds.GetAccuracyBounds(np.mean(foldAccuracies),
                                                      len(yEvaluateFold), .95)
        runSpecification['crossValidationErrorBound'] = meanAccuracy - lowerBound

    if numberOfFolds == 1:
        model = DecisionTree.DecisionTree()
        model.fit(xTrain, yTrain, maxDepth=runSpecification["maxDepth"])

        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
            yValidate, model.predict(xValidate))
        lowerBound, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                      len(yValidate), .95)

        runSpecification['accuracy'] = validationSetAccuracy
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lowerBound
        # Mirror the cross-validation keys so downstream consumers can read
        # the same fields regardless of fold count.
        runSpecification['crossValidationMean'] = validationSetAccuracy
        runSpecification['crossValidationErrorBound'] = validationSetAccuracy - lowerBound

    runSpecification['runtime'] = time.time() - startTime

    return runSpecification
def trainModel(m, op, maxE, patience=10, saveChartName=""):
    """Train torch model m with optimizer op using full-batch gradient
    descent, with patience-based early stopping on the validation loss.

    m: torch module producing probabilities (scored with BCELoss).
    op: torch optimizer over m's parameters.
    maxE: epoch limit (the loop runs while epoch < maxE).
    patience: consecutive validation-loss increases tolerated before stopping.
    saveChartName: when non-empty, a loss/epoch chart is written to
        kOutputDirectory as "4-" + saveChartName.

    Returns validation-set accuracy with predictions thresholded at 0.5.

    NOTE(review): xTrain, yTrain, xValidate, yValidate, Charting,
    kOutputDirectory and EvaluateBinaryClassification are read from module
    scope — confirm the surrounding script defines them.
    """
    startTime = time.time()

    lossFunction = torch.nn.BCELoss(reduction='mean')

    trainLosses = []
    validationLosses = []

    converged = False
    epoch = 1
    lastValidationLoss = None
    currPatience = 0

    while not converged and epoch < maxE:
        # One full-batch forward/backward pass and optimizer step.
        op.zero_grad()
        yTrainPredicted = m(xTrain)
        trainLoss = lossFunction(yTrainPredicted, yTrain)
        trainLoss.backward()
        op.step()

        # Switch to inference mode to measure losses for this epoch.
        m.train(mode=False)
        yValidationPredicted = m(xValidate)
        validationLoss = lossFunction(yValidationPredicted, yValidate)

        yTrainingPredicted = m(xTrain)
        trainLoss = lossFunction(yTrainingPredicted, yTrain)

        # NOTE(review): BCELoss(reduction='mean') is already an average, so
        # dividing by the sample count again rescales the recorded losses;
        # kept as-is since the chart consumers expect these values.
        trainLosses.append(trainLoss.item() / len(yTrain))
        validationLosses.append(validationLoss.item() / len(yValidate))

        print("epoch {}: training loss {}, validation loss {}".format(
            epoch, trainLosses[-1], validationLosses[-1]))

        # Fix: the original unconditionally assigned
        #   lastValidationLoss = validationLoss
        # immediately before this comparison (a leftover from a commented-out
        # refactor), which made the condition always false and disabled early
        # stopping entirely. The assignment now happens only in the
        # improved-loss branch below.
        if lastValidationLoss is not None and validationLoss > lastValidationLoss:
            if currPatience < patience:
                currPatience += 1
            else:
                converged = True
        else:
            lastValidationLoss = validationLoss
            currPatience = 0

        epoch = epoch + 1
        m.train(mode=True)

    endTime = time.time()
    print("Runtime: %s" % (endTime - startTime))

    ##
    # Visualize Training run
    ##
    if saveChartName != "":
        xValues = [i + 1 for i in range(len(trainLosses))]
        Charting.PlotSeries([trainLosses, validationLosses],
                            ["Train Loss", "Validate Loss"],
                            xValues,
                            useMarkers=False,
                            chartTitle="Blink LeNet Model Loss/Epoch",
                            xAxisTitle="Epoch",
                            yAxisTitle="Loss",
                            yBotLimit=0.0,
                            outputDirectory=kOutputDirectory,
                            fileName="4-" + saveChartName)

    ##
    # Get the model accuracy on validation set
    ##
    # Fix: the original called model.train(mode=False) — a module-level name —
    # instead of switching the parameter 'm' to inference mode.
    m.train(mode=False)

    yValidatePredicted = m(xValidate)
    return EvaluateBinaryClassification.Accuracy(
        yValidate, [1 if pred > 0.5 else 0 for pred in yValidatePredicted])
convergence=convergence, momentum=momentum) if (i + 1) % 100 == 0: for filterNumber in range(hiddenStructure[0]): ## update the first parameter based on your representation #VisualizeWeights([model.weight0[0][filterNumber]] + list(model.layers[0][filterNumber][:]), "%s/filters/epoch%d_neuron%d.jpg" % (kOutputDirectory, i+1, filterNumber), sampleStride=sampleStride) VisualizeWeights([model.weight0[0][filterNumber]] + list(model.layers[0][filterNumber][:]), "%s/filters2/epoch%d_neuron%d.jpg" % (kOutputDirectory, i + 1, filterNumber), sampleStride=sampleStride) tLoss = model.loss(xTrain, yTrain) vLoss = model.loss(xValidate, yValidate) trainingLosses.append(tLoss) validationLosses.append(vLoss) import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting #Charting.PlotSeries([trainingLosses, validationLosses], ["training loss", "validation loss"], list(range(maxEpochs)), chartTitle="Single Layer Loss", xAxisTitle="epochs", yAxisTitle="loss", outputDirectory=kOutputDirectory+"/visualize\\", fileName="2-SingleLayerModelLoss") Charting.PlotSeries([trainingLosses, validationLosses], ["training loss", "validation loss"], list(range(maxEpochs)), chartTitle="Two Layer Loss", xAxisTitle="epochs", yAxisTitle="loss", outputDirectory=kOutputDirectory + "/visualize\\", fileName="2-TwoLayerModelLoss") # Evaluate things... accuracy = EvaluateBinaryClassification.Accuracy(yValidate, model.predict(xValidate)) print("Model Accuracy is:", accuracy)
import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting

# Chart the per-epoch train/validation losses collected during training.
# NOTE(review): trainLosses, validationLosses, kOutputDirectory, model, xTest
# and yTest are defined earlier in the script, outside this chunk.
xValues = [i + 1 for i in range(len(trainLosses))]

Charting.PlotSeries([trainLosses, validationLosses],
                    ["Train Loss", "Validate Loss"],
                    xValues,
                    useMarkers=False,
                    chartTitle="Pytorch First Modeling Run",
                    xAxisTitle="Epoch",
                    yAxisTitle="Loss",
                    yBotLimit=0.0,
                    outputDirectory=kOutputDirectory,
                    fileName="PyTorch-Initial-TrainValidate")

##
# Evaluate the Model
##
import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

# Switch the network to inference mode before scoring the test set.
model.train(mode=False)

yTestPredicted = model(xTest)

# Threshold predicted probabilities at 0.5 to get hard classifications.
testAccuracy = EvaluateBinaryClassification.Accuracy(
    yTest, [1 if pred > 0.5 else 0 for pred in yTestPredicted])

print("Accuracy simple:", testAccuracy,
      ErrorBounds.GetAccuracyBounds(testAccuracy, len(yTestPredicted), 0.95))
# Featurize all three splits with the numeric featurizer built earlier.
# NOTE(review): featurizerNumeric, xTrainRaw/xValidateRaw/xTestRaw, xTrain,
# xValidate, yTrain, yValidate and DecisionTree come from earlier in the
# script, outside this chunk.
xTrainNumeric = featurizerNumeric.Featurize(xTrainRaw)
xValidateNumeric = featurizerNumeric.Featurize(xValidateRaw)
xTestNumeric = featurizerNumeric.Featurize(xTestRaw)

############################
import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds
import MachineLearningCourse.MLUtilities.Data.CrossValidation as CrossValidation
import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting
import time
import numpy as np

# Decision tree on the numeric feature set.
model = DecisionTree.DecisionTree()
model.fit(xTrainNumeric, yTrain)
validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
    yValidate, model.predict(xValidateNumeric))
print("numericvalidationSetAccuracy: ", validationSetAccuracy)
#model.visualize()

# Decision tree on the default (bag-of-words) feature set, for comparison.
model = DecisionTree.DecisionTree()
model.fit(xTrain, yTrain)
validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
    yValidate, model.predict(xValidate))
print("validationSetAccuracy: ", validationSetAccuracy)
#model.visualize()


# A helper function for calculating FN rate and FP rate across a range of thresholds
# NOTE(review): this definition is cut off at the end of this chunk — the
# remainder of its body lies outside the visible source.
def TabulateModelPerformanceForROC(model, xValidate, yValidate):
    pointsToEvaluate = 100
    thresholds = [
if epoch > 10 and lastLoss != None and abs(lastLoss - loss) < convergence: if patience > 4: converged = True pass else: patience += 1 else: lastLoss = loss patience = 0 epoch = epoch + 1 model.train(mode=True) # Check accuracies torch_accuracy = EvaluateBinaryClassification.Accuracy(yTrain,model.predict(xTrain)) print("Training accuracy: " + str(torch_accuracy)) torch_accuracy = EvaluateBinaryClassification.Accuracy(yValidate,model.predict(xValidate)) print("Validation accuracy: " + str(torch_accuracy)) # Calculate ROC curve (modelFPRs, modelFNRs, thresholds) = TabulateModelPerformanceForROC(model, xValidate, yValidate) FNRs_series.append(modelFNRs) FPRs_series.append(modelFPRs) Label_series.append("PyTorch") #### Include 3x3 Grid Features print("Moving on to 3x3 features...") # Featureize
def ExecuteFitting(runSpecification, xTrain, yTrain, xValidate, yValidate): startTime = time.time() # Create features and train based on type of model # Create the model model = BlinkNeuralNetwork.BlinkNeuralNetwork(hiddenNodes = 6, hiddenNodesTwo = 4) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("Device is:", device) model.to(device) # Move the data onto whichever device was selected xTrain = xTrain.to(device) yTrain = yTrain.to(device) xValidate = xValidate.to(device) yValidate = yValidate.to(device) converged = False epoch = 1 lastLoss = None convergence = runSpecification['convergence'] optimizer = torch.optim.SGD(model.parameters(), lr=runSpecification['learning_rate']) lossFunction = torch.nn.MSELoss(reduction='mean') patience = 0 while not converged and epoch < 5000: # Do the forward pass yTrainPredicted = model(xTrain) trainLoss = lossFunction(yTrainPredicted, yTrain) # Reset the gradients in the network to zero optimizer.zero_grad() # Backprop the errors from the loss on this iteration trainLoss.backward() # Do a weight update step optimizer.step() loss = trainLoss.item() # print(loss) if epoch > 10 and lastLoss != None and abs(lastLoss - loss) < convergence: if patience >= 0: converged = True pass else: patience += 1 else: lastLoss = loss patience = 0 epoch = epoch + 1 model.train(mode=True) endTime = time.time() runSpecification['runtime'] = endTime - startTime runSpecification['epoch'] = epoch yValidatePredicted = model(xValidate) validAccuracy = EvaluateBinaryClassification.Accuracy(yValidate, [ 1 if pred > 0.5 else 0 for pred in yValidatePredicted ]) runSpecification['accuracy'] = validAccuracy num_samples = len(xValidate) (low_bound, high_bound) = ErrorBounds.GetAccuracyBounds(validAccuracy, num_samples, 0.5) errorBound = (high_bound - low_bound) / 2 runSpecification['50PercentBound'] = errorBound return runSpecification
print(" %d - %s" % (yTrain[i], xTrainRaw[i])) # Now we'll do our first 'machine learning' using a very simple 'algorithm'. import MachineLearningCourse.MLUtilities.Learners.MostCommonClassModel as MostCommonClassModel model = MostCommonClassModel.MostCommonClassModel() # go read the ModelMostCommon code to see what model.fit does model.fit(xTrainRaw, yTrain) print("\n- Inspect the model -") model.visualize() # model.predict takes the 'features' (in this case the raw strings) and returns a parallel array of preditions (in this case the most common label in the training set). yTrainPredicted = model.predict(xTrainRaw) # look at a few of the predictions, along with the correct labels and the raw x data. print( "\n- Inspect a few predictions [ <predicted> (<true label>) - <raw string> ] -" ) for i in range(5): print("%d (%d) - %s" % (yTrainPredicted[i], yTrain[i], xTrainRaw[i])) import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification trainSetAccuracy = EvaluateBinaryClassification.Accuracy( yTrain, yTrainPredicted) print("\n---") print( "Predicting the most common class gives: %.2f accuracy on the training set." % (trainSetAccuracy))
import MachineLearningCourse.MLUtilities.Learners.MostCommonClassModel as MostCommonClassModel

# Baseline model: always predict the most common training-set class.
model = MostCommonClassModel.MostCommonClassModel()

# go read the ModelMostCommon code to see what model.fit does
model.fit(xTrainRaw, yTrain)

print("\n- Inspect the model -")
model.visualize()

# model.predict takes the 'features' (in this case the raw strings) and returns a parallel array of preditions (in this case the most common label in the training set).
yTrainPredicted = model.predict(xTrainRaw)

# look at a few of the predictions, along with the correct labels and the raw x data.
print(
    "\n- Inspect a few predictions [ <predicted> (<true label>) - <raw string> ] -"
)
for i in range(5):
    print("%d (%d) - %s" % (yTrainPredicted[i], yTrain[i], xTrainRaw[i]))

import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification

trainSetAccuracy = EvaluateBinaryClassification.Accuracy(
    yTrain, yTrainPredicted)
print("\n---")
print(
    "Predicting the most common class gives: %.2f accuracy on the training set."
    % (trainSetAccuracy))

# Full evaluation of the baseline on the validation set.
# NOTE(review): xValidateRaw and yValidate are defined earlier in the script,
# outside this chunk.
yValidatePredicted = model.predict(xValidateRaw)
EvaluateBinaryClassification.ExecuteAll(yValidate, yValidatePredicted)
# Featurize all three splits with the vocabulary built earlier.
# NOTE(review): featurizer, the raw splits, labels and BoostedTree come from
# earlier in the script, outside this chunk.
xTrain = featurizer.Featurize(xTrainRaw)
xValidate = featurizer.Featurize(xValidateRaw)
xTest = featurizer.Featurize(xTestRaw)

# Sweep the number of boosting rounds (k) using depth-1 trees, tracking
# train/validation accuracy with 50%-confidence error bars for charting.
bestModelBT = None
kValues = [1, 10, 50, 100, 150]
maxDepth = 1
validationAccuracies = []
validationAccuracyErrorBounds = []
trainingAccuracies = []
trainingAccuracyErrorBounds = []
for kv in kValues:
    model = BoostedTree.BoostedTree()
    model.fit(xTrain, yTrain, maxDepth=maxDepth, k=kv)
    validationAccuracy = EvaluateBinaryClassification.Accuracy(yValidate, model.predict(xValidate))
    lower, upper = ErrorBounds.GetAccuracyBounds(validationAccuracy, len(yValidate), .5)
    trainingAccuracy = EvaluateBinaryClassification.Accuracy(yTrain, model.predict(xTrain))
    lowerTrain, upperTrain = ErrorBounds.GetAccuracyBounds(trainingAccuracy, len(yTrain), .5)
    # Error bars are the distance from the point estimate to the lower bound.
    validationAccuracies.append(validationAccuracy)
    validationAccuracyErrorBounds.append(validationAccuracy-lower)
    trainingAccuracies.append(trainingAccuracy)
    trainingAccuracyErrorBounds.append(trainingAccuracy-lowerTrain)
    print("k: ", kv, " accuracy: ", lower, "-", upper)
    # Replace the incumbent only when the new interval's lower bound exceeds
    # the incumbent's stored UPPER bound (bestModelBT[2]), i.e. the intervals
    # do not overlap.
    # NOTE(review): confirm comparing against the upper bound (index 2) rather
    # than the lower bound (index 1) is intended.
    if bestModelBT is None:
        bestModelBT = (model, lower, upper, kv)
    elif lower > bestModelBT[2]:
        bestModelBT = (model, lower, upper, kv)
import MachineLearningCourse.MLUtilities.Visualizations.Visualize2D as Visualize2D ## this code outputs the true concept. visualize = Visualize2D.Visualize2D(kOutputDirectory, "4-Generated Concept") visualize.Plot2DDataAndBinaryConcept(xTest,yTest,concept) visualize.Save() bestModel = None kValues = [1, 10, 25, 50, 100] maxDepth = 1 accuracies = [] errorBarsAccuracy = [] for kv in kValues: model = BoostedTree.BoostedTree() model.fit(xTrain, yTrain, maxDepth=maxDepth, k=kv) accuracy = EvaluateBinaryClassification.Accuracy(yTest, model.predict(xTest)) lower, upper = ErrorBounds.GetAccuracyBounds(accuracy, len(yTest), .5) print(kv, ": ", accuracy) accuracies.append(accuracy) errorBarsAccuracy.append(accuracy-lower) if bestModel is None: bestModel = (model, upper) elif lower > bestModel[1]: bestModel = (model, upper) Charting.PlotSeriesWithErrorBars([accuracies], [errorBarsAccuracy], ["k-round tuning accuracy"], kValues, chartTitle="Line/Circle Concept Accuracy", xAxisTitle="Boosting Rounds", yAxisTitle="Test Accuracy", yBotLimit=0.5, outputDirectory=kOutputDirectory, fileName="4-BoostingTreeRoundTuning") ## you can use this to visualize what your model is learning. accuracy = EvaluateBinaryClassification.Accuracy(yTest, bestModel[0].predict(xTest)) lower, upper = ErrorBounds.GetAccuracyBounds(accuracy, len(yTest), .95) print("accuracy: ", lower, "-", upper)
# Featurize all three splits with the vocabulary built earlier.
# NOTE(review): featurizer, the raw splits and the label arrays come from
# earlier in the script, outside this chunk.
xTrain = featurizer.Featurize(xTrainRaw)
xValidate = featurizer.Featurize(xValidateRaw)
xTest = featurizer.Featurize(xTestRaw)

# Spot-check a few featurized samples against their labels.
for i in range(10):
    print("%d - " % (yTrain[i]), xTrain[i])

############################
import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

import MachineLearningCourse.MLUtilities.Learners.MostCommonClassModel as MostCommonClassModel

# Baseline: always predict the most common training-set class.
model = MostCommonClassModel.MostCommonClassModel()
model.fit(xTrain, yTrain)
yValidatePredicted = model.predict(xValidate)

validateAccuracy = EvaluateBinaryClassification.Accuracy(
    yValidate, yValidatePredicted)
errorBounds = ErrorBounds.GetAccuracyBounds(validateAccuracy, len(yValidate),
                                            0.95)

print()
print(
    "### 'Most Common Class' model validate set accuracy: %.4f (95%% %.4f - %.4f)"
    % (validateAccuracy, errorBounds[0], errorBounds[1]))

import MachineLearningCourse.MLUtilities.Data.CrossValidation as CrossValidation
import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression
import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting
import time
import numpy as np
# Featurize the validation and test splits with the trained featurizer.
# NOTE(review): featurizer, xTrain, yTrain, yValidate, convergence and
# stepSize come from earlier in the script, outside this chunk.
xValidate = featurizer.Featurize(xValidateRaw)
xTest = featurizer.Featurize(xTestRaw)

frequentModel = LogisticRegression.LogisticRegression()
frequentModel.fit(xTrain,
                  yTrain,
                  convergence=convergence,
                  stepSize=stepSize,
                  verbose=True)

######
### Use equation 5.1 from Mitchell to bound the validation set error and the true error
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

print("Logistic regression with 25 features by mutual information:")
validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
    yValidate, frequentModel.predict(xValidate))
print("Validation set accuracy: %.4f." % (validationSetAccuracy))
for confidence in [.5, .8, .9, .95, .99]:
    (lowerBound,
     upperBound) = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                 len(xValidate), confidence)
    # NOTE(review): '%.2f%%' formats the raw fraction (e.g. '0.50%'); this
    # likely should print confidence * 100 — confirm before fixing.
    print(" %.2f%% accuracy bound: %.4f - %.4f" %
          (confidence, lowerBound, upperBound))

### Compare to most common class model here...
mostCommonModel = MostCommonClassModel.MostCommonClassModel()
mostCommonModel.fit(xTrain, yTrain)

print("MostCommon regression model:")
# NOTE(review): this statement is cut off at the end of this chunk — its
# remaining arguments lie outside the visible source.
validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
# NOTE(review): this chunk begins mid-call — the line below carries the
# trailing arguments of a featurizer vocabulary-creation call that starts
# before this chunk.
    supplementalVocabularyWords=['call', 'to', 'your'])

# Apply the featurerizer to the raw data sets to produce feature vectors. In this case, each message will be converted to an array
# with one bit per feature that is 1 if the message has the feature, and 0 if the message does not have the feature.
xTrain = featurizer.Featurize(xTrainRaw)
xValidate = featurizer.Featurize(xValidateRaw)
xTest = featurizer.Featurize(xTestRaw)

print("\n - Inspect the features -")
for i in range(len(xTrain[0])):
    print(featurizer.GetFeatureInfo(i))

print("\n - Inspect feature values for a few training samples -")
for i in range(5):
    print(yTrain[i], "-", xTrain[i], xTrainRaw[i])

# Now let's up our modeling game (as compared to predicting the most common class)
# we'll use a heuristic (hand-tuned) linear model.
import MachineLearningCourse.MLUtilities.Learners.LinearHeuristicModel as LinearHeuristicModel

model = LinearHeuristicModel.LinearHeuristicModel()
# Hand-picked bias (-1.0) and per-feature weights.
model.fit(xTrain, yTrain, -1.0, [.75, .75, .75, .25, .25])

print("\n - Inspect the weights on the heuristically-tuned model -")
model.visualize()

yValidatePredicted = model.predict(xValidate)

import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification

EvaluateBinaryClassification.ExecuteAll(yValidate, yValidatePredicted)
# Learn the logistic regression model print("Learning the logistic regression model:") import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression logisticRegressionModel = LogisticRegression.LogisticRegression() logisticRegressionModel.fit(xTrain, yTrain, stepSize=1.0, convergence=0.005) ############################# # Evaluate the model import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification print("\nLogistic regression model:") logisticRegressionModel.visualize() EvaluateBinaryClassification.ExecuteAll( yValidate, logisticRegressionModel.predict(xValidate, classificationThreshold=0.5)) ################# # You may find the following module helpful for making charts. You'll have to install matplotlib (see the lecture notes). # # import MachineLearningCourse.MLUtilities.Visualizations.Charting as Charting # # # trainLosses, validationLosses, and lossXLabels are parallel arrays with the losses you want to plot at the specified x coordinates # # Charting.PlotSeries([trainLosses, validationLosses], ['Train', 'Validate'], lossXLabels, chartTitle="Logistic Regression", xAxisTitle="Gradient Descent Steps", yAxisTitle="Avg. Loss", outputDirectory=kOutputDirectory, fileName="3-Logistic Regression Train vs Validate loss")
# Featurize the validation and test splits with the trained featurizer.
# NOTE(review): featurizer, xTrain, yTrain, yValidate, convergence and
# stepSize come from earlier in the script, outside this chunk.
xValidate = featurizer.Featurize(xValidateRaw)
xTest = featurizer.Featurize(xTestRaw)

frequentModel = LogisticRegression.LogisticRegression()
frequentModel.fit(xTrain,
                  yTrain,
                  convergence=convergence,
                  stepSize=stepSize,
                  verbose=True)

######
### Use equation 5.1 from Mitchell to bound the validation set error and the true error
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

print("Logistic regression with 25 features by mutual information:")
validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
    yValidate, frequentModel.predict(xValidate))
print("Validation set accuracy: %.4f." % (validationSetAccuracy))
for confidence in [.5, .8, .9, .95, .99]:
    (lowerBound,
     upperBound) = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                 len(xValidate), confidence)
    # Fix: scale the confidence level to a percentage for display; the
    # original passed the raw fraction to '%.2f%%', printing e.g. '0.50%'
    # where '50.00%' was intended.
    print(" %.2f%% accuracy bound: %.4f - %.4f" %
          (confidence * 100, lowerBound, upperBound))

### Compare to most common class model here...

# Set this to true when you've completed the previous steps and are ready to move on...
doCrossValidation = False

if doCrossValidation:
    import MachineLearningCourse.MLUtilities.Data.CrossValidation as CrossValidation