def ExecuteEvaluationRun(runSpecification, xTrainRaw, yTrain, numberOfFolds=2):
    print("runSpecification: ", runSpecification)

    startTime = time.time()

    # Build the vocabulary and featurize with the parameters from this run specification.
    featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
    featurizer.CreateVocabulary(xTrainRaw, yTrain,
                                numFrequentWords=runSpecification['numFrequentWords'],
                                numMutualInformationWords=runSpecification['numMutualInformationWords'])

    xTrain = featurizer.Featurize(xTrainRaw)
    # xValidateRaw and yValidate come from the enclosing script's data split.
    xValidate = featurizer.Featurize(xValidateRaw)

    if numberOfFolds > 1:
        # Cross-validation: hold out one fold at a time and average the accuracies.
        crossValidationAccuracy = []
        for i in range(numberOfFolds):
            xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                xTrain, yTrain, numberOfFolds, i)

            model = LogisticRegression.LogisticRegression()
            model.fit(xTrainI, yTrainI,
                      convergence=runSpecification['convergence'],
                      stepSize=runSpecification['stepSize'],
                      verbose=False)

            crossValidationAccuracy.append(
                EvaluateBinaryClassification.Accuracy(yEvaluateI, model.predict(xEvaluateI)))

        mean = np.mean(crossValidationAccuracy)
        runSpecification['crossValidationMean'] = mean

        # 50% error bound on the mean accuracy, using the size of a single held-out fold.
        lower, _ = ErrorBounds.GetAccuracyBounds(mean, len(yEvaluateI), .5)
        runSpecification['crossValidationErrorBound'] = mean - lower

    if numberOfFolds == 1:
        # Single train/validate split: fit on all of the training data, score on the validation set.
        model = LogisticRegression.LogisticRegression()
        model.fit(xTrain, yTrain,
                  convergence=runSpecification['convergence'],
                  stepSize=runSpecification['stepSize'],
                  verbose=False)

        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(yValidate, model.predict(xValidate))
        runSpecification['accuracy'] = validationSetAccuracy

        lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy, len(yValidate), .5)
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lower

    endTime = time.time()
    # Record the runtime for every run, not just the cross-validation path.
    runSpecification['runtime'] = endTime - startTime

    return runSpecification
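# The loop above relies on CrossValidation.CrossValidation from the course utilities to
# produce the per-fold train/evaluate split. The helper below is a minimal sketch of the
# fold-splitting it is assumed to perform (hold out fold i, train on the remainder); the
# name and the contiguous-chunk behavior are assumptions for illustration, not the course code.
def SplitForCrossValidationSketch(x, y, numberOfFolds, foldId):
    foldSize = len(x) // numberOfFolds
    start = foldId * foldSize
    # Let the last fold absorb any remainder so every sample is evaluated exactly once.
    end = len(x) if foldId == numberOfFolds - 1 else start + foldSize

    xEvaluate, yEvaluate = x[start:end], y[start:end]
    xTrainFold = x[:start] + x[end:]
    yTrainFold = y[:start] + y[end:]

    return xTrainFold, yTrainFold, xEvaluate, yEvaluate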
def ExecuteEvaluationRun(runSpecification, xTrainRaw, yTrain, numberOfFolds=5):
    startTime = time.time()

    # HERE upgrade this to use crossvalidation
    featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
    featurizer.CreateVocabulary(xTrainRaw, yTrain,
                                numFrequentWords=runSpecification['numFrequentWords'],
                                numMutualInformationWords=runSpecification['numMutualInformationWords'])

    xTrain = featurizer.Featurize(xTrainRaw)
    xValidate = featurizer.Featurize(xValidateRaw)

    model = LogisticRegression.LogisticRegression()
    model.fit(xTrain, yTrain,
              convergence=runSpecification['convergence'],
              stepSize=runSpecification['stepSize'],
              verbose=True)

    validationSetAccuracy = EvaluateBinaryClassification.Accuracy(yValidate, model.predict(xValidate))
    runSpecification['accuracy'] = validationSetAccuracy

    # HERE upgrade this to calculate and save some error bounds...

    endTime = time.time()
    runSpecification['runtime'] = endTime - startTime

    return runSpecification
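# A minimal usage sketch for ExecuteEvaluationRun, assuming the raw data and validation
# split are already loaded by the enclosing script. The parameter values below simply
# mirror the initial settings used later in this file; they are placeholders, not tuned results.
exampleRunSpecification = {
    'numFrequentWords': 0,
    'numMutualInformationWords': 20,
    'stepSize': 1.0,
    'convergence': 0.005,
}

result = ExecuteEvaluationRun(exampleRunSpecification, xTrainRaw, yTrain)
print("accuracy: %.4f (runtime: %.1f seconds)" % (result['accuracy'], result['runtime']))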
def ExecuteEvaluationRun(runSpecification, xTrain, yTrain, numberOfFolds=2):
    print("runSpecification: ", runSpecification)

    startTime = time.time()

    if numberOfFolds > 1:
        # Cross-validation: hold out one fold at a time and average the accuracies.
        crossValidationAccuracy = []
        for i in range(numberOfFolds):
            xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                xTrain, yTrain, numberOfFolds, i)

            model = LogisticRegression.LogisticRegression()
            model.fit(xTrainI, yTrainI,
                      convergence=runSpecification['convergence'],
                      stepSize=runSpecification['stepSize'],
                      verbose=False)

            crossValidationAccuracy.append(
                EvaluateBinaryClassification.Accuracy(yEvaluateI, model.predict(xEvaluateI)))

        mean = np.mean(crossValidationAccuracy)
        runSpecification['crossValidationMean'] = mean

        # 50% error bound on the mean accuracy, using the size of a single held-out fold.
        lower, _ = ErrorBounds.GetAccuracyBounds(mean, len(yEvaluateI), .5)
        runSpecification['crossValidationErrorBound'] = mean - lower

    if numberOfFolds == 1:
        # Single split: fit on all of the training data and score on the validation set.
        # xValidate and yValidate come from the enclosing script's data split.
        model = LogisticRegression.LogisticRegression()
        model.fit(xTrain, yTrain,
                  convergence=runSpecification['convergence'],
                  stepSize=runSpecification['stepSize'],
                  verbose=False)

        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(yValidate, model.predict(xValidate))
        runSpecification['accuracy'] = validationSetAccuracy

        lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy, len(yValidate), .5)
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lower

    endTime = time.time()
    # Record the runtime for every run, not just the cross-validation path.
    runSpecification['runtime'] = endTime - startTime

    return runSpecification
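# A sketch of how this cross-validated version might be used to sweep a single parameter,
# comparing runs by mean accuracy and error bound. The candidate stepSize values are
# illustrative only; xTrain here is the already-featurized training set this version expects.
evaluationRunSpecifications = []
for stepSize in [0.1, 0.5, 1.0, 5.0]:
    evaluationRunSpecifications.append({'stepSize': stepSize, 'convergence': 0.005})

evaluations = [ExecuteEvaluationRun(specification, xTrain, yTrain, numberOfFolds=5)
               for specification in evaluationRunSpecifications]

for evaluation in evaluations:
    print("stepSize %.2f: accuracy %.4f +/- %.4f (runtime %.1fs)" % (
        evaluation['stepSize'],
        evaluation['crossValidationMean'],
        evaluation['crossValidationErrorBound'],
        evaluation['runtime']))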
# Series accumulators for the ROC comparison chart.
seriesFPRs = []
seriesFNRs = []
seriesLabels = []

# Initial (untuned) parameter settings.
init = {}
init['stepSize'] = 1.0
init['convergence'] = 0.005
init['numFrequentWords'] = 0
init['numMutualInformationWords'] = 20

featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
featurizer.CreateVocabulary(xTrainRaw, yTrain,
                            numFrequentWords=init['numFrequentWords'],
                            numMutualInformationWords=init['numMutualInformationWords'])

xTrain = featurizer.Featurize(xTrainRaw)
xTest = featurizer.Featurize(xTestRaw)

model = LogisticRegression.LogisticRegression()
model.fit(xTrain, yTrain, convergence=init['convergence'], stepSize=init['stepSize'], verbose=False)

(modelFPRs, modelFNRs, thresholds) = TabulateModelPerformanceForROC(model, xTest, yTest)
seriesFPRs.append(modelFPRs)
seriesFNRs.append(modelFNRs)
seriesLabels.append('initial parameters')

featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
featurizer.CreateVocabulary(
    xTrainRaw,
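# TabulateModelPerformanceForROC is defined elsewhere in this script. The sketch below shows
# what it is assumed to do -- sweep classification thresholds and record the false positive
# and false negative rates at each one -- using only the model.predict(classificationThreshold=...)
# call that appears later in this file. Treat it as an illustration, not the actual helper.
def TabulateModelPerformanceForROCSketch(model, xEvaluate, yEvaluate):
    pointsToEvaluate = 100
    thresholds = [t / float(pointsToEvaluate) for t in range(pointsToEvaluate + 1)]
    FPRs = []
    FNRs = []

    positives = sum(1 for y in yEvaluate if y == 1)
    negatives = sum(1 for y in yEvaluate if y == 0)

    for threshold in thresholds:
        yPredicted = model.predict(xEvaluate, classificationThreshold=threshold)

        falsePositives = sum(1 for (y, yHat) in zip(yEvaluate, yPredicted) if y == 0 and yHat == 1)
        falseNegatives = sum(1 for (y, yHat) in zip(yEvaluate, yPredicted) if y == 1 and yHat == 0)

        FPRs.append(falsePositives / float(negatives))
        FNRs.append(falseNegatives / float(positives))

    return (FPRs, FNRs, thresholds)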
if doModelEvaluation:
    ######
    ### Build a model and evaluate on validation data
    stepSize = 1.0
    convergence = 0.001

    featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
    featurizer.CreateVocabulary(xTrainRaw, yTrain, numMutualInformationWords=25)

    xTrain = featurizer.Featurize(xTrainRaw)
    xValidate = featurizer.Featurize(xValidateRaw)
    xTest = featurizer.Featurize(xTestRaw)

    frequentModel = LogisticRegression.LogisticRegression()
    frequentModel.fit(xTrain, yTrain, convergence=convergence, stepSize=stepSize, verbose=True)

    ######
    ### Use equation 5.1 from Mitchell to bound the validation set error and the true error
    import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

    print("Logistic regression with 25 features by mutual information:")

    validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
        yValidate, frequentModel.predict(xValidate))
    print("Validation set accuracy: %.4f." % (validationSetAccuracy))

    for confidence in [.5, .8, .9, .95, .99]:
        lowerBound, upperBound = ErrorBounds.GetAccuracyBounds(
            validationSetAccuracy, len(yValidate), confidence)
        print("    %.0f%% confidence interval: [%.4f, %.4f]" % (confidence * 100, lowerBound, upperBound))
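# ErrorBounds.GetAccuracyBounds implements the interval used above. The sketch below shows
# the standard normal-approximation bound from Mitchell's equation 5.1:
# accuracy +/- z * sqrt(accuracy * (1 - accuracy) / n). The z-value table and the function
# name are assumptions for illustration; the course utility may differ in detail.
import math

kZValuesSketch = {.5: 0.67, .8: 1.28, .9: 1.64, .95: 1.96, .99: 2.58}

def GetAccuracyBoundsSketch(accuracy, sampleSize, confidence):
    # Standard deviation of the observed accuracy under the binomial (normal approximation).
    standardDeviation = math.sqrt((accuracy * (1.0 - accuracy)) / sampleSize)
    z = kZValuesSketch[confidence]
    return (accuracy - z * standardDeviation, accuracy + z * standardDeviation)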
kOutputDirectory = "C:\\temp\\visualize"

runUnitTest = True
if runUnitTest:
    # Little synthetic dataset to help with implementation. 2 features, 8 samples.
    xTrain = [[.1, .1], [.2, .2], [.2, .1], [.1, .2], [.95, .95], [.9, .8], [.8, .9], [.7, .6]]
    yTrain = [0, 0, 0, 0, 1, 1, 1, 1]

    # create a linear model with the right number of weights initialized
    import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression
    model = LogisticRegression.LogisticRegression(featureCount=len(xTrain[0]))

    # To use this visualizer you need to install the PIL imaging library. Instructions are in the lecture notes.
    import MachineLearningCourse.MLUtilities.Visualizations.Visualize2D as Visualize2D

    while not model.converged:
        # do 10 iterations of training
        model.incrementalFit(xTrain, yTrain, maxSteps=10, stepSize=1.0, convergence=0.005)

        # then look at the model's weights
        model.visualize()

        # then look at how training set loss is converging
        print(" fit for %d iterations, train set loss is %.4f" % (
            model.totalGradientDescentSteps, model.loss(xTrain, yTrain)))
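# model.loss above reports the training-set loss that gradient descent is minimizing. The
# self-contained sketch below shows the usual logistic-regression quantities -- a sigmoid
# over a linear score and the average cross-entropy loss. The averaging convention (and any
# regularization) in the course's LogisticRegression are assumptions here, not its actual code.
import math

def SigmoidSketch(z):
    return 1.0 / (1.0 + math.exp(-z))

def LogisticLossSketch(weights, bias, x, y):
    # Average negative log-likelihood of the labels under the model.
    totalLoss = 0.0
    for xi, yi in zip(x, y):
        score = bias + sum(w * value for w, value in zip(weights, xi))
        probability = SigmoidSketch(score)
        # Clamp to avoid log(0) on very confident predictions.
        probability = min(max(probability, 1e-12), 1.0 - 1e-12)
        totalLoss += -(yi * math.log(probability) + (1 - yi) * math.log(1.0 - probability))
    return totalLoss / len(y)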
# Remember to create a new featurizer object/vocabulary for each part of the assignment
featurizer = SMSSpamFeaturize.SMSSpamFeaturize(useHandCraftedFeatures=False)
featurizer.CreateVocabulary(xTrainRaw, yTrain, numMutualInformationWords=10)

print(featurizer.vocabulary)

# Remember to reprocess the raw data whenever you change the featurizer
xTrain = featurizer.Featurize(xTrainRaw)
xValidate = featurizer.Featurize(xValidateRaw)
xTest = featurizer.Featurize(xTestRaw)

## Good luck!

print("Learning the logistic regression model:")
import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression

logisticRegressionModel = LogisticRegression.LogisticRegression()
logisticRegressionModel.fit(xTrain, yTrain, stepSize=stepSize, convergence=convergence)

import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification

print("\nLogistic regression model:")
logisticRegressionModel.visualize()

# EvaluateBinaryClassification.ExecuteAll(yTrain, logisticRegressionModel.predict(xTrain, classificationThreshold=0.5))
EvaluateBinaryClassification.ExecuteAll(
    yValidate, logisticRegressionModel.predict(xValidate, classificationThreshold=0.5))
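# EvaluateBinaryClassification.ExecuteAll prints a suite of metrics for the predictions. The
# sketch below shows the standard quantities it is assumed to cover (confusion counts,
# accuracy, precision, recall, false positive/negative rates); the course utility's exact
# output format may differ.
def ExecuteAllSketch(yActual, yPredicted):
    TP = sum(1 for a, p in zip(yActual, yPredicted) if a == 1 and p == 1)
    TN = sum(1 for a, p in zip(yActual, yPredicted) if a == 0 and p == 0)
    FP = sum(1 for a, p in zip(yActual, yPredicted) if a == 0 and p == 1)
    FN = sum(1 for a, p in zip(yActual, yPredicted) if a == 1 and p == 0)

    print("Confusion counts: TP=%d TN=%d FP=%d FN=%d" % (TP, TN, FP, FN))
    print("Accuracy:  %.4f" % ((TP + TN) / float(len(yActual))))
    print("Precision: %.4f" % (TP / float(TP + FP) if (TP + FP) else 0.0))
    print("Recall:    %.4f" % (TP / float(TP + FN) if (TP + FN) else 0.0))
    print("False positive rate: %.4f" % (FP / float(FP + TN) if (FP + TN) else 0.0))
    print("False negative rate: %.4f" % (FN / float(FN + TP) if (FN + TP) else 0.0))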