def ExecuteEvaluationRun(runSpecification, xTrainRaw, yTrain, numberOfFolds=2):
    """Featurize the raw training data and evaluate one parameter setting.

    A vocabulary is built from ``xTrainRaw``/``yTrain`` using the
    ``numFrequentWords`` and ``numMutualInformationWords`` entries of
    ``runSpecification``; a logistic regression model is then trained with the
    ``convergence`` and ``stepSize`` entries.

    * ``numberOfFolds > 1``: one model is trained per fold and the results are
      stored under ``'crossValidationMean'`` and
      ``'crossValidationErrorBound'``.
    * ``numberOfFolds == 1``: a single model is trained on all of the training
      data and scored against the module-level ``xValidateRaw``/``yValidate``;
      the results are stored under ``'accuracy'`` and ``'accuracyErrorBound'``.
    * any other value: no model is trained and no accuracy keys are written.

    ``'runtime'`` (seconds of wall-clock time) is always stored.

    Args:
        runSpecification: mapping of parameters; it is updated in place.
        xTrainRaw: raw training samples handed to the featurizer.
        yTrain: training labels.
        numberOfFolds: number of cross-validation folds (1 = use validation set).

    Returns:
        The same ``runSpecification`` object, with the result keys added.
    """
    print("runSpecification: ", runSpecification)
    startTime = time.time()

    featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
    featurizer.CreateVocabulary(
        xTrainRaw,
        yTrain,
        numFrequentWords=runSpecification['numFrequentWords'],
        numMutualInformationWords=runSpecification['numMutualInformationWords']
    )

    xTrain = featurizer.Featurize(xTrainRaw)

    if numberOfFolds > 1:
        crossValidationAccuracy = []
        for i in range(numberOfFolds):
            xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                xTrain, yTrain, numberOfFolds, i)

            # A fresh model per fold so no weights leak between folds.
            model = LogisticRegression.LogisticRegression()
            model.fit(xTrainI,
                      yTrainI,
                      convergence=runSpecification['convergence'],
                      stepSize=runSpecification['stepSize'],
                      verbose=False)

            crossValidationAccuracy.append(
                EvaluateBinaryClassification.Accuracy(
                    yEvaluateI, model.predict(xEvaluateI)))

        mean = np.mean(crossValidationAccuracy)
        runSpecification['crossValidationMean'] = mean
        # NOTE(review): the sample count passed here is the size of the LAST
        # fold's evaluation split, not the total number of evaluated samples;
        # kept as-is to preserve the reported bounds -- confirm this is intended.
        # The .5 is presumably the confidence level -- confirm against ErrorBounds.
        lower, _ = ErrorBounds.GetAccuracyBounds(mean, len(yEvaluateI), .5)
        runSpecification['crossValidationErrorBound'] = mean - lower

    elif numberOfFolds == 1:
        # The validation set is only needed on this path, so it is featurized
        # here; the cross-validation path no longer touches xValidateRaw.
        xValidate = featurizer.Featurize(xValidateRaw)

        model = LogisticRegression.LogisticRegression()
        model.fit(xTrain,
                  yTrain,
                  convergence=runSpecification['convergence'],
                  stepSize=runSpecification['stepSize'],
                  verbose=False)
        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
            yValidate, model.predict(xValidate))

        runSpecification['accuracy'] = validationSetAccuracy
        lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                 len(yValidate), .5)
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lower

    # Record the runtime for every run; previously it was dropped whenever
    # numberOfFolds was not greater than 1.
    runSpecification['runtime'] = time.time() - startTime

    return runSpecification
def ExecuteEvaluationRun(runSpecification, xTrainRaw, yTrain, numberOfFolds=5):
    """Train one model for ``runSpecification`` and score it on validation data.

    The vocabulary is built from ``xTrainRaw``/``yTrain`` using the
    ``numFrequentWords`` and ``numMutualInformationWords`` entries, and the
    model is fitted with the ``convergence`` and ``stepSize`` entries. The
    validation data comes from the module-level ``xValidateRaw``/``yValidate``.

    ``runSpecification`` is updated in place with ``'accuracy'`` and
    ``'runtime'`` and then returned. ``numberOfFolds`` is accepted but not used.
    """
    began = time.time()

    # TODO: upgrade this to use cross-validation.
    vocabularyBuilder = SMSSpamFeaturize.SMSSpamFeaturize()
    vocabularyBuilder.CreateVocabulary(
        xTrainRaw,
        yTrain,
        numFrequentWords=runSpecification['numFrequentWords'],
        numMutualInformationWords=runSpecification['numMutualInformationWords'])

    trainFeatures = vocabularyBuilder.Featurize(xTrainRaw)
    validateFeatures = vocabularyBuilder.Featurize(xValidateRaw)

    classifier = LogisticRegression.LogisticRegression()
    fitOptions = {
        'convergence': runSpecification['convergence'],
        'stepSize': runSpecification['stepSize'],
        'verbose': True,
    }
    classifier.fit(trainFeatures, yTrain, **fitOptions)

    runSpecification['accuracy'] = EvaluateBinaryClassification.Accuracy(
        yValidate, classifier.predict(validateFeatures))

    # TODO: upgrade this to calculate and save some error bounds.

    runSpecification['runtime'] = time.time() - began
    return runSpecification
# Example #3
# 0
def ExecuteEvaluationRun(runSpecification, xTrain, yTrain, numberOfFolds=2):
    """Evaluate one parameter setting on already-featurized training data.

    A logistic regression model is trained with the ``convergence`` and
    ``stepSize`` entries of ``runSpecification``.

    * ``numberOfFolds > 1``: one model is trained per fold and the results are
      stored under ``'crossValidationMean'`` and
      ``'crossValidationErrorBound'``.
    * ``numberOfFolds == 1``: a single model is trained on all of ``xTrain``
      and scored against the module-level ``xValidate``/``yValidate``; the
      results are stored under ``'accuracy'`` and ``'accuracyErrorBound'``.
    * any other value: no model is trained and no accuracy keys are written.

    ``'runtime'`` (seconds of wall-clock time) is always stored.

    Args:
        runSpecification: mapping of parameters; it is updated in place.
        xTrain: featurized training samples.
        yTrain: training labels.
        numberOfFolds: number of cross-validation folds (1 = use validation set).

    Returns:
        The same ``runSpecification`` object, with the result keys added.
    """
    print("runSpecification: ", runSpecification)
    startTime = time.time()

    if numberOfFolds > 1:
        crossValidationAccuracy = []
        for i in range(numberOfFolds):
            xTrainI, yTrainI, xEvaluateI, yEvaluateI = CrossValidation.CrossValidation(
                xTrain, yTrain, numberOfFolds, i)

            # A fresh model per fold so no weights leak between folds.
            model = LogisticRegression.LogisticRegression()
            model.fit(xTrainI,
                      yTrainI,
                      convergence=runSpecification['convergence'],
                      stepSize=runSpecification['stepSize'],
                      verbose=False)

            crossValidationAccuracy.append(
                EvaluateBinaryClassification.Accuracy(
                    yEvaluateI, model.predict(xEvaluateI)))

        mean = np.mean(crossValidationAccuracy)
        runSpecification['crossValidationMean'] = mean
        # NOTE(review): the sample count passed here is the size of the LAST
        # fold's evaluation split, not the total number of evaluated samples;
        # kept as-is to preserve the reported bounds -- confirm this is intended.
        # The .5 is presumably the confidence level -- confirm against ErrorBounds.
        lower, _ = ErrorBounds.GetAccuracyBounds(mean, len(yEvaluateI), .5)
        runSpecification['crossValidationErrorBound'] = mean - lower

    elif numberOfFolds == 1:
        model = LogisticRegression.LogisticRegression()
        model.fit(xTrain,
                  yTrain,
                  convergence=runSpecification['convergence'],
                  stepSize=runSpecification['stepSize'],
                  verbose=False)
        validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
            yValidate, model.predict(xValidate))

        runSpecification['accuracy'] = validationSetAccuracy
        lower, _ = ErrorBounds.GetAccuracyBounds(validationSetAccuracy,
                                                 len(yValidate), .5)
        runSpecification['accuracyErrorBound'] = validationSetAccuracy - lower

    # Record the runtime for every run; previously it was dropped whenever
    # numberOfFolds was not greater than 1.
    runSpecification['runtime'] = time.time() - startTime

    return runSpecification
# Baseline settings for the first ROC series, labelled 'initial parameters'
# below. `init` itself is created outside this fragment.
init['stepSize'] = 1.0
init['convergence'] = 0.005
init['numFrequentWords'] = 0
init['numMutualInformationWords'] = 20

# Build the vocabulary from the training data using the baseline settings.
featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
featurizer.CreateVocabulary(
    xTrainRaw,
    yTrain,
    numFrequentWords=init['numFrequentWords'],
    numMutualInformationWords=init['numMutualInformationWords'])

# Featurize train and test data with the same featurizer so both share one
# vocabulary.
xTrain = featurizer.Featurize(xTrainRaw)
xTest = featurizer.Featurize(xTestRaw)

# Train the baseline logistic regression model.
model = LogisticRegression.LogisticRegression()
model.fit(xTrain,
          yTrain,
          convergence=init['convergence'],
          stepSize=init['stepSize'],
          verbose=False)

# Tabulate the model's performance on the test set and append it as one series.
# The series lists and TabulateModelPerformanceForROC are defined outside this
# fragment; the returned tuple is unpacked as (FPRs, FNRs, thresholds).
(modelFPRs, modelFNRs,
 thresholds) = TabulateModelPerformanceForROC(model, xTest, yTest)
seriesFPRs.append(modelFPRs)
seriesFNRs.append(modelFNRs)
seriesLabels.append('initial parameters')

featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
featurizer.CreateVocabulary(
    xTrainRaw,
# Optional section: train one model and report its validation accuracy.
# `doModelEvaluation` and the raw data/label variables are defined outside
# this fragment.
if doModelEvaluation:
    ######
    ### Build a model and evaluate on validation data
    stepSize = 1.0
    convergence = 0.001

    # Vocabulary is selected by mutual information only (25 words).
    featurizer = SMSSpamFeaturize.SMSSpamFeaturize()
    featurizer.CreateVocabulary(xTrainRaw,
                                yTrain,
                                numMutualInformationWords=25)

    # Featurize every split with the same featurizer so they share a vocabulary.
    xTrain = featurizer.Featurize(xTrainRaw)
    xValidate = featurizer.Featurize(xValidateRaw)
    xTest = featurizer.Featurize(xTestRaw)

    # NOTE(review): the variable is called `frequentModel`, but the vocabulary
    # above is chosen by mutual information, not by frequency.
    frequentModel = LogisticRegression.LogisticRegression()
    frequentModel.fit(xTrain,
                      yTrain,
                      convergence=convergence,
                      stepSize=stepSize,
                      verbose=True)

    ######
    ### Use equation 5.1 from Mitchell to bound the validation set error and the true error
    import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds

    # Report accuracy on the validation split.
    print("Logistic regression with 25 features by mutual information:")
    validationSetAccuracy = EvaluateBinaryClassification.Accuracy(
        yValidate, frequentModel.predict(xValidate))
    print("Validation set accuracy: %.4f." % (validationSetAccuracy))
    for confidence in [.5, .8, .9, .95, .99]:
# Output directory for visualizations (Windows path). It is not referenced in
# the lines visible here -- presumably used elsewhere in the file.
kOutputDirectory = "C:\\temp\\visualize"

# Toggle for the small synthetic-data training run below.
runUnitTest = True
if runUnitTest:
    # Little synthetic dataset to help with implementation. 2 features, 8 samples.
    xTrain = [[.1, .1], [.2, .2], [.2, .1], [.1, .2], [.95, .95], [.9, .8],
              [.8, .9], [.7, .6]]
    yTrain = [0, 0, 0, 0, 1, 1, 1, 1]

    # create a linear model with the right number of weights initialized
    import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression
    model = LogisticRegression.LogisticRegression(featureCount=len(xTrain[0]))

    # To use this visualizer you need to install the PIL imaging library. Instructions are in the lecture notes.
    # NOTE(review): Visualize2D is imported but not referenced in the visible lines.
    import MachineLearningCourse.MLUtilities.Visualizations.Visualize2D as Visualize2D

    # Train in batches of 10 steps until the model reports convergence, showing
    # the weights and the training loss after every batch.
    while not model.converged:
        # do 10 iterations of training
        model.incrementalFit(xTrain,
                             yTrain,
                             maxSteps=10,
                             stepSize=1.0,
                             convergence=0.005)

        # then look at the model's weights
        model.visualize()

        # then look at how training set loss is converging
        print(" fit for %d iterations, train set loss is %.4f" %
              (model.totalGradientDescentSteps, model.loss(xTrain, yTrain)))
    # Featurize the SMS data with a 10-word mutual-information vocabulary, train
    # a logistic regression model, and evaluate it on the validation split.
    # `stepSize`, `convergence` and the raw data/label variables are defined
    # outside this fragment.

    # Remember to create a new featurizer object/vocabulary for each part of the assignment
    featurizer = SMSSpamFeaturize.SMSSpamFeaturize(
        useHandCraftedFeatures=False)
    featurizer.CreateVocabulary(xTrainRaw,
                                yTrain,
                                numMutualInformationWords=10)
    # Show the selected vocabulary.
    print(featurizer.vocabulary)
    # Remember to reprocess the raw data whenever you change the featurizer
    xTrain = featurizer.Featurize(xTrainRaw)
    xValidate = featurizer.Featurize(xValidateRaw)
    xTest = featurizer.Featurize(xTestRaw)

    ## Good luck!
    print("Learning the logistic regression model:")
    import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression
    logisticRegressionModel = LogisticRegression.LogisticRegression()

    # Train on the featurized training split.
    logisticRegressionModel.fit(xTrain,
                                yTrain,
                                stepSize=stepSize,
                                convergence=convergence)

    import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification

    print("\nLogistic regression model:")
    logisticRegressionModel.visualize()
    # Evaluate on the validation split at a 0.5 classification threshold; the
    # commented-out call below is the training-set equivalent.
    #EvaluateBinaryClassification.ExecuteAll(yTrain, logisticRegressionModel.predict(xTrain, classificationThreshold=0.5))
    EvaluateBinaryClassification.ExecuteAll(
        yValidate,
        logisticRegressionModel.predict(xValidate,
                                        classificationThreshold=0.5))