kOutputDirectory = "MachineLearningCourse/Assignments/Module01/Graphs/visualize/"

import MachineLearningCourse.MLProjectSupport.SMSSpam.SMSSpamDataset as SMSSpamDataset

kDataPath = "MachineLearningCourse/MLProjectSupport/SMSSpam/dataset/SMSSpamCollection"

# x represents the data and y the labels; these are parallel arrays.
#  'Raw' indicates that the data has not been processed into features.
#   In this case, the xRaw array contains the raw SMS text strings and yRaw contains 1 if the message is spam and 0 if it isn't.
(xRaw, yRaw) = SMSSpamDataset.LoadRawData(kDataPath)

# The 'Sample' utility contains helper functions for splitting & sampling data, which you will need to do a lot in Machine Learning.
import MachineLearningCourse.MLUtilities.Data.Sample as Sample

# The 'TrainValidateTestSplit' function separates the raw data into three sets to use in your modeling process:
#  1) the training data, which you should use to build your model and make any feature engineering/selection decisions
#  2) the validation data, which you should use to tune your modeling process (hyper-parameters, etc.)
#  3) the testing data, which you should use sparingly to estimate the true quality of your final model
#
# In this case, use 80% of the data for training, 10% for validation, and 10% for testing.
(xTrainRaw, yTrain, xValidateRaw, yValidate, xTestRaw,
 yTest) = Sample.TrainValidateTestSplit(xRaw,
                                        yRaw,
                                        percentValidate=.1,
                                        percentTest=.1)

import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression
import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds
import MachineLearningCourse.Assignments.Module01.SupportCode.SMSSpamFeaturize as SMSSpamFeaturize
import MachineLearningCourse.MLUtilities.Data.CrossValidation as CrossValidation

import time
import numpy as np


# A helper function for calculating FN rate and FP rate across a range of classification thresholds.
# NOTE: the loop below assumes the course conventions -- the model's predict takes a
# 'classificationThreshold' parameter and EvaluateBinaryClassification provides
# FalsePositiveRate/FalseNegativeRate helpers; adjust the calls if your versions differ.
def TabulateModelPerformanceForROC(model, xValidate, yValidate):
    pointsToEvaluate = 100
    thresholds = [x / float(pointsToEvaluate) for x in range(pointsToEvaluate + 1)]
    FPRs = []
    FNRs = []
    for threshold in thresholds:
        yPredicted = model.predict(xValidate, classificationThreshold=threshold)
        FPRs.append(EvaluateBinaryClassification.FalsePositiveRate(yValidate, yPredicted))
        FNRs.append(EvaluateBinaryClassification.FalseNegativeRate(yValidate, yPredicted))
    return (FPRs, FNRs, thresholds)
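
# Example (sketch) of how this helper would typically be used once a model has been trained on
# featurized data: 'model' and the featurized 'xValidate' below are placeholders, and matplotlib
# stands in for whatever charting utilities your course setup provides.
#
#   (FPRs, FNRs, thresholds) = TabulateModelPerformanceForROC(model, xValidate, yValidate)
#
#   import matplotlib.pyplot as plt
#   plt.plot(FPRs, [1.0 - fnr for fnr in FNRs])  # x: false positive rate, y: true positive rate
#   plt.xlabel("False Positive Rate")
#   plt.ylabel("True Positive Rate (1 - FNR)")
#   plt.savefig(kOutputDirectory + "ROC-SMSSpam.png")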

# Now do some basic data exploration. It is always a good idea to look at basic statistics about your data sets before diving into modeling.
print("Statistics on the data sets:")
print(" Train set contains %04d samples,    percent spam: " % (len(yTrain)) +
      "{:.2%}".format(sum(yTrain) / len(yTrain)))
print(" Validate set contains %04d samples, percent spam: " %
      (len(yValidate)) + "{:.2%}".format(sum(yValidate) / len(yValidate)))
print(" Test set contains %04d samples,     percent spam: " % (len(yTest)) +
      "{:.2%}".format(sum(yTest) / len(yTest)))