import time

import numpy as np

import MachineLearningCourse.MLProjectSupport.SMSSpam.SMSSpamDataset as SMSSpamDataset
import MachineLearningCourse.MLUtilities.Data.Sample as Sample
import MachineLearningCourse.MLUtilities.Learners.LogisticRegression as LogisticRegression
import MachineLearningCourse.MLUtilities.Evaluations.EvaluateBinaryClassification as EvaluateBinaryClassification
import MachineLearningCourse.MLUtilities.Evaluations.ErrorBounds as ErrorBounds
import MachineLearningCourse.Assignments.Module01.SupportCode.SMSSpamFeaturize as SMSSpamFeaturize
import MachineLearningCourse.MLUtilities.Data.CrossValidation as CrossValidation

# Directory prefix for the graphs produced by this script.
# NOTE(review): the value ends with a literal backslash while the rest of the
# path uses forward slashes -- confirm this is intended on non-Windows hosts.
kOutputDirectory = "MachineLearningCourse/Assignments/Module01/Graphs/visualize\\"

# Location of the raw SMS spam collection handed to the dataset loader.
kDataPath = "MachineLearningCourse/MLProjectSupport/SMSSpam/dataset/SMSSpamCollection"

# Load the raw samples and labels (parallel sequences).
xRaw, yRaw = SMSSpamDataset.LoadRawData(kDataPath)

# Hold out 10% for validation and 10% for testing; the remainder is training data.
xTrainRaw, yTrain, xValidateRaw, yValidate, xTestRaw, yTest = Sample.TrainValidateTestSplit(
    xRaw, yRaw, percentValidate=.1, percentTest=.1
)


# A helper function for calculating FN rate and FP rate across a range of thresholds.
# NOTE(review): the body appears to stop after building the threshold list --
# nothing is evaluated or returned yet; confirm whether the remainder is missing.
def TabulateModelPerformanceForROC(model, xValidate, yValidate):
    pointsToEvaluate = 100
    # Evenly spaced thresholds 0/100, 1/100, ..., 100/100 (both ends included).
    thresholds = [step / float(pointsToEvaluate) for step in range(pointsToEvaluate + 1)]
import MachineLearningCourse.MLProjectSupport.SMSSpam.SMSSpamDataset as SMSSpamDataset

# x holds the training data and y holds the labels; the two are parallel arrays.
# 'Raw' means the data has not yet been turned into features: xRaw contains the
# raw SMS text strings and yRaw contains 1 when the message is spam, 0 otherwise.
xRaw, yRaw = SMSSpamDataset.LoadRawData()

# The 'Sample' utility provides helpers for splitting and sampling data.
import MachineLearningCourse.MLUtilities.Data.Sample as Sample

# 'TrainValidateTestSplit' separates the raw data into three sets for the modeling process:
#   1) training data   - used to build the model and make feature engineering/selection decisions
#   2) validation data - used to tune the modeling process (hyper-parameters, etc.)
#   3) testing data    - used sparingly to estimate the true quality of the final model
#
# Here 10% goes to validation and 10% to testing, leaving 80% for training.
xTrainRaw, yTrain, xValidateRaw, yValidate, xTestRaw, yTest = Sample.TrainValidateTestSplit(
    xRaw, yRaw, percentValidate=.1, percentTest=.1
)

# Basic data exploration: report the size and spam share of each set before doing any ML.
print("Statistics on the data sets:")
for countTemplate, setLabels in (
    (" Train set contains %04d samples, percent spam: ", yTrain),
    (" Validate set contains %04d samples, percent spam: ", yValidate),
    (" Test set contains %04d samples, percent spam: ", yTest),
):
    # Labels are 0/1, so their sum divided by their count is the spam fraction.
    spamShare = "{:.2%}".format(sum(setLabels) / len(setLabels))
    print(countTemplate % (len(setLabels)) + spamShare)