def validation_core(i, x, y, model, feature_count):
    """Fit `model` on one cross-validation fold and score it on the held-out part.

    The fold's training portion is ranked by mutual information, the first
    `feature_count` words of that ranking become the features, and the model
    is fitted on the featurized training portion.

    Args:
        i: Index of the fold to hold out, passed to `self.__splitDataFold`.
        x: Raw samples.
        y: Labels aligned with `x`.
        model: Object providing `fit(x, y)` and `predict(x)`.
        feature_count: How many top-ranked words to keep as features.

    Returns:
        Whatever `self.__countCorrect` returns for the fold's predictions
        against its held-out labels (presumably the number of correct
        predictions -- confirm against that helper).
    """
    # NOTE(review): `self` is not a parameter of this function; it must come
    # from an enclosing scope (e.g. this is nested inside a method). Confirm,
    # because the double-underscore attributes are only name-mangled when this
    # code sits lexically inside the class.
    trainX, trainY, heldOutX, heldOutY = self.__splitDataFold(x, y, i)

    # Rank words using the training portion only, so the held-out part never
    # influences which features are chosen.
    ranking = FeatureSelection.byMutualInformation(trainX, trainY)
    words = [term for term, _score in ranking[:feature_count]]

    trainFeatures, heldOutFeatures = FeatureSelection.Featurize(
        trainX, heldOutX, words)

    model.fit(trainFeatures, trainY)
    return self.__countCorrect(model.predict(heldOutFeatures), heldOutY)
def validateByMutualInformation(self, x, y, model, feature_count=10):
    """Cross-validate `model` with per-fold mutual-information feature selection.

    For each of the `self.k` folds, the words are ranked by mutual information
    on that fold's training portion only, the top `feature_count` words are
    used to featurize both portions, the model is refitted, and the correct
    predictions on the held-out portion are accumulated.

    Args:
        x: Raw samples.
        y: Labels aligned with `x`.
        model: Object providing `fit(x, y)` and `predict(x)`; it is refitted
            once per fold.
        feature_count: Number of top-ranked words to keep in each fold.
            Defaults to 10, the value that used to be hard-coded, so existing
            callers see no change.

    Returns:
        The summed per-fold correct count divided by `len(x)`.

    Raises:
        ZeroDivisionError: If `x` is empty and the fold loop still completes.
    """
    totalCorrect = 0

    for i in range(self.k):
        (foldTrainX, foldTrainY,
         foldValidationX, foldValidationY) = self.__splitDataFold(x, y, i)

        # Feature selection happens inside the loop, on the training portion
        # only, so the held-out fold never influences the chosen words.
        mutualInformationTable = FeatureSelection.byMutualInformation(
            foldTrainX, foldTrainY)
        words = [word for word, _ in mutualInformationTable[:feature_count]]

        print('For fold %d/%d, choose words:' % (i + 1, self.k))
        print(words)

        (xNewTrain, xNewValidation) = FeatureSelection.Featurize(
            foldTrainX, foldValidationX, words)

        model.fit(xNewTrain, foldTrainY)
        totalCorrect += self.__countCorrect(model.predict(xNewValidation),
                                            foldValidationY)

    return totalCorrect / len(x)
############################
# Rank words by frequency and by mutual information on the training data,
# show the head of each ranking, then fit and evaluate the model using the
# most frequent words as features.
import FeatureSelection

print('### Get the Frequency Table')
frequencyTable = FeatureSelection.byFrequency(xTrainRaw)
print('Top 10')
# Iterate over a slice rather than indexing 0..9: a table with fewer than ten
# entries then prints what it has instead of raising IndexError.
for entry in frequencyTable[:10]:
    print(entry)

#############################
print('### Get the Mutual Information Table')
mutualInformationTable = FeatureSelection.byMutualInformation(
    xTrainRaw, yTrain)
print('Top 10')
for entry in mutualInformationTable[:10]:
    print(entry)

#############################
print('### Run Gradient Descent with the Top 10 Words by Frequency')
# Each table entry unpacks to (word, score); only the word is needed here.
words = [word for word, _ in frequencyTable[:10]]
print(words)

(xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)

model.fit(xNewTrain, yTrain, iterations=50000, step=0.01)
yTestPredicted = model.predict(xNewTest)

testAccuracy = EvaluationsStub.Accuracy(yTest, yTestPredicted)
print("Test Set Accuracy is %f" % (testAccuracy))
# Report the class balance of each split. The messages say "percent", so the
# positive-label fraction is scaled by 100 before printing.
print("Train is %f percent spam." % (100.0 * sum(yTrainRaw) / len(yTrainRaw)))
print("Test is %f percent spam." % (100.0 * sum(yTestRaw) / len(yTestRaw)))

(xTrain, xTest) = Assignment1Support.Featurize(xTrainRaw, xTestRaw)
yTrain = yTrainRaw
yTest = yTestRaw

import LogisticRegressionModel
model = LogisticRegressionModel.LogisticRegressionModel()

#############################
import FeatureSelection

print('### Get the Mutual Information Table')
# Rank words on the training split only. Ranking on data that includes the
# test rows would let test labels influence which features are chosen and
# inflate the test metrics reported below.
mutualInformationTable = FeatureSelection.byMutualInformation(
    xTrainRaw, yTrain)

#############################
import EvaluationsStub

print('### Get the Confusion Matrix')
# Each table entry unpacks to (word, score); keep the ten top-ranked words.
words = [word for word, _ in mutualInformationTable[:10]]
print(words)

(xNewTrain, xNewTest) = FeatureSelection.Featurize(xTrainRaw, xTestRaw, words)

model.fit(xNewTrain, yTrain, iterations=50000, step=0.01)
yTestPredicted = model.predict(xNewTest)

EvaluationsStub.ExecuteAll(yTest, yTestPredicted)