def runWithIGR(featureSize, modelCount):
    """Train an ensemble of Naive Bayes models on IGR-reduced features and
    write predictions for the test set."""
    X_raw, y = common.loadTrainingDataSet()

    reducer = InformationGainReducer()
    reducer.fit(X_raw, y)
    reducer.resize(featureSize)
    X = reducer.transform(X_raw).toarray()

    modelList = []
    for modelNum in range(modelCount):
        # A different seed per model so each ensemble member sees a
        # different balanced subset of the majority class.
        rs = 42 + modelNum
        rus = RandomUnderSampler(random_state=rs)
        X_model, y_model = rus.fit_resample(X, y)

        nbClassifier = NaiveBayesClassifier()
        nbClassifier.fit(X_model, y_model)
        modelList.append(nbClassifier)

    X_test_raw = common.loadTestDataSet()
    X_test = reducer.transform(X_test_raw).toarray()

    combinedModelOutput = common.predictCombinedSimple(X_test, modelList)
    common.writeResultsFile(combinedModelOutput)
    print("Done predicting with multi-model and IGR.")
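# NOTE (sketch): `common.predictCombinedSimple` is called throughout this
# module but implemented in cs584.project2.common, which is not shown here.
# The hypothetical helper below illustrates the assumed behavior -- a simple
# majority vote over an odd number of binary (0/1) classifiers. It relies on
# the module-level `import numpy as np`; the real implementation may differ.
def _predictCombinedSimpleSketch(X_test, modelList):
    # Stack per-model predictions into shape (n_models, n_samples).
    votes = np.stack([np.asarray(model.predict(X_test)) for model in modelList])
    # A sample is labeled 1 wherever more than half of the models vote 1.
    return (votes.sum(axis=0) > len(modelList) // 2).astype(np.int64)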
def runWithBalancingAndIGR(featureSize, alphaValue):
    """Oversample the training set, reduce features with IGR, and train a
    single MLP classifier."""
    X_model_full_imbalanced, y_model_imbalanced = common.loadTrainingDataSet()
    balancer = FeatureIndependentOversampler(random_state=42)
    X_model_full_raw, y_model_raw = balancer.fit_transform(
        X_model_full_imbalanced, y_model_imbalanced)
    X_model_full, y_model = shuffle(X_model_full_raw, y_model_raw,
                                    random_state=42)

    reducer = InformationGainReducer()
    reducer.fit(X_model_full, y_model)
    reducer.resize(featureSize)
    # toarray() rather than todense(): np.matrix is deprecated in numpy.
    X_model = reducer.transform(X_model_full).toarray()

    # Single hidden layer sized by the sqrt heuristic.
    hiddenLayerSizes = (int(math.sqrt(featureSize)) + 1, )
    mlpClassifier = MLPClassifier(solver='lbfgs',
                                  alpha=alphaValue,
                                  hidden_layer_sizes=hiddenLayerSizes)
    mlpClassifier.fit(X_model, y_model)

    X_test_full = common.loadTestDataSet()
    # Densify the test set the same way as the training set.
    X_test = reducer.transform(X_test_full).toarray()
    output = mlpClassifier.predict(X_test)
    common.writeResultsFile(output)
    print("Done estimating with neural network for feature size = " +
          str(featureSize) + " and alpha = " + str(alphaValue))
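# Example invocation (hypothetical values): with featureSize = 400 the sqrt
# heuristic above yields a single hidden layer of int(sqrt(400)) + 1 = 21
# units.
#
#     runWithBalancingAndIGR(featureSize=400, alphaValue=1e-4)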
def testCustomUnderfitting():
    #####################
    # Part 1. Balance the dataset
    #####################
    X, y = common.loadTrainingDataSet()
    #xBalanced, yBalanced = data_balancing.balanceDatasetWithRandomOversampling(xRawData, yRawData)
    #tuneFeatureCountWithChiSquared(xBalanced, yBalanced)
    #tuneFeatureCountWithTruncatedSVD(X, y)

    datasets = testMultiModel(X, y, 9)
    for modelNum in range(9):
        X_sub, y_sub = datasets[modelNum]
        reducer = feature_reduction.getChiSquared(X_sub, y_sub, 300)
        X_sub_new = feature_reduction.transform(reducer, X_sub)
        y_sub_new = np.array(y_sub, dtype=np.int64)
        modelScore = getAvgF1Score(X_sub_new, y_sub_new)
        print("Model score for model " + str(modelNum) + " = " +
              str(modelScore))
def tuneMultimodelKnnIgr(featureSizes, kValues):
    """Grid search over IGR feature sizes and KNN neighbor counts using
    5-fold cross validation of an 11-model undersampled ensemble."""
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for featureSize in featureSizes:
        for kValue in kValues:
            scoreMap[(featureSize, kValue)] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0
    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        # Fit the reducer once per fold; resize() is assumed to narrow the
        # fitted ranking, so no refit is needed per feature size.
        reducer = InformationGainReducer()
        reducer.fit(X_train, y_train)

        for featureSize in featureSizes:
            reducer.resize(featureSize)
            X_train_reduced = reducer.transform(X_train).toarray()
            X_test_reduced = reducer.transform(X_test).toarray()

            for kValue in kValues:
                modelList = []
                for modelNum in range(11):
                    rus_rs = 555 + (modelNum * featureSize)
                    rus = RandomUnderSampler(random_state=rus_rs)
                    X_model, y_model = rus.fit_resample(X_train_reduced,
                                                        y_train)
                    clf = KNeighborsClassifier(n_neighbors=kValue,
                                               metric='manhattan')
                    clf.fit(X_model, y_model)
                    modelList.append(clf)
                    print(".", end="")

                output = common.predictCombinedSimple(X_test_reduced,
                                                      modelList)
                combinedModelScore = f1_score(y_test, output)
                scoreMap[(featureSize, kValue)].append(combinedModelScore)
                print()
                print("Done with kValue = " + str(kValue) + " for fold #" +
                      str(foldNumber) + " for feature size = " +
                      str(featureSize) + ". F1 = " + str(combinedModelScore))
            print("Done with fold #" + str(foldNumber) +
                  " for feature size = " + str(featureSize))
        foldNumber += 1

    for featureSize in featureSizes:
        for kValue in kValues:
            meanF1Score = mean(scoreMap[(featureSize, kValue)])
            print("F1 Score for KNN with IGR, K = " + str(kValue) +
                  " and FR size = " + str(featureSize) + " is: " +
                  str(meanF1Score))
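# Example grid (hypothetical values) for the tuner above; each
# (featureSize, kValue) pair is scored as the mean combined-model F1 over
# the five folds:
#
#     tuneMultimodelKnnIgr(featureSizes=[100, 200, 400], kValues=[3, 5, 7])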
def runWithUndersamplingMutualInfo():
    X, y = common.loadTrainingDataSet()
    print("Counter(y) = " + str(Counter(y)))

    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    print("Counter(y_res) = " + str(Counter(y_res)))

    # k is keyword-only in recent scikit-learn releases.
    reducer = SelectKBest(mutual_info_classif, k=300)
    X_new = reducer.fit_transform(X_res, y_res).toarray()
    print("Done with feature selection")
    #reducer = feature_reduction.getChiSquared(X_res, y_res, 1331)
    #featureReducer = SelectKBest(chi2, k=j)
    #featureReducer.fit(X, y)
    #X_new = feature_reduction.transform(reducer, X_res)

    nbClf = NaiveBayesClassifier()
    nbClf.fit(X_new, y_res)

    X_test = common.loadTestDataSet()
    X_test_new = reducer.transform(X_test).toarray()
    testPredictions = nbClf.predict(X_test_new)
    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)
    print("Done!")
def runWithOversampling():
    #####################
    # Part 1. Balance the dataset
    #####################
    xRawData, yRawData = common.loadTrainingDataSet()
    xBalanced, yBalanced = data_balancing.balanceDatasetWithRandomOversampling(
        xRawData, yRawData)

    #####################
    # Part 2. Feature Reduction
    #####################
    featureReducer = SelectKBest(chi2, k=10000)
    featureReducer.fit(xBalanced, yBalanced)
    # toarray() rather than todense(): np.matrix is deprecated in numpy.
    xReduced = featureReducer.transform(xBalanced).toarray()

    nbClassifier = NaiveBayesClassifier()
    nbClassifier.fit(xReduced, yBalanced)

    rawTestData = common.loadTestDataSet()
    reducedTestData = featureReducer.transform(rawTestData).toarray()
    resultsArray = nbClassifier.predict(reducedTestData)
    common.writeResultsFile(resultsArray)
def tuneNaiveBayesIgrFeatureSize(featureSizeList, modelCountList):
    """Cross-validate the undersampled Naive Bayes ensemble over a grid of
    IGR feature sizes and ensemble sizes."""
    X_raw, y = common.loadTrainingDataSet()

    # Note: the reducer is fit on the full training set before the CV split,
    # so feature selection sees the held-out folds; scores may be slightly
    # optimistic compared to fitting inside each fold.
    reducer = InformationGainReducer()
    reducer.fit(X_raw, y)

    for featureSize in featureSizeList:
        reducer.resize(featureSize)
        X = reducer.transform(X_raw).toarray()

        for modelCount in modelCountList:
            kf = KFold(n_splits=5, random_state=42, shuffle=True)
            f1ScoreList = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                modelList = []
                for modelNum in range(modelCount):
                    rs = 42 + modelNum
                    rus = RandomUnderSampler(random_state=rs)
                    X_model, y_model = rus.fit_resample(X_train, y_train)

                    nbClassifier = NaiveBayesClassifier()
                    nbClassifier.fit(X_model, y_model)
                    modelList.append(nbClassifier)

                combinedModelOutput = common.predictCombinedSimple(
                    X_test, modelList)
                combinedModelScore = f1_score(y_test, combinedModelOutput)
                f1ScoreList.append(combinedModelScore)

            print("F1 Score for FR size = " + str(featureSize) +
                  " and model count = " + str(modelCount) + " is: " +
                  str(mean(f1ScoreList)))
def tuneMultimodelIGR(featureSizes):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for featureSize in featureSizes:
        scoreMap[featureSize] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0
    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        reducer = InformationGainReducer()
        reducer.fit(X_train, y_train)

        for featureSize in featureSizes:
            reducer.resize(featureSize)
            X_train_reduced = reducer.transform(X_train).toarray()

            modelList = []
            for modelNum in range(11):
                rus_rs = 555 + modelNum
                rus = RandomUnderSampler(random_state=rus_rs)
                X_model, y_model = rus.fit_resample(X_train_reduced, y_train)

                nbClassifier = NaiveBayesClassifier()
                nbClassifier.fit(X_model, y_model)
                modelList.append(nbClassifier)
                print(".", end="")

            X_test_reduced = reducer.transform(X_test).toarray()
            output = common.predictCombinedSimple(X_test_reduced, modelList)
            combinedModelScore = f1_score(y_test, output)
            scoreMap[featureSize].append(combinedModelScore)
            print()
            print("Done with fold #" + str(foldNumber) +
                  " for feature size = " + str(featureSize) + ". F1 = " +
                  str(combinedModelScore))
        foldNumber += 1

    for featureSize in featureSizes:
        meanF1Score = mean(scoreMap[featureSize])
        print("F1 Score for multi-model Naive Bayes with IGR and FR size = " +
              str(featureSize) + " is: " + str(meanF1Score))
def tuneMultimodelSvm(featureSizes):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for featureSize in featureSizes:
        scoreMap[featureSize] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0
    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        for featureSize in featureSizes:
            # Unlike InformationGainReducer, TruncatedSVD has no resize()
            # equivalent, so it is refit for each feature size in the fold.
            reducer = TruncatedSVD(n_components=featureSize)
            X_train_reduced = reducer.fit_transform(X_train)

            modelList = []
            for modelNum in range(11):
                rus_rs = 555 + (modelNum * featureSize)
                rus = RandomUnderSampler(random_state=rus_rs)
                X_model, y_model = rus.fit_resample(X_train_reduced, y_train)

                clf = SVC(gamma='scale')
                clf.fit(X_model, y_model)
                modelList.append(clf)
                print(".", end="")

            X_test_reduced = reducer.transform(X_test)
            output = common.predictCombinedSimple(X_test_reduced, modelList)
            combinedModelScore = f1_score(y_test, output)
            scoreMap[featureSize].append(combinedModelScore)
            print()
            print("Done with fold #" + str(foldNumber) +
                  " for feature size = " + str(featureSize) + ". F1 = " +
                  str(combinedModelScore))
        foldNumber += 1

    for featureSize in featureSizes:
        meanF1Score = mean(scoreMap[featureSize])
        print("F1 Score for SVM with Truncated SVD and FR size = " +
              str(featureSize) + " is: " + str(meanF1Score))
def tuneBasicDecisionTree():
    # Some setup
    X_raw, y_raw = common.loadTrainingDataSet()
    #xBalanced, yBalanced = data_balancing.balanceDatasetWithRandomOversampling(xRawData, yRawData)
    #myCounter = Counter(yBalanced)
    #print("Finished loading and sampling. Data dist = " + str(myCounter))

    decisionTreeClassifier = DecisionTreeClassifier()
    cvFolds = 5  #constants.crossValidationFoldCount
    cvScores = cross_val_score(estimator=decisionTreeClassifier,
                               X=X_raw,
                               y=y_raw,
                               scoring='f1',
                               cv=cvFolds)
    print("Individual CV scores = " + str(cvScores))
    avg = sum(cvScores) / cvFolds
    print("Cross validation score for decision tree = " + str(avg))
def tuneNaiveBayesFeatureReduction():
    X, y = common.loadTrainingDataSet()
    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X, y)
    print("Counter(y_res) = " + str(Counter(y_res)))

    for j in common.getFeatureCountArray():
        reducer = feature_reduction.getChiSquared(X_res, y_res, j)
        #featureReducer = SelectKBest(chi2, k=j)
        #featureReducer.fit(X, y)
        X_new = feature_reduction.transform(reducer, X_res)
        f1 = getAvgF1Score(X_new, y_res)
        print("J = " + str(j) + ", F1 = " + str(f1))
def tuneNaiveBayesMultiModel(featureSize, modelCount):
    """Cross-validate an ensemble where each model gets its own undersampled
    subset and its own chi-squared feature reducer."""
    X, y = common.loadTrainingDataSet()

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    f1ScoreList = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        modelTransformerList = []
        for modelNum in range(modelCount):
            rs = 42 + modelNum
            rus = RandomUnderSampler(random_state=rs)
            X_model_full, y_model = rus.fit_resample(X_train, y_train)

            # The reducer is fit on this model's own resampled data, so each
            # (classifier, reducer) pair must stay together for prediction.
            reducer = SelectKBest(chi2, k=featureSize)
            X_model = reducer.fit_transform(X_model_full, y_model).toarray()

            nbClassifier = NaiveBayesClassifier()
            nbClassifier.fit(X_model, y_model)
            modelTransformerList.append((nbClassifier, reducer))

        combinedModelOutput = common.predictCombined(X_test,
                                                     modelTransformerList)
        combinedModelScore = f1_score(y_test, combinedModelOutput)
        f1ScoreList.append(combinedModelScore)

    print("F1 Score for FR size = " + str(featureSize) + " is: " +
          str(mean(f1ScoreList)))
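# NOTE (sketch): `common.predictCombined` differs from
# `predictCombinedSimple` in that each classifier is paired with its own
# fitted reducer rather than sharing one up-front transform. The hypothetical
# version below is consistent with its usage in this module; the real helper
# in cs584.project2.common may differ.
def _predictCombinedSketch(X_test, modelTransformerList):
    # Each model sees the test set through its own feature reducer.
    votes = np.stack([
        np.asarray(model.predict(reducer.transform(X_test).toarray()))
        for model, reducer in modelTransformerList
    ])
    # Majority vote over binary (0/1) predictions, as in the simple variant.
    return (votes.sum(axis=0) > len(modelTransformerList) // 2).astype(np.int64)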
def tuneRandomForestDepth(depths):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for depth in depths:
        scoreMap[depth] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0
    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        for depth in depths:
            reducer = SelectKBest(chi2, k=127)
            reducer.fit(X_train, y_train)
            X_train_reduced = reducer.transform(X_train).toarray()

            ss_rs = 42 + (depth * foldNumber)
            smoteSampler = SMOTE(random_state=ss_rs)
            X_model, y_model = smoteSampler.fit_resample(X_train_reduced,
                                                         y_train)

            clf = RandomForestClassifier(max_depth=depth)
            clf.fit(X_model, y_model)

            X_test_reduced = reducer.transform(X_test).toarray()
            output = clf.predict(X_test_reduced)
            combinedModelScore = f1_score(y_test, output)
            scoreMap[depth].append(combinedModelScore)
            print()
            print("Done with RF prediction for fold #" + str(foldNumber) +
                  " for depth = " + str(depth) + ". F1 = " +
                  str(combinedModelScore))
        foldNumber += 1

    for depth in depths:
        meanF1Score = mean(scoreMap[depth])
        print("F1 Score for RF with Chi2 and depth = " + str(depth) +
              " is: " + str(meanF1Score))
def tuneDecisionTreeSmote(featureSizes):
    X_raw, y_raw = common.loadTrainingDataSet()

    scoreMap = dict()
    for featureSize in featureSizes:
        scoreMap[featureSize] = []

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    foldNumber = 0
    for train_index, test_index in kf.split(X_raw):
        X_train, X_test = X_raw[train_index], X_raw[test_index]
        y_train, y_test = y_raw[train_index], y_raw[test_index]

        for featureSize in featureSizes:
            reducer = SelectKBest(chi2, k=featureSize)
            reducer.fit(X_train, y_train)
            X_train_reduced = reducer.transform(X_train).toarray()

            ss_rs = 42 + (featureSize * foldNumber)
            smoteSampler = SMOTE(random_state=ss_rs)
            X_model, y_model = smoteSampler.fit_resample(X_train_reduced,
                                                         y_train)

            dtClassifier = DecisionTreeClassifier(max_depth=10)
            dtClassifier.fit(X_model, y_model)

            X_test_reduced = reducer.transform(X_test).toarray()
            output = dtClassifier.predict(X_test_reduced)
            combinedModelScore = f1_score(y_test, output)
            scoreMap[featureSize].append(combinedModelScore)
            print()
            print("Done with DT prediction for fold #" + str(foldNumber) +
                  " for feature size = " + str(featureSize) + ". F1 = " +
                  str(combinedModelScore))
        foldNumber += 1

    for featureSize in featureSizes:
        meanF1Score = mean(scoreMap[featureSize])
        print("F1 Score for DT with Chi2 and FR size = " + str(featureSize) +
              " is: " + str(meanF1Score))
def runWithMultiModel():
    modelTransformerList = []
    X, y = common.loadTrainingDataSet()
    for modelNum in range(9):
        rs = 42 + modelNum
        rus = RandomUnderSampler(random_state=rs)
        X_model_full, y_model = rus.fit_resample(X, y)

        reducer = SelectKBest(chi2, k=105)
        X_model = reducer.fit_transform(X_model_full, y_model).toarray()

        nbClassifier = NaiveBayesClassifier()
        nbClassifier.fit(X_model, y_model)
        modelTransformerList.append((nbClassifier, reducer))

    X_test = common.loadTestDataSet()
    combinedModelOutput = common.predictCombined(X_test, modelTransformerList)
    common.writeResultsFile(combinedModelOutput)
    print("Done predicting with multi-model.")
def tuneReducedDecisionTreeWithFeatureSizeAndDepth(featureSize, depth):
    X, y = common.loadTrainingDataSet()
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    f1ScoreList = []
    foldNumber = 1
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        rus = RandomUnderSampler(random_state=foldNumber)
        X_model_full, y_model = rus.fit_resample(X_train, y_train)

        reducer = SelectKBest(chi2, k=featureSize)
        X_model1 = reducer.fit_transform(X_model_full, y_model)
        X_model = X_model1.tocsc()
        #reducer = TruncatedSVD(n_components=featureSize, n_iter=7, random_state=42)
        #X_model = reducer.fit_transform(X_train, y_train)

        dtClassifier = DecisionTreeClassifier(
            max_depth=depth,
            class_weight="balanced",
            #min_samples_split=0.01,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.01)
        dtClassifier.fit(X_model, y_model)

        X_model_test = reducer.transform(X_test).tocsr()
        y_pred = dtClassifier.predict(X_model_test)
        #report = classification_report(y_test, y_pred)
        currentF1 = f1_score(y_test, y_pred)
        f1ScoreList.append(currentF1)
        foldNumber += 1

    #print("f1 Score list = " + str(f1ScoreList))
    print("Mean F1 for (featureSize, depth) = (" + str(featureSize) + ", " +
          str(depth) + ") = " + str(mean(f1ScoreList)))
def runBuiltInBernoulli():
    trainingDataMatrix, labelMatrix = common.loadTrainingDataSet()
    predictiveFeatures = feature_reduction.computePredictiveness(
        trainingDataMatrix, labelMatrix)

    bernoulliClf = BernoulliNB(alpha=constants.smoothingConstant,
                               binarize=None,
                               fit_prior=False)
    '''
    for j in range(5, 1001, 5):
        importantFeatures = [element[0] for element in predictiveFeatures[0:j]]
        #print("Important features = " + str(importantFeatures))
        importantFeaturesArray = np.array(importantFeatures)
        reducedDataSet = trainingDataMatrix[:, importantFeaturesArray]
        #print("Reduced data set shape = " + str(reducedDataSet.shape))
        cvScores = cross_val_score(estimator=bernoulliClf,
                                   X=reducedDataSet,
                                   y=labelMatrix,
                                   scoring='f1',
                                   cv=constants.crossValidationFoldCount)
        avg = sum(cvScores) / constants.crossValidationFoldCount
        print("My reducer. Feature Count = " + str(j) + " Avg Score = " + str(avg))
    '''
    # Keep the 205 most predictive feature indices (the commented-out sweep
    # above was used to pick this count).
    importantFeaturesArray = [
        element[0] for element in predictiveFeatures[0:205]
    ]
    reducedTraining = trainingDataMatrix[:, importantFeaturesArray]
    bernoulliClf.fit(reducedTraining, labelMatrix)

    testDataMatrix = common.loadTestDataSet()
    reducedTesting = testDataMatrix[:, importantFeaturesArray]
    testPredictions = bernoulliClf.predict(reducedTesting)
    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)
def tuneReducedDecisionTree():
    X, y = common.loadTrainingDataSet()
    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        totalF1 = 0.0
        numModels = 9
        for modelNum in range(numModels):
            rs = 42 + modelNum
            rus = RandomUnderSampler(random_state=rs)
            X_model_full, y_model = rus.fit_resample(X_train, y_train)

            truncatedSvd = TruncatedSVD(n_components=331,
                                        n_iter=7,
                                        random_state=42)
            X_model = truncatedSvd.fit_transform(X_model_full, y_model)

            dtClassifier = DecisionTreeClassifier(ccp_alpha=0.015)
            dtClassifier.fit(X_model, y_model)

            X_model_test = truncatedSvd.transform(X_test)
            y_pred = dtClassifier.predict(X_model_test)
            #report = classification_report(y_test, y_pred)
            currentF1 = f1_score(y_test, y_pred)
            print("Printing F1 for model #" + str(modelNum) + " = " +
                  str(currentF1))
            #print(str(report))
            totalF1 += currentF1

        avgF1 = totalF1 / numModels
        print("f1 = " + str(avgF1))
'''
Created on Mar 7, 2020

@author: William
'''
import cs584.project2.common as common
import cs584.project2.data_balancing as data_balancing

from collections import Counter
from sklearn import tree

if __name__ == '__main__':
    # Some setup
    xRawData, yRawData = common.loadTrainingDataSet()
    xBalanced, yBalanced = data_balancing.balanceDatasetWithRandomOversampling(
        xRawData, yRawData)
    myCounter = Counter(yBalanced)
    print("Finished loading and sampling. Data dist = " + str(myCounter))

    decisionTreeClassifier = tree.DecisionTreeClassifier()
    decisionTreeClassifier.fit(xBalanced, yBalanced)
def runBernoulliWithChiSquared():
    trainingDataMatrix, labelMatrix = common.loadTrainingDataSet()
    #predictiveFeatures = feature_reduction.computePredictiveness(trainingDataMatrix, labelMatrix)

    bernoulliClf = BernoulliNB(alpha=constants.smoothingConstant,
                               binarize=None,
                               fit_prior=False)
    '''
    maxAvg = 0
    maxK = -1
    for kVal in range(1025, 10000, 50):
        trainingMatrix1 = SelectKBest(chi2, k=kVal).fit_transform(
            trainingDataMatrix, labelMatrix)
        cvScores = cross_val_score(estimator=bernoulliClf, X=trainingMatrix1,
                                   y=labelMatrix, scoring='f1', cv=7)
        avg = sum(cvScores) / 7
        if avg > maxAvg:
            maxAvg = avg
            maxK = kVal
        print("k = " + str(kVal) + ", avg = " + str(avg))
    print("Best value is k = " + str(maxK) + ", " + str(maxAvg))
    '''
    featureReducer = SelectKBest(chi2, k=985)
    featureReducer.fit(trainingDataMatrix, labelMatrix)
    trainingMatrix1 = featureReducer.transform(trainingDataMatrix)

    cvScores = cross_val_score(estimator=bernoulliClf,
                               X=trainingMatrix1,
                               y=labelMatrix,
                               scoring='f1',
                               cv=7)
    avg = sum(cvScores) / 7
    print("k = 985, avg = " + str(avg))

    bernoulliClf.fit(trainingMatrix1, labelMatrix)
    '''
    estimateSet = trainingDataMatrix
    estimatePredictions = bernoulliClf.predict(estimateSet)
    print("estimates = " + str(estimatePredictions))
    results = np.zeros((2, 2), dtype=np.int64)  # np.int is removed in newer numpy
    for i in range(len(trainDrugRecords)):
        actual = trainDrugRecords[i].label
        guess = int(estimatePredictions[i])
        #print("guess = " + str(guess) + ", actual = " + str(actual))
        results[guess, actual] += 1
    print("results = " + str(results))
    '''
    testDataMatrix = common.loadTestDataSet()
    testMatrix1 = featureReducer.transform(testDataMatrix)
    testPredictions = bernoulliClf.predict(testMatrix1)
    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)