def runWithOversampling():
    """Fit naive Bayes on an oversampled training set and write test predictions.

    Pipeline: load the training data, balance it through
    ``data_balancing.balanceDatasetWithRandomOversampling``, keep the 10000
    highest-scoring chi-squared features, fit a ``NaiveBayesClassifier`` on
    the densified result, then predict the test set and hand the predictions
    to ``common.writeResultsFile``.
    """
    # Part 1. Balance the dataset.
    rawFeatures, rawLabels = common.loadTrainingDataSet()
    balancedFeatures, balancedLabels = (
        data_balancing.balanceDatasetWithRandomOversampling(
            rawFeatures, rawLabels))

    # Part 2. Feature reduction (selector is fitted on the balanced data only).
    chiSelector = SelectKBest(chi2, k=10000)
    chiSelector.fit(balancedFeatures, balancedLabels)
    denseTraining = chiSelector.transform(balancedFeatures).todense()

    # Part 3. Train the classifier on the reduced, densified features.
    classifier = NaiveBayesClassifier()
    classifier.fit(denseTraining, balancedLabels)

    # Part 4. Push the test set through the same selector and predict.
    denseTesting = chiSelector.transform(common.loadTestDataSet()).todense()
    predictions = classifier.predict(denseTesting)

    common.writeResultsFile(predictions)
def runWithIGR(featureSize, modelCount):
    """Train an ensemble of naive Bayes models on IGR-reduced features.

    The ``InformationGainReducer`` is fitted once on the full training set
    and resized to ``featureSize``. Each of the ``modelCount`` models is then
    trained on a different random undersample of the reduced data (seeds 42,
    43, ...). Test predictions are combined with
    ``common.predictCombinedSimple`` and written via
    ``common.writeResultsFile``.

    Args:
        featureSize: value passed to ``reducer.resize``.
        modelCount: number of undersampled models to train.
    """
    trainFeatures, trainLabels = common.loadTrainingDataSet()

    igReducer = InformationGainReducer()
    igReducer.fit(trainFeatures, trainLabels)
    igReducer.resize(featureSize)
    denseTrain = igReducer.transform(trainFeatures).toarray()

    def trainOnUndersample(seed):
        # One ensemble member: undersample with its own seed, then fit.
        sampler = RandomUnderSampler(random_state=seed)
        sampledFeatures, sampledLabels = sampler.fit_resample(
            denseTrain, trainLabels)
        member = NaiveBayesClassifier()
        member.fit(sampledFeatures, sampledLabels)
        return member

    ensemble = [
        trainOnUndersample(42 + offset) for offset in range(modelCount)
    ]

    denseTest = igReducer.transform(common.loadTestDataSet()).toarray()
    combinedPredictions = common.predictCombinedSimple(denseTest, ensemble)

    common.writeResultsFile(combinedPredictions)
    print("Done predicting with multi-model and IGR.")
def runWithUndersamplingMutualInfo():
    """Undersample, select 300 features by mutual information, fit and predict.

    Loads the training data, balances it with a seeded
    ``RandomUnderSampler``, keeps the 300 best features according to
    ``mutual_info_classif``, fits a ``NaiveBayesClassifier`` and writes the
    test-set predictions through ``common.writeResultsFile``. Class counts
    before and after resampling are printed for inspection.
    """
    X, y = common.loadTrainingDataSet()

    print("Counter(y) = " + str(Counter(y)))

    rus = RandomUnderSampler(random_state=42)
    X_res, y_res = rus.fit_resample(X, y)

    print("Counter(y_res) = " + str(Counter(y_res)))

    # `k` must be passed by keyword: it is keyword-only in current
    # scikit-learn, so the positional form raises TypeError.
    reducer = SelectKBest(mutual_info_classif, k=300)
    X_new = reducer.fit_transform(X_res, y_res).toarray()

    print("Done with feature selection")

    nbClf = NaiveBayesClassifier()
    nbClf.fit(X_new, y_res)

    # The test set goes through the selector fitted on the resampled data.
    X_test = common.loadTestDataSet()
    X_test_new = reducer.transform(X_test).toarray()
    testPredictions = nbClf.predict(X_test_new)

    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)
    print("Done!")
def runWithBalancingAndIGR(featureSize, alphaValue):
    """Oversample, reduce with IGR, then train and apply an MLP classifier.

    The training set is balanced with ``FeatureIndependentOversampler``,
    shuffled with a fixed seed, and reduced to ``featureSize`` features by an
    ``InformationGainReducer``. An ``MLPClassifier`` with a single hidden
    layer of ``int(sqrt(featureSize)) + 1`` units is fitted and its test-set
    predictions are written via ``common.writeResultsFile``.

    Args:
        featureSize: value passed to ``reducer.resize``; also determines the
            hidden layer width.
        alphaValue: ``alpha`` argument forwarded to ``MLPClassifier``.
    """
    X_model_full_imbalanced, y_model_imbalanced = common.loadTrainingDataSet()

    balancer = FeatureIndependentOversampler(random_state=42)
    X_model_full_raw, y_model_raw = balancer.fit_transform(
        X_model_full_imbalanced, y_model_imbalanced)

    X_model_full, y_model = shuffle(X_model_full_raw,
                                    y_model_raw,
                                    random_state=42)

    reducer = InformationGainReducer()
    reducer.fit(X_model_full, y_model)

    reducer.resize(featureSize)
    # Use toarray() rather than todense(): todense() yields an np.matrix,
    # which recent scikit-learn estimators refuse to accept.
    X_model = reducer.transform(X_model_full).toarray()

    hiddenLayerSizes = (int(math.sqrt(featureSize)) + 1, )
    mc = MLPClassifier(solver='lbfgs',
                       alpha=alphaValue,
                       hidden_layer_sizes=hiddenLayerSizes)
    mc.fit(X_model, y_model)

    # Give the test data the same dense representation as the training data.
    X_test_full = common.loadTestDataSet()
    X_test = reducer.transform(X_test_full).toarray()

    output = mc.predict(X_test)
    common.writeResultsFile(output)

    print("Done estimating with neural network for feature size = " +
          str(featureSize) + " and alpha = " + str(alphaValue))
def runWithMultiModel():
    """Train nine undersampled naive Bayes models, each with its own selector.

    Every model gets its own random undersample (seeds 42 through 50) and its
    own chi-squared ``SelectKBest`` with k=105 fitted on that sample. The
    (classifier, selector) pairs are passed to ``common.predictCombined``
    along with the raw test set, and the combined output is written via
    ``common.writeResultsFile``.
    """
    fittedPairs = []
    trainFeatures, trainLabels = common.loadTrainingDataSet()

    for seed in range(42, 42 + 9):
        sampler = RandomUnderSampler(random_state=seed)
        sampledFeatures, sampledLabels = sampler.fit_resample(
            trainFeatures, trainLabels)

        # The selector is fitted per sample, so each model may see a
        # different feature subset.
        selector = SelectKBest(chi2, k=105)
        sparseReduced = selector.fit_transform(sampledFeatures, sampledLabels)
        denseReduced = sparseReduced.toarray()

        model = NaiveBayesClassifier()
        model.fit(denseReduced, sampledLabels)

        fittedPairs.append((model, selector))

    testFeatures = common.loadTestDataSet()
    combinedPredictions = common.predictCombined(testFeatures, fittedPairs)
    common.writeResultsFile(combinedPredictions)
    print("Done predicting with multi-model.")
def runBuiltInBernoulli():
    """Fit scikit-learn's BernoulliNB on the 205 top-ranked features.

    Features are ranked by ``feature_reduction.computePredictiveness``; the
    first element of each of the first 205 ranking entries is used as a
    column index into both the training and the test matrix. Predictions are
    printed and written via ``common.writeResultsFile``.
    """
    # Number of top-ranked features kept.
    # NOTE(review): presumably chosen from an earlier cross-validation sweep
    # over feature counts; confirm before changing.
    selectedFeatureCount = 205

    trainingDataMatrix, labelMatrix = common.loadTrainingDataSet()

    predictiveFeatures = feature_reduction.computePredictiveness(
        trainingDataMatrix, labelMatrix)

    # binarize=None: inputs are used as-is, with no thresholding.
    # fit_prior=False: a uniform class prior is used.
    bernoulliClf = BernoulliNB(alpha=constants.smoothingConstant,
                               binarize=None,
                               fit_prior=False)

    importantFeaturesArray = [
        element[0] for element in predictiveFeatures[0:selectedFeatureCount]
    ]
    reducedTraining = trainingDataMatrix[:, importantFeaturesArray]

    bernoulliClf.fit(reducedTraining, labelMatrix)

    # Select the same columns from the test matrix before predicting.
    testDataMatrix = common.loadTestDataSet()
    reducedTesting = testDataMatrix[:, importantFeaturesArray]
    testPredictions = bernoulliClf.predict(reducedTesting)

    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)
def runBernoulliWithChiSquared():
    """Fit BernoulliNB on the 985 best chi-squared features and predict.

    A chi-squared ``SelectKBest`` is fitted on the full training set. The
    reduced training matrix is first scored with 7-fold cross-validation
    (F1), and the average is printed. The classifier is then fitted on all
    reduced training data and its test-set predictions are printed and
    written via ``common.writeResultsFile``.
    """
    # Single source of truth for the values that the call sites and the log
    # message previously each hard-coded separately.
    selectedFeatureCount = 985
    foldCount = 7

    trainingDataMatrix, labelMatrix = common.loadTrainingDataSet()

    # binarize=None: inputs are used as-is, with no thresholding.
    # fit_prior=False: a uniform class prior is used.
    bernoulliClf = BernoulliNB(alpha=constants.smoothingConstant,
                               binarize=None,
                               fit_prior=False)

    featureReducer = SelectKBest(chi2, k=selectedFeatureCount)
    featureReducer.fit(trainingDataMatrix, labelMatrix)

    trainingMatrix1 = featureReducer.transform(trainingDataMatrix)

    cvScores = cross_val_score(estimator=bernoulliClf,
                               X=trainingMatrix1,
                               y=labelMatrix,
                               scoring='f1',
                               cv=foldCount)
    # Divide by the number of scores actually returned so the average stays
    # correct if the fold count changes.
    avg = sum(cvScores) / len(cvScores)
    print("k = " + str(selectedFeatureCount) + ", avg = " + str(avg))

    bernoulliClf.fit(trainingMatrix1, labelMatrix)

    # The test set goes through the selector fitted on the training data.
    testDataMatrix = common.loadTestDataSet()
    testMatrix1 = featureReducer.transform(testDataMatrix)
    testPredictions = bernoulliClf.predict(testMatrix1)

    print("Test predictions shape = " + str(testPredictions.shape))
    print("Test Estimates = " + str(testPredictions))
    common.writeResultsFile(testPredictions)