Example #1
0
 def knn(self, predictData=None, trainData=None):
     """Leave-one-out cross-validation for the custom KNN classifier.

     Each record of ``trainData`` is classified in turn against all the
     remaining records, and per-iteration metrics are collected.

     Returns:
         tuple of lists: (accuracy, precision, recall, f_score), one
         entry per held-out record.

     NOTE(review): ``predictData`` is normalized below but the parameter
     is immediately rebound inside the loop, so the argument itself is
     never classified — confirm this is intentional.
     """
     h = hp()
     k = knn()
     accuracy = []
     precision = []
     recall = []
     f_score = []
     # Normalize the training set, then apply the same mean/stdDev to the
     # evaluation set (presumably mutated in place — verify in helpers).
     mean, stdDev = h.normalizeData(trainData)
     nn = int(input("Enter the number of closest neighbors to consider: "))
     h.normalizeEvaluationSet(predictData, mean, stdDev)
     for i in range(len(trainData)):
         # Hold out record i as the test point; train on everything else.
         predictData = trainData[i]
         tmp = [lt for j, lt in enumerate(trainData) if j != i]
         td = h.convertToList(tmp)
         k.classify(td, predictData, nn)
         truePositives, trueNegatives, falsePositives, falseNegatives = h.findParams(
             predictData)
         accuracy.append(
             h.findAccuracy(truePositives, trueNegatives, falsePositives,
                            falseNegatives))
         tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                        falsePositives, falseNegatives)
         tmpRecall = h.findRecall(truePositives, trueNegatives,
                                  falsePositives, falseNegatives)
         precision.append(tmpPrecision)
         recall.append(tmpRecall)
         f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
     return accuracy, precision, recall, f_score
Example #2
0
 def bayes_naive(self, predictData, trainData):
     """Leave-one-out cross-validation for the naive Bayes classifier.

     Each record of ``trainData`` is classified against a model trained
     on the remaining records, and per-iteration metrics are collected.

     Returns:
         tuple of lists: (accuracy, precision, recall, f_score), one
         entry per held-out record.

     NOTE(review): the ``predictData`` argument is rebound inside the
     loop and never classified itself — confirm this is intentional.
     """
     h = hp()
     nb = bayes()
     accuracy = []
     precision = []
     recall = []
     f_score = []
     for i in range(len(trainData)):
         # Hold out record i as the test point; train on everything else.
         predictData = trainData[i]
         tmp = [lt for j, lt in enumerate(trainData) if j != i]
         td = h.convertToList(tmp)
         classPriorProbabilities = nb.findClassPriorProbability(td)
         classes = nb.segregateClasses(td)
         occurences, means, stdDev = nb.findDescriptorPosteriorProbabilites(
             classes, td)
         nb.classify(predictData, classPriorProbabilities, occurences,
                     means, stdDev)
         truePositives, trueNegatives, falsePositives, falseNegatives = h.findParams(
             predictData)
         accuracy.append(
             h.findAccuracy(truePositives, trueNegatives, falsePositives,
                            falseNegatives))
         tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                        falsePositives, falseNegatives)
         tmpRecall = h.findRecall(truePositives, trueNegatives,
                                  falsePositives, falseNegatives)
         precision.append(tmpPrecision)
         recall.append(tmpRecall)
         f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
     return accuracy, precision, recall, f_score
Example #3
0
 def knn2(self, predictData, trainData, labels, start=418, end=796):
     """Classify ``predictData`` with scikit-learn's KNeighborsClassifier.

     All folds of ``trainData`` are pooled into one training set, the
     features are standardized with statistics fitted on that training
     set, and each test point's prediction is stored in a dict keyed by
     record id.

     Args:
         predictData: iterable of objects exposing ``.point``.
         trainData: iterable of folds, each containing objects exposing
             ``.point`` and ``.id``.
         labels: mapping from point id to class label.
         start, end: record-id range the predictions are assigned to;
             the defaults keep the original hard-coded 418..795 range.

     Returns:
         dict mapping record id -> predicted label.
     """
     finalAnswer = defaultdict(list)
     testPoints = [point.point for point in predictData]
     trainPoints = []
     trainLabels = []
     for fold in trainData:
         for point in fold:
             trainPoints.append(point.point)
             trainLabels.append(labels[point.id])
     # Standardize with statistics fitted on the training set only, then
     # apply the same transform to the test points.
     std_scale = preprocessing.StandardScaler().fit(trainPoints)
     X_transformed = std_scale.transform(trainPoints)
     newdata_transformed = std_scale.transform(testPoints)
     # 'model' rather than 'knn': the original local shadowed the
     # imported knn classifier class used elsewhere in this file.
     model = KNeighborsClassifier(n_neighbors=9, metric='euclidean')
     model.fit(np.array(X_transformed), np.array(trainLabels))
     y_pred = model.predict(np.array(newdata_transformed))
     for offset, recordId in enumerate(range(start, end)):
         finalAnswer[recordId] = y_pred[offset]
     return finalAnswer
Example #4
0
 def svm(self, predictData, trainData, labels):
     """Ensemble of SVM classifiers, one per training fold, combined by
     majority vote.

     For each fold i an SVC is trained on every other fold (features
     reduced to 25 PCA components first) and predicts labels for
     ``predictData``; the per-fold votes for record ids 418..795 are then
     reduced to the majority class.

     Returns:
         dict mapping record id -> 0 or 1 (majority vote; ties go to 1,
         matching the original expression).
     """
     matrix = defaultdict(list)
     finalAnswer = defaultdict(list)
     pca = PCA(n_components=25)
     # The test feature list is loop-invariant — build it once instead of
     # rebuilding it on every fold iteration.
     testPoints = [point.point for point in predictData]
     for i in range(len(trainData)):
         trainPoints = []
         trainLabels = []
         for j, fold in enumerate(trainData):
             if j != i:
                 for point in fold:
                     trainPoints.append(point.point)
                     trainLabels.append(labels[point.id])
         clf = SVC()
         X_transformed = pca.fit_transform(trainPoints)
         newdata_transformed = pca.transform(testPoints)
         clf.fit(np.array(X_transformed), np.array(trainLabels))
         y_pred = clf.predict(np.array(newdata_transformed))
         # 'd' (as in the sibling logisticRegression method) so the fold
         # counter 'i' is no longer shadowed by the inner loop.
         for offset, d in enumerate(range(418, 796)):
             finalAnswer[d].append(y_pred[offset])
     for key in finalAnswer:
         matrix[key] = 0 if finalAnswer[key].count(0) > finalAnswer[key].count(1) else 1
     return matrix
Example #5
0
 def logisticRegression(self, predictData, trainData, labels):
     """Ensemble of logistic-regression classifiers, one per training
     fold, combined by majority vote.

     For each held-out fold a model is fitted on the remaining folds
     (full-solver PCA first) and votes on every test point; votes for
     record ids 418..795 are reduced to the majority class.

     Returns:
         dict mapping record id -> 0 or 1 (majority vote).
     """
     h = hp()
     finalAnswer = defaultdict(list)
     matrix = defaultdict(list)
     testPoints = []
     for point in predictData:
         testPoints.append(point.point)
     for holdOut in range(len(trainData)):
         foldPoints = []
         foldLabels = []
         for j, fold in enumerate(trainData):
             if j == holdOut:
                 continue
             for point in fold:
                 foldPoints.append(point.point)
                 foldLabels.append(labels[point.id])
         pca = PCA(svd_solver='full')
         trainTransformed = pca.fit_transform(foldPoints)
         testTransformed = pca.transform(testPoints)
         lr = LogisticRegression(solver='sag', max_iter=1500)
         lr.fit(trainTransformed, np.array(foldLabels))
         y_pred = lr.predict(np.array(testTransformed))
         k = 0
         for d in range(418, 796):
             finalAnswer[d].append(y_pred[k])
             k += 1
     for key in finalAnswer:
         votes = finalAnswer[key]
         matrix[key] = 1 if votes.count(1) >= votes.count(0) else 0
     return matrix
Example #6
0
 def knn(self, predictData = None, trainData = None):
     """Classify ``predictData`` with the custom KNN implementation using
     the whole of ``trainData`` (no cross-validation).

     The neighbor count is read interactively.  ``predictData`` is
     presumably annotated in place by ``classify`` — verify in the knn
     class; nothing is returned.
     """
     h = hp()
     k = knn()
     nn = int(input("Enter the number of closest neighbors to consider: "))
     # The old enumerate comprehension kept every element anyway, so a
     # plain copy expresses the intent directly.
     tmp = list(trainData)
     td = h.convertToList(tmp)
     k.classify(td, predictData, nn)
 def findDescriptorPosteriorProbabilites(self, classes, td):
     """Compute per-class descriptor statistics for naive Bayes.

     For every class: the mean/standard deviation of its numeric
     descriptors (via ``standardizeBayes``) and, for each categorical
     value seen in the class, its relative frequency keyed by
     ``(value, class)``.

     Returns:
         tuple: (occurences, mean, stdDeviation).
     """
     occurences = defaultdict(int)
     mean = defaultdict(dict)
     stdDeviation = defaultdict(dict)
     for label, classPoints in classes.items():
         mean[label], stdDeviation[label] = hp().standardizeBayes(classPoints)
         for record in classPoints:
             for position, value in enumerate(record.categoricalData):
                 # Only compute each (value, class) frequency once.
                 if (value, label) not in occurences:
                     occurences[(value, label)] = (
                         self.countOccurence(value, position, classPoints)
                         / len(classPoints))
     return occurences, mean, stdDeviation
Example #8
0
 def bayes_naive(self, predictData, trainData):
     """Train naive Bayes on all of ``trainData`` and classify
     ``predictData``.

     Returns:
         predictData, after ``nb.classify`` has annotated it
         (presumably with predicted labels — verify in bayes.classify).
     """
     h = hp()
     nb = bayes()
     # A plain copy replaces the old enumerate comprehension; the unused
     # 'pd' and 'matrix' locals were removed.
     td = h.convertToList(list(trainData))
     classPriorProbabilities = nb.findClassPriorProbability(td)
     classes = nb.segregateClasses(td)
     occurences, means, stdDev = nb.findDescriptorPosteriorProbabilites(classes, td)
     nb.classify(predictData, classPriorProbabilities, occurences, means, stdDev)
     return predictData
Example #9
0
 def knnDemo(self, predictData=None, trainData=None):
     """Single-pass KNN demo: classify ``predictData`` against
     ``trainData`` and report the resulting metrics.

     Returns:
         tuple: (accuracy, precision, recall, f_score) as scalars.
     """
     helper = hp()
     classifier = knn()
     neighborCount = int(input("Enter the number of closest neighbors to consider: "))
     classifier.classify(trainData, predictData, neighborCount)
     tp, tn, fp, fn = helper.findParams(predictData)
     accuracy = helper.findAccuracy(tp, tn, fp, fn)
     precision = helper.findPrecision(tp, tn, fp, fn)
     recall = helper.findRecall(tp, tn, fp, fn)
     f_score = helper.findFMeasure(precision, recall)
     return accuracy, precision, recall, f_score
Example #10
0
 def bayes_naive_demo(self, predictData, trainData):
     """Naive Bayes demo: print each class's posterior-times-prior score
     for the test record and announce the winning class.

     Returns nothing; output goes to stdout.
     """
     helper = hp()
     model = bayes()
     priors = model.findClassPriorProbability(trainData)
     grouped = model.segregateClasses(trainData)
     occurences, means, stdDev = model.findDescriptorPosteriorProbabilites(
         grouped, trainData)
     probabilities = model.classify_demo(predictData, priors,
                                         occurences, means, stdDev)
     # Track the best class while printing; -1 is reported if the
     # probability table is empty, matching the original behavior.
     classKey, maxProb = -1, float('-inf')
     for key in probabilities:
         print("P(X|H{})*P(H{}) = {}".format(key, key, probabilities[key]))
         if probabilities[key] > maxProb:
             classKey, maxProb = key, probabilities[key]
     print("This test data record belongs to: Class {}".format(classKey))
Example #11
0
def dt(data, labels, test_features=None):
    """Train and evaluate a decision tree with 10-fold cross validation.

    Rows with NaNs are dropped, training folds are outlier-filtered by
    z-score, and test folds are row-wise z-scored before prediction.

    Returns:
        tuple: (trainAccuracy, testAccuracy, precision, recall, models,
        tree) where ``models`` holds one fitted root per fold and
        ``tree`` is the decisionTree helper instance.
    """
    from helpers import helpers as hp
    from decision_tree import decisionTree
    h = hp()
    # 'tree' rather than 'dt': the original local shadowed this function.
    tree = decisionTree()
    data = pd.concat([data, labels], axis=1)
    data.dropna(inplace=True)
    print(data.head())
    trainAccuracy = []
    testAccuracy = []
    precision = []
    recall = []
    f_score = []
    models = []

    foldSize = int(data.shape[0] / 10)
    for i in range(10):
        print("Running iteration " + str(i+1) + " of k cross validation")
        testData = data.loc[foldSize*i:foldSize*(i+1)-1]
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to stitch the two remaining partitions together.
        trainData = pd.concat([data.loc[:foldSize*i-1], data.loc[foldSize*(i+1):]])
        # Drop training rows containing any |z-score| >= 3 (outliers).
        trainData = trainData[(np.abs(stats.zscore(trainData.iloc[:,:-1])) < 3).all(axis=1)]
        root = tree.decision(trainData, depth=10, minLeafRows=5)
        testTarget = testData.iloc[:,-1].values.tolist()
        # Test rows are z-scored per row (axis=1) before prediction.
        testPredicted = tree.predictData(
            pd.DataFrame(stats.zscore(testData.iloc[:,:-1], axis=1),
                         columns=testData.columns.values.tolist()[:-1]), root)
        trainTarget = trainData.iloc[:,-1].values.tolist()
        trainPredicted = tree.predictData(trainData.iloc[:, :-1], root)
        models.append(root)
        truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(trainPredicted, trainTarget)
        trainAccuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
        truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(testPredicted, testTarget)
        testAccuracy.append(h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives))
        tmpPrecision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives)
        tmpRecall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives)
        precision.append(tmpPrecision)
        recall.append(tmpRecall)
        f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
    h.calculateMetrics(testAccuracy, precision, recall, f_score)
    return trainAccuracy, testAccuracy, precision, recall, models, tree
Example #12
0
def rf(data, labels, test_features=None):
    """Random-forest pipeline: 5-fold cross-validation metrics plus a
    bagged prediction for ``test_features``.

    Returns:
        list: majority-vote prediction per row of ``test_features``.
    """
    from random_forest import randomForest
    from helpers import helpers as hp
    from decision_tree import decisionTree
    h = hp()
    # Renamed from 'rf'/'dt': the originals shadowed this function and
    # the sibling dt() function.
    forestBuilder = randomForest()
    tree = decisionTree()

    data = pd.concat([data, labels], axis=1)

    accuracy = []
    precision = []
    recall = []
    f_score = []
    models = []

    foldSize = int(data.shape[0] / 5)
    for i in range(5):
        print("Running iteration " + str(i+1) + " of k cross validation")
        testData = data.loc[foldSize*i:foldSize*(i+1)-1]
        # DataFrame.append was removed in pandas 2.0; use concat instead.
        trainData = pd.concat([data.loc[:foldSize*i-1], data.loc[foldSize*(i+1):]])
        forest = forestBuilder.forest(trainData)
        target = testData.iloc[:,-1].values.tolist()
        predicted = forestBuilder.predictForest(testData.iloc[:, :-1], forest)
        models.append(forest)
        calMetrics(target, predicted)

    # NOTE(review): the metric lists above are never filled (per-fold
    # reporting goes through calMetrics), so this call receives empty
    # lists — confirm calculateMetrics tolerates that.
    h.calculateMetrics(accuracy, precision, recall, f_score)

    # Bagging: each fold's model votes on every test row.
    # NOTE(review): 'root' here is a whole forest returned by
    # randomForest.forest — confirm decisionTree.predictData accepts it.
    predicted = pd.DataFrame()
    for root in models:
        pred = tree.predictData(test_features, root)
        predicted = pd.concat([predicted, pd.DataFrame(pred)], axis=1)

    print(predicted)

    p = []
    for idx, row in predicted.iterrows():
        # value_counts is sorted by frequency, so index 0 is the mode.
        p.append(row.value_counts().index.tolist()[0])

    print(p)

    return p
Example #13
0
    def random_forest(self, kCrossValidation):
        """Run the random-forest classifier end to end.

        Reads a data file interactively, evaluates the forest with
        k-fold cross-validation, and optionally scores a second test
        file by bagging the per-fold forests.  All metrics are printed;
        nothing is returned.

        Args:
            kCrossValidation: number of cross-validation folds.
        """
        print("\nRunning Random Forest Classifier ....................\n")
        from random_forest import randomForest
        h = hp()
        fileName = h.get_fileName()
        filePath = "../Data/" + fileName + ".txt"
        data, labels = h.readData(filePath)
        data = h.oneHotEncoding(data, labels)
        rf = randomForest()

        try:
            numTrees = int(input("\nEnter number of trees: "))
            numFeatures = int(input("Enter number of features to consider: "))
        except ValueError:
            # Narrowed from a bare except: only malformed numeric input
            # should abort here, not KeyboardInterrupt/SystemExit.
            print("\nExecution Failed - Wrong Input")
            exit()

        accuracy = []
        precision = []
        recall = []
        f_score = []
        models = []

        foldSize = int(data.shape[0] / kCrossValidation)
        for i in range(kCrossValidation):
            print("Running iteration " + str(i + 1) +
                  " of k cross validation .....")
            testData = data.loc[foldSize * i:foldSize * (i + 1) - 1]
            # DataFrame.append was removed in pandas 2.0; concat the two
            # remaining partitions instead.
            trainData = pd.concat(
                [data.loc[:foldSize * i - 1], data.loc[foldSize * (i + 1):]])
            forest = rf.forest(trainData,
                               numTrees=numTrees,
                               numFeatures=numFeatures)
            target = testData.iloc[:, -1].values.tolist()
            predicted = rf.predictForest(testData.iloc[:, :-1], forest)
            models.append(forest)
            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(
                predicted, target)
            accuracy.append(
                h.findAccuracy(truePositives, trueNegatives, falsePositives,
                               falseNegatives))
            tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                           falsePositives, falseNegatives)
            tmpRecall = h.findRecall(truePositives, trueNegatives,
                                     falsePositives, falseNegatives)
            precision.append(tmpPrecision)
            recall.append(tmpRecall)
            f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))

        print("\nMetrics on train data with k-cross validation")
        h.calculateMetrics(accuracy, precision, recall, f_score)

        fileName = input(
            "\nEnter test data file name without extension (if no test file, just press enter): "
        )
        if fileName != '':
            filePath = "../Data/" + fileName + ".txt"
            testData, testLabels = h.readData(filePath)
            testData = h.oneHotEncoding(testData, testLabels)
            predLabels = []
            for forest in models:
                predLabels.append(rf.predictForest(testData, forest))
            predLabels = pd.DataFrame(predLabels)
            pred = []
            # iteritems was removed in pandas 2.0; items is the
            # replacement for column-wise iteration.
            for _, colData in predLabels.items():
                pred.append(colData.value_counts().index[0])
            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(
                pred, testData.iloc[:, -1].values.tolist())
            accuracy = [
                h.findAccuracy(truePositives, trueNegatives, falsePositives,
                               falseNegatives)
            ]
            precision = h.findPrecision(truePositives, trueNegatives,
                                        falsePositives, falseNegatives)
            recall = h.findRecall(truePositives, trueNegatives, falsePositives,
                                  falseNegatives)
            f_score = [h.findFMeasure(precision, recall)]
            print("\nMetrics on test data with bagging")
            h.calculateMetrics(accuracy, [precision], [recall], f_score)
Example #14
0
    def decision_tree(self, kCrossValidation):
        """Run the decision-tree classifier end to end.

        Reads a data file interactively, evaluates either a single tree
        (kCrossValidation <= 1) or k-fold cross-validation, and
        optionally scores a second test file by bagging the per-fold
        trees.  All metrics are printed; nothing is returned.

        Args:
            kCrossValidation: number of folds; <= 1 means train and
                evaluate on the full data set.
        """
        print("\nRunning Decision Tree Classifier ....................\n")
        from decision_tree import decisionTree
        h = hp()
        fileName = h.get_fileName()
        filePath = "CSE-601/project3/Data/" + fileName + ".txt"
        data, labels = h.readData(filePath)
        data = h.oneHotEncoding(data, labels)
        dt = decisionTree()

        accuracy = []
        precision = []
        recall = []
        f_score = []
        models = []

        if kCrossValidation <= 1:
            root = dt.decision(data)
            print(root)
            target = data.iloc[:, -1]
            predicted = dt.predictData(data.iloc[:, :-1], root)
            # Record the single tree so the bagging section below has a
            # model to vote with (previously 'models' stayed empty here,
            # which crashed max() over an empty prediction list).
            models.append(root)
            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(
                predicted, target)
            accuracy.append(
                h.findAccuracy(truePositives, trueNegatives, falsePositives,
                               falseNegatives))
            tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                           falsePositives, falseNegatives)
            tmpRecall = h.findRecall(truePositives, trueNegatives,
                                     falsePositives, falseNegatives)
            precision.append(tmpPrecision)
            recall.append(tmpRecall)
            f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
        else:
            foldSize = int(data.shape[0] / kCrossValidation)
            for i in range(kCrossValidation):
                print("Running iteration " + str(i + 1) +
                      " of k cross validation .....")
                testData = data.loc[foldSize * i:foldSize * (i + 1) - 1]
                # DataFrame.append was removed in pandas 2.0; concat the
                # two remaining partitions instead.
                trainData = pd.concat(
                    [data.loc[:foldSize * i - 1], data.loc[foldSize * (i + 1):]])
                root = dt.decision(trainData)
                target = testData.iloc[:, -1].values.tolist()

                predicted = dt.predictData(testData.iloc[:, :-1], root)
                models.append(root)
                truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(
                    predicted, target)
                accuracy.append(
                    h.findAccuracy(truePositives, trueNegatives,
                                   falsePositives, falseNegatives))
                tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                               falsePositives, falseNegatives)
                tmpRecall = h.findRecall(truePositives, trueNegatives,
                                         falsePositives, falseNegatives)
                precision.append(tmpPrecision)
                recall.append(tmpRecall)
                f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))

        print("\nMetrics on train data with k-cross validation")
        h.calculateMetrics(accuracy, precision, recall, f_score)

        fileName = input(
            "\nEnter test data file name without extension (if no test file, just press enter): "
        )
        if fileName != '':
            filePath = "../Data/" + fileName + ".txt"
            testData, testLabels = h.readData(filePath)
            testData = h.oneHotEncoding(testData, testLabels)
            predLabels = []
            for _, row in testData.iloc[:, :-1].iterrows():
                # Bagging: every fold's tree votes; take the mode.
                predictedRow = [dt.predictRow(row, root) for root in models]
                predLabels.append(
                    max(set(predictedRow), key=predictedRow.count))
            truePositives, trueNegatives, falsePositives, falseNegatives = h.findParameters(
                predLabels, testData.iloc[:, -1].values.tolist())
            accuracy = [
                h.findAccuracy(truePositives, trueNegatives, falsePositives,
                               falseNegatives)
            ]
            precision = h.findPrecision(truePositives, trueNegatives,
                                        falsePositives, falseNegatives)
            recall = h.findRecall(truePositives, trueNegatives, falsePositives,
                                  falseNegatives)
            f_score = [h.findFMeasure(precision, recall)]
            print("\nMetrics on test data with bagging")
            h.calculateMetrics(accuracy, [precision], [recall], f_score)
Example #15
0
            accuracy = [
                h.findAccuracy(truePositives, trueNegatives, falsePositives,
                               falseNegatives)
            ]
            precision = h.findPrecision(truePositives, trueNegatives,
                                        falsePositives, falseNegatives)
            recall = h.findRecall(truePositives, trueNegatives, falsePositives,
                                  falseNegatives)
            f_score = [h.findFMeasure(precision, recall)]
            print("\nMetrics on test data with bagging")
            h.calculateMetrics(accuracy, [precision], [recall], f_score)


if __name__ == "__main__":
    m = main()
    h = hp()
    algorithm = int(
        input(
            "Enter 0 to run K-Nearest Neighbors in demo mode\nEnter 1 for K-Nearest Neigbour Algorithm\nEnter 2 for Decision Tree Algorithm\nEnter 3 for Naive Bayes Algorithm\nEnter 4 to run Naive Bayes Algorithm in demo mode\nEnter 5 for Random Forest Algorithm\n"
        ))

    if algorithm == 0:
        print("Enter train File name")
        trainData = h.get_file_demo(h.get_fileName())
        print("Enter test File name")
        predictData = h.get_file_demo(h.get_fileName(), fileType='predictData')
        accuracy, precision, recall, f_score = m.knnDemo(
            predictData, trainData)
        h.calculateMetricsDemo(accuracy, precision, recall, f_score)

    elif algorithm == 1: