Example #1
File: UI.py Project: SabaAlex/AI
    # assumes module-level imports: from copy import deepcopy; import matplotlib.pyplot
    @staticmethod
    def __process(nrOfIterations, learningRate, hiddenNeuronsNumber, aConst):
        dataset = ProblemData("resources/data.data")
        trainX, trainY, testX, testY = dataset.splitData()

        neuralNetwork = ANN(deepcopy(trainX), deepcopy(trainY), learningRate,
                            hiddenNeuronsNumber, aConst)

        # train: one full forward and backward pass per iteration
        iterations = []
        for i in range(nrOfIterations):
            neuralNetwork.feedForward()
            neuralNetwork.backProp()
            iterations.append(i)

        # evaluate: compare the network's prediction with the real label for each test sample
        for i in range(len(testX)):
            predictedOut = neuralNetwork.getOutput(testX[i])
            print("Predicted output: {0}\nReal value: {1}".format(
                predictedOut, testY[i]))

        matplotlib.pyplot.plot(iterations,
                               neuralNetwork.getLoss(),
                               label='loss value vs iteration')
        matplotlib.pyplot.xlabel('Iterations')
        matplotlib.pyplot.ylabel('Loss function')
        matplotlib.pyplot.legend()
        matplotlib.pyplot.show()
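
ANN and ProblemData above are project classes. As a rough illustration of the interface the method relies on (feedForward, backProp, getOutput, getLoss), a minimal one-hidden-layer network with sigmoid units and MSE loss could look like the sketch below; the class name matches the call sites, but the weight shapes, activation, and loss are assumptions, not the project's actual implementation.

# Hypothetical sketch of the ANN interface used above (assumed: sigmoid units, MSE loss).
import numpy as np

class ANN:
    def __init__(self, trainX, trainY, learningRate, hiddenNeuronsNumber, aConst):
        self.x = np.asarray(trainX, dtype=float)
        self.y = np.asarray(trainY, dtype=float).reshape(-1, 1)  # assumes 1-D labels
        self.lr = learningRate
        self.a = aConst  # assumed: steepness constant of the sigmoid
        self.w1 = np.random.randn(self.x.shape[1], hiddenNeuronsNumber) * 0.1
        self.w2 = np.random.randn(hiddenNeuronsNumber, 1) * 0.1
        self.lossHistory = []

    def _sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-self.a * z))

    def feedForward(self):
        self.hidden = self._sigmoid(self.x @ self.w1)
        self.out = self._sigmoid(self.hidden @ self.w2)
        self.lossHistory.append(float(np.mean((self.y - self.out) ** 2)))

    def backProp(self):
        # gradient of the MSE loss through the sigmoid output and hidden layers
        dOut = (self.out - self.y) * self.a * self.out * (1 - self.out)
        dHidden = (dOut @ self.w2.T) * self.a * self.hidden * (1 - self.hidden)
        self.w2 -= self.lr * self.hidden.T @ dOut
        self.w1 -= self.lr * self.x.T @ dHidden

    def getOutput(self, sample):
        hidden = self._sigmoid(np.asarray(sample, dtype=float) @ self.w1)
        return self._sigmoid(hidden @ self.w2)

    def getLoss(self):
        # one loss value per feedForward call, so it lines up with the iterations list
        return self.lossHistory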
# assumes the surrounding script's imports (matplotlib.pyplot as plt, sklearn's PCA,
# GridSearchCV, RandomForestClassifier, KNeighborsClassifier, svm) and its config
# globals (inputFileName, sampleSize, testPartitionSize, etc.)
def main():

    # loading the data as prblmData
    prblmData = ProblemData(defaultSignalValue=defaultSignalValue,
                            numNodes=numNodes)
    prblmData = prblmData.loadData(useStoredData=useStoredData,
                                   inputFileName=inputFileName,
                                   storeReadData=storeReadData,
                                   storeDataName=storeDataName,
                                   rowReadUntil=readDataUntilRow)

    # partition the data into train and test sets with corresponding labels, as dataPar
    dataPar = DataPartition()
    dataPar = dataPar.makeTrainTest(prblmData=prblmData,
                                    readSampleSize=sampleSize,
                                    testPartitionSize=testPartitionSize,
                                    randomState=0,
                                    doNormalize=False,
                                    useSubSample=useSubSample,
                                    storeSubSample=True,
                                    subSamplePcklName=subSamplePcklName)

    # the same partition, but normalized, as dataParNormal
    dataParNormal = DataPartition()
    dataParNormal = dataParNormal.makeTrainTest(
        prblmData=prblmData,
        readSampleSize=sampleSize,
        testPartitionSize=testPartitionSize,
        randomState=0,
        doNormalize=True,
        useSubSample=useSubSample,
        storeSubSample=True,
        subSamplePcklName=subSamplePcklName)
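
    # For orientation: DataPartition.makeTrainTest is project code, but with
    # doNormalize=True it presumably behaves like the scikit-learn sketch below
    # (illustrative names only; the scaler is fitted on the training portion,
    # so no test-set statistics leak into the normalization). Never called here.
    def _sketchMakeTrainTest(features, labels, testSize, randomState=0):
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import MinMaxScaler
        fTrain, fTest, lTrain, lTest = train_test_split(
            features, labels, test_size=testSize, random_state=randomState)
        scaler = MinMaxScaler().fit(fTrain)  # fit on the training data only
        return scaler.transform(fTrain), scaler.transform(fTest), lTrain, lTest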

    # feature reduction of the normalized data by PCA (pcaNcomponent dimensions) as dataParPca
    pcaNcomponent = 10
    pcaObj = PCA(n_components=pcaNcomponent)
    dataParPca = DataPartition()
    # fit the PCA on the training set only, then apply the same projection to the test set
    dataParPca.fVecTrain = pcaObj.fit_transform(dataParNormal.fVecTrain)
    dataParPca.fVecTest = pcaObj.transform(dataParNormal.fVecTest)
    dataParPca.labelTrain = dataParNormal.labelTrain
    dataParPca.labelTest = dataParNormal.labelTest
    dataParPca.isNormalized = True

    # plotting the data by the first and second principal components of the applied PCA
    plt.plot(dataParPca.fVecTrain[:, 0], dataParPca.fVecTrain[:, 1], 'b.')
    plt.title('2D PCA')
    plt.show()
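
    # quick sanity check on pcaNcomponent: cumulative share of the variance
    # captured by the retained components (standard attribute of a fitted PCA)
    print(pcaObj.explained_variance_ratio_.cumsum())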

    # Three classifiers (random forest, KNN, SVM) with hyperparameters (tuned by cross validation) are defined below:
    # the cv parameter gives the number of folds for the hyperparameter-tuning cross validation
    # n_jobs = -1 runs the work on all available cores

    # Random Forest:
    # n_estimators: number of trees in the forest
    # criterion: measures the quality of a split. 'gini' for the Gini impurity, 'entropy' for the information gain
    # max_features: number of features to consider when looking for the best split
    paramGrid_rf = {
        'n_estimators': [5, 10, 17, 30],
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 0.01, 0.1, 0.9],  # 'auto' was removed in newer scikit-learn; 'sqrt' is its classifier equivalent
        'n_jobs': [-1]
    }
    #paramGrid_rf = {'n_estimators': [30] ,'criterion': ['gini'] ,'n_jobs': [-1]} # n_jobs => runs in parallel
    clf_rf = GridSearchCV(RandomForestClassifier(), paramGrid_rf,
                          cv=3)  # ,scoring='%s_macro' % score

    # KNN:
    # n_neighbors: number of neighbours to consider
    # weights: how to weight the neighbors' labels: 'uniform', or 'distance' (weight by the inverse of the neighbor's distance)
    # metric: how to measure distance: 'minkowski', 'euclidean' or 'manhattan'
    paramGrid_knn = {
        'n_neighbors': [3, 5, 9, 15],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'euclidean', 'manhattan']
    }
    #paramGrid_knn = {'n_neighbors': [5, 9], 'weights': ['distance'],'metric': ['manhattan']}
    clf_knn = GridSearchCV(KNeighborsClassifier(algorithm='kd_tree',
                                                n_jobs=-1),
                           paramGrid_knn,
                           cv=3)  #, verbose=10  # ,scoring='%s_macro' % score

    # SVM:
    # C: penalty parameter for misclassification
    # kernel: 'linear', 'poly' or 'rbf' // rbf is more time consuming but seems better suited to the problem
    # gamma: kernel coefficient; how far the influence of a single training example reaches [low: far / high: close]
    param_grid_svm = {'C': [0.1], 'kernel': ['rbf'], 'gamma': [0.01]}
    # param_grid_svm = {'C': [0.01, 0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf'], 'gamma': [0.001, 0.01, 0.1, 1]} # rbf is time consuming compared to the others
    clf_svm = GridSearchCV(
        svm.SVC(), param_grid_svm,
        cv=3)  # verbose=10 would log the result of each cv fold
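
    # Each GridSearchCV above is itself an estimator: calling .fit() runs the
    # 3-fold search over its grid and refits the best model, after which the
    # standard attributes best_params_, best_score_ and best_estimator_ are
    # available, e.g. (the loop below does this through RunClassifier):
    #   clf_svm.fit(dataParNormal.fVecTrain, dataParNormal.labelTrain)
    #   print(clf_svm.best_params_, clf_svm.best_score_)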

    clfNames = ['random_forest', 'knn', 'svm']
    dataTypes = ['original', 'normalized', 'normalized_PCA']

    for idx, clf in enumerate([clf_rf, clf_knn, clf_svm]):
        for idx2, datap in enumerate([dataPar, dataParNormal, dataParPca]):
            runCl = RunClassifier()
            prediction, accuracy, conf_matrix, bestParams = runCl.doClassification(
                clf,
                datap.fVecTrain,
                datap.fVecTest,
                datap.labelTrain,
                datap.labelTest,
                showPlot=True,
                savePickleModel=savePickleModel,
                clfName=clfNames[idx],
                dataType=dataTypes[idx2])
            print('\n+++++++++++++++++++\n')
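
RunClassifier.doClassification is project code; judging from the four values unpacked above, its core presumably resembles the sketch below (the name and signature are assumptions; accuracy_score and confusion_matrix are standard sklearn.metrics functions, and best_params_ is the standard GridSearchCV attribute).

# Hypothetical core of RunClassifier.doClassification (assumed, not the project's code).
from sklearn.metrics import accuracy_score, confusion_matrix

def doClassificationSketch(clf, fVecTrain, fVecTest, labelTrain, labelTest):
    clf.fit(fVecTrain, labelTrain)      # runs the grid search and refits the best model
    prediction = clf.predict(fVecTest)  # predicts with the refit best estimator
    accuracy = accuracy_score(labelTest, prediction)
    conf_matrix = confusion_matrix(labelTest, prediction)
    return prediction, accuracy, conf_matrix, clf.best_params_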