Example no. 1
# library imports implied by the code below; _MDF and _PD are project-local
# helper modules whose import lines are omitted here
import pandas as pd
import seaborn as seab
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def makePCA(fn, validExcept, rows, ftype, fcols, n_cols, isTrain, target,
            exceptCols, comp, exva, mean, exceptTargetForPCA, useLog,
            logConstant):
    print('')
    print('+=======================+')
    print('|  Function : makePCA   |')
    print('+=======================+')

    # get dataFrame
    (dataSetDF, targetCol) = _MDF.makeDataFrame(fn, validExcept, rows, ftype,
                                                fcols, isTrain, target,
                                                exceptCols, useLog,
                                                logConstant)
    DFtoFindPCA = dataSetDF  # dataFrame to find PCA

    # remove target column when exceptTargetForPCA is True
    if exceptTargetForPCA:
        newDataSetDF = dataSetDF.drop([target], axis='columns')

        # print newDataSetDF
        print('\n<<< [8] newDataSetDF.columns >>>')
        print(newDataSetDF.columns)
        print('\n<<< [9] newDataSetDF >>>')
        print(newDataSetDF)

        DFtoFindPCA = newDataSetDF

    # display correlation
    # https://seaborn.pydata.org/generated/seaborn.clustermap.html
    df = DFtoFindPCA.corr()  # get correlation
    seab.clustermap(df, annot=True, cmap='RdYlBu_r', vmin=-1, vmax=1)
    plt.show()

    # to standard normal distribution
    scaled = StandardScaler().fit_transform(DFtoFindPCA)
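    # note: the scaler is refit on whatever data is passed in, so when an
    # existing PCA is reused (test path below) the test set is standardized
    # with its own statistics rather than the training statistics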

    # PCA
    # https://medium.com/@john_analyst/pca-%EC%B0%A8%EC%9B%90-%EC%B6%95%EC%86%8C-%EB%9E%80-3339aed5afa1
    initializePCA = False  # initializing PCA?

    # create a new PCA if no existing components were passed in
    if comp is None or exva is None or mean is None:
        pca = PCA(n_components=n_cols)
        pca.fit(scaled)

        # get components and explained variances of PCA
        comp = pca.components_
        exva = pca.explained_variance_
        mean = pca.mean_

        initializePCA = True

    # https://machinelearningmastery.com/calculate-principal-component-analysis-scratch-python/
    # print pca.components_ and pca.explained_variance_
    print('\n<<< [10] pca.components_ >>>\n' + str(comp))
    print('\n<<< [11] pca.explained_variance_ >>>\n' + str(exva))
    print('\n<<< [12] pca.mean_ >>>\n' + str(mean))

    # create PCA using comp and exva
    if not initializePCA:
        pca = PCA(n_components=n_cols)
        pca.components_ = comp
        pca.explained_variance_ = exva
        pca.mean_ = mean
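        # assigning components_, explained_variance_ and mean_ reconstructs a
        # fitted sklearn PCA: transform() only needs components_ and mean_
        # (explained_variance_ matters only when whiten=True)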

    # apply PCA to the data
    scaledPCA = pca.transform(scaled)

    print('\n<<< [13] scaledPCA.shape >>>\n' + str(scaledPCA.shape))
    print('\n<<< [14] scaledPCA.data.shape >>>\n' + str(scaledPCA.data.shape))

    print('\n<<< [15] scaledPCA >>>')
    print(scaledPCA)

    # for training data:
    # re-index the target column (original row IDs -> consecutive IDs
    # after excluding the validation rows)
    if isTrain:
        print('\n<<< [15-1] dataSetDF[target] before >>>')
        print(dataSetDF[target])

        # rebuild the target column with a fresh 0-based index
        targetDF = pd.DataFrame(list(dataSetDF[target]))

        print('\n<<< [15-2] dataSetDF[target] after : targetDF >>>')
        print(targetDF)

    # name each column of the PCA-transformed data
    pca_cols = ['pca' + str(i) for i in range(n_cols)]
    df_pca = pd.DataFrame(scaledPCA, columns=pca_cols)
    if isTrain:
        df_pca['target'] = targetDF

    print('\n<<< [16] df_pca >>>')
    print(df_pca)

    df_pcaCorr = df_pca.corr()
    seab.clustermap(df_pcaCorr, annot=True, cmap='RdYlBu_r', vmin=-1, vmax=1)
    plt.show()

    # print data as 2D or 3D space (run only on training data)
    if isTrain:
        _PD.printDataAsSpace(n_cols, df_pca, '(PCA) training data')

    print('')
    print('+=======================+')
    print('|    Exit : makePCA     |')
    print('+=======================+')

    return (df_pca, comp, exva, mean, targetCol)
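
A minimal usage sketch, assuming makePCA is exposed as _PCA.makePCA (as in Example no. 2); the file names, the 'csv' ftype value and the trainCols/testCols lists are hypothetical:

# hypothetical call pattern: fit PCA on the training file, then reuse the
# returned comp/exva/mean to project the test file onto the same axes
(df_pca_train, comp, exva, mean, targetCol) = _PCA.makePCA(
    'train.csv', None, None, 'csv', trainCols, 3, True, 'target',
    [], None, None, None, False, False, 10000000)
(df_pca_test, _, _, _, _) = _PCA.makePCA(
    'test.csv', None, None, 'csv', testCols, 3, False, None,
    [], comp, exva, mean, False, False, 10000000)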
Example no. 2
# library imports implied by the code below; _PCA, _DF, _KNN, _DT, _XB, _TV,
# _CV, DLmain and AImain are project-local helper modules whose import lines
# are omitted here
import random

import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


def executeAlgorithm(
    finalResult,
    trainName,
    testName,
    ftype,
    fcolsTrain,
    fcolsTest,
    targetColName,
    exceptCols,  # basic configuration
    globalValidationRate,
    method,
    usePCA,
    validationExceptCols,
    validationCol,  # important configuration
    PCAdimen,
    exceptTargetForPCA,
    useLog=False,
    logConstant=10000000,  # PCA and log
    kNN_k=None,
    kNN_useAverage=None,
    kNN_useCaseWeight=None,
    kNN_weight=None,  # for kNN (method 0)
    DT_maxDepth=None,
    DT_criterion=None,
    DT_splitter=None,
    DT_numericRange=None,  # for Decision Tree (method 1)
    specificCol=None,  # text column for method 2 (used below; missing from the original signature)
    exceptColsForMethod2=None,  # for method 2
    XG_xgBoostLevel=0,  # xgBoost mode (used below; missing from the original signature)
    XG_info=None,  # xgBoost (for methods 3 and 4)
    DL_normalizeTarget=False):  # for Deep Learning (methods 5 and 6)

    #################################
    ###                           ###
    ### split data into train/val ###
    ###                           ###
    #################################

    trainValid_trainRows = None  # rows for training (for train-valid mode where globalValidationRate > 0)
    trainValid_validRows = None  # rows for validation (for train-valid mode where globalValidationRate > 0)

    # validation mode & method <= 4 (non-deep-learning methods, which set finalResult)
    if globalValidationRate > 0 and method <= 4:
        testName = trainName  # validation rows are read from the training data file

        # get the number of rows of training data
        with open(trainName) as ft:
            ftrows = len(ft.readlines())

        print('ftrows = ' + str(ftrows))

        # specify rows for training and test
        valid = []
        for i in range(ftrows):
            valid.append(0)

        while sum(valid) < ftrows * globalValidationRate:
            valid[random.randint(0, ftrows - 1)] = 1
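        # note: this flips random rows to 1 until the validation fraction is
        # reached; random.sample(range(ftrows), k) would be an equivalent,
        # more direct way to pick the validation row indices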

        # update trainValid_trainRows and trainValid_validRows
        trainValid_trainRows = []
        trainValid_validRows = []

        for i in range(ftrows):
            if valid[i] == 1:
                trainValid_validRows.append(i)
            else:
                trainValid_trainRows.append(i)

    #################################
    ###                           ###
    ###      model execution      ###
    ###                           ###
    #################################

    # copy the training column list
    fcolsTrain_ = list(fcolsTrain)

    # methods 0, 1, 3 and 5 -> may use PCA (depending on usePCA)
    if method in (0, 1, 3, 5):

        # set the target column name
        targetColumn = 'target' if usePCA else targetColName

        # get PCA (components and explained variances) for training data
        # df_pca_train: dataFrame with columns including target column [pca0 pca1 ... pcaN target]
        if usePCA:

            # obtain training data
            print('')
            print('+-----------------------+')
            print('|     Training Data     |')
            print('+-----------------------+')
            (df_pca_train, comp, exva, mean, targetCol) = _PCA.makePCA(
                trainName, None, trainValid_trainRows, ftype, fcolsTrain,
                PCAdimen, True, targetColName, exceptCols, None, None, None,
                exceptTargetForPCA, useLog, logConstant)

            # remove target column from comp and mean
            if not exceptTargetForPCA:
                comp = np.delete(comp, [targetCol], 1)
                mean = np.delete(mean, [targetCol], 0)
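                # np.delete drops the target's column from the component
                # matrix (axis 1) and its entry from the mean vector (axis 0),
                # so a PCA fitted with the target can still transform test
                # rows that lack it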

            # get PCA (components and explained variances) for test data
            # df_pca_test: dataFrame with all columns except the target column [pca0 pca1 ... pcaN]

            # normal mode
            if globalValidationRate == 0 or method == 5:
                print('')
                print('+-----------------------+')
                print('|       Test Data       |')
                print('+-----------------------+')
                (df_pca_test, noUse0, noUse1, noUse2,
                 noUse3) = _PCA.makePCA(testName, None, None, ftype, fcolsTest,
                                        PCAdimen, False, None, exceptCols,
                                        comp, exva, mean, False, useLog,
                                        logConstant)

            # validation mode (use fcolsTrain_ because using training data file)
            else:
                print('')
                print('+-----------------------+')
                print('|    Validation Data    |')
                print('+-----------------------+')
                (df_pca_test, noUse0, noUse1, noUse2, noUse3) = _PCA.makePCA(
                    trainName, validationExceptCols, trainValid_validRows,
                    ftype, fcolsTrain_, PCAdimen, False, None, exceptCols,
                    comp, exva, mean, False, useLog, logConstant)

        # do not use PCA
        else:

            # obtain training data
            print('')
            print('+-----------------------+')
            print('|     Training Data     |')
            print('+-----------------------+')
            (df_pca_train, targetCol) = _DF.makeDataFrame(
                trainName, None, trainValid_trainRows, ftype, fcolsTrain, True,
                targetColumn, exceptCols, useLog, logConstant)

            # normal mode
            if globalValidationRate == 0 or method == 5:
                print('')
                print('+-----------------------+')
                print('|       Test Data       |')
                print('+-----------------------+')
                (df_pca_test,
                 noUse) = _DF.makeDataFrame(testName, None, None, ftype,
                                            fcolsTest, False, targetColumn,
                                            exceptCols, useLog, logConstant)

            # validation mode (use fcolsTrain_ because using training data file)
            else:
                print('')
                print('+-----------------------+')
                print('|    Validation Data    |')
                print('+-----------------------+')
                (df_pca_test,
                 noUse) = _DF.makeDataFrame(trainName, validationExceptCols,
                                            trainValid_validRows, ftype,
                                            fcolsTrain_, False, targetColumn,
                                            exceptCols, useLog, logConstant)

        # method 0: k-NN (no decision tree)
        if method == 0:

            # k-NN of test data
            finalResult = _KNN.kNN(df_pca_train, df_pca_test, kNN_weight,
                                   kNN_useCaseWeight, targetColumn, PCAdimen,
                                   kNN_k, kNN_useAverage)

        # method 1: decision tree
        elif method == 1:

            # make decision tree
            DT = _DT.createDTfromDF(df_pca_train, targetColumn, True,
                                    DT_maxDepth, DT_criterion, DT_splitter,
                                    DT_numericRange)

            # predict test data using decision tree
            finalResult = _DT.predictDT(df_pca_test, DT, True, DT_maxDepth,
                                        DT_criterion, DT_splitter)

        # use xgBoost
        # https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
        # if XG_xgBoostLevel == 1, works like https://swlock.blogspot.com/2019/02/xgboost-stratifiedkfold-kfold.html
        elif method == 3:
            totalAccuracy = 0  # total accuracy score
            totalROC = 0  # total area under ROC
            times = 20  # number of iterations (tests)

            # validate iteratively
            for i in range(times):
                (accuracy, ROC,
                 _) = _XB.usingXgBoost(df_pca_train, df_pca_test, targetColumn,
                                       'test ' + str(i), True, True,
                                       XG_xgBoostLevel, XG_info)
                totalAccuracy += accuracy
                totalROC += ROC

            print('\n<<< [22] total accuracy and ROC for xgBoost >>>')
            print('avg accuracy : ' + str(totalAccuracy / times))
            print('avg ROC      : ' + str(totalROC / times))

            # predict values (note: the label 'test ' + str(i) reuses the last loop index)
            finalResult = _XB.usingXgBoost(df_pca_train, df_pca_test,
                                           targetColumn, 'test ' + str(i),
                                           True, False, XG_xgBoostLevel,
                                           XG_info)

        # use Deep Learning for PCA-ed data
        elif method == 5:

            # save data as file
            DLmain.dataFromDF(df_pca_train, df_pca_test, targetColumn, [],
                              DL_normalizeTarget)

            # deep learning
            AImain.AIbase_deeplearning()

    # method 2 -> do not use Decision Tree, use text vectorization + Naive Bayes
    # source: https://www.kaggle.com/alvations/basic-nlp-with-nltk
    elif method == 2:

        if specificCol is None:
            print('specificCol must be specified')
            exit()

        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')
        nltk.download('stopwords')

        # create count vectorizer
        count_vect = CountVectorizer(analyzer=_TV.preprocess_text)

        # get train and test dataFrame
        print('')
        print('+-----------------------+')
        print('|     Training Data     |')
        print('+-----------------------+')
        (train_df, targetColOfTrainDataFrame) = _DF.makeDataFrame(
            trainName, None, trainValid_trainRows, ftype, fcolsTrain, True,
            targetColName, exceptColsForMethod2, useLog, logConstant)

        if globalValidationRate == 0:  # normal mode
            print('')
            print('+-----------------------+')
            print('|       Test Data       |')
            print('+-----------------------+')
            (test_df, noUse) = _DF.makeDataFrame(testName, None, None, ftype,
                                                 fcolsTest, False,
                                                 targetColName,
                                                 exceptColsForMethod2, useLog,
                                                 logConstant)

        else:  # validation mode (use fcolsTrain_ because using training data file)
            print('')
            print('+-----------------------+')
            print('|    Validation Data    |')
            print('+-----------------------+')
            (test_df,
             noUse) = _DF.makeDataFrame(trainName, validationExceptCols,
                                        trainValid_validRows, ftype,
                                        fcolsTrain_, False, targetColName,
                                        exceptColsForMethod2, useLog,
                                        logConstant)

        # print dataFrames
        print('\n<<< [24] train_df.columns >>>')
        print(train_df.columns)
        print('\n<<< [25] train_df >>>')
        print(train_df)
        print('\n<<< [26] test_df.columns >>>')
        print(test_df.columns)
        print('\n<<< [27] test_df >>>')
        print(test_df)

        print('\n<<< [28] train_df[' + targetColName + '] >>>')
        print(train_df[targetColName])

        # binarize the target column (e.g. train_df['requester_received_pizza']);
        # method 2 may fail if the target value is not binary (0 or 1)
        for i in range(len(train_df)):
            if train_df.at[i, targetColName] == 1:
                train_df.at[i, targetColName] = True
            else:
                train_df.at[i, targetColName] = False

        # fit-transform the text column (specificCol) of the training data
        # (cf. cells In [51], In [52] / In [55] of the kaggle notebook above)
        trainSet = count_vect.fit_transform(train_df[specificCol])
        trainTags = train_df[targetColName].astype('bool')
        testSet = count_vect.transform(test_df[specificCol])
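        # fit_transform learns the vocabulary from the training texts;
        # transform reuses that vocabulary on the test texts so both sets
        # share the same feature space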

        print('\n<<< [29] trainSet >>>')
        print(trainSet)
        print('\n<<< [30] trainTags >>>')
        print(trainTags)

        # In [53] / In [55]:
        clf = MultinomialNB()
        clf.fit(trainSet, trainTags)

        # In [56]:
        predictions = clf.predict(testSet)

        print('\n<<< [31] predictions >>>')
        print(predictions)

        # create finalResult based on predictions
        finalResult = [1 if p else 0 for p in predictions]

    # method 4 or method 6 (xgboost only or deep learning)
    if method in (4, 6):

        # get train and test dataFrame
        print('')
        print('+-----------------------+')
        print('|     Training Data     |')
        print('+-----------------------+')
        (df_pca_train, targetColOfTrainDataFrame) = _DF.makeDataFrame(
            trainName, None, trainValid_trainRows, ftype, fcolsTrain, True,
            targetColName, exceptCols, useLog, logConstant)

        if globalValidationRate == 0 or method == 6:  # normal mode
            print('')
            print('+-----------------------+')
            print('|       Test Data       |')
            print('+-----------------------+')
            (df_pca_test, noUse) = _DF.makeDataFrame(testName, None, None,
                                                     ftype, fcolsTest, False,
                                                     targetColName, exceptCols,
                                                     useLog, logConstant)

        else:  # validation mode (use fcolsTrain_ because using training data file)
            print('')
            print('+-----------------------+')
            print('|    Validation Data    |')
            print('+-----------------------+')
            (df_pca_test,
             noUse) = _DF.makeDataFrame(trainName, validationExceptCols,
                                        trainValid_validRows, ftype,
                                        fcolsTrain_, False, targetColName,
                                        exceptCols, useLog, logConstant)

    # xgboost only
    # if XG_xgBoostLevel == 1, works like https://www.kaggle.com/jatinraina/random-acts-of-pizza-xgboost
    #                          (ref: https://swlock.blogspot.com/2019/02/xgboost-stratifiedkfold-kfold.html)
    if method == 4:

        # print training and test data
        print('\n<<< [32] df_pca_train method==4 >>>')
        print(df_pca_train)

        print('\n<<< [33] df_pca_test method==4 >>>')
        print(df_pca_test)

        # run xgboost
        # note: with validation=True this always fails at [33] (for both XG_xgBoostLevel=0 and XG_xgBoostLevel=1), so validation is False here
        finalResult = _XB.usingXgBoost(df_pca_train, df_pca_test,
                                       targetColName, 'method4', True, False,
                                       XG_xgBoostLevel, XG_info)

        print('\n<<< [34] len of finalResult >>>')
        print(len(finalResult))

    # use Deep Learning
    elif method == 6:

        # print training and test data
        print('\n<<< [35] df_pca_train >>>')
        print(df_pca_train)

        print('\n<<< [36] df_pca_test >>>')
        print(df_pca_test)

        # save data as file
        DLmain.dataFromDF(df_pca_train, df_pca_test, targetColName, exceptCols,
                          DL_normalizeTarget)

        # deep learning
        AImain.AIbase_deeplearning()

    #################################
    ###                           ###
    ###        model test         ###
    ###                           ###
    #################################

    # exit if final result does not exist (method 5 or method 6)
    if method in (5, 6):
        print('\nfinish')
        exit()

    # (methods 5 and 6 exit above; finalResult would be None here)
    print('\n<<< [37] final result (len=' + str(len(finalResult)) + ') >>>')
    print(np.array(finalResult))

    # build the result string
    result = str(targetColName) + '\n'
    for i in range(len(finalResult)):
        result += str(finalResult[i]) + '\n'

    # write the result to file
    with open('result.csv', 'w') as f:
        f.write(result)

    # compare final result with validation data
    if globalValidationRate > 0:
        _CV.compare(finalResult, testName, validationCol, trainValid_validRows)
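
A hedged invocation sketch mirroring the signature above; the file names, the 'csv' ftype value, the trainCols/testCols lists and the kNN_* settings are hypothetical, not values from the original project:

# hypothetical invocation of the k-NN path (method 0) with PCA
executeAlgorithm(None,                     # finalResult (computed inside)
                 'train.csv', 'test.csv',  # trainName, testName
                 'csv', trainCols, testCols,
                 'target', [],             # targetColName, exceptCols
                 0.1,                      # hold out 10% of rows for validation
                 0,                        # method 0 = k-NN
                 True,                     # usePCA
                 None, 'target',           # validationExceptCols, validationCol
                 3, False,                 # PCAdimen, exceptTargetForPCA
                 kNN_k=5, kNN_useAverage=True,
                 kNN_useCaseWeight=False, kNN_weight=None)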
Example no. 3
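A fragment of the method-2 branch (text vectorization + Naive Bayes), apparently from an earlier variant of executeAlgorithm in which makeDataFrame also takes tfCols, specificCol and frequentWords arguments: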
        if specificCol is None:
            print('specificCol must be specified')
            exit()

        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')
        nltk.download('stopwords')

        # create count vectorizer
        count_vect = CountVectorizer(analyzer=_TV.preprocess_text)

        # get train and test dataFrame
        (train_df, targetColOfTrainDataFrame) = _DF.makeDataFrame(
            trainName, ftype, fcolsTrain, True, targetColName, tfCols,
            exceptColsForMethod2, useLog, logConstant, specificCol,
            frequentWords)
        (test_df, noUse) = _DF.makeDataFrame(testName, ftype, fcolsTest, False,
                                             targetColName, tfCols,
                                             exceptColsForMethod2, useLog,
                                             logConstant, specificCol,
                                             frequentWords)

        print('\n<<< [24] train_df.columns >>>')
        print(train_df.columns)
        print('\n<<< [25] train_df >>>')
        print(train_df)
        print('\n<<< [26] test_df.columns >>>')
        print(test_df.columns)
        print('\n<<< [27] test_df >>>')
        print(test_df)
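
For reference, a self-contained toy sketch of the CountVectorizer + MultinomialNB flow that both Example no. 2 and this fragment follow (default analyzer instead of the project's _TV.preprocess_text; data invented for illustration):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

train_texts = ['free pizza tonight', 'meeting at noon', 'free food giveaway']
train_labels = [1, 0, 1]  # toy binary targets
test_texts = ['free pizza at noon']

count_vect = CountVectorizer()                    # learn vocabulary on train
trainSet = count_vect.fit_transform(train_texts)  # sparse document-term matrix
clf = MultinomialNB().fit(trainSet, train_labels)
predictions = clf.predict(count_vect.transform(test_texts))
print(predictions)  # e.g. [1]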