Example #1
0
def ANNClassifier(X_train, y_train, X_test, y_test):
    """Grid-search an MLPClassifier and log the test accuracy of every setting.

    The grid is fixed in the code: polynomial degree 3; three hidden layers
    whose width equals the number of expanded features; alpha in
    ``range(1, 11, 2)``; ``max_iter`` in ``range(3000, 30000, 3000)``;
    initial learning rate in ``[0.001, 0.003]``.

    Each result is printed and appended as a tab-separated row to
    ``./output/ANN_cst_d3_test.txt`` (the directory must already exist).
    The best ``[d, layers, alpha, max_iter, rate]`` combination and its
    accuracy are printed at the end.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels; flattened to 1-D before comparison.

    Returns:
        None.
    """
    fp = './output/ANN_cst_d3_test.txt'
    hid_layer = [3]
    learn_rate = [0.001, 0.003]

    # The labels never change, so convert them once instead of per model.
    y_true = np.asarray(y_test).ravel()

    bestLyrAlphaIter = []
    cost = 0.0
    # `with` guarantees the log is flushed and closed even if a fit fails.
    with open(fp, 'w') as f:
        f.write('d\tlyr\talpha\titer\trate\taccu\n')
        for d in range(3, 4):
            poly = PolynomialFeatures(d)
            X_train_poly = poly.fit_transform(X_train)
            # NOTE(review): dataProc.standardize_X is a project helper; from
            # its use here it returns (scaled matrix, fitted scaler).
            scaled_X_train, scaler = dataProc.standardize_X(X_train_poly)
            # Reuse the transformer fitted on the training data rather than
            # re-fitting it on the test set.
            scaled_X_test = scaler.transform(poly.transform(X_test))

            feature_num = scaled_X_train.shape[1]
            for n_layers in hid_layer:
                inLyr = (feature_num,) * n_layers
                for Alpha in range(1, 11, 2):
                    for max_iter in range(3000, 30000, 3000):
                        for rate in learn_rate:
                            clf = MLPClassifier(hidden_layer_sizes=inLyr,
                                                alpha=Alpha,
                                                learning_rate_init=rate,
                                                max_iter=max_iter)
                            clf.fit(scaled_X_train, y_train)

                            prdct_y = np.asarray(
                                clf.predict(scaled_X_test)).ravel()
                            # Fraction of test samples predicted correctly.
                            accu = float(np.mean(y_true == prdct_y))

                            print("(d= {0}, layer = {1}, Alpha = {2}, iter = {3}, rate = {4}) is {5}"
                                  .format(d, n_layers, Alpha, max_iter, rate, accu))
                            # Rows are tab-separated so they line up with the
                            # tab-separated header written above.
                            row = (d, n_layers, Alpha, max_iter, rate, accu)
                            f.write('\t'.join(str(v) for v in row) + '\n')

                            if accu > cost:
                                bestLyrAlphaIter = [d, n_layers, Alpha,
                                                    max_iter, rate]
                                cost = accu
    print(bestLyrAlphaIter, cost)
def SVMClassifier(X_train_path, y_train_path, X_test_path, y_test_path):
    """Grid-search an SVC over polynomial degree, C and gamma.

    The four data sets are read from pickle files. For every degree in
    ``range(1, 4)``, every ``C`` in ``C_list`` and every ``gamma`` in
    ``gamma_list`` an ``svm.SVC`` is trained and scored on the test set.
    Each result is printed and written as a tab-separated row to
    ``./output/SVM_cst3.txt`` (the directory must already exist). The best
    ``[d, C, gamma]`` combination and its accuracy are printed at the end.

    Args:
        X_train_path: Path of the pickled training feature matrix.
        y_train_path: Path of the pickled training labels.
        X_test_path: Path of the pickled test feature matrix.
        y_test_path: Path of the pickled test labels.

    Returns:
        None.
    """
    def _load(path):
        # Pickles are binary: open in 'rb' and close the handle afterwards.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this with trusted inputs.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    X_train = _load(X_train_path)
    X_test = _load(X_test_path)
    y_train = _load(y_train_path)
    # The labels never change, so convert them once instead of per model.
    y_true = np.asarray(_load(y_test_path)).ravel()

    C_list = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000]
    gamma_list = [30, 100, 300]

    bestDCgamma = []
    cost = 0.0
    fp = './output/SVM_cst3.txt'
    # `with` guarantees the log is flushed and closed even if a fit fails.
    with open(fp, 'w') as f:
        f.write('d\tC\tgamma\taccu\n')
        for d in range(1, 4):
            poly = PolynomialFeatures(d)
            X_train_poly = poly.fit_transform(X_train)
            # NOTE(review): dataProc.standardize_X is a project helper; from
            # its use here it returns (scaled matrix, fitted scaler).
            scaled_X_train, scaler = dataProc.standardize_X(X_train_poly)
            # Reuse the transformer fitted on the training data rather than
            # re-fitting it on the test set.
            scaled_X_test = scaler.transform(poly.transform(X_test))

            for c in C_list:
                for gamma in gamma_list:
                    clf = svm.SVC(C=c, gamma=gamma)
                    clf.fit(scaled_X_train, y_train)

                    prdct_y = np.asarray(clf.predict(scaled_X_test)).ravel()
                    # Fraction of test samples predicted correctly.
                    accu = float(np.mean(y_true == prdct_y))

                    # .format() must be applied to the string, not to the
                    # return value of print() (None on Python 3).
                    print("(d= {0}, C = {1}, gamma = {2}) is {3}".format(
                        d, c, gamma, accu))
                    f.write('\t'.join(str(v) for v in (d, c, gamma, accu))
                            + '\n')

                    if accu > cost:
                        bestDCgamma = [d, c, gamma]
                        cost = accu

    print(bestDCgamma, cost)
Example #3
0
def RFClassifier(X_train, y_train, X_test, y_test):
    """Grid-search a random forest over ``max_features`` and tree count.

    With polynomial degree 3, a ``RandomForestClassifier`` is trained for
    every ``max_features`` from 1 up to the number of columns of the raw
    ``X_train`` (taken before polynomial expansion) and every tree count in
    ``range(10, 1010, 10)``. Each result is printed and written as a
    tab-separated row to ``./output/RF_test.txt`` (the directory must
    already exist). The best ``[d, features, trees]`` combination and its
    accuracy are printed at the end.

    Args:
        X_train: Training feature matrix; ``X_train[0]`` must be a row.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels; flattened to 1-D before comparison.

    Returns:
        None.
    """
    num_feat = len(X_train[0])
    num_tree = list(range(10, 1010, 10))
    # The labels never change, so convert them once instead of per model.
    y_true = np.asarray(y_test).ravel()

    cost = 0.0
    bestDFeatTrees = []
    fp = './output/RF_test.txt'
    # `with` guarantees the log is flushed and closed even if a fit fails.
    with open(fp, 'w') as f:
        f.write('d\tfeatures\ttrees\taccu\n')
        for d in range(3, 4):
            poly = PolynomialFeatures(d)
            X_train_poly = poly.fit_transform(X_train)
            # NOTE(review): dataProc.standardize_X is a project helper; from
            # its use here it returns (scaled matrix, fitted scaler).
            scaled_X_train, scaler = dataProc.standardize_X(X_train_poly)
            # Reuse the transformer fitted on the training data rather than
            # re-fitting it on the test set.
            scaled_X_test = scaler.transform(poly.transform(X_test))

            for feat in range(1, num_feat + 1):
                for tree in num_tree:
                    clf = RandomForestClassifier(n_estimators=tree,
                                                 max_features=feat)
                    clf = clf.fit(scaled_X_train, y_train)
                    y_predict = np.asarray(
                        clf.predict(scaled_X_test)).ravel()

                    # Fraction of test samples predicted correctly.
                    accu = float(np.mean(y_true == y_predict))

                    print("(d = {0}, features = {1}, trees = {2} is {3})".format(
                        d, feat, tree, accu))
                    f.write('\t'.join(str(v) for v in (d, feat, tree, accu))
                            + '\n')

                    if accu > cost:
                        bestDFeatTrees = [d, feat, tree]
                        cost = accu

    print(bestDFeatTrees, cost)
def DTClassifier(X_train, y_train, X_test, y_test):
    """Grid-search a decision tree over ``min_samples_leaf`` (pre-pruning).

    With polynomial degree 1, a ``DecisionTreeClassifier`` is trained for
    every ``min_samples_leaf`` in ``range(1, 70)`` and scored on the test
    set. Each result is printed and written as a tab-separated row to
    ``./output/DT_cst3.txt`` (the directory must already exist). The best
    ``[d, leaf]`` combination and its accuracy are printed at the end.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels; flattened to 1-D before comparison.

    Returns:
        None.
    """
    # The labels never change, so convert them once instead of per model.
    y_true = np.asarray(y_test).ravel()

    cost = 0.0
    bestDLeaf = []
    fp = './output/DT_cst3.txt'
    # `with` guarantees the log is flushed and closed even if a fit fails.
    with open(fp, 'w') as f:
        f.write('d\tleaf\taccu\n')
        for d in range(1, 2):
            poly = PolynomialFeatures(d)
            X_train_poly = poly.fit_transform(X_train)
            # NOTE(review): dataProc.standardize_X is a project helper; from
            # its use here it returns (scaled matrix, fitted scaler).
            scaled_X_train, scaler = dataProc.standardize_X(X_train_poly)
            # Reuse the transformer fitted on the training data rather than
            # re-fitting it on the test set.
            scaled_X_test = scaler.transform(poly.transform(X_test))

            # the following performs the pre-pruning approach
            for leaf in range(1, 70):
                clf = DecisionTreeClassifier(min_samples_leaf=leaf)
                clf = clf.fit(scaled_X_train, y_train)
                y_predict = np.asarray(clf.predict(scaled_X_test)).ravel()

                # Fraction of test samples predicted correctly.
                accu = float(np.mean(y_true == y_predict))

                print("(d = {0}, leaf = {1}) is {2}".format(d, leaf, accu))
                f.write(str(d) + '\t' + str(leaf) + '\t' + str(accu) + '\n')

                if accu > cost:
                    bestDLeaf = [d, leaf]
                    cost = accu

    print(bestDLeaf, cost)
def SVMClassifier2(X_train_path, y_train_path, X_test_path, y_test_path):
    """Fine-grained SVC grid search over C and gamma, without polynomial terms.

    The four data sets are read from pickle files and standardized. An
    ``svm.SVC`` is trained for every integer ``C`` in ``range(10, 301)`` and
    every integer ``gamma`` in ``range(1, 100)`` and scored on the test set.
    Every ``[C, gamma, accuracy]`` triple is collected and pickled to
    ``cst_SVM.p``; the best ``[C, gamma]`` pair and its accuracy are printed.

    Args:
        X_train_path: Path of the pickled training feature matrix.
        y_train_path: Path of the pickled training labels.
        X_test_path: Path of the pickled test feature matrix.
        y_test_path: Path of the pickled test labels.

    Returns:
        None.
    """
    def _load(path):
        # Pickles are binary: open in 'rb' and close the handle afterwards.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this with trusted inputs.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    X_train = _load(X_train_path)
    X_test = _load(X_test_path)
    y_train = _load(y_train_path)
    # The labels never change, so convert them once instead of per model.
    y_true = np.asarray(_load(y_test_path)).ravel()

    bestCgamma = []
    cost = 0.0
    cost_list = []

    # NOTE(review): dataProc.standardize_X is a project helper; from its use
    # here it returns (scaled matrix, fitted scaler).
    scaled_X_train, scaler = dataProc.standardize_X(X_train)
    scaled_X_test = scaler.transform(X_test)

    for c in range(10, 301):
        for gamma in range(1, 100):
            clf = svm.SVC(C=c, gamma=gamma)
            clf.fit(scaled_X_train, y_train)

            prdct_y = np.asarray(clf.predict(scaled_X_test)).ravel()
            # Fraction of test samples predicted correctly.
            accu = float(np.mean(y_true == prdct_y))
            cost_list.append([c, gamma, accu])

            # .format() must be applied to the string, not to the return
            # value of print() (None on Python 3).
            print("( C = {0}, gamma = {1}) is {2}".format(c, gamma, accu))
            if accu > cost:
                bestCgamma = [c, gamma]
                cost = accu

    with open('cst_SVM.p', 'wb') as out:
        pickle.dump(cost_list, out)
    print(bestCgamma, cost)
def runANNclassifier(polyD):
    """Train an R ``neuralnet`` model and classify every image pixel.

    Steps, as implemented below:

    1. Connect to the PostgreSQL database ``MLCdata`` and read a random
       sample of up to 80000 rows from ``accratraining``.
    2. Expand the predictors with ``PolynomialFeatures(polyD)``, standardize
       them, and one-hot encode the class column into ``cls<label>`` columns.
    3. Send the frame to R through PypeR and train a ``neuralnet`` model
       with three hidden layers of 8 units.
    4. Read ``accra8bands`` in pointid windows of 10000, predict each window
       in R, and write ``pointid,class`` rows to
       ``./output/accra_R_ANNpx.csv``.

    Args:
        polyD: Degree passed to ``PolynomialFeatures``.

    Returns:
        None.

    Raises:
        Exception: Whatever ``psycopg2.connect`` raised, re-raised after the
            failure has been printed.
        RuntimeError: If R returns no prediction for a chunk.
    """
    import csv

    starttime = datetime.now()
    # connect to database to load training and image
    DB_NAME = 'MLCdata'
    USER_NAME = 'Shih_admin'
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file kept out of version control.
    PASSWD = 'racoon790713'
    try:
        connection = psycopg2.connect(database=DB_NAME,
                                      user=USER_NAME,
                                      password=PASSWD)
    except Exception as e:
        print("connection to '%s' failed" % (DB_NAME))
        print(e)
        # Not every exception carries a pgcode, and it may be None.
        pgcode = getattr(e, 'pgcode', None)
        if pgcode is not None:
            print(errorcodes.lookup(pgcode))
        # Nothing below can work without a connection, so propagate instead
        # of failing later with a NameError on `connection`.
        raise
    print("connection to '%s' succes!" % (DB_NAME))

    cursor = None
    try:
        cursor = connection.cursor()

        # load training data from database
        query = 'Select * From accratraining Order by Random() limit 80000;'
        cursor.execute(query)
        traindata = np.array(list(cursor.fetchall()))
        # Drop the first two columns; column 0 is then the class label and
        # the remaining columns are the predictors.
        traindata = traindata[:, 2:]

        trainX = traindata[:, 1:]
        poly = PolynomialFeatures(polyD)
        trainX = poly.fit_transform(trainX)
        # ANN needs one more step for data preprocessing i.e. feature scaling
        # NOTE(review): dataProc.standardize_X is a project helper; from its
        # use here it returns (scaled matrix, fitted scaler).
        trainX, scaler = dataProc.standardize_X(trainX)
        # The first expanded column is dropped below (trainX[:, 1:]), hence -1.
        xlen = len(trainX[0]) - 1

        # y (class) need to be broken into multiple columns. The number of
        # new columns corresponds to the number of classes
        labels = traindata[:, 0]
        y = np.unique(labels)
        ytitle = ['cls' + str(yele) for yele in y]
        ytype = [int] * len(ytitle)
        yout = np.transpose(
            np.array([(labels == yele).astype(int).tolist() for yele in y]))
        traindata = np.concatenate((yout, trainX[:, 1:]), axis=1)
        train_pddf = pd.DataFrame(traindata)
        del trainX, traindata, yout

        # need titles for all independent variables
        xtitle = ['v' + str(i + 1) for i in range(xlen)]
        xtype = [float] * xlen
        colname = ytitle + xtitle
        dtype = ytype + xtype

        train_pddf.columns = colname
        for name, col_type in zip(colname, dtype):
            train_pddf[name] = train_pddf[name].astype(col_type)

        # set up ANN classifier and train it in R
        r = pyper.R(RCMD=r'C:\Program Files\R\R-3.3.3\bin\R.exe',
                    use_pandas='True')
        r("setwd('E:/2016_MachineLearningClassifier/mydata')")
        r.assign("df1", train_pddf)
        r('require(neuralnet)')
        starttime1 = datetime.now()
        # for ANN, create formula is necessary in neuralnet package
        formula = ('formu = as.formula("' + '+'.join(ytitle) + '~' +
                   '+'.join(xtitle) + '")')
        print(formula)
        r(formula)
        print('starting training...')
        # The R function is `neuralnet`; the previous spelling `nueralnet`
        # does not exist in the package.
        r("ann = neuralnet(formu, df1, hidden=c(8,8,8), learningrate= 0.001, stepmax= 21000, lifesign='full')"
          )
        print(r('ann'))
        print(r('summary(ann)'))
        endtime1 = datetime.now()
        dur = endtime1 - starttime1
        print("time usage: " + str(dur.days) + " days, " + str(dur.seconds) +
              " seconds, " + str(dur.microseconds) + " musecs.")

        # connect to image database below and query for the total number of
        # pixels
        query = 'Select Count(*) From accra8bands;'
        cursor.execute(query)
        pixelsize = cursor.fetchone()[0]

        # break all the pixels into multiple sets for classifier predicting
        loop, remainder = divmod(pixelsize, 10000)
        if remainder != 0:
            loop = loop + 1

        # R statement that names the prediction columns after the classes.
        rscript = ("colnames(pred.result) = c(" +
                   ", ".join("'" + val + "'" for val in ytitle) + ")")

        # create a csv file for store all classified pixels
        outfp = './output/accra_R_ANNpx.csv'
        # `with` guarantees the CSV is flushed and closed even on failure.
        with open(outfp, 'w') as f:
            wr = csv.writer(f, delimiter=',')
            header = ['pointid', 'class']
            wr.writerow(header)
            print('start to query...')
            print(rscript)
            for i in range(loop):
                query = 'Select * From accra8bands Where pointid>' + str(i * 10000) + \
                        ' and pointid<= ' + str((i + 1) * 10000) + ' Order by pointid;'
                cursor.execute(query)
                rows = cursor.fetchall()
                if not rows:
                    # A gap in the pointid sequence yields an empty window;
                    # there is nothing to classify, so move on.
                    continue

                # get the spectra info from database and make it as data
                # frame with pandas
                pixel10k = np.array(list(rows))
                X = pixel10k[:, 3:]
                # Apply the transformer fitted on the training data instead
                # of re-fitting it on every chunk.
                X = poly.transform(X)
                X = scaler.transform(X)

                # should process for y??????
                X = X[:, 1:]
                pixel_pddf = pd.DataFrame(X)
                pixel_pddf.columns = xtitle
                for name, col_type in zip(xtitle, xtype):
                    pixel_pddf[name] = pixel_pddf[name].astype(col_type)

                r.assign('df_pred', pixel_pddf)

                print('start to predict...')
                print(i)
                r('pred=compute(ann, df_pred)')
                r('pred.result = as.data.frame(pred$net.result)')
                r(rscript)
                # The class is the 4th character of the winning column name,
                # i.e. the single character that follows the 'cls' prefix.
                r('pred.result$class = substr(colnames(pred.result)[max.col(pred.result,ties.method="first")],4,4)'
                  )
                r('output = pred.result$class')

                # done work, send R result back to python
                # Warning!! Not getting anything back from R yet, need to fix
                predclass = r.get('output')
                if predclass is None:
                    raise RuntimeError(
                        "R returned no prediction for chunk %d" % i)
                predclass = predclass.astype(int)
                predclass = predclass.reshape(predclass.shape[0], -1)
                point_ids = pixel10k[:, 1].astype(int)
                point_ids = point_ids.reshape(point_ids.shape[0], -1)
                pixelout = np.concatenate((point_ids, predclass), axis=1)
                wr.writerows(pixelout)
        connection.commit()
    finally:
        # Release database resources whether or not the run succeeded.
        if cursor is not None:
            cursor.close()
        connection.close()

    endtime = datetime.now()
    dur = endtime - starttime
    print("time usage: " + str(dur.days) + " days, " + str(dur.seconds) +
          " seconds, " + str(dur.microseconds) + " musecs.")
def runSVMclassifier(polyD):
    """Train an R ``e1071`` SVM and classify every image pixel.

    Steps, as implemented below:

    1. Connect to the PostgreSQL database ``MLCdata`` and read all rows of
       ``accratraining``.
    2. Expand the predictors (columns from index 3 on) with
       ``PolynomialFeatures(polyD)`` and standardize them.
    3. Send the frame to R through PypeR and fit a radial-kernel ``svm``
       (``gamma=58``, ``cost=16``).
    4. Read ``accra8bands`` in pointid windows of 10000, predict each window
       in R, and write ``pointid,class`` rows to
       ``./output/accra_R_SVMpx.csv``.

    Args:
        polyD: Degree passed to ``PolynomialFeatures``.

    Returns:
        None.

    Raises:
        Exception: Whatever ``psycopg2.connect`` raised, re-raised after the
            failure has been printed.
        RuntimeError: If R returns no prediction for a chunk.
    """
    import csv

    starttime = datetime.now()
    # connect to database to load training and image
    DB_NAME = 'MLCdata'
    USER_NAME = 'Shih_admin'
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file kept out of version control.
    PASSWD = 'racoon790713'
    try:
        connection = psycopg2.connect(database=DB_NAME,
                                      user=USER_NAME,
                                      password=PASSWD)
    except Exception as e:
        print("connection to '%s' failed" % (DB_NAME))
        print(e)
        # Not every exception carries a pgcode, and it may be None.
        pgcode = getattr(e, 'pgcode', None)
        if pgcode is not None:
            print(errorcodes.lookup(pgcode))
        # Nothing below can work without a connection, so propagate instead
        # of failing later with a NameError on `connection`.
        raise
    print("connection to '%s' succes!" % (DB_NAME))

    cursor = None
    try:
        cursor = connection.cursor()

        # load training data from database
        query = 'Select * From accratraining Order by pointid;'
        cursor.execute(query)
        traindata = np.array(list(cursor.fetchall()))

        trainX = traindata[:, 3:]
        poly = PolynomialFeatures(polyD)
        trainX = poly.fit_transform(trainX)
        # SVM needs one more step for data preprocessing i.e. feature scaling
        # NOTE(review): dataProc.standardize_X is a project helper; from its
        # use here it returns (scaled matrix, fitted scaler).
        trainX, scaler = dataProc.standardize_X(trainX)
        # Keep the three leading columns and append the expanded predictors
        # without their first column.
        traindata = np.concatenate((traindata[:, :3], trainX[:, 1:]), axis=1)
        train_pddf = pd.DataFrame(traindata)
        del trainX, traindata

        colname = ['objectid', 'pointid', 'class']
        dtype = [int, int, str]
        for i in range(len(train_pddf.columns) - 3):
            colname.append('v' + str(i + 1))
            dtype.append(float)
        train_pddf.columns = colname
        for name, col_type in zip(colname, dtype):
            train_pddf[name] = train_pddf[name].astype(col_type)

        # set up SVM classifier and train it in R
        r = pyper.R(RCMD=r'C:\Program Files\R\R-3.3.3\bin\R.exe',
                    use_pandas='True')
        r("setwd('E:/2016_MachineLearningClassifier/mydata')")
        r.assign("df1", train_pddf)
        r('require(e1071)')
        print('starting training...')
        starttime1 = datetime.now()
        r("svmfit = svm(class~.-objectid-pointid, data=df1, kernel='radial', gamma=58, cost=16)"
          )
        endtime1 = datetime.now()
        dur = endtime1 - starttime1
        print("time usage: " + str(dur.days) + " days, " + str(dur.seconds) +
              " seconds, " + str(dur.microseconds) + " musecs.")

        # connect to image database below and query for the total number of
        # pixels
        query = 'Select Count(*) From accra8bands;'
        cursor.execute(query)
        pixelsize = cursor.fetchone()[0]

        # break all the pixels into multiple sets for classifier predicting
        loop, remainder = divmod(pixelsize, 10000)
        if remainder != 0:
            loop = loop + 1

        # create a csv file for store all classified pixels
        outfp = './output/accra_R_SVMpx.csv'
        # `with` guarantees the CSV is flushed and closed even on failure.
        with open(outfp, 'w') as f:
            wr = csv.writer(f, delimiter=',')
            header = ['pointid', 'class']
            wr.writerow(header)
            print('start to query...')
            for i in range(loop):
                query = 'Select * From accra8bands Where pointid>' + str(i * 10000) + \
                        ' and pointid<= ' + str((i + 1) * 10000) + ' Order by pointid;'
                cursor.execute(query)
                rows = cursor.fetchall()
                if not rows:
                    # A gap in the pointid sequence yields an empty window;
                    # there is nothing to classify, so move on.
                    continue

                # get the spectra info from database and make it as data
                # frame with pandas
                pixel10k = np.array(list(rows))
                X = pixel10k[:, 3:]
                # Apply the transformer fitted on the training data instead
                # of re-fitting it on every chunk.
                X = poly.transform(X)
                X = scaler.transform(X)
                pixel10k = np.concatenate((pixel10k[:, :3], X[:, 1:]), axis=1)
                pixel_pddf = pd.DataFrame(pixel10k)
                del X
                pixel_pddf.columns = colname
                for name, col_type in zip(colname, dtype):
                    pixel_pddf[name] = pixel_pddf[name].astype(col_type)

                r.assign('df_pred', pixel_pddf)
                print('start to predict...')
                print(i)
                r('pred=predict(svmfit, df_pred)')
                # done work, send R result back to python

                predclass = r.get('pred')
                if predclass is None:
                    raise RuntimeError(
                        "R returned no prediction for chunk %d" % i)
                predclass = predclass.astype(int)
                predclass = predclass.reshape(predclass.shape[0], -1)
                point_ids = pixel10k[:, 1].astype(int)
                point_ids = point_ids.reshape(point_ids.shape[0], -1)
                pixelout = np.concatenate((point_ids, predclass), axis=1)
                wr.writerows(pixelout)
        connection.commit()
    finally:
        # Release database resources whether or not the run succeeded.
        if cursor is not None:
            cursor.close()
        connection.close()

    endtime = datetime.now()
    dur = endtime - starttime
    print("time usage: " + str(dur.days) + " days, " + str(dur.seconds) +
          " seconds, " + str(dur.microseconds) + " musecs.")