Example #1
def lda(k, sam_tr, ind_tr, sam_te, ind_te, c, classifier=0):
    """
    fisherface implementation
    -------------------------
    inputs:
    k --size of subspace
    sam_tr --training set
    ind_tr --training labels
    sam_te --testing set
    ind_te --testing labels
    c --num of classes
    classifier --0 for kNN (default), 1 for SVM

    outputs:
    accu --classification accuracy
    co_mat --confusion mat
    W --projection mat
    """
    # data preparation
    flg = 0
    num_tr = sam_tr.shape[1] // c  # samples per class (integer division)
    dm = sam_tr.shape[0]

    # training
    # compute variables
    # compute within class mean
    mu_c = np.mean(sam_tr.T.reshape(c, num_tr, dm), axis=1).T
    mu = np.mean(sam_tr, 1)  # compute overall mean
    # compute within class scatter matrix Sw
    mu_c_l = np.tile(mu_c.T, (num_tr, 1, 1)).transpose(1, 0, 2)\
        .reshape(num_tr*c, dm).T  # enlarged mean mat, note the usage of transpose
    B = sam_tr - mu_c_l
    Sw = B.dot(B.T)
    # compute between class scatter matrix Sb
    Ni = num_tr*np.ones((dm, c))  # sample numbers in each class
    C = mu_c - np.tile(mu.reshape(dm, 1), (1, c))
    Sb = C.dot(Ni.T*C.T)
    # solve generalized eigenvalue problem
    W = mf.eigs(np.linalg.inv(Sw).dot(Sb), k)  # the projection matrix

    # testing
    sam_tr_p = W.T.dot(sam_tr-np.tile(mu.reshape(dm, 1), (1, sam_tr.shape[1])))  # projected training samples
    sam_te_p = W.T.dot(sam_te-np.tile(mu.reshape(dm, 1), (1, sam_te.shape[1])))  # projected testing samples
    # 1NN
    if classifier == 0:
        nei = kNN(n_neighbors=1)
        nei.fit(sam_tr_p.T, ind_tr)
        p_label = nei.predict(sam_te_p.T)
        co_mat = met.confusion_matrix(ind_te, p_label)
    # SVM
    if classifier == 1:
        lb_tr, ins_tr, lb_te, ins_te = mf.np2libsvm(ind_tr, sam_tr_p, ind_te, sam_te_p)
        accu, co_mat = libsvm(lb_tr, ins_tr, lb_te, ins_te)
        flg = 1

    if flg == 0:
        accu = np.trace(co_mat).astype(float)/np.sum(co_mat).astype(float)
    else:
        accu = accu/100
    return accu, co_mat, W
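# A minimal, self-contained sketch of the same Fisherface pipeline, with scikit-learn's
# LinearDiscriminantAnalysis standing in for the hand-rolled generalized eigen-solver (mf.eigs).
# Synthetic data, samples as rows (unlike the column layout above); names are illustrative only.
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

rng = np.random.default_rng(0)
c, num_tr, dm, k = 5, 20, 50, 4  # classes, samples per class, dimension, subspace size
X_tr = rng.normal(size=(c * num_tr, dm)) + np.repeat(rng.normal(size=(c, dm)) * 3, num_tr, axis=0)
y_tr = np.repeat(np.arange(c), num_tr)
X_te, y_te = X_tr + rng.normal(scale=0.5, size=X_tr.shape), y_tr  # noisy copies as a toy test set

lda_model = LinearDiscriminantAnalysis(n_components=k).fit(X_tr, y_tr)  # projection onto k discriminants
clf = KNeighborsClassifier(n_neighbors=1).fit(lda_model.transform(X_tr), y_tr)  # 1NN in the subspace
co_mat = confusion_matrix(y_te, clf.predict(lda_model.transform(X_te)))
print(np.trace(co_mat) / co_mat.sum())  # classification accuracy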
Example #2
def pca(k, sam_tr, ind_tr, sam_te, ind_te, c, classifier=0):
    """
    eigenface implementation
    ----------------------------------------
    inputs:
    k --size of subspace
    sam_tr --training set
    ind_tr --training labels
    sam_te --testing set
    ind_te --testing labels
    c --num of classes
    classifier --0 for kNN (default), 1 for SVM

    outputs:
    accu --classification accuracy
    co_mat --confusion mat
    W --projection mat
    """
    # data preparation
    flg = 0
    num_tr = sam_tr.shape[1] // c  # samples per class (integer division)
    dm = sam_tr.shape[0]

    # training
    mu = np.mean(sam_tr, axis=1)  # compute overall mean
    B = sam_tr - np.tile(mu.reshape(dm, 1), (1, num_tr*c))
    S = B.T.dot(B)
    # S = sam_tr.dot(sam_tr.T)
    W = mf.eigs1(S, B, k)  # the projection matrix
    W = np.real(W)

    # testing
    sam_tr_p = W.T.dot(sam_tr-np.tile(mu.reshape(dm, 1), (1, sam_tr.shape[1])))  # projected training samples
    sam_te_p = W.T.dot(sam_te-np.tile(mu.reshape(dm, 1), (1, sam_te.shape[1])))  # projected testing samples
    # 1NN
    if classifier == 0:
        nei = kNN(n_neighbors=1)
        nei.fit(sam_tr_p.T, ind_tr)
        p_label = nei.predict(sam_te_p.T)
        co_mat = met.confusion_matrix(ind_te, p_label)
    # SVM
    if classifier == 1:
        lb_tr, ins_tr, lb_te, ins_te = mf.np2libsvm(ind_tr, sam_tr_p, ind_te, sam_te_p)
        accu, co_mat = libsvm(lb_tr, ins_tr, lb_te, ins_te)
        flg = 1

    if flg == 0:
        accu = np.trace(co_mat).astype(float)/np.sum(co_mat).astype(float)
    else:
        accu = accu/100
    return accu, co_mat, W
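# A minimal, self-contained sketch of the same eigenface pipeline, with scikit-learn's PCA standing
# in for the hand-rolled eigen-solver (mf.eigs1); synthetic data, samples as rows, names illustrative.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

rng = np.random.default_rng(0)
c, num_tr, dm, k = 5, 20, 64, 10  # classes, samples per class, dimension, subspace size
X_tr = rng.normal(size=(c * num_tr, dm)) + np.repeat(rng.normal(size=(c, dm)) * 2, num_tr, axis=0)
y_tr = np.repeat(np.arange(c), num_tr)
X_te, y_te = X_tr + rng.normal(scale=0.5, size=X_tr.shape), y_tr

pca_model = PCA(n_components=k).fit(X_tr)  # PCA centers the data internally, like the mean subtraction above
clf = KNeighborsClassifier(n_neighbors=1).fit(pca_model.transform(X_tr), y_tr)
co_mat = confusion_matrix(y_te, clf.predict(pca_model.transform(X_te)))
print(np.trace(co_mat) / co_mat.sum())  # classification accuracy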
Example #3
def handwritingClassTest():
    #labels of the training set
    hwLabels = []
    #return the file names under the trainingDigits directory
    trainingFileList = listdir('trainingDigits')
    #number of files in the folder
    m = len(trainingFileList)
    #initialize the training Mat matrix
    trainingMat = np.zeros((m, 1024))
    #parse the training-set classes from the file names
    for i in range(m):
        #get the file name
        fileNameStr = trainingFileList[i]
        #get the class digit
        classNumber = int(fileNameStr.split('_')[0])
        #append the class to hwLabels
        hwLabels.append(classNumber)
        #store each file's 1x1024 vector in the trainingMat matrix
        trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr))
    #build the kNN classifier
    neigh = kNN(n_neighbors = 3, algorithm = 'auto')
    #fit the model; trainingMat is the training matrix, hwLabels are the corresponding labels
    neigh.fit(trainingMat, hwLabels)
    #return the file list under the testDigits directory
    testFileList = listdir('testDigits')
    #error counter
    errorCount = 0.0
    #number of test samples
    mTest = len(testFileList)
    #parse the test-set classes from the files and run the classification test
    for i in range(mTest):
        #get the file name
        fileNameStr = testFileList[i]
        #get the class digit
        classNumber = int(fileNameStr.split('_')[0])
        #get the 1x1024 vector of the test sample for classification
        vectorUnderTest = img2vector('testDigits/%s' % (fileNameStr))
        #get the prediction
        # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        classifierResult = neigh.predict(vectorUnderTest)
        print("Predicted result: %d\tTrue result: %d" % (classifierResult, classNumber))
        if(classifierResult != classNumber):
            errorCount += 1.0
    print("Total errors: %d\nError rate: %f%%" % (errorCount, errorCount/mTest * 100))
Example #4
def handwritingClassTest():
    hwLabels = []  #label list of the training set
    trainingFileList = listdir('machinelearning/Ch02/trainingDigits')  #return the file names under the trainingDigits directory
    m = len(trainingFileList)  #number of files in the folder
    trainingMat = np.zeros((m, 1024))  #initialize the training Mat matrix: one row per image, so there are as many rows as training images
    for i in range(m):  #parse the training-set class labels from the file names
        fileNameStr = trainingFileList[i]  #get the file name
        classNumber = int(fileNameStr.split('_')[0])  #the first substring holds the label, so the first element of the split gives the image's class label
        hwLabels.append(classNumber)  #append the class label to hwLabels
        trainingMat[i,:] = img2vector('machinelearning/Ch02/trainingDigits/%s' % (fileNameStr))  #store each file's 1x1024 vector in trainingMat
    neigh = kNN(n_neighbors = 3, algorithm = 'auto')  #build the kNN classifier: 3 nearest neighbours, uniformly weighted
    neigh.fit(trainingMat, hwLabels)  #fit the model; trainingMat is the training matrix, hwLabels are the corresponding labels
    testFileList = listdir('machinelearning/Ch02/testDigits')  #return the file list under the testDigits directory
    errorCount = 0.0  #error counter, initialised to 0
    mTest = len(testFileList)  #number of test samples
    for i in range(mTest):  #parse the test-set classes from the files and run the classification test
        fileNameStr = testFileList[i]  #get the file name
        classNumber = int(fileNameStr.split('_')[0])  #get the class digit label
        vectorUnderTest = img2vector('machinelearning/Ch02/testDigits/%s' % (fileNameStr))  #get the 1x1024 vector of the test sample
        classifierResult = neigh.predict(vectorUnderTest)  #get the prediction
        print("Predicted result: %d\tTrue result: %d" % (classifierResult, classNumber))
        if(classifierResult != classNumber):  #if the prediction does not match the true label, increment the error count
            errorCount += 1.0
    print("Total errors: %d\nError rate: %f%%" % (errorCount, errorCount/mTest * 100))  #report the error rate
Example #5
def handwritingClassTest():
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        filenameStr = trainingFileList[i]
        classNumber = int(filenameStr.split('_')[0])
        hwLabels.append(classNumber)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % (filenameStr))
    neigh = kNN(n_neighbors=3, algorithm='auto')
    neigh.fit(trainingMat, hwLabels)
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        filenameStr = testFileList[i]
        classNumber = int(filenameStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % (filenameStr))
        classifierResult = neigh.predict(vectorUnderTest)
        print('Predicted result: %d\tTrue result: %d' % (classifierResult, classNumber))
        if (classifierResult != classNumber):
            errorCount += 1.0
    print('Total errors: %d\nError rate: %f%%' % (errorCount, errorCount / mTest * 100))
Example #6
def handwritingClassTest():
    #labels of the training dataset
    hwLabels = []
    #return the file names under the trainingDigits folder
    trainingFileList = listdir("trainingDigits")
    #number of files
    m = len(trainingFileList)
    #create an m x 1024 training matrix initialised to zeros
    trainingMat = np.zeros((m, 1024))
    #parse each file in the folder in turn:
    #get the file name, its label and its data, and build the training matrix
    for i in range(m):
        fileNameStr = trainingFileList[i]
        classNumber = int(fileNameStr.split('_')[0])
        hwLabels.append(classNumber)
        trainingMat[i, :] = imageToVector('trainingDigits/' + str(fileNameStr))
    #build the kNN classifier
    neigh = kNN(n_neighbors=3, algorithm='auto')
    #fit the model
    neigh.fit(trainingMat, hwLabels)
    #error counter
    errorCount = 0.0
    #read and parse the files under the testDigits folder,
    #then use predict to return the classification result
    testFileList = listdir('testDigits')
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        classNumber = int(fileNameStr.split('_')[0])
        vectorUnderTest = imageToVector('testDigits/' + str(fileNameStr))
        classifierUnderTest = neigh.predict(vectorUnderTest)
        print("Predicted result: " + str(classifierUnderTest) + ", true result: " +
              str(classNumber))
        if classifierUnderTest != classNumber:
            errorCount += 1.0
    print("Total errors: %d\nError rate: %f%%" % (errorCount, errorCount / mTest * 100))
Example #7
def score_classifier(symbol,
                     market_df,
                     start_date,
                     end_date,
                     base=60,
                     span=4,
                     profit=.05):
    """ Adapted gen classifier function specifically to generate scores """

    # 1. get stock data
    df = get_stock_data(symbol=symbol,
                        start_date=start_date,
                        end_date=end_date)
    # 2. add market data
    df = merge_datasets(df, market_df)
    # 3. calculate stock indicators
    df = get_tech_indicators(df)
    # 4. create features
    df = create_features(df, base=base)
    # 5. create labels
    df = create_labels(df, span=span, profit=profit)
    # 6. split features and labels
    X, y = split_features_labels(df)
    # 7. scale
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    # 8. fit classifier
    # knn with pca, train faster. can't lose this amount of time
    pca = PCA(n_components=10, random_state=42)  #whiten=True
    X = pca.fit_transform(X)
    clf = kNN()
    clf.fit(X, y)
    # 9. calculate precision
    cv = StratifiedShuffleSplit(n_splits=10, test_size=.1, random_state=42)
    scores = cross_val_score(clf, X, y, cv=cv, scoring='precision')
    return scores.mean(), scores.std(), np.bincount(y)[1] / len(y)
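# A self-contained sketch of the scale -> PCA -> kNN -> cross-validated precision pattern used in
# score_classifier, on synthetic data (all names below are illustrative, not part of the pipeline).
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

rng = np.random.default_rng(42)
X = rng.normal(size=(500, 30))
y = (X[:, :5].sum(axis=1) + rng.normal(scale=0.5, size=500) > 0).astype(int)  # toy binary labels

X = StandardScaler().fit_transform(X)
X = PCA(n_components=10, random_state=42).fit_transform(X)  # reduce dimensionality so kNN trains faster
clf = KNeighborsClassifier()
clf.fit(X, y)
cv = StratifiedShuffleSplit(n_splits=10, test_size=.1, random_state=42)
scores = cross_val_score(clf, X, y, cv=cv, scoring='precision')
print(scores.mean(), scores.std(), np.bincount(y)[1] / len(y))  # mean/std precision and positive rate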
Example #8
def Knn_train(data, label):
    # Build the kNN classifier
    neigh = kNN(n_neighbors=10, algorithm='auto', weights='distance', n_jobs=1)
    # Fit the model; data is the training matrix and label are the corresponding labels
    neigh.fit(data, label)
    # Validation dataset
    valid_data, valid_label = Data_change(train=False)
    # Correct-prediction counter
    resultCount = 0.0
    # Number of validation samples
    mTest = len(valid_data)
    valid_data = np.array(valid_data)
    valid_label = np.array(valid_label)
    # classifierResult = neigh.predict(valid_data[1].reshape(1,-1))
    # print((classifierResult==valid_label[1]).all())
    # print(valid_label[1])
    # print(type(valid_label[1]))
    # valid_label=np.array(valid_label)
    for i in range(mTest):
        classifierResult = neigh.predict(valid_data[i].reshape(1, -1))
        # print("Predicted result: %d\tTrue result: %d" % (classifierResult, valid_label))
        if ((classifierResult == valid_label[i]).all()):
            resultCount += 1.0
    print("Total correct: %d\nAccuracy: %f%%" % (resultCount, resultCount / mTest * 100))
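# The per-sample loop above can be replaced by a single batched predict call. A minimal,
# self-contained sketch of the vectorised evaluation (synthetic data for illustration only):
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
data, label = rng.normal(size=(200, 16)), rng.integers(0, 3, size=200)
valid_data, valid_label = rng.normal(size=(50, 16)), rng.integers(0, 3, size=50)

neigh = KNeighborsClassifier(n_neighbors=10, algorithm='auto', weights='distance', n_jobs=1)
neigh.fit(data, label)
pred = neigh.predict(valid_data)  # one call instead of a Python loop over samples
print("Total correct: %d\nAccuracy: %f%%" % (np.sum(pred == valid_label),
                                             np.mean(pred == valid_label) * 100))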
Example #9
def train():
    labels = []
    trainSet = listdir('./digits/trainSet')
    numTrain = len(trainSet)
    trainMatrix = np.zeros((numTrain, 1024))  #32*32 img size
    for i in range(numTrain):
        filename = trainSet[i]
        label = int(filename.split('_')[0])
        labels.append(label)
        trainMatrix[i, :] = img2vector('./digits/trainSet/%s' % (filename))
    neigh = kNN(n_neighbors=3, algorithm='auto')
    neigh.fit(trainMatrix, labels)
    testSet = listdir('./digits/testSet')
    errorCount = 0.0
    numTest = len(testSet)
    for i in range(numTest):
        filename = testSet[i]
        label = int(filename.split('_')[0])
        vectorImg = img2vector('./digits/testSet/%s' % (filename))
        predLabel = neigh.predict(vectorImg)
        print('label: %d  vs  predLabel: %d' % (label, predLabel))
        if (label != predLabel):
            errorCount += 1.0
    print('Error Rate : %f%%' % (errorCount / numTest * 100))
Example #10
def main(args):

    if args.people:
        with open(args.people[0]) as f:
            people = [line.rstrip() for line in f]
        people = np.array(people)

        auth_descriptors = np.loadtxt(args.people[1], dtype=np.float32)
        auth_id = np.arange(len(people)).repeat(10)
        print('Authorized Features:', auth_descriptors.shape)

        if args.use_sklearn_knn:
            from sklearn.neighbors import KNeighborsClassifier as kNN
            knn = kNN(args.k, weights='distance', n_jobs=4)
            knn.fit(auth_descriptors, auth_id)

    # initialize camera
    vs = VideoStream(usePiCamera=True,
                     framerate=args.framerate,
                     resolution=args.resolution)
    vs.start()

    # initialize face detector
    DET_NET_DEF = 'detector/res10_300x300_ssd.prototxt'
    DET_NET_WEIGHTS = 'detector/res10_300x300_ssd_iter_140000.caffemodel'

    start = time.time()
    detector = cv2.dnn.readNetFromCaffe(DET_NET_DEF, DET_NET_WEIGHTS)
    end = time.time()
    print('Detector loading time:', (end - start))

    # initialize the extraction network
    SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
    MODEL = os.path.join(
        SCRIPT_DIR, 'extractor/caffe_models/{0}_caffe/{0}'.format(args.model))
    EXT_NET_DEF = '{}.prototxt'.format(MODEL)
    EXT_NET_WEIGHTS = '{}.caffemodel'.format(MODEL)
    LAYER = 'pool5/7x7_s1'  # 'classifier'
    mean = np.array([91.4953, 103.8827, 131.0912]).reshape(3, 1, 1)

    args.use_caffe = 'senet' in args.model

    start = time.time()
    if args.use_caffe:
        os.environ['GLOG_minloglevel'] = '2'
        import caffe
        extractor = caffe.Net(EXT_NET_DEF, caffe.TEST, weights=EXT_NET_WEIGHTS)
        extractor.blobs['data'].reshape(1, 3, args.side, args.side)
    else:
        extractor = cv2.dnn.readNetFromCaffe(EXT_NET_DEF, EXT_NET_WEIGHTS)

    end = time.time()
    print('Extractor loading time:', (end - start))

    while True:
        start_whole = time.time()
        # capture image from camera
        start = time.time()
        img = vs.read()
        if img is None:
            continue  # skip initial empty frames due to camera init. delay
        end = time.time()
        capture_time = end - start
        print('\n\tCapture:', capture_time, 's')

        # detect faces
        start = time.time()
        img_det = cv2.resize(img, (300, 300))
        img_det = cv2.dnn.blobFromImage(img_det,
                                        1.0, (300, 300), (104.0, 177.0, 123.0),
                                        swapRB=False)
        detector.setInput(img_det)
        faces = detector.forward().squeeze()
        end = time.time()
        del img_det
        detection_time = end - start

        confidences = faces[:, 2]
        faces = faces[confidences > args.detection_confidence, 3:7]

        print('\tDetect :', detection_time, 's,', len(faces), 'faces')

        for face in faces:
            face *= np.tile(args.resolution, 2)
            (startX, startY, endX, endY) = face.astype("int")
            face = img[startY:endY, startX:endX]

            if face.size == 0:
                print('\tDiscarded empty bounding box')
                continue

            # preprocess face
            start = time.time()
            face = preprocess_image(face, mean, side=args.side)
            end = time.time()
            preproc_time = end - start
            print('\tPreProc:', preproc_time, 's')

            # get the description
            start = time.time()
            if args.use_caffe:
                extractor.blobs['data'].data[...] = face
                descriptor = extractor.forward(end=LAYER)[LAYER].squeeze()
            else:
                extractor.setInput(face)
                descriptor = extractor.forward(LAYER)
            end = time.time()
            extraction_time = end - start
            print('\tExtract:', extraction_time, 's')

            if args.people:
                start = time.time()
                descriptor = normalize(descriptor.reshape(1, -1))

                if args.use_sklearn_knn:  # sklearn knn
                    confidences = knn.predict_proba(descriptor)[0]  # single-row probability vector
                    person_id = np.argmax(confidences)
                    confidence = confidences[person_id]
                else:  # VIR knn
                    person_id, confidence = knn_score(descriptor,
                                                      auth_descriptors,
                                                      auth_id, args.k)

                end = time.time()
                match_time = end - start

                match = people[
                    person_id] if confidence > args.match_confidence else 'Unauthorized'
                match = '{} (Conf = {:.2f})'.format(match, confidence)

                print('\tMatch  :', match_time, 's,', match)

                end_whole = time.time()
                whole = end_whole - start_whole
                print('\tTOTAL  :', whole, 's')
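# A compact, self-contained sketch of the matching step used above: fit a distance-weighted kNN on
# L2-normalised descriptors and threshold the top class probability to decide between an authorized
# identity and 'Unauthorized'. Descriptors, names and the 0.6 threshold are synthetic/illustrative.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize

rng = np.random.default_rng(0)
people = np.array(['alice', 'bob', 'carol'])
auth_descriptors = normalize(rng.normal(size=(30, 128)))  # 10 descriptors per authorized person
auth_id = np.arange(len(people)).repeat(10)

knn = KNeighborsClassifier(3, weights='distance', n_jobs=4)
knn.fit(auth_descriptors, auth_id)

descriptor = normalize(rng.normal(size=(1, 128)))  # query face descriptor
confidences = knn.predict_proba(descriptor)[0]     # probabilities over authorized identities
person_id = np.argmax(confidences)
confidence = confidences[person_id]
match = people[person_id] if confidence > 0.6 else 'Unauthorized'
print('{} (Conf = {:.2f})'.format(match, confidence))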
Example #11
#split training data and test data; the ratio is 4:1
X_train, X_test, y_train, y_test = train_test_split(select_X,
                                                    y1,
                                                    test_size=0.2,
                                                    random_state=0)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

#cross validation and grid search for hyperparameter estimation
param_dist = {'weights': ["uniform", "distance"]}

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
clf = GridSearchCV(kNN(), param_grid=param_dist, cv=cv)
clf = clf.fit(X_train, y_train.values.ravel())

print("Best estimator found by grid search:")
print(clf.best_estimator_)
#apply the classifier on the test data and show the accuracy of the model
print('the accuracy for all is:')
print(clf.score(X_test, y_test.values.ravel()))

prediction = clf.predict(X_test)
#use metrics.confusion_matrix and metrics.classification_report to report.
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, prediction))
print("Classification report:\n %s\n" %
      metrics.classification_report(y_test, prediction))

########################################################################################################################
estimator_lol = SVC(C=C_val, gamma=gamma_val, kernel='rbf', random_state=100)
samp_lol, SVM_train_score_lol, SVM_train_time_lol, SVM_pred_time_lol = plotTraining(
    estimator_lol, X_train, y_train, title="SVM for LOL games")
evaluation(estimator_lol, X_train, X_test, y_train, y_test)

# ### 5. KNN

# In[19]:

X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.20)
hyperKNN(X_train,
         y_train,
         X_test,
         y_test,
         title="F1 Score(NBA games)\nHyperparameter : No. Neighbors")
estimator_nba = kNN(n_neighbors=20, n_jobs=-1)
samp_nba, kNN_train_score_nba, kNN_train_time_nba, kNN_pred_time_nba = plotTraining(
    estimator_nba, X_train, y_train, title="kNN for NBA games")
evaluation(estimator_nba, X_train, X_test, y_train, y_test)

# In[20]:

X_train, X_test, y_train, y_test = train_test_split(X2, Y2, test_size=0.20)
hyperKNN(X_train,
         y_train,
         X_test,
         y_test,
         title="F1 Score(LOL games)\nHyperparameter: No. Neighbors")
estimator_lol = kNN(n_neighbors=5, n_jobs=-1)
samp_lol, kNN_train_score_lol, kNN_train_time_lol, kNN_pred_time_lol = plotTraining(
    estimator_lol, X_train, y_train, title="kNN for LOL games")
Example #13
print(f"Eigenvectors {evecs}")  # list the eigenvectors

transf = evecs * LA.inv(np.sqrt(
    np.diagflat(evals)))  # compute the transformation matrix
print("Transformation Matrix = \n", transf)

transf_x = data_inp * transf  # compute the transformed matrix
print("The Transformed x = \n", transf_x)

# Proof for orthogonalization
xtx = transf_x.transpose(
) * transf_x  # this should yield an identity matrix as orthogonalized variables would
# have zero correlation between them.
print("Expect an Identity Matrix = \n", xtx)

kNNSpec = kNN(n_neighbors=5,
              algorithm='brute')  # specify the parameters for KNN classifier.
knn_fit = kNNSpec.fit(transf_x, fraud_df['FRAUD'])  # fit the model.
print(
    f"Accuracy for model: {round(knn_fit.score(transf_x, fraud_df['FRAUD']), 4)}"
)  # compute accuracy of model on
# training data.

new_inp = [[7500, 15, 3, 127, 2, 2]]  # test input.

transf_inp = new_inp * transf  # transform test input.
transf_inp_nbrs = knn_fit.kneighbors(
    transf_inp, return_distance=False)  # compute the nearest neighbors for the
# test input; the distance values are not needed and hence have been omitted.

# print the input and output values for nearest neighbors.
for nbr in transf_inp_nbrs.tolist()[0]:
Example #14
# MACs of networks in dataset
networks = [z for (_, _, z) in list(dataset)[3:]]

# If we want only selected wifis in dataset:
network_names = ["ap", "hub0", "hub1", "hub2"]
networks = [z for (x, _, z) in list(dataset)[3:] if x in network_names]
features = np.asarray(dataset[network_names])


found_networks = [0] * len(networks)

# a = dataset.hist()
# plt.plot(a)

clf = kNN(n_neighbors=2)
print("-" * 30)
print(clf)
clf.fit(features, labels)

# wifi_monitor = net.wifi_mon(interface = 'ap')
stop = False
while not stop:
    try:
        # cells = net.parse_scan(None, wifi_monitor)
        # print(cells)
        # unit test mock some test data
        cells = [
            {
                "bssid": "00:0b:6b:de:ea:36",
                "frequency": "2437",
Example #15
def fMAP(samples, nnn=25):
    nbrs = kNN(n_neighbors=nnn).fit(samples)
    distances, indices = nbrs.kneighbors(samples)
    idx = np.argmin(distances[:, -1])
    return samples[idx]
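# fMAP returns the sample whose distance to its nnn-th nearest neighbour is smallest, i.e. the point
# of highest local density, as an approximate MAP estimate. A self-contained usage sketch follows;
# note that kNN here presumably aliases sklearn.neighbors.NearestNeighbors, since fit() gets no labels.
import numpy as np
from sklearn.neighbors import NearestNeighbors as kNN

def fMAP(samples, nnn=25):
    nbrs = kNN(n_neighbors=nnn).fit(samples)
    distances, indices = nbrs.kneighbors(samples)
    idx = np.argmin(distances[:, -1])  # sample with the tightest neighbourhood
    return samples[idx]

samples = np.random.default_rng(0).normal(loc=2.0, scale=1.0, size=(1000, 2))
print(fMAP(samples))  # expected to lie close to the density peak around (2, 2)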
Example #16
def execute_program():
    def status(word):
        word = str(word)
        if word.upper() != 'OK':
            beautiful_output.red_normal('--> Status: ' + '[' +
                                        "Error because " + str(word) + ']')

            input('Enter to quit...')
            os._exit(0)

        beautiful_output.green_normal('--> Status: ' + '[' + str(word) + ']')

    # Generate the code

    print('Begin to reconstruct...', end='      ')
    try:
        DB.reconstruct()

        # do not use the del command because the CLI will be ugly
        os.system('rd /s/q train_data')
        os.system('md train_data')
        status('OK')

    except Exception as e:
        status(e)

    print('\nExecuting the generation process...')
    try:
        num = int(input('The number of codes you want to generate: '))

        print('Generating...')
        generator.generate(num)
    except Exception as e:
        num = 0
        status(e)

    # print sample database

    print('\n')
    beautiful_output.underline('DATABASE CHECK:')
    DB.disp_DB()

    print('\nInitialize database...', end='      ')
    status('OK')

    # Clean the DB

    print('Remake the database, it may take a while...')

    try:
        data = data_generator.generate()
    except Exception as e:
        data = []
        status(e)

    # print out the clean data

    beautiful_output.underline('\nData check:')
    try:
        print(list(data.values())[0][0])
        print('\nData Check...', end='      ')

        status('OK')
    except Exception as e:
        status(e)

    # print log

    beautiful_output.underline('\nLog:')
    print('-------------------------------')
    print('train data number:  ' + str(num * 4))
    print('train data Pairs :  ' + str(len(list(data.keys()))))
    print('covered data rate:  ' + str(len(list(data.keys())) / 26 * 100)[:4] +
          '%')
    print('data shape       :  ' + str(list(data.values())[0][0].shape))
    print('-------------------------------')

    # data constructor for training

    def construct_data():

        label_total = list(data.keys())

        feature = []
        label = []

        for i in label_total:

            for ii in data[i]:
                # decrease the dimension
                ii = ii.reshape(1, ii.shape[1] * ii.shape[0])[0]
                feature.append(ii)
                label.append(i)

        return [feature, label]

    # train the data

    print('\nReconstruct the feature and label array...')
    try:

        # construct the knn model

        temp = construct_data()
        feature = temp[0]
        label = temp[1]

        beautiful_output.underline('\nCheck the feature:')
        print(feature[0][:10])
        print('\nReconstruct data...', end='      ')
        status('OK')
    except Exception as e:
        label = []
        feature = []
        status(e)

    print('\nTraining...', end='      ')
    try:

        # define cluster
        neighbor_num = len(np.unique(label))
        mode = kNN(n_neighbors=neighbor_num, algorithm='auto')
        mode.fit(feature, label)
        status('OK')
    except Exception as e:
        neighbor_num = 0
        mode = None
        status(e)

    # save model

    print('\nSave the model...', end='     ')
    try:
        joblib.dump(mode, './model.m')
        status('OK')
    except Exception as e:
        status(e)

    # validate accuracy

    print('\nValidate model accuracy')
    print('processing...')

    try:
        print('\nReconstruct...')
        DB.reconstruct()
        os.system('rd /s/q train_data')
        os.system('md train_data')

        print('\nGenerating test data...')
        generator.generate(int(num / 4))

        print('Clean the data')
        data = data_generator.generate()

        print('Reconstruct the data', end='      ')
        temp = construct_data()
        feature = temp[0]
        label = temp[1]
        predict_label = mode.predict(feature)

        compare = sum(list(map(lambda x, y: x == y, predict_label, label)))

        accuracy = str(compare / len(label) * 100)[:4]

        status('OK')

    except Exception as e:
        predict_label = []
        label = []
        accuracy = None
        status(e)

    beautiful_output.underline('\nModel accuracy: ')
    print('---------------------')
    print('Predict: ' + str(predict_label[:10]) + '...')
    print('Actual:  ' + str(label[:10]) + '...')
    print('---------------------')
    print(accuracy + '%')

    # print final summary
    beautiful_output.underline('\nSummary:')
    print('---------------------')
    print('Train data:     ' + str(len(predict_label)))
    print('Test data:      ' + str(int(len(predict_label) * 0.2)))
    print('Neighbor:       ' + str(neighbor_num) + '/26')
    print('Model Accuracy: ' + accuracy + '%')
    print('Model Address:  ' + './model.m')
    print('Train method:   ' + 'Knn')
    print('---------------------')
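# A self-contained sketch of the core train -> save -> reload -> validate pattern used above
# (synthetic data; the file name and shapes are illustrative, the helper modules are omitted).
import joblib
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
feature = rng.normal(size=(260, 100)).tolist()  # flattened images, one row per sample
label = list(np.repeat(list('abcdefghijklmnopqrstuvwxyz'), 10))  # 10 samples per letter

mode = KNeighborsClassifier(n_neighbors=len(np.unique(label)), algorithm='auto')
mode.fit(feature, label)
joblib.dump(mode, './model.m')

mode = joblib.load('./model.m')
predict_label = mode.predict(feature)
compare = sum(p == a for p, a in zip(predict_label, label))
print('Model accuracy: ' + str(compare / len(label) * 100)[:4] + '%')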
Example #17
# pip install scikit-optimize
from sklearn.neighbors import KNeighborsClassifier as kNN
from skopt import BayesSearchCV
import warnings
warnings.filterwarnings("ignore")
import sklearn.datasets as ds  # data preparation

iris = ds.load_iris(as_frame=True)
irisData = iris.data
irisTarget = iris.target.values
print('irisData:\n', irisData)
print('irisTarget:\n', irisTarget)
# parameter ranges are specified by one of below
# from skopt.space import Real, Categorical, Integer

knn = kNN()
# Define the parameter search space for the Bayesian search
grid_param = {
    'n_neighbors': list(range(2, 11)),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Instantiate the Bayesian searcher with the classifier, the parameter space and the number of iterations
Bayes = BayesSearchCV(knn, grid_param, n_iter=10, random_state=14)
Bayes.fit(irisData, irisTarget)

# best parameter combination
print("Best parameters:", Bayes.best_params_)

# Score achieved with best parameter combination
print("Best score:", Bayes.best_score_)
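# Brief assumed follow-up (not in the original snippet): with refit enabled (the default), the tuned
# model is available as Bayes.best_estimator_ and can be used for prediction directly.
best_knn = Bayes.best_estimator_
print("Predictions for the first five rows:", best_knn.predict(irisData.iloc[:5]))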
Example #18
        # define X and y training and testing sets
        X, y = data[symbol + '_X'], data[symbol + '_y']
        X_test, y_test = test_data[symbol + '_X'], test_data[symbol + '_y']

        ##### Training ##################################

        # scale
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        data[symbol + '_scaler'] = scaler

        # predict
        pca = PCA(n_components=10, random_state=42)
        X = pca.fit_transform(X)

        clf = kNN()
        clf.fit(X, y)
        data[symbol + '_pca'] = pca
        data[symbol + '_clf'] = clf

        ##### Cross Validating ##################################

        # validate
        cv = StratifiedShuffleSplit(n_splits=10, test_size=.1, random_state=42)
        scores = cross_val_score(clf, X, y, cv=cv, scoring='precision')

        # save results
        row = (strategy, symbol)
        try:
            results.loc[row, 'CV_Success%'] = np.bincount(y)[1] / len(y)
            results.loc[row, 'CV_Precision_Mean'] = scores.mean()
Example #19
from sklearn.neighbors import KNeighborsClassifier as kNN


X_test = happiness_num_df[happiness_num_df['D.1_values'].isna()].select_dtypes([np.number]) # 4k obs
X_train = happiness_num_df[happiness_num_df['D.1_values'].notna()].select_dtypes([np.number]) # 36.3k obs
happiness_num_df = happiness_num_df.select_dtypes([np.number]).\
    drop(columns=['N_Entrevista'])
print(happiness_num_df.dtypes)
Xtr, Xtt = X_train, X_test
X_test, X_train = X_test.drop(columns=['D.1_values']).fillna(0), X_train.drop(columns=['D.1_values']).fillna(0)
y_train, y_test = np.array(Xtr['D.1_values']), np.array(Xtt['D.1_values'])
print(X_train.shape, len(y_train), X_test.shape, len(y_test))

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
knn = kNN(n_neighbors=10) # it1 (n=20 OR n=10, error = 0.02), it2 (n=100, error = 0.03)
knn.fit(X_train, y_train)
dt = DecisionTreeClassifier(max_depth=24, min_samples_leaf=0.05) # it1 <md=8, msl=0.15> ~ lightly underfitted,
# itn <md=24, msl=0.05> ~ lightly underfitted
dt.fit(X_train, y_train)

classifiers = [('Logistic Regression', logreg),
               ('K Nearest Neighbours', knn),
               ('Classification Tree', dt)]

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
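    # Assumed continuation (not in the original snippet): report training precision/recall for each
    # fitted classifier using the metrics imported above, then impute the missing 'D.1_values'
    # for the unlabeled rows held in X_test.
    y_pred_tr = clf.predict(X_train)
    print('{:s} -- precision: {:.3f}, recall: {:.3f}'.format(
        clf_name, precision_score(y_train, y_pred_tr, average='weighted'),
        recall_score(y_train, y_pred_tr, average='weighted')))
    print(confusion_matrix(y_train, y_pred_tr))
    imputed = clf.predict(X_test)  # predictions for the rows whose 'D.1_values' was missing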
Example #20
def __init__(self, nn_model, layer='fc6', k=3):
    """Build a feature extractor for the given layer and a distance-weighted kNN classifier."""
    network_output = nn_model.get_layer(layer).output
    self.feature_extraction_model = Model(nn_model.input, network_output)
    self.clf = kNN(n_neighbors=k, weights='distance')
Example #21
def combinational_cost(data1, data2, data3, data4, reg_type, image_tag,
                       no_of_folds, number_rep):
    '''
    Parameters
    ----------
    data1 : arrays
        matrix of all costs of group1 (normal) for training/testing. Each individual cost (feature) should be arranged as a column
    data2 : arrays
        matrix of all costs of group2 (abnormal) for training/testing. Each individual cost (feature) should be arranged as a column
    data3 : arrays
        matrix of all costs of group1 (normal) for validation. Each individual cost (feature) should be arranged as a column
    data4 : arrays
        matrix of all costs of group2 (abnormal) for validation. Each individual cost (feature) should be arranged as a column
    reg_type : str
        registration type: rigid (6-dof), affine (12-dof), or non-linear.
    image_tag : str
        tag identifying the image/modality being checked.
    no_of_folds : int
        number of folds for the cross-validation
    number_rep : int
        number of repetitions of the cross-validation

    Returns
    -------
    accuracy and AUC of the combinational cost function based on different supervised-learning classifiers for identifying mis-registrations.
    '''
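    # Hedged usage sketch (hypothetical variable names, illustrative only), with the cost matrices
    # laid out as described above:
    #   combinational_cost(costs_group1, costs_group2, costs_group1_val, costs_group2_val,
    #                      reg_type='rigid', image_tag='T1', no_of_folds=5, number_rep=3)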
    print(f'classifier comparison for {image_tag}-{reg_type}--------------')

    # transposing and creating labels for data1
    X_normal = np.transpose(data1)  # to make each feature into a column
    x_normal_label = np.zeros(len(X_normal))
    print(
        f'number of correctly aligned images for cross-validation are {len(x_normal_label)}'
    )

    balance_data = 0  # Since there are five times more generated misaligned images, the two classes can be balanced by setting this flag to one

    # transposing and creating labels for data2
    if balance_data:
        X_misaligned = np.transpose(data2)[:np.shape(X_normal)[0], :]
    else:
        X_misaligned = np.transpose(data2)
    x_misaligned_label = np.ones(len(X_misaligned))
    print(
        f'number of misaligned images for cross-validation are {len(x_misaligned_label)}'
    )

    # data for validation, combining data3 and data4
    X_normal_val = np.transpose(data3)
    x_normal_val_label = np.zeros(len(X_normal_val))
    print(f'number of images for testing are {len(x_normal_val_label)}')

    if balance_data:
        X_misaligned_val = np.transpose(data4)[:np.shape(X_normal_val)[0], :]
    else:
        X_misaligned_val = np.transpose(data4)
    x_misaligned_val_label = np.ones(len(X_misaligned_val))
    #print(f'number of misaligned images for validation are {len(x_misaligned_val_label)}')

    #X_val = np.concatenate((X_normal_val, X_misaligned_val))
    #y_val = np.concatenate((x_normal_val_label, x_misaligned_val_label))
    X_val = X_normal_val
    y_val = x_normal_val_label

    # combining data1 and data2 and the corresponding labels
    X = np.concatenate((X_normal, X_misaligned))
    y = np.concatenate((x_normal_label, x_misaligned_label))

    visualize_costs = 0  # This will do a 3D plot to visualize the costs

    if visualize_costs:
        visualize_cost_values(X, y, standardize=1)

    # print('cost values (min, mean, max) before scaling')
    # for a in range(3):
    #     print(np.min(X[:, a]), np.mean(X[:, a]), np.max(X[:, a]), np.min(X_val[:, a]), np.mean(X_val[:, a]), np.max(X_val[:, a]))

    # scaling the costs (features) so that the ranges of individual features match, to avoid features with relatively large values dominating. It may not be necessary here as all these 3 costs lie between 0 and 1
    #scale = QuantileTransformer(n_quantiles = 10, output_distribution = 'uniform') # Subtracting the mean and dividing with standard deviation
    scale = StandardScaler()
    scale_test = StandardScaler()

    # scale.fit(X)
    # X = scale.transform(X)

    #X_val = scale_test.fit_transform(X_val) # fit_transform is necessary here instead of just transform

    # print('cost values (min, mean, max) after scaling')
    # X1 = StandardScaler().fit_transform(X) # Making a copy for standardization
    # for a in range(3):
    #     print(np.min(X1[:, a]), np.median(X1[:, a]), np.max(X1[:, a]), np.min(X_val[:, a]), np.median(X_val[:, a]), np.max(X_val[:, a]))
    # X = np.concatenate((X, X_val))
    # y = np.concatenate((y, y_val))

    unsupervised_learning = 0  # create models without giving labels

    if unsupervised_learning:
        kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
        visualize_cost_values(X, 1 - kmeans.labels_, standardize=1)
        print(
            f'balanced accuracy using KMeans Clustering algorithm is {metrics.balanced_accuracy_score(y, 1-kmeans.labels_)}'
        )
        print(
            f'tested BA score using KMeans Clustering algorithm is {metrics.balanced_accuracy_score(y_val, 1-kmeans.predict(X_val))}'
        )

        agglo = AgglomerativeClustering(n_clusters=2).fit(X)
        visualize_cost_values(X, 1 - agglo.labels_, standardize=1)
        print(
            f'balanced accuracy using Agglomerative Clustering algorithm is {metrics.balanced_accuracy_score(y, 1-agglo.labels_)}'
        )
        #print(f'tested BA score using Agglomerative Clustering algorithm is {metrics.balanced_accuracy_score(y_val, 1-agglo.predict(X_val))}')

    # doing Grid Search CV for hyper-parameter tuning

    #X_gridCV = StandardScaler().fit_transform(X)
    X_gridCV = X

    print(
        'doing hyperparameter tuning of the ML models using Grid Search Cross-Validation'
    )

    lda_parameters = {
        'solver': ('svd', 'lsqr', 'eigen'),
        'tol': [0.001, 0.0001, 0.00001]
    }
    gridCV_lda = GridSearchCV(LDA(n_components=1),
                              lda_parameters,
                              refit=True,
                              scoring=('balanced_accuracy'),
                              cv=5).fit(X_gridCV, y)
    print(
        f'Balanced accuracy for LDA {gridCV_lda.best_params_} is {gridCV_lda.best_score_}'
    )

    rfc_parameters = {
        'n_estimators': [1, 10, 20, 40, 60, 80, 100],
        'criterion': ('gini', 'entropy'),
        'max_depth': [2, 4, 6, 8, 10],
        'max_features': ('auto', 'sqrt', 'log2')
    }
    gridCV_rfc = GridSearchCV(RandomForestClassifier(),
                              rfc_parameters,
                              refit=True,
                              scoring=('balanced_accuracy'),
                              cv=5).fit(X_gridCV, y)
    print(
        f'Balanced accuracy for Random Forest {gridCV_rfc.best_params_} is {gridCV_rfc.best_score_}'
    )

    svc_parameters = {
        'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }
    gridCV_svc = GridSearchCV(SVC(),
                              svc_parameters,
                              refit=True,
                              scoring=('balanced_accuracy'),
                              cv=5).fit(X_gridCV, y)
    print(
        f'Balanced accuracy for Support Vector Machine {gridCV_svc.best_params_} is {gridCV_svc.best_score_}'
    )

    knn_parameters = {
        'n_neighbors': [3, 7, 11, 15, 19, 21],
        'weights': ('uniform', 'distance'),
        'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute')
    }
    gridCV_knn = GridSearchCV(kNN(),
                              knn_parameters,
                              refit=True,
                              scoring=('balanced_accuracy'),
                              cv=5).fit(X_gridCV, y)
    print(
        f'Balanced accuracy for K Nearest Neighbors {gridCV_knn.best_params_} is {gridCV_knn.best_score_}'
    )

    ada_parameters = {
        'n_estimators': [1, 10, 20, 40, 60, 80, 100],
        'learning_rate': [0.8, 0.9, 1]
    }
    gridCV_ada = GridSearchCV(AdaBoostClassifier(),
                              ada_parameters,
                              refit=True,
                              scoring=('balanced_accuracy'),
                              cv=5).fit(X_gridCV, y)
    print(
        f'Balanced accuracy for Adaptive Boosting {gridCV_ada.best_params_} is {gridCV_ada.best_score_}'
    )

    # Repeated K-fold cross-validation: n_splits specifies the number of folds, n_repeats specifies the no. of repetitions
    folds = RepeatedStratifiedKFold(n_splits=no_of_folds, n_repeats=number_rep)

    scores_lda = []
    scores_qda = []
    scores_rfc = []
    scores_svm = []
    scores_gnb = []
    scores_knn = []
    scores_lor = []
    scores_ada = []
    scores_gra = []
    scores_automl = []

    auc_lda = []
    auc_qda = []
    auc_rfc = []
    auc_svm = []
    auc_gnb = []
    auc_knn = []
    auc_lor = []
    auc_ada = []
    auc_gra = []
    auc_automl = []

    for train_index, test_index in folds.split(X, y):

        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[
            train_index], y[test_index]

        # X_train = scale.fit_transform(X_train) # scaling is implemented on X_train and the transformation is implemented on the X_test
        # X_test = scale.transform(X_test)

        #X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size = 0.20, stratify = y_1, shuffle = True)

        # 1. Linear Discriminant Analysis Classifier
        lda = LDA(solver='eigen', shrinkage='auto', n_components=1)
        score_lda, roc_auc_lda, model_lda = classifier_accuracy(
            lda, X_train, X_test, y_train, y_test)
        # print('F1-score for LDA', {metrics.f1_score(y_val, LDA(solver = 'eigen', shrinkage = 'auto', n_components = 1).fit(X, y).predict(X_val))})
        scores_lda.append(score_lda)  # F1-score
        auc_lda.append(roc_auc_lda)  # AUC
        #auc_lda.append(metrics.f1_score(y_validation, LDA(solver = 'eigen', shrinkage = 'auto', n_components = 1).fit(X_1, y_1).predict(X_validation))) # AUC

        # 1a. Quadratic Discriminant Analysis Classifier
        qda = QDA()
        score_qda, roc_auc_qda, model_qda = classifier_accuracy(
            qda, X_train, X_test, y_train, y_test)
        scores_qda.append(score_qda)
        auc_qda.append(roc_auc_qda)

        # 2. Random Forest Classifier (it could be done in LDA transformed space if you have large number of features)
        rfc = RandomForestClassifier(criterion='gini', n_estimators=100)
        score_rfc, roc_auc_rfc, model_rfc = classifier_accuracy(
            rfc, X_train, X_test, y_train, y_test)
        scores_rfc.append(score_rfc)
        auc_rfc.append(roc_auc_rfc)

        # 3. Support Vector Machine Classifier
        svc = SVC(kernel='rbf', gamma='scale', probability=True)
        score_svm, roc_auc_svm, model_svm = classifier_accuracy(
            svc, X_train, X_test, y_train, y_test)
        scores_svm.append(score_svm)
        auc_svm.append(roc_auc_svm)
        #print(svc.coef_)

        # 4. Gaussian Naive Bayes Classifier
        gnb = GaussianNB()
        score_gnb, roc_auc_gnb, model_gnb = classifier_accuracy(
            gnb, X_train, X_test, y_train, y_test)
        scores_gnb.append(score_gnb)
        auc_gnb.append(roc_auc_gnb)

        # 5. k-Nearest Neighbour Classifier
        knn = kNN(n_neighbors=15)
        score_knn, roc_auc_knn, model_knn = classifier_accuracy(
            knn, X_train, X_test, y_train, y_test)
        scores_knn.append(score_knn)
        auc_knn.append(roc_auc_knn)

        # 6. Logistic Regression Classifier
        lor = LogisticRegression()
        score_lor, roc_auc_lor, model_lor = classifier_accuracy(
            lor, X_train, X_test, y_train, y_test)
        scores_lor.append(score_lor)
        auc_lor.append(roc_auc_lor)

        # 7. Ada Boost Classifier
        ada = AdaBoostClassifier(n_estimators=100)
        score_ada, roc_auc_ada, model_ada = classifier_accuracy(
            ada, X_train, X_test, y_train, y_test)
        scores_ada.append(score_ada)
        auc_ada.append(roc_auc_ada)

        # # 7a. Gradient Boosting Classifier
        # gra = GradientBoostingClassifier(random_state = 0)
        # score_gra, roc_auc_gra, model_gra = classifier_accuracy(gra, X_train, X_test, y_train, y_test)
        # scores_gra.append(score_gra)
        # auc_gra.append(roc_auc_gra)

        # 8. Auto Sklearn Classifier (for automatic model selection with hyperparameter tuning)
        # automl = AutoSklearnClassifier(time_left_for_this_task = 50, per_run_time_limit = 10)
        # score_automl, roc_auc_automl, model_automl = classifier_accuracy(automl, X_train, X_test, y_train, y_test)
        # scores_automl.append(score_automl)
        # auc_automl.append(roc_auc_automl)

    # Note: the 'cross_val_score' method from sklearn could be used directly on the classifier model to avoid the above for loop. Further, the f1-score could be used instead of the accuracy metric if the number of positive samples (mis-aligned) is low.
    if False:
        print(
            f'accuracy using LDA classifier for {image_tag}-{reg_type} is: {np.average(scores_lda)}, AUC is: {np.average(auc_lda)}'
        )
        #print(f'accuracy using QDA classifier for {image_tag}-{reg_type} is: {np.average(scores_qda)}, AUC is: {np.average(auc_qda)}\n')
        print(
            f'accuracy using RandomForest classifier for {image_tag}-{reg_type} is: {np.average(scores_rfc)}, AUC is: {np.average(auc_rfc)}'
        )
        print(
            f'accuracy using SVM classifier for {image_tag}-{reg_type} is: {np.average(scores_svm)}, AUC is: {np.average(auc_svm)}'
        )
        print(
            f'accuracy using Naive Bayes classifier for {image_tag}-{reg_type} is: {np.average(scores_gnb)}, AUC is: {np.average(auc_gnb)}'
        )
        print(
            f'accuracy using kNN classifier for {image_tag}-{reg_type} is: {np.average(scores_knn)}, AUC is: {np.average(auc_knn)}'
        )
        #print(f'accuracy using Logistic Regression classifier for {image_tag}-{reg_type} is: {np.average(scores_lor)}, AUC is: {np.average(auc_lor)}\n')
        print(
            f'accuracy using Ada Boost classifier for {image_tag}-{reg_type} is: {np.average(scores_ada)}, AUC is: {np.average(auc_ada)}'
        )
        #print(f'accuracy using Gradient boosting classifier for {image_tag}-{reg_type} is: {np.average(scores_gra)}, AUC is: {np.average(auc_gra)}\n')
        #print(f'accuracy using AutoML Classifier for {image_tag}-{reg_type} is: {np.average(scores_automl)}, AUC is: {np.average(auc_automl)}\n')

    save_model = '/home/tummala/mri/ml_classifier_models_checking_reg'

    if not os.path.exists(save_model):
        os.makedirs(save_model)

    # saving the trained model, e.g. shown for saving ada boost classifier model and minmax scaling model
    #scale_all = StandardScaler().fit(X)
    #X = scale_all.transform(X)

    #pickle.dump(scale_all, open(save_model+'/'+'scale_'+reg_type+image_tag, 'wb'))
    pickle.dump(gridCV_lda,
                open(save_model + '/' + 'lda_' + reg_type + image_tag, 'wb'))
    pickle.dump(gridCV_rfc,
                open(save_model + '/' + 'rfc_' + reg_type + image_tag, 'wb'))
    pickle.dump(gridCV_svc,
                open(save_model + '/' + 'svm_' + reg_type + image_tag, 'wb'))
    pickle.dump(GaussianNB().fit(X, y),
                open(save_model + '/' + 'gnb_' + reg_type + image_tag, 'wb'))
    pickle.dump(gridCV_knn,
                open(save_model + '/' + 'knn_' + reg_type + image_tag, 'wb'))
    pickle.dump(
        gridCV_ada,
        open(save_model + '/' + 'ada_boost_' + reg_type + image_tag, 'wb'))

    # automl_model = AutoSklearnClassifier(time_left_for_this_task = 50, per_run_time_limit = 10).fit(X, y)
    # pickle.dump(automl_model, open(save_model+'/'+'automl_'+reg_type+image_tag, 'wb'))
    # the pickle.load method could be used to load the model for later use, and the predict method of the saved model to categorize new cases

    # plotting ROC curve for Sensitivity/Specificity all above classifiers
    subjects_test = os.listdir(subpath2)

    for index, subject in enumerate(subjects_test, start=1):
        global_cost_vector = []
        local_cost_vector = []
        cost_folder = subpath2 + '/' + subject + '/cost' + str(voi_size) + str(
            step_size)
        #print('{}-{}, {}-{}'.format(index, subject, reg_type, cost_func))
        data_files = os.listdir(cost_folder)
        for data_file in data_files:
            if reg_type in data_file and (image_tag in data_file):
                if not 'alignedToT1' in data_file:
                    cost_data = np.loadtxt(cost_folder + '/' + data_file)
                    global_cost_vector.append(cost_data[0])
                    local_cost_vector.append(cost_data[1])

        if not local_cost_vector:
            print(f'No {image_tag} image for {subject}')
            continue
        sample = np.reshape(np.array(local_cost_vector), (1, 3))
        reg_quality = gridCV_knn.predict_proba(sample)[0][0] * 100

        if reg_quality < 50:
            print(
                f'Quality of {reg_type} registration for {subject} using kNN Classifier is {reg_quality}'
            )

    if False:
        lda_disp = metrics.plot_roc_curve(gridCV_lda,
                                          X_val,
                                          y_val,
                                          drop_intermediate=False)
        print('Accuracy for LDA',
              {metrics.accuracy_score(y_val, gridCV_lda.predict(X_val))})
        #qda_disp = metrics.plot_roc_curve(qda, X_test, y_test, ax = lda_disp.ax_)
        svm_disp = metrics.plot_roc_curve(gridCV_svc,
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('Accuracy for SVM',
              {metrics.accuracy_score(y_val, gridCV_svc.predict(X_val))})
        #nsvm_disp = metrics.plot_roc_curve(nsvm, X_test, y_test, ax = lda_disp.ax_)
        gnb_disp = metrics.plot_roc_curve(GaussianNB().fit(X, y),
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('Accuracy for GNB', {
            metrics.accuracy_score(y_val,
                                   GaussianNB().fit(X, y).predict(X_val))
        })
        knn_disp = metrics.plot_roc_curve(gridCV_knn,
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('Accuracy for kNN',
              {metrics.accuracy_score(y_val, gridCV_knn.predict(X_val))})
        rfc_disp = metrics.plot_roc_curve(gridCV_rfc,
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('Accuracy for RFC',
              {metrics.accuracy_score(y_val, gridCV_rfc.predict(X_val))})
        #print(y_val)
        #print(RandomForestClassifier(criterion = 'gini', n_estimators = 100).fit(X, y).predict_proba(X_val)[:, 1])
        metrics.plot_confusion_matrix(gridCV_rfc, X_val, y_val, colorbar=False)
        plt.show()
        ada_disp = metrics.plot_roc_curve(gridCV_ada,
                                          X_val,
                                          y_val,
                                          ax=lda_disp.ax_)
        print('Accuracy for Ada Boost',
              {metrics.accuracy_score(y_val, gridCV_ada.predict(X_val))})
        # automl_disp = metrics.plot_roc_curve(automl_model, X_val, y_val, ax = lda_disp.ax_)
        # print('F1-score for AutoML Classifier', {metrics.balanced_accuracy_score(y_val, automl_model.predict(X_val))})
        # print(automl_model.sprint_statistics())

        # # Plotting the sklearn models using PipelineProfiler
        # profiler_data = PipelineProfiler.import_autosklearn(automl_model)
        # PipelineProfiler.plot_pipeline_matrix(profiler_data)
        # plt.show()

        #knn_disp.figure_.suptitle(f"ROC curve comparison {image_tag}-{reg_type}")

    # plotting Precision-Recall ROC curve for all above classifiers
    if False:
        lda_disp = metrics.plot_precision_recall_curve(
            LDA(solver='eigen', shrinkage='auto', n_components=1).fit(X, y),
            X_val, y_val)
        #qda_disp = metrics.precision_recall_curve(qda, X_test, y_test, ax = lda_disp.ax_)
        svm_disp = metrics.plot_precision_recall_curve(SVC(
            kernel='linear', gamma='scale', probability=True).fit(X, y),
                                                       X_val,
                                                       y_val,
                                                       ax=lda_disp.ax_)
        #nsvm_disp = metrics.plot_roc_curve(nsvm, X_test, y_test, ax = lda_disp.ax_)
        gnb_disp = metrics.plot_precision_recall_curve(GaussianNB().fit(X, y),
                                                       X_val,
                                                       y_val,
                                                       ax=lda_disp.ax_)
        knn_disp = metrics.plot_precision_recall_curve(kNN(n_neighbors=15).fit(
            X, y),
                                                       X_val,
                                                       y_val,
                                                       ax=lda_disp.ax_)
        rfc_disp = metrics.plot_precision_recall_curve(RandomForestClassifier(
            criterion='gini', n_estimators=100).fit(X, y),
                                                       X_val,
                                                       y_val,
                                                       ax=lda_disp.ax_)
        ada_disp = metrics.plot_precision_recall_curve(
            AdaBoostClassifier(n_estimators=100).fit(X, y),
            X_val,
            y_val,
            ax=lda_disp.ax_)
Example #22
def main():
    # # Loading the datasets

    cancer_data = pd.read_csv('breast_cancer.csv')
    print("Data has", len(cancer_data), "rows and", len(cancer_data.columns),
          "columns.")

    phishing_data = pd.read_csv('PhishingWebsites.csv')
    print("Data has", len(phishing_data), "rows and",
          len(phishing_data.columns), "columns.")

    # # Preprocessing datasets

    # This section preprocesses the breast cancer dataset: dropping unnecessary columns and replacing the target values with 0/1 numeric values.

    # In[26]:

    y_can = cancer_data.diagnosis  # malignant (M) or benign (B)
    # The column "Unnamed: 32" feature includes NaN so drop it from the data. Also drop "id" as it is not a feature and
    # "diagnosis" as it is the label
    column_names = ['Unnamed: 32', 'id', 'diagnosis']
    X_can = cancer_data.drop(column_names, axis=1)
    X_can = preprocessing.scale(X_can)

    # Convert string labels to numerical values
    y_can = y_can.values
    y_can[y_can == 'M'] = 1
    y_can[y_can == 'B'] = 0
    y_can = y_can.astype(int)
    column_order = list(cancer_data)
    column_order.insert(0, column_order.pop(
        column_order.index('diagnosis')))  #move target diagnosis to front
    cancer_data = cancer_data.loc[:, column_order]
    cancer_data.describe(include='all')

    # For the phishing dataset, several columns are categorical with levels {-1,0,1} and the rest are binary with levels {-1,1}. One-hot encoding turns the 3-level columns into additional {0,1} features, and the binary features are re-coded so that all levels are {0,1}. There are more features now, but they are all binary.

    # In[28]:

    column_names = [
        'URL_Length', 'having_Sub_Domain', 'SSLfinal_State', 'URL_of_Anchor',
        'Links_in_tags', 'SFH', 'web_traffic', 'Links_pointing_to_page'
    ]
    data_1hot = phishing_data[column_names]
    data_1hot = pd.get_dummies(data_1hot)
    data_others = phishing_data.drop(column_names, axis=1)
    phishing_data = pd.concat([data_1hot, data_others], axis=1)
    phishing_data = phishing_data.replace(-1, 0).astype('category')
    column_order = list(phishing_data)
    column_order.insert(0, column_order.pop(column_order.index('Result')))
    phishing_data = phishing_data.loc[:,
                                      column_order]  #move the target variable 'Result' to the front
    phishing_data.describe(include='all')

    # # Splitting the datasets
    # The datasets are now split into train and test sets: the training set is 70% of the original data and the test set is 30%.

    # In[83]:

    X_can_train, X_can_test, y_can_train, y_can_test = train_test_split(
        X_can, y_can, test_size=0.3, random_state=18)

    X_phish = np.array(phishing_data.values[:, 1:], dtype='int64')
    y_phish = np.array(phishing_data.values[:, 0], dtype='int64')
    X_ph_train, X_ph_test, y_ph_train, y_ph_test = train_test_split(
        X_phish, y_phish, test_size=0.3, random_state=18)

    # In[41]:

    plt.rcParams['xtick.labelsize'] = 12
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['axes.titlesize'] = 12
    plt.rcParams['font.size'] = 12

    # # Decision Tree Classifier
    # This section constructs a Decision Tree classifier using information gain (based on entropy) to determine the best feature split, per the ID3 algorithm. The model is pre-pruned by limiting tree depth using the hyperparameter 'max_depth' and by ensuring that each leaf (a terminal node of the tree) has at least 'min_samples_leaf' samples.
    DTree_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        title=
        "Model Complexity Curve for Decision Tree (Breast Cancer Data)\nHyperparameter : Tree Max Depth"
    )
    start_leaf_n = round(0.005 * len(X_can_train))
    end_leaf_n = round(
        0.05 * len(X_can_train))  #leaf nodes of size [0.5%, 5%] will be tested
    max_depth, min_samples_leaf = TreeGridSearchCV(start_leaf_n, end_leaf_n,
                                                   X_can_train, y_can_train)
    estimator_can = DecisionTreeClassifier(max_depth=max_depth,
                                           min_samples_leaf=min_samples_leaf,
                                           random_state=100,
                                           criterion='entropy')
    train_samp_can, DT_train_score_can, DT_fit_time_can, DT_pred_time_can = plot_learning_curve(
        estimator_can,
        5,
        X_can_train,
        y_can_train,
        title="Decision Tree Breast CancerData")
    DTree_fp_can, DTree_tp_can = final_classifier_evaluation(
        estimator_can,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="Decision Tree Breast CancerData")

    DTree_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        title=
        "Model Complexity Curve for Decision Tree (Phishing Data)\nHyperparameter : Tree Max Depth"
    )
    start_leaf_n = round(0.005 * len(X_ph_train))
    end_leaf_n = round(
        0.05 * len(X_ph_train))  # leaf sizes in the range [0.5%, 5%] of the training set will be tested
    max_depth, min_samples_leaf = TreeGridSearchCV(start_leaf_n, end_leaf_n,
                                                   X_ph_train, y_ph_train)
    estimator_phish = DecisionTreeClassifier(max_depth=max_depth,
                                             min_samples_leaf=min_samples_leaf,
                                             random_state=100,
                                             criterion='entropy')
    train_samp_phish, DT_train_score_phish, DT_fit_time_phish, DT_pred_time_phish = plot_learning_curve(
        estimator_phish,
        10,
        X_ph_train,
        y_ph_train,
        title="Decision Tree Phishing Data")
    DTree_fp_phish, DTree_tp_phish = final_classifier_evaluation(
        estimator_phish,
        X_ph_train,
        X_ph_test,
        y_ph_train,
        y_ph_test,
        title="Decision Tree Phishing Data")

    kNN_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        title=
        "Model Complexity Curve for kNN (Breast Cancer Data)\nHyperparameter : No. Neighbors"
    )
    estimator_can = kNN(n_neighbors=14, n_jobs=-1)
    train_samp_can, kNN_train_score_can, kNN_fit_time_can, kNN_pred_time_can = plot_learning_curve(
        estimator_can,
        5,
        X_can_train,
        y_can_train,
        title="kNN Breast Cancer Data")
    KNN_fp_can, KNN_tp_can = final_classifier_evaluation(
        estimator_can,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="kNN Breast Cancer Data")

    kNN_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        title=
        "Model Complexity Curve for kNN (Phishing Data)\nHyperparameter : No. Neighbors"
    )
    estimator_phish = kNN(n_neighbors=20, n_jobs=-1)
    train_samp_phish, kNN_train_score_phish, kNN_fit_time_phish, kNN_pred_time_phish = plot_learning_curve(
        estimator_phish, 10, X_ph_train, y_ph_train, title="kNN Phishing Data")
    KNN_fp_ph, KNN_tp_ph = final_classifier_evaluation(
        estimator_phish,
        X_ph_train,
        X_ph_test,
        y_ph_train,
        y_ph_test,
        title="kNN Phishing Data")

    hlist = np.linspace(1, 150, 30).astype('int')
    NN_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        title=
        "Model Complexity Curve for NN (Breast Cancer Data)\nHyperparameter : No. Hidden Units"
    )
    h_units, learn_rate = NNGridSearchCV(X_can_train, y_can_train)
    estimator_can = MLPClassifier(hidden_layer_sizes=(h_units, ),
                                  solver='adam',
                                  activation='logistic',
                                  learning_rate_init=learn_rate,
                                  random_state=100)
    train_samp_can, NN_train_score_can, NN_fit_time_can, NN_pred_time_can = plot_learning_curve(
        estimator_can,
        5,
        X_can_train,
        y_can_train,
        title="Neural Net Breast Cancer Data")
    NN_fp_can, NN_tp_can = final_classifier_evaluation(
        estimator_can,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="Neural Net Breast Cancer Data")

    NN_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        title=
        "Model Complexity Curve for NN (Phishing Data)\nHyperparameter : No. Hidden Units"
    )
    h_units, learn_rate = NNGridSearchCV(X_ph_train, y_ph_train)
    estimator_phish = MLPClassifier(hidden_layer_sizes=(h_units, ),
                                    solver='adam',
                                    activation='logistic',
                                    learning_rate_init=learn_rate,
                                    random_state=100)
    train_samp_phish, NN_train_score_phish, NN_fit_time_phish, NN_pred_time_phish = plot_learning_curve(
        estimator_phish,
        10,
        X_ph_train,
        y_ph_train,
        title="Neural Net Phishing Data")
    NN_fp_ph, NN_tp_ph = final_classifier_evaluation(
        estimator_phish,
        X_ph_train,
        X_ph_test,
        y_ph_train,
        y_ph_test,
        title="Neural Net Phishing Data")

    estimator_can = MLPClassifier(hidden_layer_sizes=(30, ),
                                  solver='adam',
                                  activation='logistic',
                                  learning_rate_init=0.01,
                                  random_state=100,
                                  max_iter=200)
    estimator_can.fit(X_can_train, y_can_train)
    loss_values_can = estimator_can.loss_curve_
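    # The same network is re-fit on the test split below solely to record a second loss
    # curve (plotted further down as the "Test" curve); it is not reused for evaluation.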
    estimator_can.fit(X_can_test, y_can_test)
    loss_can_test = estimator_can.loss_curve_

    estimator_phish = MLPClassifier(hidden_layer_sizes=(50, ),
                                    solver='adam',
                                    activation='logistic',
                                    learning_rate_init=0.05,
                                    random_state=100,
                                    max_iter=200)
    estimator_phish.fit(X_ph_train, y_ph_train)
    loss_values_ph = estimator_phish.loss_curve_
    estimator_phish.fit(X_ph_test, y_ph_test)
    loss_ph_test = estimator_phish.loss_curve_

    plt.figure()
    plt.title("Loss Curve for NN")
    plt.xlabel("Number of Iterations")
    plt.ylabel("Loss")
    plt.plot(loss_values_can,
             'o-',
             color="g",
             label="Train Breast Cancer Data")
    plt.plot(loss_can_test, 'o-', color="r", label="Test Breast Cancer Data")
    plt.legend(loc="best")
    plt.show()

    plt.figure()
    plt.title("Loss Curve for NN")
    plt.xlabel("Number of Iterations")
    plt.ylabel("Loss")
    plt.plot(loss_values_ph, 'o-', color="g", label="Train Phishing Data")
    plt.plot(loss_ph_test, 'o-', color="r", label="Test Phishing Data")
    plt.legend(loc="best")
    plt.show()

    SVM_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        title=
        "Model Complexity Curve for SVM (Breast Cancer Data)\nHyperparameter : Kernel Function"
    )
    C_val, gamma_val = SVMGridSearchCV(X_can_train, y_can_train)
    estimator_bank = svm.SVC(C=C_val,
                             gamma=gamma_val,
                             kernel='linear',
                             random_state=100)
    train_samp_can, SVM_train_score_can, SVM_fit_time_can, SVM_pred_time_can = plot_learning_curve(
        estimator_bank,
        5,
        X_can_train,
        y_can_train,
        title="SVM Breast Cancer Data")
    SVM_fp_can, SVM_tp_can = svm_classifier_evaluation(
        estimator_bank,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="SVM Breast Cancer Data")

    SVM_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        title=
        "Model Complexity Curve for SVM (Phishing Data)\nHyperparameter : Kernel Function"
    )
    C_val, gamma_val = SVMGridSearchCV(X_ph_train, y_ph_train)
    estimator_phish = svm.SVC(C=C_val,
                              gamma=gamma_val,
                              kernel='linear',
                              random_state=100)
    train_samp_phish, SVM_train_score_phish, SVM_fit_time_phish, SVM_pred_time_phish = plot_learning_curve(
        estimator_phish, 10, X_ph_train, y_ph_train, title="SVM Phishing Data")
    SVM_fp_ph, SVM_tp_ph = svm_classifier_evaluation(estimator_phish,
                                                     X_ph_train,
                                                     X_ph_test,
                                                     y_ph_train,
                                                     y_ph_test,
                                                     title="SVM Phishing Data")

    AdaBoost_classifier(
        X_can_train,
        y_can_train,
        X_can_test,
        y_can_test,
        3,
        50,
        title=
        "Model Complexity Curve for Boosted Tree (Breast Cancer Data)\nHyperparameter : No. Estimators"
    )
    start_leaf_n = round(0.005 * len(X_can_train))
    end_leaf_n = round(
        0.05 * len(X_can_train))  # leaf sizes in the range [0.5%, 5%] of the training set will be tested
    n_est, learn_rate = BoostedGridSearchCV(start_leaf_n, end_leaf_n,
                                            X_can_train, y_can_train)
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    estimator_can = AdaBoostClassifier(base_estimator=dt_stump,
                                       n_estimators=n_est,
                                       random_state=100)

    train_samp_can, BT_train_score_can, BT_fit_time_can, BT_pred_time_can = plot_learning_curve(
        estimator_can,
        5,
        X_can_train,
        y_can_train,
        title="Boosted Tree Breast Cancer Data")
    BT_fp_can, BT_tp_can = final_classifier_evaluation(
        estimator_can,
        X_can_train,
        X_can_test,
        y_can_train,
        y_can_test,
        title="Boosted Tree Breast Cancer Data")

    AdaBoost_classifier(
        X_ph_train,
        y_ph_train,
        X_ph_test,
        y_ph_test,
        3,
        50,
        title=
        "Model Complexity Curve for Boosted Tree (Phishing Data)\nHyperparameter : No. Estimators"
    )
    start_leaf_n = round(0.005 * len(X_ph_train))
    end_leaf_n = round(
        0.05 * len(X_ph_train))  # leaf sizes in the range [0.5%, 5%] of the training set will be tested
    n_est, learn_rate = BoostedGridSearchCV(start_leaf_n, end_leaf_n,
                                            X_ph_train, y_ph_train)
    dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    estimator_phish = AdaBoostClassifier(base_estimator=dt_stump,
                                         n_estimators=n_est,
                                         random_state=100)

    train_samp_phish, BT_train_score_phish, BT_fit_time_phish, BT_pred_time_phish = plot_learning_curve(
        estimator_phish,
        10,
        X_ph_train,
        y_ph_train,
        title="Boosted Tree Phishing Data")
    BT_fp_phish, BT_tp_phish = final_classifier_evaluation(
        estimator_phish,
        X_ph_train,
        X_ph_test,
        y_ph_train,
        y_ph_test,
        title="Boosted Tree Phishing Data")

    compare_fit_time(train_samp_can, NN_fit_time_can, SVM_fit_time_can,
                     kNN_fit_time_can, DT_fit_time_can, BT_fit_time_can,
                     'Breast Cancer Dataset')
    compare_pred_time(train_samp_can, NN_pred_time_can, SVM_pred_time_can,
                      kNN_pred_time_can, DT_pred_time_can, BT_pred_time_can,
                      'Breast Cancer Dataset')
    compare_roc(NN_fp_can, NN_tp_can, SVM_fp_can, SVM_tp_can, KNN_fp_can,
                KNN_tp_can, DTree_fp_can, DTree_tp_can, BT_fp_can, BT_tp_can,
                'Breast Cancer Dataset')

    compare_fit_time(train_samp_phish, NN_fit_time_phish, SVM_fit_time_phish,
                     kNN_fit_time_phish, DT_fit_time_phish, BT_fit_time_phish,
                     'Phishing Dataset')
    compare_pred_time(train_samp_phish, NN_pred_time_phish,
                      SVM_pred_time_phish, kNN_pred_time_phish,
                      DT_pred_time_phish, BT_pred_time_phish,
                      'Phishing Dataset')
    compare_roc(NN_fp_ph, NN_tp_ph, SVM_fp_ph, SVM_tp_ph, KNN_fp_ph, KNN_tp_ph,
                DTree_fp_phish, DTree_tp_phish, BT_fp_phish, BT_tp_phish,
                'Phishing Dataset')

    # In[103]:

    classifiers = ('Decision tree', 'kNN', 'Neural network', 'SVM', 'AdaBoost')
    y_pos = np.arange(len(classifiers))
    f1_score_ph = (0.92, 0.93, 0.95, 0.93, 0.93)
    f1_score_can = (0.95, 0.91, 0.98, 0.97, 0.96)

    # In[106]:

    plt.figure()
    plt.barh(y_pos, f1_score_ph)
    plt.gca().set_yticks(y_pos)
    plt.gca().set_xlim(0.9, 1.0)
    plt.gca().set_yticklabels(classifiers)
    plt.gca().invert_yaxis()  # labels read top-to-bottom
    plt.title('F1 score: Phishing Dataset')
    plt.xlabel('F1 score')
    plt.show()

    # In[107]:

    plt.figure()
    plt.barh(y_pos, f1_score_can)
    plt.gca().set_yticks(y_pos)
    plt.gca().set_xlim(0.9, 1.0)
    plt.gca().set_yticklabels(classifiers)
    plt.gca().invert_yaxis()  # labels read top-to-bottom
    plt.title('F1 score: Breast Cancer Dataset')
    plt.xlabel('F1 score')
    plt.show()
    #scale the attributes
    attributes = [
        "c1", "c2", "c3", "m1", "m2", "m3", "n1", "n2", "n3", "p1", "p2", "p3"
    ]
    scaled_attributes = scaledata(dataframe, attributes)
    scaled_predict = scaledata(dataframe_predict, attributes)

    #fitting training data
    model.fit(scaled_attributes, target)

    #calculate predictions according to fitted model
    probeB_tna = model.predict(scaled_predict)

    #calculate Rsquared
    r2 = metrics.r2_score(target, model.predict(scaled_attributes))

    return r2, probeB_tna


#kNNreg model
kNNregModel = kNN(n_neighbors=6)

#Rsquared of kNNreg
print "kNNreg:", Rsquared(kNNregModel, probeAs, probeA["tna"], probeBs)[0]

#output probeB_tna to csv file "tnaB.csv"
with open("tnaB.csv", "wb") as f:
    thewriter = csv.writer(f)
    thewriter.writerow(["tna"])
    for val in Rsquared(kNNregModel, probeAs, probeA["tna"], probeBs)[1]:
        thewriter.writerow([val])
Exemple #24
0
print('irisData:\n', irisData)
print('irisTarget:\n', irisTarget)

# Choose a model
# RandomForest
'''
model = RandomForestClassifier()
# Parameter search space
param_grid = {
    'max_depth': np.arange(1, 20, 1),
    'n_estimators': np.arange(1, 50, 10),
    'max_leaf_nodes': np.arange(2, 100, 10)
}
'''
# kNN
model = kNN()
# Parameter search space
param_grid = {
    'n_neighbors': list(range(2, 11)),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
# Grid search over the model parameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_micro')
grid_search.fit(irisData, irisTarget)
print('best_params_:\n', grid_search.best_params_)
print('grid_search best_score_:\n', grid_search.best_score_)
print('best_estimator_:\n', type(grid_search.best_estimator_))
'''
# Randomized search over the model parameters
rd_search = RandomizedSearchCV(model, param_grid, n_iter=200, cv=5, scoring='f1_micro')
rd_search.fit(x, y)
Exemple #25
0
import pandas
data = pandas.read_csv('/Users/snehamitta/Desktop/ML/Assignment1/Fraud(1).csv',
                       delimiter=',')

from sklearn.neighbors import NearestNeighbors as kNN
import numpy as np

kNNSpec = kNN(n_neighbors=5, algorithm='brute', metric='euclidean')

trainData = data[[
    'TOTAL_SPEND', 'DOCTOR_VISITS', 'NUM_CLAIMS', 'MEMBER_DURATION',
    'OPTOM_PRESC', 'NUM_MEMBERS'
]]
print(trainData.shape)
# Build nearest neighbors
nbrs = kNNSpec.fit(trainData)
distances, indices = nbrs.kneighbors(trainData)
print(distances)
print(indices)

target = data[['FRAUD']]
print(target.shape)

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5,
                             algorithm='brute',
                             metric='euclidean')
nbrs = neigh.fit(trainData, target)

class_result = nbrs.predict(trainData)
def kNN_pack(xtrain, xtest, ytrain, k):
    model_kNN = kNN(n_neighbors=k)
    model_kNN.fit(xtrain, ytrain)
    ypre = model_kNN.predict(xtest)
    return ypre
Exemple #27
0
#k-Nearest Neighbours
start = time.time()

#To create TFxIDF Matrix by converting document terms into a TF-IDF matrix
tfidfvector = TfIdf(min_df=0.01,
                    max_df=0.5,
                    sublinear_tf=True,
                    stop_words='english')

#Model for Dataset 1
x_train1_tfidf_matrix = tfidfvector.fit(x_variableTrain1)
x_train1_tfidf_matrix = tfidfvector.transform(x_variableTrain1)
x_test1_tfidf_matrix = tfidfvector.transform(x_variableTest1)

k_nearest = kNN(n_neighbors=12, metric="minkowski")  # KNeighborsClassifier takes no random_state parameter
k_n_n = k_nearest.fit(x_train1_tfidf_matrix, y_variableTrain1)
knn_pred1 = k_n_n.predict(x_test1_tfidf_matrix)
knn_pred1_train = k_n_n.predict(x_train1_tfidf_matrix)

accuracy_dataset1 = metrics.accuracy_score(y_variableTest1, knn_pred1)
accuracy_dataset1_train = metrics.accuracy_score(y_variableTrain1,
                                                 knn_pred1_train)
accuracy_dataset1 *= 100
accuracy_dataset1_train *= 100
print(
    "Accuray on Testing Dataset1 (Debate Data) Using k-Nearest Neighbour: %.2f"
    % (accuracy_dataset1) + "%")
print(
    "Accuray on Training Dataset1 (Debate Data) Using k-Nearest Neighbour: %.2f"
    % (accuracy_dataset1_train) + "%")
Exemple #28
0
train.head()

# In[ ]:

from mlxtend.classifier import StackingCVClassifier as SCVC

# In[ ]:

from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import RidgeClassifier as RC

# In[ ]:

clf1 = kNN()
clf2 = SVC(probability=True)
clf3 = RFC()
meta_clf = RC()

# In[ ]:

stacker = SCVC(classifiers=[clf1, clf2, clf3, clf1],
               meta_classifier=meta_clf,
               use_probas=True,
               use_features_in_secondary=True)

# In[ ]:

for c in train.columns:
    train[c] = train[c].fillna(train[c].median())
    # for iter in range(1000):
    #    print nca(A, X, y)
    #    Ascale.append( np.sum(A) )
    #
    # sns.plt.plot(Ascale)
    # sns.plt.show()
    #
    from sklearn.datasets import make_classification
    from sklearn.neighbors import KNeighborsClassifier as kNN
    from sklearn.model_selection import cross_val_score

    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.show()

    clf = kNN(weights='distance')
    scores = cross_val_score(clf, X, y, scoring='neg_log_loss', cv=25)
    print(np.mean(scores))

    A = np.eye(X.shape[1])
    Xt = transform(A, X)
    print(Xt.shape)

    Ascale = []
    for iter in range(20):
        if iter % 5 == 0:
            print('Iteration', iter)
        nca(A, X, y)
        # print 'A',A
        flattenedA = np.sum(A)
        Ascale.append(np.sum(A))
hour = train.Dates.dt.hour
hour = pd.get_dummies(hour) 

 
#Build new array
train_data = pd.concat([hour, days, district], axis=1)
train_data['crime_category']= crime_category

#Repeat for test data
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = test.Dates.dt.hour
hour = pd.get_dummies(hour) 
 
# Array for the test data
test_data = pd.concat([hour, days, district], axis=1)
 
training, validation = train_test_split(train_data, train_size=.66)


# BUILDING THE CLASSIFIER
daysfeatures = [x for x in days]
districtfeatures = [x for x in district]
hoursf = [h for h in hour]
features = daysfeatures + districtfeatures + hoursf
classifier = kNN()
classifier.fit(training[features], training['crime_category'])
validation_predict = classifier.predict(validation[features])

# Testing the model
print accuracy_score(validation['crime_category'], validation_predict)
Exemple #31
0
        f1_train.append(f1_score(y_train, y_pred_train))

    plt.plot(klist, f1_test, 'o-', color='r', label='Test F1 Score')
    plt.plot(klist, f1_train, 'o-', color='b', label='Train F1 Score')
    plt.ylabel('Model F1 Score')
    plt.xlabel('No. Neighbors')

    plt.title(title)
    plt.legend(loc='best')
    plt.tight_layout()
    # plt.show()
    plt.savefig(algoName + "hyper" + ".png")


print("stage 1")
hyperKNN(
    X_train,
    y_train,
    X_test,
    y_test,
    title=
    "Model Complexity Curve for kNN (Phishing Data)\nHyperparameter : No. Neighbors"
)
print("stage 2")
estimator_phish = kNN(n_neighbors=25, n_jobs=-1)
print("stage 3")
train_samp_phish, kNN_train_score_phish, kNN_fit_time_phish, kNN_pred_time_phish = plot_learning_curve(
    estimator_phish, X_train, y_train, title="kNN Phishing Data")
print("stage 4")
final_classifier_evaluation(estimator_phish, X_train, X_test, y_train, y_test)
Exemple #32
0
def combinational_cost(data1, data2, no_of_folds):
    '''
    Parameters
    ----------
    data1 : arrays
        matrix of all costs of group1 (normal). Each individual feature should be arranged as a column
    data2 : arrays
        matrix of all costs of group2 (abnormal). Each individual feature should be arranged as a column
    no_of_folds : int
        specify number of folds for nested cross-validation
    Returns
    -------
    accuracy and AUC of the combinational cost function based on different supervised-learning classifiers for identifying mis-registrations.
    '''
    print(
        f'classifier comparison for DTI metrics with {no_of_folds} fold cross-validation --------------'
    )

    # transposing and creating labels for data1
    X_normal = np.transpose(data1)
    x_normal_label = np.zeros(len(X_normal))

    # transposing and creating labels for data2
    X_misaligned = np.transpose(data2)
    x_misaligned_label = np.ones(len(X_misaligned))

    # combining data1 and data2 and the corresponding labels
    X = np.concatenate((X_normal, X_misaligned))
    y = np.concatenate((x_normal_label, x_misaligned_label))

    # scale the costs (features) so that their ranges match and features with relatively large values do not dominate; this may not be necessary here, since all three costs already lie between 0 and 1
    scale = MaxAbsScaler()
    X = scale.fit_transform(X)

    # K-fold cross validation, n_splits specifies the number of folds
    folds = StratifiedKFold(n_splits=no_of_folds)

    scores_lda = []
    scores_qda = []
    scores_rfc = []
    scores_svm = []
    scores_gnb = []
    scores_knn = []
    scores_lor = []
    scores_ada = []
    scores_gra = []
    scores_ann = []

    auc_lda = []
    auc_qda = []
    auc_rfc = []
    auc_svm = []
    auc_gnb = []
    auc_knn = []
    auc_lor = []
    auc_ada = []
    auc_gra = []
    auc_ann = []

    for train_index, test_index in folds.split(X, y):

        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[
            train_index], y[test_index]

        # 1. Linear Discriminant Analysis Classifier
        lda = LDA(solver='eigen', shrinkage='auto', n_components=1)
        scores_lda.append(
            classifier_accuracy(lda, X_train, X_test, y_train,
                                y_test)[0])  # Accuracy
        auc_lda.append(
            classifier_accuracy(lda, X_train, X_test, y_train,
                                y_test)[1])  # AUC

        # 1a. Quadratic Discriminant Analysis Classifier
        qda = QDA()
        scores_qda.append(
            classifier_accuracy(qda, X_train, X_test, y_train, y_test)[0])
        auc_qda.append(
            classifier_accuracy(qda, X_train, X_test, y_train, y_test)[1])

        # 2. Random Forest Classifier (it could also be fit in the LDA-transformed space if you have a large number of features)
        rfc = RandomForestClassifier(criterion='gini', n_estimators=100)
        scores_rfc.append(
            classifier_accuracy(rfc, X_train, X_test, y_train, y_test)[0])
        auc_rfc.append(
            classifier_accuracy(rfc, X_train, X_test, y_train, y_test)[1])

        # 3. Support Vector Machine Classifier
        svc = SVC(kernel='rbf', gamma=2, probability=True)
        scores_svm.append(
            classifier_accuracy(svc, X_train, X_test, y_train, y_test)[0])
        auc_svm.append(
            classifier_accuracy(svc, X_train, X_test, y_train, y_test)[1])

        # 4. Gaussian Naive Bayes Classifier
        gnb = GaussianNB()
        scores_gnb.append(
            classifier_accuracy(gnb, X_train, X_test, y_train, y_test)[0])
        auc_gnb.append(
            classifier_accuracy(gnb, X_train, X_test, y_train, y_test)[1])

        # 5. k-Nearest Neighbour Classifier
        knn = kNN(n_neighbors=15)
        scores_knn.append(
            classifier_accuracy(knn, X_train, X_test, y_train, y_test)[0])
        auc_knn.append(
            classifier_accuracy(knn, X_train, X_test, y_train, y_test)[1])

        # 6. Logistic Regression Classifier
        lor = LogisticRegression()
        scores_lor.append(
            classifier_accuracy(lor, X_train, X_test, y_train, y_test)[0])
        auc_lor.append(
            classifier_accuracy(lor, X_train, X_test, y_train, y_test)[1])

        # 7. Ada Boost Classifier
        ada = AdaBoostClassifier(n_estimators=100)
        scores_ada.append(
            classifier_accuracy(ada, X_train, X_test, y_train, y_test)[0])
        auc_ada.append(
            classifier_accuracy(ada, X_train, X_test, y_train, y_test)[1])

        # 7a. Gradient Boosting Classifier
        gra = GradientBoostingClassifier(random_state=0)
        scores_gra.append(
            classifier_accuracy(gra, X_train, X_test, y_train, y_test)[0])
        auc_gra.append(
            classifier_accuracy(gra, X_train, X_test, y_train, y_test)[1])

        # 8. Artificial Neural Network (Deep Learning)
        # model_ann = tf.keras.models.Sequential()
        # model_ann.add(tf.keras.layers.Dense(units = np.shape(X_train)[1] + 1, activation = 'relu', input_shape = (np.shape(X_train)[1],))) # input_shape takes height of the input layer which is usually fed during first dense layer allocation
        # model_ann.add(tf.keras.layers.Dense(units = np.shape(X_train)[1] + 1, activation = 'relu')) # hidden layer
        # model_ann.add(tf.keras.layers.Dense(units = np.shape(X_train)[1] + 2, activation = 'relu')) # hidden layer
        # model_ann.add(tf.keras.layers.Dense(units = np.shape(X_train)[1] + 1, activation = 'relu')) # hidden layer
        # model_ann.add(tf.keras.layers.Dense(units = 2, activation = 'softmax')) # hidden layer
        # model_ann.compile(optimizer = 'sgd', loss = 'binary_crossentropy', metrics = ['accuracy']) # compile the neural network
        # model_ann.fit(X_train, y_train, epochs = 20) # fit the neural network on the training data
        # scores_ann.append(model_ann.evaluate(X_test, y_test)) # network accuracy
        # auc_ann.append(metrics.roc_auc_score(y_test, model_ann.predict_proba(X_test)[:, 1])) # network AUC

    # Note: sklearn's 'cross_val_score' could be applied directly to each classifier to avoid the loop above. Further, the f1-score could be used instead of accuracy if the number of positive (mis-aligned) samples is low.
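    # A minimal sketch of that shortcut, shown here for the kNN model only (it reuses the
    # X, y, no_of_folds and StratifiedKFold already defined in this function):
    from sklearn.model_selection import cross_val_score
    knn_cv_f1 = cross_val_score(kNN(n_neighbors=15), X, y,
                                cv=StratifiedKFold(n_splits=no_of_folds),
                                scoring='f1')
    print(f'kNN f1-score via cross_val_score: {np.average(knn_cv_f1)}\n')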

    print(
        f'accuracy using LDA classifier for dti measures is: {np.average(scores_lda)}, AUC is: {np.average(auc_lda)}\n'
    )
    print(
        f'accuracy using QDA classifier for dti measures is: {np.average(scores_qda)}, AUC is: {np.average(auc_qda)}\n'
    )
    print(
        f'accuracy using RandomForest classifier for dti measures is: {np.average(scores_rfc)}, AUC is: {np.average(auc_rfc)}\n'
    )
    print(
        f'accuracy using SVM classifier for dti measures is: {np.average(scores_svm)}, AUC is: {np.average(auc_svm)}\n'
    )
    print(
        f'accuracy using Naive Bayes classifier for dti measures is: {np.average(scores_gnb)}, AUC is: {np.average(auc_gnb)}\n'
    )
    print(
        f'accuracy using kNN classifier for dti measures is: {np.average(scores_knn)}, AUC is: {np.average(auc_knn)}\n'
    )
    print(
        f'accuracy using Logistic Regression classifier for dti measures is: {np.average(scores_lor)}, AUC is: {np.average(auc_lor)}\n'
    )
    print(
        f'accuracy using Ada Boost classifier for dti measures is: {np.average(scores_ada)}, AUC is: {np.average(auc_ada)}\n'
    )
    print(
        f'accuracy using Gradient boosting classifier for dti measures is: {np.average(scores_gra)}, AUC is: {np.average(auc_gra)}\n'
    )
    #print(f'accuracy using ANN for dti measures is: {np.average(scores_ann)}, AUC is: {np.average(auc_ann)}\n')

    save_model = 'D:/Tummala/Parkinson-Data/ml-models-dti1000'

    if not os.path.exists(save_model):
        os.makedirs(save_model)

    # save the fitted scaler and the classifiers trained on the last fold (e.g. the Ada Boost model)
    pickle.dump(scale, open(save_model + '/' + 'scale', 'wb'))
    pickle.dump(lda, open(save_model + '/' + 'lda', 'wb'))
    pickle.dump(qda, open(save_model + '/' + 'qda', 'wb'))
    pickle.dump(rfc, open(save_model + '/' + 'rfc', 'wb'))
    pickle.dump(svc, open(save_model + '/' + 'svm', 'wb'))
    pickle.dump(gnb, open(save_model + '/' + 'gnb', 'wb'))
    pickle.dump(knn, open(save_model + '/' + 'knn', 'wb'))
    pickle.dump(lor, open(save_model + '/' + 'lor', 'wb'))
    pickle.dump(ada, open(save_model + '/' + 'ada_boost', 'wb'))
Exemple #33
0
                                             'Alcalinity of ash', 'Magnesium', 'Total phenols'
                                             'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                                             'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                                             'Proline' ]
 """                                            

Classes = data[0]
Features = data.drop(0, axis = 1)
from sklearn.cross_validation import KFold
from sklearn.preprocessing import scale
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as kNN


CV = KFold(Classes.count(), n_folds = 5, shuffle = True, random_state = 42)
params = {'n_neighbors':list(range(1,50))}
grid = GridSearchCV(kNN(), params, cv = CV)
grid.fit(X = Features, y = Classes)
print(grid.best_score_)
print(grid.best_params_)

X_scaled = scale(Features)
grid_scaled = GridSearchCV(kNN(), params, cv = CV)
grid_scaled.fit(X = X_scaled, y = Classes)
print(grid_scaled.best_score_)
print(grid_scaled.best_params_)




Exemple #34
0
# In[15]:

phishX, phishY, bankX, bankY = import_data()

X_train, X_test, y_train, y_test = train_test_split(np.array(phishX),
                                                    np.array(phishY),
                                                    test_size=0.20)
hyperKNN(
    X_train,
    y_train,
    X_test,
    y_test,
    title=
    "Model Complexity Curve for kNN (Phishing Data)\nHyperparameter : No. Neighbors"
)
estimator_phish = kNN(n_neighbors=20, n_jobs=-1)
train_samp_phish, kNN_train_score_phish, kNN_fit_time_phish, kNN_pred_time_phish = plot_learning_curve(
    estimator_phish, X_train, y_train, title="kNN Phishing Data")
final_classifier_evaluation(estimator_phish, X_train, X_test, y_train, y_test)

X_train, X_test, y_train, y_test = train_test_split(np.array(bankX),
                                                    np.array(bankY),
                                                    test_size=0.20)
hyperKNN(
    X_train,
    y_train,
    X_test,
    y_test,
    title=
    "Model Complexity Curve for kNN (Banking Data)\nHyperparameter : No. Neighbors"
)
Exemple #35
0
    def classifier(self, sdss, des, des_tags=None, sdss_tags = None, train=None, train_size=60000):

        print 'after prior cut (s/d) :', len(sdss), len(des)
        print 'train_size ', train_size
        """
        dperp_sdss = ( sdss['MODELMAG_R'] - sdss['MODELMAG_I']) - (sdss['MODELMAG_G'] - sdss['MODELMAG_R'])/8.
        sdss_cuts = ( ( dperp_sdss > 0.55 ) &
                     (sdss['CMODELMAG_I'] < (19.86 + 1.6*(dperp_sdss - 0.8))) &
                     (sdss['CMODELMAG_I'] < 19.9) &
                     (sdss['CMODELMAG_I'] > 17.5) &
                     (sdss['FIBER2MAG_I'] < 21.5) &
                     ((sdss['MODELMAG_R'] - sdss['MODELMAG_I']) < 2.) )
        """
        sdss_matched, des_matched = sdss, des
        
        #sdss_matched, des_matched = DES_to_SDSS.match(sdss, des)
        #sdss, des = DES_to_SDSS.match(sdss, des)
        #sdss_cuts = SDSS_cmass_criteria(sdss)
        #sdss, des = DES_to_SDSS.match(sdss, des)
        #if train is None:

        sdss_cuts = self._SDSS_cmass_criteria(sdss_matched)
        train = np.random.choice(np.arange(des_matched.size), size=train_size, replace=False)
        train_mask = np.zeros(des_matched.size, dtype=bool)
        train_mask[train] = 1
        test = np.where(~train_mask)[0]
        test_size = np.sum((~train_mask))

        x = self._arrange_data_for_fitting(des_matched[train],tags=des_tags)
        y = np.zeros(train_size)
        y[sdss_cuts[train]] = 1

        x_test = self._arrange_data_for_fitting(des_matched[test],tags=des_tags)
        y_test = np.zeros(test_size)
        y_test[sdss_cuts[test]] = 1
    
        # full-sample design matrix and labels, needed for the predictions below
        x_all = self._arrange_data_for_fitting(des,tags=des_tags)
        y_all = np.zeros(des.size)
        y_all[sdss_cuts] = 1

        """
        elif train is not None:
            
            train_sample = train.copy()
            sdss_matched, train_sample = DES_to_SDSS.match(sdss, train_sample)
            sdss_cuts = self._SDSS_cmass_criteria(sdss_matched)
            
            train = np.random.choice(np.arange(train_sample.size), size=train_size, replace=False)
            train_mask = np.zeros(train_sample.size, dtype=bool)
            train_mask[train] = 1
            test = np.where(~train_mask)[0]
            test_size = np.sum((~train_mask))

            x = self_.arrange_data_for_fitting(train_sample[train],tags=des_tags)
            y = np.zeros(train_size)
            y[sdss_cuts[train]] = 1
        
            x_test = self_.arrange_data_for_fitting(train_sample[test],tags=des_tags)
            y_test = np.zeros(test_size)
            y_test[sdss_cuts[test]] = 1

            #x_all = self_.arrange_data_for_fitting(des,tags=des_tags)
            #y_all = np.zeros(des.size)
            #y_all[sdss_cuts] = 1
        """
        
        print 'train set ', np.sum(train_mask), ' test set ', np.sum((~train_mask))
        print 'cmass/train', np.sum(sdss_cuts[train]), ' cmass/test', np.sum(sdss_cuts[test]), ' total', np.sum(sdss_cuts)


        #from sklearn.ensemble import RandomForestClassifier as rfc
        #from sklearn.ensemble import AdaBoostClassifier as rfc
        #from sklearn.ensemble import GradientBoostingClassifier as rfc
        #pl = rfc(n_estimators=1000)
        from sklearn.neighbors import KNeighborsClassifier as kNN
        
        n_neighbors = 100 #   int(train_size * 0.02)
        print 'n_neighbors', n_neighbors
        pl = kNN(n_neighbors=n_neighbors,weights='distance',p=2,n_jobs=-1)
        pl.fit(x,y)
        predict_test = pl.predict(x_test)
        truth_test = y_test == 1
        
        predict_test_all = pl.predict(x_all)
        #truth_test_all = y_all == 1
        
        print "Classifier completeness:", np.sum(predict_test * truth_test) *1. / np.sum(truth_test)
        print "Classifier purity:", np.sum(predict_test * truth_test) * 1./np.sum(predict_test)
        print "number (test/all)", np.sum(predict_test), np.sum(predict_test_all)


        # Now reverse it, and see what SDSS galaxies we can be sure are selected by the classifier.

        #sdss_matched, des_matched = DES_to_SDSS.match(sdss, des)
        x = self._arrange_data_for_fitting(sdss[train],tags=sdss_tags)
        y = np.zeros(sdss.size)
        y[predict_test_all[train] == 1] = 1
        
        x_test = self._arrange_data_for_fitting(sdss_matched[test],tags=sdss_tags)
        y_test[predict_test == 1] = 1

        x_all = self._arrange_data_for_fitting(sdss,tags=sdss_tags)
        y_all[predict_test_all == 1] = 1
        
        pl2 = kNN(n_neighbors=n_neighbors,weights='distance',p=2,n_jobs=-1)
        pl2.fit(x,y)
        predict_rev = pl2.predict(x_test)
        good = (predict_rev ==0) & (predict_test == 1)
        
        predict_rev_all = pl2.predict(x_all)
        good = (predict_rev_all ==0) & (predict_test_all == 1)
        
        print "Reverse classifier completeness:", np.sum(predict_rev * predict_test ) *1. / np.sum(predict_test)
        print "Reverse classifier purity:", np.sum(predict_rev * predict_test) * 1./np.sum(predict_rev)
        
        return pl, (predict_test_all == 1), (predict_test_all == 1)
Exemple #36
0
    # print(image_test, label_test);

    image_test = read_image(test_image_path)
    label_test = read_label(test_label_path)

    image_train = read_image(train_image_path)
    label_train = read_label(train_label_path)
    # print_num(image_train[0])
    # print(label_train)

    # Binarize the images: pixels >= 128 become 1, pixels < 128 become 0. The error rate rises slightly, but the model generalizes better.
    image_test = pic_binary(image_test)
    image_train = pic_binary(image_train)
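    # A minimal sketch of what the pic_binary helper used above might do (assumption; the
    # real helper is defined elsewhere in this project and is not shown in this snippet):
    import numpy as np
    def pic_binary_sketch(images):
        # map pixel values >= 128 to 1 and values < 128 to 0
        return (np.asarray(images) >= 128).astype(np.uint8)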

    # print_num(image_train[0])

    knn = kNN(n_neighbors = 5, algorithm = 'auto')
    knn.fit(image_train, label_train)

    res = knn.predict(image_test[:100])
    #
    right = 0
    wrong = 0
    for i in range(len(res)):
        if (res[i] == label_test[i]):
            right += 1
            print("预测值:%s,真是值:%s" % (res[i], label_test[i]))
        else:
            wrong += 1
            print("\033预测值:%s,真是值:%s\033" % (res[i], label_test[i]))
    print("准确率:%2f%%" % (right*100/(right+wrong)))
Exemple #37
0
plt.title('Twin moon dataset')
plt.show()

#Divide train and test, forget about the latter for a while
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

test_errors = []
errors = []
stds = []
cv_numb = 10
upper_bound_k = len(X_train) - (len(X_train) // cv_numb) - 1
print('K from 1 to {}'.format(upper_bound_k))
K = range(0, upper_bound_k, 10)
for k in K:
    #Create knn classifier and fit with
    knn = kNN(n_neighbors=k + 1)

    #Perform cross validation with cv_numb divisions
    results = cross_val_score(knn, X_train, y_train, cv=cv_numb)

    #Mean error and std for this step of cross-validation
    mean_error = np.array(results).sum() / cv_numb
    std = np.array(results).std()

    errors.append(1 - mean_error)
    stds.append(std)

    #How this k performs on future data? Retrain with that k and whole set and score test
    tstknn = kNN(n_neighbors=k + 1).fit(X_train, y_train)
    y_pred = tstknn.predict(X_test)
    test_errors.append(1 - np.sum(y_pred == y_test) / len(y_pred))
Exemple #38
0
from sklearn import datasets
from sklearn import neighbors, metrics
from sklearn.neighbors import KNeighborsClassifier as kNN
import matplotlib.pyplot as plt


Digits = datasets.load_digits()
Imglabels = [x for x in list(zip(Digits.images, Digits.target)) if x[1] == 7 or x[1] == 3]
for ind, (image, label) in enumerate(Imglabels[:4]):
    plt.subplot(2, 4, ind + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)
plt.show()

num = len(Digits.images)  # number of samples
imgs = Digits.images.reshape((num, -1))
labs = Digits.target

y_trainset, x_trainset = labs[:int(num*.7)].reshape(-1,), imgs[:int(num*.7)]
y_testset, x_testset= labs[int(num*.7):].reshape(-1,), imgs[int(num*.7):]

neighbor = kNN(n_neighbors=3)
neighbor.fit(x_trainset, y_trainset)

new_val = neighbor.predict(x_testset)

print("kNN classifirer reports: %s:\n%s\n" % (neighbor, metrics.classification_report(y_testset, new_val)))
print("Confusion matrix is:", metrics.confusion_matrix(y_testset, new_val))