Exemple #1
0
def traineKNNHSV(inFHEE, inFPAS, base):
    """Train four KNN classifiers on pickled samples and save them to disk.

    Reads ``int(qtdDataTrainning / 2)`` pickled samples from each stream,
    alternating H&E and PAS, then runs a 10-fold split in which KNN models
    with 5, 10, 15 and 20 neighbours are fitted and scored on every fold.
    The models that are pickled are therefore the ones fitted on the
    training part of the *last* fold.

    :param inFHEE: open binary stream of pickled H&E samples; closed here.
    :param inFPAS: open binary stream of pickled PAS samples; closed here.
    :param base: 360, 180, 90 or 45; selects the index (0-3) used in the
        output file name. Any other value falls back to index 0.
    """
    X = []
    y = []
    # NOTE: pickle.load executes arbitrary code from the stream; only use it
    # on files produced by this project.
    try:
        for _ in range(int(qtdDataTrainning / 2)):
            X.append(pickle.load(inFHEE))
            y.append(labelHEE)
            X.append(pickle.load(inFPAS))
            y.append(labelPAS)
    finally:
        # The function owns the streams: release them even if a load fails.
        inFHEE.close()
        inFPAS.close()

    X1 = np.array(X)
    y1 = np.array(y)
    classifiers = [KNeighborsClassifier(n_neighbors=n) for n in (5, 10, 15, 20)]
    kf = KFold(n_splits=10)
    print(len(y1), len(X1))

    for k, (train, test) in enumerate(kf.split(X1, y1)):
        print("TRAIN:", train, " - TEST:", test)
        for clf in classifiers:
            clf.fit(X1[train], y1[train])
        print("K ->", k)

        for clf in classifiers:
            print(clf.score(X1[test], y1[test]))

    # Map the base to the numeric index embedded in the file name.
    fileIndex = {360: 0, 180: 1, 90: 2, 45: 3}.get(base, 0)
    n = "..\\trainedKNN_%dHistogramColors_H.bin" % fileIndex
    with open(n, "wb") as knnFile:
        for clf in classifiers:
            print(clf.get_params())
        # The four models are written back-to-back, in ascending k order.
        for clf in classifiers:
            pickle.dump(clf, knnFile)
Exemple #2
0
def traineKNN(channels, numberOfShifts):
    """Train four KNN classifiers for a channel set and save them to disk.

    Loads ``int(qtdDataTrainning / 2)`` pickled samples from each of the H&E
    and PAS files, linearises them with ``linearizeTraining``, then runs a
    10-fold split in which KNN models with 5, 10, 15 and 20 neighbours are
    fitted and scored on every fold. The pickled models are the ones fitted
    on the training part of the *last* fold.

    :param channels: channel identifier used in the input and output paths.
    :param numberOfShifts: right-shift applied to 256 to obtain the base;
        also used in the file names.
    """
    fHEE = "..\\Logs\\3\\Treinamento\\"+channels+"\\output"+channels+"_"+str(numberOfShifts)+"HistogramColors_H&E.bin"
    # NOTE(review): the PAS path starts with the literal "******" -- it looks
    # like a masked directory prefix; confirm the real location.
    fPAS = "******"+channels+"\\output"+channels+"_"+str(numberOfShifts)+"HistogramColors_PAS.bin"

    X = []
    y = []
    # NOTE: pickle.load executes arbitrary code from the file; only use it on
    # files produced by this project.
    with open(fHEE, 'rb') as inFHEE, open(fPAS, 'rb') as inFPAS:
        print("Training KNN to channels "+channels+" with "+str(numberOfShifts)+" shifts")
        for _ in range(int(qtdDataTrainning / 2)):
            X.append(pickle.load(inFHEE))
            y.append(labelHEE)
            X.append(pickle.load(inFPAS))
            y.append(labelPAS)

    base = 256 >> numberOfShifts
    X1 = linearizeTraining(base, channels, X)
    y1 = np.array(y)
    classifiers = [KNeighborsClassifier(n_neighbors=n) for n in (5, 10, 15, 20)]
    kf = KFold(n_splits=10)
    print(len(y1), len(X1))

    for k, (train, test) in enumerate(kf.split(X1, y1)):
        print("TRAIN:", train, " - TEST:", test)
        for clf in classifiers:
            clf.fit(X1[train], y1[train])
        print("K ->", k)

        for clf in classifiers:
            print(clf.score(X1[test], y1[test]))

    outName = "..\\trainedKNN_"+str(numberOfShifts)+"HistogramColors_"+channels+".bin"
    with open(outName, "wb") as knnFile:
        for clf in classifiers:
            print(clf.get_params())
        # The four models are written back-to-back, in ascending k order.
        for clf in classifiers:
            pickle.dump(clf, knnFile)
Exemple #3
0
def kFoldValidationMethod(X_train, y_train):
  """Compare three default classifiers with 10-fold validation.

  For every fold a LogisticRegression, a RandomForestClassifier and a
  KNeighborsClassifier are fitted on the training part and scored (accuracy)
  on the validation part.

  :param X_train: features; indexed positionally with ``.iloc``.
  :param y_train: targets; indexed positionally with ``.iloc``.
  :return: tuple ``(best_lr, best_rf, best_knn, params_lr, params_rf,
      params_knn, accuracyTable)`` where the ``best_*`` values are the highest
      fold accuracies, ``params_*`` are the ``get_params()`` dicts of the
      best-scoring fold model (``None`` if no fold scored above 0) and
      ``accuracyTable`` has one row per fold.
  """
  kf = KFold(n_splits = 10)

  bestAccuracyLogisticRegression = 0
  bestAccuracyRandomForestClassifier = 0
  bestAccuracyKNNClassifier = 0

  # Pre-bind so the return statement cannot hit an unbound name when no fold
  # ever beats the initial accuracy of 0.
  paramsLogisticRegression = None
  paramsRandomForestClassifier = None
  paramsKNNClassifier = None

  # Rows are collected in a list: DataFrame.append was removed in pandas 2.0.
  foldRows = []
  for train_index, validation_index in kf.split(X_train):
    X_fit, y_fit = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val, y_val = X_train.iloc[validation_index], y_train.iloc[validation_index]

    modelLogisticRegression = LogisticRegression().fit(X_fit, y_fit)
    modelRandomForestClassifier = RandomForestClassifier().fit(X_fit, y_fit)
    modelKNNClassifier = KNeighborsClassifier().fit(X_fit, y_fit)

    accuracyLogisticRegression = accuracy_score(y_val, modelLogisticRegression.predict(X_val))
    accuracyRandomForestClassifier = accuracy_score(y_val, modelRandomForestClassifier.predict(X_val))
    accuracyKNNClassifier = accuracy_score(y_val, modelKNNClassifier.predict(X_val))

    if accuracyLogisticRegression > bestAccuracyLogisticRegression:
      bestAccuracyLogisticRegression = accuracyLogisticRegression
      paramsLogisticRegression = modelLogisticRegression.get_params()
    if accuracyRandomForestClassifier > bestAccuracyRandomForestClassifier:
      bestAccuracyRandomForestClassifier = accuracyRandomForestClassifier
      paramsRandomForestClassifier = modelRandomForestClassifier.get_params()
    if accuracyKNNClassifier > bestAccuracyKNNClassifier:
      bestAccuracyKNNClassifier = accuracyKNNClassifier
      paramsKNNClassifier = modelKNNClassifier.get_params()

    foldRows.append({'Logistic Regression': accuracyLogisticRegression, 'Random Forest': accuracyRandomForestClassifier, 'KNN Classifier': accuracyKNNClassifier})

  accuracyTable = pd.DataFrame(foldRows, columns = ['Logistic Regression', 'Random Forest', 'KNN Classifier'])

  return (bestAccuracyLogisticRegression, bestAccuracyRandomForestClassifier, bestAccuracyKNNClassifier, paramsLogisticRegression, paramsRandomForestClassifier, paramsKNNClassifier, accuracyTable)
def KNeighbors_Model(X_train, y_train, X_test, y_test):
    """Pick the best k in 1..3 for a KNN classifier and report on it.

    Each candidate is fitted on the training data and scored on the test
    data. The best-scoring candidate (first one wins on ties) is the one
    whose score is printed and returned, whose parameters are shown and
    whose predictions feed the confusion matrix.

    :return: the best test-set score found.
    """
    max_score = 0.0
    best_model = None
    best_predictions = None
    cv_scores = None

    for n_neighbors in range(1, 4):
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        score = model.score(X_test, y_test)

        if score > max_score:
            max_score = score
            best_model = model
            best_predictions = predictions
            # NOTE(review): cross-validation runs on the *test* split, as in
            # the original code -- confirm this is intended.
            cv_scores = cross_val_score(model, X_test, y_test, cv=3)

    if best_model is None:
        # No candidate scored above 0: fall back to the last one tried.
        best_model, best_predictions = model, predictions

    print(' ')
    print('===== k-Neighbors Model =====')
    print('score:', max_score)
    print('cross validation scores:', cv_scores)

    # Visualize parameters in a table.
    visualize_params(best_model.get_params())

    # Display confusion matrix.
    visualize_heatmap(y_test, best_predictions, 'k-Neighbors')

    return max_score
Exemple #5
0
    def getmodel(self, type):
        """Build the estimator registered under the short name ``type``.

        Supported names: 'knn', 'nearestcentroid', 'svm', 'gaussianprocess',
        'rf', 'ada' and 'mlp'. Any other name prints a message and exits the
        process with status 1. If the estimator exposes a ``random_state``
        parameter it is overwritten with ``self.random_state``.
        """
        # Lazy factories: only the requested estimator is ever constructed.
        factories = (
            ('knn', lambda: KNeighborsClassifier(n_neighbors=5)),
            ('nearestcentroid', lambda: NearestCentroid()),
            ('svm', lambda: SVC(gamma='scale')),
            ('gaussianprocess', lambda: GaussianProcessClassifier()),
            ('rf', lambda: RandomForestClassifier(n_estimators=100,
                                                  max_features=10,
                                                  max_depth=5)),
            ('ada', lambda: AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                              n_estimators=100,
                                              random_state=24)),
            ('mlp', lambda: MLPClassifier(solver='adam',
                                          hidden_layer_sizes=(28, 28),
                                          random_state=1)),
        )

        for name, build in factories:
            if type == name:
                model = build()
                break
        else:
            print("Unsupported quality estimator %s" % type)
            exit(1)

        if 'random_state' in model.get_params():
            model.set_params(random_state=self.random_state)
        return model
Exemple #6
0
def assign_to_clusters(x_train: np.ndarray, clusters: np.ndarray,
                       x_test: np.ndarray, y_true: np.ndarray) -> None:
    """
    Assigns new data to existing clusters with a default k-nearest-neighbors
    classifier, and plots the outcome unless plotting is disabled.

    :param x_train: the data which have been clustered.
    :param clusters: the cluster assignments, used as training targets.
    :param x_test: the data to be assigned to clusters.
    :param y_true: the data class labels, forwarded to the plotter.
    """
    logger.log('Creating Nearest Neighbors classifier with params:')
    classifier = KNeighborsClassifier()
    params = classifier.get_params()
    logger.log(params)
    classifier.fit(x_train, clusters)
    predicted = classifier.predict(x_test)

    # Nothing left to do when plotting is switched off.
    if PLOTTING_MODE == 'none':
        return

    # Plot data vs clusters; k appears in both the file name and the title.
    k = params['n_neighbors']
    plotter.subfolder = 'classification'
    plotter.filename = 'data_vs_clusters-k={}'.format(k)
    plotter.xlabel = 'first feature'
    plotter.ylabel = 'second feature'
    plotter.title = 'Classified data vs Clusters'
    plotter.scatter_classified_comparison(
        x_train, clusters, x_test, y_true, predicted,
        'Test data vs clusters',
        'Test data assigned to clusters\nk={}'.format(k),
        helpers.datasets.get_gene_name)
def test_nearest_neighbor_defaults():
    """Unit test for the Nearest Neighbors classifier container.

    Verifies that the container, with default parameters, creates a
    classifier with the same parameters as a default sklearn
    KNeighborsClassifier and yields the same predictions on every
    tutorial dataset."""
    datasets = generate_tutorial_data()

    reference_clf = KNeighborsClassifier()

    # Reference predictions obtained directly from sklearn.
    expected_predictions = []
    for ds_name in datasets:
        features, labels = datasets[ds_name]
        reference_clf.fit(features, labels)
        expected_predictions.append(reference_clf.predict(features))

    clf_container = classifiers.KNeighborsContainer()

    # Default parameters must match.
    assert clf_container.create_clf().get_params() == reference_clf.get_params()

    # The container's evaluate() must reproduce the reference predictions.
    for ds_name, expected in zip(datasets, expected_predictions):
        features, labels = datasets[ds_name]
        pipeline_ds = generate_pipeline_dataset(features, labels)
        actual = clf_container.evaluate(pipeline_ds.training_set,
                                        pipeline_ds.testing_set)
        assert len(actual) == len(expected)
        assert (actual == expected).all()
def get_knn(x_train, t_train, x_val, t_val, search=False):
    """Fit a KNN classifier and print its parameters and validation result.

    With ``search=True`` the hyper-parameters come from ``param_sel`` over a
    small grid; otherwise a fixed, previously recorded configuration is used.

    :return: the fitted KNeighborsClassifier.
    """
    # Recorded result of an earlier run:
    # KNN params: {'algorithm': 'kd_tree', 'leaf_size': 30, 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}
    # KNN tested at (array([0.88484087, 0.88107203, 0.89007538, 0.88628272, 0.89256545]), 0.6603707265070087, 0.9209732808442919)
    if not search:
        knn_params = dict(algorithm='kd_tree',
                          leaf_size=30,
                          n_neighbors=20,
                          p=1,
                          weights='distance')
    else:
        search_grid = {
            'n_neighbors': [1, 3, 5, 10, 20],
            'weights': ['uniform', 'distance'],
            'algorithm': ['ball_tree', 'kd_tree'],
            'p': [1, 2],
        }
        knn_params = param_sel(x_train, t_train, KNeighborsClassifier(),
                               search_grid)

    knn_classifier = KNeighborsClassifier(**knn_params)
    knn_classifier.fit(x_train, t_train)
    print("KNN params:", knn_classifier.get_params())
    print("KNN validated at", validate(knn_classifier, x_val, t_val))
    return knn_classifier
def runKNNSimulation(dataTrain, dataTest, holdout, train_M, test_M, hold_M):
    """Train, tune and evaluate a distance-weighted KNN classifier.

    Steps: fit a baseline KNN on ``train_M``; sweep ``n_neighbors`` from 2 up
    to ten times the baseline value, scoring each on the hold-out set; refit
    with the best value; report test and training scores; list up to ten
    misclassified test samples; plot learning curves. Progress is appended
    to ``knnLog25.txt`` and echoed to stdout.

    ``dataTrain``, ``dataTest`` and ``holdout`` provide ``.target``
    (``dataTest`` also ``.data``); ``train_M``, ``test_M`` and ``hold_M`` are
    the corresponding feature matrices.
    """
    # print(...) with a single argument behaves identically on Python 2 and 3.
    # The log file is closed even if training or scoring raises.
    with open('knnLog25.txt', 'a') as outFile:
        print('running mashable knn simulation')
        outFile.write('train==> %d, %d \n' % (train_M.shape[0], train_M.shape[1]))
        outFile.write('test==>  %d, %d \n' % (test_M.shape[0], test_M.shape[1]))
        with SimpleTimer('time to train', outFile):
            clf = KNeighborsClassifier(weights='distance').fit(train_M, dataTrain.target)

        baseNeighbors = clf.get_params(True)['n_neighbors']
        # The title is now formatted; it used to contain a literal "%d".
        plot_learning_curve(clf, 'knn with %d neighbors' % baseNeighbors,
                            train_M, dataTrain.target, cv=5, n_jobs=4)

        baseScore = clf.score(test_M, dataTest.target)
        print('baseline score %.3f base n_neighbors %d' % (baseScore, baseNeighbors))
        outFile.write('baseline score %.3f base n_neighbors %d \n' % (baseScore, baseNeighbors))

        res = []
        with SimpleTimer('time to fine tune number of neighbors', outFile):
            for neighbors in range(2, baseNeighbors * 10):
                clf = KNeighborsClassifier(n_neighbors=neighbors, weights='distance').fit(train_M, dataTrain.target)
                score = clf.score(hold_M, holdout.target)
                res.append((score, neighbors))
                outFile.write('%d %.3f \n' % (neighbors, score))
        # Highest hold-out score first; sorted() is stable, so ties keep the
        # smaller neighbour count in front.
        res = sorted(res, key=lambda x: x[0], reverse=True)
        print(res[:5])
        bestNeighbors = res[0][1]
        print('best number of neighbors is %d' % bestNeighbors)
        outFile.write('best number of neighbors is %d  and score is %.3f\n' % (bestNeighbors, res[0][0]))

        bestClf = KNeighborsClassifier(n_neighbors=bestNeighbors, weights='distance')
        bestClf.fit(train_M, dataTrain.target)

        predicted = bestClf.predict(test_M)
        trainPredict = bestClf.predict(train_M)
        print('testing score')
        outFile.write('testing score')
        outputScores(dataTest.target, predicted, outFile)
        print('training score')
        # This section used to be mislabelled 'testing score' in the log.
        outFile.write('training score')
        outputScores(dataTrain.target, trainPredict, outFile)

        results = predicted == dataTest.target
        print(numpy.mean(results))
        wrong = [i for i, ok in enumerate(results) if not ok]
        print('classifier got these wrong:')
        for i in wrong[:10]:
            print('%s %s' % (dataTest.data[i], dataTest.target[i]))
            outFile.write('%s %d \n' % (dataTest.data[i], dataTest.target[i]))

    plot_learning_curve(bestClf, 'knn with %d neighbors' % bestNeighbors, train_M, dataTrain.target, cv=5, n_jobs=4)
Exemple #10
0
 def perform_KNeighborsClassifier(self):
     """Fit a default KNN on the training data and report test results.

     Stores the parameters, the test-set predictions and the test-set score
     in ``self.KNeighborsClassifier_result``, prints that dict and then the
     macro-averaged F1 score.
     """
     model = KNeighborsClassifier()
     model.fit(self.data_train, self.labels_train)
     self.KNeighborsClassifier_result = {
         "parameters": model.get_params(),
         "labels_test_data": model.predict(self.data_test),
         "score": model.score(self.data_test, self.labels_test),
     }

     print_dict(self.KNeighborsClassifier_result)
     print("f1_score:")
     predicted_labels = self.KNeighborsClassifier_result["labels_test_data"]
     print(f1_score(self.labels_test, predicted_labels, average='macro'))
Exemple #11
0
def recommend(interactions_map, item_profiles, user_profiles, target_users):
    """Prototype item recommender based on nearest neighbours of item profiles.

    Lists the de-duplicated (user, item) interactions, prepares the item
    table with ``prepare_row``, fits a cosine brute-force KNN on the items
    and prints the neighbours of the first interacted item for the first
    three target users. Returns ``None``; results are only printed.

    :param interactions_map: frame with at least ``user_id`` and ``item_id``.
    :param item_profiles: item table with an ``id`` column.
    :param user_profiles: unused here.
    :param target_users: frame with a ``user_id`` column.
    """
    # Older baselines that can be run instead, for comparison:
    #   recommendTopPop(interactions_map, item_profiles, user_profiles, target_users)
    #   recommendTopPopNP(interactions_map, item_profiles, user_profiles, target_users)

    print("Listing users interactions...")
    tic = time.time()

    print(interactions_map.head())
    interacting_items = interactions_map[['user_id', 'item_id']]
    print('--------------------------')
    print(interacting_items.head())
    listed_interactions = interacting_items.groupby('user_id').apply(
        lambda group: group.drop_duplicates())
    listed_interactions = listed_interactions[['item_id']]
    print(listed_interactions.head())

    print("Interactions listed in {:.3f} sec!".format(time.time() - tic))

    # Prepare the item table, making all fields meaningful
    print("Preparing rows...")
    tic = time.time()
    item_profiles = item_profiles.fillna(0)
    print("# NaN filled")
    item_profiles = item_profiles.apply(prepare_row, axis=1)
    print("Done in {:.3f} sec!".format(time.time() - tic))

    print("Training started...")
    neigh = KNeighborsClassifier(n_neighbors=5,
                                 metric='cosine',
                                 algorithm='brute')
    print(neigh.get_params())
    neigh.fit(item_profiles, item_profiles.id)
    print("Training completed!")

    print("Creating recommendations")
    target_users = pd.Series(target_users.user_id)
    for u, user in enumerate(target_users):
        # Series.reshape no longer exists in pandas; reshape the underlying
        # array instead. X can be an array of points.
        # NOTE(review): X holds a single item_id while the model was fitted
        # on full item-profile rows -- confirm the intended query vector.
        X = listed_interactions.loc[user][:1].item_id.values.reshape(1, -1)
        print(X)
        print("User {} gets these recs: {}".format(
            user, neigh.kneighbors(X=X, n_neighbors=5, return_distance=False)))
        # Debug limit: stop after the first three users (skips "Done!").
        if u > 1:
            return
    print("Done!")
Exemple #12
0
class KNN(ClassicalModel):
    """ClassicalModel wrapper around sklearn's KNeighborsClassifier."""

    def __init__(self,
                 input_size,
                 output_size,
                 labels,
                 class_weights=None,
                 **kwargs):
        """Initialise the base model and build the KNN estimator.

        Extra keyword arguments are forwarded to KNeighborsClassifier; the
        display name embeds the estimator's resulting parameters.
        """
        super().__init__(input_size, output_size, labels, class_weights)
        estimator = KNeighborsClassifier(**kwargs)
        self.model = estimator
        self.name = "KNN:\n{}".format(estimator.get_params())
def knnClassifier(X_train, X_test, y_train, y_test):
    """Fit a default KNN, print its test F1 score and its parameters."""
    print("knn")
    classifier = KNeighborsClassifier()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    print(f1_score(y_test, predictions))

    # Report the hyper-parameters of the KNN model that was just fitted.
    print('Parameters currently in use:\n')
    print(classifier.get_params())
Exemple #14
0
def knn():
    """Return an unfitted grid search over KNN hyper-parameters.

    The search covers ``n_neighbors`` in (3, 5, 7, 10) and both weighting
    schemes, scored by accuracy with 5-fold cross-validation.

    :return: a ``GridSearchCV`` wrapping a ``KNeighborsClassifier``.
    """
    # Imported locally so the block does not depend on a file-level import
    # that is not visible from here.
    from sklearn.model_selection import GridSearchCV

    estimator = KNeighborsClassifier()

    print(estimator.get_params().keys())

    param_grid = {
        "n_neighbors": [3, 5, 7, 10],
        "weights": ["distance", "uniform"]
    }

    # The function used to return an undefined name because this line was
    # commented out.
    clf = GridSearchCV(estimator, param_grid=param_grid, scoring='accuracy', cv=5)

    return clf
def flastClassification(trainData, trainLabels, testData, sigma, k, params):
    """Fit a KNN and classify test points by a thresholded positive-vote ratio.

    For each test point the k nearest training neighbours vote: ``phi``
    accumulates the weight of neighbours labelled 1, ``psi`` the weight of
    all others. The weight is ``1 / distance`` when the classifier's
    ``weights`` is "distance" (infinite for a zero distance), otherwise 1.
    The point is predicted 1 when ``phi / (phi + psi) >= sigma``.

    :param params: dict providing "algorithm", "metric" and "weights".
    :return: ``(trainTime, testTime, predictLabels)`` with times in seconds.
    """
    # training
    t0 = time.perf_counter()
    kNN = KNeighborsClassifier(
        algorithm=params["algorithm"],
        metric=params["metric"],
        weights=params["weights"],
        n_neighbors=k,
        n_jobs=1
    )
    kNN.fit(trainData, trainLabels)
    trainTime = time.perf_counter() - t0

    # testing
    t0 = time.perf_counter()
    infinity = float("Inf")
    # Loop-invariant: look the weighting scheme up once, not per neighbour.
    distanceWeighted = kNN.get_params()["weights"] == "distance"
    predictLabels = []
    neighborDist, neighborInd = kNN.kneighbors(testData)
    for distances, indices in zip(neighborDist, neighborInd):
        phi, psi = 0, 0
        for distance, neighbor in zip(distances, indices):
            if distanceWeighted:
                dInv = (1 / distance) if distance != 0 else infinity
            else:
                dInv = 1
            if trainLabels[neighbor] == 1:
                phi += dInv
            else:
                psi += dInv

        # Limit cases: an exact match with a non-positive neighbour (psi
        # infinite) wins over everything; then an exact positive match; an
        # empty vote defaults to 0.
        if psi == infinity:
            prediction = 0
        elif phi == infinity:
            prediction = 1
        elif (phi + psi) == 0:
            prediction = 0
        else:
            prediction = 1 if phi / (phi + psi) >= sigma else 0
        predictLabels.append(prediction)

    testTime = time.perf_counter() - t0

    return trainTime, testTime, predictLabels
Exemple #16
0
class myKnn:
    """Small timing wrapper around a KNeighborsClassifier.

    Call ``setK`` before ``train``; call ``train`` before ``test``.
    """

    def __init__(self, train_data, train_label, test_data, test_label):
        """Store the datasets; no classifier exists until ``setK`` is called."""
        self.train_data, self.train_label = train_data, train_label
        self.test_data, self.test_label = test_data, test_label
        self.predict_label = None
        self.train_time = 0
        self.test_time = 0
        self.clf = None

    def setK(self, k = 1):
        """Create a fresh, unfitted classifier with ``k`` neighbours."""
        self.clf = KNeighborsClassifier(n_neighbors = k)

    def train(self):
        """Fit on the training data; return the elapsed seconds."""
        print("Start train")
        started = time.time()
        self.clf.fit(self.train_data, self.train_label)
        elapsed = time.time() - started
        print("End train", elapsed)
        self.train_time = elapsed
        return self.train_time

    def test(self):
        """Predict the test data; return ``(test_label, elapsed seconds)``.

        The predictions themselves are stored in ``self.predict_label``.
        """
        print("Start test")
        started = time.time()
        self.predict_label = self.clf.predict(self.test_data)
        elapsed = time.time() - started
        print("End test", elapsed)
        self.test_time = elapsed
        return self.test_label, self.test_time

    def getTestLabel(self):
        """Return the true test labels."""
        return self.test_label

    def getPredictLabel(self):
        """Return the last predictions (``None`` before ``test``)."""
        return self.predict_label

    def getTrainTime(self):
        """Return the duration of the last ``train`` call."""
        return self.train_time

    def getTestTime(self):
        """Return the duration of the last ``test`` call."""
        return self.test_time

    def getParams(self):
        """Return the classifier's parameter dict."""
        return self.clf.get_params()
Exemple #17
0
def knn_classify(df,
                 dep_var,
                 n_neighbors,
                 metric,
                 predictors=None,
                 threshold=.5,
                 temporal=False,
                 start_col=None,
                 end_col=None,
                 start_date=None,
                 end_date=None):
    '''
    Create a nearest neighbor model using sklearn.

    Requires a pandas dataframe, the number of neighbors, the distance
    metric and the dependent variable. If no predictors are given, every
    column except the dependent variable is used.

    Training and testing data come either from ``temporal_split`` (when
    ``temporal`` is true) or from sklearn's default ``train_test_split``.

    Returns a tuple ``(y_test, y_predict, y_scores, params)``: the test
    targets, the 0/1 predictions obtained by comparing the second
    probability column with ``threshold``, the raw ``predict_proba``
    output and the model parameters.
    '''

    if temporal:
        x, x_train, x_test, y_train, y_test = temporal_split(
            df, start_col, end_col, start_date, end_date, dep_var, predictors)

    else:
        # Imported locally so the block does not depend on a file-level
        # import that is not visible from here.
        from sklearn.model_selection import train_test_split

        y = df[dep_var]
        x = df[predictors] if predictors else df.drop(dep_var, axis=1)
        # This branch used to leave the train/test variables undefined,
        # so the fit below raised NameError.
        x_train, x_test, y_train, y_test = train_test_split(x, y)

    knn = KNeighborsClassifier(n_neighbors, metric=metric)
    knn.fit(x_train, y_train)
    y_scores = knn.predict_proba(x_test)
    y_predict = [1 if probs[1] > threshold else 0 for probs in y_scores]
    params = knn.get_params()

    return (y_test, y_predict, y_scores, params)
Exemple #18
0
def knn(X, y, n_neighbors=None):
    """Fit a KNN classifier on a random split and collect evaluation metrics.

    :param X: feature matrix.
    :param y: binary targets (the ROC, precision and recall calls assume a
        positive class in the second probability column).
    :param n_neighbors: number of neighbours; ``None`` uses sklearn's default.
    :return: ``(metrics_str, results)`` -- a one-line summary and a dict with
        the test data, predictions, probabilities, ROC points, log-loss, AUC
        and the model parameters.
    """
    # Split data into training set and testing set
    # By default, 75% of the data set is used to for training and
    # 25% of the data is used to test the model
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Instantiate K-Nearest Neighbors. Passing n_neighbors=None through would
    # give a model that cannot predict, so fall back to sklearn's default.
    if n_neighbors is None:
        modl = KNeighborsClassifier()
    else:
        modl = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Fit data
    modl.fit(X_train, y_train)

    soft_yes = modl.predict_proba(X_test)
    hard_yes = modl.predict(X_test)

    # logloss and others requires the probabilities that Yes or 1 is predicted
    logl = metrics.log_loss(y_test, soft_yes)
    fpr, tpr, _ = metrics.roc_curve(y_test, soft_yes[:, 1])
    auc = metrics.roc_auc_score(y_test, soft_yes[:, 1])

    # Precision and accuracy requires y-predictions as (0, 1)
    accuracy = metrics.accuracy_score(y_test, hard_yes)
    precision = metrics.precision_score(y_test, hard_yes)
    recall = metrics.recall_score(y_test, hard_yes)

    metrics_str = f'KNN:  Accuracy: {accuracy:.4f}.  Precision: {precision:.4f}.  Recall: {recall:.4f}.  Log-loss: {logl:.4f}.  AUC: {auc:.4f}'

    return metrics_str, {
        'Model': 'K-Nearest Neighbors',
        'X_test': X_test,  # For plotting at the end
        'y_test': y_test,  # For plotting at the end
        'hard_predictions': hard_yes,
        'prediction probs': soft_yes,
        'false pos rate': fpr,
        'true pos rate': tpr,
        'logloss': logl,
        'area under curve': auc,
        # Parameters of the fitted estimator
        'parameters': modl.get_params()
    }
Exemple #19
0
# Script: fit a default KNN classifier, time the fit, and plot learning and
# validation curves.
# Columns 0..63 of `df` are taken as the features and column 64 as the target.
dat = df[:, 0:64]
tar1 = df[:, 64]
X = dat
y = tar1

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=20)
clf = KNeighborsClassifier()
# Wall-clock time of the fit only.
s1 = time.time()
clf.fit(X_train, y_train)
e1 = time.time()
t1 = e1 - s1
y_test = np.array(y_test)
print(clf.get_params())
print("Training time: ", t1)

# NOTE(review): (0, 1.01) is presumably the y-axis range of the plot --
# confirm against plot_learning_curve's signature.
plot_learning_curve(clf,
                    'Learning Curve for K-nn',
                    X_train,
                    y_train, (0, 1.01),
                    cv=5)
# Disabled leftover plotting code:
# plot_tree(clf.fit(X_train, y_train),filled=True)
# plt.show()
# A fresh, unfitted estimator is handed to the validation-curve helper.
clf1 = KNeighborsClassifier()
plot_validation_curve(X_train, y_train, clf1, 'k1')

clf3 = KNeighborsClassifier()
clf3 = GridSearchCV(estimator=clf3,
                    param_grid={
Exemple #20
0
    def _classification():

        accuracy_mean_list, accuracy_min_list, accuracy_max_list = [], [], []
        accuracy_sd_list, accuracy_se_list = [], []

        f1_mean_list, f1_min_list, f1_max_list = [], [], []
        f1_sd_list, f1_se_list = [], []

        precision_mean_list, precision_min_list, precision_max_list = [], [], []
        precision_mean_list, precision_min_list, precision_max_list = [], [], []
        
        recall_sd_list, recall_se_list = [], []
        recall_sd_list, recall_se_list = [], []

        best_params_list = []
        kfold_accuracy_for_df = []
        kfold_f1_macro_for_df, kfold_f1_for_df, kfold_precision_for_df, kfold_recall_for_df = [], [], [], []
        params_for_df = []

        kfold_f1_for_csv, kfold_precision_for_csv, kfold_recall_for_csv  = [], [], []

        feature_columns = data_dict.keys()
        scaler = MinMaxScaler()
        
        for feature_column in feature_columns:
            X = data_dict[feature_column]
            y = features_ml['Status'].values
            kfold_accuracy_list = []
            kfold_f1_macro_list, kfold_f1_list, kfold_precision_list, kfold_recall_list = [], [], [], []
           

            params_list = []
            # kfold_accuracy_for_df
            kfold_f1_macro_for_df, kfold_f1_for_df, kfold_precision_for_df, kfold_recall_for_df = [], [], [], []
            # params_for_df = []

            
            print ('\n')
            print (path_csv)
            print ('K-fold : ', kfold)
            print ('Features : ', feature_column)
            print ('\n')
            
            if kfold is 'Stratified':
                k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) # Stratified
                k_fold_split = k_fold.split(X, y)
                
                
            elif kfold is 'LeaveOneSubjectOut':
                subjects = features_ml['Subjects'].values
                k_fold = LeaveOneGroupOut()       
                k_fold_split = k_fold.split(X, y, subjects)
            
            for train_index, test_index in k_fold_split: 
        #         print("TRAIN:", train_index, "TEST:", test_index)
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                X_train = scaler.fit_transform(X_train)
                # X_test = scaler.fit_transform(X_test)
                X_test = scaler.transform(X_test)

     
                # GridSearch CV
                KNN = KNeighborsClassifier()
                print (KNN)

                paramaters = {
                             'n_neighbors': np.arange(10, 50),
                             'weights' : ['uniform', 'distance'],
                             'metric' : ['euclidean', 'manhattan', 'minskowski']
                             }

                gridsearch = GridSearchCV(KNN,
                                          paramaters,
                                          cv=k_fold,
                                          verbose=1,
                                          n_jobs=-1)
                             

                groups = None if kfold is 'Stratified' else subjects[train_index]
                # if kfold is 'Stratified':
                  # groups = None

                # elif kfold is 'LeaveOneSubjectOut':
                  # groups = subjects[train_index]


                gridsearch.fit(X_train, y_train, groups=groups)    
                optimal_params = gridsearch.best_params_

                 # Support vector machine
                knn = KNeighborsClassifier(**optimal_params) # class_weight = 'balanced' = {Favored :3 and Non_favored :1) approximately.
                print (knn.get_params())
                knn.fit(X_train, y_train)
                knn_accuracy = knn.score(X_test, y_test) 
                y_pred = knn.predict(X_test)
                f1_ = f1_score(y_test, y_pred, average=None)
                f1_macro = f1_score(y_test, y_pred, average='macro')
                precision_ = precision_score(y_test, y_pred, average=None)
                recall_ = recall_score(y_test, y_pred, average=None)

            
        #         print (confusion_matrix(y_test_stratified, y_pred))
        #         print (classification_report(y_test_stratified,y_pred))

#                 print ('Kernel: %s' % (gridsearch.best_params_['kernel']))
#                 print ("Accuracy: %0.2f %s\n" % (svc_accuracy.mean() * 100, '%'))
#                 print ("---------------------------------------------------------")
                # print (f1_[1])
                kfold_accuracy_list.append(knn_accuracy) # 10 models
                kfold_f1_list.append(f1_)
                kfold_f1_macro_list.append(f1_macro)
                kfold_precision_list.append(precision_)
                kfold_recall_list.append(recall_)
                params_list.append(optimal_params) # 10 paramete sets

            index_best_accuracy = kfold_accuracy_list.index(max(kfold_accuracy_list))
            index_worst_accuracy = kfold_accuracy_list.index(min(kfold_accuracy_list))

            index_best_f1 = kfold_f1_macro_list.index(max(kfold_f1_macro_list))
            index_worst_f1 = kfold_f1_macro_list.index(min(kfold_f1_macro_list))

            accuracy_mean_list.append(np.round(np.mean(kfold_accuracy_list)*100, decimals=2))
            accuracy_sd_list.append(np.round(np.std(kfold_accuracy_list),decimals=2))
            accuracy_se_list.append(np.round(np.std(kfold_accuracy_list)/np.sqrt(len(kfold_accuracy_list)),decimals=2))
            accuracy_min_list.append(np.round(kfold_accuracy_list[index_worst_accuracy]*100, decimals=2))
            accuracy_max_list.append(np.round(kfold_accuracy_list[index_best_accuracy]*100, decimals=2))

            f1_mean_list.append(np.round(np.mean(kfold_f1_macro_list)*100, decimals=2))
            f1_sd_list.append(np.round(np.std(kfold_f1_macro_list),decimals=2))
            f1_se_list.append(np.round(np.std(kfold_f1_macro_list)/np.sqrt(len(kfold_f1_list)),decimals=2))
            f1_min_list.append(np.round(kfold_f1_macro_list[index_worst_f1]*100, decimals=2))
            f1_max_list.append(np.round(kfold_f1_macro_list[index_best_f1]*100, decimals=2))

            best_params_list.append(params_list[index_best_accuracy])
            params_for_df.append([params_list])
            kfold_accuracy_for_df.append(kfold_accuracy_list)
            # kfold_f1_for_df.append([kfold_f1_list])
            # kfold_precision_for_df.append([kfold_precision_list])
            # kfold_recall_for_df.append([kfold_recall_list])
            
            for i in range(len(kfold_f1_list)):
              kfold_f1_for_df.append(list(kfold_f1_list[i]))
              kfold_precision_for_df.append([list(kfold_precision_list[i])])
              kfold_recall_for_df.append([list(kfold_recall_list[i])])

            print (f'{len(kfold_f1_for_df)}')
          
            kfold_f1_for_csv.append([kfold_f1_for_df])
            kfold_precision_for_csv.append([kfold_precision_for_df])
            kfold_recall_for_csv.append([kfold_recall_for_df])
            print (f'{len(kfold_f1_for_df)}')
            # # accuracy_dict = {'accuracy': kfold_accuracy_for_df}
            # params_dict = {'params': params_for_df}
            # f1_dict = {'f1_score' : kfold_f1_for_df}
            # precision_dict = {'precision' : kfold_precision_for_df}
            # recall_dict = {'recall' : kfold_recall_for_df}

            
            print ('\n')
            print ('Average of accuracy : %.2f (+/- %.2f)' % (np.mean(kfold_accuracy_list)*100, np.std(kfold_accuracy_list)))
            print ('Highest accuracy : %.2f \n' % (kfold_accuracy_list[index_best_accuracy]*100))
            print ('Average of F1-score : %.2f (+/- %.2f)' % (np.mean(kfold_f1_macro_list)*100, np.std(kfold_f1_macro_list)))  
            print ('Highest F1-score : %.2f \n' % (kfold_f1_macro_list[index_best_f1]*100))
            print ('Best parameters : %s' % (params_list[index_best_accuracy]))
            print ('\n-------------------------------------------------------------\n')

        df_columns_1 = [kfold + '_acc_mean', kfold + '_acc_sd', kfold + '_acc_se', kfold + '_acc_min', kfold + '_acc_max',
                        kfold + '_f1_mean', kfold + '_f1_sd', kfold + '_f1_se', kfold + '_f1_min', kfold + '_f1_max',
                        kfold + '_best_params', kfold + '_fold_accuracy', kfold + '_fold_best_params', kfold + '_fold_f1', kfold + '_fold_precision', kfold + '_fold_recall']

        print ('Writing dataframe ...')
      
        result_df_1 = pd.DataFrame(np.array([accuracy_mean_list,
                                             accuracy_sd_list,
                                             accuracy_se_list,
                                             accuracy_min_list,
                                             accuracy_max_list,
                                             f1_mean_list,
                                             f1_sd_list,
                                             f1_se_list,
                                             f1_min_list,
                                             f1_max_list,
                                             best_params_list,
                                             kfold_accuracy_for_df,
                                             params_for_df,
                                             kfold_f1_for_csv,
                                             kfold_precision_for_csv,
                                             kfold_recall_for_csv
                                             ], dtype=object).T,
                                            columns=df_columns_1,
                                   index=list(feature_columns)) #kfold_accuracy_for_df,params_for_df

        # result_df_2 = pd.DataFrame([np.array([
                                              # # params_for_df,
                                             # kfold_f1_for_df,
                                             # kfold_precision_for_df,
                                             # kfold_recall_for_df
        # ], dtype=object).T],
                                   # columns=df_columns_2,
                                   # index=list(feature_columns))

        return result_df_1
Exemple #21
0

# Inspect the iris data, fit a default KNN classifier on a 70/30 split and
# start a 2-D scatter plot of the first two feature columns.
# `iris`, `iris_X` and `iris_y` are defined earlier in the script.
print(iris_X[:2])  # show the first two samples
print(iris_y)
print(np.unique(iris.target))  # distinct target values (duplicates collapsed)

X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
# Split iris_X and iris_y into training and test sets; the test set takes
# 30% of the data. No random_state is passed, so the split differs per run.

print(X_train[:2])  # first two feature rows of the training set
print(X_test[:2])  # first two feature rows of the test set
print(y_train)  # training-set labels
print(y_test)  # test-set labels

knn = KNeighborsClassifier()
print(knn.get_params())  # show the (default) constructor parameters
knn.fit(X_train, y_train)  # train the model

print(knn.predict(X_test))  # predictions for the test set
print(y_test)  # ground-truth labels, printed for side-by-side comparison

# 2-D plot
# Axis limits: the range of feature columns 0 and 1, padded by 0.5 each side.
x_min, x_max = iris_X[:, 0].min() - .5, iris_X[:, 0].max() + .5
y_min, y_max = iris_X[:, 1].min() - .5, iris_X[:, 1].max() + .5
plt.figure(2, figsize=(10, 8))

# plt.clf()
# Plot all samples (not only the training points), coloured by class label.

plt.scatter(iris_X[:, 0], iris_X[:, 1], c=iris_y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
Exemple #22
0
# Python 2 script: predict 'cholhighyr' from 'ytqyogyr' and 'vitany' with KNN,
# print class counts, then export the two predictors (recoded) to a CSV file.
current_data = data
for col in ['cholhighyr','ytqyogyr','vitany']:
    current_data = current_data[current_data[col].isin([1,2])] # keep only rows whose value is 1 or 2

# set up training/test set (15% held out, fixed seed for reproducibility)
X = current_data[['ytqyogyr','vitany']]
Y = current_data[['cholhighyr']]
X_train, X_test,Y_train, Y_test = train_test_split(X,Y, test_size=.15, random_state=5)

# KNN model (author-reported accuracy ~0.63, recall values 0.28/0.73)
neigh_best = KNeighborsClassifier(n_neighbors=4)
neigh_best.fit(X_train, Y_train)
y_predicted = neigh_best.predict(X_test)
print classification_report(Y_test, y_predicted)
print neigh_best.get_params()


# Summary counts over the filtered data; the labels below treat the code 1
# as "no" and the code 2 as "yes".
print '\nALL DATA:'
print 'data entries:', len(current_data)
print 'did yoga in the past year - no', len(X[X['ytqyogyr']==1])
print 'did yoga in the past year - yes', len(X[X['ytqyogyr']==2])
print 'took vitamins/supplements in the past year - no', len(X[X['vitany']==1])
print 'took vitamins/supplements in the past year - yes', len(X[X['vitany']==2])
print 'high cholesterol - no:', len(Y[Y['cholhighyr']==1])
print 'high cholesterol - yes:', len(Y[Y['cholhighyr']==2])

# NOTE: X_new is another name for X, not a copy, so the recoding below also
# changes X. Subtracting 1 maps the codes 1/2 to 0/1 before export.
X_new = X
X_new['ytqyogyr'] = X_new['ytqyogyr']-1
X_new['vitany'] = X_new['vitany']-1
X_new.to_csv('vizdata.csv', index=False,header = ['yoga','vitamins'])
# Python 2 script: train and score a 5-neighbour KNN classifier on the stacked
# elliptical and spiral samples, timing training plus scoring together.
trainingSet = np.vstack((trainingSetEllipticals, trainingSetSpirals))  # using only elliptical and spiral samples
# Shuffles rows in place; no seed is set in the visible code, so the row order
# differs between runs.
np.random.shuffle(trainingSet)
trainingSetLabels = trainingSet[:,12]  # labels live in column 12 (this is a view of trainingSet, not a copy)

trainingSetLabels[trainingSetLabels == 0] = -1 # relabel class 0 as -1 (the author's stated reason: "to match sklearn format")

trainingSet = trainingSet[:, 1:11] # keep columns 1..10 as model inputs; the label column is dropped

# test_size = 0.6 holds out 60% of the rows for testing; random_state is fixed
# so the split itself is reproducible.
trainingSet, testingSet, trainingSetLabels, testingSetLabels = train_test_split(trainingSet, trainingSetLabels, test_size = 0.6, random_state = 0) # fixed random_state so results are reproducible

startTime = time.time()
print "Time before training = ", startTime

clf = KNeighborsClassifier(n_neighbors = 5) # starting off with 5 neighbors for now
clf = clf.fit(trainingSet, trainingSetLabels)

print "Params after training:"
print clf.get_params()

trainingAccuracy = clf.score(trainingSet, trainingSetLabels)

print "Training accuracy = ", trainingAccuracy

testingAccuracy = clf.score(testingSet, testingSetLabels)

print "Testing accuracy = ", testingAccuracy

# The elapsed time covers fitting and both score() calls.
print "Done training and testing! Time = ", time.time() - startTime, "seconds"

Exemple #24
0
def test_KNN(fn):
    """
    Function which will tune and test a K-Nearest Neighbors model. It will plot
    a confusion matrix and write a performance report to file.

    The model is trained for K = 5, 10, ..., 95 and the K with the highest
    test-set accuracy is kept. Inputs are read from
    ``input/<fn>_{train,test}_{X,y}.csv`` (``;``-separated); the confusion
    matrix figure is saved under ``figures/`` and the text report under
    ``reports/``.

    Arguments:
        - fn        :       Name of the input file.
    """
    #Load datasets
    X_train_df = pd.read_csv("input/{}_train_X.csv".format(fn), sep=";")
    y_train_df = pd.read_csv("input/{}_train_y.csv".format(fn), sep=";")
    X_test_df = pd.read_csv("input/{}_test_X.csv".format(fn), sep=";")
    y_test_df = pd.read_csv("input/{}_test_y.csv".format(fn), sep=";")

    #Convert to numpy arrays
    X_train = X_train_df.values.astype(float)
    y_train = y_train_df.values
    X_test = X_test_df.values.astype(float)
    y_test = y_test_df.values

    #Scale X values. The scaler is fitted on the training data ONLY and the
    #same transformation is applied to the test data; re-fitting on the test
    #set would scale the two sets differently and leak test statistics.
    scaler = RobustScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    #Transform non-numerical values into numericals. One encoder, fitted once
    #on every label that occurs, so train and test share the same mapping and
    #inverse_transform() below decodes the model's output correctly.
    encoder = LabelEncoder()
    encoder.fit(np.concatenate((y_train.ravel(), y_test.ravel())))
    encoded_y_train = encoder.transform(y_train.ravel())
    encoded_y_test = encoder.transform(y_test.ravel())

    #Number of neighbors (K) to test
    nr_of_neighbors = range(5, 100, 5)

    #Variables to store the best values. best_acc starts below any possible
    #accuracy so that a fitted model is always selected.
    best_model = None
    best_acc = -1.0
    time_taken = 0

    #Test different values for K
    for K in nr_of_neighbors:
        knn = KNeighborsClassifier(n_neighbors=K)

        #Train the model (only fit() is timed)
        start = time.time()
        knn.fit(X_train, encoded_y_train)
        end = time.time()

        #Accuracy of the predicted values on the test set
        acc = accuracy_score(encoded_y_test, knn.predict(X_test))

        print("\nK: {}".format(knn.get_params()['n_neighbors']))
        print("Acc: {}".format(acc))

        #Save the model if it is the best one so far
        if acc > best_acc:
            time_taken = end - start
            best_model = knn
            best_acc = acc

    #Predict using the best model (decoded back to the original labels)
    y_pred = encoder.inverse_transform(best_model.predict(X_test))
    K = best_model.get_params()['n_neighbors']

    #Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(cm)
    print("\n")
    print(report)
    print("Scores for final, best model:\n")
    print("\nK: {}".format(K))
    print("Acc: {}".format(accuracy_score(y_test, y_pred)))

    #Find labels (taken from the first column of the test targets)
    labels = sorted(y_test_df.iloc[:, 0].unique())

    #Plot confusion matrix
    plot_confusion_matrix(cm, labels, False)

    #Save the plot; the timestamp keeps file names unique across runs
    plt.savefig("figures/KNN_confusion_matrix_{}.svg".format(int(time.time())))
    #plt.show()

    #Write a .txt report file (closed automatically by the with-block)
    with open("reports/KNN_{}_report.txt".format(fn), "w") as f:
        f.write("REPORT FOR \"{}\"\n\n".format(fn))
        f.write("Best value for K: {}".format(K))

        f.write("\n\n\nClassification Report:\n")
        f.write(report)

        f.write("\nConfusion Matrix:\n\n")
        f.write(np.array2string(cm, separator=', '))

        f.write(
            "\n\nTime used to train the model: {} seconds".format(time_taken))

        f.write("\n\nScores for final, best model:\n")
        f.write("Accuracy: {}".format(best_acc))
                                                len(Trees_RF))] = Trees_RF
                        accuracy_paras[start_idx:(start_idx +
                                                  len(Trees_RF))] = Accuracy_RF
                        values_optimal[3] = Trees_RF_optimal
                        accuracy_optimal[3] = Accuracy_RF.max()

                else:  ##### USE DEFAULT PARAS --> NO OPTIMAL #####
                    clf = classifiers[idx_method]
                    clf.fit(X_train, Y_train)
                    if (method[idx_method] == 'SVM'):
                        bK = clf.intercept_[0]
                        XK = clf.support_vectors_
                        YK = Y_train[clf.support_]
                        BetaK = clf.dual_coef_ / YK
                        BetaK = BetaK[0]
                        GammaK = clf.get_params(deep=False)['gamma']

                if (g < G_Points.shape[0]):

                    if (np.mod(g + 1, 50) == 0 or g == 0):
                        print('Generating Point: ' + str(g))

                    break_flag = 1  # flag for solving boundary function (1 -- solve fail --> 0 solve successful)

                    # Find active points
                    while break_flag:
                        # ---------- RANDOM POINT ----------
                        if (Shape == '2D_Circle'):
                            initial_guess = RS.Random_2D_Circle(
                                [np.sqrt(0.1 * R),
                                 np.sqrt(1.9 * R)], [0, 2 * math.pi],
Exemple #26
0
def classifyPHC():
    # Python 2 function: split the data 80/20, scale to [-1, 1], reduce with
    # KernelPCA followed by LDA, then train and evaluate a 3-neighbour KNN
    # classifier, printing accuracies, the confusion matrix and 5-fold CV scores.
    data = readFile()
    #data = equalizeClasses(data)
    features, labels = splitData(data)

    #fraction of the data held out for validation (1 = 100%)
    validation_size = 0.2

    #split the data into training and validation parts according to validation_size
    features_train, features_validation, labels_train, labels_validation = model_selection.train_test_split(
        features, labels, test_size=validation_size)

    #normalize data in the range [-1,1]
    scaler = MinMaxScaler(feature_range=(-1, 1))
    #fit on the training data only, so the validation data does not influence the scaling
    scaler.fit(features_train)

    features_train_scalar = scaler.transform(features_train)

    #transform the validation features with the already-fitted scaler (no re-fit)
    features_validation_scalar = scaler.transform(features_validation)

    #kernel PCA down to 6 components; fit_inverse_transform=True enables inverse_transform below
    pca = KernelPCA(n_components=6, kernel='rbf', fit_inverse_transform=True)

    #fit only on the training features
    pca.fit(features_train_scalar)

    #dimensionality reduction of the training features
    features_train_pca = pca.transform(features_train_scalar)

    #dimensionality reduction of the validation features
    features_validation_pca = pca.transform(features_validation_scalar)

    #map the reduced training data back to the scaled feature space
    reconstruct_data = pca.inverse_transform(features_train_pca)

    #NOTE(review): error_matrix is not defined anywhere in the visible code and
    #reconstruct_data is otherwise unused -- presumably a line computing the
    #per-element reconstruction error was lost; confirm against the original.
    error_percentage = (
        sum(sum(error_matrix)) /
        (len(features_train_scalar) * len(features_train_scalar[0]))) * 100

    #len(features_train_scalar) = len(reconstruct_data) = 89
    #len(features_train_scalar[0]) = len(reconstruct_data[0]) = 13

    #len(error_matrix) = 89, which means for all the samples
    #len(error_matrix[0]) = 13, for every feature of every sample
    #the inner sum yields an array holding the summed error of every feature,
    #so we sum again and divide by the 89 samples * 13 features
    print 'Information loss of KernelPCA:', error_percentage, '% \n'

    lda = LinearDiscriminantAnalysis()

    lda.fit(features_train_pca, labels_train)

    #the *_pca names are reused for the LDA-projected features from here on
    features_train_pca = lda.transform(features_train_pca)

    features_validation_pca = lda.transform(features_validation_pca)

    #print the array shapes just to check
    print 'feature training array: ', features_train.shape, 'and label training array: ', labels_train.shape
    print 'feature testing array: ', features_validation.shape, 'and label testing array: ', labels_validation.shape, '\n'

    #take the best combination of parameters from the grid-search procedure
    #paramTuning(features_train, labels_train, 5)

    #we initialize our model
    #NOTE: despite its name, `svm` holds a KNeighborsClassifier; the SVC line is disabled
    #svm = SVC(kernel='poly',C=0.001,gamma=10,degree=3,decision_function_shape='ovr')
    svm = KNeighborsClassifier(n_neighbors=3)

    #train our model with the data that we previously processed
    svm.fit(features_train_pca, labels_train)

    #now test our model with the validation data
    predicted_labels = svm.predict(features_validation_pca)
    accuracy = accuracy_score(labels_validation, predicted_labels)
    print 'Classification accuracy: ', accuracy * 100, '\n'

    #see the accuracy on the training data
    predicted_labels_train = svm.predict(features_train_pca)
    accuracy_train = accuracy_score(labels_train, predicted_labels_train)
    print 'Training accuracy: ', accuracy_train * 100, '\n'

    #confusion matrix to illustrate the faulty classification of each class
    conf_matrix = confusion_matrix(labels_validation, predicted_labels)
    print 'Confusion matrix: \n', conf_matrix, '\n'
    print 'Support    class 0   class 1:'
    #calculate the support of each class (row sums of the confusion matrix)
    print '          ', conf_matrix[0][0] + conf_matrix[0][
        1], '     ', conf_matrix[1][0] + conf_matrix[1][1], '\n'

    #calculate the accuracy of each class
    #NOTE(review): on Python 2 these integer counts divide with floor division
    #unless `from __future__ import division` is in effect -- confirm.
    hC = (conf_matrix[0][0] / (conf_matrix[0][0] + conf_matrix[0][1])) * 100
    pC = (conf_matrix[1][1] / (conf_matrix[1][0] + conf_matrix[1][1])) * 100

    #see the inside details of the classification
    print 'For class 0 man cases:', conf_matrix[0][
        0], 'classified correctly and', conf_matrix[0][
            1], 'missclassified,', hC, 'accuracy \n'
    print 'For class 1 woman cases:', conf_matrix[1][
        1], 'classified correctly and', conf_matrix[1][
            0], 'missclassified,', pC, 'accuracy\n'

    #try 5-fold cross validation on the (LDA-projected) training data
    scores = cross_val_score(svm, features_train_pca, labels_train, cv=5)
    print 'cross validation scores for 5-fold', scores, '\n'
    print 'parameters of the model: \n', svm.get_params(), '\n'

    #print 'number of samples used as support vectors',len(svm.support_vectors_),'\n'

    #return svm.support_vectors_
    '''#plot the training features before the kpca and the lda procedure
Exemple #27
0
# Confusion-matrix heatmap for `predictions_dtc`, which is produced earlier in
# the script (presumably by a decision-tree classifier -- confirm).
plt.figure(figsize=(7,7))
sns.heatmap(confusion_matrix(y_test, predictions_dtc),
            annot=True,
            cmap="Blues",
            square=True,
            xticklabels=['No Disease', 'Disease'],
            yticklabels=['No Disease', 'Disease'])
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Actual", fontsize=15)
plt.show()

from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with 10 neighbours.
clf_knn = KNeighborsClassifier(n_neighbors=10)
# Bare expression: the result is discarded when run as a plain script
# (presumably this was a notebook cell, where it would be displayed).
clf_knn.get_params()

clf_knn.fit(X_train, y_train)

predictions_knn = clf_knn.predict(X_test)
# Bare expression again: the accuracy is computed but neither stored nor printed.
accuracy_score(y_test, predictions_knn)

# Same heatmap layout as above, now for the KNN predictions.
plt.figure(figsize=(7,7))
sns.heatmap(confusion_matrix(y_test, predictions_knn),
            annot=True,
            cmap="Blues",
            square=True,
            xticklabels=['No Disease', 'Disease'],
            yticklabels=['No Disease', 'Disease'])
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Actual", fontsize=15)
Exemple #28
0
class KNeighbors(Classifier):
    r"""Wrapper exposing a k-nearest-neighbors classifier as a Classifier.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Reference:
        “Neighbourhood Components Analysis”, J. Goldberger, S. Roweis, G. Hinton, R. Salakhutdinov, Advances in Neural Information Processing Systems, Vol. 17, May 2005, pp. 513-520.

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'K Neighbors Classifier'

    def __init__(self, **kwargs):
        r"""Set up warning filters, the tunable-parameter table and the model.

        Keyword arguments are accepted but not used by this constructor.
        """
        # Warning categories that are silenced process-wide on construction.
        silenced_categories = (
            ChangedBehaviorWarning,
            ConvergenceWarning,
            DataConversionWarning,
            DataDimensionalityWarning,
            EfficiencyWarning,
            FitFailedWarning,
            NonBLASDotWarning,
            UndefinedMetricWarning,
        )
        for warning_category in silenced_categories:
            warnings.filterwarnings(action='ignore',
                                    category=warning_category)

        # Candidate values for the two parameters declared as tunable.
        weight_choices = ParameterDefinition(['uniform', 'distance'])
        algorithm_choices = ParameterDefinition(
            ['auto', 'ball_tree', 'kd_tree', 'brute'])
        self._params = {
            'weights': weight_choices,
            'algorithm': algorithm_choices,
        }
        self.__kn_classifier = KNC()

    def set_parameters(self, **kwargs):
        r"""Forward the given keyword arguments to the wrapped estimator.
        """
        self.__kn_classifier.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Train the wrapped estimator.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.
            y (pandas.core.series.Series): n classes of the samples in the x array.

        Returns:
            None
        """
        self.__kn_classifier.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Return the predicted class of every row of x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        predicted = self.__kn_classifier.predict(x)
        return predicted

    def to_string(self):
        r"""Build a human-readable description of this classifier.

        Returns:
            str: User friendly representation of the object.
        """
        # The base class supplies the template; this class fills in its
        # name and the current parameters of the wrapped estimator.
        template = Classifier.to_string(self)
        current_params = self.__kn_classifier.get_params()
        return template.format(
            name=self.Name,
            args=self._parameters_to_string(current_params))
Exemple #29
0
# Train a 1-nearest-neighbour classifier on the first `trainingSetLength`
# samples and persist both the fitted model and its parameters with pickle.
y = diagnostic[:trainingSetLength,
               1:]  # target values (i.e. expected output for X)

# Coerce every target entry to an integer, then turn the column(s) into rows
# so that y[0] is the 1-D label vector passed to fit().
for i in range(len(y)):
    y[i] = int(y[i])
y = np.transpose(y).astype('int')

trainingSet = extractedFeatures[:trainingSetLength]
neigh = KNeighborsClassifier(n_neighbors=1)

neigh.fit(trainingSet, y[0])
# letting the algorithm know which sample in X belongs to which class labelled in y

# save the params to disk
neigh_params = neigh.get_params()
params_neigh = 'params_neigh.sav'

# save the model to disk; the with-block guarantees the file is flushed and
# closed even if pickling fails
filename_neigh = 'neigh_model.sav'
with open(filename_neigh, 'wb') as model_file:
    pickle.dump(neigh, model_file)

#testSet=extractedFeatures[trainingSetLength:trainingSetLength+10]
#prediction=lda.predict(testSet)

with open(params_neigh, 'wb') as params_file:
    pickle.dump(neigh_params, params_file)
#%%TEST CLASSIFICATION - Logistic Regression
excelAddress = 'C:\\Users\\theor\\Downloads\\Ground_truth_ISIC_1.xlsx'
trainingSetLength = 100

diagnostic = preProcessing(excelAddress)
#                  'n_estimators': [190,200,210,240,250],
#                  'learning_rate': [0.1, 0.01, 0.001, 0.0001]
#                  }
#
# cross_validation = StratifiedKFold(Y_train, n_folds=5)
# grid_search = GridSearchCV(forest,
#                            param_grid=parameter_grid,
#                            cv=cross_validation)
#
# grid_search.fit(X_train, Y_train)
#
# print('Best score: {}'.format(grid_search.best_score_))
# print('Best parameters: {}'.format(grid_search.best_params_))

# Grid search over KNN hyper-parameters with 5-fold stratified CV.
# NOTE: the estimator is a KNeighborsClassifier even though the variable is
# still called `forest`.
forest = KNeighborsClassifier()
# Python 2 print statement: lists the parameter names that can be tuned.
print forest.get_params().keys()

# 7 x 4 x 5 = 140 parameter combinations are evaluated.
parameter_grid = {
                 'n_neighbors' : [2,3,4,5,6,7,8],
                 'algorithm': ['ball_tree', 'kd_tree', 'auto', 'brute'],
                 'leaf_size': [10, 20, 30, 40, 50]
                 }

# NOTE(review): passing the labels and `n_folds` to the constructor matches
# the legacy sklearn.cross_validation API -- confirm which import is in use.
cross_validation = StratifiedKFold(Y_train, n_folds=5)
grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=cross_validation)

grid_search.fit(X_train, Y_train)

print('Best score: {}'.format(grid_search.best_score_))
Exemple #31
0
class KNNCombinedClassifier:
    """Combine the neighbour votes of two KNN models fitted on two embeddings.

    Two ``KNeighborsClassifier`` instances are built with identical keyword
    arguments. ``predict_combined`` queries the nearest neighbours in both,
    turns the neighbour distances into per-class scores and merges the two
    score sets, weighting the second model by ``alpha``.

    The code runs on both Python 2 and Python 3 (``range`` is used instead of
    ``xrange``).
    """

    def __init__(self, **kwargs):
        """Create both underlying classifiers.

        Arguments:
            **kwargs: forwarded unchanged to both ``KNeighborsClassifier``
                constructors (``n_jobs`` is fixed to 1 and must not be given).
        """
        self._classifier1 = KNeighborsClassifier(n_jobs=1, **kwargs)
        self._classifier2 = KNeighborsClassifier(n_jobs=1, **kwargs)
        # Number of neighbours; also the width of every prediction row.
        self._nNN = self._classifier1.get_params()['n_neighbors']
        self._official_labels = None
        # Set by fit1()/fit2(); not read anywhere inside this class.
        self._fit1 = False
        self._fit2 = False

    def fit1(self, official_embeddings, official_labels):
        """Fit the first classifier and remember the labels.

        The labels are stored as an ndarray because ``get_assignment_probs``
        indexes them with arrays of neighbour indices.
        """
        self._classifier1.fit(official_embeddings, official_labels)
        self._official_labels = np.asarray(official_labels)
        self._fit1 = True

    def fit2(self, official_embeddings, official_labels):
        """Fit the second classifier and remember the labels.

        Note that ``fit1`` and ``fit2`` store the labels in the same
        attribute, so the labels of whichever call ran last are the ones used
        for prediction.
        """
        self._classifier2.fit(official_embeddings, official_labels)
        self._official_labels = np.asarray(official_labels)
        self._fit2 = True

    def get_recurring_indices(self, ind1, ind2):
        """Return, per row, the indices occurring more than once in ind1+ind2.

        Arguments:
            ind1, ind2: 2-D index arrays with the same number of rows.

        Returns:
            list: one list per row with the (sorted) indices that appear at
            least twice in the concatenation of the two rows.
        """
        recurring_inds = []
        for row in range(ind1.shape[0]):
            combined_indices = np.hstack((ind1[row], ind2[row]))
            # Count each distinct index once instead of re-scanning the
            # combined row for every unique value.
            unique_indices, counts = np.unique(combined_indices,
                                               return_counts=True)
            recurring_inds.append(list(unique_indices[counts > 1]))
        return recurring_inds

    def predict_combined(self,
                         test_embeddings1,
                         test_embeddings2,
                         alpha=0.2,
                         pbar=False):
        """Predict classes from both embeddings, processed in chunks.

        Arguments:
            test_embeddings1: query samples for the first classifier.
            test_embeddings2: query samples for the second classifier.
            alpha: weight applied to the second classifier's scores.
            pbar: if true, report progress through ``progress_bar``.

        Returns:
            list: ``[predicted_codes, prediction_weights,
            potential_full_codes]`` where the first two are arrays with one
            row per query and ``n_neighbors`` columns, and the third holds the
            recurring neighbour indices per query.
        """
        chunk_size = 2000
        ntest_docs = test_embeddings1.shape[0]

        predicted_codes = np.empty((ntest_docs, self._nNN), dtype=int)
        prediction_weights = np.empty((ntest_docs, self._nNN))
        potential_full_codes = []
        for start_index in range(0, ntest_docs, chunk_size):
            if pbar: progress_bar(start_index, ntest_docs)
            stop_index = min(start_index + chunk_size, ntest_docs)

            NN_dists1, NN_indices1 = self._classifier1.kneighbors(
                test_embeddings1[start_index:stop_index], return_distance=True)
            NN_dists2, NN_indices2 = self._classifier2.kneighbors(
                test_embeddings2[start_index:stop_index], return_distance=True)
            probs1, class1 = self.get_assignment_probs(NN_dists1, NN_indices1)
            probs2, class2 = self.get_assignment_probs(NN_dists2, NN_indices2)
            predicted_codes[start_index:stop_index], prediction_weights[
                start_index:stop_index] = self.predict_from_weights(
                    probs1, probs2, class1, class2, alpha)

            potential_full_codes += self.get_recurring_indices(
                NN_indices1, NN_indices2)

        return [predicted_codes, prediction_weights, potential_full_codes]

    def get_assignment_probs(self, dists, indices):
        """Score the classes of each query's neighbours by inverse distance.

        Arguments:
            dists: (ndocs, nNN) neighbour distances.
            indices: (ndocs, nNN) neighbour indices into the stored labels.

        Returns:
            tuple: ``(probs_total, classes)``, both (ndocs, nNN) float arrays.
            Each row lists the class scores in descending order and the
            matching class codes; unused trailing slots stay 0.
        """
        ndocs, nNN = indices.shape
        probs_total = np.zeros((ndocs, nNN))
        classes = np.zeros((ndocs, nNN))
        for i in range(ndocs):
            pred_codes = self._official_labels[indices[i]]
            unique_codes = np.unique(pred_codes)
            nunique_pred_codes = unique_codes.shape[0]
            probs = np.zeros(nunique_pred_codes)
            for j in range(nunique_pred_codes):
                # 1.0 (not 1) so integer distances never trigger integer
                # division on Python 2.
                probs[j] = np.sum(
                    1.0 / dists[i][pred_codes == unique_codes[j]])
            if np.any(np.isinf(probs)):
                # A zero distance is an exact match: give those classes
                # score 1 and every other class score 0.
                probs = np.where(probs == np.inf, 1.0, 0.0)
            sorted_probs = np.argsort(probs)[::-1]
            stop_index = min(nNN, nunique_pred_codes)
            probs_total[i, :stop_index] = probs[sorted_probs[:stop_index]]
            classes[i, :stop_index] = unique_codes[sorted_probs[:stop_index]]

        return probs_total, classes

    def predict_from_weights(self, probs1, probs2, class1, class2, alpha):
        """Merge the class scores of both models into ranked predictions.

        Arguments:
            probs1, probs2: (ndocs, nNN) class scores from each model.
            class1, class2: (ndocs, nNN) class codes matching the scores.
            alpha: multiplier applied to the second model's scores.

        Returns:
            list: ``[predictions, sorted_weights]``; ``predictions`` is an
            (ndocs, nNN) int array of class codes in descending score order
            and ``sorted_weights`` the matching scores, normalised so each
            row sums to 1.
        """
        ndocs, nNN = probs1.shape
        predictions = np.zeros((ndocs, nNN), dtype=int)
        sorted_weights = np.zeros((ndocs, nNN))
        for i in range(ndocs):
            unique_classes = np.unique(np.hstack((class1[i], class2[i])))
            nunique_classes = unique_classes.shape[0]
            combined_probs = np.zeros(nunique_classes)
            for j in range(nunique_classes):
                combined_probs[j] += np.sum(
                    probs1[i][class1[i] == unique_classes[j]])
                combined_probs[j] += alpha * np.sum(
                    probs2[i][class2[i] == unique_classes[j]])
            stop_index = min(nunique_classes, nNN)
            sorted_probs = np.argsort(combined_probs)[::-1]
            sorted_weights[i, :stop_index] = combined_probs[
                sorted_probs[:stop_index]]
            total_weight = np.sum(sorted_weights[i])
            # Skip normalisation for an all-zero row instead of filling it
            # with NaN.
            if total_weight != 0:
                sorted_weights[i] /= total_weight
            predictions[i, :stop_index] = unique_classes[
                sorted_probs[:stop_index]]

        return [predictions, sorted_weights]
# In[6]:


# Standardise the features. The scaler learns mean and variance from the
# training set only; the test set is transformed with those same statistics
# (re-fitting on the test set would scale it differently and leak test data).
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


# ### Make Model

# In[7]:


# 11-neighbour classifier using the Euclidean metric.
clf = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean')
clf.fit(X_train, y_train)
clf.get_params()


# ### Predict Test Set with that Model

# In[8]:


y_pred = clf.predict(X_test)
y_pred


# ### Check Accuracy

# In[9]:
Exemple #33
0
# Fit a default KNN classifier on a 70/30 split of `iris` (loaded earlier in
# the script), print its score and draw the confusion matrix as an image.
# print(iris)
iris_X = iris.data
print(len(iris_X))
iris_y = iris.target
print(iris_y)

# No random_state is passed, so the split (and the score) varies per run.
X_train, X_test, y_train, y_test = train_test_split(iris_X,
                                                    iris_y,
                                                    test_size=0.3)

print(y_train)
print(y_test)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
params = knn.get_params()
print(params)
score = knn.score(X_test, y_test)
print(score)

y_predict = knn.predict(X_test)
# Tick labels for the plot below; they are not passed to confusion_matrix().
labels = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
print(y_predict)
print(y_test)
mcm = confusion_matrix(y_test, y_predict)
# mcm=multilabel_confusion_matrix(y_test,y_predict,label=labels)
print(mcm)
plt.imshow(mcm, cmap=plt.cm.Blues)
# One tick per row/column of the confusion matrix.
indices = range(len(mcm))
plt.xticks(indices, labels)
plt.yticks(indices, labels)
Exemple #34
0
print_score(knn, X_train, y_train, X_test, y_test, train=False)


# # Grid Search

# In[32]:


from sklearn.model_selection import GridSearchCV


# In[33]:


knn.get_params()


# In[33]:


params = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}


# In[34]:


.


grid_search_cv = GridSearchCV(KNeighborsClassifier(),
Exemple #35
0
from sklearn.neighbors import KNeighborsClassifier

# Number of repeated train/score rounds; every loop and average below follows
# this single value.
n_runs = 10

tt = []  # fit duration of each run, in seconds
aa = []  # test-set accuracy of each run

for i in range(n_runs):
    t1 = time.time()

    # The timed interval covers construction and fitting, not scoring.
    clf_knn = KNeighborsClassifier()
    clf_knn.fit(x_train, y_train)
    t2 = time.time()
    accuracy_knn = clf_knn.score(x_test, y_test)

    t = t2 - t1
    tt.append(t)
    aa.append(accuracy_knn)

for k, (run_time, run_accuracy) in enumerate(zip(tt, aa)):
    print("the time of", k + 1, "th training is", run_time)
    print("The accuracy of ", k + 1, "th training is ", run_accuracy, "\n")

# Mean over all runs, independent of how many runs were made.
tt_ = sum(tt) / len(tt)
aa_ = sum(aa) / len(aa)
print("the time of training averagely is ", tt_, "\n",
      "the accuracy of training averagely is ", aa_)

# Parameters of the classifier from the last run.
paras_knn = clf_knn.get_params()
print(paras_knn)