Example No. 1
def knnSimulate(param):
    trainSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    knnFit = KNeighborsClassifier(n_neighbors=int(param['k']))
    knnFit.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    testSet = SimData.simulate2Group(
        n = int(param['n']),
        p = int(param['p']),
        effect = [param['effect']] * int(param['p'])
    )
    out = OrderedDict()
    out['p'] = int(param['p'])
    out['k'] = int(param['k'])
    out['train'] = trainSet
    out['test'] = testSet
    out['resubPreds'] = knnFit.predict(trainSet['x'])
    out['resubProbs'] = knnFit.predict_proba(trainSet['x'])
    out['testPreds'] = knnFit.predict(testSet['x'])
    out['testProbs'] = knnFit.predict_proba(testSet['x'])
    out['resubTable'] = pd.crosstab(
        Series(out['resubPreds'], index=trainSet['y'].index),
        trainSet['y']
    )
    out['resubAccuracy'] = (np.sum(np.diag(out['resubTable'])) /
                            (1.0 * np.sum(np.sum(out['resubTable']))))
    out['testTable'] = pd.crosstab(
        Series(out['testPreds'], index=testSet['y'].index),
        testSet['y']
    )
    out['testAccuracy'] = (np.sum(np.diag(out['testTable'])) /
                           (1.0 * np.sum(np.sum(out['testTable']))))
    return out
Example No. 2
def main(output=RESULTS1B):
    """
    Using 1 nearest neighbor, predicts NYC Taxi trip times based on feature 
    vectors (pickup latitude, pickup longitude, dropoff latitude, dropoff latitude). 

    Tests on a subset of trip_data_1.csv

    Uses sklearn to implement nearest neighbors
    """
    features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 
               'dropoff_longitude', 'trip_time_in_secs']

    ## Extract necessary data into pandas dataframes
    numrows = 100000
    df_train_read = pd.read_csv(TRAIN_DATA)
    df_test_read = pd.read_csv(TRIP_DATA_1, nrows = numrows)    # first 100k rows, for speed
    df_test = df_test_read[features].dropna()
    df_train = df_train_read[features].dropna() 


    ## Use sklearn to run nearest neighbors
    k = 1 
    clf = KNeighborsClassifier(n_neighbors=k)                   # default distance metric: euclidean
    clf.fit(df_train[features[0:4]], df_train[features[-1]])
    preds = clf.predict(df_test[features[0:4]])

    # # Calculate statistics (Root Mean Squared Error, Correlation Coefficient, Mean Absolute Error)
    print "Calculating statistics"
    with open(output, "a+") as outputFile:
        outputFile.write("Ran knn with k={}".format(k) + \
            " Trained on {}. Tested on first".format(TRAIN_DATA) + \
            " {} rows of {}. Stats:".format(numrows, TRIP_DATA_1))
    calcAndLogStats( numpy.array(preds), 
                     numpy.array(df_test[features[-1]]), 
                     output=output)
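calcAndLogStats is called above but not defined in this snippet; a minimal sketch of what it might compute, per the comment above (root mean squared error, correlation coefficient, mean absolute error), appending to the same log file. The body is a hypothetical reconstruction, not the original helper.

import numpy

def calcAndLogStats(preds, actual, output):
    # Hypothetical reconstruction of the helper named above:
    # compute RMSE, correlation coefficient, and MAE, then append to the log.
    rmse = numpy.sqrt(numpy.mean((preds - actual) ** 2))
    corr = numpy.corrcoef(preds, actual)[0, 1]
    mae = numpy.mean(numpy.abs(preds - actual))
    with open(output, "a+") as outputFile:
        outputFile.write(" RMSE={:.2f}, r={:.3f}, MAE={:.2f}\n".format(rmse, corr, mae))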
Example No. 3
def train():
    data = Prediction.objects.filter(predict=False)
    df = pd.DataFrame(list(data.values()))
    users = [o.user for o in data]
    df['age'] = pd.DataFrame(list([user.userprofile.age for user in users]))
    for x in ['id', 'base_personal', 'base_general', 'predict', 'created', 'user_id', 'training_id']:
        df = df.drop(x, axis=1)

    y = df['next_level']
    df = df.drop('next_level', axis=1)
    #neigh = svm.SVC()
    neigh = KNeighborsClassifier(n_neighbors=2)
    neigh.fit(df, y) 

    #kf = KFold(len(ft[features]), n_folds=10)
    kf = ShuffleSplit(len(df), n_iter=K, test_size=test_size, random_state=0)
    # score is accuracy here 

    accuracy = cross_val_score(neigh, df, y, cv=kf)
    batch = Training.objects.create(training_accuracy=sum(accuracy[:int((1-test_size)*K)])/K/(1-test_size),  # slice indices must be ints
                                    sample_size=len(df.index),
                                    fold=K,
                                    subset_accuracy=json.dumps(accuracy.tolist()),
                                    test_accuracy=sum(accuracy[int((1-test_size)*K):])/K/test_size
                                    )
    Prediction.objects.filter(predict=False).update(training=batch)
    
    if not os.path.exists('./models'):
        os.makedirs('./models')

    joblib.dump(neigh, './models/model.pkl')
Example No. 4
def kann_classify(train_data, train_label, test_data):
    knnClf = KNeighborsClassifier(n_neighbors=5)
    knnClf.fit(train_data, ravel(train_label))
    test_label = knnClf.predict(test_data)
    save_result(test_label, 'sklearn_knn_Result.csv')
    return test_label
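save_result is not defined here; a minimal sketch, assuming the usual Kaggle-style output of one predicted label per row with a 1-based index (the real column names and format may differ):

import csv

def save_result(labels, file_name):
    # Hypothetical helper: write one prediction per row, 1-indexed.
    with open(file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ImageId', 'Label'])
        for i, label in enumerate(labels, start=1):
            writer.writerow([i, int(label)])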
Example No. 5
    def _evaluate_projection(self, x, y):
        """
        kNNEvaluate - evaluate class separation in the given projection using a k-NN method
        Parameters
        ----------
        x - variables to evaluate
        y - class

        Returns
        -------
        scores
        """
        if self.percent_data_used != 100:
            rand = np.random.choice(len(x), int(len(x) * self.percent_data_used / 100),
                                    replace=False)
            x = x[rand]
            y = y[rand]
        neigh = KNeighborsClassifier(n_neighbors=3) if self.attr_color.is_discrete else \
            KNeighborsRegressor(n_neighbors=3)
        assert ~(np.isnan(x).any(axis=None) | np.isnan(y).any(axis=None))  # the original checked x twice
        neigh.fit(x, y)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            scores = cross_val_score(neigh, x, y, cv=3)
        return scores.mean()
Example No. 6
def kppv_vecteur():
    "Interprétation des images comme vecteurs de pixels et classification via les k plus proches voisins"
    best = np.zeros(6)    
    
    for npix in range(50,200,50):
        _, data, target, _ = utils.chargementVecteursImages(mer,ailleurs,1,-1,npix)
        X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.3, random_state=random.seed())  # random.seed() returns None, so the split is unseeded
        
        for iterations in range(250, 1000, 250):  # NB: iterations is never passed to the classifier, so these refits are redundant
            for n in range(2,12,2):
                for param in range (1,3):
                    start_time = time.time()
                    kppv = KNeighborsClassifier(n_neighbors=n, p=param, n_jobs=-1)
                    
                    x1=np.array(X_train)
                    x1 = np.reshape(x1, (x1.shape[0],x1.shape[2]))
                    x2=np.array(X_test)
                    x2 = np.reshape(x2, (x2.shape[0],x2.shape[2]))
                        
                    kppv.fit(X=x1, y=Y_train)
                    score = kppv.score(x2,Y_test)
                        
                    end_time = time.time()
                    if score>best[0]:
                        best[0] = score
                        best[1] = iterations
                        best[2] = n
                        best[3] = param
                        best[4] = end_time-start_time
                        best[5] = npix
    
    print("| K plus proches voisins         | V.Pix {:4.0f} | n={:1.0f} param={:1.0f} iterations={:1.0f}           | {:10.3f}ms | {:1.3f} |".format(best[5],best[2],best[3],best[1],best[4]*1000,best[0]))
Example No. 7
def kppv_histo():
    "Interprétation des images comme histogrammes de couleurs et classification via les k plus proches voisins"
    best = np.zeros(5)    
    
    _, data, target, _ = utils.chargementHistogrammesImages(mer,ailleurs,1,-1)
    X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.3, random_state=random.seed())  # random.seed() returns None, so the split is unseeded
    
    for iterations in range(250, 1000, 250):  # NB: iterations is never passed to the classifier, so these refits are redundant
        for n in range(2,12,2):
            for param in range (1,3):
                start_time = time.time()
                kppv = KNeighborsClassifier(n_neighbors=n, p=param, n_jobs=-1)
                
                x1=np.array(X_train)
                x2=np.array(X_test)
                    
                kppv.fit(X=x1, y=Y_train)
                score = kppv.score(x2,Y_test)
                    
                end_time = time.time()
                if score>best[0]:
                    best[0] = score
                    best[1] = iterations
                    best[2] = n
                    best[3] = param
                    best[4] = end_time-start_time
    
    print("| K plus proches voisins          | V.Histo    | n={:1.0f} param={:1.0f} iterations={:1.0f}         | {:10.3f}ms | {:1.3f} |".format(best[2],best[3],best[1],best[4]*1000,best[0]))
Example No. 8
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'], weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
Example No. 9
def knn_accuracy(trn_data, trn_labels, tst_data, tst_labels, k_neighbors):

    knn = KNeighborsClassifier(k_neighbors)
    knn.fit(trn_data, trn_labels)
    results = knn.predict(tst_data)

    return np.sum(tst_labels == results)/float(tst_labels.size)
Example No. 10
    def BuildModel(self, data, labels):
        # Create and train the classifier.
        knc = KNeighborsClassifier(
            n_neighbors=self.n_neighbors, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric
        )
        knc.fit(data, labels)
        return knc
Example No. 11
    def analyze_image(self):
        '''
        Load the image and analyze it with KNN

        im_file - pre-processed with histogram specification
        '''

        if self._avg_pixels.size == 0:
            self._process_annotations()        
            self._get_initial_classes()
        
        im = self._image
        rows = im.shape[0]

        clf = KNeighborsClassifier(n_neighbors = self._n_neighbors)
        clf.fit(self._avg_pixels, self._labels)

        im_1d = im.reshape(-1, 3)

        # calculate prediction reshape into image
        prediction = clf.predict(im_1d)
        prediction = prediction.reshape(rows, -1)

        prediction[self._mask == 0] = Labels.Masked
        self.display_current(prediction)
        return prediction
Example No. 12
def main():
    # obtain the number of features in the dataset
    with open('../data/test_lung_s3.csv', 'r') as f:  # 'rb' is Python 2 style; csv.reader needs text mode in Python 3
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            num_columns = len(row)
            break
    print(num_columns)
    # load data
    mat = np.loadtxt('../data/test_lung_s3.csv', delimiter=',', skiprows=1, usecols=range(0, num_columns))
    X = mat[:, 1:num_columns]  # data
    y = mat[:, 0]  # label
    X = X.astype(float)
    y = y.astype(float)
    n_samples, n_features = X.shape

    # using 10 fold cross validation
    cv = KFold(n_samples, n_folds=10, shuffle=True)

    # evaluation
    n_features = 100
    neigh = KNeighborsClassifier(n_neighbors=1)
    acc = 0

    for train, test in cv:
        idx = svm_backward.svm_backward(X[train], y[train], n_features)
        print(idx)
        X_selected = X[:, idx]
        neigh.fit(X_selected[train], y[train])
        y_predict = neigh.predict(X_selected[test])
        acc_tmp = accuracy_score(y[test], y_predict)
        print(acc_tmp)
        acc += acc_tmp
    print('ACC', float(acc) / 10)
Example No. 13
def run(class_num, subsample_size , cluster_num, window_size,  method='knn' , n_nb = 2):
    
    # load data according to the patch size given (3 means a 3*3 = 9-pixel patch) and
    # return a dictionary containing: 'data' (one patch), 'target' (the sample the
    # patch belongs to) and 'filename' (the file it comes from)
    bofs = []
    lable = []
    filename = "%s/TRAIN_VLAD_%d_%d_%d_%d.txt" %(vlad_accee , class_num , subsample_size , window_size , cluster_num)        
    bofs , lable = get_vlad(filename)

    #knn_init = KNeighborsClassifier()
    #parameters = {'n_neighbors':[ 5, 10 , 15]}
    #knn = grid_search.GridSearchCV(knn_init, parameters)

    bofs_test = []
    lable_test = []
    filename = "%s/TEST_VLAD_%d_%d_%d_%d.txt" %(vlad_accee , class_num , subsample_size , window_size , cluster_num)
    bofs_test , lable_test = get_vlad(filename)


    start = time.time()    
    if(method == "knn"):
        knn = KNeighborsClassifier(n_neighbors = n_nb)
        knn.fit(bofs, lable)
        predicted = knn.predict(bofs_test)
        score = knn.score(bofs_test,lable_test)
   
    print(time.time()-start) 

    return score  
def main_process():
    data_dict = parse_txt()
    x_data, y_data, places_cnt, path_int_dict = build_x_y_data(data_dict)
    print('data counts', len(x_data), len(y_data))
    print('zone names counts', places_cnt)
    print('path counts', len(path_int_dict))

    # start to train, change list type to numpy.array
    x_data = np.array(x_data)
    y_data = np.array(y_data)

    knn = KNeighborsClassifier()

    indices = np.random.permutation(len(x_data))
    x_train = x_data  # note: the model is trained on all rows, including the sampled test rows
    y_train = y_data
    x_test = x_data[indices[-TEST_DATA_ROWS:]]
    y_test = y_data[indices[-TEST_DATA_ROWS:]]
    knn.fit(x_train, y_train)  # work

    test_result = knn.predict(x_test)  # test
    proba_test_result = knn.predict_proba(x_test)

    # no duplicate value, so reverse this dictionary
    int_path_dict = dict(zip(path_int_dict.values(), path_int_dict.keys()))

    print('predict result:', test_result)
    print([int_path_dict[x] for x in test_result])  # test result
Example No. 15
def train(x_train, y_train):

    # reg = LinearRegression()
    reg = KNeighborsClassifier()
    reg.fit(x_train, y_train)

    return reg
def exercise_2a():
    X, y = make_blobs(n_samples=1000,centers=50, n_features=2, random_state=0)
    # plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    # plt.show()
    kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1,50):
        iterator = 0
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test,y_test))
            iterator+=1
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        # accuracy_lst[k-1, 1] = accuracy_current.std() #confidence interval 95%
    x = np.arange(1,50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def build_model():
    """ request cip and skill data from DataUSA and develop predictive model using scikit-learn

    :return: fit model ready to accept user input to make a prediction
    """

    # request data on college majors and relevant skills
    r = requests.get(r'http://api.datausa.io/api/?show=skill&sumlevel=all')
    data_usa = r.json()

    headers = data_usa['headers']
    data = data_usa['data']

    df = pd.DataFrame(data, columns=headers)
    df.drop('value_rca', axis=1, inplace=True)

    # reshape data so that each skill becomes a single column (i.e. feature for the model)
    pivot = df.pivot_table(index='cip', columns='skill', values='value')
    pivot = pivot.reset_index()

    X = pivot.drop('cip', axis=1)  # feature matrix
    y = pivot.cip  # response

    knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
    knn.fit(X, y)

    return knn
def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 500.0
    df_cell_train.loc[:,'y'] *= 1000.0
    df_cell_test.loc[:,'x'] *= 500.0
    df_cell_test.loc[:,'y'] *= 1000.0
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    
    return pred_labels, row_ids
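calculate_distance above is a user-defined weight function: scikit-learn passes such a callable an array of neighbor distances and expects an array of the same shape back. Its actual definition is not shown in this snippet, so the inverse-distance weighting below is only a plausible stand-in:

import numpy as np

def calculate_distance(distances):
    # Hypothetical stand-in: weight neighbors by inverse squared distance,
    # with a small constant to avoid division by zero.
    return 1.0 / (distances ** 2 + 1e-6)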
Example No. 19
def knn(depth, joints, C, visualize=False, zScale=1.0):
    pts_world, labels = joints2skeleton(joints)

    pts_world[:, 2] *= zScale
    classifier = KNeighborsClassifier(n_neighbors=nNeighbors)
    classifier.fit(pts_world, labels)

    X = np.vstack((np.nonzero(depth)[1], np.nonzero(depth)[0]))
    X = np.vstack((X, depth[depth != 0]))
    X_world = pixel2world(X.T, C)
    X_world[:, 2] *= zScale
    predicts = classifier.predict(X_world)

    perPixelLabels = -np.ones(depth.shape)
    perPixelLabels[depth != 0] = predicts

    img = np.zeros((H, W, 3), np.uint8)
    for i in range(nJoints):
        img[perPixelLabels == i] = palette[i]

    skel = None
    if visualize is True:
        # foreground = visualizePts(world2pixel(pts_world, C), labels)
        # img[foreground != 0] = foreground[foreground != 0]
        skel = visualizePts(world2pixel(pts_world, C), labels)

    return (img, X_world, predicts, skel)
Example No. 20
    def onstartButton(self):

        cap = cv2.VideoCapture(str(self.file_name))

        if not self.isfileWorking and self.ishasFile:
            self.ishasFile = False
            self.startButton.setText("Close")

            # cap = cv2.VideoCapture(str(self.file_name))

            self.isfileWorking = True
            data=spio.loadmat("openface_fea.mat")
            X=data['feature']
            id=data['id'].astype(int)-1
            Y=id[0,:]
            name=list(set(data['names']))
            name.sort()
            print("***Train knn classifier***")
            knn=KNeighborsClassifier(n_neighbors=20,weights='distance',p=2)
            knn.fit(X,Y)

            success,frame = cap.read()

            while success and self.isfileWorking:
                start = time.time()
                success, frame = cap.read()
                
                if success:
                    img=frame.copy()
                   
                    bb,rep=getRep(img)
                    if bb is None:
                        print("Can't find any face in this picture")
                    else:
                        if rep == 0:
                            print("Get rep failed...")
                        else:
                            rep=np.reshape(rep,(1,128))
                            idx = knn.predict(rep)
                            # print("label is {} ".format(idx))
                            proba = knn.predict_proba(rep)
                            actor = name[int(idx[0])]  # predict returns an array; take the first label
                            self.namelineEdit.setText(actor)
                            self.timelineEdit.setText(str(round(time.time()-start,3)))
                            self.confidencelineEdit.setText(str(round(max(proba[0]),2)))
                            # print("Proba is {} ".format(proba))
                            
                            

                            draw_dlib_rects(frame,bb,actor,(0,255,0))
                    image = QtGui.QImage(frame.data, frame.shape[1], frame.shape[0], QtGui.QImage.Format_RGB888).rgbSwapped()
                    pixmap = QtGui.QPixmap.fromImage(image)
                    self.showlabel.setPixmap(pixmap)
                    k = cv2.waitKey(5)
        else:
            self.ishasFile = False
            self.startButton.setText("Start")
            self.isfileWorking = False
            cap.release()
            self.showlabel.clear()
def exercise_2b():
    X, y = make_blobs(n_samples=1000,centers=50, n_features=2, random_state=0)
    kf = ShuffleSplit(100, train_size= 0.9, test_size=0.1, random_state=0)
    # kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1,50):
        iterator = 0
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test,y_test))
            iterator+=1
            print(mean_squared_error(y_test, clf.predict(X_test)))
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        accuracy_lst[k-1, 1] = accuracy_current.var()  # *2 for a 95% confidence interval
    x = np.arange(1,50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 1], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K')
    plt.ylabel('Variance')
    plt.show()
Example No. 22
def build_classifier(images, labels):
    #this builds the classifier: it calls KNN (KNeighborsClassifier) from
    #sklearn, fits it to the training images, and returns the fitted model.
    classifier = KNN(n_neighbors=3,weights='distance')
    classifier.fit(images, labels)
    return classifier
Example No. 23
def xValidateKFold(n, k, iris_x, iris_y, VERBOSE):
    # K-fold cross validation
    # n = number of folds
    # k = number of nearest neighbors to check
    # iris_x = data
    # iris_y = classes
    # VERBOSE = flag to print more information on each fold iteration

    # Create the cross validator
    kf = cross_validation.KFold(n=len(iris_x), n_folds = n, random_state=0)
    if VERBOSE:
        print "kFold validator: "+str(kf)

    avgScore = 0.0  # function returns avg score for all runs

# for each set of training and test data
    for train_index, test_index in kf:
        knn = KNeighborsClassifier(n_neighbors=k)   # Create Classifier
        knn.fit(iris_x[train_index],iris_y[train_index])  # Training Classifier
        prediction = knn.predict(iris_x[test_index])  # Predict on test data
        score = knn.score(iris_x[test_index], iris_y[test_index])  # Evaluate success of prediction
        avgScore += score  # Accrue score for averaging
        if VERBOSE:
            print "\tscore for this validation round: "+str(score)

    return avgScore/float(n)    # Return average score for all iterations
def main():
    print('[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S')))
    testing_file = open('test.p', 'rb')    # file() is Python 2 only; pickle needs binary mode
    training_file = open('train.p', 'rb')

    train = pickle.load(training_file)
    test = pickle.load(testing_file)

    testing_file.close()
    training_file.close()
    
    trainX = train[:,:-1]
    trainy = train[:,-1]
    
    testX = test[:,:-1]
    testy = test[:,-1]

    print('[INFO, time: %s] Downsampling ...' % (time.strftime('%H:%M:%S')))
    trainX = downsample_features(trainX)
    testX = downsample_features(testX)

    trainX, testX = normalize(trainX, testX)

    print('[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), '50 - Neighbors'))
    clf = KNeighborsClassifier(n_neighbors=50)
    clf.fit(trainX, trainy)

    print('[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S')))
    prediction = clf.predict(testX)
    print('[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction)))
Example No. 25
def main():
    means = [[-1, -1], [1.0, 1.0]]
    variances = [np.random.rand]  # unused; the call below passes np.eye(2) instead
    knn_models = [3, 5, 10]
    data_sizes = [10, 25, 50, 75, 100, 125, 150, 175, 200]
    points_per_class = 500
    data = dg.generate_gaussian_mixture(class_means=means, class_variances=np.eye(2),
                                        num_components=5, num_desired_points_per_class=points_per_class)
    class_0 = np.hstack((data[0], np.zeros((len(data[0]), 1))))
    class_1 = np.hstack((data[1], np.ones((len(data[1]), 1))))  # use len(data[1]) for class 1
    results_train = np.empty((len(knn_models), len(data_sizes)))
    results_test = np.empty((len(knn_models), len(data_sizes)))
    train_data_class_0, test_data_class_0 = split_train_test(class_0)
    train_data_class_1, test_data_class_1 = split_train_test(class_1)
    print('train size, test size', len(train_data_class_1), len(test_data_class_1))
    train_data = np.vstack((train_data_class_0, train_data_class_1))
    test_data = np.vstack((test_data_class_0, test_data_class_1))
    for i, knn_model in enumerate(knn_models):
        kncs = KNeighborsClassifier(n_neighbors=knn_model)
        for j, data_size in enumerate(data_sizes):
            curr_train_class_0, curr_train_class_1 = train_data_class_0[:data_size], train_data_class_1[:data_size]
            curr_train_data = np.vstack((curr_train_class_0, curr_train_class_1))
            kncs.fit(curr_train_data[:, :2], curr_train_data[:, -1])
            predictions_train = kncs.predict(train_data[:, :2])
            predictions_test = kncs.predict(test_data[:, :2])
            results_train[i][j] = len(np.where(predictions_train != train_data[:, -1])[0]) / float(len(train_data))
            results_test[i][j] = len(np.where(predictions_test != test_data[:, -1])[0]) / float(len(test_data))

    plt.plot(data_sizes, results_test[0, :], 'r')
    plt.plot(data_sizes, results_test[1, :], 'b')
    plt.plot(data_sizes, results_test[2, :], 'g')
    plt.plot(data_sizes, results_train[0, :], 'r--')
    plt.plot(data_sizes, results_train[1, :], 'b--')
    plt.plot(data_sizes, results_train[2, :], 'g--')
    plt.show()
Example No. 26
def knnClassify(enrollment_id, trainData, trainLabel, testData):
    knnClf = KNeighborsClassifier(n_neighbors=5)  # default: k=5
    # knnClf.fit(trainData,trainLabel)
    knnClf.fit(trainData, ravel(trainLabel))  # numpy.ravel flattens the array into a single row
    testLabel = knnClf.predict(testData)
    saveResult(enrollment_id, testLabel, "sklearn_knn_Result.csv")
    return testLabel
def process_one_cell(cell_train, cell_test, fw, th, n_neighbors):
    
    # Remove infrequent places
    cell_train = remove_infrequent_places(cell_train, th)
    
    # Store row_ids for test
    row_ids = cell_test[:, -1].flatten().astype(np.int32)
    cell_test = cell_test[:, :-1]
    
    # Preparing data
    y = cell_train[:, -1].flatten().astype(np.int64)
    X = cell_train[:, :-1]
    
    #Applying the classifier
    cte = 5.8
    n_neighbors = int((y.size ** 0.5) / cte)  # note: overrides the n_neighbors argument
    clf = KNeighborsClassifier(n_neighbors=n_neighbors,
                            weights=calculate_distance, p=1, 
                            n_jobs=2, leaf_size=15)
    clf.fit(X, y)
    y_pred = clf.predict_proba(cell_test)
    y_pred_labels = np.argsort(y_pred, axis=1)[:,:-4:-1]
    pred_labels = clf.classes_[y_pred_labels]
    cell_pred = np.column_stack((row_ids, pred_labels)).astype(np.int64) 
    
    return cell_pred
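remove_infrequent_places is not defined in this snippet; a plausible sketch, assuming cell_train is a NumPy array whose last column holds place_id (mirroring the value_counts masking used in the DataFrame variants of process_one_cell above):

import numpy as np

def remove_infrequent_places(cell_train, th):
    # Hypothetical helper: keep only rows whose place_id occurs at least th times.
    places, counts = np.unique(cell_train[:, -1], return_counts=True)
    frequent = places[counts >= th]
    mask = np.in1d(cell_train[:, -1], frequent)
    return cell_train[mask]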
def exercise_1():
    X, y = make_blobs(n_samples=1000,centers=50, n_features=2, random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False, random_state=None)
    # kf = cross_validation.ShuffleSplit(1000,n_iter=25, test_size=0.1, train_size=0.9, random_state=None)

    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1,50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append( zero_one_loss(y_test, clf.predict(X_test)) )


            # error.append(clf.predict(X_test))
            # error.append( 1. - clf.score(X_test, y_test) ) #, accuracy_score(y_test, clf.predict(X_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
            # error.append()
        # print error
        error_total[k-1, 0] = np.array(error).mean()
    # print error_total
    x = np.arange(1,50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def neighborsPrediction(train_dfs, targetLabels, fold_cv):
    scoresNeighbor = [0.0]
    n_neighbors = 0

    for i in range(1, 10):
        neighbor, instances_train, instances_test, target_train, target_test, scoresNeighborTmp = testScore(train_dfs,
                                                                                                            targetLabels,
                                                                                                            fold_cv,
                                                                                                            i * 2)
        if sum(scoresNeighborTmp) / len(scoresNeighborTmp) > sum(scoresNeighbor) / len(scoresNeighbor):
            scoresNeighbor = scoresNeighborTmp
            n_neighbors = i * 2
            # print(sum(scoresNeighborTmp)/len(scoresNeighborTmp))

    neighbor = KNeighborsClassifier(n_neighbors)
    neighbor.fit(train_dfs, targetLabels)

    instances_train, instances_test, target_train, target_test = cross_validation.train_test_split(train_dfs,
                                                                                                   targetLabels,
                                                                                                   test_size=0.4,
                                                                                                   random_state=0)

    predictions = neighbor.predict(instances_test)
    print("Generate random forest with: {0} neighbors".format(str(n_neighbors)))
    return neighbor, instances_train, target_train, target_test, predictions, scoresNeighbor
Example No. 30
#         csvFile.close()

traindata = np.vstack(
    (pd.read_csv('wifidata0.csv',
                 header=0).values, pd.read_csv('wifidata1.csv',
                                               header=0).values,
     pd.read_csv('wifidata2.csv',
                 header=0).values, pd.read_csv('wifidata3.csv',
                                               header=0).values))
# pd.read_csv('wifidata4.csv', header=0).values, pd.read_csv('wifidata5.csv', header=0).values))
print(traindata.shape)

trainlabel = pd.read_csv('shopdata.csv', header=0).values[:12000, :]
print(trainlabel.shape)

testdata = pd.read_csv('wifidata6.csv', header=0)
testlabel = pd.read_csv('shopdata.csv', header=0).values[18000:21000, :]

##########
clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
clf.fit(traindata, trainlabel)
test = clf.predict(testdata.values)
accuracy = np.trace(np.dot(np.array(test), testlabel.T)) / 3000

print(clf.score(traindata, trainlabel))
print(clf.score(testdata, testlabel))
print(accuracy)
print(datetime.now())
######## train score 1.0  test score 0.827666666667   samples 12000, n_neighbors=5, weights='distance'
######## train score 1.0  test score 0.832666666667   samples 12000, n_neighbors=3, weights='distance'
######## heavy on memory; slow, about 20 min
X.info()


# 3. split the data into train, and test datasets. We will be predicting whether or not someone votes based on the remaining features.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)



# 4. Fit a k-neighbors classifer on the training data. Use 4 for your number of neighbors.
#    How accurate is your model?
#    How does it perform on the test data?
#Create KNN Object
knn = KNeighborsClassifier(n_neighbors = 4)

# Fit the model to the training data
knn.fit(X_train, y_train)

# Estimate whether or not a person will vote, using the training data.

y_pred = knn.predict(X_train)
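To answer the questions in the comments above (how accurate is the model, and how does it perform on the test data?), a quick check, assuming accuracy_score is available from sklearn.metrics:

from sklearn.metrics import accuracy_score

# Resubstitution accuracy on the training set vs. accuracy on the held-out test set.
print('Train accuracy: {:.3f}'.format(accuracy_score(y_train, y_pred)))
print('Test accuracy: {:.3f}'.format(knn.score(X_test, y_test)))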

best = [0]
for k in range(1,5):
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    print(f'for k = {k}')
    print('Accuracy of KNN classifer on test set: {:.2f}'.format(knn.score(X_test, y_test)))
    if knn.score(X_test, y_test) > best[0]:
        best = [knn.score(X_test,y_test)]
        ypred = knn.predict(X_test)

df = pd.read_csv("week4_set2.csv")

X1 = df.iloc[:, 0]
X2 = df.iloc[:, 1]
X = np.column_stack((X1, X2))
y = df.iloc[:, 2]

# --- KNN MODEL ----
knn_model = KNeighborsClassifier(
    n_neighbors=5, weights="uniform")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

knn_model.fit(X_train, y_train)
predictions = knn_model.predict(X_test)

calculate_confusion_matrix(y_test, predictions, "KNN")
y_score = knn_model.predict_proba(X_test)
plot_roc_curve(y_test, y_score[:, 1], "KNN")

# --- LOGISTIC REGRESSION ---
# not using polynomial features as they don't make much of a difference
polynomial_features = prep.PolynomialFeatures(degree=2)  #  use initial q
X = polynomial_features.fit_transform(X)
log_model = LogisticRegression(C=1, penalty="l2", random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
log_model.fit(X_train, y_train)
predictions = log_model.predict(X_test)
Example No. 33
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Creating classifier
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)

# Fitting the classifier to the training set
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),
Example No. 34
from numpy.random import random

#  Use 20-fold cross-validation to evaluate the classification error rate of k-NN over
# the Iris dataset in sklearn, for each of the values k = 1, 2, 4, 8, 16, 32. Use a
# KNeighborsClassifier with the appropriate parameter values.
# Plot the cross-validated error rate values as a function of k

iris = datasets.load_iris()
X = iris.data
y = iris.target

k_range = range(1, 33)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, y)

    scores = cross_val_score(knn, X, y, cv=20)
    k_scores.append(scores.mean())

plt.plot(k_range, k_scores)
plt.title("nearest neighbor optimization")
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

# (a) Write a Python program around the randPointUnitBall(d) function that generates a data set consisting of 1000 independently sampled points in the d-dimensional unit ball for each dimension d = 1, 10, 100, and that reports the
# mean Euclidean length (norm) of these examples for each d. Submit your documented source code (copy the text into your writeup, and attach the source file
# separately), as well as the results. Describe the results.
# (b) For additional insight, plot a histogram of the Euclidean lengths for dimensions
# d = 1, 10, 100. Use matplotlib.pyplot.hist and specify density=True as one
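A sketch of part (a) and (b). The original randPointUnitBall(d) is not shown here, so a Gaussian-direction sampler stands in for it (a unit direction scaled by U^(1/d) is uniform in the d-dimensional ball):

import numpy as np
import matplotlib.pyplot as plt

def randPointUnitBall(d):
    # Stand-in sampler: uniform direction on the sphere, radius U^(1/d).
    direction = np.random.randn(d)
    direction /= np.linalg.norm(direction)
    radius = np.random.rand() ** (1.0 / d)
    return direction * radius

for d in (1, 10, 100):
    lengths = np.array([np.linalg.norm(randPointUnitBall(d)) for _ in range(1000)])
    print('d={}: mean length {:.3f}'.format(d, lengths.mean()))
    plt.hist(lengths, density=True, alpha=0.5, label='d={}'.format(d))
plt.legend()
plt.show()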
kclf = KNeighborsClassifier(n_neighbors=10)

# In[ ]:

from numpy import loadtxt
train = loadtxt('/Users/ishaan/Documents/255-Prog-2/data/train.dat')
test = loadtxt('/Users/ishaan/Documents/255-Prog-2/data/test.dat')
labels = loadtxt('/Users/ishaan/Documents/255-Prog-2/data/train.labels')
sample_format = loadtxt('/Users/ishaan/Documents/255-Prog-2/data/format.dat')

# In[ ]:

#Dimensionality Reduction

svd = TruncatedSVD(n_components=80)
x_rd = svd.fit(train).transform(train)

# In[ ]:

#Using K Nearest Neighbours to classify data
# (note: the classifier is fit on the raw features; the reduced x_rd above is unused)

kclf = kclf.fit(train, labels)
pred = kclf.predict(test)

np.savetxt('/Users/ishaan/Desktop/predictions17.dat',
           pred,
           delimiter=',',
           fmt='%i')

# In[ ]:
Example No. 36
def imagenet_knn(train_file='gs://dataset-jesus-bucket/DataSet/',
                 job_dir='gs://dataset-jesus-bucket/',
                 **args):
    from keras.applications.vgg16 import VGG16
    import numpy as np

    file_stream = file_io.FileIO(
        "gs://data-daisy/full_gs_paths_large_size.pickle", mode='rb')
    data_frame = pickle.load(file_stream)

    vgg16_model = VGG16(weights='imagenet', include_top=True)

    vgg16_rep_layer = Model(inputs=vgg16_model.input,
                            outputs=vgg16_model.get_layer(index=21).output)

    print(vgg16_rep_layer.summary())

    x_001, y_001, normalized_check = read_data_file_io(data_frame, ['/001/'],
                                                       data_type="test")
    x_002, y_002, normalized_check = read_data_file_io(data_frame, ['/002/'],
                                                       data_type="test")

    x_001_list, y_001_list = x_001.tolist(), y_001.tolist()
    x_002_list, y_002_list = x_002.tolist(), y_002.tolist()

    list_to_randomize = []
    list_test = []

    for (x, y) in zip(x_001_list, y_001_list):
        list_to_randomize.append([x, y])

    random.shuffle(list_to_randomize)  # shuffle data used to train
    n = 10
    batch_size = len(list_to_randomize) // n
    remainder = len(list_to_randomize) - batch_size * n
    print(batch_size)

    for (x, y) in zip(x_002_list, y_002_list):
        list_to_randomize.append([x, y])

    # extract data to test (001 dataset up to batch size * n + remainder)
    x_001_randarr = np.array([
        item[0] for item in list_to_randomize[0:n * batch_size + remainder - 1]
    ])
    y_001_randarr = np.array([
        item[1] for item in list_to_randomize[0:n * batch_size + remainder - 1]
    ])

    x_002_list = [
        item[0] for item in list_to_randomize[n * batch_size + remainder:]
    ]  # used for ref. point
    y_002_list = [
        item[1] for item in list_to_randomize[n * batch_size + remainder:]
    ]

    clf = KNeighborsClassifier()  # create KNN object
    accuracy_list = []

    # train with dataset 2
    x_002_arr = np.array(x_002_list)
    int_output = vgg16_rep_layer.predict(x_002_arr)
    int_output = int_output.reshape(x_002_arr.shape[0], -1)
    clf.fit(int_output, np.array(y_002_list))

    init_loss = knn_accuracy(clf, x_001_randarr, y_001_randarr,
                             vgg16_rep_layer)  # Test on 001
    accuracy_list.append(init_loss)

    z = 1

    for i in range(10):
        print("Fitting on batch number:", z)

        x_test_list = [
            item[0] for item in list_to_randomize[0:(i + 1) * batch_size - 1 +
                                                  remainder * (i // 9)]
        ] + [
            item[0] for item in list_to_randomize[n * batch_size + remainder:]
        ]
        y_test_list = [
            item[1] for item in list_to_randomize[0:(i + 1) * batch_size - 1 +
                                                  remainder * (i // 9)]
        ] + [
            item[1] for item in list_to_randomize[n * batch_size + remainder:]
        ]

        x = np.array(x_test_list)
        y = np.array(y_test_list)

        print(x.shape, y.shape)

        int_output = vgg16_rep_layer.predict(x)
        int_output = int_output.reshape(x.shape[0], -1)

        clf.fit(int_output, y)

        accuracy = knn_accuracy(clf, x_001_randarr, y_001_randarr,
                                vgg16_rep_layer)
        print(accuracy)
        accuracy_list.append(accuracy)

        z += 1

    # plain open() cannot write to a gs:// path; use file_io as above
    with file_io.FileIO('gs://data-daisy/increasing_knn_acc_vgg16.pickle',
                        mode='wb') as handle:
        pickle.dump(accuracy_list, handle)

    print(accuracy_list)
Example No. 37

#plot_data()

print("Features: ")
print(x)
print('Labels:- ')
print(y)

print('features:-')
print(x.values)
print("Labels:-\n ")
print(y.values)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x, y)

#print("Fire Status : {}%".format(knn.predict([[9,175,218]])[0]))
forestClient.connect(hostname, 1883, 60)


def on_message(client, userdata, msg):
    t = msg.topic
    d = msg.payload.decode()
    data.append(d)
    print(msg.topic, " , ", d)
    datas.append(data)
    print(datas)
    knn.predict(datas)
    #    count=0
y_pred=lda.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

end=time.time()
print("traning time is:")
print(end - start)

print(classification_report(y_test, y_pred))
plot_confusion_matrix(confusion_matrix(y_test, y_pred))

from sklearn.neighbors import KNeighborsClassifier
start = time.time()

knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred=knn.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

end=time.time()
print("traning time is:")
print(end - start)

print(classification_report(y_test, y_pred))
plot_confusion_matrix(confusion_matrix(y_test, y_pred))

from sklearn.tree import DecisionTreeClassifier
start = time.time()

clf = DecisionTreeClassifier().fit(x_train, y_train)
print(cancer.feature_names)
print(cancer.target_names)

for i in range(0,569):
    for j in range(0,569):
        if(cancer.target[j]<cancer.target[i]):
            temp1=cancer.data[j]
            cancer.data[j]=cancer.data[i]
            cancer.data[i]=temp1
            temp2=cancer.target[j]
            cancer.target[j]=cancer.target[i]
            cancer.target[i]=temp2

print(cancer.target)

i=0
while(i<369):
    print(cancer.data[i],"*****",cancer.target[i])
    i +=1 
   
x=cancer.data
y=cancer.target
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.5)
normal_classifier=KNeighborsClassifier()
normal_classifier.fit(x_train,y_train)
predictions1=normal_classifier.predict(x_test)
print(accuracy_score(y_test,predictions1))

myclassifier=classifier()
myclassifier.fit(x_train,y_train)
predictions2=myclassifier.predict(x_test)
    def KNN(cls, X_train, Y_train):
        knn = KNeighborsClassifier()
        knn.fit(X_train, Y_train)

        cls.save(knn, 'KNN')
        return knn
Example No. 41
recall_svm = cm_svm[0][0] / (cm_svm[0][0] + cm_svm[0][1])
precision_svm = cm_svm[0][0] / (cm_svm[0][0] + cm_svm[1][0])  # TP / (TP + FP); the original divided by cm_svm[1][1]
print(recall_svm, precision_svm)

# the results of this section is Accuracy : 0.741 Recall : 0.85 Precision : 0.739

# Now we run the same test for KNN
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5,
                               n_jobs=-1,
                               leaf_size=60,
                               algorithm='brute')

knn_clf.fit(X_train, y_train)

y_pred_knn = knn_clf.predict(X_test)
print(y_pred_knn)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
Example No. 42
"""
Created on Tue Nov 27 14:17:10 2018

@author: jaynanda
"""

import pandas as pd
import numpy as np

train = pd.read_csv(
    "/Users/jaynanda/Desktop/Assignments/660/Project/Numeric Data/kids_family_numeric.csv"
)

feature = pd.DataFrame(train['Genre'])
train = train.drop('Genre', axis=1)

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train,
                                                    feature,
                                                    test_size=0.30)

from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

res = clf.predict(X_test)

from sklearn.metrics import accuracy_score

print(accuracy_score(res, y_test))
Example No. 43
class KNNClassifier(BaseEstimator, ClassifierMixin):
    """k nearest neighbors classifier.

    Parameters
    ----------
    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    weights : str or callable, optional (default = 'uniform')
        weight function used in prediction.  Possible values:

        - 'uniform' : uniform weights.  All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          in this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or KDTree.  This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree.  The optimal value depends on the
        nature of the problem.

    metric : string or DistanceMetric object (default = 'minkowski')
        the distance metric to use for the tree.  The default metric is
        minkowski, and with p=2 is equivalent to the standard Euclidean
        metric. See the documentation of the DistanceMetric class for a
        list of available metrics. 'dtw' and 'fast_dtw' are also
        available.

    p : integer, optional (default = 2)
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
        Doesn't affect :meth:`fit` method.

    """
    def __init__(self,
                 n_neighbors=1,
                 weights='uniform',
                 algorithm='auto',
                 leaf_size=30,
                 p=2,
                 metric='minkowski',
                 metric_params=None,
                 n_jobs=1,
                 **kwargs):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs
        self.kwargs = kwargs

    def fit(self, X, y):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target vector relative to X

        Returns
        -------
        self : object
            Returns self.

        """
        X, y = check_X_y(X, y)

        if self.metric == 'dtw':
            self._clf = KNeighborsClassifier(self.n_neighbors, self.weights,
                                             self.algorithm, self.leaf_size,
                                             self.p, dtw, self.metric_params,
                                             self.n_jobs, **self.kwargs)

        elif self.metric == 'fast_dtw':
            self._clf = KNeighborsClassifier(self.n_neighbors, self.weights,
                                             self.algorithm, self.leaf_size,
                                             self.p, fast_dtw,
                                             self.metric_params, self.n_jobs,
                                             **self.kwargs)

        else:
            self._clf = KNeighborsClassifier(self.n_neighbors, self.weights,
                                             self.algorithm, self.leaf_size,
                                             self.p, self.metric,
                                             self.metric_params, self.n_jobs,
                                             **self.kwargs)

        self._clf.fit(X, y)
        return self

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        y : array-like, shape [n_samples]
            Class labels for each data sample.

        """
        check_is_fitted(self, '_clf')
        X = check_array(X)
        return self._clf.predict(X)
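The dtw and fast_dtw callables passed as metrics above are helper functions imported elsewhere in the original module and are not shown here. As one plausible form, a minimal dynamic-time-warping distance: scikit-learn callable metrics receive two 1-D feature vectors and must return a float.

import numpy as np

def dtw(a, b):
    # Hypothetical sketch: classic O(len(a) * len(b)) dynamic-programming DTW
    # between two sequences represented as 1-D arrays.
    n, m = len(a), len(b)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = abs(a[i - 1] - b[j - 1])
            cost[i, j] = d + min(cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])
    return cost[n, m]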
#obs_all = ['ID','Elevation','Aspect','Slope','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Horizontal_Distance_To_Roadways','Hillshade_9am','Hillshade_noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points','Wilderness_Area_1','Wilderness_Area_2','Wilderness_Area_3','Wilderness_Area_4','2702','2703','2704','2705','2706','2717','3501','3502','4201','4703','4704','4744','4758','5101','5151','6101','6102','6731','7101','7102','7103','7201','7202','7700','7701','7702','7709','7710','7745','7746','7755','7756','7757','7790','8703','8707','8708','8771','8772','8776','Cover_Type']

cls = ['Cover_Type']
trainObs = train.as_matrix(obs_bin)
trainCls = train.as_matrix(cls).ravel()
testObs = test.as_matrix(obs_bin)
testCls = test.as_matrix(cls).ravel()

# ----  K Nearest Neighbor Classification
print("---- KNN ----")

# Set up a K Nearest Neighbor Classifier with the number of neighbors = 3, weighting neighbors by inverse distance
knn = KNeighborsClassifier(n_neighbors=3, weights='distance')

# Fit the K Nearest Neighbor classifier to the training data and use the resulting classifier to predict the class values for the test dataset
knn.fit(trainObs, trainCls)
knn_pred = knn.predict(testObs)
print(knn_pred)

# Calculate the accuracy of the classifier.
print("KNN Accuracy:")
print((sum(testCls == knn_pred)) / len(knn_pred))
# Create a confusion matrix using Scikit-Learn confusion_matrix
knn_tab = confusion_matrix(testCls, knn_pred, labels=labs)
print(knn_tab)
# Create a classification report for the result including precision, recall, and f measure.
print(metrics.classification_report(testCls, knn_pred))

# Exercise 1: Now go back and experiment with different values of k.  What happened?

# ---- Decision Tree Classification
Example No. 45
# Creating X and y for the machine learning algorithm.
print(
    ' - Creating X and y for the learning algorithm from the diabetes_dataset file'
)
# To change which columns are considered, just edit the array below.
feature_cols = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age'
]
X = data[feature_cols]
y = data.Outcome

# Creating the predictive model for this dataset
print(' - Creating predictive model')
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)

# Making predictions on the application file
print(' - Applying the model and sending results to the server')
data_app = pd.read_csv('diabetes_app.csv')
y_pred = neigh.predict(data_app)

# Sending the model's predictions to the server
URL = "https://aydanomachado.com/mlclass/01_Preprocessing.php"

# TODO Replace with your own key here
DEV_KEY = "Tô de ouvinte"

# JSON payload to be sent to the server
data = {
    'dev_key': DEV_KEY,
Example No. 46
    def machineTrain(self):
        ## Load dataset
        url = "/home/pi/Desktop/6pplData.csv"
        urlFeature = "/home/pi/Desktop/features.csv"
        urlOutput = "/home/pi/Desktop/outputs.csv"
        dataset = pandas.read_csv(url, header=None)
        
        global window_size
        window_size = 30
        shift_size = 30
        models = []
        models.append(('KNN', KNeighborsClassifier(n_neighbors=7)))
        knn = KNeighborsClassifier(n_neighbors=7)
        
        ## Split-out validation dataset
        array = dataset.values
        X = array[:, 0:12]
        Y = array[:, 12]
        
        ## Declaring acc and gyro array
        accData = numpy.empty((array.shape[0], 6))
        gyroData = numpy.empty((array.shape[0], 6))
        
        accData = array[:, :6]
        gyroData = array[:, 6:12]

        ## Creating the normalizer
        global normalizerAcc
        global normalizerGyro

        normalizerAcc = preprocessing.Normalizer().fit(accData)
        normalizerGyro = preprocessing.Normalizer().fit(gyroData)

        ## Normalizing the data
        # accData = normalizerAcc.transform(accData)
        # gyroData = normalizerGyro.transform(gyroData)


        global le 
        le = preprocessing.LabelEncoder()
        le.fit(['nomove', 'wavehands', 'busdriver', 'frontback', 'sidestep', 'jumping', 'jumpingjack', 'turnclap', 'squatturnclap', 'windowcleaning', 'windowcleaner360', 'final'])
######### Segmentation and Feature Extraction #####################################################################        
        # Y_encoded = le.transform(Y)
        
        # N = dataset.shape[0]
        # dim_X = X.shape[1]
        # K = (N // shift_size) - 15
        # segments_X = numpy.empty((K, window_size, 3*(dim_X)))
        # segments_Y = numpy.empty((K, 3*window_size))
        
        # segment_X = numpy.empty((window_size, 3*(dim_X)))
        # for i in range(K):
        #     segment_X[:, :6] = accData[i * shift_size : (i*shift_size) + window_size, :]
        #     segment_X[:,6:12] = gyroData[i * shift_size: (i*shift_size) + window_size, :]
        #     segment_X[:, 12:18] = accData[i * shift_size + window_size : (i*shift_size) + 2*window_size, :]
        #     segment_X[:, 18:24] = gyroData[i * shift_size + window_size: (i*shift_size) + 2*window_size, :]
        #     segment_X[:, 24:30] = accData[i * shift_size + 2*window_size : (i*shift_size) + 3*window_size, :]
        #     segment_X[:, 30:36] = gyroData[i * shift_size + 2*window_size: (i*shift_size) + 3*window_size, :]
        #     segment_Y = Y_encoded[i * shift_size : (i * shift_size) + 3*window_size]
        #     segments_X[i] = segment_X
        #     segments_Y[i] = segment_Y
        
        # for i in range(K):
        #     segment_X = X[i * shift_size : (i * shift_size) + window_size , :]
        #     segment_Y = Y_encoded[i * shift_size : (i * shift_size) + window_size]
        #     segments_X[i] = segment_X
        #     segments_Y[i] = segment_Y
        
        # features = numpy.empty((K, 72))
        # outputs = numpy.empty((K))
        # for i in range(K):
        #     for j in range(0, features.shape[1] - 1, 2):
        #         features[i, j] = segments_X[i, : , j // 2].mean()
        #         features[i, j + 1] = segments_X[i, : , j // 2].std()
        #     outputs[i] = stats.mode(segments_Y[i])[0]

        # df = pandas.DataFrame(features)
        # df.to_csv("features.csv", header = None)
        # df = pandas.DataFrame(outputs)
        # df.to_csv("outputs.csv", header = None)
###################################################################################################################
        features_csv = pandas.read_csv(urlFeature, header=None)
        features = features_csv.values
        
        outputs_csv = pandas.read_csv(urlOutput, header=None)
        outputs = numpy.ravel(outputs_csv.values, order = "C")        
        
        validation_size = 0.2
        
        seed = 7
        X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(features, outputs, test_size=validation_size, random_state=seed)

        # Test options and evaluation metric
        scoring = 'accuracy'
        
        # evaluate each model in turn
        results = []
        
        names = []
        for name, model in models:
            kfold = model_selection.KFold(n_splits=10, random_state=seed)
            cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
            results.append(cv_results)
            names.append(name)
            msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
            print(msg)
         
        # Make predictions on validation dataset
        knn.fit(X_train, Y_train)
        predictions = knn.predict(X_validation)
        print("exit training")
        # print("Accuracy Score: ", accuracy_score(Y_validation, predictions), file=open('summary.txt', 'a'))
        # print("Confusion Matrix: \n", confusion_matrix(Y_validation, predictions, labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), file=open('summary.txt', 'a'))
        # print("Classification Report: \n", classification_report(Y_validation, predictions), file=open('summary.txt', 'a'))
        
        return knn

# run = learning()
# run.machineTrain()
	SnapShots.append(tmp)
print(len(SnapShots))

SnapShots = np.concatenate(SnapShots, axis=0)  # stack all snapshots along axis 0

SnapShots = SnapShots.reshape(SnapShots.shape[0], -1)  # flatten each snapshot so the data is suitable for knn
print(SnapShots.shape)  # e.g. 40 x 30000, where 30000 is the feature dimension for the nearest-neighbour calculation

labels = np.repeat(labels, 10)

labels = labels.reshape(labels.shape[0], -1)  # e.g. 40 x 1: a single label column that can be attached to the snapshots

dataset = np.hstack((SnapShots, labels))  # append the label column to the feature matrix

knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(dataset[:,:-1],dataset[:,-1])

cap=cv2.VideoCapture(0)

Cascadeclassifier=cv2.CascadeClassifier("haarcascade_frontalface_default.xml")

while True:

	ref,frame=cap.read()

	if not ref :
		continue

	faces=Cascadeclassifier.detectMultiScale(frame,1.3,5)

def train_model(feats_csv):

	df = pd.DataFrame()
	df = pd.read_csv(feats_csv).iloc[:,1:]

	y = np.ravel(df.iloc[:,-1:])
	X = np.array(df.iloc[:,:-1])

	############ 15 Best selected features using ANOVA F-value score function ###############
	X_new = SelectKBest(f_classif, k=15).fit_transform(X, y)
	selected_features = SelectKBest(f_classif, k=15).fit(X, y).get_support(indices = True)

	############ KNN manhattan ###############
	##### preprocessing: data scaling######## 
	min_max_scaler = MinMaxScaler()
	X_new = min_max_scaler.fit_transform(X_new)

	model = KNeighborsClassifier(n_neighbors = 1,algorithm = 'brute',metric = 'manhattan',weights = 'uniform')
	model.fit(X_new,y)

	newdir = '../kNN_clfr'
	os.makedirs(newdir, exist_ok=True)  # os.mkdir raises if the directory already exists

	joblib.dump(model, os.path.join(newdir,'kNN.pkl')) 

	return