Example #1
def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []
    # Vectorize the text (a tf-idf variant is kept commented out below)
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True, stop_words='english')
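    # Note: HashingVectorizer is stateless, so transform() works without a prior
    # fit; the commented-out TfidfVectorizer would need fit_transform on train_set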
    # List of topics extracted from the text (feature names are only available with TfidfVectorizer)
    # feature_names = vector.get_feature_names()
    # print feature_names
    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select the k best features using the chi-squared test
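    # (chi2 requires non-negative feature values, hence non_negative=True above)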
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)
    final = []
    for i in xrange(len(result)):
        if result[i]:
            final.append(src[i])
    print len(final)
    return final
Example #2
def main(output=RESULTS1B):
    """
    Using 1 nearest neighbor, predicts NYC Taxi trip times based on feature
    vectors (pickup latitude, pickup longitude, dropoff latitude, dropoff longitude).

    Tests on a subset of trip_data_1.csv

    Uses sklearn to implement nearest neighbors
    """
    features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 
               'dropoff_longitude', 'trip_time_in_secs']

    ## Extract necessary data into pandas dataframes
    numrows = 100000
    df_train_read = pd.read_csv(TRAIN_DATA)
    df_test_read = pd.read_csv(TRIP_DATA_1, nrows = numrows)    # first 100k rows, for speed
    df_test = df_test_read[features].dropna()
    df_train = df_train_read[features].dropna() 


    ## Use sklearn to run nearest neighbors
    k = 1 
    clf = KNeighborsClassifier(n_neighbors=k)                   # default distance metric: euclidean
    clf.fit(df_train[features[0:4]], df_train[features[-1]])
    preds = clf.predict(df_test[features[0:4]])

    # Calculate statistics (Root Mean Squared Error, Correlation Coefficient, Mean Absolute Error)
    print "Calculating statistics"
    with open(output, "a+") as outputFile:
        outputFile.write("Ran knn with k={}".format(k) + \
            " Trained on {}. Tested on first".format(TRAIN_DATA) + \
            " {} rows of {}. Stats:".format(numrows, TRIP_DATA_1))
    calcAndLogStats( numpy.array(preds), 
                     numpy.array(df_test[features[-1]]), 
                     output=output)
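
A hedged sketch of the statistics that calcAndLogStats presumably computes (that helper is defined elsewhere in the project; the name calc_stats and its signature below are illustrative assumptions):

import numpy

def calc_stats(preds, actual):
    # Root Mean Squared Error
    rmse = numpy.sqrt(numpy.mean((preds - actual) ** 2))
    # Correlation Coefficient
    corr = numpy.corrcoef(preds, actual)[0, 1]
    # Mean Absolute Error
    mae = numpy.mean(numpy.abs(preds - actual))
    return rmse, corr, mae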
Example #3
File: knn.py  Project: kbai/uss
def main():
    print("k nearest neighbours classifier!")

    X,Y,Xtest = importdata()
    print(Y.shape)
    param_grid={
            "n_neighbors":[10,20,50,100,200],
            "algorithm":['auto','ball_tree','kd_tree','brute'],
            "weights":['uniform','distance']
            }

    knn = KNeighborsClassifier() 
    Gridsearch_impl(X,Y,knn,param_grid,5)

#    for i in range(10,11,5):
#        clf = DecisionTreeClassifier(min_samples_split=i)
#        rf = RandomForestClassifier(n_estimators = 100,random_state=0,min_samples_split=i)
#        ab = AdaBoostClassifier(rf,n_estimators = 10)
        #ab = GradientBoostingClassifier(n_estimators = 100)
#        score = cross_validation.cross_val_score(ab,X,Y,cv=3)
      #  print(score)
      #  print("average score %f"%np.mean(score))
      #  print("std %f"%np.std(score))
      #  ab.fit(X,Y)
   


    Ytest = knn.predict(Xtest)
    output(Ytest,'submit3.csv')
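
Gridsearch_impl is a project helper not shown in this listing; a minimal sketch of what it presumably does, as a thin GridSearchCV wrapper (the return value is an assumption). Note that the listing later calls knn.predict on the original, never-fitted estimator, so reusing the fitted search result would be safer:

from sklearn.model_selection import GridSearchCV

def Gridsearch_impl(X, Y, estimator, param_grid, cv):
    # Exhaustively search the parameter grid with cv-fold cross-validation
    search = GridSearchCV(estimator, param_grid, cv=cv)
    search.fit(X, Y)
    print("best score: {}".format(search.best_score_))
    print("best params: {}".format(search.best_params_))
    return search.best_estimator_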
Example #4
    def fit(self, X, y):
        """
        Fit the model according to the given training data

        Parameters
        ----------
        X: {array-like}, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features

        y: array-like, shape (n_samples,)
            Target vector relative to X.

        Returns
        -------
        self : object
            return self.
        """
        clf = KNeighborsClassifier(n_neighbors=self.n_neighbors,
                                   p = self.p,
                                   weights = self.weights,
                                   algorithm="kd_tree")

        if self.scaler:
            X = self.scaler.transform(X)

        self.model = clf.fit(X, y)

        # self.print_coefficients()

        self.num_class = len(np.unique(y))

        return self
Example #5
    def _evaluate_projection(self, x, y):
        """
        kNNEvaluate - evaluate class separation in the given projection using a k-NN method
        Parameters
        ----------
        x - variables to evaluate
        y - class

        Returns
        -------
        scores
        """
        if self.percent_data_used != 100:
            rand = np.random.choice(len(x), int(len(x) * self.percent_data_used / 100),
                                    replace=False)
            x = x[rand]
            y = y[rand]
        neigh = KNeighborsClassifier(n_neighbors=3) if self.attr_color.is_discrete else \
            KNeighborsRegressor(n_neighbors=3)
        assert ~(np.isnan(x).any(axis=None) | np.isnan(y).any(axis=None))
        neigh.fit(x, y)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            scores = cross_val_score(neigh, x, y, cv=3)
        return scores.mean()
Example #6
def kppv_histo():
    "Interpret images as color histograms and classify them with k-nearest neighbors"
    best = np.zeros(5)    
    
    _, data, target, _ = utils.chargementHistogrammesImages(mer,ailleurs,1,-1)
    X_train,X_test,Y_train,Y_test=train_test_split(data,target,test_size=0.3,random_state=random.seed())
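    # Note: random.seed() returns None, so random_state above is effectively None;
    # `iterations` in the loop below never reaches the classifier (k-NN has no
    # training iterations), so it only repeats identical fits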
    
    for iterations in range(250,1000,250):
        for n in range(2,12,2):
            for param in range (1,3):
                start_time = time.time()
                kppv = KNeighborsClassifier(n_neighbors=n, p=param, n_jobs=-1)
                
                x1=np.array(X_train)
                x2=np.array(X_test)
                    
                kppv.fit(X=x1, y=Y_train)
                score = kppv.score(x2,Y_test)
                    
                end_time = time.time()
                if score>best[0]:
                    best[0] = score
                    best[1] = iterations
                    best[2] = n
                    best[3] = param
                    best[4] = end_time-start_time
    
    print("| K plus proches voisins          | V.Histo    | n={:1.0f} param={:1.0f} iterations={:1.0f}         | {:10.3f}ms | {:1.3f} |".format(best[2],best[3],best[1],best[4]*1000,best[0]))
Example #7
def kann_classify(train_data,train_label,test_data):
      
    knnClf=KNeighborsClassifier(n_neighbors=5)
    knnClf.fit(train_data,ravel(train_label))  
    test_label=knnClf.predict(test_data)  
    save_result(test_label,'sklearn_knn_Result.csv')  
    return test_label  
Example #8
def knn_accuracy(trn_data, trn_labels, tst_data, tst_labels, k_neighbors):

    knn = KNeighborsClassifier(k_neighbors)
    knn.fit(trn_data, trn_labels)
    results = knn.predict(tst_data)

    return np.sum(tst_labels == results)/float(tst_labels.size)
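
The manual accuracy above is equivalent to sklearn's built-ins, e.g. knn.score(tst_data, tst_labels) or sklearn.metrics.accuracy_score(tst_labels, results).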
Example #9
def kppv_vecteur():
    "Interpret images as pixel vectors and classify them with k-nearest neighbors"
    best = np.zeros(6)    
    
    for npix in range(50,200,50):
        _, data, target, _ = utils.chargementVecteursImages(mer,ailleurs,1,-1,npix)
        X_train,X_test,Y_train,Y_test=train_test_split(data,target,test_size=0.3,random_state=random.seed())
        
        for iterations in range(250,1000,250):
            for n in range(2,12,2):
                for param in range (1,3):
                    start_time = time.time()
                    kppv = KNeighborsClassifier(n_neighbors=n, p=param, n_jobs=-1)
                    
                    x1=np.array(X_train)
                    x1 = np.reshape(x1, (x1.shape[0],x1.shape[2]))
                    x2=np.array(X_test)
                    x2 = np.reshape(x2, (x2.shape[0],x2.shape[2]))
                        
                    kppv.fit(X=x1, y=Y_train)
                    score = kppv.score(x2,Y_test)
                        
                    end_time = time.time()
                    if score>best[0]:
                        best[0] = score
                        best[1] = iterations
                        best[2] = n
                        best[3] = param
                        best[4] = end_time-start_time
                        best[5] = npix
    
    print("| K plus proches voisins         | V.Pix {:4.0f} | n={:1.0f} param={:1.0f} iterations={:1.0f}           | {:10.3f}ms | {:1.3f} |".format(best[5],best[2],best[3],best[1],best[4]*1000,best[0]))
Example #10
    def analyze_image(self):
        '''
        Load the image and analyze it with KNN

        im_file - pre-processed with histogram specification
        '''

        if self._avg_pixels.size == 0:
            self._process_annotations()        
            self._get_initial_classes()
        
        im = self._image
        rows = im.shape[0]

        clf = KNeighborsClassifier(n_neighbors = self._n_neighbors)
        clf.fit(self._avg_pixels, self._labels)

        im_1d = im.reshape(-1, 3)

        # calculate prediction reshape into image
        prediction = clf.predict(im_1d)
        prediction = prediction.reshape(rows, -1)

        prediction[self._mask == 0] = Labels.Masked
        self.display_current(prediction)
        return prediction
Example #11
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'], weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
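    # argsort sorts ascending; reversing and keeping the first 3 columns yields
    # the three most probable place_ids per row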
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
Example #12
def run(class_num, subsample_size, cluster_num, window_size, method='knn', n_nb=2):
    
    # Load the data with the given patch size (3 means 3*3 = 9 values per patch)
    # and return a dictionary containing: 'data' (one patch), 'target' (the sample
    # this patch belongs to) and 'filename' (the file it comes from)
    bofs = []
    lable = []
    filename = "%s/TRAIN_VLAD_%d_%d_%d_%d.txt" %(vlad_accee , class_num , subsample_size , window_size , cluster_num)        
    bofs , lable = get_vlad(filename)

    #knn_init = KNeighborsClassifier()
    #parameters = {'n_neighbors':[ 5, 10 , 15]}
    #knn = grid_search.GridSearchCV(knn_init, parameters)

    bofs_test = []
    lable_test = []
    filename = "%s/TEST_VLAD_%d_%d_%d_%d.txt" %(vlad_accee , class_num , subsample_size , window_size , cluster_num)
    bofs_test , lable_test = get_vlad(filename)


    start = time.time()    
    if(method == "knn"):
        knn = KNeighborsClassifier(n_neighbors = n_nb)
        knn.fit(bofs, lable)
        predicted = knn.predict(bofs_test)
        score = knn.score(bofs_test,lable_test)
   
    print(time.time()-start) 

    return score  
Example #13
 def BuildModel(self, data, labels):
     # Create and train the classifier.
     knc = KNeighborsClassifier(
         n_neighbors=self.n_neighbors, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric
     )
     knc.fit(data, labels)
     return knc
Example #14
def train(x_train, y_train):

    # reg = LinearRegression()
    reg = KNeighborsClassifier()
    reg.fit(x_train, y_train)

    return reg
Example #15
def main():
    # obtain the number of features in the dataset
    with open('../data/test_lung_s3.csv', 'rb') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            num_columns = len(row)
            break
    print num_columns
    # load data
    mat = np.loadtxt('../data/test_lung_s3.csv', delimiter=',', skiprows=1, usecols=range(0, num_columns))
    X = mat[:, 1:num_columns]  # data
    y = mat[:, 0]  # label
    X = X.astype(float)
    y = y.astype(float)
    n_samples, n_features = X.shape

    # using 10 fold cross validation
    cv = KFold(n_samples, n_folds=10, shuffle=True)

    # evaluation
    n_features = 100
    neigh = KNeighborsClassifier(n_neighbors=1)
    acc = 0

    for train, test in cv:
        idx = svm_backward.svm_backward(X[train], y[train], n_features)
        print idx
        X_selected = X[:, idx]
        neigh.fit(X_selected[train], y[train])
        y_predict = neigh.predict(X_selected[test])
        acc_tmp = accuracy_score(y[test], y_predict)
        print acc_tmp
        acc += acc_tmp
    print 'ACC', float(acc)/10
Example #16
def exercise_2b():
    X, y = make_blobs(n_samples=1000,centers=50, n_features=2, random_state=0)
    kf = ShuffleSplit(100, train_size= 0.9, test_size=0.1, random_state=0)
    # kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1,50):
        iterator = 0
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test,y_test))
            iterator+=1
            print mean_squared_error(y_test, clf.predict(X_test))
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        accuracy_lst[k-1, 1] = accuracy_current.var()#*2 #confidence interval 95%
    x = np.arange(1,50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 1], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K')
    plt.ylabel('Variance')
    plt.show()
Example #17
def main_process():
    data_dict = parse_txt()
    x_data, y_data, places_cnt, path_int_dict = build_x_y_data(data_dict)
    print 'data counts', len(x_data), len(y_data)
    print 'zone names counts', places_cnt
    print 'path counts', len(path_int_dict)

    # start to train, change list type to numpy.array
    x_data = np.array(x_data)
    y_data = np.array(y_data)

    knn = KNeighborsClassifier()

    indices = np.random.permutation(len(x_data))
    x_train = x_data
    y_train = y_data
    x_test = x_data[indices[-TEST_DATA_ROWS:]]
    y_test = y_data[indices[-TEST_DATA_ROWS:]]
    knn.fit(x_train, y_train)  # work

    test_result = knn.predict(x_test)  # test
    proba_test_result = knn.predict_proba(x_test)

    # no duplicate value, so reverse this dictionary
    int_path_dict = dict(zip(path_int_dict.values(), path_int_dict.keys()))

    print 'predict result:', test_result
    print [int_path_dict[x] for x in test_result]  # test result
Example #18
def exercise_1():
    X, y = make_blobs(n_samples=1000,centers=50, n_features=2, random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False, random_state=None)
    # kf = cross_validation.ShuffleSplit(1000,n_iter=25, test_size=0.1, train_size=0.9, random_state=None)

    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1,50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append( zero_one_loss(y_test, clf.predict(X_test)) )


            # error.append(clf.predict(X_test))
            # error.append( 1. - clf.score(X_test, y_test) ) #, accuracy_score(y_test, clf.predict(X_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
            # error.append()
        # print error
        error_total[k-1, 0] = np.array(error).mean()
    # print error_total
    x = np.arange(1,50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
Example #19
def exercise_2a():
    X, y = make_blobs(n_samples=1000,centers=50, n_features=2, random_state=0)
    # plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    # plt.show()
    kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1,50):
        iterator = 0
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test,y_test))
            iterator+=1
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        # accuracy_lst[k-1, 1] = accuracy_current.std() #confidence interval 95%
    x = np.arange(1,50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
Example #20
    def build_model():
        """ request cip and skill data from DataUSA and develop predictive model using scikit-learn

        :return: fit model ready to accept user input to make a prediction
        """

        # request data on college majors and relevant skills
        r = requests.get(r'http://api.datausa.io/api/?show=skill&sumlevel=all')
        data_usa = r.json()

        headers = data_usa['headers']
        data = data_usa['data']

        df = pd.DataFrame(data, columns=headers)
        df.drop('value_rca', axis=1, inplace=True)

        # reshape data so that each skill becomes a single column (i.e. feature for the model)
        pivot = df.pivot_table(index='cip', columns='skill', values='value')
        pivot = pivot.reset_index()

        X = pivot.drop('cip', axis=1)  # feature matrix
        y = pivot.cip  # response

        knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
        knn.fit(X, y)

        return knn
Example #21
def process_one_cell(df_cell_train, df_cell_test):
    
    #Working on df_train
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask]
    
    #Working on df_test
    row_ids = df_cell_test.index
    
    #Feature engineering on x and y
    df_cell_train.loc[:,'x'] *= 500.0
    df_cell_train.loc[:,'y'] *= 1000.0
    df_cell_test.loc[:,'x'] *= 500.0
    df_cell_test.loc[:,'y'] *= 1000.0
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, 
                               metric='manhattan')
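    # Note: `weights` accepts a callable (here calculate_distance, defined
    # elsewhere in this script) mapping an array of distances to an array of weights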
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
    
    return pred_labels, row_ids
Example #22
    def onstartButton(self):

        cap = cv2.VideoCapture(str(self.file_name))

        if self.isfileWorking == False and self.ishasFile == True:
            self.ishasFile = False
            self.startButton.setText("Close")

            # cap = cv2.VideoCapture(str(self.file_name))

            self.isfileWorking = True
            data=spio.loadmat("openface_fea.mat")
            X=data['feature']
            id=data['id'].astype(int)-1
            Y=id[0,:]
            name=list(set(data['names']))
            name.sort()
            print("***Train knn classifier***")
            knn=KNeighborsClassifier(n_neighbors=20,weights='distance',p=2)
            knn.fit(X,Y)

            success,frame = cap.read()

            while success and self.isfileWorking:
                start = time.time()
                success, frame = cap.read()
                
                if success:
                    img=frame.copy()
                   
                    bb,rep=getRep(img)
                    if bb is None:
                        print "Can't find any face in this picture"
                    else:
                        if rep == 0:
                            print "Get rep failed..."
                        else:
                            rep=np.reshape(rep,(1,128))
                            idx=knn.predict(rep)
                            # print("label is {} ".format(idx))
                            proba=knn.predict_proba(rep)
                            actor = name[int(idx[0])]
                            self.namelineEdit.setText(actor)
                            self.timelineEdit.setText(str(round(time.time()-start,3)))
                            self.confidencelineEdit.setText(str(round(max(proba[0]),2)))
                            # print("Proba is {} ".format(proba))
                            
                            

                            draw_dlib_rects(frame,bb,actor,(0,255,0))
                    image = QtGui.QImage(frame.data, frame.shape[1], frame.shape[0], QtGui.QImage.Format_RGB888).rgbSwapped()
                    pixmap = QtGui.QPixmap.fromImage(image)
                    self.showlabel.setPixmap(pixmap)
                    k = cv2.waitKey(5)
        else:
            self.ishasFile = False
            self.startButton.setText("Start")
            self.isfileWorking = False
            cap.release()
            self.showlabel.clear()
Example #23
def kNearestNeighbors(features_test, features_train, labels_test, labels_train):
    print "Using K Nearest Neighbors"
    # Set up the classifier
    clf = KNeighborsClassifier(algorithm='ball_tree', n_jobs=-1, weights='distance')

    # Time the fit
    t0 = time()

    # Fit the training data
    clf = clf.fit(features_train, labels_train)

    print "Training Time: ", round(time() - t0, 3), "s"

    # Reset the timer for prediction
    t0 = time()

    # Predict using the test data
    nbrs_predict = clf.predict(features_test)

    print "Prediction Time: ", round(time() - t0, 3), "s"

    nbrs_acc = accuracy_score(nbrs_predict, labels_test)

    print "Accuracy: ", nbrs_acc
    return clf
Example #24
def build_classifier(images, labels):
    # Build the classifier: call sklearn to construct and fit it,
    # and return sklearn's fitted model.
    classifier = KNN(n_neighbors=3,weights='distance')
    classifier.fit(images, labels)
    return classifier
Example #25
def train():
    data = Prediction.objects.filter(predict=False)
    df = pd.DataFrame(list(data.values()))
    users = [o.user for o in data]
    df['age'] = pd.DataFrame(list([user.userprofile.age for user in users]))
    for x in ['id', 'base_personal', 'base_general', 'predict', 'created', 'user_id', 'training_id']:
        df = df.drop(x, axis=1)

    y = df['next_level']
    df = df.drop('next_level', axis=1)
    #neigh = svm.SVC()
    neigh = KNeighborsClassifier(n_neighbors=2)
    neigh.fit(df, y) 

    #kf = KFold(len(ft[features]), n_folds=10)
    kf = ShuffleSplit(len(df), n_iter=K, test_size=test_size, random_state=0)
    # score is accuracy here 

    accuracy = cross_val_score(neigh, df, y, cv=kf)
    batch = Training.objects.create(training_accuracy=sum(accuracy[:(1-test_size)*K])/K/(1-test_size),
                                    sample_size=len(df.index),
                                    fold=K,
                                    subset_accuracy=json.dumps(accuracy.tolist()),
                                    test_accuracy=sum(accuracy[(1-test_size)*K:])/K/test_size
                                    )
    Prediction.objects.filter(predict=False).update(training=batch)
    
    if not os.path.exists('./models'):
            os.makedirs('./models')

    joblib.dump(neigh, './models/model.pkl')
                      
    
Example #26
def neighborsPrediction(train_dfs, targetLabels, fold_cv):
    scoresNeighbor = [0.0]
    n_neighbors = 0

    for i in range(1, 10):
        neighbor, instances_train, instances_test, target_train, target_test, scoresNeighborTmp = testScore(train_dfs,
                                                                                                            targetLabels,
                                                                                                            fold_cv,
                                                                                                            i * 2)
        if sum(scoresNeighborTmp) / len(scoresNeighborTmp) > sum(scoresNeighbor) / len(scoresNeighbor):
            scoresNeighbor = scoresNeighborTmp
            n_neighbors = i * 2
            # print(sum(scoresNeighborTmp)/len(scoresNeighborTmp))

    neighbor = KNeighborsClassifier(n_neighbors)
    neighbor.fit(train_dfs, targetLabels)

    instances_train, instances_test, target_train, target_test = cross_validation.train_test_split(train_dfs,
                                                                                                   targetLabels,
                                                                                                   test_size=0.4,
                                                                                                   random_state=0)

    predictions = neighbor.predict(instances_test)
    print("Generate random forest with: {0} neighbors".format(str(n_neighbors)))
    return neighbor, instances_train, target_train, target_test, predictions, scoresNeighbor
Example #27
def brute_force_acc_rd(features_train, labels_train, features_test, labels_test, ids):

    clf = KNeighborsClassifier(
        n_neighbors=100,
        )

    clf = clf.fit(features_train, labels_train)
    # print(clf.best_estimator_)
    pred = clf.predict(features_test)
    acc = accuracy_score(labels_test, pred)
    #print pred
    print acc

    if(acc > 0.8):
        print ("Acc: {} ").format(acc)


    if(acc > 0.831):
        data_train.to_csv("data_train{}.tst".format(round(acc,5)), "\t")
        predictions_file = open("data/canivel_knn_{}.csv".format(round(acc, 5)), "wb")
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["PassengerId", "Survived"])
        predictions_file_object.writerows(zip(ids, pred))
        predictions_file.close()
        print ("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  NEW FILE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! YEA!!!!")
    return acc
Example #28
def main():
    print '[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S'))
    testing_file = file('test.p', 'r')
    training_file = file('train.p', 'r')

    train = pickle.load(training_file)
    test = pickle.load(testing_file)

    testing_file.close()
    training_file.close()
    
    trainX = train[:,:-1]
    trainy = train[:,-1]
    
    testX = test[:,:-1]
    testy = test[:,-1]

    print '[INFO, time: %s] Downsampling ...' % (time.strftime('%H:%M:%S'))
    trainX = downsample_features(trainX)
    testX = downsample_features(testX)

    trainX, testX = normalize(trainX, testX)

    print '[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), '50 - Neighbors')
    clf = KNeighborsClassifier(n_neighbors=50)
    clf.fit(trainX, trainy)

    print '[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S'))
    prediction = clf.predict(testX)
    print '[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'),accuracy_score(testy, prediction))
Example #29
def knn(depth, joints, C, visualize=False, zScale=1.0):
    pts_world, labels = joints2skeleton(joints)

    pts_world[:, 2] *= zScale
    classifier = KNeighborsClassifier(n_neighbors=nNeighbors)
    classifier.fit(pts_world, labels)

    X = np.vstack((np.nonzero(depth)[1], np.nonzero(depth)[0]))
    X = np.vstack((X, depth[depth != 0]))
    X_world = pixel2world(X.T, C)
    X_world[:, 2] *= zScale
    predicts = classifier.predict(X_world)

    perPixelLabels = -np.ones(depth.shape)
    perPixelLabels[depth != 0] = predicts

    img = np.zeros((H, W, 3), np.uint8)
    for i in range(nJoints):
        img[perPixelLabels == i] = palette[i]

    skel = None
    if visualize is True:
        # foreground = visualizePts(world2pixel(pts_world, C), labels)
        # img[foreground != 0] = foreground[foreground != 0]
        skel = visualizePts(world2pixel(pts_world, C), labels)

    return (img, X_world, predicts, skel)
Example #30
def blend_models(n_folds, train_data, train_labels, holdout, test_data, test_mode):
	"""
	Function which performs the blending procedure explained below:

	Step 1) initialize classifiers to use in the blending task as clfs variable (add classifiers to that variable)
	TODO: extract it out into config

	Step 2) split training data into kfolds

	Step 3) 
	Do for every classifier:
		Do for every fold:
			train each classifier in blender on the kth training fold and do the following:
				a) predict probabilities of the kth "test" fold only
				b) append predictions to holdout set "dataset_blend_holdout_j" for classifier trained on that fold only
				c) append predictions to test set "dataset_blend_test_j" for classifier trained on that fold only

		When all folds are finished processing take a mean of predictions generated for the classifier trained on different folds
		for both dataset_blend_holdout_j and dataset_blend_test_j and append mean values to dataset_blend_holdout, dataset_blend_test

	Args:
		n_folds: number of folds in the blender
		train_data: training data
		train_labels: true labels for training data
		holdout: holdout set
		test_data: test data set to generate final predictions on
		test_mode: this is the debug mode (it uses only one classifier in the blender)

	Returns:
		dataset_blend_train: blended training data set based on above procedure
		dataset_blend_holdout: blended holdout set based on above procedure
		dataset_blend_test: blended test set based on above procedure

	"""

	np.random.seed(0) # seed to shuffle the train set

	shuffle = False
	X = train_data
	y = train_labels.ravel()
	X_submission = holdout

	if shuffle:
		idx = np.random.permutation(y.size)
		X = X[idx]
		y = y[idx]
	skf = list(cross_validation.KFold(len(y), n_folds))

	if test_mode:
		clfs = [KNeighborsClassifier(weights="uniform", n_jobs=-1)]
	else:
		clfs = [KNeighborsClassifier(weights="uniform", n_jobs=-1),
			KNeighborsClassifier(weights="distance", n_jobs=-1),
			SVC(),
			RandomForestClassifier(n_estimators=250, n_jobs=-1, criterion='gini'),
			RandomForestClassifier(n_estimators=250, n_jobs=-1, criterion='entropy'),
			ExtraTreesClassifier(n_estimators=250, n_jobs=-1, criterion='gini'),
			ExtraTreesClassifier(n_estimators=250, n_jobs=-1, criterion='entropy'),
			GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50),
			discriminant_analysis.LinearDiscriminantAnalysis(),
			discriminant_analysis.QuadraticDiscriminantAnalysis()]
			#MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(200,), verbose=False, random_state=55),
			#AdaBoostClassifier(_ADABOOST_BASE_ESTIMATOR_, n_estimators=_ADABOOST_NUM_ESTIMATORS_, algorithm=_ADABOOST_LALGO_, learning_rate=_ADABOOST_LEARNING_RATE_)]

	print "Creating train and test sets for blending."

	dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
	dataset_blend_holdout = np.zeros((X_submission.shape[0], len(clfs)))
	dataset_blend_test = np.zeros((test_data.shape[0], len(clfs)))

	for j, clf in enumerate(clfs):
		print "Classifier no: ", j + 1
		print clf
		dataset_blend_holdout_j = np.zeros((X_submission.shape[0], len(skf)))
		dataset_blend_test_j = np.zeros((test_data.shape[0], len(skf)))
		for i, (train, test) in enumerate(skf):
			print "====Fold", i
			X_train = X.iloc[train]
			y_train = y[train]
			X_test = X.iloc[test]
			y_test = y[test]
			clf.fit(X_train, y_train)
			y_submission = clf.predict_proba(X_test)[:,1]
			dataset_blend_train[test, j] = y_submission
			dataset_blend_holdout_j[:, i] = clf.predict_proba(X_submission)[:,1]
			dataset_blend_test_j[:, i] = clf.predict_proba(test_data)[:,1]
		dataset_blend_holdout[:,j] = dataset_blend_holdout_j.mean(1)
		dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

	return pd.DataFrame(dataset_blend_train), pd.DataFrame(dataset_blend_holdout), pd.DataFrame(dataset_blend_test)
Example #31
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from examples.load_wine import load_wine
from selexor.random_forest import RFSelector

# we will use Wine dataset for demonstration
x_train_std, x_test_std, y_train, y_test = load_wine()

# let's create a classifier and calculate accuracy score
knn = KNeighborsClassifier(n_jobs=-1)

knn.fit(x_train_std, y_train)
y_pred = knn.predict(x_test_std)
print(
    f'Accuracy score before RFSelector: {accuracy_score(y_pred=y_pred, y_true=y_test)}'
)

# now, let's create RFSelector instance and use fit_transform and transform methods to fit the dataset and transform
# samples
rf = RFSelector(n_components=2,
                estimator_params={
                    'max_depth': 3,
                    'n_jobs': -1
                })
x_train_rf = rf.fit_transform(x_train_std, y_train)
x_test_rf = rf.transform(x_test_std)

# let's fit the classifier on new data and calculate accuracy score again
knn.fit(x_train_rf, y_train)
y_pred = knn.predict(x_test_rf)
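# the listing presumably ends by reporting the new score, mirroring the print above:
print(
    f'Accuracy score after RFSelector: {accuracy_score(y_pred=y_pred, y_true=y_test)}'
)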
Example #32
def knncls():
    """
    K-近邻预测用户签到位置
    :return:None
    """
    # Load the data
    data = pd.read_csv("./data/FBlocation/train.csv")

    print(data.head(10))

    # Preprocess the data
    # 1. Shrink the dataset by filtering with a query
    data = data.query("x > 1.0 &  x < 1.25 & y > 2.5 & y < 2.75")

    # Process the timestamp data
    time_value = pd.to_datetime(data['time'], unit='s')

    print(time_value)

    # Convert the datetimes into a DatetimeIndex
    time_value = pd.DatetimeIndex(time_value)

    # Engineer some features
    data['day'] = time_value.day
    data['hour'] = time_value.hour
    data['weekday'] = time_value.weekday

    # Drop the raw timestamp feature
    data = data.drop(['time'], axis=1)

    print(data)

    # Drop target places with fewer than n check-ins
    place_count = data.groupby('place_id').count()

    tf = place_count[place_count.row_id > 3].reset_index()

    data = data[data['place_id'].isin(tf.place_id)]

    # Extract the features and the target from the data
    y = data['place_id']

    x = data.drop(['place_id'], axis=1)

    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering (standardization)
    std = StandardScaler()

    # Standardize the features of the training and test sets
    x_train = std.fit_transform(x_train)

    x_test = std.transform(x_test)

    # Run the algorithm  # hyperparameters
    knn = KNeighborsClassifier()

    # # fit, predict,score
    knn.fit(x_train, y_train)

    # # Get the predictions
    y_predict = knn.predict(x_test)
    #
    # print("Predicted check-in locations:", y_predict)
    #
    # # Accuracy
    # print("Prediction accuracy:", knn.score(x_test, y_test))

    # Build a grid of parameter values to search
    param = {"n_neighbors": [3, 5, 10]}

    # Run a grid search
    gc = GridSearchCV(knn, param_grid=param, cv=2)

    gc.fit(x_train, y_train)

    # Prediction accuracy
    print("Accuracy on the test set:", gc.score(x_test, y_test))

    print("Best cross-validation score:", gc.best_score_)

    print("Best model selected:", gc.best_estimator_)

    print("Cross-validation results per hyperparameter setting:", gc.cv_results_)

    return None
Example #33
#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(features_train, labels_train)

pred = neigh.predict(features_test)

from sklearn.metrics import accuracy_score
print accuracy_score(pred, labels_test)

########################
### adaboost algorithm
########################
from time import time
from sklearn.ensemble import AdaBoostClassifier
print "-:: adaboost ::------------------"
t0 = time()
adab = AdaBoostClassifier(n_estimators=100, learning_rate=1)
Example #34
def fitness(individual, granulationBucket, trEmbeddBucket, vsEmbeddBucket,
            TRindices, VSindices, TRlabels, VSlabels):

    Q = individual[0]
    wNSub = individual[1]
    wNIns = individual[2]
    wNDel = individual[3]
    wESub = individual[4]
    wEIns = individual[5]
    wEDel = individual[6]
    tau = individual[7]
    eta = individual[8]

    Repr = Medoid

    #Setting GED
    graphDist = BMF(nodeDissimilarity, edgeDissimilarity)
    graphDist.nodeSubWeight = wNSub
    graphDist.nodeInsWeight = wNIns
    graphDist.nodeDelWeight = wNDel
    graphDist.edgeSubWeight = wESub
    graphDist.edgeInsWeight = wEIns
    graphDist.edgeDelWeight = wEDel

    #Setting granulation strategy
    granulationStrategy = BsasBinarySearch(graphDist, Repr, 0.1)
    granulationStrategy.BsasQmax = Q
    granulationStrategy.eta = eta
    granulationStrategy.symbol_thr = tau
    #Setup embedder
    embeddingStrategy = SymbolicHistogram(Dissimilarity=graphDist,
                                          isSymbolDiss=False,
                                          isParallel=False)

    #Start granulation
    granulationStrategy.granulate(granulationBucket)
    #retrieving alphabet
    alphabet = granulationStrategy.symbols

    if alphabet:

        #Embedded with current symbols
        # embeddingStrategy.getSet(trEmbeddBucket, alphabet)
        # TRembeddingMatrix = np.asarray(embeddingStrategy._embeddedSet)
        # TRpatternID = embeddingStrategy._embeddedIDs

        ##Debug
        embeddingStrategy.getSetDebug(trEmbeddBucket, alphabet, TRindices)
        TRembeddingMatrix = np.asarray(embeddingStrategy._embeddedSet)
        TRpatternID = embeddingStrategy._embeddedIDs
        print(
            np.all(
                np.asarray(TRlabels) == np.asarray(
                    embeddingStrategy._embeddedClass)))
        ##

        # embeddingStrategy.getSet(vsEmbeddBucket, alphabet)
        # VSembeddingMatrix = np.asarray(embeddingStrategy._embeddedSet)
        # VSpatternID = embeddingStrategy._embeddedIDs
        embeddingStrategy.getSetDebug(vsEmbeddBucket, alphabet, VSindices)
        VSembeddingMatrix = np.asarray(embeddingStrategy._embeddedSet)
        VSpatternID = embeddingStrategy._embeddedIDs
        print(
            np.all(
                np.asarray(VSlabels) == np.asarray(
                    embeddingStrategy._embeddedClass)))

        #Resorting matrix for consistency with dataset
        TRorderID = np.asarray([TRpatternID.index(x) for x in TRindices])
        VSorderID = np.asarray([VSpatternID.index(x) for x in VSindices])
        TRMat = TRembeddingMatrix[TRorderID, :]
        VSMat = VSembeddingMatrix[VSorderID, :]

        #DEBUG
        # x = np.all(TRMat==TRembeddingMatrix2)
        # y = np.all(VSMat==VSembeddingMatrix2)
        # print(x,y)
        ##

        classifier = KNN()
        classifier.fit(TRMat, TRlabels)
        predictedVSLabels = classifier.predict(VSMat)

        # classifier.fit(TRembeddingMatrix,TRlabels)
        # predictedVSLabels = classifier.predict(VSembeddingMatrix)
        accuracyVS = sum(predictedVSLabels == VSlabels) / len(VSlabels)

        print("Accuracy VS = {}".format(accuracyVS))

        #Minimisation problem
        indFit = 0.9 * (1 - accuracyVS) + 0.1 * (len(alphabet) /
                                                 len(granulationBucket))

    else:

        print("Empty alphabet. Penalizing fitness with worst fitness")
        indFit = 1

    fitness = indFit

    return fitness,
Example #35
#-*- coding=utf-8 -*-
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
iris = load_iris()

print iris.DESCR
x_train, x_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.25,
                                                    random_state=666)
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)
knnc = KNeighborsClassifier()
knnc.fit(x_train, y_train)
predict = knnc.predict(x_test)
print 'The accuracy of KNN classifier is', knnc.score(x_test, y_test)

print metrics.classification_report(y_test, predict)
print metrics.confusion_matrix(y_test, predict)
data = 1
# with    open(r"C:\Users\hanghang\Desktop\hh_practice.csv") as file:
#     data = 2
# print data
Example #36
    'female', 'male', 'male'
]

testX = [[175, 63, 43], [180, 69, 44], [162, 54, 38]]
testY = ['male', 'male', 'female']

#classification tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)

#classification GaussianNB
clf2 = GaussianNB()
clf2.fit(X, Y)

#classification neighbors
clf3 = KNeighborsClassifier(n_neighbors=3, algorithm='auto')
clf3.fit(X, Y)

#classification SVC
clf4 = SVC()
clf4.fit(X, Y)

alt = input("Ingresa tu altura: ")
peso = input("Ingresa tu peso: ")
talla = input("Ingresa tu talla de zapato: ")

prediction = clf.predict([[int(alt), int(peso), int(talla)]])
points = clf.score(testX, testY)
#190,70,43
print(prediction, "\n Decision tree accuracy:", points)
Example #37
    vector = vectorizeSentence(sentence.split())
    all_features[i, :] = vector

all_features = pd.DataFrame(all_features)
all_features['label'] = full_data['dialog_act']
all_features = all_features.dropna()

X = all_features.drop('label', axis=1)
y = all_features['label']
#y = LabelBinarizer().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.15,
                                                    random_state=101)

modelsToTest = [baselineModel, LogisticRegression(), SVC(kernel='linear'), \
                DecisionTreeClassifier(), RandomForestClassifier(n_estimators=200),\
                KNeighborsClassifier(n_neighbors=10)]

modelNames = ['Neural network', 'Logistic regression', 'Support vector machine',\
              'Decision Tree', 'Random Forest', 'KNN']

performances = []
for model, name in zip(modelsToTest, modelNames):
    print('fitting model {}'.format(name))
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
Example #38
    fscore = 2 * precision * recall/(precision + recall)
    return tp, tn, fp, fn, acc, fscore



train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
corpus = train_df['text']
toate_cuvintele = get_corpus_vocabulary(corpus)
wd2idx, idx2wd = get_representation(toate_cuvintele, 70)
data = corpus_to_bow(corpus, wd2idx)
labels = train_df['label']
test_data = corpus_to_bow(test_df['text'], wd2idx)

from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors = 7)
# write out the predictions (75.225% Kaggle score)
clf.fit(data, labels)
preds = clf.predict(test_data)
write_prediction('352_TroianStefan_submisie1.csv', preds)



# # k-fold and confusion matrix
# predictie_medie = []
# tp = []
# tn = []
# fp = []
# fn = []
# acc = []
#
Example #39
random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation]
X = X.reshape((X.shape[0], -1))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=60000,
                                                    test_size=10000)

label_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

begin = time.time()
k = KNeighborsClassifier(n_neighbors=3, metric='euclidean', n_jobs=-1)
k.fit(X_train, y_train)
k_y_pred = k.predict(X_test)
print(classification_report(y_test, k_y_pred, target_names=label_names))
end = time.time()
print("time:", end - begin)

begin = time.time()
LR = LogisticRegression(penalty='l2', solver='saga', max_iter=50, n_jobs=-1)
LR.fit(X_train, y_train)
LR_y_pred = LR.predict(X_test)
print(classification_report(y_test, LR_y_pred, target_names=label_names))
end = time.time()
print("time:", end - begin)

Example #40
def cross_validate(x_train, y_train, x_test, y_test):  # using grid search instead for now
    n_neighbors = 2
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    best_k = None
    best_score = 0
    for train_index, test_index in kf.split(x_train):
        x_train_folds, x_val_fold = x_train[train_index], x_train[test_index]
        y_train_folds, y_val_fold = y_train[train_index], y_train[test_index]
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(x_train_folds, y_train_folds)
        score = knn.score(x_val_fold, y_val_fold)
        print('K = ', n_neighbors, ':', score)
        if score > best_score:
            best_k = n_neighbors
            best_score = score
        n_neighbors += 1

    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(x_train, y_train)
    pred = knn.predict(x_test)
    final_score = knn.score(x_test, y_test)
    matrix = confusion_matrix(y_test, pred)
Example #41
from sklearn.neighbors import KNeighborsClassifier

# Generate random samples
# multivariate_normal draws samples from a multivariate normal (Gaussian) distribution
n_points = 100
X1 = np.random.multivariate_normal([1, 50], [[1, 0], [0, 10]], n_points)
X2 = np.random.multivariate_normal([2, 50], [[1, 0], [0, 10]], n_points)
# concatenate joins arrays; the default axis=0 stacks vertically, axis=1 horizontally
X = np.concatenate([X1, X2], axis=0)
y = np.array([0] * n_points + [1] * n_points)

# Train the KNN models
clfs = []
neighbors = [1, 3, 5, 9, 11, 13, 15, 17, 19]
for i in range(len(neighbors)):
    clfs.append(KNeighborsClassifier(n_neighbors=neighbors[i]).fit(X, y))

# Visualize the results
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
# meshgrid builds coordinate matrices: the first argument supplies the x range, the
# second the y range, and it returns the grid's x-coordinate and y-coordinate matrices
# see https://www.cnblogs.com/gengyi/p/9420559.html
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(15, 12))
for idx, clf, tt in zip(product([0, 1, 2], [0, 1, 2]), clfs,
                        ['KNN (k=%d)' % k for k in neighbors]):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
Example #42

# Validation Set approach
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=42)

accuracyList = {}

clf = tree.DecisionTreeClassifier(class_weight="balanced")
gaussian = GaussianNB()
logreg = LogisticRegression(class_weight="balanced")
boost = GradientBoostingClassifier()
knn = KNeighborsClassifier(n_neighbors=3)
forest = RandomForestClassifier(n_estimators=20)

models = [clf, gaussian, logreg, boost, knn, forest]

for model in models:
    accuracyList[model] = 0

for model in models:
    modelV = model.fit(X_train, Y_train)
    Y_pred = modelV.predict(X_test)
    accuracyList[model] = round(modelV.score(X_test, Y_test) * 100, 3)

#models = [clf,gaussian,logreg,boost,knn,forest]
accResult = pd.DataFrame({
    'Model': [
Example #43
sgd_score=cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

sgd_score

sgd_score.mean()

# ## Saving the model

import joblib
filename = 'sgd_clf.sav'
joblib.dump(sgd_clf, filename)

# # K nearest neighbours

from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=1,weights="uniform", metric="cosine")
knn_clf.fit(X_train_scaled,y_train)

knn_score=cross_val_score(knn_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

knn_score

knn_score.mean()

# ## Saving the model

import joblib
filename = 'knn_1_uniform_cosine.sav'
joblib.dump(knn_clf, filename)

# # Evaluating on test set
Example #44
def train_and_eval_ML(X_train,
                      X_test,
                      y_train,
                      y_test,
                      metrics_manager,
                      fold,
                      quick_test=False):
    """ Train and evulate Traditional ML classifiers from sci-kit learn

        Description: This function will train all the models on the given feature set of the X (data) for predicting y (target) and add the acquired metrics 
        to the MetricsManager object from the user

        Args: 
            X => pd.DataFrame object containing the data
            y => pd.Series object containings the target classifications
            feature_set => list of features in X to use for training
            metrics_manager => MetricsManager object (custom)

        Returns:
            None
        
        Classifer names used as keys for the manager:
                        XGBoost Classifier => xgb
                        Random Forest => rf
                        Decision Tree => dt
                        k-Nearest Neighbors => knn
                        Support Vector Machine => svm
                        Logistic Regression => lr
                        Linear Discriminant Analysis => lda
                        AdaBoost => ab
                        Naive Bayes => nb

    """
    random_state = 100
    if quick_test:
        # Random Forest Model
        rf = RandomForestClassifier(random_state=random_state)
        model_eval(rf, 'rf', fold, X_train, X_test, y_train, y_test,
                   metrics_manager)

        # XGBoost Classifier
        xgb = XGBClassifier()
        model_eval(xgb, 'xgb', fold, X_train, X_test, y_train, y_test,
                   metrics_manager)
        return

    # Random Forest Model
    rf = RandomForestClassifier(random_state=random_state)
    model_eval(rf, 'rf', fold, X_train, X_test, y_train, y_test,
               metrics_manager)

    # XGBoost Classifier
    xgb = XGBClassifier()
    model_eval(xgb, 'xgb', fold, X_train, X_test, y_train, y_test,
               metrics_manager)

    # AdaBoost Model
    ab = AdaBoostClassifier(random_state=random_state)
    model_eval(ab, 'ab', fold, X_train, X_test, y_train, y_test,
               metrics_manager)

    # Decision Tree Model
    dt = DecisionTreeClassifier(random_state=random_state)
    model_eval(dt, 'dt', fold, X_train, X_test, y_train, y_test,
               metrics_manager)

    # k-Nearest Neighbors Model
    knn = KNeighborsClassifier()
    model_eval(knn, 'knn', fold, X_train, X_test, y_train, y_test,
               metrics_manager)

    # Support Vector Machine Model
    svm = SVC(random_state=random_state)
    model_eval(svm, 'svm', fold, X_train, X_test, y_train, y_test,
               metrics_manager)

    # Logistic Regression Model
    lr = LogisticRegression(random_state=random_state)
    model_eval(lr, 'lr', fold, X_train, X_test, y_train, y_test,
               metrics_manager)

    # Linear Discriminant Analysis Model
    lda = LinearDiscriminantAnalysis()
    model_eval(lda, 'lda', fold, X_train, X_test, y_train, y_test,
               metrics_manager)

    # Naive Bayes Model
    nb = GaussianNB()
    model_eval(nb, 'nb', fold, X_train, X_test, y_train, y_test,
               metrics_manager)
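
A hedged sketch of the model_eval helper assumed above (it is not shown in this listing, and the MetricsManager interface used below is an assumption):

from sklearn.metrics import accuracy_score, f1_score

def model_eval(model, name, fold, X_train, X_test, y_train, y_test, manager):
    # Fit the classifier and predict on the held-out split
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Record this fold's metrics with the user's MetricsManager (interface assumed)
    manager.add(name, fold, accuracy_score(y_test, preds),
                f1_score(y_test, preds, average='weighted'))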
Example #45
# Feature normalization
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled= scaler.transform(X_test)

for i in range(4):
    print('before normalization: max {:.3f}, min {:.3f}'
          .format(X_train.iloc[:,i].max(), X_train.iloc[:,i].min()))
    print('after normalization: max {:.3f}, min {:.3f}'
          .format(X_train_scaled[:,i].max(), X_train_scaled[:,i].min()))
    print()

knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_scaled, y_train_binary)
y_pre = knn.predict(X_test_scaled)

# From the plot, the SVM model is best when C = 10 or 100
svm_model = SVC(C=10)
svm_model.fit(X_train_scaled, y_train)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Accuracy
print('accuracy: {:.3f}'.format(accuracy_score(y_test_binary, y_pre)))
# Precision
print('precision: {:.3f}'.format(precision_score(y_test_binary, y_pre)))
# Recall
print('recall: {:.3f}'.format(recall_score(y_test_binary, y_pre)))
# F1 score
Example #46
def classify_knn(features_train, labels_train, n):
    neigh = KNeighborsClassifier(n)
    neigh.fit(features_train, labels_train)
    return neigh
Example #47
            if (y_train[seq[j]][0] == -1.0):
                cnt3 += 1
        if cnt1 >= cnt2 and cnt1 >= cnt3:
            label[i] = 1.0
        if cnt2 > cnt1 and cnt2 >= cnt3:
            label[i] = 0.0
        if cnt3 > cnt2 and cnt3 > cnt1:
            label[i] = -1.0
        print cnt1, cnt2, cnt3
        print label[i], y_test[i]
    acc = 0.0
    for i in range(len(x_test)):

        if (label[i] == y_test[i]):
            acc += 1.0
    return acc / len(x_test)


pca = PCA(n_components=200)
filename = 'EEG.mat'
x_train, y_train, x_test, y_test = preprocess(filename)
#x_train=pca.fit_transform(x_train)
#x_test=pca.fit_transform(x_test)
#print k_nn(x_train[:5000],y_train[:5000],x_test[:1500],y_test[:1500],200)
neighbors = KNeighborsClassifier(n_neighbors=5)
neighbors.fit(x_train, y_train)
pre = neighbors.predict(x_test)

acc = float((pre == y_test).sum()) / len(y_test)
print acc
Example #48
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import matplotlib.pyplot as plt

models = [
    KNeighborsClassifier(n_neighbors=5),
    SVC(gamma='auto'),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=20, random_state=0),
    AdaBoostClassifier(n_estimators=100, random_state=0),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis()
]

luckies = ["Mrs", "Miss", "Master", "Sir", "Lady", "Ms", "Mle", "Counthess"]
unluckies = ["Mr", "Don", "Rev", "Dr", "Jonkheer"]


def preprocessData(train, test):
    Y_train = train['Survived']
    train['Train'] = train.apply(lambda row: 1, axis=1)
    test['Train'] = test.apply(lambda row: 0, axis=1)
    data = pd.concat([train, test], ignore_index=True, axis=0)
Example #49
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
#Predict Output
gauss_pred = gaussian.predict(X_test)

#Using Logistic Regression
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(X_train, Y_train)
#Predict output
regression_pred = reg.predict(X_test)

#Using K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
k_near = KNeighborsClassifier()
k_near.fit(X_train, Y_train)
#Predict output
k_near_pred = k_near.predict(X_test)

#Using Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier()
dec_tree.fit(X_train, Y_train)
#Predict output
dec_tree_pred = dec_tree.predict(X_test)

# Fitting SVC to the dataset
from sklearn.svm import SVC
regressor = SVC()
regressor.fit(X_train, Y_train)
Example #50
            C=100.,
            probability=True,
            class_weight='balanced',
            kernel='linear'))
clf_output = clf.fit(data_train, targets_train)
print(clf.score(data_test, targets_test))

#Naive bayes
print("Naive bayes")
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(data_train, targets_train)
predictions = nb.predict(data_test)
print(metrics.accuracy_score(targets_test, predictions))

#KNN
print("KNN")
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(data_train, targets_train)
predictions = KNN.predict(data_test)
print(metrics.accuracy_score(targets_test, predictions))

#Decision Tree
print("Decision Tree")
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(data_train, targets_train)
predictions = clf.predict(data_test)
print(metrics.accuracy_score(targets_test, predictions))
Exemple #51
0
 def model(self):
     knn = KNeighborsClassifier(n_neighbors=1, metric='cosine')
     clf = knn.fit(self.X_train_tfidf, self.encode)
     return clf
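The method above fits a 1-NN with cosine distance on tf-idf vectors; outside the class, the same idea looks roughly like this (the toy documents and labels are made up for illustration):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

docs = ["cheap flights to paris", "best pizza recipe", "hotel deals in paris"]
labels = ["travel", "food", "travel"]
vec = TfidfVectorizer()
X_tfidf = vec.fit_transform(docs)
clf = KNeighborsClassifier(n_neighbors=1, metric='cosine').fit(X_tfidf, labels)
print(clf.predict(vec.transform(["paris city breaks"])))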
Exemple #52
0
from sklearn.neighbors import KNeighborsClassifier  #using KNN
from sklearn.datasets import load_iris

iris = load_iris()

features = iris.data
labels = iris.target

from sklearn.model_selection import train_test_split  # train/test split (sklearn.cross_validation was removed in modern scikit-learn)
X_train, X_test, Y_train, Y_test = train_test_split(features,labels,test_size=.3)

neigh = KNeighborsClassifier()
neigh.fit(X_train,Y_train)
#clf = DecisionTreeClassifier()
#clf.fit(X_train,Y_train)
p = neigh.predict(X_test)

from sklearn.metrics import accuracy_score
print ("Accuracy =",accuracy_score(Y_test,p))


                             max_depth=8,
                             min_samples_split=12,
                             min_samples_leaf=3,
                             min_weight_fraction_leaf=0.06,
                             max_features='auto',
                             max_leaf_nodes=None,
                             min_impurity_decrease=0.0,
                             min_impurity_split=None,
                             bootstrap=True,
                             oob_score=False,
                             n_jobs=None,
                             random_state=0,
                             verbose=0,
                             warm_start=False,
                             class_weight=None)
 knn = KNeighborsClassifier(n_neighbors=40, n_jobs=1)  # k-nearest neighbors
 naiveB = naive_bayes.BernoulliNB(alpha=1.6,
                                  binarize=1.41,
                                  fit_prior=True,
                                  class_prior=None)  # 0.575
 svm = SVC(C=1, kernel='rbf', gamma=0.001, probability=True)
 LR = LogisticRegression(penalty='l2',
                         dual=False,
                         tol=0.0001,
                         C=4.0,
                         fit_intercept=True,
                         intercept_scaling=2,
                         class_weight=None,
                         random_state=None,
                         solver='liblinear',
                         max_iter=100,
Exemple #54
0
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Assign colum names to the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

# Read dataset to pandas dataframe
dataset = pd.read_csv(url, names=names)
print(dataset.head())
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
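A natural follow-up to the fixed k=5 above is scanning a range of k values on the same scaled splits and keeping the one with the lowest test error; a sketch:

import numpy as np

errors = []
k_values = list(range(1, 30, 2))
for k in k_values:
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    errors.append(np.mean(knn_k.predict(X_test) != y_test))
print("best k:", k_values[int(np.argmin(errors))])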
def data_trainer(df, algo, y, x1, x2=None, x3=None, x4=None, x5=None, x6=None, x7=None, x8=None, x9=None, x10=None):
    # Keep only the feature-column names that were actually supplied
    checklist = [x for x in (x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) if x is not None]
    X = df[checklist]
    y = df[y]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    #training
    if algo == 'DecisionTreeRegressor':
        regressor = DecisionTreeRegressor()
    elif algo == 'DecisionTreeClassifier':
        regressor = DecisionTreeClassifier()
    elif algo == 'SVC':
        regressor = SVC(kernel='rbf', probability=True)
    elif algo == 'GaussianNB':
        regressor = GaussianNB()
    elif algo == 'RandomForestClassifier':
        regressor = RandomForestClassifier()
    elif algo == 'KNeighbors':
        regressor = KNeighborsClassifier()
    elif algo == 'MLP':
        regressor = MLPClassifier()

    regressor.fit(X_train, y_train)

    #prediction
    y_pred = regressor.predict(X_test)

    prediction = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}) 
    print(prediction)

    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 
    print()

    '''
    pickle.dump(regressor, open(targetFile, 'wb'))
    print('Model saved: ', targetFile)
    print()

    loaded_model = pickle.load(open(targetFile, 'rb'))
    result = loaded_model.score(X_test, y_test)
    result = round((result*100), 2)
    print('Confidence: ', result)
    '''

    return prediction
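A hedged call sketch for data_trainer on a toy numeric dataframe (it assumes train_test_split, the classifiers, metrics, np and pd are already imported, as the snippet implies):

rng = np.random.RandomState(0)
toy = pd.DataFrame({'f1': rng.rand(100), 'f2': rng.rand(100)})
toy['target'] = (toy['f1'] + toy['f2'] > 1).astype(int)
preds = data_trainer(toy, 'KNeighbors', 'target', 'f1', 'f2')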
Exemple #56
0
def assign_labels(X_total,X_pred,y_pred):
    knn = KNeighborsClassifier(n_neighbors=1)    
    knn.fit(X_pred, y_pred)
    return knn.predict(X_total)
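assign_labels propagates labels from a small labeled subset (for instance cluster exemplars) to every point via 1-NN; a toy check:

import numpy as np

X_total = np.array([[0.1, 0.2], [0.9, 0.8], [0.2, 0.1], [0.8, 0.9]])
X_pred = np.array([[0.0, 0.0], [1.0, 1.0]])
y_pred = np.array([0, 1])
print(assign_labels(X_total, X_pred, y_pred))  # expected: [0 1 0 1]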
Exemple #57
0
def run_knn(X_train, X_test, y_train, y_test):
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
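A quick usage sketch for run_knn, using iris purely as stand-in data:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(*load_iris(return_X_y=True), random_state=0)
print(run_knn(X_tr, X_te, y_tr, y_te))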
Exemple #58
0
                               test_size=0.1,
                               random_state=84)

# Look at the size of each split
print("training data points: {}".format(len(trainLabels)))
print("validation data points: {}".format(len(valLabels)))
print("testing data points: {}".format(len(testLabels)))

# Initialize the range of k values for our knn classifier
kVals = range(1, 30, 2)
accuracies = []

# Loop over kVals
for k in range(1, 30, 2):
    # Train the classifier with the current value of k
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(trainData, trainLabels)

    # Evaluate the model and print the score
    score = model.score(valData, valLabels)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)

# Highest accuracy
i = np.argmax(accuracies)
print("k=%d achieved highest accuracy of %.2f%% on validation data" %
      (kVals[i], accuracies[i] * 100))

# Now that we have the best k value, retrain the classifier
model = KNeighborsClassifier(n_neighbors=kVals[i], algorithm='brute')
model.fit(trainData, trainLabels)
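The snippet stops after refitting on the best k; the natural next step, sketched here, is to score that model on the held-out test split created earlier:

print("test accuracy: {:.2f}%".format(model.score(testData, testLabels) * 100))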
Exemple #59
0
def knn(n_neighbors=5):
	knn = KNeighborsClassifier(n_neighbors, weights="uniform", n_jobs=-1)
	return knn
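A self-contained sanity check of the factory above (the digits dataset is our choice, not from the original):

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier  # needed by knn() at call time

Xd, yd = load_digits(return_X_y=True)
print(cross_val_score(knn(7), Xd, yd, cv=3).mean())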
import os

import joblib  # in older scikit-learn versions: from sklearn.externals import joblib
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler


def train_model(feats_csv):

	df = pd.read_csv(feats_csv).iloc[:,1:]

	y = np.ravel(df.iloc[:,-1:])
	X = np.array(df.iloc[:,:-1])

	############ 15 Best selected features using ANOVA F-value score function ###############
	selector = SelectKBest(f_classif, k=15).fit(X, y)
	X_new = selector.transform(X)
	selected_features = selector.get_support(indices=True)

	############ KNN manhattan ###############
	##### preprocessing: data scaling######## 
	min_max_scaler = MinMaxScaler()
	X_new = min_max_scaler.fit_transform(X_new)

	model = KNeighborsClassifier(n_neighbors = 1,algorithm = 'brute',metric = 'manhattan',weights = 'uniform')
	model.fit(X_new,y)

	newdir = '../kNN_clfr'
	os.makedirs(newdir, exist_ok=True)  # don't fail if the directory already exists

	joblib.dump(model, os.path.join(newdir,'kNN.pkl')) 

	return
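Once train_model has run, the persisted classifier can be reloaded and reused; a sketch using the path created above:

import os
import joblib

clf = joblib.load(os.path.join('../kNN_clfr', 'kNN.pkl'))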