Code Example #1
def Radius_Neighbors(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans Radius_Neighbors")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    # RadiusNeighborsClassifier is radius-based and has no n_neighbors
    # parameter; the original call passed n_neighbors=1, which this
    # estimator does not use, so the default radius is made explicit here.
    clf = RadiusNeighborsClassifier(radius=1.0)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print("#########################################################################################################\n")
    print("Radius Neighbors Accuracy ")
    print("classification accuracy:", metrics.accuracy_score(y, y_pred))
    print("precision:", metrics.precision_score(y, y_pred))
    print("recall:", metrics.recall_score(y, y_pred))
    print("f1 score:", metrics.f1_score(y, y_pred))
    print("\n")
    print("#########################################################################################################\n")
    results = Output + "Radius_Neighbors_metrics.txt"
    file = open(results, "w")
    file.write("Radius Neighbors estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in range(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    file.close()
    title = "Radius Neighbors"
    save = Output + "Radius_Neighbors_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans Radius_Neighbors")
Code Example #2
    def draw(self):
        """
        Draw the estimated floorplan in the current figure
        """
        xy = self.dimred.transform(self._fingerprints)

        x_min, x_max = xy[:,0].min(), xy[:,0].max()
        y_min, y_max = xy[:,1].min(), xy[:,1].max()
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 1.0),
                             np.arange(y_min, y_max, 1.0))
        clf = RadiusNeighborsClassifier(radius=3.0, outlier_label=0)
        clf.fit(xy, self._label)
        label = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

        plt.pcolormesh(xx, yy, label)
        plt.scatter(xy[:,0], xy[:,1], c=self._label, vmin=0)
Code Example #3
def Radius_Neighbors(input_file, Output, test_size):
    lvltrace.lvltrace("LVLEntree dans radius_kneighbors split_test")
    try:
        ncol = tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
        X = data[:, 1:]
        y = data[:, 0]
        n_samples, n_features = X.shape
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print(X_train.shape, X_test.shape)
        clf = RadiusNeighborsClassifier(radius=0.001, weights='uniform', algorithm='auto')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print("Radius Neighbors accuracy ")
        print("classification accuracy:", metrics.accuracy_score(y_test, y_pred))
        print("precision:", metrics.precision_score(y_test, y_pred))
        print("recall:", metrics.recall_score(y_test, y_pred))
        print("f1 score:", metrics.f1_score(y_test, y_pred))
        print("\n")
        results = Output + "Radius_Neighbors_metrics_test.txt"
        file = open(results, "w")
        file.write("Radius Neighbors estimator accuracy\n")
        file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
        file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
        file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
        file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
        file.write("\n")
        file.write("True Value, Predicted Value, Iteration\n")
        for n in range(len(y_test)):
            file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))
        file.close()
        title = "Radius Neighbors %f" % test_size
        save = Output + "Radius_Neighbors_confusion_matrix" + "_%s.png" % test_size
        plot_confusion_matrix(y_test, y_pred, title, save)
    except ValueError:
        results = Output + "Radius_Neighbors_metrics_test.txt"
        file = open(results, "w")
        file.write("In configuration.py file: no neighbors were found for some test samples; you can try using a larger radius, give a label for outliers, or consider removing them from your dataset.")
        file.close()
    lvltrace.lvltrace("LVLSortie dans radius_kneighbors split_test")
Code Example #4
r = 3000  # <------------------------------------

clf = RadiusNeighborsClassifier(radius=r)
# radius=1.0, weights='uniform', algorithm='auto'
# leaf_size=30, p=2, metric='minkowski', outlier_label=None
# metric_params=None, n_jobs=None, **kwargs

from time import process_time

start = process_time()
clf.fit(trainData, trainLabel)
print('time of train :', process_time() - start)

start = process_time()
predicts = clf.predict(testData)
print('time of test :', process_time() - start)

from sklearn.metrics import accuracy_score

print("Accuracy : ", accuracy_score(testLabel, predicts))

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(y_true,
                          y_pred,
                          classes,
                          normalize=False):
    # The rest of this helper is truncated in the source; the well-known
    # scikit-learn docs version of it computes confusion_matrix(y_true,
    # y_pred), optionally row-normalizes it, and draws it with plt.imshow.
    ...
Code Example #5
#load libraries
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

#load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

#create standardizer
standardizer = StandardScaler()

#Standardize features
features_standardized = standardizer.fit_transform(features)

#Train radius neighbors classifier
rnn = RadiusNeighborsClassifier(radius=0.5,
                                n_jobs=-1).fit(features_standardized, target)

#create one observation
new_observations = [[1, 1, 1, 1]]

#predict the class of the observation
print(rnn.predict(new_observations))
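RadiusNeighborsClassifier also exposes radius_neighbors(), which is a quick way to check how many standardized training points actually fall inside the chosen radius before trusting a prediction; a short sketch continuing from the objects above:

# Distances and indices of all training points within radius=0.5 of the
# query; an empty index array means predict() would raise a ValueError
# (or return outlier_label, if one was set).
distances, indices = rnn.radius_neighbors(new_observations)
print(len(indices[0]), "training points inside the radius")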
Code Example #6
x = dataset[0]
y = dataset[1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.metrics import accuracy_score

radio = 3

model_radius = RadiusNeighborsClassifier(radius=radio)

model_radius.fit(x_train, y_train)

predict_radius = model_radius.predict(x_train)

accu_radius = accuracy_score(y_train, predict_radius)

print('The accuracy of the radius KNN model is ', round(accu_radius, 2))

from sklearn.neighbors import KNeighborsClassifier

neighbors = 4

model_KNN = KNeighborsClassifier(n_neighbors=neighbors, n_jobs=2)

model_KNN.fit(x_train, y_train)

predict_KNN = model_KNN.predict(x_test)
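Note that accu_radius above is computed on the training split itself, which is optimistic. A minimal sketch of the held-out score, reusing the split from this snippet (predict may raise a ValueError if some test point has no neighbor within the radius):

predict_radius_test = model_radius.predict(x_test)
accu_radius_test = accuracy_score(y_test, predict_radius_test)
print('Radius model accuracy on the test split:', round(accu_radius_test, 2))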
Code Example #7
File: prog_to_titanic.py Project: AlbMLpy/TitanicML
pred_RF = RF_model.predict(X_valid)
print('MAE RF: ', mean_absolute_error(pred_RF, Y_valid))

KN_model = KNeighborsClassifier(n_neighbors=55,
                                weights='distance',
                                algorithm='auto')
KN_model.fit(X_train, Y_train)
pred_KN = KN_model.predict(X_valid)
print('MAE KN: ', mean_absolute_error(pred_KN, Y_valid))

RN_model = RadiusNeighborsClassifier(radius=3.32,
                                     weights='distance',
                                     algorithm='ball_tree',
                                     outlier_label=1)
RN_model.fit(X_train, Y_train)
pred_RN = RN_model.predict(X_valid)
print('MAE RAD_N: ', mean_absolute_error(pred_RN, Y_valid))

GB_model = GradientBoostingClassifier(
    learning_rate=0.0730,
    n_estimators=250)  #(learning_rate=0.028,n_estimators=375)
Fitted_GB = GB_model.fit(X_train, Y_train)
pred_GB = GB_model.predict(X_valid)
print('MAE GB: ', mean_absolute_error(pred_GB, Y_valid))

score = (cross_val_score(GB_model, X, Y)).mean()
print('CROSS-VALIDATION_GB= ', score)

SVM_model = SVC()
SVM_model.fit(X_train, Y_train)
pred_SVM = SVM_model.predict(X_valid)
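This example scores classifiers with mean_absolute_error. For 0/1 labels such as Titanic's Survived column, MAE is numerically the misclassification rate (1 - accuracy), since |y - y_hat| is 0 for a hit and 1 for a miss; a quick self-contained check:

import numpy as np
from sklearn.metrics import accuracy_score, mean_absolute_error

y_true = np.array([0, 1, 1, 0])
y_hat = np.array([0, 1, 0, 0])
# Both expressions evaluate to 0.25 for this toy case.
assert mean_absolute_error(y_true, y_hat) == 1 - accuracy_score(y_true, y_hat)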
Code Example #8
        hist = desc.describe(gray)
        # extract the label from the image path and update the
        # label and data lists
        # print(imagePath.split("/"))  # use "\\" on Windows
        labels.append(imagePath.split("/")[-2])  # use "\\" on Windows
        data.append(hist)
    #print(labels)
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.1,
                                                        random_state=0)

    # train a radius-neighbors classifier on the data
    k_nn = [0.005, 0.01, 0.015, 0.02]  # candidate radii
    for k in k_nn:
        neigh = RadiusNeighborsClassifier(radius=k, outlier_label=0.1)
        neigh.fit(X_train, y_train)

        benar = 0
        jml = 0
        for i in range(len(y_test)):
            jml += 1
            hist = X_test[i]
            prediction = neigh.predict([hist])[0]
            if prediction == y_test[i]:
                benar += 1
        akurasi = float(benar * 100 / jml)
        print(benar, jml, k, ": Accuracy", akurasi, "%")
        hasil.append([k, p[0], p[1], akurasi])
tulis_hasil(hasil, "results/{0}_rnn.csv".format(db))
Code Example #9
        if coltag in self._clasifyData:
            try:
                tag = self._clasifyData[coltag]['neigh'].predict([[screenspace_x,screenspace_y]])
                tag = tag[0]
                self._clasifyData[coltag]['data'][tag] = [screenspace_x,screenspace_y]       
                
            except ValueError:
                return MOCAP_ROGE_DATA   
            return tag
        return MOCAP_ROGE_DATA
    
    def updateBoxesForNextFrame(self):
        for clotag,data in self._clasifyData.items():
            centroids = []
            labels = []
            for tag,centroid in data['data'].items():
                centroids.append(centroid)
                labels.append(tag)
            self._clasifyData[clotag]['neigh'].fit(centroids,labels)            
        
        
             
X = [[229.5, 500.5], [127.0, 497.0]]#[[0,0], [1,1], [2,2], [3,3]]
y = [1, 5]#[5, 1, 3, 4]
neigh = RadiusNeighborsClassifier(radius=1.0)
neigh.fit(X, y) 
print(neigh.predict([[229.5, 500.5]]))



Code Example #10
def knn_classifier_Radius(X_train, categories, X_test, test_categories):
    from sklearn.neighbors import RadiusNeighborsClassifier
    clf = RadiusNeighborsClassifier(outlier_label= 0).fit(X_train, categories)
    y_rknn_predicted = clf.predict(X_test)
    print("\n Here is the classification report for RadiusNeighborsClassifier classifier:")
    print(metrics.classification_report(test_categories, y_rknn_predicted))
Code Example #11
File: model.py Project: frosky/Text_Classifier
class Model(object):
    """
    Text-classification-system with scikit-learn.
    For reference see: http://scikit-learn.org/stable/

    This Model class is based on Data class. Defines training
    and test data. Build classification model. Provides
    evaluation methods.

    Parameter
    ---------
    data : Data, optional
        Contains a data object with filled data.real_data.

    data_list : array, shape = [data1 object, data2 object, ...]
        Contains data objects with filled data.real_data.

    Attributes
    ----------
    clf : classifier object from the sklearn modules.
        Contains a selected classifier object from the sklearn module.
        see reference: http://scikit-learn.org/stable/supervised_learning.html#supervised-learning

    classifier_list : array, shape = [string classifier1 name, ...]
        Contains names of all available classification algorithms.

    __train_data_set : boolean
        Contains a boolean value that describes if train_data is set.

    train_data : Data
        Contains the data object that is set as training data.

    test_data : Data
        Contains the data object that is set as test data.

    train_targets : numpy array of shape [n_samples]
        Contains the class labels of training data. A sample is
        a textpair object; its class label is found in textpair.target.

    train_samples : numpy array of shape [n_samples,n_features]
        Contains the feature values of the training data. A sample is
        a textpair object; its feature values are found in the textpair.features
        hash. After vectorize() is called, they are stored in
        textpair.feature_vector.

    test_targets : numpy array of shape [n_samples]
        Contains the class labels of test data. A sample is
        a textpair object; its class label is found in textpair.target.

    test_samples : numpy array of shape [n_samples,n_features]
        Contains the feature values of the test data. A sample is
        a textpair object; its feature values are found in the textpair.features
        hash. After vectorize() is called, they are stored in
        textpair.feature_vector.
    """
    def __init__(self, data=None, data_list=None):
        self.clf = None

        if data is not None:
            self.data_list = [data]
        elif data_list is not None:
            self.data_list = data_list

        self.classifier_list = [
            "svm_linear", "svm_poly", "naive_bayes", "decision_tree",
            "nearest_centroid", "k_neighbors", "radius_neighbors"
        ]

        self.__train_data_set = False

    def set_train_data(self, data_name):
        """Setter for training data

        Walk through data_list and set data object with
        data.name as train_data.

        Parameter
        ---------
        data_name : string
            Contains the name of the data object, that should
            be set as train_data for the model.
        """
        data_in_list = False
        for data in self.data_list:
            if data.name == data_name:
                print(data_name + " is in model_data_list")
                self.train_data = data
                self.train_samples, self.train_targets = self.fill_feature_target(
                    data)
                print(data_name + " is set as train_data")
                data_in_list = True
        if data_in_list:
            self.__train_data_set = True
        else:
            print(data_name + " not in model_data_list ")

    def set_test_data(self, data_name):
        """Setter for test data

        Walk through data_list and set data object with
        data.name as test_data.

        Notes
        -----
        Training data has to be set before test data, because some
        features need skeletons that have to be built before seeing
        the test data.

        see reference: bag_of_pos.py, bag_of_words.py, tf_idf.py

        Parameter
        ---------
        data_name : string
            Contains the name of the data object, that should
            be set as test_data for the model.
        """
        if self.__train_data_set and self.train_data.name == data_name:
            self.test_data = self.train_data
            print("train_data and test_data from one data_set")
        elif not self.__train_data_set:
            print("please set train_data first")
        else:
            data_in_list = False
            for data in self.data_list:
                if data.name == data_name:
                    print(data_name + " is in model_data_list")
                    self.test_data = data
                    self.test_samples, self.test_targets = self.fill_feature_target(
                        data)
                    data_in_list = True
                    print(data_name + " is set as test_data")
            if not data_in_list:
                print(data_name + " not in model_data_list ")

    def fill_feature_target(self, data):
        """ Fill the feature samples and target values.

        The classifier objects from sklearn need a numpy array for
        classification.

        Shape of the data class labels : numpy array of shape [n_samples]
        Shape of the data feature values : numpy array of shape [n_samples,n_features]

        Call vectorize() on the textpair feature values to build the required numpy arrays.

        Note
        ----
        Check __train_data_set first, because there is no need to attach the
        same features to the test data manually in main.py. This is performed
        automatically in here.

        Parameter
        ---------
        data : Data
            Contains a Data object that data.real_data should be vectorized.
        """
        sample_list = []
        target_list = []

        if self.__train_data_set:
            for feature in self.train_data.features_fit:
                if feature == "bag_of_words" or feature == "bag_of_pos" or feature == "tf_idf":
                    data.bow_model = self.train_data.bow_model

            print(self.train_data.features_fit)
            data.attach_feature_list(self.train_data.features_fit)

            for textpair in data.real_data.values():
                textpair.vectorize()
                target_list.append(textpair.target)
                sample_list.append(textpair.feature_vector)

            return np.array(sample_list), np.array(target_list)
        else:
            for textpair in data.real_data.values():
                textpair.vectorize()
                target_list.append(textpair.target)
                sample_list.append(textpair.feature_vector)

            return np.array(sample_list), np.array(target_list)

    def set_classifier(self, classifier_name):
        """ Setter for clf

        Building instances of classifier objects with corresponding name.

        Parameter
        ---------
        classifier_name : string
            Contains the corresponding name of the wanted classifier from
            sklearn.
        """
        if classifier_name == "svm_linear":
            self.clf = svm.SVC(kernel="linear", class_weight="auto")
        elif classifier_name == "svm_poly":
            self.clf = svm.SVC(kernel="poly", class_weight="auto")
        elif classifier_name == "naive_bayes":
            self.clf = GaussianNB()
        elif classifier_name == "decision_tree":
            self.clf = tree.DecisionTreeClassifier()
        elif classifier_name == "nearest_centroid":
            self.clf = NearestCentroid()
        elif classifier_name == "k_neighbors":
            self.clf = KNeighborsClassifier(n_neighbors=100)
        elif classifier_name == "radius_neighbors":
            self.clf = RadiusNeighborsClassifier(radius=1.0, outlier_label=1)
        else:
            raise ClassifierNotExistException(classifier_name)

    def train(self, fraction):
        """ Train the model

        Training the classifier with the wanted fraction of the training data.

        Parameter
        -------
        fraction : int
            Contains a number from 0 to 100. Defines the fraction of the
            training data that will be used for training the classifier.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException
        else:
            count = int(
                round((float(len(self.train_targets)) / float(100)) *
                      float(fraction), 0))
            self.clf.fit(self.train_samples[:count],
                         self.train_targets[:count])

    def predict(self, sample):
        """ Predict a given sample.

        Make a prediction for a given sample. Classifier needs a numpy array
        with the feature values of a sample.

        Note
        ----
        Requires a trained(fitted) model.

        Parameters
        ----------
        samples : numpy array of shape [n_samples,n_features]

        Returns
        -------
        self.clf.predict(sample) : int
            Contains the prediction value from the model. It is the predicted
            class label. For a textpair object it can be 0 or 1.
        """
        if self.clf is None:
            raise NoClassifierException
        elif self.test_targets.size == 0 and self.test_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException
        else:
            return self.clf.predict(sample)

    def evaluate_cross_validation(self, folds):
        """ Evaluation through a cross-validation

        Perform a cross-validation on the set training data
        with measured accuracy.
        It requires a given number of folds.

        Note
        ----
        cross validation is performed on the training data, not
        on the test data. So set your data as training data, if you
        want to perform a cross validation.

        Parameter
        ---------
        folds : int
            Contains the number of folds for the cross-validation.

        Returns
        -------
        accuracy_list : array, shape = [float acc score1, float acc score2, ...]
            Contains the accuracy scores of all iterations.

        acc_mean : float
            Contains the mean accuracy over all iterations.
        """
        if self.clf is None:
            raise NoClassifierException

        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException

        elif folds > len(self.train_samples):
            raise FoldSizeToBigException(folds, self.train_samples)

        else:
            kf = KFold(len(self.train_samples), n_folds=folds)
            accuracy_list = []

            for train, test in kf:
                x_train, x_test, y_train, y_test = self.train_samples[train], self.train_samples[test], \
                                                   self.train_targets[train], self.train_targets[test]

                self.clf.fit(x_train, y_train)
                accuracy_list.append(
                    accuracy_score(np.array(y_test),
                                   np.array(self.clf.predict(x_test))))

            n = 0
            sum_values = 0

            for acc_value in accuracy_list:
                sum_values = sum_values + acc_value
                n += 1

            acc_mean = (sum_values / n)

            return accuracy_list, acc_mean

    def evaluate_classification_report(self, fraction):
        """ A detailed classification report

        A convenient way to measure how well your trained model performs:
        this method uses the data objects you have set and prints an
        accuracy score to the shell.

        Note
        ----
        There are two scenarios:

            1. training data and test data are from the same data object
                (meaning their names are the same!)
                - no normalization
            2. training data and test data are from different data objects
                + normalization

        The first scenario uses the given fraction to divide the training
        data into train and test data for the classification. If fraction
        is 100, the model is trained and tested on the same data object.
        With a fraction of 80 it is trained on 80 percent and tested on
        20 percent of the given data object. No normalization is
        implemented for this scenario!

        The second scenario needs a fraction of 100, so that the whole
        training data is used for training. It works with normalized
        values.

        Parameter
        ---------
        fraction : int
            Contains a number from 0 to 100. Defines the fraction of the
            training data that will be used for training the classifier.
        """
        if self.clf is None:
            raise NoClassifierException

        elif self.train_targets.size == 0 and self.train_samples.size == 0:
            raise EmptyFeaturesEmptyTargetsException

        else:
            # if trained on a 100% fraction, it will be tested on a 100%
            # fraction, so train and test data are the same

            # if count_predict is 0 (with 100% count_train), then
            # self.targets[-count_predict:] == self.targets[:] is True
            if self.test_data.name == self.train_data.name:

                print("train_data and test_data from one data_set")
                count_train = int(
                    round((float(len(self.train_targets)) / float(100)) *
                          float(fraction), 0))
                count_predict = len(self.train_targets) - count_train

                print("count_train:", count_train)
                print("count_predict:", count_predict)

                # Summarize is placed in here because the data objects are
                # equal and divided in this method, so training and test
                # data are defined here.
                print("##########train_data summarize##########")
                summarize_textpair(
                    list(self.train_data.real_data.values())[:count_train])

                print("##########test_data summarize##########")
                summarize_textpair(
                    list(self.train_data.real_data.values())[-count_predict:])

                # setting train and test data
                train_samples = self.train_samples[:count_train]
                train_targets = self.train_targets[:count_train]
                test_samples = self.train_samples[-count_predict:]
                test_targets = self.train_targets[-count_predict:]

                # Training
                self.clf.fit(train_samples, train_targets)

                # Testing
                test_targets_predicted = self.clf.predict(test_samples)

                # calculating baseline
                null = 0
                eins = 0
                for i in test_targets:
                    if i == 0:
                        null += 1
                    else:
                        eins += 1
                if null > eins:
                    baseline = float(null) / (float(null) + float(eins))
                else:
                    baseline = float(eins) / (float(null) + float(eins))

                print("Count of 0:", null)
                print("Count of 1:", eins)
                print("Baseline:", baseline)
                print("-------------------------------")

                # Calculating accuracy score of predicted samples
                print("accuracy_score: ", accuracy_score(
                    test_targets, test_targets_predicted))

            else:
                # Normalization
                norma = preprocessing.normalize(self.train_samples)

                count_train = int(
                    round((float(len(self.train_targets)) / float(100)) *
                          float(fraction), 0))
                print("count_train:", count_train)
                print("count_predict:", len(self.test_targets))

                # Setting train and test data

                # without normalization take this one instead
                # train_samples = self.train_samples[:count_train]
                train_samples = norma[:count_train]
                train_targets = self.train_targets[:count_train]

                # without normalization take this one instead
                # test_samples = self.test_samples
                test_samples = preprocessing.normalize(self.test_samples)
                test_targets = self.test_targets

                # Training
                self.clf.fit(train_samples, train_targets)

                # Testing
                test_targets_predicted = self.clf.predict(test_samples)

                # Calculating baseline
                null = 0
                eins = 0
                for i in test_targets:
                    if i == 0:
                        null += 1
                    else:
                        eins += 1
                if null > eins:
                    baseline = float(null) / (float(null) + float(eins))
                else:
                    baseline = float(eins) / (float(null) + float(eins))

                print("Count of 0:", null)
                print("Count of 1:", eins)
                print("Baseline:", baseline)
                print("-------------------------------")

                # Calculating accuracy score of predicted samples
                print("accuracy_score: ", accuracy_score(
                    test_targets, test_targets_predicted))
Code Example #12
def detect(containers,
           fields,
           time_range,
           learning=True,
           usual_file=None,
           stdout=None):
    global maxim_distances

    if stdout is not None:
        sys.stdout = stdout

    data = dict()
    knowledge = dict()

    exceptions = load_exceptions()

    if time_range == 'seconds':
        print('-' * 50)
        print("SECONDS")
        print('-' * 50)

    for container in containers:
        data[container.module_data['name']] = copy.deepcopy(
            container.database_events[time_range])

        knowledge[container.module_data['name']] = copy.deepcopy(
            container.database_info[time_range])

    usual = load_usual_data(time_range)
    usual, changed = normalize(data, fields, usual)

    if changed and not learning:
        save_usual_events_json(usual, time_range)

    if learning:
        add_new_data_to_cluster(data, usual, knowledge)
        save_usual_events_json(usual, time_range)
        return

    # -----------------------------------
    # NOT LEARNING ---> PREDICT
    # -----------------------------------

    maxims, averages = get_maxims_and_averages(knowledge)
    usual_to_fit = normalize_fit_input(usual['data'], usual['events'],
                                       usual['fields'], averages, maxims)
    # -----------------------------------
    # PREPARING TO PREDICT
    # -----------------------------------

    new_timestamps = eventdata_to_timestamps(data, usual)
    new_data = from_timestamps_to_data(new_timestamps)
    new_data_to_fit = normalize_fit_input(new_data, usual['events'],
                                          usual['fields'], averages, maxims)

    classifier = RadiusNeighborsClassifier(radius=ANOMALY_RADIUS,
                                           metric=similarity,
                                           outlier_label=-1)

    print("PREDICTING")
    t = time.time()

    if len(usual['labels']) < len(usual_to_fit):
        cluster_data(usual, usual_to_fit)
        save_usual_events_json(usual, time_range)

    classifier.fit(usual_to_fit, usual['labels'])
    labels = classifier.predict(new_data_to_fit)
    print("PREDICTION TOOK", time.time() - t, "seconds")

    print('maxim distances:', sorted(maxim_distances, reverse=True)[:10])
    print('NEW SAMPLES LABELS: ', labels)

    # -----------------------------------
    # DONE PREDICTION
    # -----------------------------------

    events = usual['events']
    fields = usual['fields']

    for i in range(len(new_data) - 1, -1, -1):
        if labels[i] == -1:
            detected = do_detection(new_data[i], new_timestamps, maxims,
                                    new_data_to_fit[i], events, fields, usual,
                                    exceptions)
Code Example #13
classifier = GridSearchCV(pipe, search_space, cv=5,
                          verbose=0).fit(features_standardized,
                                         target)  # create the grid search

classifier.best_estimator_.get_params()["knn__n_neighbors"]  # best neighborhood size (k)
# The value of k has a real impact on the performance of a KNN classifier. In machine
# learning we constantly try to balance bias and variance, and k affects that balance
# visibly. If k = n (where n is the number of observations), bias is high and variance
# low; if k = 1, bias is low but variance high. Only a k that trades the two off yields
# the best KNN classifier. In the solution, GridSearchCV runs 5-fold cross-validation
# over KNN classifiers with different values of k to find the k that produces the
# best classifier.

# 15.4 Creating a radius-based nearest neighbors classifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
iris = datasets.load_iris()
features = iris.data
target = iris.target

standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

rnn = RadiusNeighborsClassifier(radius=.5,
                                n_jobs=1).fit(features_standardized,
                                              target)  # train a radius-based nearest neighbors classifier
new_observations = [[0.75, 0.75, 0.75, 0.75], [1, 1, 1, 1]]  # create two observations
rnn.predict(new_observations)  # predict the classes of the two observations
# Radius-based nearest neighbor classifiers are less commonly used. An observation's
# class is predicted from the classes of all observations within a radius r of it.
# In scikit-learn, RadiusNeighborsClassifier is very similar to KNeighborsClassifier,
# apart from two parameters:
# 1) radius: the radius that determines whether an observation counts as a neighbor of
# the target observation. Unless you have a strong reason to set radius to a particular
# value, it is best to tune it during model selection like any other hyperparameter.
# 2) outlier_label: the label to assign to an observation that has no other observation
# within radius. This is a useful way of identifying outliers.
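A small sketch of the outlier_label behavior described above, reusing the standardized iris features from this example (the -1 label and the far-away query are illustrative):

# An observation far from every training point gets the outlier label
# instead of raising an error.
rnn_outlier = RadiusNeighborsClassifier(
    radius=.5, outlier_label=-1).fit(features_standardized, target)
print(rnn_outlier.predict([[100, 100, 100, 100]]))  # -> [-1]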
Code Example #14
File: radneighbors.py Project: d-giles/KeplerML
        y_test = labels[272:, i]
    else:
        X_train = training
        y_train = labels[:172, i]
        X_test = sampletest
        y_test = labels[172:, i]

    posterior = np.empty([100, 72, 6])
    box = np.zeros([6, 6])
    for j in range(4, 5):
        for k in range(1, 2):
            accuracy = np.zeros(100)
            for m in range(0, 100):
                rnc = RadiusNeighborsClassifier(radius=j, leaf_size=k)
                rnc.fit(X_train, y_train)
                y_pred = rnc.predict(X_test)

                n = 0
                for i in range(0, len(y_pred)):
                    if y_pred[i] == y_test[i]:
                        # print i, y_pred[i], y_test[i]
                        n = n + 1
                        accuracy[m] = accuracy[m] + 1
                    box[y_test[i] - 1, y_pred[i] - 1] = box[y_test[i] - 1, y_pred[i] - 1] + 1
                # posterior[m] =  knc.predict_proba(X_test)
            print(j, k, np.mean(accuracy) / 0.72, np.std(accuracy) / 0.72)
            # print 30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0
        """
    means = np.empty([72,6])
    stds = np.empty([72,6])
    grid = np.empty([6,6])
Code Example #15
#dimension reduction 
from sklearn.decomposition import PCA
pca = PCA(n_components=10000)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

X_train.shape

y_train = list(y_train)

from sklearn.neighbors import RadiusNeighborsClassifier

clf = RadiusNeighborsClassifier(radius=1.0,  weights='uniform', algorithm='auto', leaf_size=30,p=2, metric='minkowski')
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='gini', splitter = 'best', max_depth = None, min_samples_split = 2, random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski')
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
Code Example #16
x_train
y_train
x_test
y_test

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
knn.score(x_test, y_test)

knn.predict(x_test)
y_test

knn.predict(x_test[2:3][:4])
y_test[2:3][:4]
pr = x_test[2:3][:4]
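# note: x_test[2:3] above is a one-row slice, so the trailing [:4] has no effect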

from sklearn.neighbors import RadiusNeighborsClassifier

knn_r = RadiusNeighborsClassifier(radius=5)

knn_r.fit(x_train, y_train)
knn_r.score(x_test, y_test)

knn_r.predict(x_test)
y_test

knn_r.predict(x_test[2:3][:4])
y_test[2:3][:4]
Code Example #17
File: 15.4.py Project: PanNowik/github_personal
# Load libraries.
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

# Load the data.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create a StandardScaler instance.
standardizer = StandardScaler()

# Standardize the features.
features_standardized = standardizer.fit_transform(features)

# Train a radius-based neighbors classifier.
rnn = RadiusNeighborsClassifier(
    radius=.5, n_jobs=-1).fit(features_standardized, target)

# Create one observation.
new_observations = [[ 1,  1,  1,  1]]

# Predict the class of the observation.
rnn.predict(new_observations)
Code Example #18
# Inspect the target sample's neighbors (distances + indices)
# print(clf.radius_neighbors(X[0,:].reshape(1, -1), return_distance=True))
# Inspect the target sample's neighbor graph (a sparse matrix of indices with distances, or connectivity)
# print(clf.radius_neighbors_graph(X[0].reshape(1, -1), mode='distance'))


# Visualize the predictions (decision boundary)
from matplotlib.colors import ListedColormap
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Determine the bounds of the training set
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
# Generate grid points as a test set, then predict
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
new_x = np.c_[xx.ravel(), yy.ravel()]
y_pred = clf.predict(new_x)

# Plot the test set predictions
ax = plt.subplot()
ax.pcolormesh(xx, yy, y_pred.reshape(xx.shape), cmap=cmap_light)
# Also plot all the training data
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification (k = 15, weights = 'distance')" )
plt.show()
Code Example #19
# Create and train the Radius Neighbors Classifier
clf = RadiusNeighborsClassifier(radius=0.5,
                                weights='distance',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                outlier_label=None,
                                metric_params=None,
                                n_jobs=None)
clf.fit(encodings, names)

# Load the test image with unknown faces into a numpy array
test_image = face_recognition.load_image_file('test_image.jpg')

# Find all the faces in the test image using the default HOG-based model
face_locations = face_recognition.face_locations(test_image)
no = len(face_locations)
print("Number of faces detected: ", no)

# Predict all the faces in the test image using the trained classifier
print("Found:")
for i in range(no):
    test_image_enc = face_recognition.face_encodings(
        test_image, known_face_locations=face_locations)[i]
    try:
        name = clf.predict([test_image_enc])  # predict expects a 2D array
        print(*name)
    except ValueError:
        print('No matches Found')
Code Example #20
#plt.xticks(())
#plt.yticks(())
#plt.axis([-3, 3, -3, 3])


biz['svm_pred']=(biz.expensive>clf.predict(X)).astype(int)
plt.scatter(x=biz[biz.svm_pred==1].X,y=biz[biz.svm_pred==1].Y, s=20, c='g')
print("Prop of expensive businesses seen as gentrifiers [%.2f]" %(biz['svm_pred'].sum()/biz.expensive.sum()))
print("Prop of expensive businesses seen as gentrifiers [%.2f]" %(biz['svm_pred'].sum()/len(biz.expensive)))
#biz['gentrifier']=(biz.expensive>biz.svm_pred).astype(int)
#####################################################################################################################################
################################################################ Nearest Neighbor ###################################################
r=.00025 #A block is .001 and two blocks are .003; therefore, .011 scans about 8 blocks in diameter.
neigh = RadiusNeighborsClassifier(radius=r) #from qGis nneighbor analysis
neigh.fit(X, Y)
predictions=neigh.predict(X)
plt.scatter(X.iloc[:,0], X.iloc[:,1], s=30, c=Y, cmap=plt.cm.Paired); plt.title('True labels')
plt.subplots_adjust(left=0, bottom=0, right=1, top=.95, wspace=0, hspace=0)


plt.figure(); 
plt.scatter(X.iloc[:,0], X.iloc[:,1], s=30, c=predictions, cmap=plt.cm.Paired); plt.title('Predicted labels, rad=%.3f' %r)
plt.subplots_adjust(left=0, bottom=0, right=1, top=.95, wspace=0, hspace=0)


biz['rnn_gentrifier']=(biz.expensive>predictions).astype(int)
plt.scatter(x=biz[biz.rnn_gentrifier==1].X,y=biz[biz.rnn_gentrifier==1].Y, s=20, c='g')
print("Prop of expensive businesses seen as gentrifiers [%.2f]" %((biz.rnn_gentrifier.sum()/biz.expensive.sum())))
print("Prop of expensive businesses seen as gentrifiers [%.2f]" %((biz.rnn_gentrifier.sum()/len(biz.expensive))))
#####################################################################################################################################
################################################################  SAVE RESULTS ######################################################
Code Example #21
#filename = "serialized_y_test_" + country + ".pck"
# filepath = os.path.join(
#    here, 'persisted_models', country, filename)
#y_test = joblib.load(filepath)

#print("loading data finished")

# the radius neighbors
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

classifier_radius = RadiusNeighborsClassifier(
    radius=5, metric='euclidean', weights='distance')
classifier_radius.set_params(outlier_label='Z')
classifier_radius.fit(X_train, y_train)
y_pred_radius = classifier_radius.predict(X_test)

print("radius prediction")
print(y_pred_radius)

print("Accuracy radius classifier")
print(confusion_matrix(y_test, y_pred_radius))
print(classification_report(y_test, y_pred_radius))

# joblib - save model to file
filenameRadiusClassifier = "serialized_radius_classifier_" + country + ".pck"

here = os.path.dirname(os.path.abspath(__file__))
filepathRadiusClassifier = os.path.join(
    here, 'persisted_models', country, filenameRadiusClassifier)
joblib.dump(classifier_radius, filepathRadiusClassifier)
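To reuse the persisted model later, joblib.load restores the fitted classifier together with its outlier label; a minimal sketch, assuming the same filepathRadiusClassifier:

# Reload the persisted classifier and predict with it directly.
restored = joblib.load(filepathRadiusClassifier)
print(restored.predict(X_test[:5]))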
Code Example #23
for i in range(0, nr_of_neighbors):
    # the id of the neighbors: neighbors[1][0][i]
    print(data_df.iloc[neighbors[1][0][i], :])

# sort by distance
# get the first n elements, as the first n closest neighbors
#outputlist = sorted(neighbors[0], key=itemgetter(0))
#print("sorted array")
# print(outputlist)

# the radius neighbors
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
classifier_radius = RadiusNeighborsClassifier(radius=5)
classifier_radius.fit(X_train, y_train)
y_pred_radius = classifier_radius.predict(X_test)

print("radius prediction")
print(y_pred_radius)

print("Accuracy radius classifier")
print(confusion_matrix(y_test, y_pred_radius))
print(classification_report(y_test, y_pred_radius))

y_pred_radius_for_one = classifier_radius.predict(new_X)

print("radius prediction for one")
print(y_pred_radius_for_one)

print("Accuracy radius classifier for one")
print(confusion_matrix(new_y, y_pred_radius_for_one))
Code Example #24
File: main.py Project: wenzhengong/salary
def par(X_tr, y_tr, X_te, r):
    neigh = RadiusNeighborsClassifier(radius=r)
    neigh.fit(X_tr, y_tr)
    y_pred = neigh.predict(X_te)
    return y_pred
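A toy call to par(), assuming two well-separated classes so that every test point has neighbors within the radius (otherwise predict raises a ValueError):

import numpy as np

X_tr = np.array([[0.0], [0.2], [5.0], [5.2]])
y_tr = np.array([0, 0, 1, 1])
X_te = np.array([[0.1], [5.1]])
print(par(X_tr, y_tr, X_te, r=0.5))  # -> [0 1]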
Code Example #25
def SequentialRadiusNeighborsClassifier(epsilon, X_train, X_test, Y_train, add,
                                        alg):
    #    size_train = len(Y_train)
    X_train_temp = np.copy(X_train)
    Y_train_temp = np.copy(Y_train)
    test_size = len(X_test)
    Y_predict = [-1 for x in range(test_size)]
    Y_current = list(set(Y_train))
    test_index = [x for x in range(test_size)]
    new_indices = []
    epsilon_update = epsilon
    #    epsilon_update = updateEpsilon(distances, test_index, choice)
    for test_time in range(test_size):
        Knn_temp = NearestNeighbors(n_neighbors=1)
        Knn_temp.fit(X_train_temp)
        min_distances = Knn_temp.kneighbors(X_test[test_index])[0]
        min_distances = [np.mean(x) for x in min_distances]
        optimal_indice = min_distances.index(min(min_distances))
        optimal_test = test_index[optimal_indice]
        clf = RadiusNeighborsClassifier(radius=epsilon_update,
                                        weights='distance').fit(
                                            X_train_temp, Y_train_temp)
        predict_set = clf.radius_neighbors(X_test[optimal_test].reshape(1,
                                                                        -1))[1]
        predict_set = list(predict_set[0])
        if len(predict_set) > 0:
            if min(Y_train_temp[predict_set]) == max(Y_train_temp[predict_set]):
                y_predict = min(Y_train_temp[predict_set])
            else:
                if alg == "srnc":
                    y_predict = clf.predict(X_test[optimal_test].reshape(
                        1, -1))
                    y_predict = y_predict[0]
                else:
                    # predict_set holds indices into the (growing) training
                    # pool, so the fallback models are fit on X_train_temp /
                    # Y_train_temp rather than on module-level X and Y.
                    if alg == "svm":
                        clf = svm.SVC().fit(X_train_temp[predict_set],
                                            Y_train_temp[predict_set])
                    if alg == "LinearSVC":
                        # clf = LinearSVC(max_iter=10000).fit(X_train_temp[predict_set], Y_train_temp[predict_set])
                        clf = LinearSVC().fit(X_train_temp[predict_set],
                                              Y_train_temp[predict_set])
                    if alg == "dt":
                        clf = DecisionTreeClassifier().fit(
                            X_train_temp[predict_set],
                            Y_train_temp[predict_set])
                    if alg == "rf":
                        clf = RandomForestClassifier(n_estimators=10).fit(
                            X_train_temp[predict_set],
                            Y_train_temp[predict_set])
                    if alg == "gb":
                        clf = GradientBoostingClassifier(n_estimators=10).fit(
                            X_train_temp[predict_set],
                            Y_train_temp[predict_set])
                    if alg == "lr":
                        clf = LogisticRegression(max_iter=10000).fit(
                            X_train_temp[predict_set],
                            Y_train_temp[predict_set])
                    if alg == "mlp":
                        clf = MLPClassifier().fit(X_train_temp[predict_set],
                                                  Y_train_temp[predict_set])
                    y_predict = clf.predict(X_test[optimal_test].reshape(
                        1, -1))
                    y_predict = y_predict[0]
            if add == 1:
                X_train_temp = np.append(X_train_temp, [X_test[optimal_test]],
                                         axis=0)
                Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)
        else:
            y_predict = max(Y_current) + 1
            Y_current.append(y_predict)
            X_train_temp = np.append(X_train_temp, [X_test[optimal_test]],
                                     axis=0)
            Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)
            new_indices.append(optimal_test)
#            epsilon_update = updateEpsilon(distances, test_index, choice)
        Y_predict[optimal_test] = y_predict
        test_index.remove(optimal_test)
    return Y_predict
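The function above relies on module-level imports (NearestNeighbors, RadiusNeighborsClassifier, and the alternative classifiers for the other alg values). A minimal sketch of a call with the plain "srnc" strategy on toy data; epsilon and the data are illustrative:

import numpy as np
from sklearn.neighbors import NearestNeighbors, RadiusNeighborsClassifier

X_train = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
Y_train = np.array([0, 0, 1, 1])
X_test = np.array([[0.05, 0.0], [5.05, 5.0], [20.0, 20.0]])
# add=1 lets accepted predictions grow the training pool; the far-away
# third point gets a brand-new label because nothing lies within epsilon.
print(SequentialRadiusNeighborsClassifier(0.5, X_train, X_test, Y_train,
                                          add=1, alg="srnc"))  # [0, 1, 2]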