class KNN_strings(object):
    '''
    classdocs
    '''

    def __init__(self, n_neighbors=1):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        self.data = []
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='auto', metric=self.lev_metric)
        
    def lev_metric(self, x, y):
        i, j = int(x[0]), int(y[0])     # extract indices
#         if self.data[i] == self.data[j]:
#             print self.data[i], self.data[j], edit_dist(self.data[i], self.data[j])
        return edit_dist(self.data[i], self.data[j])
    
    def knn_train(self, dataset, cv=1, datasplit=0.7):
        
        images_dataset= self.dsr.read_dataset_images(dataset)
        freeman_code_dict = self.fenc.encode_freeman_dataset(images_dataset)
        _, codes, labels = self.dsr.gen_labelled_arrays(freeman_code_dict)
        
        self.data = codes
        
        X = np.arange(len(self.data)).reshape(-1, 1)
        
        if cv <= 1:
            self.knn.fit(X, labels)
        elif cv > 1:
            cv_result = cross_validation.cross_val_score(self.knn, X, labels, cv=cv)
            print cv_result
            
        print 'Training Done!'
            
    def knn_predict(self, test_data, score=False):
        images_dataset= self.dsr.read_dataset_images(test_data)
        freeman_code_dict = self.fenc.encode_freeman_dataset(images_dataset)
        _, codes, labels = self.dsr.gen_labelled_arrays(freeman_code_dict)
        
        X_pred = np.arange(len(codes)).reshape(-1, 1)
        predictions = self.knn.predict(X_pred)
            
        if score == True:
            accuracy = self.knn.score(X_pred, labels)
            print "Test Accuracy: ", accuracy
        
        return predictions
    
    def knn_predict_one(self, test_image):
        image_code = self.fenc.encode_freeman(test_image)
        print image_code
        data = [image_code]
        X_pred = np.arange(len(data)).reshape(-1, 1)
        prediction = self.knn.predict(X_pred)
    
        return prediction
class NaiveBayes(ml_alg_base):
    '''
    classdocs
    '''
    def __init__(self):
        '''
        Constructor
        '''
        ml_alg_base.__init__(self)
        self.dsr = DatasetReader()
        self.learning_model = naive_bayes.GaussianNB()

    def get_data(self, dataset_path="./teams_dataset"):
        data_dict = self.dsr.read_dataset_images(dataset_path)

        _, data_set_x, data_set_y = self.dsr.gen_labelled_arrays(data_dict)
        data_set_x = data_set_x.reshape(len(data_set_x), -1)

        return data_set_x, data_set_y

    def training(self, dataset_path, cv=1):
        dataset = self.dsr.read_dataset_images(dataset_path)
        _, images, labels = self.dsr.gen_labelled_arrays(dataset)
        images = numpy.array(images)
        #reshape images for input
        data = images.reshape(len(images), -1)
        if cv <= 1:
            self.learning_model.fit(data, labels)
        elif cv > 1:
            cv_result = cross_validation.cross_val_score(self.learning_model,
                                                         data,
                                                         labels,
                                                         cv=cv)
            return cv_result

        pickle.dump(self.learning_model,
                    open("./Models/naivebayes_model.p", "wb"))

    def predict(self, image_path):
        try:
            self.learning_model = pickle.load(
                open("./Models/naivebayes_model.p", "rb"))
        except:
            print "Please train the Naive Bayes model first"

        if isbasestring(image_path):
            image = self.dsr.read_img_bw(image_path)
        else:
            image = image_path
        image = image.reshape(-1, image.shape[0] * image.shape[1])
        result = self.learning_model.predict(image)

        return result


# from NaiveBayes import NaiveBayes
# NB = NaiveBayes()
# # NB.training('I:\\eclipse_workspace\\CharacterRecognition\\digits_dataset_clean', cv=5)
# # print NB.predict('I:\\eclipse_workspace\\CharacterRecognition\\test1.jpg')
# data_x, data_y = NB.get_data()
# print data_x.shape, data_y.shape
# NB.first_exp(data_x, data_y, NB.learning_model, algorithm_name='NaiveBayes' ,num_iter=50)
class KNN_statistic(object):
    '''
    classdocs
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        self.training_data = []

    def generate_labelled_sequences(self, freeman_codes_dict):
        labelled_sequences = []
        codes_list = freeman_codes_dict.items()
        for tup in codes_list:
            for code in tup[1]:
                labelled_sequences.append((tup[0],code))

        return labelled_sequences


    def prepare_data(self, arrays_data=[], arrays_labels=[], split=0.2):
        # Separate data into 2 sets, 1 is training and 1 is test,split is the ratio (the default is 0.20)
        ad_train, ad_test, al_train, al_test = train_test_split(arrays_data, arrays_labels, test_size=split, random_state=42)
        return ad_train, ad_test, al_train, al_test


    def get_neighbors(self, data, data_label, test_instance, k):
        # Get the list of nearest neighbors to a test instance
        distances = []
        for i in range(len(data)):
            dist = edit_dist(test_instance, data[i])
            distances.append((data[i], data_label[i], dist))

        distances.sort(key=operator.itemgetter(2))
        neighbors = []
        for x in range(0, k):
            neighbors.append([distances[x][0], distances[x][1]])

        return neighbors

    def get_label(self, neighbors):
        # Determine the label of a test instance base on its nearest neighbors
        labels = {}
        for neighbor in neighbors:
            if neighbor[1] not in labels:
                labels[neighbor[1]] = 1
            else:
                labels[neighbor[1]] += 1

        sorted_labels = sorted(labels.items(), key=operator.itemgetter(1), reverse=True)

        return sorted_labels[0][0]

    def evaluation(self, data, data_for_distance_caculation, data_label, data_for_distance_calculation_label, k=3):
        # Evaluate the accuracy of knn
        correct_count = 0
        for instance in range(0, len(data)-1):
            neighbors = self.get_neighbors(data_for_distance_caculation,data_for_distance_calculation_label, data[instance], k)
            label = self.get_label(neighbors)
            if int(label) == int(data_label[instance]):
                correct_count += 1

        return (float(correct_count)/len(data))

    def knn_train(self, dataset_path, train_test_split=0.2):
        dataset = self.dsr.read_dataset_images(dataset_path)
        freeman_codes_dict = self.fenc.encode_freeman_dataset(dataset)
        _, arrays_data, arrays_label = self.dsr.gen_labelled_arrays(freeman_codes_dict)
        arrays_data, arrays_label = shuffle(arrays_data, arrays_label)
        ad_train, ad_test, al_train, al_test = self.prepare_data(arrays_data, arrays_label, split=train_test_split)

        # Cross validation with 5 folds
        kf = KFold(len(ad_train), 5)
        result = 0
        for train_index, test_index in kf:
            ad_train_kfold, ad_test_kfold = ad_train[train_index], ad_train[test_index]
            al_train_kfold, al_test_kfold = al_train[train_index], al_train[test_index]
            result += self.evaluation(ad_test_kfold, ad_train_kfold, al_test_kfold, al_train_kfold, k=2)
        result_average = result/5

        # Result with the training
        result_training = self.evaluation(ad_train, ad_train, al_train, al_train, k=2)

        # Result with the test
        result_test = self.evaluation(ad_test, ad_train, al_test, al_train, k=2)
        return result_average, result_training, result_test


# knn = KNN_strings(n_neighbors=1)
# knn = KNN_statistic()
# results = []
# for x in range(50):
#     result_average, result_training, result_test = knn.knn_train("/home/thovo/PycharmProjects/CharacterRecognition/digits_dataset", 0.2)
#     text = result_average.__str__() + " , " + result_training.__str__() + " , " + result_test.__str__() + "\n"
#     results.append(text)
# 
# 
# f = open("Results/knn.txt", "w")
# for item in results:
#     f.write(item)
# 
# f.close()
Beispiel #4
0
class KNN_statistic(object):
    '''
    classdocs
    '''
    def __init__(self):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        self.training_data = []

    def generate_labelled_sequences(self, freeman_codes_dict):
        labelled_sequences = []
        codes_list = freeman_codes_dict.items()
        for tup in codes_list:
            for code in tup[1]:
                labelled_sequences.append((tup[0], code))

        return labelled_sequences

    def prepare_data(self, arrays_data=[], arrays_labels=[], split=0.2):
        # Separate data into 2 sets, 1 is training and 1 is test,split is the ratio (the default is 0.20)
        ad_train, ad_test, al_train, al_test = train_test_split(
            arrays_data, arrays_labels, test_size=split, random_state=42)
        return ad_train, ad_test, al_train, al_test

    def get_neighbors(self, data, data_label, test_instance, k):
        # Get the list of nearest neighbors to a test instance
        distances = []
        for i in range(len(data)):
            dist = edit_dist(test_instance, data[i])
            distances.append((data[i], data_label[i], dist))

        distances.sort(key=operator.itemgetter(2))
        neighbors = []
        for x in range(0, k):
            neighbors.append([distances[x][0], distances[x][1]])

        return neighbors

    def get_label(self, neighbors):
        # Determine the label of a test instance base on its nearest neighbors
        labels = {}
        for neighbor in neighbors:
            if neighbor[1] not in labels:
                labels[neighbor[1]] = 1
            else:
                labels[neighbor[1]] += 1

        sorted_labels = sorted(labels.items(),
                               key=operator.itemgetter(1),
                               reverse=True)

        return sorted_labels[0][0]

    def evaluation(self,
                   data,
                   data_for_distance_caculation,
                   data_label,
                   data_for_distance_calculation_label,
                   k=3):
        # Evaluate the accuracy of knn
        correct_count = 0
        for instance in range(0, len(data) - 1):
            neighbors = self.get_neighbors(
                data_for_distance_caculation,
                data_for_distance_calculation_label, data[instance], k)
            label = self.get_label(neighbors)
            if int(label) == int(data_label[instance]):
                correct_count += 1

        return (float(correct_count) / len(data))

    def knn_train(self, dataset_path, train_test_split=0.2):
        dataset = self.dsr.read_dataset_images(dataset_path)
        freeman_codes_dict = self.fenc.encode_freeman_dataset(dataset)
        _, arrays_data, arrays_label = self.dsr.gen_labelled_arrays(
            freeman_codes_dict)
        arrays_data, arrays_label = shuffle(arrays_data, arrays_label)
        ad_train, ad_test, al_train, al_test = self.prepare_data(
            arrays_data, arrays_label, split=train_test_split)

        # Cross validation with 5 folds
        kf = KFold(len(ad_train), 5)
        result = 0
        for train_index, test_index in kf:
            ad_train_kfold, ad_test_kfold = ad_train[train_index], ad_train[
                test_index]
            al_train_kfold, al_test_kfold = al_train[train_index], al_train[
                test_index]
            result += self.evaluation(ad_test_kfold,
                                      ad_train_kfold,
                                      al_test_kfold,
                                      al_train_kfold,
                                      k=2)
        result_average = result / 5

        # Result with the training
        result_training = self.evaluation(ad_train,
                                          ad_train,
                                          al_train,
                                          al_train,
                                          k=2)

        # Result with the test
        result_test = self.evaluation(ad_test,
                                      ad_train,
                                      al_test,
                                      al_train,
                                      k=2)
        return result_average, result_training, result_test


# knn = KNN_strings(n_neighbors=1)
# knn = KNN_statistic()
# results = []
# for x in range(50):
#     result_average, result_training, result_test = knn.knn_train("/home/thovo/PycharmProjects/CharacterRecognition/digits_dataset", 0.2)
#     text = result_average.__str__() + " , " + result_training.__str__() + " , " + result_test.__str__() + "\n"
#     results.append(text)
#
#
# f = open("Results/knn.txt", "w")
# for item in results:
#     f.write(item)
#
# f.close()
Beispiel #5
0
class KNN_strings(object):
    '''
    classdocs
    '''
    def __init__(self, n_neighbors=1):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        self.data = []
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors,
                                        algorithm='auto',
                                        metric=self.lev_metric)

    def lev_metric(self, x, y):
        i, j = int(x[0]), int(y[0])  # extract indices
        #         if self.data[i] == self.data[j]:
        #             print self.data[i], self.data[j], edit_dist(self.data[i], self.data[j])
        return edit_dist(self.data[i], self.data[j])

    def knn_train(self, dataset, cv=1, datasplit=0.7):

        images_dataset = self.dsr.read_dataset_images(dataset)
        freeman_code_dict = self.fenc.encode_freeman_dataset(images_dataset)
        _, codes, labels = self.dsr.gen_labelled_arrays(freeman_code_dict)

        self.data = codes

        X = np.arange(len(self.data)).reshape(-1, 1)

        if cv <= 1:
            self.knn.fit(X, labels)
        elif cv > 1:
            cv_result = cross_validation.cross_val_score(self.knn,
                                                         X,
                                                         labels,
                                                         cv=cv)
            print cv_result

        print 'Training Done!'

    def knn_predict(self, test_data, score=False):
        images_dataset = self.dsr.read_dataset_images(test_data)
        freeman_code_dict = self.fenc.encode_freeman_dataset(images_dataset)
        _, codes, labels = self.dsr.gen_labelled_arrays(freeman_code_dict)

        X_pred = np.arange(len(codes)).reshape(-1, 1)
        predictions = self.knn.predict(X_pred)

        if score == True:
            accuracy = self.knn.score(X_pred, labels)
            print "Test Accuracy: ", accuracy

        return predictions

    def knn_predict_one(self, test_image):
        image_code = self.fenc.encode_freeman(test_image)
        print image_code
        data = [image_code]
        X_pred = np.arange(len(data)).reshape(-1, 1)
        prediction = self.knn.predict(X_pred)

        return prediction