class HMM(object):
    '''
    Supervised Hidden Markov Model classifier over Freeman chain codes.

    Hidden states are the ten class labels '0'..'9' and the observable
    symbols are the eight chain-code values '0'..'7'.  Every symbol of a
    training sequence is tagged with the label of the sequence it belongs to.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        states = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        symbols = ['0', '1', '2', '3', '4', '5', '6', '7']
        self.learning_model = HiddenMarkovModelTrainer(states=states,
                                                       symbols=symbols)
        # Trained tagger; stays None until training() has been called.
        self.model = None

    def _resolve_dataset(self, dataset):
        '''
        Return the 4-tuple (labeled_symbols, labeled_sequences, codes, labels).

        `dataset` is either a dataset path (loaded through get_data) or an
        already prepared 4-tuple, which is returned unchanged.
        '''
        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str  # Python 3
        if isinstance(dataset, string_types):
            return self.get_data(dataset)
        return dataset

    def generate_labelled_sequences(self, freeman_codes_dict):
        '''
        Flatten a {label: [code, ...]} mapping into training structures.

        Returns a tuple of:
          labeled_symbols   -- one list per code of (symbol, label) pairs
          labeled_sequences -- list of (code, label) pairs
          codes             -- numpy array of the codes
          labels            -- numpy array of the labels, aligned with codes
        '''
        labeled_symbols = []
        labeled_sequences = []
        for label, codes_for_label in freeman_codes_dict.items():
            for code in codes_for_label:
                labeled_symbols.append([(symbol, label) for symbol in code])
                labeled_sequences.append((code, label))

        codes = numpy.array([code for code, _ in labeled_sequences])
        labels = numpy.array([label for _, label in labeled_sequences])
        return labeled_symbols, labeled_sequences, codes, labels

    def learning_curve(self, dataset, n_iter,
                       train_sizes=numpy.linspace(0.1, 1.0, 5)):
        '''
        Collect training and 10-fold CV scores for growing dataset fractions.

        dataset     -- dataset path or prepared 4-tuple (see training())
        n_iter      -- number of repetitions per fraction
        train_sizes -- fractions (0..1] of the dataset to train on

        Returns (train_scores, cv_scores), two numpy arrays with one row per
        entry of train_sizes.
        '''
        # Load once instead of re-reading the dataset for every repetition.
        labeled_symbols, labeled_sequences, codes, labels = \
            self._resolve_dataset(dataset)
        codes = numpy.asarray(codes)
        labels = numpy.asarray(labels)
        total = len(labels)
        # The samples are grouped by label, so a plain prefix would contain
        # only a few classes; a fixed permutation keeps subsets representative
        # and reproducible.
        order = numpy.random.RandomState(0).permutation(total)

        cv_scores = []
        train_scores = []
        for fraction in train_sizes:
            # The original computed a slice here but then trained on the full
            # dataset, so every train size produced the same experiment.
            chosen = order[:int(total * fraction)]
            subset = (
                [labeled_symbols[i] for i in chosen],
                [labeled_sequences[i] for i in chosen],
                codes[chosen],
                labels[chosen],
            )
            cv_score = []
            t_score = []
            for _ in range(n_iter):
                cv_score.extend(self.training(subset, cv=10, n_iter=1))
                train_score, _test_score = self.training(subset, n_iter=1)
                t_score.extend(train_score)
            cv_scores.append(cv_score)
            train_scores.append(t_score)

        cv_scores = numpy.array(cv_scores)
        train_scores = numpy.array(train_scores)
        print(cv_scores.shape)
        print(train_scores.shape)
        return train_scores, cv_scores

    def get_data(self, dataset_path):
        '''
        Read the images under dataset_path, Freeman-encode them and return
        the 4-tuple produced by generate_labelled_sequences().
        '''
        dataset = self.dsr.read_dataset_images(dataset_path)
        freeman_codes_dict = self.fenc.encode_freeman_dataset(dataset)
        return self.generate_labelled_sequences(freeman_codes_dict)

    def training(self, dataset, cv=1, n_iter=1):
        '''
        Train the HMM and score it.

        dataset -- dataset path or prepared 4-tuple
        cv      -- when > 1, run shuffled K-fold cross validation with `cv`
                   folds and return a list of n_iter mean fold scores
        n_iter  -- number of repetitions

        When cv <= 1, runs n_iter 80/20 shuffle splits and returns
        (training_score, test_score), two lists of per-split scores.  With
        n_iter == 1 the confusion matrix of the single split is stored in
        self.ConfusionMatrix.  self.model is left holding the last model
        that was trained.
        '''
        labeled_symbols, labeled_sequence, codes, labels = \
            self._resolve_dataset(dataset)

        self.model = self.learning_model.train(labeled_symbols)

        if cv > 1:
            cv_scores = []
            for _ in range(n_iter):
                # Use the requested fold count (was hard-coded to 10).
                folds = cross_validation.KFold(len(labels), n_folds=cv,
                                               shuffle=True)
                fold_scores = []
                for train_index, test_index in folds:
                    # Index the Python list directly: the sequences have
                    # different lengths, so a numpy round-trip is fragile.
                    train_data = [labeled_symbols[i] for i in train_index]
                    test_data = [labeled_symbols[i] for i in test_index]
                    self.model = self.learning_model.train(train_data)
                    fold_scores.append(self.model.evaluate(test_data))
                cv_scores.append(numpy.mean(fold_scores))
            return cv_scores

        splits = cross_validation.ShuffleSplit(len(labels), n_iter=n_iter,
                                               test_size=0.2, random_state=0)
        training_score = []
        test_score = []
        for train_index, test_index in splits:
            train_data = [labeled_symbols[i] for i in train_index]
            test_data = [labeled_symbols[i] for i in test_index]
            self.model = self.learning_model.train(train_data)
            training_score.append(self.model.evaluate(train_data))
            test_score.append(self.model.evaluate(test_data))

        if n_iter == 1:
            # test_index still refers to the single split evaluated above.
            held_out_codes = list(codes[test_index])
            # The first tagged symbol's state is taken as the sequence label.
            predict_labels = [self.model.tag(code)[0][1]
                              for code in held_out_codes]
            self.ConfusionMatrix = ConfusionMatrix(list(labels[test_index]),
                                                   predict_labels)
        return training_score, test_score

    def predict(self, image_path):
        '''
        Predict the class of an image file or of a ready Freeman code.

        image_path -- path of an existing image file; anything else is
                      treated as an already encoded Freeman chain code

        Returns the state assigned to the sequence.  If the tagger assigns
        different states to different symbols the most frequent one wins
        (the original raised UnboundLocalError in that case).  Returns None
        when the tagger produces no states at all.
        '''
        from collections import Counter

        if os.path.isfile(image_path):
            image_array = self.dsr.read_img_bw(image_path)
            freeman_code = self.fenc.encode_freeman(image_array)
        else:
            freeman_code = image_path

        predicted_states = [state for _, state in self.model.tag(freeman_code)]
        if not predicted_states:
            return None
        return Counter(predicted_states).most_common(1)[0][0]


## TESTING CODE (WILL BE REMOVED) ##
# from HMM import HMM
# hmm = HMM()
# cv_scores = hmm.training('I:\\eclipse_workspace\\CharacterRecognition\\teams_dataset', cv=10, n_iter=50)
# train_score, test_score = hmm.training('I:\\eclipse_workspace\\CharacterRecognition\\teams_dataset', n_iter=1)
# with open('hmm_confusion_matrix.txt', 'w') as fp:
#     fp.write(hmm.ConfusionMatrix.__str__())
#
# with open("./Results/hmm.txt", 'w') as fp:
#     for i in range(len(cv_scores)):
#         text = str(cv_scores[i]) + ',' + str(train_score[i]) + ',' + str(test_score[i]) + '\n'
#         print text
#         print '--------------------------------'
#         fp.write(text)
class HMM(object):
    '''
    Supervised Hidden Markov Model classifier over Freeman chain codes.

    Hidden states are the ten class labels '0'..'9' and the observable
    symbols are the eight chain-code values '0'..'7'.  Every symbol of a
    training sequence is tagged with the label of the sequence it belongs to.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        states = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        symbols = ['0', '1', '2', '3', '4', '5', '6', '7']
        self.learning_model = HiddenMarkovModelTrainer(states=states,
                                                       symbols=symbols)
        # Trained tagger; stays None until training() has been called.
        self.model = None

    def _resolve_dataset(self, dataset):
        '''
        Return the 4-tuple (labeled_symbols, labeled_sequences, codes, labels).

        `dataset` is either a dataset path (loaded through get_data) or an
        already prepared 4-tuple, which is returned unchanged.
        '''
        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str  # Python 3
        if isinstance(dataset, string_types):
            return self.get_data(dataset)
        return dataset

    def generate_labelled_sequences(self, freeman_codes_dict):
        '''
        Flatten a {label: [code, ...]} mapping into training structures.

        Returns a tuple of:
          labeled_symbols   -- one list per code of (symbol, label) pairs
          labeled_sequences -- list of (code, label) pairs
          codes             -- numpy array of the codes
          labels            -- numpy array of the labels, aligned with codes
        '''
        labeled_symbols = []
        labeled_sequences = []
        for label, codes_for_label in freeman_codes_dict.items():
            for code in codes_for_label:
                labeled_symbols.append([(symbol, label) for symbol in code])
                labeled_sequences.append((code, label))

        codes = numpy.array([code for code, _ in labeled_sequences])
        labels = numpy.array([label for _, label in labeled_sequences])
        return labeled_symbols, labeled_sequences, codes, labels

    def learning_curve(self, dataset, n_iter,
                       train_sizes=numpy.linspace(0.1, 1.0, 5)):
        '''
        Collect training and 10-fold CV scores for growing dataset fractions.

        dataset     -- dataset path or prepared 4-tuple (see training())
        n_iter      -- number of repetitions per fraction
        train_sizes -- fractions (0..1] of the dataset to train on

        Returns (train_scores, cv_scores), two numpy arrays with one row per
        entry of train_sizes.
        '''
        # Load once instead of re-reading the dataset for every repetition.
        labeled_symbols, labeled_sequences, codes, labels = \
            self._resolve_dataset(dataset)
        codes = numpy.asarray(codes)
        labels = numpy.asarray(labels)
        total = len(labels)
        # The samples are grouped by label, so a plain prefix would contain
        # only a few classes; a fixed permutation keeps subsets representative
        # and reproducible.
        order = numpy.random.RandomState(0).permutation(total)

        cv_scores = []
        train_scores = []
        for fraction in train_sizes:
            # The original computed a slice here but then trained on the full
            # dataset, so every train size produced the same experiment.
            chosen = order[:int(total * fraction)]
            subset = (
                [labeled_symbols[i] for i in chosen],
                [labeled_sequences[i] for i in chosen],
                codes[chosen],
                labels[chosen],
            )
            cv_score = []
            t_score = []
            for _ in range(n_iter):
                cv_score.extend(self.training(subset, cv=10, n_iter=1))
                train_score, _test_score = self.training(subset, n_iter=1)
                t_score.extend(train_score)
            cv_scores.append(cv_score)
            train_scores.append(t_score)

        cv_scores = numpy.array(cv_scores)
        train_scores = numpy.array(train_scores)
        print(cv_scores.shape)
        print(train_scores.shape)
        return train_scores, cv_scores

    def get_data(self, dataset_path):
        '''
        Read the images under dataset_path, Freeman-encode them and return
        the 4-tuple produced by generate_labelled_sequences().
        '''
        dataset = self.dsr.read_dataset_images(dataset_path)
        freeman_codes_dict = self.fenc.encode_freeman_dataset(dataset)
        return self.generate_labelled_sequences(freeman_codes_dict)

    def training(self, dataset, cv=1, n_iter=1):
        '''
        Train the HMM and score it.

        dataset -- dataset path or prepared 4-tuple
        cv      -- when > 1, run shuffled K-fold cross validation with `cv`
                   folds and return a list of n_iter mean fold scores
        n_iter  -- number of repetitions

        When cv <= 1, runs n_iter 80/20 shuffle splits and returns
        (training_score, test_score), two lists of per-split scores.  With
        n_iter == 1 the confusion matrix of the single split is stored in
        self.ConfusionMatrix.  self.model is left holding the last model
        that was trained.
        '''
        labeled_symbols, labeled_sequence, codes, labels = \
            self._resolve_dataset(dataset)

        self.model = self.learning_model.train(labeled_symbols)

        if cv > 1:
            cv_scores = []
            for _ in range(n_iter):
                # Use the requested fold count (was hard-coded to 10).
                folds = cross_validation.KFold(len(labels), n_folds=cv,
                                               shuffle=True)
                fold_scores = []
                for train_index, test_index in folds:
                    # Index the Python list directly: the sequences have
                    # different lengths, so a numpy round-trip is fragile.
                    train_data = [labeled_symbols[i] for i in train_index]
                    test_data = [labeled_symbols[i] for i in test_index]
                    self.model = self.learning_model.train(train_data)
                    fold_scores.append(self.model.evaluate(test_data))
                cv_scores.append(numpy.mean(fold_scores))
            return cv_scores

        splits = cross_validation.ShuffleSplit(len(labels), n_iter=n_iter,
                                               test_size=0.2, random_state=0)
        training_score = []
        test_score = []
        for train_index, test_index in splits:
            train_data = [labeled_symbols[i] for i in train_index]
            test_data = [labeled_symbols[i] for i in test_index]
            self.model = self.learning_model.train(train_data)
            training_score.append(self.model.evaluate(train_data))
            test_score.append(self.model.evaluate(test_data))

        if n_iter == 1:
            # test_index still refers to the single split evaluated above.
            held_out_codes = list(codes[test_index])
            # The first tagged symbol's state is taken as the sequence label.
            predict_labels = [self.model.tag(code)[0][1]
                              for code in held_out_codes]
            self.ConfusionMatrix = ConfusionMatrix(list(labels[test_index]),
                                                   predict_labels)
        return training_score, test_score

    def predict(self, image_path):
        '''
        Predict the class of an image file or of a ready Freeman code.

        image_path -- path of an existing image file; anything else is
                      treated as an already encoded Freeman chain code

        Returns the state assigned to the sequence.  If the tagger assigns
        different states to different symbols the most frequent one wins
        (the original raised UnboundLocalError in that case).  Returns None
        when the tagger produces no states at all.
        '''
        from collections import Counter

        if os.path.isfile(image_path):
            image_array = self.dsr.read_img_bw(image_path)
            freeman_code = self.fenc.encode_freeman(image_array)
        else:
            freeman_code = image_path

        predicted_states = [state for _, state in self.model.tag(freeman_code)]
        if not predicted_states:
            return None
        return Counter(predicted_states).most_common(1)[0][0]


## TESTING CODE (WILL BE REMOVED) ##
# from HMM import HMM
# hmm = HMM()
# cv_scores = hmm.training('I:\\eclipse_workspace\\CharacterRecognition\\teams_dataset', cv=10, n_iter=50)
# train_score, test_score = hmm.training('I:\\eclipse_workspace\\CharacterRecognition\\teams_dataset', n_iter=1)
# with open('hmm_confusion_matrix.txt', 'w') as fp:
#     fp.write(hmm.ConfusionMatrix.__str__())
#
# with open("./Results/hmm.txt", 'w') as fp:
#     for i in range(len(cv_scores)):
#         text = str(cv_scores[i]) + ',' + str(train_score[i]) + ',' + str(test_score[i]) + '\n'
#         print text
#         print '--------------------------------'
#         fp.write(text)
class KNN(object):
    '''
    k-nearest-neighbours classifier over Freeman chain codes.

    Samples are (label, code) pairs and the distance between two codes is
    computed with edit_dist.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        # (label, code) pairs kept by knn_train() for later predictions.
        self.training_data = []

    def generate_labelled_sequences(self, freeman_codes_dict):
        '''
        Flatten a {label: [code, ...]} mapping into a list of (label, code).
        '''
        return [(label, code)
                for label, codes in freeman_codes_dict.items()
                for code in codes]

    def prepare_data(self, datas, training=None, test=None, split=0.80):
        '''
        Randomly distribute samples between a training and a test list.

        datas    -- iterable of samples
        training -- list to append training samples to (new list if None)
        test     -- list to append test samples to (new list if None)
        split    -- probability of a sample going to training (default 0.80)

        The lists are extended in place and also returned as
        (training, test).
        '''
        # Fresh lists per call: the former `[]` defaults were shared between
        # calls and silently accumulated samples.
        if training is None:
            training = []
        if test is None:
            test = []
        # Visit every sample; the old range(len(datas) - 1) dropped the last.
        for sample in datas:
            if random.random() < split:
                training.append(sample)
            else:
                test.append(sample)
        return training, test

    def get_neighbors(self, training, test_instance, k):
        '''
        Return the (at most) k training samples closest to test_instance.

        training      -- list of (label, code) pairs
        test_instance -- code to compare against
        k             -- number of neighbours wanted
        '''
        # Every training sample is a candidate (the last one used to be
        # skipped by an off-by-one range).
        distances = [(sample, edit_dist(test_instance, sample[1]))
                     for sample in training]
        distances.sort(key=operator.itemgetter(1))
        # Slicing tolerates k larger than the number of candidates.
        return [sample for sample, _ in distances[:k]]

    def get_label(self, neighbors):
        '''
        Return the most frequent label among neighbors ((label, code) pairs).

        Raises IndexError when neighbors is empty.
        '''
        counts = {}
        for neighbor in neighbors:
            counts[neighbor[0]] = counts.get(neighbor[0], 0) + 1
        ranked = sorted(counts.items(), key=operator.itemgetter(1),
                        reverse=True)
        return ranked[0][0]

    def evaluation(self, training, test, k=1):
        '''
        Print and return the accuracy (in percent) of k-NN on test.

        training -- list of (label, code) pairs used as reference
        test     -- list of (label, code) pairs to classify
        k        -- number of neighbours (default 1, the former fixed value)

        Labels are compared as integers.  An empty test set yields 0.0
        instead of a ZeroDivisionError.
        '''
        correct_count = 0
        for expected, code in test:
            neighbors = self.get_neighbors(training, code, k)
            if int(self.get_label(neighbors)) == int(expected):
                correct_count += 1
        accuracy = (float(correct_count) / len(test)) * 100 if test else 0.0
        print(accuracy)
        return accuracy

    def knn_train(self, dataset_path, train_test_split=0.8):
        '''
        Load and encode the dataset, keep the training part in
        self.training_data and, unless train_test_split is 1.0, print the
        split sizes and the accuracy on the held-out part.
        '''
        dataset = self.dsr.read_dataset_images(dataset_path)
        freeman_codes_dict = self.fenc.encode_freeman_dataset(dataset)
        labelled_sequences = self.generate_labelled_sequences(
            freeman_codes_dict)
        training = []
        test = []
        self.prepare_data(labelled_sequences, training, test,
                          split=train_test_split)
        self.training_data = training
        if train_test_split != 1.0:
            print("Training:" + str(len(training)))
            print("Test:" + str(len(test)))
            self.evaluation(training, test)

    def knn_predict_one(self, image, k=1):
        '''
        Predict the label of an image file or of a ready Freeman code.

        image -- path of an existing image file; anything else is treated
                 as an already encoded Freeman chain code
        k     -- number of neighbours to vote
        '''
        if os.path.isfile(image):
            image_array = self.dsr.read_img_bw(image)
            test = self.fenc.encode_freeman(image_array)
        else:
            test = image
        neighbors = self.get_neighbors(self.training_data, test, k)
        return self.get_label(neighbors)
class NaiveBayes(ml_alg_base):
    '''
    Gaussian Naive Bayes classifier over flattened images.

    The fitted model is persisted to ./Models/naivebayes_model.p by
    training() and reloaded from there by predict().
    '''

    def __init__(self):
        '''
        Constructor
        '''
        ml_alg_base.__init__(self)
        self.dsr = DatasetReader()
        self.learning_model = naive_bayes.GaussianNB()

    def get_data(self, dataset_path="./teams_dataset"):
        '''
        Read the dataset and return (data_set_x, data_set_y), with
        data_set_x reshaped to one flat row per sample.
        '''
        data_dict = self.dsr.read_dataset_images(dataset_path)
        _, data_set_x, data_set_y = self.dsr.gen_labelled_arrays(data_dict)
        data_set_x = data_set_x.reshape(len(data_set_x), -1)
        return data_set_x, data_set_y

    def training(self, dataset_path, cv=1):
        '''
        Fit the model on the dataset, or cross-validate it.

        dataset_path -- location of the image dataset
        cv           -- when > 1, only run cross validation with `cv` folds
                        and return the scores; nothing is fitted or saved

        With cv <= 1 the model is fitted on all data and pickled to
        ./Models/naivebayes_model.p; nothing is returned.
        '''
        dataset = self.dsr.read_dataset_images(dataset_path)
        _, images, labels = self.dsr.gen_labelled_arrays(dataset)
        images = numpy.array(images)
        # Flatten every image into a single feature row.
        data = images.reshape(len(images), -1)

        if cv > 1:
            return cross_validation.cross_val_score(self.learning_model,
                                                    data, labels, cv=cv)

        self.learning_model.fit(data, labels)
        # Context manager so the model file is flushed and closed.
        with open("./Models/naivebayes_model.p", "wb") as model_file:
            pickle.dump(self.learning_model, model_file)

    def predict(self, image_path):
        '''
        Predict the class of one image.

        image_path -- path of an image file (string) or an already loaded
                      2-D image array

        Returns the result of the model's predict() for the flattened image.
        If the saved model cannot be loaded a hint is printed and the model
        currently held in memory is used.
        '''
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load model files produced by this project.
        try:
            with open("./Models/naivebayes_model.p", "rb") as model_file:
                self.learning_model = pickle.load(model_file)
        except (IOError, OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError):
            print("Please train the Naive Bayes model first")

        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str  # Python 3

        # Was `isbasestring(image_path)`, which is not defined anywhere.
        if isinstance(image_path, string_types):
            image = self.dsr.read_img_bw(image_path)
        else:
            image = image_path

        image = image.reshape(-1, image.shape[0] * image.shape[1])
        return self.learning_model.predict(image)


# from NaiveBayes import NaiveBayes
# NB = NaiveBayes()
# # NB.training('I:\\eclipse_workspace\\CharacterRecognition\\digits_dataset_clean', cv=5)
# # print NB.predict('I:\\eclipse_workspace\\CharacterRecognition\\test1.jpg')
# data_x, data_y = NB.get_data()
# print data_x.shape, data_y.shape
# NB.first_exp(data_x, data_y, NB.learning_model, algorithm_name='NaiveBayes' ,num_iter=50)
class KNN(object):
    '''
    k-nearest-neighbours classifier over Freeman chain codes.

    Samples are (label, code) pairs and the distance between two codes is
    computed with edit_dist.
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.dsr = DatasetReader()
        self.fenc = FreemanEncoder()
        # (label, code) pairs kept by knn_train() for later predictions.
        self.training_data = []

    def generate_labelled_sequences(self, freeman_codes_dict):
        '''
        Flatten a {label: [code, ...]} mapping into a list of (label, code).
        '''
        return [(label, code)
                for label, codes in freeman_codes_dict.items()
                for code in codes]

    def prepare_data(self, datas, training=None, test=None, split=0.80):
        '''
        Randomly distribute samples between a training and a test list.

        datas    -- iterable of samples
        training -- list to append training samples to (new list if None)
        test     -- list to append test samples to (new list if None)
        split    -- probability of a sample going to training (default 0.80)

        The lists are extended in place and also returned as
        (training, test).
        '''
        # Fresh lists per call: the former `[]` defaults were shared between
        # calls and silently accumulated samples.
        if training is None:
            training = []
        if test is None:
            test = []
        # Visit every sample; the old range(len(datas) - 1) dropped the last.
        for sample in datas:
            if random.random() < split:
                training.append(sample)
            else:
                test.append(sample)
        return training, test

    def get_neighbors(self, training, test_instance, k):
        '''
        Return the (at most) k training samples closest to test_instance.

        training      -- list of (label, code) pairs
        test_instance -- code to compare against
        k             -- number of neighbours wanted
        '''
        # Every training sample is a candidate (the last one used to be
        # skipped by an off-by-one range).
        distances = [(sample, edit_dist(test_instance, sample[1]))
                     for sample in training]
        distances.sort(key=operator.itemgetter(1))
        # Slicing tolerates k larger than the number of candidates.
        return [sample for sample, _ in distances[:k]]

    def get_label(self, neighbors):
        '''
        Return the most frequent label among neighbors ((label, code) pairs).

        Raises IndexError when neighbors is empty.
        '''
        counts = {}
        for neighbor in neighbors:
            counts[neighbor[0]] = counts.get(neighbor[0], 0) + 1
        ranked = sorted(counts.items(), key=operator.itemgetter(1),
                        reverse=True)
        return ranked[0][0]

    def evaluation(self, training, test, k=1):
        '''
        Print and return the accuracy (in percent) of k-NN on test.

        training -- list of (label, code) pairs used as reference
        test     -- list of (label, code) pairs to classify
        k        -- number of neighbours (default 1, the former fixed value)

        Labels are compared as integers.  An empty test set yields 0.0
        instead of a ZeroDivisionError.
        '''
        correct_count = 0
        for expected, code in test:
            neighbors = self.get_neighbors(training, code, k)
            if int(self.get_label(neighbors)) == int(expected):
                correct_count += 1
        # Parenthesise the whole expression so the percentage (not the raw
        # fraction) is printed on both Python 2 and Python 3.
        accuracy = (float(correct_count) / len(test)) * 100 if test else 0.0
        print(accuracy)
        return accuracy

    def knn_train(self, dataset_path, train_test_split=0.8):
        '''
        Load and encode the dataset, keep the training part in
        self.training_data and, unless train_test_split is 1.0, print the
        split sizes and the accuracy on the held-out part.
        '''
        dataset = self.dsr.read_dataset_images(dataset_path)
        freeman_codes_dict = self.fenc.encode_freeman_dataset(dataset)
        labelled_sequences = self.generate_labelled_sequences(
            freeman_codes_dict)
        training = []
        test = []
        self.prepare_data(labelled_sequences, training, test,
                          split=train_test_split)
        self.training_data = training
        if train_test_split != 1.0:
            print("Training:" + str(len(training)))
            print("Test:" + str(len(test)))
            self.evaluation(training, test)

    def knn_predict_one(self, image, k=1):
        '''
        Predict the label of an image file or of a ready Freeman code.

        image -- path of an existing image file; anything else is treated
                 as an already encoded Freeman chain code
        k     -- number of neighbours to vote
        '''
        if os.path.isfile(image):
            image_array = self.dsr.read_img_bw(image)
            test = self.fenc.encode_freeman(image_array)
        else:
            test = image
        neighbors = self.get_neighbors(self.training_data, test, k)
        return self.get_label(neighbors)