class QClassifierImpl:
    """
    A wrapper for question classifier.

    Loads labelled questions through a FeatureExtractor, hashes the extracted
    features with a FeatureHasher and hands training / prediction to a
    QClassifier. Predicted class indices are translated to names through
    ``self.cate``.
    """

    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor

        :param train_data_path: path passed to ``FeatureExtractor.load`` by train()
        :param pred_qs: stored on the instance; not read by this class itself
        """
        # Configures the root logger. filemode='w' truncates qclassifier.log.
        # basicConfig is a no-op when the root logger already has handlers.
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='qclassifier.log',
            filemode='w')
        # Python 2 only: switch the process-wide default string encoding.
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        # Position in this list == class index returned by the classifier.
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train use all of the given data

        :raises ValueError: when the extractor returns a different number of
                            feature rows and labels
        """
        self.extractor.load(path=self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if len(self.labels) != len(self.features):
            raise ValueError('Got %d labels for %d feature rows'
                             % (len(self.labels), len(self.features)))
        clf = QClassifier(questions=self.extractor.questions)
        # NOTE(review): ``non_negative`` was removed from FeatureHasher in
        # newer scikit-learn releases - confirm the pinned version.
        vectorizer = FeatureHasher(input_type='string', non_negative=True)
        X = vectorizer.transform(self.features)
        Y = asarray(self.labels)
        logging.info('start training')
        clf.train(X, Y)
        logging.info('done')
        # Publish the model only after training succeeded, so get_type()
        # never sees a half-initialised classifier.
        self.clf = clf
        self.vectorizer = vectorizer

    def get_type(self, question):
        """
        Get type for a given question

        :param question: the question to classify
        :return: one of the names in ``self.cate``, or None when the model
                 has not been trained or the question is empty
        """
        # Test the trained artefacts themselves. Truthiness of the raw
        # features/labels is ambiguous for numpy arrays and raises ValueError.
        if self.clf is None or self.vectorizer is None:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        hashed = self.vectorizer.transform(
            [self.extractor.extract_features_aux(question)])
        return self.cate[self.clf.predict(hashed)[0]]
def test_extract_features_4x4_returns_correct_dimensions_and_colour(self):
    """Features of a single all-zero sample have shape (1, 15) and are all zero."""
    # Inline CSV: header line followed by one all-zero data row.
    csv_text = (
        "label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,"
        "pixel8,pixel9,pixel10,pixel11,pixel12,pixel13,pixel14,pixel15\n"
        "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0"
    )
    pixels_df = pd.read_csv(io.StringIO(csv_text))
    extractor = FeatureExtractor(logging.Logger("FeatureExtractor"), 4, 4, 1)

    actual = extractor.extract_features(pixels_df)

    self.assertEqual((1, 15), actual.shape)
    # DataFrame.compare returns an empty frame when both sides are equal.
    expected = pd.DataFrame([[0] * 15])
    differences = expected.compare(actual)
    self.assertTrue(differences.empty)
def test_extract_features_x10_full_size_returns_correct_features(self):
    """Features of the first row of Data/train.csv match the recorded values."""
    first_row_df = pd.read_csv("Data/train.csv", nrows=1)
    extractor = FeatureExtractor(logging.Logger("FeatureExtractor"), 4, 4, 2)

    actual = extractor.extract_features(first_row_df)

    self.assertEqual((1, 75), actual.shape)
    # 12 leading values followed by 63 zeros: 75 expected columns in total.
    leading = [0, 15.93750, 0, 0, 63.75, 0, 0, 63.75, 0, 1, 255, 1]
    expected_row = pd.DataFrame([leading + [0] * 63]).iloc[0, :]
    actual_row = actual.iloc[0, :]
    # Series.compare returns an empty frame when both rows are equal.
    self.assertTrue(expected_row.compare(actual_row).empty)
def test_signature(test_dir):
    """
    Extract a feature vector for every image found under *test_dir* and dump
    the resulting array to ``test_dir + "feature_dump"``.

    :param test_dir: directory searched by ``find_image_files``. The dump
                     path is built by plain string concatenation, so pass it
                     with a trailing path separator.
    """
    improc = ImageProcessor()
    ftextr = FeatureExtractor()
    img_files = find_image_files(test_dir)
    total = len(img_files)

    features = []
    for index, ifile in enumerate(img_files, start=1):
        percent = round(float(index) / total * 100, 1)
        print("Extracting features, " + str(percent) + "% done ...")
        # Close each image as soon as its features are extracted, so file
        # handles do not pile up on large directories.
        with Image.open(ifile) as signature:
            processed = improc.preprocess(signature)
            features.append(ftextr.extract_features(processed))

    # The original also allocated throw-away np.ndarray buffers indexed by
    # F[0], which crashed on an empty directory; one conversion is enough.
    np.array(features).dump(test_dir + "feature_dump")
def save_features(training_dir):
    """
    Extract features and class labels for every training image and dump them
    to ``training_dir + "feature_dump"`` and ``training_dir + "label_dump"``.

    Labels come from the image path prefix: ``dataset/TrainingSet/D`` -> 1,
    ``.../G`` -> 2, ``.../R`` -> 3, anything else -> 0.

    :param training_dir: directory searched by ``find_image_files``. The dump
                         paths are built by plain string concatenation, so
                         pass it with a trailing path separator.
    """
    print("Extracting features ...")
    improc = ImageProcessor()
    ftextr = FeatureExtractor()
    img_files = find_image_files(training_dir)
    total = len(img_files)

    # NOTE(review): these prefixes are hard-coded and independent of
    # training_dir - any other location labels every image 0. Confirm intent.
    prefix_labels = (
        ("dataset/TrainingSet/D", 1),
        ("dataset/TrainingSet/G", 2),
        ("dataset/TrainingSet/R", 3),
    )

    features = []
    labels = []
    for index, ifile in enumerate(img_files, start=1):
        done = float(index) / total * 100
        filled = int(done)
        # One write per file instead of ~100 single-character prints.
        # The emitted text is unchanged.
        print("\r[" + "|" * filled + " " * (100 - filled) + "] "
              + ifile + ", " + str(round(done, 2)) + "%", end="")
        # Close each image as soon as its features are extracted, so file
        # handles do not pile up on large training sets.
        with Image.open(ifile) as signature:
            processed = improc.preprocess(signature)
            features.append(ftextr.extract_features(processed))
        labels.append(next(
            (label for prefix, label in prefix_labels
             if ifile.startswith(prefix)),
            0))

    # The original also allocated throw-away np.ndarray buffers indexed by
    # F[0], which crashed when no images were found.
    np.array(features).dump(training_dir + "feature_dump")
    np.array(labels).dump(training_dir + "label_dump")
class QClassifierImpl:
    """
    A wrapper for question classifier.

    Uses a FeatureExtractor to load questions and build features, a
    FeatureHasher to vectorise them and a QClassifier to train and predict.
    ``self.cate`` maps a predicted class index to its category name.
    """

    def __init__(self, train_data_path, pred_qs=None):
        """
        Constructor

        :param train_data_path: path passed to ``FeatureExtractor.load`` by train()
        :param pred_qs: stored on the instance; not read by this class itself
        """
        # Configures the root logger. filemode='w' truncates qclassifier.log.
        # basicConfig is a no-op when the root logger already has handlers.
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='qclassifier.log',
            filemode='w')
        # Python 2 only: switch the process-wide default string encoding.
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        # Position in this list == class index returned by the classifier.
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train use all of the given data

        :raises ValueError: when the extractor returns a different number of
                            feature rows and labels
        """
        self.extractor.load(path=self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if len(self.labels) != len(self.features):
            raise ValueError('Got %d labels for %d feature rows'
                             % (len(self.labels), len(self.features)))
        clf = QClassifier(questions=self.extractor.questions)
        # NOTE(review): ``non_negative`` was removed from FeatureHasher in
        # newer scikit-learn releases - confirm the pinned version.
        vectorizer = FeatureHasher(input_type='string', non_negative=True)
        X = vectorizer.transform(self.features)
        Y = asarray(self.labels)
        logging.info('start training')
        clf.train(X, Y)
        logging.info('done')
        # Publish the model only after training succeeded, so get_type()
        # never sees a half-initialised classifier.
        self.clf = clf
        self.vectorizer = vectorizer

    def get_type(self, question):
        """
        Get type for a given question

        :param question: the question to classify
        :return: one of the names in ``self.cate``, or None when the model
                 has not been trained or the question is empty
        """
        # Test the trained artefacts themselves. Truthiness of the raw
        # features/labels is ambiguous for numpy arrays and raises ValueError.
        if self.clf is None or self.vectorizer is None:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        hashed = self.vectorizer.transform(
            [self.extractor.extract_features_aux(question)])
        return self.cate[self.clf.predict(hashed)[0]]
class VehicleClassifier():
    '''Linear SVC based classifier that detects vehicles in an image'''

    def __init__(self, trained_model_path=None):
        '''
        Create an untrained classifier
        :param trained_model_path: full path to the directory in which the trained model
                                   and scaler are stored
        '''
        self.trained_model_path = trained_model_path
        self.feature_extractor = None
        self.X_scaler = None
        self.clf = None
        self.trained = False

    def load_training_images(self, vehicle_path=None, non_vehicle_path=None):
        '''
        Collect the names of all training set images on disc
        :param vehicle_path: root folder of the vehicle training images
        :param non_vehicle_path: root folder of the non-vehicle training images
        :return: two lists with the paths of the vehicle and non-vehicle images respectively
        '''
        print("Loading training image names...")
        # Every .png below each root, at any depth
        vehicles = list(glob.glob(vehicle_path + '/**/*.png', recursive=True))
        non_vehicles = list(
            glob.glob(non_vehicle_path + '/**/*.png', recursive=True))

        print(' # of vehicle images: {}'.format(len(vehicles)))
        print('# of non-vehicle images: {}'.format(len(non_vehicles)))
        return vehicles, non_vehicles

    def extract_features(self, vehicles, non_vehicles):
        '''
        Load the listed images, extract their features and normalise them
        :param vehicles: list of paths to vehicle images
        :param non_vehicles: list of paths to non-vehicle images
        :return: scaled_X: normalised feature vectors, y: true labels (1 = vehicle, 0 = non-vehicle)
        '''
        self.feature_extractor = FeatureExtractor()
        print("Loading images and extracting features...")
        started = time.time()

        # Both image sets are processed with the same extraction settings
        settings = dict(
            cspace=CSPACE,
            spatial_size=(SPATIAL_SIZE, SPATIAL_SIZE),
            hist_bins=HIST_BIN,
            hist_range=HIST_RANGE,
            hog_cell_per_block=HOG_CELL_PER_BLOCK,
            hog_channel=HOG_CHANNEL,
            hog_pix_per_cell=HOG_PIX_PER_CELL,
            hog_orient=HOG_ORIENT_BINS)
        vehicle_features = self.feature_extractor.extract_features(
            vehicles, **settings)
        non_vehicle_features = self.feature_extractor.extract_features(
            non_vehicles, **settings)

        # Stack all feature vectors (vehicles first) and fit the scaler on them
        X = np.vstack(
            (vehicle_features, non_vehicle_features)).astype(np.float64)
        self.X_scaler = StandardScaler().fit(X)
        scaled_X = self.X_scaler.transform(X)

        # Labels in the same order as the stacked rows
        positives = np.ones(len(vehicle_features))
        negatives = np.zeros(len(non_vehicle_features))
        y = np.hstack((positives, negatives))

        finished = time.time()
        print('Number of features: {}'.format(scaled_X.shape[1]))
        print('Feature extraction time: {}'.format(round(finished - started, 2)))
        return scaled_X, y

    def train(self, X, y):
        '''
        Fit the classifier on a split of X / y and report accuracy on the held-out part
        :param X: array of feature vectors (normalised)
        :param y: array of true labels
        '''
        # Stratified, seeded split into training and test sets
        split = train_test_split(
            X,
            y,
            test_size=TEST_TRAIN_RATIO,
            random_state=RANDOM_STATE,
            stratify=y)
        X_train, X_test, y_train, y_test = split

        print("Training SVC...")
        self.clf = LinearSVC()
        started = time.time()
        self.clf.fit(X_train, y_train)
        finished = time.time()
        print('Classifier training time: {}'.format(round(finished - started, 2)))
        # score() refuses to run until the model is flagged as trained
        self.trained = True

        accuracy = self.score(X_test, y_test)
        print('Classifier test set accuracy: {}'.format(round(accuracy, 4)))

    def save_model(self, name):
        '''
        Write the trained model and scaler to disc
        :param name: name of the model ("_model.pkl" / "_scaler.pkl" will be added to the name)
        '''
        if not self.trained:
            print("ERROR: model not yet trained")
            return
        stem = self.trained_model_path + '/' + name
        joblib.dump(self.clf, stem + '_model.pkl')
        joblib.dump(self.X_scaler, stem + '_scaler.pkl')

    def load_model(self, name):
        '''
        Restore a trained model and scaler from disc
        :param name: name of the model ("_model.pkl" / "_scaler.pkl" will be added to the name)
        '''
        # Reset all state while keeping the configured model directory
        self.__init__(self.trained_model_path)

        stem = self.trained_model_path + '/' + name
        self.clf = joblib.load(stem + '_model.pkl')
        self.X_scaler = joblib.load(stem + '_scaler.pkl')
        self.feature_extractor = FeatureExtractor()
        self.trained = True

    def predict(self, X):
        '''
        Make predictions for the samples in X
        :param X: array of feature vectors
        :return: array of predictions (None when the model is not trained)
        '''
        if not self.trained:
            print("ERROR: model not yet trained")
            return None
        return self.clf.predict(X)

    def score(self, X, y):
        '''
        Determine the classifier accuracy given test set X and true labels y
        :param X: array of feature vectors (normalised)
        :param y: array of true labels
        :return: classifier accuracy (None when the model is not trained)
        '''
        if not self.trained:
            print("ERROR: model not yet trained")
            return None
        return self.clf.score(X, y)
import pandas as pd import csv import logging from sklearn.linear_model import LogisticRegression from sklearn import preprocessing from FeatureExtractor import FeatureExtractor if __name__ == '__main__': logger = logging.Logger("MainLogger") digit_recogniser = FeatureExtractor(logger, 28, 28,2,True) pixel_training_data_df = pd.read_csv("Data/train.csv") training_feature_df = digit_recogniser.extract_features(pixel_training_data_df) print(training_feature_df.shape) x_data = training_feature_df.iloc[:, 1:] y_data = training_feature_df.iloc[:, [0]] scaler = preprocessing.StandardScaler().fit(x_data) x_data_scaled = scaler.transform(x_data) logistic_model = LogisticRegression(random_state=0, penalty='l1', solver='saga', tol=0.1).fit(x_data_scaled, y_data.values[:, 0]) accuracy = logistic_model.score(x_data_scaled, y_data.values[:, 0]) print(f"Training completed. Accuracy Rate: {accuracy * 100}%") # create predictions for the test data now pixel_test_data_df = pd.read_csv("Data/test.csv") digit_recogniser = FeatureExtractor(logger, 28, 28, False)