def create_and_save(self):
    """
    Create and save the dataset using the arguments passed in from the
    console.

    This will shuffle the validation data, create a classifier, and select
    the images whose predicted class matches the expected class. The
    resulting list of correctly classified images is then saved to an NPY
    file.

    Console arguments:
        -rs / --random-state: seed used to shuffle the dataset
            (default 1024).
        -n / --number: number of correct classifications to store
            (default 100).

    The output is written to ``data/<name>_<number>_correct.npy`` as an
    object array with one row per kept image: column 0 holds the
    un-preprocessed 224x224 image array, column 1 the integer class index.
    Being an object array, it presumably has to be read back with
    ``np.load(..., allow_pickle=True)``.

    :return: None
    """
    parser = argparse.ArgumentParser()
    # Let argparse do the integer conversion so a bad value produces a
    # normal usage error instead of a traceback.
    parser.add_argument('-rs', '--random-state', type=int, default=1024,
                        help='Random state for the shuffling.')
    parser.add_argument('-n', '--number', type=int, default=100,
                        help='Number of correct classifications to store.')
    arguments = vars(parser.parse_args())

    random_state = arguments['random_state']
    number = arguments['number']

    print('')
    print('Starting %s Image and Label Extraction...' % self.name)
    print('')
    print('Parameters:')
    print(' Random State: %s' % random_state)
    print(' Number: %s' % number)
    print('')

    use_project_path()

    # Load and shuffle the validation dataset
    image_df = pd.read_csv('data/full_image_dataset.csv')
    image_df = image_df.sample(
        frac=1.0, random_state=random_state).reset_index(drop=True)

    # Create the CNN to be tested
    model = self.network(weights='imagenet')

    accumulator = []

    # Step through the dataframe and keep all images correctly classified.
    for _, row in image_df.iterrows():
        scaled_image = image.img_to_array(
            image.load_img(row['image'], target_size=(224, 224)))
        # Preprocess a copy so the stored image stays untouched.
        batch = self.preprocessor(
            np.expand_dims(scaled_image.copy(), axis=0))
        raw_predictions = model.predict(batch)

        expected_class = int(self.class_index[row['label']])
        if expected_class == int(np.argmax(raw_predictions)):
            accumulator.append([scaled_image, expected_class])

        if len(accumulator) >= number:
            break

    # Each entry pairs an image array with a scalar label. np.array() on
    # such ragged data raises ValueError on NumPy >= 1.24, so fill an
    # explicit (n, 2) object array instead.
    stored = np.empty((len(accumulator), 2), dtype=object)
    for position, (kept_image, kept_label) in enumerate(accumulator):
        stored[position, 0] = kept_image
        stored[position, 1] = kept_label

    np.save('data/%s_%s_correct.npy' % (self.name, number), stored)
# NOTE(review): this fragment appears to be cut at both ends. ``argparse`` is
# used below but its import is not visible here, and the script body stops
# right after the first ``print('')``.
from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input as vgg19_preprocess_input
from keras.applications.densenet import DenseNet201
from keras.applications.densenet import preprocess_input as densenet_preprocess_input
from keras.applications.resnet_v2 import ResNet152V2
from keras.applications.resnet_v2 import preprocess_input as resnet_preprocess_input
import numpy as np
from filter import FourierUniformFilter
from utility import OptimizationSearch
from utility import use_project_path
from utility import save_filter_search_scores

if __name__ == '__main__':
    use_project_path()

    # Command-line options: network name, sample file selector and the
    # number of iterations.
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--network', default='vgg16', help='The network architecture to test: vgg16, vgg19, densenet, or resnet')
    parser.add_argument('-s', '--sample', default=100, help='The sample file to use for testing.')
    parser.add_argument('-i', '--iterations', default=25, help='The number of iterations to use for testing.')
    arguments = vars(parser.parse_args())

    NETWORK = arguments['network']
    # SAMPLE is left unconverted: the int default 100 when omitted, a string
    # when given on the command line.
    SAMPLE = arguments['sample']
    # ITERATIONS is always coerced to int.
    ITERATIONS = int(arguments['iterations'])

    print('')
def load_data_frame():
    """
    Read the CSV named by the module-level ``data_filename`` into a
    DataFrame whose columns are all parsed as floats.

    ``use_project_path()`` is called first, before the file is opened.

    :return: the loaded ``pandas.DataFrame``
    """
    use_project_path()
    return pd.read_csv(data_filename, dtype='float')
def run_classification_search_experiment(
        self,
        scoring,
        sample=None,
        random_state=None,
        test_size=0.25,
        n_jobs=-1,
        n_iter=2,
        cv=5,
        verbose=3,
        multiclass=False,
        record_predict_proba=False):
    """
    The classification search makes use of a bayesian search to find the
    best hyper-parameters.

    The data held in ``self.df`` is optionally sub-sampled, split into a
    training and a holdout partition, searched with ``BayesSearchCV`` over
    ``self.hyper_parameters.search_space`` and then evaluated on both
    partitions. Results are logged to ``<name>.txt``, holdout predictions
    are saved to ``<name>_predict.p`` (and ``<name>_predict_proba.p`` when
    requested), and the best parameters are saved to ``<name>_params.p``.

    :param scoring: scoring argument forwarded to ``BayesSearchCV``.
    :param sample: optional number of rows to draw from ``self.df``.
    :param random_state: seed used for the optional sub-sample and for the
        train/holdout split, so a seeded run is reproducible.
    :param test_size: holdout size forwarded to ``train_test_split``.
    :param n_jobs: forwarded to ``BayesSearchCV``.
    :param n_iter: forwarded to ``BayesSearchCV``.
    :param cv: forwarded to ``BayesSearchCV``.
    :param verbose: forwarded to ``BayesSearchCV``.
    :param multiclass: forwarded to the evaluator.
    :param record_predict_proba: when true, holdout probabilities are also
        predicted, saved and passed to the evaluator.
    :return: None. ``self.hyper_parameters.params`` and
        ``self.trained_estimator`` are updated with the search winner.
    """
    use_project_path()

    logger = Logger('%s.txt' % self.name)
    # Close the log even when the search or evaluation raises.
    try:
        search = BayesSearchCV(
            self.estimator,
            self.hyper_parameters.search_space,
            n_jobs=n_jobs,
            n_iter=n_iter,
            cv=cv,
            verbose=verbose,
            scoring=scoring,
            return_train_score=True
        )

        data_frame = self.df
        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # NOTE(review): the feature partitions are the full frame, target
        # column included; presumably the estimator selects its own
        # columns. Confirm against the estimator definition.
        # The seed is forwarded so the holdout split is reproducible.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame,
            data_frame[self.target],
            test_size=test_size,
            random_state=random_state)

        logger.time_log('Starting HyperParameter Search...')
        results = search.fit(x_train, y_train)
        logger.time_log('Search Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(results.best_estimator_, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(results.best_estimator_, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(
                results.best_estimator_, x_test)
            test_proba_evaluation_frame = EvaluationFrame(
                y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save(
                '%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            results,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass
        )
    finally:
        logger.close()

    self.hyper_parameters.params = results.best_params_
    self.hyper_parameters.save('%s_params.p' % self.name)

    self.trained_estimator = results.best_estimator_
def run_classification_experiment(
        self,
        sample=None,
        random_state=None,
        test_size=0.25,
        multiclass=False,
        record_predict_proba=False):
    """
    Running a classification experiment is used when only a single model
    run and fit is necessary.

    The data held in ``self.df`` is optionally sub-sampled and split into a
    training and a holdout partition. ``self.estimator`` is configured from
    ``self.hyper_parameters`` (when present), fitted on the training
    partition and evaluated on both partitions. Results are logged to
    ``<name>.txt`` and holdout predictions are saved to
    ``<name>_predict.p`` (and ``<name>_predict_proba.p`` when requested).

    :param sample: optional number of rows to draw from ``self.df``.
    :param random_state: seed used for the optional sub-sample and for the
        train/holdout split, so a seeded run is reproducible.
    :param test_size: holdout size forwarded to ``train_test_split``.
    :param multiclass: forwarded to the evaluator.
    :param record_predict_proba: when true, holdout probabilities are also
        predicted, saved and passed to the evaluator.
    :return: None
    """
    use_project_path()

    logger = Logger('%s.txt' % self.name)
    # Close the log even when fitting or evaluation raises.
    try:
        data_frame = self.df
        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # NOTE(review): the feature partitions are the full frame, target
        # column included; presumably the estimator selects its own
        # columns. Confirm against the estimator definition.
        # The seed is forwarded so the holdout split is reproducible.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame,
            data_frame[self.target],
            test_size=test_size,
            random_state=random_state)

        if self.hyper_parameters is not None:
            self.estimator.set_params(**self.hyper_parameters.params)

        logger.time_log('Training Model...')
        self.estimator.fit(x_train, y_train)
        logger.time_log('Training Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(self.estimator, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(self.estimator, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(
                self.estimator, x_test)
            test_proba_evaluation_frame = EvaluationFrame(
                y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save(
                '%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            self.estimator,
            test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass
        )
    finally:
        logger.close()

    if self.hyper_parameters is not None:
        self.hyper_parameters.save('%s_params.p' % self.name)
def run_classification_experiment(self, sample=None, random_state=None,
                                  test_size=0.20, multiclass=False,
                                  record_predict_proba=False, sampling=None,
                                  cv=5, verbose=True, transformer=None,
                                  fit_increment=None, warm_start=False,
                                  max_iters=None, n_jobs=-1):
    """
    Fit and evaluate ``self.estimator`` once, with optional re-sampling,
    transformation, cross validation and incremental fitting.

    Steps, in order: optional sub-sample of ``self.df``; train/holdout
    split; optional transformer fit on the training partition; optional
    re-sampling and shuffling of the training partition; optional
    cross validation of clones of the estimator; the final fit; evaluation
    on the training and holdout partitions. Results are logged to
    ``<name>.txt`` and holdout predictions are saved to
    ``<name>_predict.p`` (and ``<name>_predict_proba.p`` when requested).

    :param sample: optional number of rows to draw from ``self.df``.
    :param random_state: seed used for the sub-sample, the train/holdout
        split and the shuffles; also forwarded to the fold workers.
    :param test_size: holdout size forwarded to ``train_test_split``.
    :param multiclass: forwarded to the evaluator.
    :param record_predict_proba: when true, holdout probabilities are also
        predicted, saved and passed to the evaluator.
    :param sampling: optional object exposing ``fit_resample(x, y)`` that
        is applied to the training partition.
    :param cv: number of stratified folds, or None to skip cross
        validation.
    :param verbose: forwarded to the batch fit / predict helpers and the
        fold workers.
    :param transformer: optional object exposing ``fit`` / ``transform``.
    :param fit_increment: when set, the estimator is fitted through
        ``batch_fit_classifier`` with this increment.
    :param warm_start: forwarded to the fold workers.
    :param max_iters: number of incremental passes over the training data
        (only used together with ``fit_increment``).
    :param n_jobs: worker count for the parallel cross validation.
    :return: None. ``self.trained_estimator`` is set to the fitted
        estimator.
    """
    use_project_path()

    logger = Logger('%s.txt' % self.name)
    # Close the log even when fitting or evaluation raises.
    try:
        evaluator = Evaluator(logger)

        data_frame = self.df
        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # NOTE(review): the feature partitions are the full frame, target
        # column included; presumably the estimator/transformer selects its
        # own columns. Confirm against their definitions.
        # The seed is forwarded so the holdout split is reproducible.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target], test_size=test_size,
            random_state=random_state)

        if transformer is not None:
            logger.time_log('Fitting Transformer...')
            transformer.fit(x_train)
            logger.time_log('Transformer Fit Complete.\n')

        if sampling is not None:
            logger.time_log('Starting Data Re-Sampling...')
            logger.log('Original Training Shape is %s' % Counter(y_train))
            x_new, y_new = sampling.fit_resample(x_train, y_train)
            logger.log('Balanced Training Shape is %s' % Counter(y_new))
            # Restore the column labels if the resampler returned a bare
            # array.
            if hasattr(x_train, 'columns'):
                x_new = pd.DataFrame(x_new, columns=x_train.columns)
            x_train, y_train = x_new, y_new
            logger.time_log('Re-Sampling Complete.\n')
            logger.time_log('Shuffling Re-Sampled Data.\n')
            x_train, y_train = shuffle(
                x_train, y_train, random_state=random_state)
            logger.time_log('Shuffling Complete.\n')

        if self.hyper_parameters is not None:
            self.estimator.set_params(**self.hyper_parameters.params)

        fold_scores = None
        if cv is not None:
            # No random_state here: without shuffle=True the seed never
            # affected the folds, and scikit-learn >= 0.24 raises
            # ValueError for that combination.
            kfold = StratifiedKFold(n_splits=cv)
            logger.time_log('Cross Validating Model...')
            fold_scores = Parallel(n_jobs=n_jobs, verbose=3)(
                delayed(crossfold_classifier)(
                    clone(self.estimator), transformer, x_train, y_train,
                    train_index, test_index, record_predict_proba, verbose,
                    fit_increment, warm_start, max_iters, random_state)
                for train_index, test_index in kfold.split(x_train, y_train))
            logger.time_log('Cross Validation Complete.\n')

        logger.time_log('Training Model...')
        if fit_increment is not None:
            if max_iters is not None:
                # NOTE(review): with an integer random_state every pass
                # gets the same permutation; confirm whether a per-pass
                # seed was intended.
                for _ in range(max_iters):
                    x_iter_train, y_iter_train = shuffle(
                        x_train, y_train, random_state=random_state)
                    batch_fit_classifier(self.estimator, x_iter_train,
                                         y_iter_train,
                                         transformer=transformer,
                                         increment=fit_increment,
                                         verbose=verbose)
            else:
                batch_fit_classifier(self.estimator, x_train, y_train,
                                     transformer=transformer,
                                     increment=fit_increment,
                                     verbose=verbose)
        else:
            if transformer is not None:
                x_train_transformed = transformer.transform(x_train)
                self.estimator.fit(x_train_transformed, y_train)
            else:
                self.estimator.fit(x_train, y_train)
        logger.time_log('Training Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(self.estimator, x_train,
                                        transformer=transformer,
                                        verbose=verbose)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(self.estimator, x_test,
                                       transformer=transformer,
                                       verbose=verbose)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(
                self.estimator, x_test, transformer=transformer,
                verbose=verbose)
            test_proba_evaluation_frame = EvaluationFrame(
                y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save(
                '%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        if cv is not None:
            evaluator.evaluate_fold_scores(fold_scores)

        evaluator.evaluate_classifier_result(
            self.estimator, test_evaluation_frame,
            train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame,
            multiclass=multiclass)
    finally:
        logger.close()

    if self.hyper_parameters is not None:
        self.hyper_parameters.save('%s_params.p' % self.name)

    self.trained_estimator = self.estimator
def run_classification_search_experiment(self, scoring, sample=None,
                                         random_state=None, test_size=0.20,
                                         n_jobs=-1, n_iter=2, cv=5,
                                         verbose=3, multiclass=False,
                                         record_predict_proba=False,
                                         sampling=None):
    """
    Search for the best hyper-parameters with ``BayesSearchCV`` and
    evaluate the winning estimator.

    The data held in ``self.df`` is optionally sub-sampled and split into a
    training and a holdout partition. The training partition is optionally
    re-sampled and shuffled, searched over
    ``self.hyper_parameters.search_space``, and the best estimator is
    evaluated on both partitions. Results are logged to ``<name>.txt``,
    holdout predictions are saved to ``<name>_predict.p`` (and
    ``<name>_predict_proba.p`` when requested), and the best parameters
    are saved to ``<name>_params.p``.

    :param scoring: scoring argument forwarded to ``BayesSearchCV``.
    :param sample: optional number of rows to draw from ``self.df``.
    :param random_state: seed used for the sub-sample, the train/holdout
        split and the post-resampling shuffle.
    :param test_size: holdout size forwarded to ``train_test_split``.
    :param n_jobs: forwarded to ``BayesSearchCV``.
    :param n_iter: forwarded to ``BayesSearchCV``.
    :param cv: forwarded to ``BayesSearchCV``.
    :param verbose: forwarded to ``BayesSearchCV``.
    :param multiclass: forwarded to the evaluator.
    :param record_predict_proba: when true, holdout probabilities are also
        predicted, saved and passed to the evaluator.
    :param sampling: optional object exposing ``fit_resample(x, y)`` that
        is applied to the training partition.
    :return: None. ``self.hyper_parameters.params`` and
        ``self.trained_estimator`` are updated with the search winner.
    """
    use_project_path()

    logger = Logger('%s.txt' % self.name)
    # Close the log even when the search or evaluation raises.
    try:
        search = BayesSearchCV(self.estimator,
                               self.hyper_parameters.search_space,
                               n_jobs=n_jobs, n_iter=n_iter, cv=cv,
                               verbose=verbose, scoring=scoring,
                               return_train_score=True)

        data_frame = self.df
        if sample is not None:
            data_frame = data_frame.sample(n=sample, random_state=random_state)

        # NOTE(review): the feature partitions are the full frame, target
        # column included; presumably the estimator selects its own
        # columns. Confirm against the estimator definition.
        # The seed is forwarded so the holdout split is reproducible.
        x_train, x_test, y_train, y_test = train_test_split(
            data_frame, data_frame[self.target], test_size=test_size,
            random_state=random_state)

        if sampling is not None:
            logger.time_log('Starting Data Re-Sampling...')
            logger.log('Original Training Shape is %s' % Counter(y_train))
            x_new, y_new = sampling.fit_resample(x_train, y_train)
            logger.log('Balanced Training Shape is %s' % Counter(y_new))
            # Restore the column labels if the resampler returned a bare
            # array.
            if hasattr(x_train, 'columns'):
                x_new = pd.DataFrame(x_new, columns=x_train.columns)
            x_train, y_train = x_new, y_new
            logger.time_log('Re-Sampling Complete.\n')
            logger.time_log('Shuffling Re-Sampled Data.\n')
            x_train, y_train = shuffle(
                x_train, y_train, random_state=random_state)
            logger.time_log('Shuffling Complete.\n')

        logger.time_log('Starting HyperParameter Search...')
        results = search.fit(x_train, y_train)
        logger.time_log('Search Complete.\n')

        logger.time_log('Testing Training Partition...')
        y_train_predict = batch_predict(results.best_estimator_, x_train)
        logger.time_log('Testing Complete.\n')

        train_evaluation_frame = EvaluationFrame(y_train, y_train_predict)

        logger.time_log('Testing Holdout Partition...')
        y_test_predict = batch_predict(results.best_estimator_, x_test)
        logger.time_log('Testing Complete.\n')

        test_evaluation_frame = EvaluationFrame(y_test, y_test_predict)
        test_evaluation_frame.save('%s_predict.p' % self.name)

        test_proba_evaluation_frame = None
        if record_predict_proba:
            logger.time_log('Testing Holdout Partition (probability)...')
            y_test_predict_proba = batch_predict_proba(
                results.best_estimator_, x_test)
            test_proba_evaluation_frame = EvaluationFrame(
                y_test, y_test_predict_proba)
            test_proba_evaluation_frame.save(
                '%s_predict_proba.p' % self.name)
            logger.time_log('Testing Complete.\n')

        evaluator = Evaluator(logger)
        evaluator.evaluate_classifier_result(
            results, test_evaluation_frame, train=train_evaluation_frame,
            test_proba=test_proba_evaluation_frame, multiclass=multiclass)
    finally:
        logger.close()

    self.hyper_parameters.params = results.best_params_
    self.hyper_parameters.save('%s_params.p' % self.name)

    self.trained_estimator = results.best_estimator_