def run(self):
    """
    Train a support vector classifier on the extracted features and save it.

    Features are extracted from ``self.data``; every row whose category
    column equals the string '0' is labelled False, all others True.

    :return: None
    """
    # extract features from the data frame
    features = feature_extractions.feature_extraction(
        self.data,
        input_parameters=self.image_col_name,
        method=self.feature_method,
        extraction_options=self.feature_options,
        image_process_options=self.image_processing_options_for_feature,
    )

    # build the label column: start with all True, then clear the '0' rows
    target = pandas.DataFrame(numpy.repeat([True], len(features)))
    is_negative = self.data[self.category_col_name].values == '0'
    target[is_negative] = False

    # initialize classifier
    print("Initializing classifier")
    svc_settings = dict(
        kernel=self.kernel,
        gamma=self.gamma,
        C=self.C,
        verbose=True,
        probability=True,
    )
    classifier = sklearn.svm.SVC(**svc_settings)

    # train the classifier on the flattened label vector
    print("Training classifier")
    labels = target.values.ravel()
    classifier.fit(features, labels)
    print()

    # save the classifier to be loaded later
    # NOTE(review): `classifier_output` is not defined inside this method;
    # presumably a module-level name -- confirm against the rest of the file.
    print("Saving classifier to file, " + str(classifier_output))
    sklearn.externals.joblib.dump(classifier, classifier_output)
def run(self):
    """
    Train a multi-layer perceptron classifier and save it to file.

    Images are optionally preprocessed first (when
    ``self.image_processing_options`` is truthy), features are extracted,
    and every row whose category column equals the string '0' is labelled
    False while all others are labelled True.

    :return: None
    """
    # optional image preprocessing
    if not self.image_processing_options:
        processed_data = self.data
    else:
        print("Preprocessing images")
        image_df = kaggle_reader.load_all_raw_images(self.raw_image_path)
        processed_data = image_processing.process_roi_extractions(
            image_df, self.data, self.image_processing_options)

    # extract features from the (possibly preprocessed) data frame
    print("Extracting features")
    features = feature_extractions.feature_extraction(
        processed_data,
        input_parameters=self.image_col_name,
        method=self.feature_method,
        extraction_options=self.feature_options,
    )

    # build the label column: start with all True, then clear the '0' rows
    target = pandas.DataFrame(numpy.repeat([True], len(features)))
    is_negative = self.data[self.category_col_name].values == '0'
    target[is_negative] = False

    # initialize classifier
    print("Initializing classifier")
    mlp_settings = dict(
        solver=self.solver,
        alpha=self.alpha,
        tol=self.tol,
        max_iter=self.max_iter,
        hidden_layer_sizes=self.hidden_layer_sizes,
        random_state=self.random_state,
        verbose=True,
    )
    classifier = sklearn.neural_network.MLPClassifier(**mlp_settings)
    print(classifier)
    print()

    # train the classifier on the flattened label vector
    print("Training classifier")
    labels = target.values.ravel()
    classifier.fit(features, labels)
    print()

    # save the classifier to be loaded later
    print("Saving classifier: " + str(self.classifier_pkl))
    sklearn.externals.joblib.dump(classifier, self.classifier_pkl)
    return None
def run(self):
    """
    Train a support vector classifier and save it to file.

    Images are optionally preprocessed first (when
    ``self.image_processing_options`` is truthy), features are extracted,
    and every row whose category column equals the string '0' is labelled
    False while all others are labelled True.

    :return: None
    """
    # optional image preprocessing
    if not self.image_processing_options:
        processed_data = self.data
    else:
        print('Preprocessing images')
        image_df = kaggle_reader.load_all_raw_images(self.raw_image_path)
        processed_data = image_processing.process_roi_extractions(
            image_df, self.data, self.image_processing_options)

    # extract features from the (possibly preprocessed) data frame
    print('Extracting features')
    features = feature_extractions.feature_extraction(
        processed_data,
        input_parameters=self.image_col_name,
        method=self.feature_method,
        extraction_options=self.feature_options,
    )

    # build the label column: start with all True, then clear the '0' rows
    target = pandas.DataFrame(numpy.repeat([True], len(features)))
    is_negative = self.data[self.category_col_name].values == '0'
    target[is_negative] = False

    # initialize classifier
    print("Initializing classifier")
    svc_settings = dict(
        kernel=self.kernel,
        gamma=self.gamma,
        C=self.C,
        verbose=True,
    )
    classifier = sklearn.svm.SVC(**svc_settings)

    # train the classifier on the flattened label vector
    print("Training classifier")
    labels = target.values.ravel()
    classifier.fit(features, labels)
    print()

    # save the classifier to be loaded later
    print("Saving classifier: " + str(self.classifier_pkl))
    sklearn.externals.joblib.dump(classifier, self.classifier_pkl)
def run(self):
    """
    Evaluate a previously trained classifier and report the results.

    The classifier is loaded from ``self.classifier_pkl``; if loading raises
    OSError a hint is printed and the method returns early. Predictions are
    compared against labels derived from the category column (rows equal to
    the string '0' are False, all others True). When
    ``self.prediction_output_dir`` is not None the correct and incorrect
    predictions are saved as images into 'correct' and 'incorrect'
    sub-directories.

    :return: None
    """
    # loading pre-trained classifier
    print("Loading external classifier: " + self.classifier_pkl)
    try:
        external_classifier = sklearn.externals.joblib.load(self.classifier_pkl)
    except OSError:
        print('Please check the file location for the trained classifier.')
        print('Tried: ' + self.classifier_pkl)
        return

    # optional image preprocessing
    if not self.image_processing_options:
        processed_data = self.data
    else:
        print('Preprocessing images')
        image_df = kaggle_reader.load_all_raw_images(self.raw_image_path)
        processed_data = image_processing.process_roi_extractions(
            image_df, self.data, self.image_processing_options)

    # extract features from the (possibly preprocessed) data frame
    print('Extracting features')
    features = feature_extractions.feature_extraction(
        processed_data,
        input_parameters=self.image_col_name,
        method=self.feature_method,
        extraction_options=self.feature_options,
    )

    # run prediction
    print('Running prediction')
    predicted = external_classifier.predict(features)

    # expected labels: start with all True, then clear the '0' rows
    print('Organizing expected results')
    expected = pandas.DataFrame(numpy.repeat([True], len(features)))
    is_negative = self.data[self.category_col_name].values == '0'
    expected[is_negative] = False

    # hand everything to the analysis helper
    analyze = analyze_predictions.AnalyzePredictions()
    analyze.set_expected(pandas.DataFrame(expected))
    analyze.set_predicted(pandas.DataFrame(predicted))
    analyze.set_test_set(self.data)

    # report header followed by the classifier description
    print()
    for banner_line in ("########################", "# CLASSIFIER", "#"):
        print(banner_line)
    print()
    print(external_classifier)
    print()

    analyze.print_summary_results()

    # saving results to disk is optional
    if self.prediction_output_dir is None:
        return

    print('Saving predictions to disk')
    correct = analyze.get_correct_predictions()
    incorrect = analyze.get_incorrect_predictions()
    for subdir, predictions in (('correct', correct), ('incorrect', incorrect)):
        analyze.save_predictions_as_images(
            predictions, os.path.join(self.prediction_output_dir, subdir))

    return
def run_moving_window(classifier, image_array, feature_method, feature_options,
                      image_options, window_sizes, step_sizes, nms_threshold,
                      plot=False, padding="constant"):
    """
    Run the moving window approach

    :param classifier: classifier to use for prediction
    :param image_array: input image data
    :param feature_method: the feature to extract
    :param feature_options: options for feature extraction, e.g., for
        Gaussian kernel this is the kernel sigmas. Current options include:
        'gkhp': Gaussian kernel Hadamard product feature, requires a list of
            Gaussian kernel sigmas as extraction options
        'hog': Histogram of gradients, requires options in a dict if
            non-default options are desired
        'pixelval': extracting pixel values as features, no options needed
            (use None as input)
    :param image_options: options for preprocessing images before feature
        extractions, e.g., {'rgb2gray': None} to convert RGB image into
        grayscale. If no option required, use {} (empty dict)
    :param window_sizes: sliding window size (nrow, ncol)
    :param step_sizes: step size for the sliding window (nrow, ncol)
    :param nms_threshold: Threshold parameter for the non-maximum suppression
        algorithm specifying the maximum allowable overlap. Suppression is
        only applied when this value is below 1.
    :param plot: when truthy, display the original image with the reduced
        and the full set of positive boxes
    :param padding: passed through to the sub-image extraction as its
        ``padding`` argument
    :return: tuple ``(positive_boxes, reduced_positive_boxes)``. On the fast
        path (non-SVC classifier with 'pixelval' features) no suppression is
        applied and the same filtered boxes are returned for both elements.
    """
    # classifiers are told apart by class name so sklearn need not be
    # referenced here
    is_svc = type(classifier).__name__ == "SVC"

    # preprocess the image
    print("Preprocess image")
    processed_image = image_processing.process_image(image_array, image_options)

    # cut the image into sliding-window sub images
    print("Extracting sub images", end=" ")
    start_time = time.time()
    boxes = extract_windowed_subimages_from_image(processed_image, window_sizes,
                                                  step_sizes, padding=padding)
    print(time.time() - start_time)

    # fast path: raw pixel values need no feature extraction, so classify
    # every window directly from its flattened pixels
    if not is_svc and feature_method == "pixelval":
        print("Using fast method")
        keep = [
            classifier.predict_proba([x.ravel()])[0][1] > 0.5
            for x in boxes.ImageMat
        ]
        boxes = boxes[keep]
        return boxes, boxes

    # extract features according to feature_method and feature_options
    print("Extracting features", end=" ")
    start_time = time.time()
    features = feature_extractions.feature_extraction(boxes, 'ImageMat',
                                                      feature_method,
                                                      feature_options)
    print(time.time() - start_time)

    # run prediction
    print("Running prediction", end=" ")
    start_time = time.time()
    if is_svc:
        boxes["classification"] = classifier.predict(features)
        # explicit comparison kept: the column is not guaranteed to be a
        # boolean dtype usable directly as a mask
        positive_boxes = boxes[boxes.classification == True]  # noqa: E712
    else:
        boxes["prediction"] = classify_boxes(features, classifier)
        # bind positive_boxes here as well; previously only `boxes` was
        # rebound, which left positive_boxes undefined for non-SVC models
        positive_boxes = boxes[boxes.prediction > 0.5]
    print(time.time() - start_time)

    # drop the large intermediates before suppression
    start_time = time.time()
    del features
    del boxes
    print("Removing variables time" + str(time.time() - start_time))

    print("eliminating overlap", end=" ")
    start_time = time.time()
    if nms_threshold < 1:
        reduced_positive_boxes = NSM.remove_boxes_with_NSM(positive_boxes,
                                                           nms_threshold)
    else:
        reduced_positive_boxes = positive_boxes
    print(time.time() - start_time)

    if plot:
        display_boxes(image_array, reduced_positive_boxes, positive_boxes)

    return positive_boxes, reduced_positive_boxes
def tune(main, model, tuned_parameters, feature_method="pixelval", fraction=1, diversity_vars=None,
         iterations=50, bayes=False, verbose=0, n_jobs=1, random_state=1):
    """
    Tune hyperparameters of ``model`` with a grid search and store the best
    values as attributes on ``main``.

    :param main: Classifier object; supplies the data, the column names and
        the feature / image processing options, and receives the best
        parameters via ``setattr``
    :param model: Classifier (estimator handed to the grid search)
    :param tuned_parameters: Grid with hyperparameters
    :param feature_method: accepted but not used here; features are
        extracted with ``main.feature_method``
    :param fraction: Fraction of data used to tune hyperparameters
    :param diversity_vars: Variable names of which diversity should be maintained
    :param iterations: accepted but not used here (the Bayesian search is
        disabled)
    :param bayes: accepted but not used here (a grid search is always run)
    :param verbose: How much to output (e.g. 0, 10, 50)
    :param n_jobs: Number of jobs to run in parallel (e.g. 1, 2, 4, 8)
    :param random_state: seed used for slicing and for the stratified split;
        the inner K-fold always uses seed 1
    :return: Classifier object and search object
    """
    subset = split_data.slice_smaller_subset_of_data(
        main.data,
        fraction=fraction,
        diversity_vars=diversity_vars,
        random_state=random_state,
    )

    # optional image preprocessing
    if not main.image_processing_options:
        processed_data = subset
    else:
        print('Preprocessing images')
        image_df = kaggle_reader.load_all_raw_images(main.raw_image_path)
        processed_data = image_processing.process_roi_extractions(
            image_df, subset, main.image_processing_options)

    # extract features from the (possibly preprocessed) data frame
    print('Extracting features using: ' + main.feature_method)
    extracted = feature_extractions.feature_extraction(
        processed_data,
        input_parameters=main.image_col_name,
        method=main.feature_method,
        extraction_options=main.feature_options,
    )
    features = pandas.DataFrame(extracted)

    # build the label column: start with all True, then clear the '0' rows
    target = pandas.DataFrame(numpy.repeat([True], len(features)))
    is_negative = subset[main.category_col_name].values == '0'
    target[is_negative] = False

    # split the sliced data into a training part and a held-out part
    print('Preparing training and cross-validation sets')
    selected = split_data.split_data_stratified(
        data=subset,
        fraction=0.8,
        diversity_vars=diversity_vars,
        random_state=random_state,
    )
    in_train = features.index.isin(features.index[selected])
    held_out = ~in_train

    train_features = features[in_train]
    train_target = target[in_train].values.ravel()
    test_features = features[held_out]
    test_target = target[held_out].values.ravel()

    inner_cv = sklearn.model_selection.KFold(n_splits=5, shuffle=True, random_state=1)

    print('Begin training and tuning')
    opt = sklearn.model_selection.GridSearchCV(
        model,
        tuned_parameters,
        cv=inner_cv,
        scoring="neg_log_loss",
        verbose=verbose,
        n_jobs=n_jobs,
    )
    opt.fit(train_features, train_target)

    best = opt.best_params_
    print(best)
    print("Train score: %s" % opt.best_score_)
    print("Test score: %s" % opt.score(test_features, test_target))

    # copy the winning hyperparameters onto the classifier object
    for name, value in opt.best_params_.items():
        setattr(main, name, value)

    return main, opt