def split_and_standardize_dataset(self):
    assert self.dataset is not None
    [known_dataset, known_targets, unk] = split_dataset(self.dataset, self.targets)
    # standardize dataset - Gaussian with zero mean and unit variance
    known_dataset_scaled = preprocessing.scale(known_dataset)
    known_targets = np.asarray(known_targets)
    return known_dataset_scaled, known_targets
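# Minimal usage sketch (assumption: this method belongs to StandardizedData,
# constructed as in cv() below with the targets first and the combined
# dataset second; variable names are illustrative).
std = StandardizedData(known_targets, combined_dataset)
known_dataset_scaled, known_targets = std.split_and_standardize_dataset()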
def thematic_data_from_feature_selection_from_file(orig_targets, theme, percentage, file_name):
    [dataset, features] = parse_theme_from_file(theme, file_name)
    [known_dataset, known_targets, unk] = split_dataset(dataset, orig_targets)
    known_targets = np.asarray(known_targets)
    selected_features = select_features(percentage, theme)
    sf = SelectedFeatures(known_dataset, known_targets, selected_features, features)
    return sf.extract_data_from_selected_features(), known_targets
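# Illustrative call, not taken from the project scripts: the 'all' theme and
# addendum_data_file appear in the scripts below, but the 0.9 percentage is an
# assumed example value and `targets` is assumed to be loaded as in the
# __main__ block further down.
thematic_data, thematic_targets = thematic_data_from_feature_selection_from_file(
    targets, 'all', 0.9, addendum_data_file)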
def report_status_selection(selection):
    # `targets` is the module-level target vector loaded at script start.
    [dataset, features] = parse_theme(selection)
    [known_dataset, known_targets, unk] = split_dataset(dataset, targets)
    feats = feature_context(known_dataset, known_targets, features)
    print selection
    print feats
    print 'Nr selected features %d' % len(feats)
    print 'Nr total features %d' % len(features)
    print 'Features eliminated %s' % set(features).difference(feats)
    return feats
def thematic_data_from_feature_selection(orig_targets, theme, target):
    [dataset, features] = parse_theme(theme)
    [known_dataset, known_targets, unk] = split_dataset(dataset, orig_targets)
    nr_times = int(math.floor(TOP_FEATURES_PERCENTAGE_THRESHOLD * len(features)))
    known_targets = np.asarray(known_targets)
    ssa_features = select_proxy_features(theme, target, nr_times)
    sf = SelectedFeatures(known_dataset, known_targets, ssa_features, features)
    print '####### %s FEATURES ####### %d %s' % (theme, len(ssa_features), str(ssa_features))
    return sf.extract_data_from_selected_features(), known_targets
def cv(theme, percentage, current_svm):
    [dataset, features] = parse_theme(theme)
    [known_dataset, known_targets, unk] = split_dataset(dataset, targets)
    known_targets = np.asarray(known_targets)
    # cv_features = features_cross_validation(known_dataset, known_targets, features, current_svm)
    # selected_features = select_final_features_from_cv(cv_features, percentage)
    selected_features = select_features(percentage, theme)
    sf = SelectedFeatures(known_dataset, known_targets, selected_features, features)
    combined_dataset = sf.extract_data_from_selected_features()
    std = StandardizedData(known_targets, combined_dataset)
    known_dataset_scaled, known_targets = std.split_and_standardize_dataset()
    print '####### FEATURES ####### %d \n %s' % (len(selected_features), str(selected_features))
    return cross_validation(np.array(known_dataset_scaled), known_targets, ids, current_svm)
def thematic_data_from_feature_selection(orig_targets, theme, percentage):
    [dataset, features] = parse_theme(theme)
    [known_dataset, known_targets, unk] = split_dataset(dataset, orig_targets)
    known_targets = np.asarray(known_targets)
    # these come from feature_selection_cv;
    # commented out because they were saved to decrease computation time
    # cv_features = features_cross_validation(known_dataset, known_targets, features)
    # selected_features = select_final_features_from_cv(cv_features, percentage)
    selected_features = select_features(percentage, theme)
    sf = SelectedFeatures(known_dataset, known_targets, selected_features, features)
    print '####### %s FEATURES ####### %d %s' % (theme, len(selected_features), str(selected_features))
    return sf.extract_data_from_selected_features(), known_targets
def new_data_single_feature_selection(training_data, training_targets, testing_data, testing_targets, tech):
    [training_data, training_targets, unk] = split_dataset(training_data, training_targets)
    selected_features = single_features_90
    sf = SelectedFeatures(training_data, training_targets, selected_features, features)
    training_data = sf.extract_data_from_selected_features()
    sf = SelectedFeatures(testing_data, testing_targets, selected_features, features)
    testing_data = sf.extract_data_from_selected_features()
    # standardize dataset - Gaussian with zero mean and unit variance
    scaler = StandardScaler()
    testing_data = replace_missings(testing_data)
    if tech == 'lr':
        error_rate, f1, model, (hp, hr, hf), (cp, cr, cf) = lr_one_fold_measures_feature_selection(training_data, testing_data, training_targets, testing_targets)
    elif tech == 'dt':
        error_rate, f1, model, (hp, hr, hf), (cp, cr, cf) = dt_one_fold_measures(training_data, testing_data, training_targets, testing_targets)
    elif tech == 'knn':
        training_data = scaler.fit_transform(training_data)
        testing_data = scaler.transform(testing_data)
        error_rate, f1, model, (hp, hr, hf), (cp, cr, cf) = knn_one_fold_measures(training_data, testing_data, training_targets, testing_targets)
    elif tech == 'svm':
        training_data = scaler.fit_transform(training_data)
        testing_data = scaler.transform(testing_data)
        error_rate, f1, model, (hp, hr, hf), (cp, cr, cf) = single_svm_fs_one_fold_measures(training_data, testing_data, training_targets, testing_targets)
    else:
        # fail fast instead of falling through to undefined result variables
        raise ValueError('ERROR technique: %s' % tech)
    print 'Final error %f' % error_rate
    print 'Final accuracy %f' % (1 - error_rate)
    print 'Highval precision %f' % hp
    print 'Highval recall %f' % hr
    print 'Highval f1 %f' % hf
    print 'Civil precision %f' % cp
    print 'Civil recall %f' % cr
    print 'Civil f1 %f' % cf
    return error_rate, f1, model, (hp, hr, hf), (cp, cr, cf)
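# Sketch of how this routine might be driven from a train/test script
# (assumption: data and targets are loaded as in the ensemble __main__ block
# below; 'svm' is one of the technique strings handled above).
[training_data, features] = parse_theme('all')
[testing_data, feats] = parse_theme_from_file('all', addendum_data_file)
error_rate, f1, model, highval_scores, civil_scores = new_data_single_feature_selection(
    training_data, training_targets, testing_data, testing_targets, 'svm')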
import sys
sys.path.insert(0, 'utils/')

from load_data import *
from project_data import *
from parse_theme import *
from split_dataset import *

import numpy as np

if __name__ == "__main__":
    spreadsheet = Spreadsheet(project_data_file)
    data = Data(spreadsheet)
    targets = data.targets

    [dataset, features] = parse_theme('all')
    [known_dataset, known_targets, unk] = split_dataset(dataset, targets)

    print 'NEG %d' % len([x for x in known_targets if x == 0])
    print 'POS %d' % len([x for x in known_targets if x == 1])
    print 'HIGHVAL %d' % len([x for x in known_targets if x == 1])
    print 'CIVIL %d' % len([x for x in known_targets if x == 2])
def get_known_data_from_theme(self, theme):
    [theme_dataset, theme_features] = parse_theme(theme)
    [known_dataset, known_targets, unk] = split_dataset(theme_dataset, self.targets)
    known_targets = np.asarray(known_targets)
    return [known_dataset, known_targets]
if __name__ == "__main__":
    training_spreadsheet = Spreadsheet(project_data_file)
    training_data = Data(training_spreadsheet)
    training_targets = training_data.targets

    testing_spreadsheet = Spreadsheet(addendum_data_file, upsampling=False)
    testing_data = Data(testing_spreadsheet, upsampling=False)
    testing_targets = testing_data.targets

    [training_data, features] = parse_theme('all')
    [testing_data, feats] = parse_theme_from_file('all', addendum_data_file)
    assert features == feats

    [training_data, training_targets, unk] = split_dataset(training_data, training_targets)

    # standardize dataset - Gaussian with zero mean and unit variance
    scaler = StandardScaler()
    testing_data = replace_missings(testing_data)
    training_data_scaled = scaler.fit_transform(training_data)
    testing_data_scaled = scaler.transform(testing_data)

    file_name = "ensemble_single.txt"
    for i in range(100):
        error_rate, f1, (hp, hr, hf), (cp, cr, cf) = ensemble_single(training_data, testing_data, training_data_scaled, testing_data_scaled, training_targets, testing_targets, dt, knn, svm_all_vars)
        save_output(file_name, error_rate, hp, hr, hf, cp, cr, cf, 1)
def main():
    '''
    This function acts as a testbench for the function clean_aflw, using it
    to perform the basic processing of the AFLW dataset from a set of
    default values defined below.
    '''
    # Source paths.
    aflw_dir = '../original/aflw/'
    aflw_mat = '../original/aflw/dataset_landmarks_and_pose_withfaceids.mat'

    # Destination path.
    destination_dir = 'clean/aflw_haar_area/'

    # Detector model paths.
    frontal_detector_path = 'models/haarcascade_frontalface_alt.xml'
    profile_detector_path = 'models/haarcascade_profileface.xml'

    # Detection parameters.
    out_size = 64

    # Output parameters.
    grayscale_output = True
    downscaling_interpolation = cv2.INTER_AREA

    # Number of splits for class assignment.
    num_splits_tilt = 8
    num_splits_pan = 8

    # Ratios for train/test and train/validation split.
    test_ratio = 0.2
    validation_ratio = 0.2

    # Detector model.
    detector = HaarFaceDetector(frontal_detector_path, profile_detector_path)

    # Check if the output directory exists; recreate it if it does.
    try:
        os.mkdir(destination_dir)
        print("Directory", destination_dir, "created.")
    except FileExistsError:
        print("Directory", destination_dir, "already exists.")
        shutil.rmtree(destination_dir)
        os.mkdir(destination_dir)

    # Actual cleaning.
    clean_aflw(aflw_dir, aflw_mat, destination_dir, detector, out_size,
               grayscale_output, downscaling_interpolation)

    # Assign classes.
    class_assign(destination_dir, num_splits_tilt, num_splits_pan)

    # Split dataset.
    split_dataset(destination_dir, test_ratio, validation_ratio)

    # Get normalization parameters.
    find_norm_parameters(destination_dir)

    # OPTIONAL: Save dataset as numpy arrays (for uploading to Google Colab).
    store_dataset_arrays(destination_dir)