def test_classifier_wrapper(self):
    """Run ``run.classifier_wrapper`` in folder mode and check its reports.

    The test builds an output folder with ``preprocessing.save_to_folder``,
    holds out ``num_files`` files per class, runs the 'cnn' classifier and
    then loads every ``.yaml`` file found in the ``report/`` sub-folder,
    asserting that each one parses to a dictionary.

    The ``test_classifier/`` result folder is removed in a ``finally``
    clause so that a failing call or assertion does not leave artefacts
    behind for later test runs.
    """
    num_files = 3
    run_name = 'test_1'
    config_path = './hardy/test/'
    output_path = preprocessing.save_to_folder(path, 'test_classifier',
                                               run_name)
    try:
        test_set_filenames = preprocessing.hold_out_test_set(
            data_path, number_of_files_per_class=num_files)
        run.classifier_wrapper(path, test_set_filenames, run_name,
                               config_path,
                               classifier='cnn',
                               iterator_mode='folder', split=split,
                               classes=classes,
                               image_path=path,
                               project_name='test_classifier')
        report_dir = output_path + 'report/'
        for item in os.listdir(report_dir):
            if not item.endswith('.yaml'):
                continue
            with open(os.path.join(report_dir, item), 'r') as file:
                report = yaml.load(file, Loader=yaml.FullLoader)
            assert isinstance(report, dict), \
                'The filetype returned is not a dictionary'
    finally:
        # remove report files whether or not the checks above succeeded
        shutil.rmtree(path + 'test_classifier/')
    print('the result folder was correctly deleted after testing')
def test_model_analysis(self):
    """Check that ``reporting.model_analysis`` returns a DataFrame.

    The data found under ``data_path`` is converted with
    ``catalogue.rgb_list`` and pickled as ``test_1.pkl``, reloaded, split
    into a held-out test set (one file per class) and a learning set, and
    used to build a cnn model whose analysis result is then type-checked.
    """
    files_per_class = 1
    class_names = ('noise', 'one')

    # Build and pickle the image tuples, then read them back.
    raw_tuples = catalogue._data_tuples_from_fnames(input_path=data_path)
    pickle_location = data_path + 'test_1.pkl'
    catalogue.rgb_list(raw_tuples, storage_location=pickle_location)
    loaded_tuples = handling.pickled_data_loader(data_path, 'test_1')

    # Separate the held-out files from the learning files.
    held_out_names = preprocessing.hold_out_test_set(
        data_path, number_of_files_per_class=files_per_class)
    held_out, learning = catalogue.data_set_split(
        loaded_tuples, held_out_names)

    # Each call receives its own fresh list of class names.
    train, val = catalogue.learning_set(
        image_list=learning,
        split=split,
        classes=list(class_names),
        iterator_mode='arrays')
    testing_set = catalogue.test_set(
        image_list=held_out,
        classes=list(class_names),
        iterator_mode='arrays')

    model, _history = cnn.build_model(train, val,
                                      config_path='./hardy/test/')
    analysis = reporting.model_analysis(model, testing_set, held_out)
    assert isinstance(analysis, pd.DataFrame)
def test_hold_out_test_set(self):
    """Check type, length and uniqueness of the held-out file names.

    ``preprocessing.hold_out_test_set`` is called on ``data_path`` with two
    files per class; the result is expected to be a list holding
    ``2 * num_files`` distinct entries.
    """
    num_files = 2
    expected_total = 2 * num_files

    # From .csv files
    selected = preprocessing.hold_out_test_set(
        data_path, number_of_files_per_class=num_files)

    assert isinstance(selected, list), 'format should be a list'
    assert len(selected) == expected_total, \
        'the test set is not the correct length'
    distinct = np.unique(selected)
    assert len(distinct) == expected_total, \
        'the files selected to compose the test set should be unique'
def test_set_folder(self):
    """Check that ``preprocessing.test_set_folder`` returns an existing path.

    Five files per class are held out from ``data_path`` and handed to
    ``test_set_folder``; the returned value must be a string naming a
    location that exists on disk.
    """
    files_per_class = 5

    # From .csv files
    held_out_names = preprocessing.hold_out_test_set(
        data_path, number_of_files_per_class=files_per_class)
    folder = preprocessing.test_set_folder(data_path, held_out_names)

    assert isinstance(folder, str), \
        'the return should be the path to the test folder'
    folder_exists = os.path.exists(folder)
    assert folder_exists, \
        'the test folder was not correctly created'
def test_data_split(self):
    """Check the outputs of ``catalogue.data_set_split``.

    The data under ``data_path`` is converted with ``catalogue.rgb_list``,
    pickled as ``test_1.pkl`` and reloaded. Three files per class are held
    out, and the split must return two lists while the held-out name list
    has ``2 * num_files`` entries.
    """
    num_files = 3
    pickle_name = 'test_1'

    # Pickle the image tuples and load them back.
    raw_tuples = catalogue._data_tuples_from_fnames(input_path=data_path)
    catalogue.rgb_list(raw_tuples,
                       storage_location=data_path + pickle_name + '.pkl')
    loaded_tuples = handling.pickled_data_loader(data_path, pickle_name)

    held_out_names = preprocessing.hold_out_test_set(
        data_path, number_of_files_per_class=num_files)
    held_out, learning = catalogue.data_set_split(
        loaded_tuples, held_out_names)

    list_message = 'format should be a list'
    assert isinstance(held_out_names, list), list_message
    assert len(held_out_names) == 2 * num_files, \
        'the test set is not the correct length'
    assert isinstance(held_out, list), list_message
    assert isinstance(learning, list), list_message
def checkpoint_datacreation(
        # Data and Config Paths
        raw_datapath, tform_config_path, classifier_config_path,
        # Optional for Data
        iterator_mode='arrays', plot_format="RGBrgb",
        print_out=True, skiprows=0,
        # Optional for Classifier
        num_test_files_class=300, scale=1.0, classifier='tuner', split=0.1,
        target_size=(80, 80), batch_size=32,
        classes=['class_1', 'class_2'],
        project_name='tuner_run', k_fold=False, k=None, color_mode='rgb',
        seed=None):
    '''
    Run only the data-creation part of the hardy pipeline.

    The function is called by ``checkrun`` so that the time and memory
    needed by the preprocessing step can be measured. It selects the
    hold-out test set and then maps ``data_wrapper`` over every
    transformation using a process pool with one worker per CPU.

    Several parameters (``classifier_config_path``, ``classifier``,
    ``split``, ``target_size``, ``batch_size``, ``k_fold``, ``k`` and
    ``color_mode``) are accepted but not used by this function.

    Parameters
    ----------
    raw_datapath : str
        path to the raw .csv files containing the data to classify
    tform_config_path : str or None
        path to the yaml file containing the transformations to use and
        which data in the .csv file to perform them on. When None, a
        single "no_transform" entry is used instead.
    classifier_config_path : str
        path to the yaml file representing the classifier
        hyperparameters (unused here)
    iterator_mode : str
        option to use images from arrays directly or save the .png and
        use a directory iterator mode
    plot_format : str
        option for standard or RGB color gradient
    print_out : bool
        option for printing out feedback on computational time taken to
        initialize the data and generate the images
    skiprows : int
        forwarded to ``data_wrapper``
    num_test_files_class : int or float
        number of files per class to select for the test set
    scale : float
        percentage of the image to reduce its size to.
    classifier : str
        option cnn or tuner (unused here)
    split : float
        the percentage of the learning set to use for the validation
        step (unused here)
    target_size : tuple
        image target size, as a tuple with the number of pixels of the
        two dimensions of the image (w x h) (unused here)
    batch_size : int
        the number of files to group up into a batch (unused here)
    classes : list
        a list containing strings of the classes the data is divided in.
        The class name represents the folder name the files are
        contained in.
    project_name : str
        name of the folder to be created for storing the results of the
        tuning
    k_fold : bool
        whether k-fold is to be performed (unused here)
    k : int
        how many k folds need to be performed (unused here)
    color_mode : str
        unused here
    seed : int
        used in hold_out_test_set to isolate the testing data randomly.
        Can be assigned a value to repeat the selection.

    Returns
    -------
    test_set_filenames : list
        the test set selection returned by
        ``preprocessing.hold_out_test_set``
    image_data : list or None
        one ``data_wrapper`` result per transformation when
        ``iterator_mode`` is 'arrays', otherwise None
    image_path : list or None
        one ``data_wrapper`` result per transformation when
        ``iterator_mode`` is not 'arrays', otherwise None
    '''
    # ================================================
    # Section 1: Setup and Import Transforms
    # ================================================
    if tform_config_path is None:
        # ALLOWED so we can test functions without Transforms
        # If so, create a list of one Tform_config, which will be "None"
        tform_command_list = ["no_transform"]
        tform_command_dict = {"no_transform": None}
    else:
        # Import the Tform Config List (and the dictionary for it)
        tform_command_list, tform_command_dict = \
            arbitrage.import_tform_config(tform_config_path)

    test_set_filenames = preprocessing.hold_out_test_set(
        raw_datapath, number_of_files_per_class=num_test_files_class,
        classes=classes, seed=seed)

    # ============================================
    # Section 2: Data Wrapper (Setup + Run)
    # ============================================
    # The whole transform dictionary is handed over; data_wrapper receives
    # the transform name as its mapped argument.
    partial_data_wrapper = partial(data_wrapper,
                                   raw_datapath=raw_datapath,
                                   plot_format=plot_format,
                                   iterator_mode=iterator_mode,
                                   print_out=print_out,
                                   scale=scale,
                                   project_name=project_name,
                                   classes=classes,
                                   skiprows=skiprows,
                                   tform_command_dict=tform_command_dict)

    pool = mp.Pool(processes=mp.cpu_count())
    try:
        wrapper_results = pool.map(partial_data_wrapper, tform_command_list)
    finally:
        # Always release the worker processes, even when a task raised.
        pool.close()
        pool.join()

    if iterator_mode == 'arrays':
        image_data, image_path = wrapper_results, None
    else:
        image_data, image_path = None, wrapper_results
    return test_set_filenames, image_data, image_path
def checkrun(
        raw_datapath, tform_config_path, classifier_config_path,
        # Optional for Data
        iterator_mode='arrays', plot_format="RGBrgb",
        print_out=True, skiprows=0,
        # Optional for Classifier
        scale=1.0, classifier='tuner', split=0.1,
        target_size=(80, 80), batch_size=32,
        classes=['class_1', 'class_2'],
        project_name='tuner_run', k_fold=False, k=None, color_mode='rgb',
        seed=None):
    '''
    Estimate the time and memory needed by the full preprocessing step.

    About 1% of the .csv files in ``raw_datapath`` (at least one file per
    class) is copied to a temporary ``temp/`` sub-folder,
    ``checkpoint_datacreation`` is run on that folder while time and
    traced memory are measured, and both measurements are multiplied by
    100 and printed. The temporary folder is removed afterwards, also
    when one of the steps fails.

    Parameters
    ----------
    raw_datapath : str
        path to the raw .csv files containing the data to classify
    tform_config_path : str
        path to the yaml file containing the transformations to use and
        which data in the .csv file to perform them on
    classifier_config_path : str
        path to the yaml file representing the classifier
        hyperparameters
    iterator_mode : str
        option to use images from arrays directly or save the .png and
        use a directory iterator mode
    plot_format : str
        option for standard or RGB color gradient
    print_out : bool
        option for printing out feedback on computational time taken to
        initialize the data and generate the images
    skiprows : int
        forwarded to ``checkpoint_datacreation``
    scale : float
        percentage of the image to reduce its size to.
    classifier : str
        option cnn or tuner
    split : float
        the percentage of the learning set to use for the validation step
    target_size : tuple
        image target size, as a tuple with the number of pixels of the
        two dimensions of the image (w x h)
    batch_size : int
        the number of files to group up into a batch
    classes : list
        a list containing strings of the classes the data is divided in.
        The class name represents the folder name the files are
        contained in.
    project_name : str
        name of the folder to be created for storing the results of the
        tuning
    k_fold : bool
        whether k-fold is to be performed. Not valid for tuner
    k : int
        how many k folds need to be performed.
    color_mode : str
        forwarded to ``checkpoint_datacreation``
    seed : int
        forwarded to ``checkpoint_datacreation``, where it is used in
        hold_out_test_set. Can be assigned a value to repeat the
        selection.

    Raises
    ------
    FileExistsError
        if ``raw_datapath`` already contains a ``temp`` folder
    '''
    # listing down the filenames
    file_names = [
        item for item in os.listdir(raw_datapath)
        if item.endswith('.csv')
    ]

    # 1% of the data, but never fewer than one file so that tests and very
    # small datasets still work
    range_filename = max(int(len(file_names) * 0.01), 1)

    # files per class handed to hold_out_test_set to pick the sample that
    # is copied to the temporary folder (again at least one)
    file_per_class = max(round(range_filename / len(classes)), 1)

    # size of the hold-out test set used inside checkpoint_datacreation
    # NOTE(review): this rounds to 0 when file_per_class is 1 - confirm
    # that hold_out_test_set accepts 0 files per class.
    num_test_files_class = round(0.25 * file_per_class)

    # temporary data path passed to checkpoint_datacreation
    new_data_path = os.path.join(raw_datapath, 'temp/')

    # start measuring time and memory here
    time_1 = perf_counter()
    tracemalloc.start()
    try:
        # getting filenames to collect files for checkrun
        file_names_for_test = preprocessing.hold_out_test_set(
            raw_datapath, number_of_files_per_class=file_per_class,
            classes=classes)

        # making directory to store the temporary data
        os.mkdir(new_data_path)
        try:
            # copying the sampled data (names come without extension) to
            # the temporary folder
            for item in file_names_for_test:
                shutil.copy(os.path.join(raw_datapath, item + '.csv'),
                            new_data_path)

            checkpoint_datacreation(
                new_data_path, tform_config_path, classifier_config_path,
                # Optional for Data
                iterator_mode=iterator_mode, plot_format=plot_format,
                print_out=print_out, skiprows=skiprows,
                # Optional for Classifier
                num_test_files_class=num_test_files_class,
                scale=scale, classifier=classifier, split=split,
                target_size=target_size, batch_size=batch_size,
                classes=classes, project_name=project_name, k_fold=k_fold,
                k=k, color_mode=color_mode, seed=seed)

            # getting feedback from memory tracer; only the peak is used
            _, peak = tracemalloc.get_traced_memory()
            time_elapsed = perf_counter() - time_1
        finally:
            # removing the temporary data folder, also on failure
            shutil.rmtree(new_data_path)
    finally:
        # stopping memory tracer
        tracemalloc.stop()

    # The sample is ~1% of the data, hence the factor 100.
    print("The total time required for data creation will be approx. "
          "{} hours".format(round((time_elapsed * 100) / 3600, 3)))
    print("The total memory required for the process will be approx. "
          "{} Gigabytes".format(round(peak * 100 / (10**9), 3)))
    print("Temporary files created by process are successfully deleted")
def hardy_main(
        # Data and Config Paths
        raw_datapath, tform_config_path, classifier_config_path,
        # Optional for Data
        iterator_mode='arrays', plot_format="RGBrgb",
        print_out=True, skiprows=0,
        # Optional for Classifier
        num_test_files_class=300, scale=1.0, classifier='tuner', split=0.1,
        target_size=(80, 80), batch_size=32,
        classes=['class_1', 'class_2'],
        project_name='tuner_run', k_fold=False, k=None, color_mode='rgb',
        seed=None, n_threads=1):
    """
    OVERALL wrapper function, to pass initial configurations and allow all
        other internal functions to understand and call upon each other.

    Parameters
    ----------
    raw_datapath : str
        path to the raw .csv files containing the data to classify
    tform_config_path : str or None
        path to the yaml file containing the transformations to use and
        which data in the .csv file to perform them on. When None, a
        single "no_transform" entry is used instead.
    classifier_config_path : str
        path to the yaml file representing the classifier
        hyperparameters
    iterator_mode : str
        option to use images from arrays directly or save the .png and
        use a directory iterator mode
    plot_format : str
        option for standard or RGB color gradient
    print_out : bool
        option for printing out feedback on computational time taken to
        initialize the data and generate the images
    skiprows : int
        forwarded to ``data_wrapper``
    num_test_files_class : int or float
        number of files per class to select for the test set
    scale : float
        percentage of the image to reduce its size to.
    classifier : str
        option cnn or tuner
    split : float
        the percentage of the learning set to use for the validation step
    target_size : tuple
        image target size, as a tuple with the number of pixels of the
        two dimensions of the image (w x h)
    batch_size : int
        the number of files to group up into a batch
    classes : list
        a list containing strings of the classes the data is divided in.
        The class name represents the folder name the files are
        contained in.
    project_name : str
        name of the folder to be created for storing the results of the
        tuning
    k_fold : bool
        whether k-fold is to be performed. Not valid for tuner
    k : int
        how many k folds need to be performed.
    color_mode : str
        forwarded to ``classifier_wrapper``
    seed : int
        used in hold_out_test_set to isolate the testing data randomly for
        use in training of neural network. Can be assigned a value to
        repeat the selection.
    n_threads : int
        number of worker processes used for parallel processing during
        the data transformation stage.

    Function Calls (see their related documentation)
    --------------
    import_tform_config : f(n) of ARBITRAGE.py
        Import the list and dictionary of transforms to be looped through.
        (full model and report for each)
    data_wrapper : Local Wrapping f(n)
        Takes file path and the transformation command for the current
        loop, and creates the list-of-tuple images
        (OR Saves Image Files to be used later)
    classifier_wrapper : Local Wrapping f(n)
        Takes many inputs including configuration loading directions.
        Loads images, and makes them Keras-Readable.
        Then sets up the model and the tuner, and runs the model
        test/train/tune loops as commanded.

    Returns
    -------
    None
        Results are written to disk: a folder containing subfolders, one
        for each trial run in the hardy module. These folders will
        contain a report on the run, as well as the best classifier
        model.
    """
    # ================================================
    # Section 1: Setup and Import Transforms
    # ================================================
    if tform_config_path is None:
        # ALLOWED so we can test functions without Transforms
        # If so, create a list of one Tform_config, which will be "None"
        tform_command_list = ["no_transform"]
        tform_command_dict = {"no_transform": None}
    else:
        # Import the Tform Config List (and the dictionary for it)
        tform_command_list, tform_command_dict = \
            arbitrage.import_tform_config(tform_config_path)

    test_set_filenames = preprocessing.hold_out_test_set(
        raw_datapath, number_of_files_per_class=num_test_files_class,
        classes=classes, seed=seed)

    # ============================================
    # Section 2: Data Wrapper (Setup + Run)
    # ============================================
    # The whole transform dictionary is handed over; data_wrapper receives
    # the transform name as its mapped argument.
    partial_data_wrapper = partial(data_wrapper,
                                   raw_datapath=raw_datapath,
                                   plot_format=plot_format,
                                   iterator_mode=iterator_mode,
                                   print_out=print_out,
                                   scale=scale,
                                   project_name=project_name,
                                   classes=classes,
                                   skiprows=skiprows,
                                   tform_command_dict=tform_command_dict)

    pool = mp.Pool(processes=n_threads)
    try:
        wrapper_results = pool.map(partial_data_wrapper, tform_command_list)
    finally:
        # Always release the worker processes, even when a task raised.
        pool.close()
        pool.join()

    # In 'arrays' mode the mapped results are not passed on; only the
    # other modes forward them as image paths.
    image_path = None if iterator_mode == 'arrays' else wrapper_results

    # ============================================
    # Section 3: Classifier Wrapper (Setup + Run)
    # ============================================
    # NOTE(review): outside 'arrays' mode the complete list of results is
    # passed to every call rather than the entry for `tform_name` -
    # confirm this is what classifier_wrapper expects.
    for tform_name in tform_command_list:
        classifier_wrapper(raw_datapath, test_set_filenames,
                           tform_name,
                           classifier_config_path,
                           classifier=classifier,
                           iterator_mode=iterator_mode,
                           split=split,
                           color_mode=color_mode,
                           target_size=target_size,
                           batch_size=batch_size,
                           image_path=image_path,
                           classes=classes,
                           project_name=project_name,
                           k_fold=k_fold, k=k)
    # No value is returned - the classifier wrapper writes the report file
    return None