def select_train_val_instances(nth_fold, method, flags):
    """
    select_train_val_instances rewrites the train and validation log files
    of one fold after passing them through the chosen selection method

    The fold folder is '<experiment_folder>/cv<nth_fold>' or
    '<experiment_folder>/perm<nth_fold>'; exactly one of them must exist.
    Both 'train/train_log.csv' and 'val/val_log.csv' must already be present
    in that folder. They are read, filtered and written back in place.

    param: nth_fold  fold number appended to 'cv' / 'perm'
    param: method    selection method, only 'by_numbers' is supported
    param: flags     dictionary providing 'experiment_folder'
    return: void
    raise: ValueError if the fold folder is missing or ambiguous, if a log
           file is missing, or if the method is unknown
    """
    experiment_folder = flags['experiment_folder']
    entries = os.listdir(os.path.join(experiment_folder))

    cv_name = 'cv' + str(nth_fold)
    perm_name = 'perm' + str(nth_fold)
    has_cv = cv_name in entries
    has_perm = perm_name in entries

    # exactly one of the two fold layouts may be present
    if has_cv and has_perm:
        raise ValueError('Dangerous! You have both cv and perm on the path.')
    if not (has_cv or has_perm):
        raise ValueError('No cv or perm folder!')
    object_folder = os.path.join(experiment_folder,
                                 cv_name if has_cv else perm_name)

    train_log_file_path = os.path.join(object_folder, 'train', 'train_log.csv')
    val_log_file_path = os.path.join(object_folder, 'val', 'val_log.csv')
    for log_path in (train_log_file_path, val_log_file_path):
        if not os.path.isfile(log_path):
            raise ValueError('no ' + log_path)

    # read csv
    train_log = KScsv.read_csv(train_log_file_path)
    val_log = KScsv.read_csv(val_log_file_path)

    # the method is only validated after both logs were read
    if method != 'by_numbers':
        raise ValueError('no method ' + method + ' exists!')

    train_log = select_instances.by_numbers(train_log)
    val_log = select_instances.by_numbers(val_log)
    KScsv.write_csv(train_log, train_log_file_path)
    KScsv.write_csv(val_log, val_log_file_path)
def _write_split_lists(split_dict, prefix, folder):
    """
    Write '<prefix>_<key>_list.csv' into folder for every key of split_dict,
    one item per row. Files that already exist are left untouched.
    """
    for key in split_dict.keys():
        filename = os.path.join(folder, prefix + '_' + key + '_list.csv')
        if not os.path.isfile(filename):
            row_list = [[item] for item in split_dict[key]]
            KScsv.write_csv(row_list, filename)


def _expand_to_available_files(split_dict, dict_path, dict_ext, image_paths,
                               available):
    """
    Replace the entries of split_dict by every file set on disk that matches
    one of its images.

    For each image of the split, every path in image_paths that contains the
    image's extension-less basename is taken as a match (visited in sorted
    order). The match's own basename is then used to build one candidate path
    per key from dict_path / dict_ext, and the candidate set is kept only if
    all of its paths are present in available.

    NOTE(review): the match is a plain substring test on the full path, so a
    basename such as 'img1' also matches 'img10' -- presumably intended to
    pick up derived files; confirm against the data layout.

    param: split_dict   key -> array of paths, must contain 'image'
    param: dict_path    key -> directory
    param: dict_ext     key -> file extension
    param: image_paths  list of globbed paths for the 'image' key
    param: available    key -> set of globbed paths
    return: dict key -> np.array of the retained paths (empty if none)
    """
    keys = list(split_dict.keys())
    expanded = collections.defaultdict(list)
    for name in split_dict['image']:
        stem = os.path.splitext(os.path.basename(name))[0]
        for match in sorted(s for s in image_paths if stem in s):
            match_stem = os.path.splitext(os.path.basename(match))[0]
            candidate = {
                key: os.path.join(dict_path[key], match_stem + dict_ext[key])
                for key in keys
            }
            # set lookup instead of scanning the glob result for every file
            if all(candidate[key] in available[key] for key in keys):
                for key in keys:
                    expanded[key].append(candidate[key])
    # defaultdict access yields an empty list for keys without any match
    return {key: np.array(expanded[key]) for key in keys}


def split_perm(obj_list, flags):
    """
    split_perm splits data using permutation with stratification based on
    group label

    For every split i (1..num_split) the folder '<experiment_folder>/perm<i>'
    is created and filled with '<subset>_<key>_list.csv' files, where subset
    is 'test' (only if test_percentage != 0), 'train' and 'val'. The train
    and val lists are expanded to the matching files found under
    flags['dict_path'] (see _expand_to_available_files). Existing list files
    are never overwritten.

    The group label of a sample is the first cell of the csv file listed in
    obj_list['group'].

    NOTE: obj_list is modified in place -- each of its values is converted
    to a numpy array.

    param: obj_list  dict key -> list of file paths; needs 'image' and
                     'group'
    param: flags     dict providing 'num_split', 'test_percentage',
                     'val_percentage', 'experiment_folder', 'dict_path' and
                     'dict_ext'
    return: void
    """
    num = flags['num_split']
    test_percentage = flags['test_percentage']
    val_percentage = flags['val_percentage']

    groups_label = np.array(
        [KScsv.read_csv(file)[0][0] for file in obj_list['group']])

    for key in obj_list.keys():
        obj_list[key] = np.array(obj_list[key])

    # The directory listings do not depend on the split, so glob them once
    # and keep a set per key for the membership checks.
    dict_path = flags['dict_path']
    dict_ext = flags['dict_ext']
    globbed = {
        key: glob.glob(os.path.join(dict_path[key], '*' + dict_ext[key]))
        for key in dict_path.keys()
    }
    available = {key: set(paths) for key, paths in globbed.items()}

    if test_percentage != 0:
        skf = StratifiedShuffleSplit(n_splits=num,
                                     test_size=test_percentage / 100.0)
        outer_splits = skf.split(obj_list['image'], groups_label)
    else:
        # no test set: every split uses the complete data for train + val
        outer_splits = ((None, None) for _ in range(num))

    for i_num, (train_idx, test_idx) in enumerate(outer_splits):
        cv_folder = os.path.join(flags['experiment_folder'],
                                 'perm' + str(i_num + 1))
        create_dir(cv_folder)

        if test_idx is None:
            test_obj_list_dict = None
            train_obj_list_dict = dict(obj_list)
            train_groups_label = groups_label
        else:
            test_obj_list_dict = {
                key: obj_list[key][test_idx] for key in obj_list.keys()
            }
            train_obj_list_dict = {
                key: obj_list[key][train_idx] for key in obj_list.keys()
            }
            train_groups_label = groups_label[train_idx]

        # n_splits=1, so the generator yields exactly one (train, val) pair
        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=val_percentage / 100.0)
        train_train_index, train_val_index = next(
            sss.split(train_obj_list_dict['image'], train_groups_label))
        train_train_obj_list_dict = {
            key: train_obj_list_dict[key][train_train_index]
            for key in train_obj_list_dict.keys()
        }
        train_val_obj_list_dict = {
            key: train_obj_list_dict[key][train_val_index]
            for key in train_obj_list_dict.keys()
        }

        #################################################################
        # test
        if test_obj_list_dict is not None:
            _write_split_lists(test_obj_list_dict, 'test', cv_folder)

        #################################################################
        # train, then validation
        for prefix, subset in (('train', train_train_obj_list_dict),
                               ('val', train_val_obj_list_dict)):
            expanded = _expand_to_available_files(
                subset, dict_path, dict_ext, globbed['image'], available)
            _write_split_lists(expanded, prefix, cv_folder)
def gen_train_val_data(nth_fold, flags):
    """
    gen_train_val_data generates the training and validation patches of one
    fold together with a log file per subset

    The fold folder is '<experiment_folder>/cv<nth_fold>' or
    '<experiment_folder>/perm<nth_fold>'; exactly one of them must exist.
    Inside it 'train' and 'val' folders are created, each with 'image',
    'groundtruth' and 'weight' sub-folders. The '<subset>_<key>_list.csv'
    files of the fold folder give the source images. Every source triple is
    resized by 0.25, cut into patches with extract_patches.sliding_window
    and written out; existing patch files are not overwritten. The written
    (image, groundtruth, weight) patch paths are logged to
    '<subset>/<subset>_log.csv'. A subset whose log file already exists is
    skipped completely.

    Only flags['gen_train_val_method'] == 'sliding_window' is supported; any
    other value prints a message and returns.

    param: nth_fold  fold number appended to 'cv' / 'perm'
    param: flags     dict providing 'experiment_folder',
                     'gen_train_val_method', 'size_input_patch',
                     'size_output_patch', 'stride', 'image_ext',
                     'groundtruth_ext' and 'weight_ext'
    return: void
    raise: ValueError if the fold folder is missing or ambiguous
    """
    ########## check whether 'cv' or 'perm' exists and which one to use ##########
    experiment_folder = flags['experiment_folder']
    entries = os.listdir(os.path.join(experiment_folder))
    cv_name = 'cv' + str(nth_fold)
    perm_name = 'perm' + str(nth_fold)
    has_cv = cv_name in entries
    has_perm = perm_name in entries

    if has_cv and has_perm:
        raise ValueError('Dangerous! You have both cv and perm on the path.')
    if not (has_cv or has_perm):
        raise ValueError('No cv or perm folder!')
    object_folder = os.path.join(experiment_folder,
                                 cv_name if has_cv else perm_name)

    ########## create train and val paths ##########
    path_dict = {
        'train_folder': os.path.join(object_folder, 'train'),
        'val_folder': os.path.join(object_folder, 'val'),
    }
    create_dir(path_dict['train_folder'])
    create_dir(path_dict['val_folder'])

    print("Gets to the beginning of an if statement")

    if flags['gen_train_val_method'] != 'sliding_window':
        print(
            "ONLY SLIDING WINDOW TRAINING IS SUPPORTED!!!! Training terminated."
        )
        return

    ########## extract patches and put in a designated directory ##########
    subsets = ('train', 'val')
    key_list = ['image', 'groundtruth', 'weight']

    for key in key_list:
        for subset in subsets:
            folder_key = subset + '_' + key + '_folder'
            path_dict[folder_key] = os.path.join(
                path_dict[subset + '_folder'], key)
            create_dir(path_dict[folder_key])

    list_dict = dict()
    for key in key_list:
        for subset in subsets:
            list_name = subset + '_' + key + '_list'
            list_dict[list_name] = KScsv.read_csv(
                os.path.join(object_folder, list_name + '.csv'))

    for subset in subsets:
        log_path = os.path.join(path_dict[subset + '_folder'],
                                subset + '_log.csv')
        # an existing log marks the subset as already generated
        if os.path.isfile(log_path):
            continue

        image_rows = list_dict[subset + '_image_list']
        n_images = len(image_rows)
        log_data = list()

        for i_image, image_row in enumerate(image_rows):
            tic = time.time()

            path_image = image_row[0]
            path_groundtruth = list_dict[subset +
                                         '_groundtruth_list'][i_image][0]
            path_weight = list_dict[subset + '_weight_list'][i_image][0]

            # Resize image, groundtruth, and weight from 10x input size to
            # 2.5x (level at which network operates)
            image = KSimage.imresize(KSimage.imread(path_image), 0.25)
            groundtruth = KSimage.imresize(KSimage.imread(path_groundtruth),
                                           0.25)
            weight = KSimage.imresize(KSimage.imread(path_weight), 0.25)

            # make sure that groundtruth images have depth = 1
            if len(groundtruth.shape) > 2 and groundtruth.shape[2] > 1:
                groundtruth = groundtruth[:, :, 1]

            # remove all intra-stromal epithelium labels and set them simply
            # to stroma
            groundtruth[groundtruth == 3] = 2
            # fat label was originally 4 but is now changed to 3
            groundtruth[groundtruth == 4] = 3

            extractor = extract_patches.sliding_window(
                {
                    'image': image,
                    'groundtruth': groundtruth,
                    'weight': weight
                }, flags['size_input_patch'], flags['size_output_patch'],
                flags['stride'])

            basename = os.path.splitext(os.path.basename(path_image))[0]

            for j, (out_obj_dict, coord_dict) in enumerate(extractor):
                images = out_obj_dict['image']
                groundtruths = out_obj_dict['groundtruth']
                weights = out_obj_dict['weight']
                coord_images = coord_dict['image']

                #############################################################
                # all three patch files share one stem, only folder and
                # extension differ
                patch_stem = ''.join([
                    basename, '_idx',
                    str(j), '_row',
                    str(coord_images[0]), '_col',
                    str(coord_images[1])
                ])
                image_name = os.path.join(
                    path_dict[subset + '_image_folder'],
                    patch_stem + flags['image_ext'])
                label_name = os.path.join(
                    path_dict[subset + '_groundtruth_folder'],
                    patch_stem + flags['groundtruth_ext'])
                weight_name = os.path.join(
                    path_dict[subset + '_weight_folder'],
                    patch_stem + flags['weight_ext'])

                for patch, patch_name in ((images, image_name),
                                          (groundtruths, label_name),
                                          (weights, weight_name)):
                    if not os.path.isfile(patch_name):
                        KSimage.imwrite(patch, patch_name)

                log_data.append((image_name, label_name, weight_name))

            print('finish processing %d image from %d images : %.2f' %
                  (i_image + 1, n_images, time.time() - tic))

        KScsv.write_csv(log_data, log_path)
def split_cv(obj_list, flags):
    """
    split_cv splits data into train, validation, and test stratified by the
    group label

    The data are divided with StratifiedKFold into flags['num_split'] folds.
    For fold i (starting at 1) the folder '<experiment_folder>/cv<i>' is
    created; the held-out part becomes the test set and the remainder is
    divided once more by StratifiedShuffleSplit into train and validation.
    Each subset is written as '<subset>_<key>_list.csv', one item per row;
    existing files are not overwritten.

    The group label of a sample is the first cell of the csv file listed in
    obj_list['group'].

    NOTE: obj_list is modified in place -- each of its values is converted
    to a numpy array.

    param: obj_list  dict key -> list of file paths; needs 'image' and
                     'group'
    param: flags     dict providing 'num_split', 'val_percentage' and
                     'experiment_folder'
    return: void
    """
    num = flags['num_split']
    val_percentage = flags['val_percentage']

    groups_label = np.array(
        [KScsv.read_csv(file)[0][0] for file in obj_list['group']])

    for key in obj_list.keys():
        obj_list[key] = np.array(obj_list[key])

    folds = StratifiedKFold(n_splits=num).split(obj_list['image'],
                                                groups_label)
    for fold_no, (train_idx, test_idx) in enumerate(folds, start=1):
        cv_folder = os.path.join(flags['experiment_folder'],
                                 'cv' + str(fold_no))
        create_dir(cv_folder)

        held_out = {key: items[test_idx] for key, items in obj_list.items()}
        remainder = {key: items[train_idx] for key, items in obj_list.items()}

        # n_splits=1, so the generator yields exactly one (train, val) pair
        splitter = StratifiedShuffleSplit(n_splits=1,
                                          test_size=val_percentage / 100.0)
        fit_idx, val_idx = next(
            splitter.split(remainder['image'], groups_label[train_idx]))
        fit_part = {key: items[fit_idx] for key, items in remainder.items()}
        val_part = {key: items[val_idx] for key, items in remainder.items()}

        # test, train and validation lists are written the same way
        for prefix, subset in (('test_', held_out), ('train_', fit_part),
                               ('val_', val_part)):
            for key, items in subset.items():
                filename = os.path.join(cv_folder, prefix + key + '_list.csv')
                if not os.path.isfile(filename):
                    KScsv.write_csv([[item] for item in items], filename)