def get_data_paths(self, path, folds=5):
    ''' Split the data files found at `path` into `folds` folds.

    All brains belonging to the same patient are placed in the same fold,
    so no patient is shared between folds. Patients are visited in random
    order (using numpy's global random state) and each one is assigned to
    the fold that is currently furthest below the target fold size.

    The result is stored in `self.fold_itens` (one entry per fold, each
    holding that fold's file names); nothing is returned.

    Args:
        path: location handed to `Files` to enumerate the data files.
        folds: number of folds to create.
    '''
    total_folds_itens = np.zeros(folds)

    # Get data files.
    brains_data = Files(path)
    brains_data = brains_data.get_file_names()
    total_brains = len(brains_data)
    # Target fold size, rounded to the nearest integer. The builtin int()
    # is used because the np.int alias was removed in NumPy 1.24.
    fold_size = int(total_brains * 1.0 / folds + 0.5)

    # Group brains by patient.
    brains_by_patient = group_brains_by_patient_id(brains_data)
    # Materialise the keys: np.random.shuffle needs a mutable sequence and
    # cannot shuffle a Python 3 dict view.
    patient_ids = list(brains_by_patient.keys())
    np.random.shuffle(patient_ids)

    fold_itens = [[] for _ in range(folds)]

    # Split data, avoiding brains of the same patient being moved to
    # different sets.
    for patient in patient_ids:
        # Remaining capacity of every fold; the largest value belongs to
        # the least-filled fold (ties resolve to the lowest index).
        delta_folds_itens = fold_size - total_folds_itens
        selected_fold = np.argmax(delta_folds_itens)
        for brain in brains_by_patient[patient]:
            fold_itens[selected_fold].append(brain)
            total_folds_itens[selected_fold] += 1

    try:
        self.fold_itens = np.array(fold_itens)
    except ValueError:
        # Folds usually differ in length; NumPy >= 1.24 refuses to build a
        # ragged array implicitly, so request an object array explicitly.
        self.fold_itens = np.array(fold_itens, dtype=object)
def split_train_data(self, paths, ratio=None):
    ''' Split `paths` into patient-disjoint train and validation sets.

    The validation set is filled category by category: each category gets
    a quota proportional to its share of the data, and whole patients are
    moved to validation until that quota is used up. Every remaining file
    goes to the train set. Both lists are shuffled in place with numpy's
    global random state before being returned.

    Args:
        paths: locations assigned to `Files.paths` to enumerate the files.
        ratio: fraction of the files to reserve for validation. When None
            it defaults to 0.05 / (1 - 1 / len(self.fold_itens)), so
            `self.fold_itens` must already be set (presumably this makes
            the validation set ~5% of the full dataset when `paths` holds
            all folds but one -- confirm against the caller).

    Returns:
        A `(train_paths, validation_paths)` tuple of lists of file names.
    '''
    if ratio is None:
        ratio = 0.05 / (1 - 1.0 / len(self.fold_itens))

    # Get data files.
    brains_data = Files("")
    brains_data.paths = paths
    brains_data = brains_data.get_file_names()
    total_brains = len(brains_data)
    # Builtin int(): the np.int alias was removed in NumPy 1.24.
    validation_size = int(total_brains * ratio)

    brains_by_patient = group_brains_by_patient_id(brains_data)
    brains_by_category = group_brains_by_category(brains_data)

    # Per-category validation quota, proportional to the category size.
    statistic = {}
    for label in brains_by_category:
        statistic[label] = np.round(
            len(brains_by_category[label]) * validation_size * 1.0 /
            total_brains)

    # Create the validation set, moving whole patients at a time.
    validation_paths = []
    selected_patients = set()
    for label in brains_by_category:
        # Iterating (instead of indexing with a counter) guarantees the
        # loop stops at the end of the category even if the quota cannot
        # be met.
        for brain in brains_by_category[label]:
            if statistic[label] <= 0:
                break
            patient_id = get_patient_id(brain)
            # A patient may own several brains; without this check the
            # patient's files would be appended again for each of them,
            # duplicating paths and over-consuming the quotas.
            if patient_id in selected_patients:
                continue
            selected_patients.add(patient_id)
            for brain_patient in brains_by_patient[patient_id]:
                validation_paths.append(brain_patient)
                # The patient's brains may span categories, so charge
                # each one to its own category's quota.
                statistic[get_category(brain_patient)] -= 1

    # Everything not selected for validation is training data. A set gives
    # O(1) membership tests (file names are assumed to be hashable).
    validation_lookup = set(validation_paths)
    train_paths = [brain for brain in brains_data
                   if brain not in validation_lookup]

    # Shuffle data in place. Shuffling np.array(...) copies of the lists
    # would leave the returned lists in their original order.
    np.random.shuffle(validation_paths)
    np.random.shuffle(train_paths)

    return train_paths, validation_paths