def save_train_validation_ids(filename, data_path):
    """Split the patient studies under ``data_path`` into train/validation ids.

    Study directories are ordered by the numeric patient id embedded in the
    path, and fold 0 of the cross-validation indices becomes the validation
    set.  The dict ``{'train': [...], 'valid': [...]}`` is pickled to
    ``filename`` and returned.
    """
    # Sort numerically by patient id, not lexicographically by path.
    patient_dirs = sorted(glob.glob(data_path + "/*/study/"),
                          key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)))
    dirs_indices = list(range(0, len(patient_dirs)))
    valid_dirs_indices = get_cross_validation_indices(indices=dirs_indices,
                                                      validation_index=0)
    train_patient_indices = list(set(dirs_indices) - set(valid_dirs_indices))
    train_patient_dirs = [utils.get_patient_id(patient_dirs[idx])
                          for idx in train_patient_indices]
    validation_patient_dirs = [utils.get_patient_id(patient_dirs[idx])
                               for idx in valid_dirs_indices]
    d = {'train': train_patient_dirs, 'valid': validation_patient_dirs}
    utils.save_pkl(d, filename)
    # print() call for Python 3 consistency with the rest of the file.
    print('train-valid patients split saved to', filename)
    return d
def __init__(self, data_path, batch_size, transform_params, patient_ids=None,
             labels_path=None, slice2roi_path=None, full_batch=False,
             random=True, infinite=True, min_slices=5, **kwargs):
    """Patient iterator over SAX slice stacks plus optional 2ch/4ch views.

    Patients with ``min_slices`` or fewer SAX slices are skipped entirely.
    """
    if patient_ids:
        patient_paths = []
        for pid in patient_ids:
            patient_paths.append(data_path + '/%s/study/' % pid)
    else:
        patient_paths = glob.glob(data_path + '/*/study/')

    self.pid2sax_slice_paths = defaultdict(list)
    self.pid2ch2_path, self.pid2ch4_path = {}, {}
    for p in patient_paths:
        pid = int(utils.get_patient_id(p))
        # Order SAX slices numerically by the index in the filename.
        spaths = sorted(glob.glob(p + '/sax_*.pkl'),
                        key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
        # NOTE(review): 2ch/4ch lookups assumed inside the min_slices filter,
        # matching the sibling constructor -- confirm original nesting.
        if len(spaths) > min_slices:
            self.pid2sax_slice_paths[pid] = spaths
            ch2_path = glob.glob(p + '/2ch_*.pkl')
            self.pid2ch2_path[pid] = ch2_path[0] if ch2_path else None
            ch4_path = glob.glob(p + '/4ch_*.pkl')
            self.pid2ch4_path[pid] = ch4_path[0] if ch4_path else None

    # list() keeps the ids indexable under Python 3 (dict views are not).
    self.patient_ids = list(self.pid2sax_slice_paths.keys())
    self.nsamples = len(self.patient_ids)
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.batch_size = batch_size  # original assigned this twice; once suffices
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.transformation_params = transform_params
    self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
def __init__(self, data_path, batch_size, transform_params, patient_ids=None,
             labels_path=None, slice2roi_path=None, full_batch=False,
             random=True, infinite=False, view='sax',
             data_prep_fun=data.transform_norm_rescale, **kwargs):
    """Slice-level iterator: one sample per pickle of the requested view."""
    if patient_ids:
        self.patient_paths = [data_path + '/%s/study/' % pid
                              for pid in patient_ids]
    else:
        self.patient_paths = glob.glob(data_path + '/*/study/')

    # Collect per-patient slice lists, then flatten into one path list.
    per_patient_slices = [sorted(glob.glob(p + '/%s_*.pkl' % view))
                          for p in self.patient_paths]
    self.slice_paths = list(itertools.chain(*per_patient_slices))
    self.slicepath2pid = {s: int(utils.get_patient_id(s))
                          for s in self.slice_paths}

    self.nsamples = len(self.slice_paths)
    self.batch_size = batch_size
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.transformation_params = transform_params
    self.data_prep_fun = data_prep_fun
    self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
def get_patient_data(patient_data_path):
    """Load every slice pickle of one patient, ordered by slice number."""
    slice_number = lambda path: int(re.search(r'/\w*_(\d+)*\.pkl$', path).group(1))
    spaths = sorted(glob.glob(patient_data_path + '/*.pkl'), key=slice_number)
    pid = utils.get_patient_id(patient_data_path)
    patient_data = []
    for spath in spaths:
        sid = utils.get_slice_id(spath)
        meta = data.read_metadata(spath)
        slice_array = data.read_slice(spath)
        patient_data.append({'data': slice_array,
                             'metadata': meta,
                             'slice_id': sid,
                             'patient_id': pid})
    return patient_data
def split_train_data(self, paths, ratio=None):
    """Split brain file paths into train/validation, keeping patients whole.

    Validation examples are drawn patient-at-a-time, per category, until each
    category's quota (proportional to its share of the data) is satisfied.

    Returns ``(train_paths, validation_paths)``, both shuffled.
    """
    if ratio is None:  # `is None`, not `== None`
        ratio = 0.05 / (1 - 1.0 / len(self.fold_itens))
    # Get data files.
    brains_data = Files("")
    brains_data.paths = paths
    brains_data = brains_data.get_file_names()
    total_brains = len(brains_data)
    validation_size = int(total_brains * ratio)  # np.int is removed in numpy>=1.24

    # Get total of brains by category.
    brains_by_patient = group_brains_by_patient_id(brains_data)
    validation_paths = []
    brains_by_category = group_brains_by_category(brains_data)

    # Per-category validation quota, proportional to the category's size.
    statistic = {}
    for label in brains_by_category:
        statistic[label] = np.round(
            len(brains_by_category[label]) * validation_size * 1.0 / total_brains)

    # Move whole patients into the validation set until every quota is met.
    for label in brains_by_category:
        index_brains = 0
        while statistic[label] > 0:
            patient_id = get_patient_id(
                brains_by_category[label][index_brains])
            for brain_patient in brains_by_patient[patient_id]:
                validation_paths.append(brain_patient)
                statistic[get_category(brain_patient)] -= 1
            index_brains += 1

    # Set membership is O(1); the original scanned the list per brain.
    validation_set = set(validation_paths)
    train_paths = [brain for brain in brains_data
                   if brain not in validation_set]

    # BUG FIX: the original shuffled throwaway np.array copies, leaving the
    # returned lists unshuffled.  np.random.shuffle works in place on lists.
    np.random.shuffle(validation_paths)
    np.random.shuffle(train_paths)
    return train_paths, validation_paths
def save_train_validation_ids(filename, data_path):
    """Persist a deterministic train/validation patient-id split to ``filename``."""
    numeric_pid = lambda folder: int(re.search(r'/(\d+)/', folder).group(1))
    patient_dirs = sorted(glob.glob(data_path + "/*/study/"), key=numeric_pid)

    dirs_indices = list(range(0, len(patient_dirs)))
    valid_dirs_indices = get_cross_validation_indices(indices=dirs_indices,
                                                      validation_index=0)
    train_patient_indices = list(set(dirs_indices) - set(valid_dirs_indices))

    split = {
        'train': [utils.get_patient_id(patient_dirs[i])
                  for i in train_patient_indices],
        'valid': [utils.get_patient_id(patient_dirs[i])
                  for i in valid_dirs_indices],
    }
    utils.save_pkl(split, filename)
    print('train-valid patients split saved to', filename)
    return split
def __init__(self, data_path, batch_size, transform_params, patient_ids=None,
             labels_path=None, slice2roi_path=None, full_batch=False,
             random=True, infinite=True, min_slices=0,
             data_prep_fun=data.transform_norm_rescale, **kwargs):
    """Patient-level iterator over SAX slice stacks.

    Only patients with more than ``min_slices`` SAX slices are kept;
    ``self.nslices`` is the deepest stack among the kept patients.
    """
    if patient_ids:
        patient_paths = []
        for pid in patient_ids:
            patient_paths.append(data_path + '/%s/study/' % pid)
    else:
        patient_paths = glob.glob(data_path + '/*/study/')

    self.pid2slice_paths = defaultdict(list)
    nslices = []
    for p in patient_paths:
        pid = int(utils.get_patient_id(p))
        spaths = sorted(
            glob.glob(p + '/sax_*.pkl'),
            key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
        # consider patients only with min_slices
        if len(spaths) > min_slices:
            self.pid2slice_paths[pid] = spaths
            nslices.append(len(spaths))

    # take max number of slices (still raises if no patient qualified)
    self.nslices = int(np.max(nslices))

    # list() keeps the ids indexable under Python 3 (dict views are not).
    self.patient_ids = list(self.pid2slice_paths.keys())
    self.nsamples = len(self.patient_ids)
    self.data_path = data_path
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.batch_size = batch_size  # original assigned this twice; once suffices
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.transformation_params = transform_params
    self.data_prep_fun = data_prep_fun
    self.slice2roi = utils.load_pkl(
        slice2roi_path) if slice2roi_path else None
def __init__(self, data_path, batch_size, transform_params, patient_ids=None,
             labels_path=None, slice2roi_path=None, full_batch=False,
             random=True, infinite=True, min_slices=5, **kwargs):
    """Patient iterator over SAX slice stacks plus optional 2ch/4ch views.

    Patients with ``min_slices`` or fewer SAX slices are skipped entirely.
    """
    if patient_ids:
        patient_paths = []
        for pid in patient_ids:
            patient_paths.append(data_path + '/%s/study/' % pid)
    else:
        patient_paths = glob.glob(data_path + '/*/study/')

    self.pid2sax_slice_paths = defaultdict(list)
    self.pid2ch2_path, self.pid2ch4_path = {}, {}
    for p in patient_paths:
        pid = int(utils.get_patient_id(p))
        # Order SAX slices numerically by the index in the filename.
        spaths = sorted(
            glob.glob(p + '/sax_*.pkl'),
            key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
        if len(spaths) > min_slices:
            self.pid2sax_slice_paths[pid] = spaths
            ch2_path = glob.glob(p + '/2ch_*.pkl')
            self.pid2ch2_path[pid] = ch2_path[0] if ch2_path else None
            ch4_path = glob.glob(p + '/4ch_*.pkl')
            self.pid2ch4_path[pid] = ch4_path[0] if ch4_path else None

    # list() keeps the ids indexable under Python 3 (dict views are not).
    self.patient_ids = list(self.pid2sax_slice_paths.keys())
    self.nsamples = len(self.patient_ids)
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.batch_size = batch_size  # original assigned this twice; once suffices
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.transformation_params = transform_params
    self.slice2roi = utils.load_pkl(
        slice2roi_path) if slice2roi_path else None
def get_patient_data(patient_data_path):
    """Read all slice pickles for one patient, ordered by slice number."""
    def slice_key(path):
        # Numeric slice index from names like '.../sax_12.pkl'.
        return int(re.search(r'/\w*_(\d+)*\.pkl$', path).group(1))

    slice_files = sorted(glob.glob(patient_data_path + '/*.pkl'), key=slice_key)
    pid = utils.get_patient_id(patient_data_path)

    records = []
    for sfile in slice_files:
        sid = utils.get_slice_id(sfile)
        meta = data.read_metadata(sfile)
        arr = data.read_slice(sfile)
        records.append({
            'data': arr,
            'metadata': meta,
            'slice_id': sid,
            'patient_id': pid
        })
    return records
def __init__(self, data_path, batch_size, transform_params, patient_ids=None,
             labels_path=None, slice2roi_path=None, full_batch=False,
             random=True, infinite=False, view='sax',
             data_prep_fun=data.transform_norm_rescale, **kwargs):
    """Iterator yielding one sample per slice pickle of the given view."""
    if patient_ids:
        self.patient_paths = [data_path + '/%s/study/' % pid
                              for pid in patient_ids]
    else:
        self.patient_paths = glob.glob(data_path + '/*/study/')

    # Gather each patient's sorted slice files, then flatten.
    nested = [sorted(glob.glob(p + '/%s_*.pkl' % view))
              for p in self.patient_paths]
    self.slice_paths = list(itertools.chain(*nested))

    # Reverse lookup: slice path -> integer patient id.
    self.slicepath2pid = {path: int(utils.get_patient_id(path))
                          for path in self.slice_paths}

    self.nsamples = len(self.slice_paths)
    self.batch_size = batch_size
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.transformation_params = transform_params
    self.data_prep_fun = data_prep_fun
    self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
def __init__(self, data_path, batch_size, transform_params, patient_ids=None,
             labels_path=None, slice2roi_path=None, full_batch=False,
             random=True, infinite=True, min_slices=0,
             data_prep_fun=data.transform_norm_rescale, **kwargs):
    """Patient-level iterator over SAX slice stacks.

    Only patients with more than ``min_slices`` SAX slices are kept;
    ``self.nslices`` is the deepest stack among the kept patients.
    """
    if patient_ids:
        patient_paths = []
        for pid in patient_ids:
            patient_paths.append(data_path + '/%s/study/' % pid)
    else:
        patient_paths = glob.glob(data_path + '/*/study/')

    self.pid2slice_paths = defaultdict(list)
    nslices = []
    for p in patient_paths:
        pid = int(utils.get_patient_id(p))
        spaths = sorted(glob.glob(p + '/sax_*.pkl'),
                        key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
        # consider patients only with min_slices
        if len(spaths) > min_slices:
            self.pid2slice_paths[pid] = spaths
            nslices.append(len(spaths))

    # take max number of slices (still raises if no patient qualified)
    self.nslices = int(np.max(nslices))

    # list() keeps the ids indexable under Python 3 (dict views are not).
    self.patient_ids = list(self.pid2slice_paths.keys())
    self.nsamples = len(self.patient_ids)
    self.data_path = data_path
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.batch_size = batch_size  # original assigned this twice; once suffices
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.transformation_params = transform_params
    self.data_prep_fun = data_prep_fun
    self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
def fit(self, X, y, paths, ratio=0.9, iterations=0):
    """Compute robust ANOVA feature scores via repeated patient-wise subsampling.

    Each iteration shuffles the rows, draws a patient-level subsample, runs
    ``f_classif`` scoring, and folds the result into ``self.scores_`` with an
    element-wise maximum.
    """
    if iterations != 0:
        self.iterations = iterations
    self.labels_ = np.unique(y)
    brains_by_patient = group_brains_by_patient_id(paths)
    # Reduce each path to its patient id for subsampling bookkeeping.
    paths = np.array([get_patient_id(p) for p in paths])
    self.scores_ = X[0] * 0.0
    for i in range(self.iterations):
        selection = SelectKBest(f_classif, k=X.shape[1])
        # np.random.shuffle cannot shuffle a range object under Python 3;
        # draw an index permutation instead.
        randomize = np.random.permutation(len(X))
        X = X[randomize]
        y = y[randomize]
        paths = paths[randomize]
        X_temp, y_temp = self.get_subsample_by_patient(
            X=X, y=y, patients=list(brains_by_patient.keys()),
            paths=paths, ratio=ratio)
        selection.fit(X_temp, y_temp)
        scores = selection.scores_
        # Zero out NaN/inf scores so the running maximum stays finite.
        scores[np.where(np.logical_or(np.isnan(scores),
                                      np.isinf(scores)))] = 0.0
        self.scores_ = np.max(np.vstack((scores, self.scores_)), axis=0)
        del scores
        del X_temp
        del y_temp
        print(i, np.mean(self.scores_))
train_patient_indices = list(set(dirs_indices) - set(valid_dirs_indices)) train_patient_dirs = [utils.get_patient_id(patient_dirs[idx]) for idx in train_patient_indices] validation_patient_dirs = [utils.get_patient_id(patient_dirs[idx]) for idx in valid_dirs_indices] d = {'train': train_patient_dirs, 'valid': validation_patient_dirs} utils.save_pkl(d, filename) print 'train-valid patients split saved to', filename return d if __name__ == '__main__': global_data_path = '/data/dsb15_pkl/pkl_train' p = save_train_validation_ids(global_data_path) print 'TRAIN' for path in p['train']: print utils.get_patient_id(path), print '\nVALID' valid_ids = [] for path in p['valid']: valid_ids.append(utils.get_patient_id(path)) print utils.get_patient_id(path), valid_ids1 = [] g = glob.glob('/data/dsb15_pkl/pkl_splitted/valid/*/study/') for path in g: valid_ids1.append(utils.get_patient_id(path)) print set(valid_ids) == set(valid_ids1)
]
validation_patient_dirs = [
    utils.get_patient_id(patient_dirs[idx]) for idx in valid_dirs_indices
]
# Persist the split so later runs reuse the same train/valid patients.
d = {'train': train_patient_dirs, 'valid': validation_patient_dirs}
utils.save_pkl(d, filename)
print('train-valid patients split saved to', filename)
return d


if __name__ == '__main__':
    global_data_path = '/data/dsb15_pkl/pkl_train'
    # NOTE(review): the visible definition takes (filename, data_path) but
    # only one argument is passed here -- confirm the intended signature.
    p = save_train_validation_ids(global_data_path)
    print('TRAIN')
    for path in p['train']:
        print(utils.get_patient_id(path), end=' ')
    print('\nVALID')
    valid_ids = []
    for path in p['valid']:
        valid_ids.append(utils.get_patient_id(path))
        print(utils.get_patient_id(path), end=' ')
    # Cross-check the saved split against the pre-split validation folder.
    valid_ids1 = []
    g = glob.glob('/data/dsb15_pkl/pkl_splitted/valid/*/study/')
    for path in g:
        valid_ids1.append(utils.get_patient_id(path))
    print(set(valid_ids) == set(valid_ids1))
] validation_patient_dirs = [ utils.get_patient_id(patient_dirs[idx]) for idx in valid_dirs_indices ] d = {'train': train_patient_dirs, 'valid': validation_patient_dirs} utils.save_pkl(d, filename) print 'train-valid patients split saved to', filename return d if __name__ == '__main__': global_data_path = '/data/dsb15_pkl/pkl_train' p = save_train_validation_ids(global_data_path) print 'TRAIN' for path in p['train']: print utils.get_patient_id(path), print '\nVALID' valid_ids = [] for path in p['valid']: valid_ids.append(utils.get_patient_id(path)) print utils.get_patient_id(path), valid_ids1 = [] g = glob.glob('/data/dsb15_pkl/pkl_splitted/valid/*/study/') for path in g: valid_ids1.append(utils.get_patient_id(path)) print set(valid_ids) == set(valid_ids1)
def __call__(self, tup):
    """Return True when no patient filter is set, or when ``tup`` belongs to
    the configured patient.
    """
    # `is None` rather than `== None`; `or` short-circuits so the id lookup
    # is skipped entirely when no filter is configured.
    return self.patient_id is None or self.patient_id == utils.get_patient_id(tup)
def __call__(self, tup):
    """Return True when no patient filter is set, or when ``tup`` belongs to
    the configured patient.
    """
    # `is None` rather than `== None`; `or` short-circuits so the id lookup
    # is skipped entirely when no filter is configured.
    return self.patient_id is None or self.patient_id == utils.get_patient_id(tup)