def compress_cube(job_info):
    """Compress a single raw cube to JPEG or JPEG 2000.

    Args:
        job_info: An object that holds the source cube path, the compressor
            to use ('jpeg' or 'j2k'), the quality or compression ratio and
            further settings required for compression.
    """
    ref_time = time.time()

    cube_edge_len = job_info.cube_edge_len
    open_jpeg_bin_path = job_info.open_jpeg_bin_path

    if job_info.compressor == 'jpeg':
        if job_info.quality_or_ratio < 40:
            raise Exception("Improbable quality value set for jpeg as "
                            "compressor: Use values between 50 and 90 for "
                            "reasonable results. "
                            "Higher value -> better quality.")
    elif job_info.compressor == 'j2k':
        if job_info.quality_or_ratio > 20:
            raise Exception("Improbable quality value set for j2k as "
                            "compressor: Use values between 2 and 10 for "
                            "reasonable results. "
                            "Lower value -> better quality.")

    cube_path_without_ending = os.path.splitext(job_info.src_cube_path)[0]

    if FADVISE_AVAILABLE:
        fadvise.willneed(job_info.src_cube_path)

    if job_info.compressor == 'jpeg':
        cube_raw = np.fromfile(job_info.src_cube_path, dtype=np.uint8)
        # The exact shape of the 2d representation used for compression is
        # actually important!
        cube_raw = cube_raw.reshape(cube_edge_len * cube_edge_len,
                                    cube_edge_len)

        if job_info.pre_gauss > 0.0:
            # blur only in 2d
            if CV2_AVAILABLE:
                cv2.GaussianBlur(cube_raw, (5, 5), job_info.pre_gauss,
                                 cube_raw)
            else:
                cube_raw = scipy.ndimage.filters.gaussian_filter(
                    cube_raw, job_info.pre_gauss)

        # PIL performs reasonably fast; one could try libjpeg-turbo to make
        # it even faster, but IO is the bottleneck anyway.
        cube_img = Image.fromarray(cube_raw)
        cube_img.save(cube_path_without_ending + '.jpg',
                      quality=job_info.quality_or_ratio)
    elif job_info.compressor == 'j2k':
        cmd_string = open_jpeg_bin_path + \
            ' -i ' + job_info.src_cube_path + \
            ' -o ' + cube_path_without_ending + '.jp2' + \
            ' -r ' + str(job_info.quality_or_ratio) + \
            ' -b 64,64 -s 1,1 -n 3 ' + \
            ' -F ' + str(cube_edge_len) + ',' + \
            str(cube_edge_len * cube_edge_len) + ',1,8,u'
        os.system(cmd_string)

    # Report through the queue instead of calling log_fn directly, because
    # log_fn may not be able to write to a shared data structure from
    # multiple processes.
    compress_cube.log_queue.put("Compress, writing of {0} took: {1} s".format(
        job_info.src_cube_path, time.time() - ref_time))

    return
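
# Usage sketch added for illustration; it is NOT part of the original module.
# The namedtuple below only mirrors the attributes compress_cube() accesses;
# the real job container and the way log_queue gets attached (normally done
# by the worker-pool setup) may differ, and the path is purely illustrative.
def _compress_cube_usage_sketch():
    from collections import namedtuple
    import multiprocessing

    CompressJobInfo = namedtuple('CompressJobInfo', [
        'src_cube_path', 'compressor', 'quality_or_ratio', 'pre_gauss',
        'cube_edge_len', 'open_jpeg_bin_path'])

    job = CompressJobInfo(
        src_cube_path='/data/mag1/example_cube.raw',  # illustrative path
        compressor='jpeg',      # 'jpeg' or 'j2k'
        quality_or_ratio=80,    # jpeg: 50-90, j2k: 2-10 (see checks above)
        pre_gauss=0.0,          # optional 2d blur strength before saving
        cube_edge_len=128,
        open_jpeg_bin_path='')  # only needed for the 'j2k' branch

    # compress_cube() reports timings through a queue attached to the
    # function object, so that worker processes can log to the parent.
    compress_cube.log_queue = multiprocessing.Manager().Queue()
    compress_cube(job)
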
def addDataFromFile(self, d_path, l_path, d_files, l_files, cube_prios=None,
                    valid_cubes=[], downsample_xy=False):
    """
    Parameters
    ----------

    d_path/l_path: string
        Directories to load data from
    d_files/l_files: list
        List of data/label files in <path> directory (must be in the same
        order!). Each list element is a tuple in the form
        **(<Name of h5-file>, <Key of h5-dataset>)**
    cube_prios: list
        (not normalised) list of sampling weights to draw examples from the
        respective cubes. If None the cube sizes are taken as priorities.
    valid_cubes: list
        List of indices for cubes (from the file-lists) to use as validation
        data and exclude from training, may be empty list to skip performance
        estimation on validation data.
    """
    self.names += d_files
    if fadvise_avail:
        names = reduce(lambda x, y: x + [d_path + y[0][0], l_path + y[1][0]],
                       zip(d_files, l_files), [])
        fadvise.willneed(names)

    # returns lists of cubes, info is a tuple per cube
    data, label, info = self._read_images(d_path, l_path, d_files, l_files,
                                          downsample_xy)
    if self.mode == 'img-scalar':
        data = _borderTreatment(data, self.patch_size, self.border_mode,
                                self.n_dim)
    if self.pre_process:
        if self.pre_process == 'standardise':
            M = np.mean(map(np.mean, data))
            S = np.mean(map(np.std, data))
            data = map(lambda x: (x - M) / S, data)
            print "Data is standardised. Original mean: %.g, original std %.g" % (M, S)
            self.data_mean = M
            self.data_std = S
        else:
            raise NotImplementedError("Pre-processing %s is not implemented"
                                      % self.pre_process)

    if self.n_lab is None:
        unique = [np.unique(l) for l in label]
        self.n_lab = np.unique(np.hstack(unique)).size

    default_info = (np.ones(self.n_lab), np.zeros(self.n_lab))
    info = map(lambda x: default_info if x is None else x, info)
    self.info += info

    prios = []
    # Distribute cubes into training and validation lists
    for k, (d, l, i) in enumerate(zip(data, label, info)):
        if k in valid_cubes:
            self.valid_d.append(d)
            self.valid_l.append(l)
            self.valid_i.append(i)
        else:
            self.train_d.append(d)
            self.train_l.append(l)
            self.train_i.append(i)
            # If no priorities are given: sample proportional to cube size
            prios.append(l.size)

    if cube_prios is None or cube_prios == []:
        prios = np.array(prios, dtype=np.float)
    else:  # If priorities are given: sample irrespective of cube size
        prios = np.array(cube_prios, dtype=np.float)

    # sample example i if: batch_prob[i] < p
    self._sampling_weight = np.hstack((0, np.cumsum(prios / prios.sum())))
    self._training_count = len(self.train_d)
    self._valid_count = len(self.valid_d)

    print self.__repr__()
    print '\n'
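
# Usage sketch added for illustration; it is NOT part of the original class.
# `dataset` stands for an instance of whatever class defines addDataFromFile;
# the directory, file names and HDF5 keys are purely illustrative.
#
# dataset.addDataFromFile(
#     d_path='~/data/', l_path='~/data/',
#     d_files=[('raw_0.h5', 'raw'), ('raw_1.h5', 'raw')],
#     l_files=[('labels_0.h5', 'lab'), ('labels_1.h5', 'lab')],
#     cube_prios=None,   # sample proportional to cube size
#     valid_cubes=[1])   # hold out the second cube for validation
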
def downsample_cube(job_info):
    """Downsample a 2x2x2 block of source cubes into a single output cube.

    Args:
        job_info (downsample_job_info): An object that holds data required
            for downsampling.
    """
    ref_time = time.time()

    # the first cube in the list contains the new coordinate of the created
    # downsampled out-cube
    cube_edge_len = job_info.cube_edge_len
    skip_already_cubed_layers = job_info.skip_already_cubed_layers

    down_block = np.zeros(
        [cube_edge_len * 2, cube_edge_len * 2, cube_edge_len * 2],
        dtype=np.uint8, order='F')

    if skip_already_cubed_layers:
        if os.path.exists(job_info.trg_cube_path):
            if os.path.getsize(job_info.trg_cube_path) == cube_edge_len**3:
                return 'skipped'

    if FADVISE_AVAILABLE:
        for src_path in job_info.src_cube_paths:
            fadvise.willneed(src_path)

    for path_to_src_cube, src_coord in zip(job_info.src_cube_paths,
                                           job_info.src_cube_local_coords):
        if path_to_src_cube == 'bogus':
            continue

        # Yes, I know the numpy fromfile function - reading the whole file
        # through io.open is significantly faster on our cluster.
        fd = io.open(path_to_src_cube, 'rb')
        content = fd.read(-1)
        fd.close()

        this_cube = np.fromstring(content, dtype=np.uint8).reshape(
            [cube_edge_len, cube_edge_len, cube_edge_len], order='F')
        this_cube = np.swapaxes(this_cube, 0, 2)

        down_block[src_coord[2]*cube_edge_len:
                   src_coord[2]*cube_edge_len + cube_edge_len,
                   src_coord[1]*cube_edge_len:
                   src_coord[1]*cube_edge_len + cube_edge_len,
                   src_coord[0]*cube_edge_len:
                   src_coord[0]*cube_edge_len + cube_edge_len] = this_cube

    # It is not clear to me whether this zooming function actually does the
    # right thing. One should first filter the data and then re-sample to
    # avoid aliasing. The prefilter setting is possibly not working
    # correctly, as the scipy documentation appears to be not in agreement
    # with the actual source code, so that pre-filtering is only triggered
    # for orders > 1, even if set to True. I assume that bilinear or higher
    # order re-sampling itself is "filtering" and is "good enough" for our
    # case.
    # This website by Stephan Saalfeld has some interesting details, but
    # there is a ton of material coming from the photography community:
    # http://fiji.sc/Downsample
    # My personal experience: avoid nearest neighbor (i.e. naive re-sampling
    # without any filtering), especially for noisy images. On top of that,
    # the gains of more sophisticated filters become less clear, and are
    # data and scaling factor dependent.
    down_block = scipy.ndimage.interpolation.zoom(
        down_block, 0.5,
        output=np.uint8,
        # 1: bilinear, 2: bicubic
        order=1,
        # mode does not mean nearest interpolation here; it only controls
        # how the borders are treated.
        mode='nearest',
        prefilter=True)

    downsample_cube.log_queue.put("Downsampling took: {0} s for {1}".format(
        time.time() - ref_time, job_info.trg_cube_path))

    # The caller is responsible for writing the returned block to disk
    # (e.g. down_block.tofile(job_info.trg_cube_path)).
    return down_block
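
# Usage sketch added for illustration; it is NOT part of the original module.
# The namedtuple below only mirrors the attributes downsample_cube() accesses;
# the real job container and the way log_queue gets attached may differ, and
# all paths are purely illustrative. The eight source cubes form a 2x2x2
# block addressed by local (x, y, z) offsets; missing cubes are passed as the
# string 'bogus' and stay black in the output.
def _downsample_cube_usage_sketch():
    from collections import namedtuple
    import itertools
    import multiprocessing

    DownsampleJobInfo = namedtuple('DownsampleJobInfo', [
        'src_cube_paths', 'src_cube_local_coords', 'trg_cube_path',
        'cube_edge_len', 'skip_already_cubed_layers'])

    job = DownsampleJobInfo(
        src_cube_paths=['/data/mag1/cube_%d.raw' % i for i in range(8)],
        src_cube_local_coords=list(itertools.product(range(2), repeat=3)),
        trg_cube_path='/data/mag2/cube.raw',
        cube_edge_len=128,
        skip_already_cubed_layers=True)

    downsample_cube.log_queue = multiprocessing.Manager().Queue()
    result = downsample_cube(job)
    if result != 'skipped':
        # The caller writes the returned block to the target path.
        result.tofile(job.trg_cube_path)
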