Example 1
def compress_cube(job_info):
    """Compress a single raw uint8 cube file to JPEG or JPEG 2000.

    Args:
        job_info:
            An object that holds the source cube path, compressor choice
            ('jpeg' or 'j2k'), quality/ratio setting, pre-blur sigma,
            cube edge length and the path to the OpenJPEG binary.
    """

    ref_time = time.time()
    cube_edge_len = job_info.cube_edge_len
    open_jpeg_bin_path = job_info.open_jpeg_bin_path

    if job_info.compressor == 'jpeg':
        if job_info.quality_or_ratio < 40:
            raise Exception("Improbable quality value set for jpeg as "
                            "compressor: Use values between 50 and 90 for "
                            "reasonable results. "
                            "Higher value -> better quality.")
    elif job_info.compressor == 'j2k':
        if job_info.quality_or_ratio > 20:
            raise Exception("Improbable quality value set for j2k as "
                            "compressor: Use values between 2 and 10 for "
                            "reasonable results. "
                            "Lower value -> better quality.")

    cube_path_without_ending = os.path.splitext(job_info.src_cube_path)[0]

    if FADVISE_AVAILABLE:
        fadvise.willneed(job_info.src_cube_path)

    if job_info.compressor == 'jpeg':
        cube_raw = np.fromfile(job_info.src_cube_path, dtype=np.uint8)

        cube_raw = cube_raw.reshape(cube_edge_len * cube_edge_len,
                                    cube_edge_len)

        if job_info.pre_gauss > 0.0:
            # blur only in 2d
            if CV2_AVAILABLE:
                cv2.GaussianBlur(cube_raw, (5, 5), job_info.pre_gauss,
                                 cube_raw)
            else:
                cube_raw = scipy.ndimage.filters.gaussian_filter(
                    cube_raw, job_info.pre_gauss)

        # the exact shape of the 2d representation for compression is
        # actually important!
        # PIL performs reasonably fast; one could try libjpeg-turbo to make
        # it even faster, but IO is the bottleneck anyway
        cube_img = Image.fromarray(cube_raw)

        cube_img.save(cube_path_without_ending + '.jpg',
                      quality=job_info.quality_or_ratio)

    elif job_info.compressor == 'j2k':
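        # Invoke the OpenJPEG encoder: -r sets the compression ratio, -b the
        # code-block size, -n the number of resolutions, and -F describes the
        # raw input layout (width, height, components, bit depth, signedness).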
        cmd_string = open_jpeg_bin_path + \
                     ' -i ' + job_info.src_cube_path +\
                     ' -o ' + cube_path_without_ending + '.jp2' +\
                     ' -r ' + str(job_info.quality_or_ratio) +\
                     ' -b 64,64 -s 1,1 -n 3 ' +\
                     ' -F ' + str(cube_edge_len) + ',' +\
                     str(cube_edge_len*cube_edge_len) + ',1,8,u'
        os.system(cmd_string)

    # Report via the shared log queue rather than through log_fn, because
    # log_fn may not be able to write to its data structure safely from
    # multiple processes.
    compress_cube.log_queue.put("Compress, writing of {0} took: {1} s".format(
        job_info.src_cube_path,
        time.time() - ref_time))

    return
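
A minimal driver sketch, not part of the original module: compress_cube reads its log queue from the compress_cube.log_queue function attribute, so one way to attach a shared queue in every worker of a multiprocessing pool is through a pool initializer. The names _init_worker, run_compression and jobs are illustrative placeholders.

import multiprocessing as mp

def _init_worker(queue):
    # attach the shared queue as a function attribute, as compress_cube expects
    compress_cube.log_queue = queue

def run_compression(jobs, n_workers=4):
    manager = mp.Manager()
    log_queue = manager.Queue()
    pool = mp.Pool(n_workers, initializer=_init_worker, initargs=(log_queue,))
    pool.map(compress_cube, jobs)
    pool.close()
    pool.join()
    # drain the timing messages written by the workers
    while not log_queue.empty():
        print(log_queue.get())
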
Example 2
    def addDataFromFile(self,
                        d_path,
                        l_path,
                        d_files,
                        l_files,
                        cube_prios=None,
                        valid_cubes=[],
                        downsample_xy=False):
        """
        Parameters
        ----------

        d_path/l_path: string
          Directories to load data from
        d_files/l_files: list
          List of data/label files in <path> directory (must be in the same
          order!). Each list element is a tuple of the form
          **(<Name of h5-file>, <Key of h5-dataset>)**
        cube_prios: list
          (not normalised) list of sampling weights used to draw examples from
          the respective cubes. If None, the cube sizes are taken as priorities.
        valid_cubes: list
          List of indices of cubes (from the file lists) to use as validation
          data and exclude from training; may be an empty list to skip
          performance estimation on validation data.
        """
        self.names += d_files
        if fadvise_avail:
            names = reduce(lambda x, y: x + [d_path + y[0][0], l_path + y[1][0]], zip(d_files, l_files), [])
            fadvise.willneed(names)
        # returns lists of cubes, info is a tuple per cube
        data, label, info = self._read_images(d_path, l_path, d_files, l_files, downsample_xy)

        if self.mode == 'img-scalar':
            data = _borderTreatment(data, self.patch_size, self.border_mode, self.n_dim)

        if self.pre_process:
            if self.pre_process == 'standardise':
                M = np.mean(map(np.mean, data))
                S = np.mean(map(np.std, data))
                data = map(lambda x: (x - M) / S, data)
                print "Data is standardised. Original mean: %.g, original std %.g" % (M, S)
                self.data_mean = M
                self.data_std = S

            else:
                raise NotImplementedError("Pre-processing %s is not implemented" % self.pre_process)

        if self.n_lab is None:
            unique = [np.unique(l) for l in label]
            self.n_lab = np.unique(np.hstack(unique)).size

        default_info = (np.ones(self.n_lab), np.zeros(self.n_lab))
        info = map(lambda x: default_info if x is None else x, info)
        self.info += info

        prios = []
        # Distribute Cubes into training and valid list
        for k, (d, l, i) in enumerate(zip(data, label, info)):
            if k in valid_cubes:
                self.valid_d.append(d)
                self.valid_l.append(l)
                self.valid_i.append(i)
            else:
                self.train_d.append(d)
                self.train_l.append(l)
                self.train_i.append(i)
                # If no priorities are given: sample proportional to cube size
                prios.append(l.size)

        if cube_prios is None or cube_prios == []:
            prios = np.array(prios, dtype=np.float)
        else:  # If priorities are given: sample irrespective of cube size
            prios = np.array(cube_prios, dtype=np.float)

        # cumulative weights: cube i is sampled when a uniform draw p falls
        # into [_sampling_weight[i], _sampling_weight[i+1])
        self._sampling_weight = np.hstack((0, np.cumsum(prios / prios.sum())))
        self._training_count = len(self.train_d)
        self._valid_count = len(self.valid_d)

        print self.__repr__()
        print '\n'
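
A short sketch of how the cumulative weights in _sampling_weight can be turned into a cube index, sampled proportionally to its priority; this is an assumption about how the class consumes the array, not code taken from it.

import numpy as np

def draw_cube_index(sampling_weight, rng=np.random):
    # sampling_weight has the form [0, c1, c2, ..., 1]; a uniform draw p
    # falls into exactly one interval, which identifies the sampled cube
    p = rng.rand()
    return int(np.searchsorted(sampling_weight, p, side='right') - 1)
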
Example 3
def downsample_cube(job_info):
    """TODO

    Args:
        job_info (downsample_job_info):
            An object that holds data required for downsampling.
    """

    ref_time = time.time()
    # the first cube in the list contains the new coordinate of the created
    # downsampled out-cube

    cube_edge_len = job_info.cube_edge_len
    skip_already_cubed_layers = job_info.skip_already_cubed_layers

    down_block = np.zeros(
        [cube_edge_len * 2, cube_edge_len * 2, cube_edge_len * 2],
        dtype=np.uint8,
        order='F')

    if skip_already_cubed_layers:
        if os.path.exists(job_info.trg_cube_path):
            if os.path.getsize(job_info.trg_cube_path) == cube_edge_len**3:
                return 'skipped'

    if FADVISE_AVAILABLE:
        for src_path in job_info.src_cube_paths:
            fadvise.willneed(src_path)

    for path_to_src_cube, src_coord in zip(job_info.src_cube_paths,
                                           job_info.src_cube_local_coords):
        if path_to_src_cube == 'bogus':
            continue

        # The numpy fromfile function would work here as well, but reading
        # the whole file through io.open turned out to be significantly
        # faster on the cluster this was written for.
        with io.open(path_to_src_cube, 'rb') as fd:
            content = fd.read()

        this_cube = np.frombuffer(content, dtype=np.uint8).reshape(
            [cube_edge_len, cube_edge_len, cube_edge_len], order='F')

        this_cube = np.swapaxes(this_cube, 0, 2)
        #this_cube = np.swapaxes(this_cube, 1, 2)

        down_block[src_coord[2]*cube_edge_len:\
                   src_coord[2]*cube_edge_len+cube_edge_len,
                   src_coord[1]*cube_edge_len:\
                   src_coord[1]*cube_edge_len+cube_edge_len,
                   src_coord[0]*cube_edge_len:\
                   src_coord[0]*cube_edge_len+cube_edge_len] = this_cube

        #down_block = np.swapaxes(down_block, 0, 1)

    # It is not entirely clear whether this zoom call does the right thing:
    # strictly, one should low-pass filter the data first and then re-sample
    # to avoid aliasing. The prefilter setting is possibly not working as
    # intended; the scipy documentation appears to disagree with the actual
    # source code, so that pre-filtering is only triggered for orders > 1,
    # even if set to True. The assumption here is that bilinear or higher
    # order re-sampling is itself enough "filtering" for this use case.
    # This page by Stephan Saalfeld has some interesting details, and there
    # is a ton of material from the photography community:
    # http://fiji.sc/Downsample
    # My personal experience: avoid nearest neighbour (i.e. naive re-sampling
    # without any filtering), especially for noisy images. Beyond that, the
    # gains of more sophisticated filters become less clear and depend on the
    # data and the scaling factor.
    down_block = scipy.ndimage.interpolation.zoom(
        down_block,
        0.5,
        output=np.uint8,
        # 1: bilinear
        # 2: bicubic
        order=1,
        # 'nearest' here does not mean nearest-neighbour interpolation; it
        # only controls how values beyond the array borders are treated.
        mode='nearest',
        prefilter=True)

    # extract directory of out_path
    #if not os.path.exists(os.path.dirname(job_info.trg_cube_path)):
    #    os.makedirs(os.path.dirname(job_info.trg_cube_path))

    downsample_cube.log_queue.put("Downsampling took: {0} s for {1}".format(
        time.time() - ref_time, job_info.trg_cube_path))

    #down_block.tofile(job_info.trg_cube_path)

    return down_block
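
A small standalone illustration of the point made in the comment above, comparing nearest-neighbour (order=0) and bilinear (order=1) re-sampling on synthetic noisy data; it is not part of the original pipeline.

import numpy as np
import scipy.ndimage

rng = np.random.RandomState(0)
noisy = np.clip(128 + 30 * rng.randn(64, 64, 64), 0, 255).astype(np.uint8)

# order=0 simply picks voxels and keeps the full noise level,
# order=1 averages neighbouring values during decimation
nearest = scipy.ndimage.zoom(noisy, 0.5, order=0, output=np.uint8)
linear = scipy.ndimage.zoom(noisy, 0.5, order=1, output=np.uint8,
                            mode='nearest', prefilter=True)

# the linearly re-sampled block shows a lower noise level
print(nearest.std(), linear.std())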