Example No. 1
def convert_to_np(dataset_dir, be_nice, create_new_dataset_dict,
                  create_metadata, split_npz_to_slices, create_dataset_splits,
                  debug):
    # PATHs
    output_dir = os.path.join(dataset_dir, 'preprocessed')
    output_dir_pred = os.path.join(dataset_dir, 'predictionset_preprocessed')

    # Make sure the output dirs start out empty (prevent mixing with old files)
    rmtree(output_dir, ignore_errors=True)
    rmtree(output_dir_pred, ignore_errors=True)
    os.makedirs(output_dir)
    os.makedirs(output_dir_pred)

    # Only use free resources on the machine (don't block the machine)
    if be_nice:
        os.nice(18)

    # Load dataset information (location of training data, labels, test data, ...)
    if create_new_dataset_dict:
        dataset_dict = create_dataset_dict(dataset_dir)
        print("Dataset dict created, continuing with np conversion")
    else:
        with open(os.path.join(dataset_dir, "dataset.pkl"), 'rb') as f:
            dataset_dict = pickle.load(f)

    # Prepare data for parallel processing
    params = []
    filename_pattern = r"^(.+/)*(.+)\.(.+)\.(.+)$"
    filename_prog = re.compile(filename_pattern)
    for i in range(len(dataset_dict['training'])):
        # Select image and corresponding label
        img = dataset_dict['training'][i]['image'].replace('./', '')
        lab = dataset_dict['training'][i]['label'].replace('./', '')
        _, image_path, image_filename, _, _, _ = filename_prog.split(img)
        output_file_path = os.path.join(output_dir, image_filename)

        params += [(img, lab, dataset_dir, output_file_path, debug)]

    for i in range(len(dataset_dict.get('test', []))):
        pred = dataset_dict['test'][i].replace('./', '')
        _, image_path, image_filename, _, _, _ = filename_prog.split(pred)
        output_file_path = os.path.join(output_dir_pred, image_filename)

        params += [(pred, None, dataset_dir, output_file_path, debug)]

    # Convert files in parallel
    # convert_to_np_worker(params[0][0], params[0][1], params[0][2], params[0][3], params[0][4])
    parallel_progbar(convert_to_np_worker, params, starmap=True)

    # Execute next steps
    if create_metadata:
        create_meta(dataset_dir, debug=debug)

    if split_npz_to_slices:
        split_npz_into_slices(dataset_dir, debug=debug)

    if create_dataset_splits:
        create_splits(dataset_dir)
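
A note on the six-way unpack above: re.Pattern.split on a pattern with capturing groups returns the text before the match, each captured group, and the text after the match, so a fully matching filename yields exactly six pieces. A runnable illustration with a hypothetical filename (not taken from any dataset dict):

import re

prog = re.compile(r"^(.+/)*(.+)\.(.+)\.(.+)$")
# -> ['', 'imagesTr/', 'la_003', 'nii', 'gz', '']
# The third element is the bare filename used to build output_file_path.
print(prog.split("imagesTr/la_003.nii.gz"))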
Example No. 2
def create_meta_dirs(image_dir, img_file_suffix, be_nice, debug):
    # Only use free resources on the machine (don't block the machine)
    if be_nice:
        os.nice(18)

    print("Extracting meta-data from images in {}".format(image_dir))

    # Create a list of images to process
    image_files = dict()
    files = []
    for f in os.listdir(image_dir):
        if f.endswith(img_file_suffix):
            files += [(os.path.join(image_dir, f), debug)]

    # Extract the image shapes in parallel
    image_shapes = parallel_progbar(extract_shape_from_image,
                                    files,
                                    starmap=True)

    # Store the results in format:
    # image_files[file] = shape
    for img in image_shapes:
        npz_file, npz_file_shape = img
        image_files[npz_file] = npz_file_shape

    dataset_size = len(image_files)

    print("Found {} images for meta collection".format(dataset_size))

    with open(os.path.join(image_dir, 'image_meta.pkl'), 'wb') as f:
        pickle.dump(image_files, f)
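
The collection loop above assumes every worker result is a (file, shape) pair. Below is a minimal, hypothetical sketch of such a worker; the real extract_shape_from_image is defined elsewhere in the project, and the assumption that the image sits under the archive's first array key is mine, not the example's:

import numpy as np

def extract_shape_from_image(npz_path, debug):
    # Assumption: the .npz stores the image under its first array key.
    with np.load(npz_path) as data:
        shape = data[data.files[0]].shape
    if debug:
        print("{}: {}".format(npz_path, shape))
    # Return the path that was passed in so the caller can use it as the key.
    return npz_path, shape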
Example No. 3
def split_npz_into_slices_dirs(be_nice, image_dir, output_dir, debug):
    # Make sure the output dir starts out empty (prevent mixing with old files)
    rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir)

    print("Splitting npz files in {}".format(image_dir))

    # Copy the meta-data file from the dataset to the new output dir
    copyfile(os.path.join(image_dir, "image_meta.pkl"), os.path.join(output_dir, "image_meta.pkl"))

    # Only use free resources on the machine (don't block the machine)
    if be_nice:
        os.nice(18)

    # Create a list of images to process
    images = []
    for image in os.listdir(image_dir):
        if image.endswith(".npz"):
            images += [(image_dir, image, output_dir, debug)]

    # Split the image slices in parallel
    empty_slices = parallel_progbar(split_npz_worker, images, starmap=True)

    empty_slices_dict = {}
    for volume_name, slice_ids in empty_slices:
        empty_slices_dict[volume_name] = slice_ids

    with open(os.path.join(image_dir, "empty_labels.pkl"), 'wb') as empty_slices_file:
        pickle.dump(empty_slices_dict, empty_slices_file)
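
The loop after the parallel call expects each worker to return a (volume_name, empty_slice_ids) pair. The sketch below only illustrates that contract; the real split_npz_worker is defined elsewhere, and the assumed .npz layout (an 'image' and a 'label' array with slices along axis 0) is not guaranteed by this example:

import os
import numpy as np

def split_npz_worker(image_dir, image, output_dir, debug):
    # Assumed layout: 'image' and 'label' volumes with slices along axis 0.
    data = np.load(os.path.join(image_dir, image))
    img, lab = data['image'], data['label']

    base = image[:-len(".npz")]
    empty_slice_ids = []
    for s in range(img.shape[0]):
        if not lab[s].any():
            empty_slice_ids.append(s)
        np.savez_compressed(
            os.path.join(output_dir, "{}_slice_{:04d}".format(base, s)),
            image=img[s], label=lab[s])

    if debug:
        print("{}: {} empty label slices".format(image, len(empty_slice_ids)))
    # Volume identifier (here simply the file name) plus the empty slice ids.
    return image, empty_slice_ids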
Example No. 4
def combine_splits(dataset_dir,
                   be_nice=True,
                   debug=False):
    # Paths
    source_dir = os.path.join(dataset_dir, 'predicted_segmentations_slices')
    output_dir = os.path.join(dataset_dir, 'predicted_segmentations')

    # Only use free resources on the machine (don't block the machine)
    if be_nice:
        os.nice(18)

    # Make sure the output dir starts out empty (prevent mixing with old files)
    rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir)

    # Get all 3D volumes
    volumes = {}
    volume_metadata = {}
    filename_pattern = "^(.*)(_slice_)([0-9]*)"
    filename_prog = re.compile(filename_pattern)
    for v in os.listdir(source_dir):
        if v.endswith("nii.gz"):
            _, volume_name, _, slice_id, _ = filename_prog.split(v)

            if volume_name in volumes:
                volumes[volume_name] += [(v, slice_id)]
            else:
                # Copy the meta-data file for each volume to the new output dir
                meta_file = os.path.join(source_dir, "{}.pkl".format(volume_name))
                copy(meta_file, output_dir)

                with open(meta_file, 'rb') as f:
                    meta_data = pickle.load(f)

                # Create new volume entry
                volumes[volume_name] = [(v, slice_id)]
                volume_metadata[volume_name] = meta_data

    # Convert files in parallel
    params = []
    for volume_name in volumes.keys():
        params += [(volume_name, volumes[volume_name], volume_metadata[volume_name], source_dir, output_dir, debug)]

    parallel_progbar(combine_splits_worker, params, starmap=True)
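
As in Example No. 1, the five-way unpack relies on re.Pattern.split returning the captured groups between the leading and trailing text. With a hypothetical slice filename:

import re

prog = re.compile("^(.*)(_slice_)([0-9]*)")
# -> ['', 'la_003', '_slice_', '0042', '.nii.gz']
# The second and fourth elements are the volume name and the slice id.
print(prog.split("la_003_slice_0042.nii.gz"))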
Example No. 5
    def test_parallel_progbar_overhead(self):
        def mapper(i):
            return i**2

        n = list(range(100000))
        toc = tic()
        list_comp = [i**2 for i in n]
        time_lc = toc('List comprehension')
        par = parallel_progbar(mapper, n)
        time_par = toc('Parallel progbar')
        print("{}s / {}s = {}x slowdown".format(time_par, time_lc,
                                                time_par / time_lc))
        self.assertSequenceEqual(par, list_comp)
Example No. 6
def create_splits(dataset_dir,
                  prediction_volume_numbers=None,
                  image_dir=None,
                  img_file_suffix=".npz",
                  num_splitsets=5):
    print(
        "Welcome to the dataset split tool. It will split your data into training, validation, and test sets."
    )

    # Dataset with images to create the splits for
    if not image_dir:
        image_dir = os.path.join(dataset_dir, 'preprocessed')

    # Keep following volume numbers for prediction
    # only used if no separate, unlabeled image dir is available
    if prediction_volume_numbers is None:
        prediction_volume_numbers = [1]

    # Extract all available image files
    image_files = []
    for file in os.listdir(image_dir):
        if file.endswith(img_file_suffix):
            image_files += [file]

    dataset_size = len(image_files)
    dataset_size_remaining = dataset_size

    # Get prediction slices
    # check if separate testset is available
    unlabeled_testset_dir = os.path.join(dataset_dir,
                                         "predictionset_preprocessed")
    prediction_set = []
    if os.path.exists(unlabeled_testset_dir):
        print("Separate unlabeled image_dir found.")
        for file in os.listdir(unlabeled_testset_dir):
            if file.endswith(img_file_suffix):
                prediction_set += [file]

        prediction_set = np.asarray(prediction_set)
    else:
        prediction_set = get_volume_slices(
            image_files, volume_numbers_list=prediction_volume_numbers)

        # Remove prediction volume from dataset
        for image in prediction_set:
            image_files.remove(image)

        dataset_size_remaining -= len(prediction_set)

    trainset_size = int(dataset_size_remaining * 0.7)
    valset_size = int((dataset_size_remaining - trainset_size) / 2)
    testset_size = int((dataset_size_remaining - trainset_size - valset_size))

    # Manual dataset size definition:
    # trainset_size = 50
    # valset_size = 10
    # testset_size = 10

    print("Found {} labeled slices for learning".format(dataset_size))
    print("Added {} slices to the training set".format(trainset_size))
    print("Added {} slices to the validation set".format(valset_size))
    print("Added {} slices to the test set".format(testset_size))
    print("Added {} slices to the prediction set".format(
        len(prediction_set)))
    print()
    print("Creating dataset splits ...")
    splits = [(trainset_size, valset_size, testset_size, image_files.copy())
              for _ in range(num_splitsets)]

    # Create the dataset splits in parallel
    splits_dict = parallel_progbar(create_splits_worker, splits, starmap=True)
    for i in range(len(splits_dict)):
        splits_dict[i]['prediction'] = prediction_set

    with open(os.path.join(dataset_dir, 'splits.pkl'), 'wb') as f:
        pickle.dump(splits_dict, f)

    print("Splitfile created")
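
A hypothetical reader for the resulting splits.pkl, assuming only what the code above guarantees: the pickled object is a list with num_splitsets entries, each a dict that carries at least the shared 'prediction' set (any other keys come from create_splits_worker and are not shown in this example):

import os
import pickle

dataset_dir = "/path/to/dataset"  # hypothetical location

with open(os.path.join(dataset_dir, 'splits.pkl'), 'rb') as f:
    splits = pickle.load(f)

print(len(splits))                   # one dict per split set (num_splitsets)
print(len(splits[0]['prediction']))  # prediction slices shared by every split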
Example No. 7
    def to_eep(self,
               eep_params=None,
               eep_functions=None,
               metric_function=None,
               progress=True,
               nprocs=None,
               **kwargs):
        '''
        Converts the grid of evolution tracks to EEP basis. For details on EEP
        functions, see the documentation for kiauhoku.eep.

        Parameters
        ----------
        eep_params (dict, None): contains a mapping from your grid's specific
            column names to the names used by kiauhoku's default EEP functions.
            It also contains 'eep_intervals', the number of secondary EEPs
            between each consecutive pair of primary EEPs. If none are supplied,
            kiauhoku will attempt to read them from a cache directory.

        eep_functions (dict, None): if the default EEP functions won't do the
            job, you can specify your own and supply them in a dictionary.
            EEP functions must have the call signature
            function(track, eep_params), where `track` is a single track.
            If none are supplied, the default functions will be used.

        metric_function (callable, None): the metric function is how the EEP
            interpolator spaces the secondary EEPs. By default, the path
            length along the evolution track on the H-R diagram (luminosity vs.
            Teff) is used, but you can specify your own if desired.
            metric_function must have the call signature
            function(track, eep_params), where `track` is a single track.
            If no function is supplied, defaults to kiauhoku.eep._HRD_distance.

        progress (bool, True): whether or not to display a progress bar.

        nprocs (int, None): how many parallel processes to use for MultiIndex
            DataFrames. If none is specified, defaults to the number of CPUs.

        **kwargs: extra keyword arguments to pass to parallel_progbar.

        Returns
        -------
        eep_frame (StarGrid): grid of EEP-based evolution tracks.
        '''

        # User can specify eep_params, but if none are specified,
        # search for cached params.
        if not eep_params:
            eep_params = load_eep_params(self.name)

        # If self is a MultiIndexed DataFrame, split it into individual
        # tracks, convert to EEP basis, and recombine.
        if self.is_MultiIndex():

            def eep_pool_helper(i):
                # Not strictly necessary, but makes for cleaner mapping.
                track = self.loc[i, :]
                return _eep_interpolate(track, eep_params, eep_functions,
                                        metric_function)

            # create index iterator and pass to the mapping/progress function
            idx = self.index.droplevel(-1).drop_duplicates()
            eep_tracks = parallel_progbar(eep_pool_helper,
                                          idx,
                                          verbose=progress,
                                          nprocs=nprocs,
                                          **kwargs)

            # Setup MultiIndex and remove Nones
            idx_list = [(*i, j) for i, tr in zip(idx, eep_tracks)
                        if tr is not None for j in tr.index]
            eep_tracks = [tr for tr in eep_tracks if tr is not None]

            # Create MultiIndex for EEP frame
            multiindex = pd.MultiIndex.from_tuples(idx_list,
                                                   names=[*idx.names, 'eep'])

            # Put it all together
            eep_frame = pd.concat(eep_tracks, ignore_index=True)
            eep_frame.index = multiindex

        # Other case is if a single track is passed
        else:
            eep_frame = _eep_interpolate(self, eep_params, eep_functions,
                                         metric_function)

        # Cast DataFrame to StarGrid
        eep_frame = from_pandas(eep_frame,
                                name=self.name,
                                eep_params=eep_params)

        return eep_frame
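
A hypothetical usage sketch: grid is assumed to be a MultiIndexed StarGrid of raw evolution tracks loaded elsewhere, and only parameters documented in the docstring are passed:

# Assumption: `grid` is a StarGrid of raw tracks, loaded elsewhere.
eep_grid = grid.to_eep(progress=True, nprocs=4)

# The EEP frame keeps the original index levels and appends an 'eep' level.
print(eep_grid.index.names)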