Example #1
def savePickle(path, vol, mod, label, args, roi=None, nindent=l2_indent):
    warnings.warn(
        'savePickle DEPRECATED IN FAVOR OF: calculate_features.pickleFeature()',
        DeprecationWarning)
    args_string = getArgsString(args, ignore_list=['glcm_stat_function'])
    if (roi is not None):
        roi_string = 'roi={!s}_'.format(roi.roiname)
    else:
        roi_string = ''

    # append ROIName to pickle path
    pickle_dump_path = os.path.join(
        path,
        'feature={featname!s}_{mod:s}_{roistring:s}args({args!s}).pickle'.
        format(featname=label, mod=mod, roistring=roi_string,
               args=args_string))
    try:
        vol.toPickle(pickle_dump_path)
    except Exception:
        logger.info(
            indent('error pickling: {:s}'.format(pickle_dump_path), nindent))
    else:
        logger.info(
            indent(
                'feature pickled successfully to: {:s}'.format(
                    pickle_dump_path), nindent))
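A hypothetical usage sketch (the path, volume, and ROI names are illustrative stand-ins): given a computed MaskableVolume and the argument dict used to compute it, savePickle derives the pickle filename from the feature label, modality, optional ROI name, and stringified args.

# hypothetical sketch: pickle a computed feature volume for later reuse
feature_args = {'radius': 2, 'wavelet_str': 'db1'}
savePickle('/data/patient01/precomputed', vol, mod='ct',
           label='wavelet_energy', args=feature_args, roi=my_roi)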
Example #2
def wavelet_energy(image_volume,
                   radius=2,
                   roi=None,
                   wavelet_str='db1',
                   mode_str='smooth'):
    # compute wavelet coefficients
    logger.info(
        indent(
            'performing 3d wavelet decomp using wavelet: {!s}'.format(
                wavelet_str), g_indents[3]))
    roi_volume = image_volume.conformTo(roi.frameofreference)
    wavelet_coeffs = wavelet_decomp_3d(roi_volume, wavelet_str, mode_str)
    nlevels = len(wavelet_coeffs) - 1
    # level_results = []
    accumulator = np.zeros(roi_volume.frameofreference.size[::-1])
    # sum voxel-wise energy across the computed decomposition levels
    for level in range(nlevels - 1, 0, -1):
        wavelet_coeffs_diag = wavelet_coeffs[level + 1]['ddd']
        logger.info(
            indent(
                'computing energy for level {:d} of shape:{!s}'.format(
                    level, wavelet_coeffs_diag.shape), g_indents[3]))
        result = image_iterator(energy_plugin, wavelet_coeffs_diag, radius)

        zoomfactors = tuple(
            np.true_divide(roi_volume.frameofreference.size[::-1],
                           result.shape))
        # scale low-res coefficients to image res
        result = scipy.ndimage.zoom(result, zoomfactors, order=3)
        result = MaskableVolume().fromArray(result,
                                            roi_volume.frameofreference)
        # level_results.append(result)
        accumulator = np.add(accumulator, result.array)
    return MaskableVolume().fromArray(accumulator, roi_volume.frameofreference)
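A minimal usage sketch, assuming a volumes dict from loadImages() and a loaded ROI (both illustrative): the returned MaskableVolume holds the summed per-voxel wavelet energy, conformed to the ROI's frame of reference.

# hypothetical sketch: voxel-wise wavelet-energy feature over an ROI
energy_vol = wavelet_energy(volumes['ct'], radius=2, roi=my_roi,
                            wavelet_str='db1', mode_str='smooth')
print(energy_vol.frameofreference.size)  # (x, y, z) size of the ROI volume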
Example #3
def loadPickle(path, mod=None, feature_label=None, nindent=l2_indent):
    warnings.warn(
        'DEPRECATED IN FAVOR OF: calculate_features.loadPrecalculated()',
        DeprecationWarning)
    logger.info(
        indent('Pickled feature vector found ({!s}). Loading.'.format(mod),
               nindent))
    vol = None
    try:
        vol = MaskableVolume().fromPickle(path)
    except rttypes.PickleOutdatedError:
        # old pickle definition doesn't contain mod and feature_label: add them and repickle.
        # note: vol is unbound in this handler when fromPickle raises, which made the
        # original update a latent NameError; guard it so the failure is reported instead
        if vol is not None:
            if mod:
                vol.mod = mod
            if feature_label:
                vol.feature_label = feature_label
            logger.info(
                indent('outdated pickle found, updating in filesystem', nindent))
            vol.toPickle(path)
    except Exception:
        vol = None
    if vol is not None:
        logger.info(
            indent('Pickled feature vector loaded successfully.', nindent))
        return vol
    else:
        logger.info(
            indent(
                'there was a problem loading the file: {!s}'.format(path),
                nindent))
        return None
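A usage sketch under the assumption that a matching pickle file already exists; pickle_dir and pickle_filename are illustrative names. loadPickle returns the MaskableVolume on success and None on any failure.

# hypothetical sketch: reload a previously pickled feature volume
vol = loadPickle(os.path.join(pickle_dir, pickle_filename),
                 mod='ct', feature_label='wavelet_energy')
if vol is not None:
    print(vol.mod, vol.feature_label)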
Example #4
def loadImages(images_path, modalities):
    """takes a list of modality strings and loads dicoms as a MaskableVolume instance from images_path

    Args:
        images_path --  Full path to patient specific directory containing various modality dicom images
            each modality imageset is contained in a directory within images_path where the modality string
            in modalities must match the directory name. This subdir is recursively searched for all dicoms
        modalities  --  list of modality strings that are used to identify subdirectories from which dicoms
            are loaded
    Returns:
        dictionary of {modality: imvolume} that contains loaded image data for each modality supported
    """
    # check if path specified exists
    if (not os.path.exists(images_path)):
        logger.info('Couldn\'t find specified path, nothing was loaded.')
        return None
    else:
        # load imvector and store to dictionary for each modality
        # if modality is missing, dont add to dictionary
        if (modalities is None or len(modalities) == 0):
            logger.info('No modalities supplied. skipping')
            return None
        else:
            volumes = OrderedDict()
            for mod in modalities:
                logger.info(
                    indent('Importing {mod:s} images'.format(mod=mod.upper()),
                           l1_indent))
                dicom_path = os.path.join(images_path,
                                          '{mod:s}'.format(mod=mod))

                if (os.path.exists(dicom_path)):
                    # recursively walk modality path for dicom images, and build a dataset from it
                    try:
                        volumes[mod] = MaskableVolume().fromDir(dicom_path,
                                                                recursive=True)
                    except Exception:
                        logger.info(
                            'failed to create Volume for modality: {:s}'.
                            format(mod))
                    else:
                        size = volumes[mod].frameofreference.size
                        logger.info(
                            indent(
                                'stacked {len:d} datasets of shape: ({d:d}, {r:d}, {c:d})'
                                .format(len=size[2], d=1, r=size[1],
                                        c=size[0]), l2_indent))
                else:
                    logger.info(
                        indent(
                            'path to {mod:s} dicoms doesn\'t exist. skipping\n'
                            '(path: {path:s})'.format(mod=mod, path=dicom_path),
                            l2_indent))
                logger.info('')
            return volumes
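A short usage sketch (the patient path is illustrative): modality strings double as subdirectory names, so a layout like images/ct/*.dcm and images/pet/*.dcm loads into an OrderedDict keyed by modality.

# hypothetical sketch: load CT and PET dicom series for one patient
volumes = loadImages('/data/patient01/images', ['ct', 'pet'])
if volumes:
    for mod, vol in volumes.items():
        print(mod, vol.frameofreference.size)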
Example #5
def cluster_kmeans(feature_matrix, nclusters=10, eps=1e-4, njobs=-2):
    """take input feature array of N rows and D columns and perform standard kmeans clustering using \
            sklearn kmeans library

    Args:
        feature_matrix -- numpy array of N rows and D columns where N is the number of voxels in the
                            volume and D is the number of features.

    Optional Args:
        nclusters      -- number of clusters
        eps            -- epsilon convergence criteria
        njobs          -- number of parallel jobs for sklearn KMeans (-2 means all but one core)
    Returns:
        imvector of cluster assignments from 0 to k-1 aligned to the BaseVolumes of feature_matrix
    """
    # check inputs
    if not isinstance(feature_matrix, np.ndarray):
        logger.warning(
            indent('a proper numpy ndarray was not provided. skipping.',
                   g_indents[1]))
        logger.warning(
            indent(
                '{!s} != {!s}'.format(type(feature_matrix), np.ndarray),
                g_indents[1]))
        return None
    if (nclusters <= 1):
        logger.exception(indent('k must be >1', g_indents[1]))
        raise ValueError('nclusters must be >1')

    # Preprocessing - normalization
    normalizer = StandardScaler()
    normalized_feature_matrix = np.nan_to_num(
        normalizer.fit_transform(feature_matrix))

    # create estimator obj
    km = KMeans(n_clusters=nclusters,
                max_iter=300,
                n_init=10,
                init='k-means++',
                precompute_distances=True,
                tol=eps,
                n_jobs=njobs)
    km.fit(normalized_feature_matrix)
    logger.debug(indent('#iters: {:d}'.format(km.n_iter_), g_indents[1]))
    logger.debug(
        indent(
            'score: {score:0.4f}'.format(
                score=km.score(normalized_feature_matrix)), g_indents[1]))
    return km.predict(normalized_feature_matrix)
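A self-contained sketch on synthetic data; note that KMeans's precompute_distances and n_jobs parameters were removed in newer scikit-learn releases, so this assumes a version contemporary with the example above.

# usage sketch: cluster 500 synthetic voxels described by 4 features each
import numpy as np
rng = np.random.default_rng(0)
synthetic_features = rng.normal(size=(500, 4))
labels = cluster_kmeans(synthetic_features, nclusters=5, eps=1e-4)
print(labels.shape)  # (500,) assignments in [0, nclusters-1]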
Example #6
def cluster_hierarchical_scipy(feature_matrix,
                               nclusters=3,
                               metric='euclidean',
                               method='ward'):
    """take input feature array of N rows and D columns and perform agglomerative hierarchical clustering \
            using the standard sklearn agglomerative clustring library

    Args:
        feature_matrix -- numpy array of N rows and D columns where N is the number of voxels in the
                            volume and D is the number of features.

    Optional Args:
        nclusters     -- number of clusters to find
        metric        -- metric used to compute linkage ['euclidean', 'l1', 'l2', 'manhattan']
        method        -- criterion to use for cluster merging ['ward', 'complete', 'average']
    """
    # check inputs
    if not isinstance(feature_matrix, np.ndarray):
        logger.exception(
            indent(
                'a proper numpy ndarray was not provided. {!s} != {!s}'.format(
                    type(feature_matrix), np.ndarray), g_indents[1]))
        raise TypeError('feature_matrix must be a np.ndarray')

    # sanitize string inputs
    method = method.lower()
    metric = metric.lower()

    # Preprocessing - normalization
    normalizer = StandardScaler()
    normalized_feature_matrix = normalizer.fit_transform(feature_matrix)

    # determine valid parameters
    valid_method = GLOBAL['scipy_hcluster_valid_methods']
    if (method not in valid_method):
        logger.exception('method must be one of {!s}'.format(valid_method))
        raise ValueError('method must be one of {!s}'.format(valid_method))
    if (method == 'maximum'):
        method = 'complete'

    valid_metric = GLOBAL['scipy_hcluster_valid_metrics']
    if (metric not in valid_metric):
        logger.exception('metric must be one of {!s}'.format(valid_metric))
        raise ValueError('metric must be one of {!s}'.format(valid_metric))

    if (method == 'ward'):
        # must use euclidean distance
        metric = 'euclidean'

    # perform fit and estimation
    linkage_matrix = sch.linkage(normalized_feature_matrix, method, metric)
    prediction = sch.fcluster(linkage_matrix, nclusters, criterion='maxclust')
    return (prediction, linkage_matrix)
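A usage sketch reusing the synthetic matrix from the kmeans example; note the scipy convention that fcluster with criterion='maxclust' returns labels starting at 1 rather than 0, and that the returned linkage matrix can be fed to scipy's dendrogram routine for inspection.

# usage sketch: agglomerative clustering plus the linkage matrix
labels, linkage_matrix = cluster_hierarchical_scipy(
    synthetic_features, nclusters=3, metric='euclidean', method='ward')
print(labels.min(), labels.max())  # 1 .. nclusters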
Example #7
def wavelet_raw(image_volume,
                radius=2,
                roi=None,
                wavelet_str='db1',
                mode_str='smooth',
                level=0):
    # compute wavelet coefficients
    logger.info(
        indent(
            'performing 3d wavelet decomp using wavelet: {!s}'.format(
                wavelet_str), g_indents[3]))
    roi_volume = image_volume.conformTo(roi.frameofreference)
    wavelet_coeffs = wavelet_decomp_3d(roi_volume, wavelet_str, mode_str)
    nlevels = len(wavelet_coeffs) - 1
    wavelet_coeffs_diag = wavelet_coeffs[nlevels - level]['ddd']
    zoomfactors = tuple(
        np.true_divide(roi_volume.frameofreference.size[::-1],
                       wavelet_coeffs_diag.shape))
    # scale the selected level's low-res coefficients back to image resolution
    result = scipy.ndimage.zoom(wavelet_coeffs_diag, zoomfactors, order=3)
    result = MaskableVolume().fromArray(result, roi_volume.frameofreference)
    return result
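A minimal usage sketch, with the same illustrative volumes dict and ROI as above: level selects which decomposition level's diagonal ('ddd') coefficients are upsampled back to the ROI resolution.

# hypothetical sketch: raw diagonal wavelet coefficients at level 1
raw_vol = wavelet_raw(volumes['ct'], radius=2, roi=my_roi,
                      wavelet_str='db1', mode_str='smooth', level=1)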
Example #8
def image_iterator(processing_function, image_volume, radius=2, roi=None):
    """compute the pixel-wise feature of an image over a region defined by neighborhood

    Args:
        processing_function -- function that should be applied to at each voxel location with neighborhood
                                context. Function signature should match:
                                    fxn()
                            -- list<callables> that will all be evaluated at each patch location, results will
                                be stored to separate result MaskableVolume objects
        image -- a flattened array of pixel intensities of type imslice or a matrix shaped numpy ndarray
        radius -- describes neighborood size in each dimension. radius of 4 would be a 9x9x9
    Returns:
        feature_volume as MaskableVolume with shape=image.shape
    """
    # This is an ugly way of type-checking but can't get isinstance to see both as the same
    if (MaskableVolume.__name__ in str(type(image_volume))):
        (c, r, d) = image_volume.frameofreference.size

        def get_val(image_volume, z, y, x):
            # image boundary handling is built into BaseVolume.get_val
            return image_volume.get_val(z, y, x)

        def set_val(feature_volume, z, y, x, val):
            feature_volume.set_val(z, y, x, val)

        # instantiate a blank BaseVolume of the proper size
        feature_volume = MaskableVolume().fromArray(
            np.zeros((d, r, c)), image_volume.frameofreference)
        # feature_volume.modality = image_volume.modality
        # feature_volume.feature_label = 'feature'
    elif isinstance(image_volume, np.ndarray):
        if image_volume.ndim == 3:
            d, r, c = image_volume.shape
        elif image_volume.ndim == 2:
            d, r, c = (1, *image_volume.shape)
            image_volume = image_volume.reshape((d, r, c))
        else:
            raise ValueError('image_volume ndarray must be 2d or 3d')

        # instantiate a blank np.ndarray of the proper size
        feature_volume = np.zeros((d, r, c))

        def get_val(image, z, y, x):
            if (z < 0 or y < 0 or x < 0) or (z >= d or y >= r or x >= c):
                return 0
            else:
                return image[z, y, x]

        def set_val(image, z, y, x, val):
            image[z, y, x] = val
    else:
        logger.info(
            'invalid image type supplied ({:s}). Please specify an image of type '
            'BaseVolume or type np.ndarray'.format(str(type(image_volume))))
        return None

    # z_radius_range controls 2d neighborhood vs 3d neighborhood for 2d vs 3d images
    if d == 1:  # 2D image
        logger.debug(
            indent('Computing 2D feature with radius: {:d}'.format(radius),
                   l3))
        z_radius_range = [0]
    elif d > 1:  # 3D image
        logger.debug(
            indent('Computing 3D feature with radius: {:d}'.format(radius),
                   l3))
        z_radius_range = range(-radius, radius + 1)

    # in plane patch range
    radius_range = range(-radius, radius + 1)

    # timing
    start_feature_calc = time.time()

    # absolute max indices for imagevolume - for handling request of voxel out of bounds
    cbound = c
    rbound = r
    dbound = d

    # set calculation index bounds -- will be redefined if roi is specified
    cstart, cstop = 0, cbound
    rstart, rstop = 0, rbound
    dstart, dstop = 0, dbound

    # defines dimensionality
    d_subset = dstop - dstart
    r_subset = rstop - rstart
    c_subset = cstop - cstart

    # restrict calculation bounds to roi
    if (roi is not None):
        # get max extents of the mask/ROI to speed up calculation only within ROI cubic volume
        extents = roi.getROIExtents()
        cstart, rstart, dstart = image_volume.frameofreference.getIndices(
            extents.start)
        cstop, rstop, dstop = np.subtract(
            image_volume.frameofreference.getIndices(extents.end()), 1)
        logger.info(
            indent(
                'calculation subset volume x=({xstart:d}->{xstop:d}), '
                'y=({ystart:d}->{ystop:d}), '
                'z=({zstart:d}->{zstop:d})'.format(zstart=dstart,
                                                   zstop=dstop,
                                                   ystart=rstart,
                                                   ystop=rstop,
                                                   xstart=cstart,
                                                   xstop=cstop), l4))
        # redefine feature_volume
        d_subset = dstop - dstart
        r_subset = rstop - rstart
        c_subset = cstop - cstart
        feature_frameofreference = FrameOfReference(
            (extents.start), (image_volume.frameofreference.spacing),
            (c_subset, r_subset, d_subset))
        feature_volume = feature_volume.fromArray(
            np.zeros((d_subset, r_subset, c_subset)), feature_frameofreference)

    # # setup an output volume for each feature in processing_function list
    # if (not isinstance(processing_function, list)):
    #     tmp = []
    #     tmp.append(processing_function)
    #     processing_function = tmp
    # feature_volumes = [feature_volume]
    # for funct in processing_function[1:]:
    #     feature_volumes.append(np.zeros_like(feature_volume))

    # nested loop approach -> slowest, try GPU next
    total_voxels = d * r * c
    subset_total_voxels = d_subset * r_subset * c_subset
    fivepercent = int(subset_total_voxels / 100 * 5)

    idx = -1
    subset_idx = -1
    z_idx = -1
    for z in range(dstart, dstop):
        z_idx += 1
        y_idx = -1
        x_idx = -1
        for y in range(rstart, rstop):
            y_idx += 1
            x_idx = -1
            for x in range(cstart, cstop):
                x_idx += 1
                idx += 1
                if (z < dstart or z > dstop or y < rstart or y > rstop
                        or x < cstart or x > cstop):
                    # we shouldn't ever be here
                    logger.info('why are we here?!')
                    # fill 0 instead
                    set_val(feature_volume, z_idx, y_idx, x_idx, 0)
                else:
                    subset_idx += 1
                    patch_vals = np.zeros(
                        (len(z_radius_range), len(radius_range),
                         len(radius_range)))
                    for p_z, k_z in enumerate(z_radius_range):
                        for p_x, k_x in enumerate(radius_range):
                            for p_y, k_y in enumerate(radius_range):
                                #logger.info('k_z:{z:d}, k_y:{y:d}, k_x:{x:d}'.format(z=k_z,y=k_y,x=k_x))
                                # handle out of bounds requests - replace with 0
                                request_z = z + k_z
                                request_y = y + k_y
                                request_x = x + k_x
                                if (request_z < 0 or request_z >= dbound
                                        or request_y < 0 or request_y >= rbound
                                        or request_x < 0
                                        or request_x >= cbound):
                                    val = 0
                                else:
                                    val = get_val(image_volume, request_z,
                                                  request_y, request_x)
                                # store to local image patch
                                patch_vals[p_z, p_y, p_x] = val

                    # for i, funct in enumerate(processing_function):
                    proc_value = processing_function(patch_vals)
                    set_val(feature_volume, z_idx, y_idx, x_idx, proc_value)

                    if ((subset_idx % fivepercent == 0
                         or subset_idx == subset_total_voxels - 1)):
                        logger.debug(
                            indent(
                                '{p:0.2%} - voxel: {i:d} of {tot:d} (of total: {abstot:d})'
                                .format(p=subset_idx / subset_total_voxels,
                                        i=subset_idx,
                                        tot=subset_total_voxels,
                                        abstot=total_voxels), l4))

    if isinstance(image_volume, np.ndarray) and d == 1:
        # need to reshape ndarray if input was 2d
        feature_volume = feature_volume.reshape((r_subset, c_subset))
        # for i, feature_volume in enumerate(feature_volumes):
        #     feature_volumes[i] = feature_volume.reshape((r_subset, c_subset))

    end_feature_calc = time.time()
    logger.debug(
        timer('feature calculation time:',
              end_feature_calc - start_feature_calc, l3))
    # if len(features_volumes > 1):
    #     return feature_volumes
    # else:
    return feature_volume
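A usage sketch with a plain numpy image and a simple patch-energy plugin; any callable that accepts the (z, y, x) patch ndarray and returns a scalar can stand in for processing_function.

# usage sketch: patch energy over a 2d numpy image (reshaped to 3d internally)
import numpy as np

def patch_energy(patch):
    # sum of squared intensities over the local neighborhood
    return float(np.sum(np.square(patch)))

image = np.random.default_rng(0).random((16, 16))
energy_map = image_iterator(patch_energy, image, radius=1)
print(energy_map.shape)  # (16, 16), matching the input image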
Example #9
def create_feature_matrix(feature_volumes,
                          roi=None,
                          PCA=False,
                          PCA_components=0.95):
    """takes a list of feature BaseVolumes and combines them into a numpy ndarray of N rows and D features

    where N is the number of samples in each feature vector (voxels in the image) and D is the number of
    feature vectors stored in the "feature_volumes" list.

    Args:
        feature_volumes    --  python list of BaseVolumes that are aligned
    Optional Args:
        roi                --  ROI defining the common frame of reference and pruning voxels outside it
        PCA                --  if True, PCA-reduce the pruned feature array before returning
        PCA_components     --  components to keep; a float in (0, 1) keeps that fraction of variance
    Returns:
        tuple of (pruned_feature_array, frameofreference, dense_feature_array, feature_column_labels),
        where pruned_feature_array is a numpy ndarray with N rows and D columns: N is the number of
        voxels in the aligned images (in depth-row major order) and D is the number of accepted features
    """
    if len(feature_volumes) <= 0:
        logger.warning(indent('no features supplied. skipping', g_indents[1]))
        return None
    else:
        if (roi):
            # use roi.frameofreference as common shape
            frameofreference = roi.frameofreference
        else:
            # find highest resolution volume and use as FrameOfReference
            highest_res_volume = feature_volumes[0]
            highest_res = np.prod(
                highest_res_volume.frameofreference.spacing)
            for volume in feature_volumes[1:]:
                res = np.prod(volume.frameofreference.spacing)
                if (res < highest_res):
                    highest_res_volume = volume
                    highest_res = res
            # assign highest res FOR as common shape
            frameofreference = highest_res_volume.frameofreference

        # take the selected FORs shape to be the reference
        ref_shape = frameofreference.size[::-1]  # reverses tuple from (x,y,z) to (z,y,x)
        logger.debug('Common Spacing (z,y,x): ({:f}, {:f}, {:f})'.format(
            *frameofreference.spacing[::-1]))
        logger.debug(
            'Common Shape (z,y,x): ({:d}, {:d}, {:d})'.format(*ref_shape))

        # create list of commonly shaped feature vectors
        conformed_feature_list = []
        dense_feature_list = []
        feature_column_labels = []
        for i, vol in enumerate(feature_volumes):
            # check for invalid feature
            if (vol is None):
                logger.debug(
                    'empty (None) feature provided at index {:d}, removing and continuing'
                    .format(i))
                continue

            # conform feature volumes and add to list
            conformed_volume = vol.conformTo(frameofreference)

            if (conformed_volume.array.shape != ref_shape):
                logger.warning(
                    indent(
                        'shape mismatch. ref={ref:s} != feature[{num:d}]={shape:s}.'
                        ' removing and continuing'.format(
                            ref=str(ref_shape),
                            num=i,
                            shape=str(conformed_volume.array.shape)),
                        g_indents[1]))
                continue
            else:
                # concatenate, need to make feat.array a 2d vector
                pruned_feature_vector = create_pruned_vector(
                    conformed_volume, roi)
                conformed_feature_list.append(pruned_feature_vector)
                dense_feature_list.append(
                    conformed_volume.vectorize(roi).reshape((-1, 1)))
                # label generator
                label = generate_heatmap_label(conformed_volume)
                feature_column_labels.append(label)

        # combine accepted features into array of shape (nSamples, nFeatures)
        pruned_feature_array = np.nan_to_num(
            np.concatenate(conformed_feature_list, axis=1))
        # create expanded/dense version for pickling and using in hierarchical clustering
        dense_feature_array = np.nan_to_num(
            np.concatenate(dense_feature_list, axis=1))

        # dimensionality reduction
        if PCA:
            pca = sklearnPCA(whiten=False,
                             n_components=PCA_components)  # standard PCA
            # nfeats = pruned_feature_array.shape[1]
            pruned_feature_array = pca.fit_transform(pruned_feature_array)
            # logger.debug('pca: keeping {:d} of {:d} components'.format(pca.n_components, nfeats))

        logger.debug(
            indent(
                'combined {n:d} features into pruned array of shape: {shape:s}'
                .format(n=pruned_feature_array.shape[1],
                        shape=str(pruned_feature_array.shape)), g_indents[1]))
        return (pruned_feature_array, frameofreference, dense_feature_array,
                feature_column_labels)
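A hypothetical sketch chaining this with the clustering helpers (the feature volumes and ROI are illustrative carryovers from the sketches above): the pruned array feeds kmeans directly, while the dense array and column labels support later visualization.

# hypothetical sketch: build a feature matrix from two feature volumes and cluster it
pruned, common_for, dense, column_labels = create_feature_matrix(
    [energy_vol, raw_vol], roi=my_roi, PCA=True, PCA_components=0.95)
assignments = cluster_kmeans(pruned, nclusters=5)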
Example #10
def loadROIs(rtstruct_path):
    """loads an rtstruct specified by path and returns a dict of ROI objects

    DEPRECATED IN FAVOR OF rttypes.ROI classmethod collectionFromFile(rtstruct_path)
    Args:
        rtstruct_path    -- path to rtstruct.dcm file

    Returns:
        dict<key='contour name', val=ROI>
    """
    warnings.warn(
        'scripting.loadROIs() DEPRECATED IN FAVOR OF rttypes.ROI classmethod collectionFromFile(rtstruct_path)',
        DeprecationWarning)
    if (not os.path.exists(rtstruct_path)):
        logger.info(
            indent('invalid path provided: "{:s}"'.format(rtstruct_path),
                   l2_indent))
        raise ValueError('invalid path provided: "{:s}"'.format(rtstruct_path))

    logger.info(indent('Importing ROIs', l1_indent))

    # check if path is file or dir
    if (os.path.isdir(rtstruct_path)):
        # search recursively for a valid rtstruct file
        ds_list = dcmio.read_dicom_dir(rtstruct_path, recursive=True)
        if (ds_list is None or len(ds_list) == 0):
            logger.info(
                'no rtstruct datasets found at "{:s}"'.format(rtstruct_path))
            raise Exception('no rtstruct datasets found at "{:s}"'.format(rtstruct_path))
        ds = ds_list[0]
    elif (os.path.isfile(rtstruct_path)):
        ds = dcmio.read_dicom(rtstruct_path)

    # parse rtstruct file and instantiate a maskvolume for each contour located
    # add each maskvolume to a dict keyed by contour name
    if (ds is not None):
        # get structuresetROI sequence
        StructureSetROI_list = ds.StructureSetROISequence
        nContours = len(StructureSetROI_list)
        if (nContours <= 0):
            logger.debug(indent('no contours were found', l2_indent))
            return None

        # Add structuresetROI to dict
        StructureSetROI_dict = {
            StructureSetROI.ROINumber: StructureSetROI
            for StructureSetROI in StructureSetROI_list
        }

        # get dict containing a contour dataset for each StructureSetROI with a paired key=ROINumber
        ROIContour_dict = {
            ROIContour.ReferencedROINumber: ROIContour
            for ROIContour in ds.ROIContourSequence
        }

        # construct a dict of ROI objects where contour name is key
        roi_dict = {}
        for ROINumber, structuresetroi in StructureSetROI_dict.items():
            roi_dict[structuresetroi.ROIName] = (ROI(
                frameofreference=None,
                roicontour=ROIContour_dict[ROINumber],
                structuresetroi=structuresetroi))
        # prune empty ROIs from dict
        for roiname, roi in dict(roi_dict).items():
            if (roi.coordslices is None or len(roi.coordslices) <= 0):
                logger.debug(
                    indent(
                        'pruning empty ROI: {:s} from loaded ROIs'.format(
                            roiname), l2_indent))
                del roi_dict[roiname]

        logger.info(
            indent('loaded {:d} ROIs successfully'.format(len(roi_dict)),
                   l2_indent))
        return roi_dict
    else:
        logger.info(indent('no dataset was found', l2_indent))
        return None
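A usage sketch (path and contour name are illustrative); per the deprecation notice, new code should prefer rttypes.ROI.collectionFromFile(rtstruct_path).

# hypothetical sketch: load contours from an rtstruct file
rois = loadROIs('/data/patient01/rtstruct.dcm')
if rois:
    print(sorted(rois.keys()))  # available contour names
    my_roi = rois['GTV']        # hypothetical contour name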
Example #11
def loadFeatures(pickle_path,
                 image_volumes,
                 feature_defs,
                 roi=None,
                 savepickle=True):
    """Checks if feature vector has already been pickled at path specified and
    loads the files if so, or computes feature for each modality and pickles for later access.

    Args:
        pickle_path   -- full path to the patient-specific "precomputed" dir. pickle file names are
                         searched for occurrence of a modality string and "feature"; matching files
                         are loaded.
        image_volumes -- dictionary of {modality: BaseVolume} that contains loaded image data for
                         each modality supported
        feature_defs  -- list of feature definition objects, each carrying .label, .args,
                         .calculation_function, and .recalculate
    Returns:
        dict<key='feature_name', value=dict<key=mod, value=MaskableVolume>>
    """
    # check if path specified exists
    if (not os.path.exists(pickle_path)):
        logger.info('Couldn\'t find specified path, nothing was loaded.')
        return None
    else:
        # extract modalities from image_volumes
        if (image_volumes is None or len(image_volumes) == 0):
            logger.info('No image data was provided. Skipping')
            return None
        modalities = list(image_volumes.keys())

        # load first file that matches the search and move to next modality
        feature_volumes = OrderedDict()
        for feature_def in feature_defs:
            feature_label = feature_def.label
            feature_args = feature_def.args
            calculate_feature = feature_def.calculation_function
            recalculate = feature_def.recalculate

            these_feature_volumes = OrderedDict()  # k: mod, v: BaseVolume
            logger.info('Loading Feature ({!s}):'.format(feature_label))
            for mod in modalities:
                logger.info(
                    indent('Loading {!s} feature:'.format(mod.upper()),
                           l1_indent))
                # initialize to None
                these_feature_volumes[mod] = None

                # get files that match settings
                match = checkPickle(pickle_path, feature_label, feature_args,
                                    mod, roi)

                if (not recalculate and match is not None):
                    # found pickled feature vector, load it and add to dict - no need to calculate feature
                    these_feature_volumes[mod] = loadPickle(
                        os.path.join(pickle_path, match), mod, feature_label)
                else:  # Calculate feature this time
                    if (match):
                        logger.info(
                            indent('Recalculating feature as requested',
                                   l2_indent))
                    else:
                        logger.info(
                            indent(
                                'No pickled feature vector found ({!s})'.
                                format(mod), l2_indent))

                    logger.info(
                        indent('Computing feature now...', l2_indent))
                    vol = calculate_feature(image_volumes[mod],
                                            roi=roi,
                                            **feature_args)
                    these_feature_volumes[mod] = vol

                    # Check status of calculation; only inject metadata and
                    # pickle when a volume was actually produced
                    if vol is None:
                        logger.info(
                            indent(
                                'Failed to compute feature for {!s} images.'.
                                format(mod.upper()), l2_indent))
                    else:
                        # inject metadata
                        vol.modality = mod
                        vol.feature_label = feature_label
                        logger.info(
                            indent('feature computed successfully', l2_indent))
                        # pickle for later recall
                        savePickle(pickle_path, these_feature_volumes[mod],
                                   mod, feature_label, feature_args, roi)

                logger.info('')
                # END mod
            feature_volumes[feature_label] = these_feature_volumes
            logger.info('')
            # END feature_def

        # return dict of modality specific feature imvectors with keys defined by keys for image_volumes arg.
        return feature_volumes
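A hypothetical sketch of the compute-or-load flow; FeatureDef below is a stand-in namedtuple for whatever object supplies .label, .args, .calculation_function, and .recalculate, and volumes/my_roi carry over from the earlier illustrative sketches.

# hypothetical sketch: define one feature and compute-or-load it per modality
from collections import namedtuple
FeatureDef = namedtuple(
    'FeatureDef', ['label', 'args', 'calculation_function', 'recalculate'])
feature_defs = [FeatureDef(label='wavelet_energy',
                           args={'radius': 2, 'wavelet_str': 'db1'},
                           calculation_function=wavelet_energy,
                           recalculate=False)]
features = loadFeatures('/data/patient01/precomputed', volumes,
                        feature_defs, roi=my_roi)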
Example #12
def cluster_hierarchical_sklearn(feature_matrix,
                                 nclusters=3,
                                 affinity='euclidean',
                                 linkage='ward'):
    """take input feature array of N rows and D columns and perform agglomerative hierarchical clustering \
            using the standard sklearn agglomerative clustring library

    Args:
        feature_matrix -- numpy array of N rows and D columns where N is the number of voxels in the
                            volume and D is the number of features.

    Optional Args:
        nclusters      -- number of clusters to find
        affinity       -- metric used to compute linkage ['euclidean', 'l1', 'l2', 'manhattan']
        linkage        -- criterion to use for cluster merging ['ward', 'complete', 'average']
    """
    # check inputs
    if not isinstance(feature_matrix, np.ndarray):
        logger.exception(
            indent(
                'a proper numpy ndarray was not provided. {!s} != {!s}'.format(
                    type(feature_matrix), np.ndarray),
                g_indents[1]))
        raise TypeError('feature_matrix must be a np.ndarray')

    # sanitize string inputs
    linkage = linkage.lower()
    affinity = affinity.lower()

    # Preprocessing - normalization
    normalizer = StandardScaler()
    normalized_feature_matrix = normalizer.fit_transform(feature_matrix)

    # determine valid parameters
    valid_linkage = ['ward', 'complete', 'maximum', 'average']
    if (linkage not in valid_linkage):
        logger.exception('linkage must be one of {!s}'.format(valid_linkage))
        raise ValueError('linkage must be one of {!s}'.format(valid_linkage))
    if (linkage == 'maximum'):
        linkage = 'complete'

    valid_affinity = ['l1', 'l2', 'manhattan', 'cosine', 'euclidean']
    if (affinity not in valid_affinity):
        logger.exception('affinity must be one of {!s}'.format(valid_affinity))
        raise ValueError('affinity must be one of {!s}'.format(valid_affinity))

    if (linkage == 'ward'):
        # must use euclidean distance
        affinity = 'euclidean'

    conn_matrix = None

    # create estimator obj
    agg = AgglomerativeClustering(n_clusters=nclusters,
                                  connectivity=conn_matrix,
                                  affinity=affinity,
                                  compute_full_tree=True,
                                  linkage=linkage,
                                  pooling_func=np.mean)

    # perform fit and estimation
    prediction = agg.fit_predict(normalized_feature_matrix)
    logger.debug(indent('#leaves: {:d}'.format(agg.n_leaves_), g_indents[1]))
    logger.debug(
        indent('#components: {:d}'.format(agg.n_components_), g_indents[1]))
    return (prediction, agg)
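A usage sketch mirroring the scipy variant, reusing the synthetic matrix from the kmeans example; AgglomerativeClustering's pooling_func and affinity arguments were deprecated and later removed in newer scikit-learn, so this assumes a version contemporary with the example.

# usage sketch: sklearn agglomerative clustering on the synthetic matrix
labels, estimator = cluster_hierarchical_sklearn(
    synthetic_features, nclusters=3, affinity='euclidean', linkage='ward')
print(np.bincount(labels))  # cluster sizes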