def savePickle(path, vol, mod, label, args, roi=None, nindent=l2_indent):
    warnings.warn(
        'savePickle DEPRECATED IN FAVOR OF: calculate_features.pickleFeature()',
        DeprecationWarning)
    args_string = getArgsString(args, ignore_list=['glcm_stat_function'])
    # append ROIName to pickle path
    if (roi is not None):
        roi_string = 'roi={!s}_'.format(roi.roiname)
    else:
        roi_string = ''
    pickle_dump_path = os.path.join(
        path, 'feature={featname!s}_{mod:s}_{roistring:s}args({args!s}).pickle'.format(
            featname=label, mod=mod, roistring=roi_string, args=args_string))
    try:
        vol.toPickle(pickle_dump_path)
    except Exception:
        logger.info(
            indent('error pickling: {:s}'.format(pickle_dump_path), nindent))
    else:
        logger.info(
            indent('feature pickled successfully to: {:s}'.format(pickle_dump_path),
                   nindent))
def wavelet_energy(image_volume, radius=2, roi=None, wavelet_str='db1', mode_str='smooth'):
    # compute wavelet coefficients
    logger.info(
        indent('performing 3d wavelet decomp using wavelet: {!s}'.format(wavelet_str),
               g_indents[3]))
    roi_volume = image_volume.conformTo(roi.frameofreference)
    wavelet_coeffs = wavelet_decomp_3d(roi_volume, wavelet_str, mode_str)
    nlevels = len(wavelet_coeffs) - 1
    # level_results = []
    accumulator = np.zeros(roi_volume.frameofreference.size[::-1])
    # sum voxel-wise energy across all levels
    for level in range(nlevels - 1, 0, -1):
        wavelet_coeffs_diag = wavelet_coeffs[level + 1]['ddd']
        logger.info(
            indent('computing energy for level {:d} of shape:{!s}'.format(
                level, wavelet_coeffs_diag.shape), g_indents[3]))
        result = image_iterator(energy_plugin, wavelet_coeffs_diag, radius)
        zoomfactors = tuple(
            np.true_divide(roi_volume.frameofreference.size[::-1], result.shape))
        # scale low-res coefficients to image res
        result = scipy.ndimage.interpolation.zoom(result, zoomfactors, order=3)
        result = MaskableVolume().fromArray(result, roi_volume.frameofreference)
        # level_results.append(result)
        accumulator = np.add(accumulator, result.array)
    return MaskableVolume().fromArray(accumulator, roi_volume.frameofreference)
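# Example (illustrative sketch only): computing a wavelet-energy feature volume for a CT
# image restricted to an ROI. The argument names below (ct_vol, tumor_roi) are hypothetical;
# in practice they would come from loadImages() and an ROI loaded from an rtstruct
# (e.g. via rttypes.ROI collectionFromFile()).
def _example_wavelet_energy(ct_vol, tumor_roi):
    # 'db1' wavelet with a 5x5x5 (radius=2) energy neighborhood, matching the defaults above
    return wavelet_energy(ct_vol, radius=2, roi=tumor_roi, wavelet_str='db1',
                          mode_str='smooth')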
def loadPickle(path, mod=None, feature_label=None, nindent=l2_indent):
    warnings.warn(
        'DEPRECATED IN FAVOR OF: calculate_features.loadPrecalculated()',
        DeprecationWarning)
    logger.info(
        indent('Pickled feature vector found ({!s}). Loading.'.format(mod), nindent))
    vol = None
    try:
        vol = MaskableVolume().fromPickle(path)
    except rttypes.PickleOutdatedError:
        # old pickle definition doesn't contain mod and feature_label; add them and repickle.
        # note: this assumes vol was populated before the error was raised; if fromPickle
        # failed outright, vol remains None and the update is skipped.
        if vol is not None:
            if mod:
                vol.mod = mod
            if feature_label:
                vol.feature_label = feature_label
            logger.info(
                indent('outdated pickle found, updating in filesystem', nindent))
            vol.toPickle(path)
    except Exception:
        vol = None

    if (vol):
        logger.info(
            indent('Pickled feature vector loaded successfully.', nindent))
        return vol
    else:
        logger.info(
            indent('there was a problem loading the file: {!s}'.format(path), nindent))
        return None
def loadImages(images_path, modalities):
    """takes a list of modality strings and loads dicoms as a MaskableVolume instance from images_path

    Args:
        images_path -- Full path to patient specific directory containing various modality dicom images.
            Each modality imageset is contained in a directory within images_path where the modality
            string in modalities must match the directory name. This subdir is recursively searched
            for all dicoms.
        modalities  -- list of modality strings that are used to identify subdirectories from
            which dicoms are loaded

    Returns:
        dictionary of {modality: imvolume} that contains loaded image data for each modality supported
    """
    # check if path specified exists
    if (not os.path.exists(images_path)):
        logger.info('Couldn\'t find specified path, nothing was loaded.')
        return None
    else:
        # load imvector and store to dictionary for each modality
        # if modality is missing, don't add to dictionary
        if (modalities is None or len(modalities) == 0):
            logger.info('No modalities supplied. skipping')
            return None
        else:
            volumes = OrderedDict()
            for mod in modalities:
                logger.info(
                    indent('Importing {mod:s} images'.format(mod=mod.upper()), l1_indent))
                dicom_path = os.path.join(images_path, '{mod:s}'.format(mod=mod))
                if (os.path.exists(dicom_path)):
                    # recursively walk modality path for dicom images, and build a dataset from it
                    try:
                        volumes[mod] = MaskableVolume().fromDir(dicom_path, recursive=True)
                    except Exception:
                        logger.info(
                            'failed to create Volume for modality: {:s}'.format(mod))
                    else:
                        size = volumes[mod].frameofreference.size
                        logger.info(
                            indent(
                                'stacked {len:d} datasets of shape: ({d:d}, {r:d}, {c:d})'.format(
                                    len=size[2], d=1, r=size[1], c=size[0]), l2_indent))
                else:
                    logger.info(
                        indent(
                            'path to {mod:s} dicoms doesn\'t exist. skipping\n'
                            '(path: {path:s})'.format(mod=mod, path=dicom_path), l2_indent))
                logger.info('')
            return volumes
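# Example (illustrative sketch only): loading CT and PET volumes for one patient.
# The directory path below is hypothetical; loadImages() expects images_path to contain
# one subdirectory per modality string (e.g. 'ct/', 'pet/').
def _example_load_images():
    volumes = loadImages('/data/patients/patient01', ['ct', 'pet'])
    if volumes is not None:
        ct_vol = volumes.get('ct')
        pet_vol = volumes.get('pet')
        return ct_vol, pet_vol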
def cluster_kmeans(feature_matrix, nclusters=10, eps=1e-4, njobs=-2):
    """take input feature array of N rows and D columns and perform standard kmeans clustering
    using the sklearn KMeans library

    Args:
        feature_matrix -- numpy array of N rows and D columns where N is the number of voxels in the
            volume and D is the number of features.

    Optional Args:
        nclusters      -- number of clusters
        eps            -- epsilon convergence criteria

    Returns:
        imvector of cluster assignments from 0 to k-1 aligned to the BaseVolumes of feature_matrix
    """
    # check inputs
    if not isinstance(feature_matrix, np.ndarray):
        logger.warning(
            indent('a proper numpy ndarray was not provided. skipping.', g_indents[1]))
        logger.warning(
            indent('{!s} != {!s}'.format(type(feature_matrix), np.ndarray), g_indents[1]))
        return None
    if (nclusters <= 1):
        logger.exception(indent('k must be >1', g_indents[1]))
        raise ValueError('k must be >1')

    # Preprocessing - normalization
    normalizer = StandardScaler()
    normalized_feature_matrix = np.nan_to_num(normalizer.fit_transform(feature_matrix))

    # create estimator obj
    km = KMeans(n_clusters=nclusters,
                max_iter=300,
                n_init=10,
                init='k-means++',
                precompute_distances=True,
                tol=eps,
                n_jobs=njobs)
    km.fit(normalized_feature_matrix)
    logger.debug(indent('#iters: {:d}'.format(km.n_iter_), g_indents[1]))
    logger.debug(
        indent('score: {score:0.4f}'.format(score=km.score(normalized_feature_matrix)),
               g_indents[1]))
    return km.predict(normalized_feature_matrix)
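# Example (illustrative sketch only): clustering a small synthetic feature matrix.
# In the real pipeline the matrix would come from create_feature_matrix(); random data is
# used here purely to show the call signature and return value.
def _example_cluster_kmeans():
    rng = np.random.RandomState(0)
    synthetic_features = rng.rand(500, 4)   # 500 "voxels", 4 features
    assignments = cluster_kmeans(synthetic_features, nclusters=3)
    return assignments                      # array of labels in 0..nclusters-1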
def cluster_hierarchical_scipy(feature_matrix, nclusters=3, metric='euclidean', method='ward'):
    """take input feature array of N rows and D columns and perform agglomerative hierarchical
    clustering using the scipy hierarchical clustering library

    Args:
        feature_matrix -- numpy array of N rows and D columns where N is the number of voxels in the
            volume and D is the number of features.

    Optional Args:
        nclusters      -- number of clusters to find
        metric         -- metric used to compute linkage ['euclidean', 'l1', 'l2', 'manhattan']
        method         -- criterion to use for cluster merging ['ward', 'complete', 'average']
    """
    # check inputs
    if not isinstance(feature_matrix, np.ndarray):
        logger.exception(
            indent('a proper numpy ndarray was not provided. {!s} != {!s}'.format(
                type(feature_matrix), np.ndarray), g_indents[1]))
        raise TypeError

    # sanitize string inputs
    method = method.lower()
    metric = metric.lower()

    # Preprocessing - normalization
    normalizer = StandardScaler()
    normalized_feature_matrix = normalizer.fit_transform(feature_matrix)

    # determine valid parameters
    valid_method = GLOBAL['scipy_hcluster_valid_methods']
    if (method not in valid_method):
        logger.exception('method must be one of {!s}'.format(valid_method))
        raise ValueError('method must be one of {!s}'.format(valid_method))
    if (method == 'maximum'):
        method = 'complete'

    valid_metric = GLOBAL['scipy_hcluster_valid_metrics']
    if (metric not in valid_metric):
        logger.exception('metric must be one of {!s}'.format(valid_metric))
        raise ValueError('metric must be one of {!s}'.format(valid_metric))
    if (method == 'ward'):
        # must use euclidean distance
        metric = 'euclidean'

    # perform fit and estimation
    linkage_matrix = sch.linkage(normalized_feature_matrix, method, metric)
    prediction = sch.fcluster(linkage_matrix, nclusters, criterion='maxclust')
    return (prediction, linkage_matrix)
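# Example (illustrative sketch only): scipy-based hierarchical clustering of a synthetic
# matrix. Assumes the module's GLOBAL parameter lists include 'ward' and 'euclidean', and
# that sch refers to scipy.cluster.hierarchy as used by cluster_hierarchical_scipy() above.
def _example_cluster_hierarchical_scipy():
    rng = np.random.RandomState(0)
    synthetic_features = rng.rand(200, 3)
    labels, linkage_matrix = cluster_hierarchical_scipy(
        synthetic_features, nclusters=4, metric='euclidean', method='ward')
    # linkage_matrix can be visualized with sch.dendrogram(linkage_matrix) if desired
    return labels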
def wavelet_raw(image_volume, radius=2, roi=None, wavelet_str='db1', mode_str='smooth', level=0):
    # compute wavelet coefficients
    logger.info(
        indent('performing 3d wavelet decomp using wavelet: {!s}'.format(wavelet_str),
               g_indents[3]))
    roi_volume = image_volume.conformTo(roi.frameofreference)
    wavelet_coeffs = wavelet_decomp_3d(roi_volume, wavelet_str, mode_str)
    nlevels = len(wavelet_coeffs) - 1
    wavelet_coeffs_diag = wavelet_coeffs[nlevels - level]['ddd']
    zoomfactors = tuple(
        np.true_divide(roi_volume.frameofreference.size[::-1], wavelet_coeffs_diag.shape))
    # scale low-res coefficients at highest level to image res
    result = scipy.ndimage.interpolation.zoom(wavelet_coeffs_diag, zoomfactors, order=3)
    result = MaskableVolume().fromArray(result, roi_volume.frameofreference)
    return result
def image_iterator(processing_function, image_volume, radius=2, roi=None):
    """compute the pixel-wise feature of an image over a region defined by neighborhood

    Args:
        processing_function -- callable applied at each voxel location with neighborhood context.
            It receives the local patch of values as an ndarray of shape
            (2*radius+1, 2*radius+1, 2*radius+1) (or (1, 2*radius+1, 2*radius+1) for 2D images)
            and returns a scalar feature value. (A list of callables writing to separate result
            MaskableVolume objects was once supported; see the commented-out blocks below.)
        image_volume        -- a MaskableVolume or a 2D/3D numpy ndarray of pixel intensities
        radius              -- describes neighborhood size in each dimension; a radius of 4 gives a
            9x9x9 neighborhood

    Returns:
        feature_volume of the same type as image_volume (MaskableVolume or np.ndarray), with shape
        matching the input (or the ROI subset when roi is given)
    """
    # This is an ugly way of type-checking but cant get isinstance to see both as the same
    if (MaskableVolume.__name__ in str(type(image_volume))):
        (c, r, d) = image_volume.frameofreference.size

        def get_val(image_volume, z, y, x):
            # image boundary handling is built into BaseVolume.get_val
            return image_volume.get_val(z, y, x)

        def set_val(feature_volume, z, y, x, val):
            feature_volume.set_val(z, y, x, val)

        # instantiate a blank BaseVolume of the proper size
        feature_volume = MaskableVolume().fromArray(
            np.zeros((d, r, c)), image_volume.frameofreference)
        # feature_volume.modality = image_volume.modality
        # feature_volume.feature_label = 'feature'
    elif isinstance(image_volume, np.ndarray):
        if image_volume.ndim == 3:
            d, r, c = image_volume.shape
        elif image_volume.ndim == 2:
            d, r, c = (1, *image_volume.shape)
            image_volume = image_volume.reshape((d, r, c))
        # instantiate a blank np.ndarray of the proper size
        feature_volume = np.zeros((d, r, c))

        def get_val(image, z, y, x):
            if (z < 0 or y < 0 or x < 0) or (z >= d or y >= r or x >= c):
                return 0
            else:
                return image[z, y, x]

        def set_val(image, z, y, x, val):
            image[z, y, x] = val
    else:
        logger.info('invalid image type supplied ({:s}). Please specify an image of type '
                    'BaseVolume or type np.ndarray'.format(str(type(image_volume))))
        return None

    # z_radius_range controls 2d neighborhood vs 3d neighborhood for 2d vs 3d images
    if d == 1:  # 2D image
        logger.debug(
            indent('Computing 2D feature with radius: {:d}'.format(radius), l3))
        z_radius_range = [0]
    elif d > 1:  # 3D image
        logger.debug(
            indent('Computing 3D feature with radius: {:d}'.format(radius), l3))
        z_radius_range = range(-radius, radius + 1)

    # in plane patch range
    radius_range = range(-radius, radius + 1)

    # timing
    start_feature_calc = time.time()

    # absolute max indices for imagevolume - for handling request of voxel out of bounds
    cbound = c
    rbound = r
    dbound = d

    # set calculation index bounds -- will be redefined if roi is specified
    cstart, cstop = 0, cbound
    rstart, rstop = 0, rbound
    dstart, dstop = 0, dbound

    # defines dimensionality
    d_subset = dstop - dstart
    r_subset = rstop - rstart
    c_subset = cstop - cstart

    # restrict calculation bounds to roi
    if (roi is not None):
        # get max extents of the mask/ROI to speed up calculation only within ROI cubic volume
        extents = roi.getROIExtents()
        cstart, rstart, dstart = image_volume.frameofreference.getIndices(extents.start)
        cstop, rstop, dstop = np.subtract(
            image_volume.frameofreference.getIndices(extents.end()), 1)
        logger.info(
            indent(
                'calculation subset volume x=({xstart:d}->{xstop:d}), '
                'y=({ystart:d}->{ystop:d}), '
                'z=({zstart:d}->{zstop:d})'.format(zstart=dstart, zstop=dstop,
                                                   ystart=rstart, ystop=rstop,
                                                   xstart=cstart, xstop=cstop), l4))
        # redefine feature_volume
        d_subset = dstop - dstart
        r_subset = rstop - rstart
        c_subset = cstop - cstart
        feature_frameofreference = FrameOfReference(
            (extents.start), (image_volume.frameofreference.spacing),
            (c_subset, r_subset, d_subset))
        feature_volume = feature_volume.fromArray(
            np.zeros((d_subset, r_subset, c_subset)), feature_frameofreference)

    # # setup an output volume for each feature in processing_function list
    # if (not isinstance(processing_function, list)):
    #     tmp = []
    #     tmp.append(processing_function)
    #     processing_function = tmp
    # feature_volumes = [feature_volume]
    # for funct in processing_function[1:]:
    #     feature_volumes.append(np.zeros_like(feature_volume))

    # nested loop approach -> slowest, try GPU next
    total_voxels = d * r * c
    subset_total_voxels = d_subset * r_subset * c_subset
    # onepercent = int(subset_total_voxels / 100)
    # progress-report interval (guarded against 0 for very small subsets)
    fivepercent = max(1, int(subset_total_voxels / 100 * 5))
    idx = -1
    subset_idx = -1
    z_idx = -1
    for z in range(dstart, dstop):
        z_idx += 1
        y_idx = -1
        x_idx = -1
        for y in range(rstart, rstop):
            y_idx += 1
            x_idx = -1
            for x in range(cstart, cstop):
                x_idx += 1
                idx += 1
                if (z < dstart or z > dstop or y < rstart or y > rstop
                        or x < cstart or x > cstop):
                    # we shouldn't ever be here
                    logger.info('why are we here?!')
                    # fill 0 instead
                    set_val(feature_volume, z_idx, y_idx, x_idx, 0)
                else:
                    subset_idx += 1
                    patch_vals = np.zeros(
                        (len(z_radius_range), len(radius_range), len(radius_range)))
                    for p_z, k_z in enumerate(z_radius_range):
                        for p_x, k_x in enumerate(radius_range):
                            for p_y, k_y in enumerate(radius_range):
                                # logger.info('k_z:{z:d}, k_y:{y:d}, k_x:{x:d}'.format(z=k_z, y=k_y, x=k_x))
                                # handle out of bounds requests - replace with 0
                                request_z = z + k_z
                                request_y = y + k_y
                                request_x = x + k_x
                                if (request_z < 0 or request_z >= dbound
                                        or request_y < 0 or request_y >= rbound
                                        or request_x < 0 or request_x >= cbound):
                                    val = 0
                                else:
                                    val = get_val(image_volume, request_z, request_y,
                                                  request_x)
                                # store to local image patch
                                patch_vals[p_z, p_y, p_x] = val
                    # for i, funct in enumerate(processing_function):
                    proc_value = processing_function(patch_vals)
                    set_val(feature_volume, z_idx, y_idx, x_idx, proc_value)
                    if (False and (subset_idx % fivepercent == 0
                                   or subset_idx == subset_total_voxels - 1)):
                        logger.debug(
                            'feature value at ({x:d}, {y:d}, {z:d})= {e:f}'.format(
                                x=z * y * x + y * x + x, y=z * y * x + y, z=z * y * x,
                                e=proc_value))
                    if ((subset_idx % fivepercent == 0
                         or subset_idx == subset_total_voxels - 1)):
                        logger.debug(
                            indent(
                                '{p:0.2%} - voxel: {i:d} of {tot:d} (of total: {abstot:d})'.format(
                                    p=subset_idx / subset_total_voxels,
                                    i=subset_idx,
                                    tot=subset_total_voxels,
                                    abstot=total_voxels), l4))

    if isinstance(image_volume, np.ndarray) and d == 1:
        # need to reshape ndarray if input was 2d
        feature_volume = feature_volume.reshape((r_subset, c_subset))
        # for i, feature_volume in enumerate(feature_volumes):
        #     feature_volumes[i] = feature_volume.reshape((r_subset, c_subset))

    end_feature_calc = time.time()
    logger.debug(
        timer('feature calculation time:', end_feature_calc - start_feature_calc, l3))
    # if len(features_volumes > 1):
    #     return feature_volumes
    # else:
    return feature_volume
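# Example (illustrative sketch only): running image_iterator() on a plain numpy volume with
# a simple patch-mean "plugin". Any callable that maps a patch ndarray to a scalar works
# here; np.mean is used just for demonstration.
def _example_image_iterator():
    rng = np.random.RandomState(0)
    toy_volume = rng.rand(5, 16, 16)                      # (depth, rows, cols)
    smoothed = image_iterator(np.mean, toy_volume, radius=1)
    return smoothed                                       # ndarray with the same shape as toy_volume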
def create_feature_matrix(feature_volumes, roi=None, PCA=False, PCA_components=0.95):
    """takes a list of feature BaseVolumes and combines them into a numpy ndarray of N rows and D features
    where N is the number of samples in each feature vector (voxels in the image) and D is the number of
    feature vectors stored in the "feature_volumes" list.

    Args:
        feature_volumes -- python list of BaseVolumes that are aligned

    Returns:
        tuple of (pruned_feature_array, frameofreference, dense_feature_array, feature_column_labels)
        where pruned_feature_array is a numpy ndarray with N rows and D columns; N is the number of
        voxels in the aligned images (in depth-row major order) and D is the number of feature vectors
        in the list (len(feature_volumes))
    """
    if len(feature_volumes) <= 0:
        logger.warning(indent('no features supplied. skipping', g_indents[1]))
        return None
    else:
        if (roi):
            # use roi.frameofreference as common shape
            frameofreference = roi.frameofreference
        else:
            # find highest resolution volume and use as FrameOfReference
            highest_res_volume = feature_volumes[0]
            highest_res = np.prod(highest_res_volume.frameofreference.spacing)
            for volume in feature_volumes[1:]:
                res = np.prod(volume.frameofreference.spacing)
                if (res < highest_res):
                    highest_res_volume = volume
                    highest_res = res
            # assign highest res FOR as common shape
            frameofreference = highest_res_volume.frameofreference

        # take the selected FOR's shape to be the reference
        ref_shape = frameofreference.size[::-1]  # reverses tuple from (x,y,z) to (z,y,x)
        logger.debug('Common Spacing (z,y,x): ({:f}, {:f}, {:f})'.format(
            *frameofreference.spacing))
        logger.debug('Common Shape (z,y,x): ({:d}, {:d}, {:d})'.format(*ref_shape))

        # create list of commonly shaped feature vectors
        conformed_feature_list = []
        dense_feature_list = []
        feature_column_labels = []
        for i, vol in enumerate(feature_volumes):
            # check for invalid feature
            if (vol is None):
                logger.debug(
                    'empty (None) feature provided at index {:d}, removing and continuing'.format(i))
                continue

            # conform feature volumes and add to list
            conformed_volume = vol.conformTo(frameofreference)
            if (conformed_volume.array.shape != ref_shape):
                logger.warning(
                    indent(
                        'shape mismatch. ref={ref:s} != feature[{num:d}]={shape:s}.'
                        ' removing and continuing'.format(
                            ref=str(ref_shape),
                            num=i,
                            shape=str(conformed_volume.array.shape)), g_indents[1]))
                continue
            else:
                # concatenate, need to make feat.array a 2d vector
                pruned_feature_vector = create_pruned_vector(conformed_volume, roi)
                conformed_feature_list.append(pruned_feature_vector)
                dense_feature_list.append(
                    conformed_volume.vectorize(roi).reshape((-1, 1)))
                # label generator
                label = generate_heatmap_label(conformed_volume)
                feature_column_labels.append(label)

        # combine accepted features into array of shape (nSamples, nFeatures)
        pruned_feature_array = np.nan_to_num(
            np.concatenate(conformed_feature_list, axis=1))
        # create expanded/dense version for pickling and using in hierarchical clustering
        dense_feature_array = np.nan_to_num(
            np.concatenate(dense_feature_list, axis=1))

        # dimensionality reduction
        if PCA:
            pca = sklearnPCA(whiten=False, n_components=PCA_components)  # standard PCA
            # nfeats = pruned_feature_array.shape[1]
            pruned_feature_array = pca.fit_transform(pruned_feature_array)
            # logger.debug('pca: keeping {:d} of {:d} components'.format(pca.n_components, nfeats))

        logger.debug(
            indent(
                'combined {n:d} features into pruned array of shape: {shape:s}'.format(
                    n=pruned_feature_array.shape[1],
                    shape=str(pruned_feature_array.shape)), g_indents[1]))
        return (pruned_feature_array, frameofreference, dense_feature_array,
                feature_column_labels)
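# Example (illustrative sketch only): assembling a feature matrix from previously computed
# feature volumes and clustering it. feature_vols and tumor_roi are hypothetical placeholders
# that would come from loadFeatures()/image_iterator() and an ROI loader respectively.
def _example_create_feature_matrix(feature_vols, tumor_roi):
    result = create_feature_matrix(feature_vols, roi=tumor_roi, PCA=True, PCA_components=0.95)
    if result is None:
        return None
    pruned_array, frameofreference, dense_array, column_labels = result
    labels = cluster_kmeans(pruned_array, nclusters=5)
    return labels, frameofreference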
def loadROIs(rtstruct_path):
    """loads an rtstruct specified by path and returns a dict of ROI objects

    DEPRECATED IN FAVOR OF rttypes.ROI classmethod collectionFromFile(rtstruct_path)

    Args:
        rtstruct_path -- path to rtstruct.dcm file

    Returns:
        dict<key='contour name', val=ROI>
    """
    warnings.warn(
        'scripting.loadROIs() DEPRECATED IN FAVOR OF rttypes.ROI classmethod collectionFromFile(rtstruct_path)',
        DeprecationWarning)
    if (not os.path.exists(rtstruct_path)):
        logger.info(
            indent('invalid path provided: "{:s}"'.format(rtstruct_path), l2_indent))
        raise ValueError('invalid path provided: "{:s}"'.format(rtstruct_path))

    logger.info(indent('Importing ROIs', l1_indent))

    # check if path is file or dir
    if (os.path.isdir(rtstruct_path)):
        # search recursively for a valid rtstruct file
        ds_list = dcmio.read_dicom_dir(rtstruct_path, recursive=True)
        if (ds_list is None or len(ds_list) == 0):
            logger.info('no rtstruct datasets found at "{:s}"'.format(rtstruct_path))
            raise Exception('no rtstruct datasets found at "{:s}"'.format(rtstruct_path))
        ds = ds_list[0]
    elif (os.path.isfile(rtstruct_path)):
        ds = dcmio.read_dicom(rtstruct_path)

    # parse rtstruct file and instantiate maskvolume for each contour located
    # add each maskvolume to dict with key set to contour name and number?
    if (ds is not None):
        # get structuresetROI sequence
        StructureSetROI_list = ds.StructureSetROISequence
        nContours = len(StructureSetROI_list)
        if (nContours <= 0):
            logger.debug(indent('no contours were found', l2_indent))
            return None

        # Add structuresetROI to dict
        StructureSetROI_dict = {
            StructureSetROI.ROINumber: StructureSetROI
            for StructureSetROI in StructureSetROI_list
        }

        # get dict containing a contour dataset for each StructureSetROI with a paired key=ROINumber
        ROIContour_dict = {
            ROIContour.ReferencedROINumber: ROIContour
            for ROIContour in ds.ROIContourSequence
        }

        # construct a dict of ROI objects where contour name is key
        roi_dict = {}
        for ROINumber, structuresetroi in StructureSetROI_dict.items():
            roi_dict[structuresetroi.ROIName] = (ROI(
                frameofreference=None,
                roicontour=ROIContour_dict[ROINumber],
                structuresetroi=structuresetroi))

        # prune empty ROIs from dict
        for roiname, roi in dict(roi_dict).items():
            if (roi.coordslices is None or len(roi.coordslices) <= 0):
                logger.debug(
                    indent('pruning empty ROI: {:s} from loaded ROIs'.format(roiname),
                           l2_indent))
                del roi_dict[roiname]

        logger.info(
            indent('loaded {:d} ROIs successfully'.format(len(roi_dict)), l2_indent))
        return roi_dict
    else:
        logger.info(indent('no dataset was found', l2_indent))
        return None
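# Example (illustrative sketch only): loading ROIs from an rtstruct file and selecting one by
# name. The path and ROI name are hypothetical placeholders. Note that loadROIs() is deprecated
# in favor of the rttypes.ROI classmethod collectionFromFile(), per the warning above.
def _example_load_rois():
    roi_dict = loadROIs('/data/patients/patient01/rtstruct.dcm')
    if roi_dict is not None:
        return roi_dict.get('GTV')   # returns None if no ROI named 'GTV' exists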
def loadFeatures(pickle_path, image_volumes, feature_defs, roi=None, savepickle=True):
    """Checks if feature vector has already been pickled at path specified and loads the files if so,
    or computes feature for each modality and pickles for later access.

    Args:
        pickle_path   -- should be the full path to the patient specific "precomputed" dir.
            pickle file names are searched for occurrence of pet, ct, and feature and will be loaded
            if a modality string and "feature" are both present.
        image_volumes -- dictionary of {modality, BaseVolume} that contains loaded image data for
            each modality supported
        feature_defs  -- dict<key='feature_name', value=dict<k=argname, v=argval>>

    Returns:
        dict<key='feature_name', value=dict<key=mod, value=MaskableVolume>>
    """
    # check if path specified exists
    if (not os.path.exists(pickle_path)):
        logger.info('Couldn\'t find specified path, nothing was loaded.')
        return None
    else:
        # extract modalities from image_volumes
        if (image_volumes is None or len(image_volumes) == 0):
            logger.info('No image data was provided. Skipping')
            return None
        modalities = list(image_volumes.keys())

        # load first file that matches the search and move to next modality
        feature_volumes = OrderedDict()
        for feature_def in feature_defs:
            feature_label = feature_def.label
            feature_args = feature_def.args
            calculate_feature = feature_def.calculation_function
            recalculate = feature_def.recalculate
            these_feature_volumes = OrderedDict()  # k: mod, v: BaseVolume

            logger.info('Loading Feature ({!s}):'.format(feature_label))
            for mod in modalities:
                logger.info(
                    indent('Loading {!s} feature:'.format(mod.upper()), l1_indent))
                # initialize to None
                these_feature_volumes[mod] = None

                # get files that match settings
                match = checkPickle(pickle_path, feature_label, feature_args, mod, roi)
                if (not recalculate and match is not None):
                    # found pickled feature vector, load it and add to dict - no need to calculate feature
                    these_feature_volumes[mod] = loadPickle(
                        os.path.join(pickle_path, match), mod, feature_label)
                else:
                    # Calculate feature this time
                    if (match):
                        logger.info(
                            indent('Recalculating feature as requested', l2_indent))
                    else:
                        logger.info(
                            indent('No pickled feature vector found ({!s})'.format(mod),
                                   l2_indent))
                    logger.info(indent('Computing feature now...', l2_indent))
                    vol = calculate_feature(image_volumes[mod], roi=roi, **feature_args)
                    # inject metadata
                    vol.modality = mod
                    vol.feature_label = feature_label
                    these_feature_volumes[mod] = vol

                # Check status of calculation
                if these_feature_volumes[mod] is None:
                    logger.info(
                        indent('Failed to compute feature for {!s} images.'.format(mod.upper()),
                               l2_indent))
                else:
                    logger.info(indent('feature computed successfully', l2_indent))
                    # pickle for later recall
                    if savepickle:
                        savePickle(pickle_path, these_feature_volumes[mod], mod,
                                   feature_label, feature_args, roi)
                logger.info('')
            # END mod
            feature_volumes[feature_label] = these_feature_volumes
            logger.info('')
        # END feature_def

        # return dict of modality specific feature imvectors with keys defined by keys for image_volumes arg.
        return feature_volumes
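# Example (illustrative sketch only): driving loadFeatures() with a minimal feature definition
# object. The real pipeline presumably supplies its own feature-definition class; here
# types.SimpleNamespace is used only to satisfy the attributes accessed above
# (label, args, calculation_function, recalculate). All paths are hypothetical.
def _example_load_features(image_volumes, tumor_roi):
    from types import SimpleNamespace
    wavelet_def = SimpleNamespace(label='wavelet_energy',
                                  args={'radius': 2, 'wavelet_str': 'db1'},
                                  calculation_function=wavelet_energy,
                                  recalculate=False)
    return loadFeatures('/data/patients/patient01/precomputed',
                        image_volumes, [wavelet_def], roi=tumor_roi)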
def cluster_hierarchical_sklearn(feature_matrix, nclusters=3, affinity='euclidean', linkage='ward'):
    """take input feature array of N rows and D columns and perform agglomerative hierarchical
    clustering using the standard sklearn agglomerative clustering library

    Args:
        feature_matrix -- numpy array of N rows and D columns where N is the number of voxels in the
            volume and D is the number of features.

    Optional Args:
        nclusters      -- number of clusters to find
        affinity       -- metric used to compute linkage ['euclidean', 'l1', 'l2', 'manhattan']
        linkage        -- criterion to use for cluster merging ['ward', 'complete', 'average']
    """
    # check inputs
    if not isinstance(feature_matrix, np.ndarray):
        logger.exception(
            indent('a proper numpy ndarray was not provided. {!s} != {!s}'.format(
                type(feature_matrix), np.ndarray), g_indents[1]))
        raise TypeError

    # sanitize string inputs
    linkage = linkage.lower()
    affinity = affinity.lower()

    # Preprocessing - normalization
    normalizer = StandardScaler()
    normalized_feature_matrix = normalizer.fit_transform(feature_matrix)

    # determine valid parameters
    valid_linkage = ['ward', 'complete', 'maximum', 'average']
    if (linkage not in valid_linkage):
        logger.exception('linkage must be one of {!s}'.format(valid_linkage))
        raise ValueError('linkage must be one of {!s}'.format(valid_linkage))
    if (linkage == 'maximum'):
        linkage = 'complete'

    valid_affinity = ['l1', 'l2', 'manhattan', 'cosine', 'euclidean']
    if (affinity not in valid_affinity):
        logger.exception('affinity must be one of {!s}'.format(valid_affinity))
        raise ValueError('affinity must be one of {!s}'.format(valid_affinity))
    if (linkage == 'ward'):
        # must use euclidean distance
        affinity = 'euclidean'

    conn_matrix = None
    # create estimator obj
    agg = AgglomerativeClustering(n_clusters=nclusters,
                                  connectivity=conn_matrix,
                                  affinity=affinity,
                                  compute_full_tree=True,
                                  linkage=linkage,
                                  pooling_func=np.mean)
    # perform fit and estimation
    prediction = agg.fit_predict(normalized_feature_matrix)
    logger.debug(indent('#leaves: {:d}'.format(agg.n_leaves_), g_indents[1]))
    logger.debug(indent('#components: {:d}'.format(agg.n_components_), g_indents[1]))
    return (prediction, agg)
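# Example (illustrative sketch only): sklearn-based agglomerative clustering of a synthetic
# matrix; in the real pipeline the matrix would come from create_feature_matrix().
def _example_cluster_hierarchical_sklearn():
    rng = np.random.RandomState(0)
    synthetic_features = rng.rand(200, 3)
    labels, estimator = cluster_hierarchical_sklearn(
        synthetic_features, nclusters=4, affinity='euclidean', linkage='ward')
    return labels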