def extractImgPxlSample(inputImg, pxlNSample, noData=None):
    """
    Extract a regular sub-sample of the pixels of an image as a 2D array.

    The image is read in 200 x 200 pixel windows using RIOS. Within each
    block the pixels are flattened, pixels where any band equals the no data
    value are dropped (when noData is provided) and every pxlNSample-th
    remaining pixel is kept. Progress is printed to stdout while reading.

    :param inputImg: the input image file.
    :param pxlNSample: the sampling step (e.g., 100 keeps every 100th
                       candidate pixel of each block).
    :param noData: optional no data value; a pixel is skipped if any of its
                   bands equals this value. If None all pixels are
                   candidates.
    :return: numpy array with one row per sampled pixel and one column per
             image band. An array with zero rows is returned when no pixel
             could be sampled (e.g., the whole image is no data).

    """
    # Import the RIOS image reader
    from rios.imagereader import ImageReader

    reader = ImageReader(inputImg, windowxsize=200, windowysize=200)
    print('Started .0.', end='', flush=True)
    outCount = 10

    # The per-block samples are collected and joined once at the end rather
    # than growing the output array with a concatenate for every block.
    blkSamples = []
    nBands = 0
    outDType = None
    for (info, block) in reader:
        if info.getPercent() > outCount:
            print('.' + str(int(outCount)) + '.', end='', flush=True)
            outCount = outCount + 10

        blkShape = block.shape
        nBands = blkShape[0]
        outDType = block.dtype

        # (bands, rows, cols) -> (pixels, bands)
        blkBands = block.reshape((blkShape[0], (blkShape[1] * blkShape[2])))
        blkBandsTrans = numpy.transpose(blkBands)

        if noData is not None:
            blkBandsTrans = blkBandsTrans[(blkBandsTrans != noData).all(axis=1)]

        if blkBandsTrans.shape[0] > 0:
            nSamp = int((blkBandsTrans.shape[0]) / pxlNSample)
            nSampRange = numpy.arange(0, nSamp, 1) * pxlNSample
            blkSamples.append(blkBandsTrans[nSampRange])

    print('. Completed')

    if not blkSamples:
        # Nothing was sampled; return a well-formed empty array instead of
        # failing on an unbound variable.
        return numpy.empty((0, nBands), dtype=outDType)
    return numpy.concatenate(blkSamples, axis=0)
def getData(self):
    """
    Read a 3d numpy array with data for the current extent.

    Raises GDALException when the driver is in CREATE mode, as raster data
    can only be read in READ or UPDATE modes.
    """
    if self.mode == basedriver.CREATE:
        raise GDALException('Can only read raster data in READ or UPDATE modes')

    blockDType = imageio.GDALTypeToNumpyType(self.gdalType)

    # use RIOS to do the hard work of reading the block and its margin
    return ImageReader.readBlockWithMargin(
        self.ds,
        self.blockxcoord,
        self.blockycoord,
        self.blockxsize,
        self.blockysize,
        blockDType,
        self.controls.overlap,
        self.nullValList)
def countPxlsOfVal(inputImg, vals=(0,)):
    """
    Function which counts the number of pixels of a set of values returning
    the counts in the same order as the values provided.

    Every value is compared against the whole of each image block read, so
    for a multi-band image the count of a value is accumulated over all the
    bands.

    :param inputImg: the input image
    :param vals: is a sequence (e.g., list or tuple) of pixel values to be
                 counted. Default is a single value of 0.
    :return: numpy array (dtype int64) with one count per value in vals.
    :raises Exception: if vals is empty.

    """
    if len(vals) == 0:
        raise Exception('At least 1 value should be provided within the vals input varable.')

    numVals = len(vals)
    outVals = numpy.zeros(numVals, dtype=numpy.int64)

    from rios.imagereader import ImageReader

    reader = ImageReader(inputImg)
    for (info, block) in reader:
        for idx, val in enumerate(vals):
            outVals[idx] += (block == val).sum()

    return outVals
#!/usr/bin/env python
# Script which multiplies every pixel value of an image by two, processing
# the image block by block with RIOS.
#
# Usage: <script> <input image> <output image>
import sys
from scipy import ndimage
from rios.imagereader import ImageReader
from rios.imagewriter import ImageWriter

inImage = sys.argv[1]
outImage = sys.argv[2]

reader = ImageReader(inImage)
writer = None
for (info, block) in reader:
    out = block * 2

    if writer is not None:
        writer.write(out)
        continue

    # First block: the output file (compressed HFA) is created from it.
    writer = ImageWriter(outImage, info=info, firstblock=out,
                         drivername='HFA', creationoptions=['COMPRESSED=TRUE'])

writer.close(calcStats=True)
import optparse

import numpy as np
from scipy import ndimage
from rios.imagereader import ImageReader
from rios.imagewriter import ImageWriter

# Script which runs a seeded watershed (scipy.ndimage.watershed_ift) over a
# gradient image, block by block, using RIOS.
#
# The order of the inputs matters: the first image provides the seeds
# (markers) and the second the gradient surface which is flooded.
inputs = [
    "/Users/pete/Temp/Hyperforest/Kersselaerspleyn_LiDAR_05m_pmfgrd_chmNN_median5_morphgrad_minima.env",
    "/Users/pete/Temp/Hyperforest/Kersselaerspleyn_LiDAR_05m_pmfgrd_chmNN_median5_morphgrad.env",
]
outfile = "/Users/pete/Temp/Hyperforest/Kersselaerspleyn_LiDAR_05m_pmfgrd_chmNN_median5_watershed.img"

reader = ImageReader(inputs, windowxsize=1000, windowysize=1000, overlap=100)
writer = None

# read through each block, run the watershed on it
# and write the result into the output file
for (info, blocks) in reader:
    block1, block2 = blocks

    # watershed_ift needs an unsigned integer input and integer markers
    seeds = np.int32(block1)
    grad = np.uint16(block2)

    # only the first band of each image is used; the result is given back
    # its leading band axis before being written.
    out = np.expand_dims(ndimage.watershed_ift(grad[0], seeds[0]), 0)

    if writer is None:
        writer = ImageWriter(outfile, info=info, firstblock=out)
    else:
        writer.write(out)

    # progress on a single, continually overwritten, line
    print(info.getPercent(), '%\r', end='', flush=True)

# make sure the output dataset is closed
if writer is not None:
    writer.close()
def label_pxl_sample_chips(sample_pxls_img, cls_msk_img, output_image, gdalformat, chip_size, cls_lut, sample_pxl_img_band=1, cls_msk_img_band=1):
    """
    A function which labels sample pixels using the proportions of the
    classes found within a chip centred on each pixel (can be used in
    combination with rsgislib.imageutils.assign_random_pxls). It is expected
    to be used when existing maps are used to create deep learning chip
    classification training data.

    A sample pixel is given a class if the proportion of the chip covered by
    that class is >= the threshold for the class in the LUT. Where several
    classes meet their threshold the one with the highest proportion is
    assigned. Pixels which are not labelled have an output value of 0.

    :param sample_pxls_img: The input binary image with the pixel locations
                            (value == 1)
    :param cls_msk_img: The classification image used to assign the output
                        pixel values.
    :param output_image: The output image. Single pixels with the class
                         value will be outputted.
    :param gdalformat: The output image file format.
    :param chip_size: The size of the chip used to identify the class - would
                      probably correspond to the chip size being used for the
                      deep learning classification. The area used is half
                      the chip size around the pixel (i.e., the pixel from
                      the samples image will be at the centre of the chip).
    :param cls_lut: A dict look up table (LUT) with the thresholds (as
                    proportions of the chip) per class for the pixel to be
                    classified as that class.
    :param sample_pxl_img_band: Default 1. The image band in the sample
                                image.
    :param cls_msk_img_band: Default 1. The image band in the class mask
                             image.

    Example::

        sample_pxls_img = 'LS5TM_20000108_latn531lonw37_r23p204_osgb_samples.kea'
        cls_msk_img = 'LS5TM_20000108_latn531lonw37_r23p204_osgb_clouds_up.kea'
        output_image = 'LS5TM_20000108_latn531lonw37_r23p204_osgb_samples_lbld.kea'

        cls_lut = dict()
        cls_lut[1] = 0.2
        cls_lut[2] = 0.2
        cls_lut[3] = 0.99

        label_pxl_sample_chips(sample_pxls_img, cls_msk_img, output_image, 'KEA', 21, cls_lut)

    """
    import rsgislib.rastergis
    from rios.imagereader import ImageReader
    from rios.imagewriter import ImageWriter
    import tqdm
    import numpy
    import math

    half_win = math.floor(chip_size / 2)
    # An odd chip size needs one more pixel on the upper side of the window
    # so that the sample pixel is at the centre of the chip.
    upper_pad = 1 if (chip_size % 2) != 0 else 0
    n_pxls = chip_size * chip_size

    in_imgs = [sample_pxls_img, cls_msk_img]
    in_img_bands = [[sample_pxl_img_band], [cls_msk_img_band]]

    writer = None
    reader = ImageReader(in_imgs, windowxsize=200, windowysize=200,
                         overlap=half_win, layerselection=in_img_bands)
    for (info, block) in tqdm.tqdm(reader):
        samples_arr = block[0]
        cls_arr = block[1]
        out_arr = numpy.zeros_like(samples_arr, dtype=numpy.uint8)

        # Only the block interior (i.e., excluding the overlap margin) is
        # searched for sample pixels; the max() keeps the slice empty,
        # rather than wrapping around, for a block smaller than the margins.
        row_stop = max(samples_arr.shape[1] - half_win, half_win)
        col_stop = max(samples_arr.shape[2] - half_win, half_win)
        interior = samples_arr[0, half_win:row_stop, half_win:col_stop]

        for (row_off, col_off) in numpy.argwhere(interior == 1):
            y = row_off + half_win
            x = col_off + half_win
            chip = cls_arr[0,
                           y - half_win:y + half_win + upper_pad,
                           x - half_win:x + half_win + upper_pad]

            # Find the class with the largest proportion among those which
            # reach their own threshold (the first one wins a tie).
            best_val = None
            best_prop = None
            chip_vals, chip_counts = numpy.unique(chip, return_counts=True)
            for val, count in zip(chip_vals, chip_counts):
                if val in cls_lut:
                    val_prop = count / n_pxls
                    if val_prop >= cls_lut[val]:
                        if (best_prop is None) or (val_prop > best_prop):
                            best_val = val
                            best_prop = val_prop

            if best_prop is not None:
                out_arr[0][y][x] = best_val

        if writer is not None:
            writer.write(out_arr)
        else:
            writer = ImageWriter(output_image, info=info, firstblock=out_arr,
                                 drivername=gdalformat)
    writer.close(calcStats=False)

    rsgislib.rastergis.populateStats(output_image, True, True, True)
def apply_keras_chips_pixel_classifier(classTrainInfo, keras_cls_mdl, imgMask, imgMaskVal, imgFileInfo, chip_h_size, outClassImg, gdalformat, pred_batch_size=128, pred_max_queue_size=10, pred_workers=1, pred_use_multiprocessing=False, classClrNames=True):
    """
    This function applies a trained keras model, which classifies a pixel
    from the image chip centred on it, to an image. The output image will
    contain the hard membership of the predicted class.

    The image is processed in 200 x 200 pixel blocks read with a margin of
    chip_h_size pixels; for every pixel of the block where the mask equals
    imgMaskVal a chip is extracted with the bands of all the input images
    stacked in the order they are listed in imgFileInfo (array layout:
    [pixel, band, chip row, chip column]).

    For pred_batch_size, pred_max_queue_size, pred_workers and
    pred_use_multiprocessing options see the keras documentation
    https://keras.io/models/model/

    :param classTrainInfo: dict (where the key is the class name) of
                           rsgislib.classification.ClassInfoObj objects
                           which were used to train the classifier; they
                           provide the pixel value id and RGB class values.
                           The id values index the model outputs and must be
                           less than the number of classes.
    :param keras_cls_mdl: a trained keras model object, with input
                          dimensions equivalent to the number of image bands
                          specified in the imgFileInfo input and an output
                          layer which provides an output array of the length
                          of the number of classes.
    :param imgMask: is an image file providing a mask to specify where
                    should be classified. Simplest mask is all the valid
                    data regions (rsgislib.imageutils.genValidMask)
    :param imgMaskVal: the pixel value within the imgMask to limit the
                       region to which the classification is applied. Can be
                       used to create a hierarchical classification.
    :param imgFileInfo: a list of rsgislib.imageutils.ImageBandInfo objects
                        (also used within
                        rsgislib.imageutils.extractZoneImageBandValues2HDF)
                        to identify which images and bands are to be used
                        for the classification so it adheres to the training
                        data.
    :param chip_h_size: is half the chip size to be extracted (i.e., 10 with
                        output image chips 21x21, 10 pixels either side of
                        the one of interest).
    :param outClassImg: Output image which will contain the hard
                        classification.
    :param gdalformat: is the output image format - all GDAL supported
                       formats are supported.
    :param pred_batch_size: the batch size used for the classification
                            prediction.
    :param pred_max_queue_size: the max queue size used for the
                                classification prediction
    :param pred_workers: the number of workers used for the classification
                         prediction
    :param pred_use_multiprocessing: whether to use a multiprocessing option
                                     for the classification prediction
    :param classClrNames: default is True and therefore a colour table with
                          the colours specified in ClassInfoObj and a
                          ClassName (from classTrainInfo) column will be
                          added to the output file.
    :raises ValueError: if a ClassInfoObj id is not less than the number of
                        classes.

    """
    n_classes = len(classTrainInfo)
    # Look up table from the model output index (ClassInfoObj.id) to the
    # pixel value written to the output image (ClassInfoObj.out_id).
    cls_id_lut = numpy.zeros(n_classes)
    for clsname in classTrainInfo:
        if classTrainInfo[clsname].id >= n_classes:
            raise ValueError("ClassInfoObj '{}' id ({}) is not consecutive starting from 0.".format(
                clsname, classTrainInfo[clsname].id))
        cls_id_lut[classTrainInfo[clsname].id] = classTrainInfo[clsname].out_id

    # The mask is always the first input, followed by the images to classify.
    inImgs = [imgMask]
    inImgBands = [[1]]
    n_img_bands = 0
    for inImgInfo in imgFileInfo:
        inImgs.append(inImgInfo.fileName)
        inImgBands.append(inImgInfo.bands)
        n_img_bands = n_img_bands + len(inImgInfo.bands)
    nImgs = len(imgFileInfo)

    scn_overlap = chip_h_size
    chip_size = (chip_h_size * 2) + 1

    writer = None
    reader = ImageReader(inImgs, windowxsize=200, windowysize=200, overlap=scn_overlap, layerselection=inImgBands)
    for (info, block) in tqdm.tqdm(reader):
        classMskArr = block[0]
        blkRows = classMskArr.shape[1]
        blkCols = classMskArr.shape[2]

        # Only pixels in the block interior (i.e., excluding the overlap
        # margin) are classified; the max() keeps the slices empty, rather
        # than wrapping around, for a block smaller than the margins.
        yStart = scn_overlap
        yEnd = max(blkRows - scn_overlap, yStart)
        xStart = scn_overlap
        xEnd = max(blkCols - scn_overlap, xStart)

        vld_msk = numpy.zeros((blkRows, blkCols), dtype=bool)
        vld_msk[yStart:yEnd, xStart:xEnd] = (classMskArr[0, yStart:yEnd, xStart:xEnd] == imgMaskVal)
        # Row-major list of the (y, x) locations to classify; this is the
        # same order in which the boolean mask assignment below fills the
        # output so predictions and pixels stay aligned.
        vld_pxls = numpy.argwhere(vld_msk)
        n_vld_pxls = vld_pxls.shape[0]

        out_cls_arr = numpy.zeros((blkRows, blkCols), dtype=numpy.uint16)

        # Blocks without any pixel to classify are written as zeros without
        # calling the model on an empty array.
        if n_vld_pxls > 0:
            feat2cls = numpy.zeros([n_vld_pxls, n_img_bands, chip_size, chip_size], dtype=numpy.float32)
            for iFeat, (y, x) in enumerate(vld_pxls):
                yMin = y - scn_overlap
                yMax = y + scn_overlap + 1
                xMin = x - scn_overlap
                xMax = x + scn_overlap + 1
                # The band index runs on across the input images so the
                # bands of a later image are stacked after, rather than
                # written over, those of the earlier images.
                iFeatBand = 0
                for nImg in range(nImgs):
                    imgBlk = block[nImg + 1][..., yMin:yMax, xMin:xMax]
                    for iBand in range(imgBlk.shape[0]):
                        numpy.copyto(feat2cls[iFeat, iFeatBand], imgBlk[iBand], casting='safe')
                        iFeatBand = iFeatBand + 1

            preds_idxs = numpy.argmax(
                keras_cls_mdl.predict(feat2cls, batch_size=pred_batch_size,
                                      max_queue_size=pred_max_queue_size,
                                      workers=pred_workers,
                                      use_multiprocessing=pred_use_multiprocessing),
                axis=1)
            # release the (potentially large) chips array
            feat2cls = None

            # Translate the model output indexes to the output pixel values.
            preds_cls_ids = numpy.zeros_like(preds_idxs, dtype=numpy.uint16)
            for idx, cls_out_id in enumerate(cls_id_lut):
                preds_cls_ids[preds_idxs == idx] = cls_out_id

            out_cls_arr[vld_msk] = preds_cls_ids

        out_cls_arr = numpy.expand_dims(out_cls_arr, axis=0)

        if writer is None:
            writer = ImageWriter(outClassImg, info=info, firstblock=out_cls_arr, drivername=gdalformat)
        else:
            writer.write(out_cls_arr)
    writer.close(calcStats=False)

    if classClrNames:
        rsgislib.rastergis.populateStats(outClassImg, addclrtab=True, calcpyramids=True, ignorezero=True)
        max_val = rsgislib.imagecalc.getImageBandMinMax(outClassImg, 1, False, 0)[1]
        ratDataset = gdal.Open(outClassImg, gdal.GA_Update)

        max_cls_val = 0
        for classKey in classTrainInfo:
            if classTrainInfo[classKey].out_id > max_cls_val:
                max_cls_val = classTrainInfo[classKey].out_id

        if max_cls_val > max_val:
            # There are class values beyond the largest value in the image
            # so new colour columns, long enough for every class, are
            # created (the class colours are set below).
            red = numpy.random.randint(0, 255, max_cls_val + 1)
            green = numpy.random.randint(0, 255, max_cls_val + 1)
            blue = numpy.random.randint(0, 255, max_cls_val + 1)
        else:
            red = rat.readColumn(ratDataset, 'Red')
            green = rat.readColumn(ratDataset, 'Green')
            blue = rat.readColumn(ratDataset, 'Blue')

        # 'S255' is the fixed length byte string dtype (formerly also
        # spelt 'a255').
        ClassName = numpy.empty_like(red, dtype=numpy.dtype('S255'))
        ClassName[...] = ""

        for classKey in classTrainInfo:
            print("Apply Colour to class \'" + classKey + "\'")
            red[classTrainInfo[classKey].out_id] = classTrainInfo[classKey].red
            green[classTrainInfo[classKey].out_id] = classTrainInfo[classKey].green
            blue[classTrainInfo[classKey].out_id] = classTrainInfo[classKey].blue
            ClassName[classTrainInfo[classKey].out_id] = classKey

        rat.writeColumn(ratDataset, "Red", red)
        rat.writeColumn(ratDataset, "Green", green)
        rat.writeColumn(ratDataset, "Blue", blue)
        rat.writeColumn(ratDataset, "ClassName", ClassName)
        ratDataset = None
def img_pixel_sample_cluster(inputImg, outputImg, gdalformat='KEA', noDataVal=0, imgSamp=100, clusterer=MiniBatchKMeans(n_clusters=60, init='k-means++', max_iter=100, batch_size=100), calcStats=True, useMeanShiftEstBandWidth=False):
    """
    A function which performs a clustering using the algorithms available
    within the scikit-learn library. The clusterer is fitted to a sample of
    the input image and then applied, using its predict function, to the
    whole image (therefore this function is only compatible with clusterers
    which implement predict).

    Output pixels are numbered from 1 (cluster index + 1); pixels which are
    not finite in every band, or which are the no data value in every band,
    are given a value of 0.

    :param inputImg: input image file.
    :param outputImg: output image file.
    :param gdalformat: output image file format.
    :param noDataVal: no data value associated with the input image.
    :param imgSamp: the input image sampling. (e.g., 100 is every 100th
                    pixel)
    :param clusterer: clusterer from scikit-learn which must have a predict
                      function.
    :param calcStats: calculate image pixel statistics, histogram and image
                      pyramids - note if you are not using a KEA file then
                      the format needs to support RATs for this option as
                      histogram and colour table are written to RAT.
    :param useMeanShiftEstBandWidth: use the mean-shift algorithm as the
                                     clusterer (pass None as the clusterer)
                                     where the bandwidth is calculated from
                                     the data itself.

    """
    print('Sample input image:')
    dataSamp = rsgislib.imageutils.extractImgPxlSample(inputImg, imgSamp, noDataVal)

    if useMeanShiftEstBandWidth:
        print('Using Mean-Shift predict bandwidth')
        from sklearn.cluster import MeanShift, estimate_bandwidth
        bandwidth = estimate_bandwidth(dataSamp, quantile=0.2, n_samples=500)
        clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True)

    print('Fit Clusterer')
    clusterer.fit(dataSamp)
    print('Fitted Clusterer')

    print('Apply to whole image:')
    reader = ImageReader(inputImg, windowxsize=200, windowysize=200)
    writer = None
    for (info, block) in tqdm.tqdm(reader):
        n_bands, n_rows, n_cols = block.shape
        # one row per pixel, one column per band
        pxls = block.reshape((n_bands, (n_rows * n_cols))).T

        # A pixel is clustered when all its bands are finite and it is not
        # the no data value in every band.
        is_finite = numpy.isfinite(pxls).all(axis=1)
        is_no_data = (pxls == noDataVal).all(axis=1)
        vld_idxs = numpy.flatnonzero(is_finite & numpy.logical_not(is_no_data))

        clus_vals = numpy.zeros((pxls.shape[0]))
        if vld_idxs.shape[0] > 0:
            # + 1 so that 0 is kept for the pixels which were not clustered
            clus_vals[vld_idxs] = clusterer.predict(pxls[vld_idxs]) + 1

        out_arr = clus_vals.reshape([1, n_rows, n_cols])

        if writer is not None:
            writer.write(out_arr)
        else:
            writer = ImageWriter(outputImg, info=info, firstblock=out_arr,
                                 drivername=gdalformat, creationoptions=[])
    writer.close(calcStats=False)

    if calcStats:
        rsgislib.rastergis.populateStats(clumps=outputImg, addclrtab=True, calcpyramids=True, ignorezero=True)
def img_pixel_tiled_cluster(inputImg, outputImg, gdalformat='KEA', noDataVal=0, clusterer=MiniBatchKMeans(n_clusters=60, init='k-means++', max_iter=100, batch_size=100), calcStats=True, useMeanShiftEstBandWidth=False, tileXSize=200, tileYSize=200):
    """
    A function which performs a clustering using the algorithms available
    within the scikit-learn library. The clusterer is fitted to, and its
    labels used for, a single tile at a time and the result will therefore
    have tile boundaries. However, memory usage is controlled and isn't
    excessive, which it could be when processing a whole image at once.

    Output pixels are numbered from 1 (cluster label + 1) within each tile;
    pixels which are not finite in every band, or which are the no data
    value in every band, are given a value of 0.

    :param inputImg: input image file.
    :param outputImg: output image file.
    :param gdalformat: output image file format.
    :param noDataVal: no data value associated with the input image.
    :param clusterer: clusterer from scikit-learn; it is re-fitted for every
                      tile and its labels_ attribute provides the output.
    :param calcStats: calculate image pixel statistics, histogram and image
                      pyramids - note if you are not using a KEA file then
                      the format needs to support RATs for this option as
                      histogram and colour table are written to RAT.
    :param useMeanShiftEstBandWidth: use the mean-shift algorithm as the
                                     clusterer (pass None as the clusterer)
                                     where the bandwidth is calculated from
                                     the data itself.
    :param tileXSize: tile size in the x-axis in pixels.
    :param tileYSize: tile size in the y-axis in pixels.

    """
    if useMeanShiftEstBandWidth:
        from sklearn.cluster import MeanShift, estimate_bandwidth

    reader = ImageReader(inputImg, windowxsize=tileXSize, windowysize=tileYSize)
    writer = None
    for (info, block) in tqdm.tqdm(reader):
        n_bands, n_rows, n_cols = block.shape
        # one row per pixel, one column per band
        pxls = block.reshape((n_bands, (n_rows * n_cols))).T

        # A pixel is clustered when all its bands are finite and it is not
        # the no data value in every band.
        is_finite = numpy.isfinite(pxls).all(axis=1)
        is_no_data = (pxls == noDataVal).all(axis=1)
        vld_idxs = numpy.flatnonzero(is_finite & numpy.logical_not(is_no_data))

        clus_vals = numpy.zeros((pxls.shape[0]))
        if vld_idxs.shape[0] > 0:
            tile_pxls = pxls[vld_idxs]
            if useMeanShiftEstBandWidth:
                # the bandwidth is estimated from the pixels of this tile
                bandwidth = estimate_bandwidth(tile_pxls, quantile=0.2, n_samples=1000)
                clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            clusterer.fit(tile_pxls)
            # + 1 so that 0 is kept for the pixels which were not clustered
            clus_vals[vld_idxs] = clusterer.labels_ + 1

        out_arr = clus_vals.reshape([1, n_rows, n_cols])

        if writer is not None:
            writer.write(out_arr)
        else:
            writer = ImageWriter(outputImg, info=info, firstblock=out_arr,
                                 drivername=gdalformat, creationoptions=[])
    writer.close(calcStats=False)

    if calcStats:
        rsgislib.rastergis.populateStats(clumps=outputImg, addclrtab=True, calcpyramids=True, ignorezero=True)
def zoneMeans(clumpFile, dataFile, clumpBand=1, dataBands=None, ignoreDataVals=None):
    """
    Given a file of clumps and a file of data, calculates the mean and
    standard deviation for the area of each clump value in the data.

    If dataBands is None does all bands in the dataFile, otherwise pass a
    list of 1-based band indices or a single integer.

    If dataBands is None or a list, returns a list of tuples (one per band,
    in the order of dataBands). Each tuple contains two arrays, one with the
    mean values, one with the standard deviation values. The indices of
    these arrays go from zero to the maximum clump value for which data was
    found (in any of the bands) and have values for each clump id, zero for
    other indices.

    If dataBands is a single integer, returns a tuple with mean and standard
    deviation arrays as above.

    Ignore value(s) may be passed in with the ignoreDataVals parameter. This
    may be a single value, in which case the same is used for all dataBands,
    or a sequence the same length as dataBands.
    """
    fileDict = {'clumps': clumpFile, 'data': dataFile}

    origdataBands = dataBands  # so we know whether to return list or tuple
    if isinstance(dataBands, int):
        dataBands = [dataBands]  # treat as list for now

    # use dictionaries for accumulated values
    # index is the clump id
    # we have a list of these dictionaries, one per dataBand and in the same
    # order as dataBands (i.e., indexed by position and not by band number)
    sumDictList = []
    sumsqDictList = []
    countDictList = []
    accumulatorsReady = False

    # read thru the images
    reader = ImageReader(fileDict)
    for (info, blocks) in reader:
        if dataBands is None:
            # now we know how many bands there are for the default list
            dataBands = range(1, blocks['data'].shape[0] + 1)

        if not accumulatorsReady:
            # done here as the number of bands may only just be known
            for dataBand in dataBands:
                sumDictList.append({})
                sumsqDictList.append({})
                countDictList.append({})
            if (ignoreDataVals is not None) and numpy.isscalar(ignoreDataVals):
                # a single value: make list same size as dataBands
                ignoreDataVals = [ignoreDataVals] * len(dataBands)
            accumulatorsReady = True

        # get the data for the specified bands and flatten it
        clumps = blocks['clumps'][clumpBand - 1].flatten()
        bandData = [blocks['data'][dataBand - 1].flatten() for dataBand in dataBands]

        # for each clump id
        for value in numpy.unique(clumps):
            # the clump mask is the same for all the bands
            clumpMask = (clumps == value)

            for idx, data in enumerate(bandData):
                mask = clumpMask
                # if we are ignoring values then extend mask
                if ignoreDataVals is not None:
                    mask = clumpMask & (data != ignoreDataVals[idx])

                # get the data for that clump
                dataSubset = data.compress(mask)

                # check we have data
                if dataSubset.size == 0:
                    continue

                # accumulate in floating point: squaring in the dtype of the
                # image would overflow for the small integer types
                dataSubset = dataSubset.astype(numpy.float64)
                subsetSum = dataSubset.sum()
                subsetSumSq = (dataSubset * dataSubset).sum()

                # load into our dictionaries
                sumDict = sumDictList[idx]
                sumsqDict = sumsqDictList[idx]
                countDict = countDictList[idx]
                sumDict[value] = sumDict.get(value, 0) + subsetSum
                sumsqDict[value] = sumsqDict.get(value, 0) + subsetSumSq
                countDict[value] = countDict.get(value, 0) + dataSubset.size

    # work out the length of the output arrays; all the bands are considered
    # so every band gets arrays of the same (and a sufficient) length
    allClumpIds = [clumpId for sumDict in sumDictList for clumpId in sumDict]
    maxidx = (int(max(allClumpIds)) + 1) if allClumpIds else 0

    resultList = []
    # go through each band
    for sumDict, sumsqDict, countDict in zip(sumDictList, sumsqDictList, countDictList):
        # turn into arrays so we don't have to iterate
        # (the three dictionaries were filled together so have the same
        # key order)
        idxs = numpy.fromiter(sumDict.keys(), int)
        sums = numpy.zeros((maxidx,), float)
        sums[idxs] = numpy.fromiter(sumDict.values(), float)
        sumsqs = numpy.zeros((maxidx,), float)
        sumsqs[idxs] = numpy.fromiter(sumsqDict.values(), float)
        counts = numpy.zeros((maxidx,), int)
        counts[idxs] = numpy.fromiter(countDict.values(), int)

        # mask out invalid divides
        outInvalid = counts == 0
        counts[outInvalid] = 1

        means = sums / counts
        # rounding can leave the variance marginally below zero
        variances = numpy.maximum((sumsqs / counts) - (means * means), 0)
        stds = numpy.sqrt(variances)

        means[outInvalid] = 0
        stds[outInvalid] = 0

        resultList.append((means, stds))

    if isinstance(origdataBands, int):
        return resultList[0]  # only one item
    else:
        return resultList
def zoneMajority(clumpFile, dataFile, clumpBand=1, dataBands=None):
    """
    Given a file of clumps and a file of data, calculates the most common
    data value for each clump and the histogram of the data values.

    The data values are counted with numpy.bincount and are therefore
    expected to be non-negative integers.

    If dataBands is None does all bands in the dataFile, otherwise pass a
    list of 1-based band indices or a single integer.

    If dataBands is None or a list, returns a list of tuples (one per band,
    in the order of dataBands). Each tuple contains an array of the most
    common values and a histogram. The indices of this array go from zero to
    the maximum clump value and have values for each clump id, zero for
    other indices. The histogram is a dictionary, keyed on the clump id.
    Each value in the dictionary is itself a dictionary keyed on the data
    value, with the count of that value.

    If dataBands is a single integer, returns a tuple with the mode array
    and histogram dictionary as above.
    """
    origdataBands = dataBands  # so we know whether to return list or tuple
    if isinstance(dataBands, int):
        dataBands = [dataBands]  # treat as list for now

    fileDict = {'clumps': clumpFile, 'data': dataFile}

    # index is the clump id
    # list of dictionaries, one per dataBand and in the same order as
    # dataBands (i.e., indexed by position and not by band number)
    clumpDictList = []
    if dataBands is not None:
        # if None, sorted below when we know how many bands
        # create the dictionaries for each band
        for dataBand in dataBands:
            clumpDictList.append({})

    # read thru the images
    reader = ImageReader(fileDict)
    for (info, blocks) in reader:
        # get the clumps and flatten them
        clumps = blocks['clumps'][clumpBand - 1].flatten()
        clumpIds = numpy.unique(clumps)

        if dataBands is None:
            # now we know how many bands there are for the default list
            dataBands = range(1, blocks['data'].shape[0] + 1)
            # create the dictionaries for each band
            for dataBand in dataBands:
                clumpDictList.append({})

        for idx, dataBand in enumerate(dataBands):
            data = blocks['data'][dataBand - 1].flatten()
            clumpDict = clumpDictList[idx]

            # for each clump id
            for value in clumpIds:
                # get the data for that clump
                dataSubset = data.compress(clumps == value)

                # check we have data
                if dataSubset.size == 0:
                    continue

                # retrieve the histogram of the clump (created if this is
                # the first time the clump has been seen)
                histDict = clumpDict.setdefault(value, {})

                # do the bincount
                bincount = numpy.bincount(dataSubset)
                # only interested in values where count != 0
                bins = numpy.flatnonzero(bincount)

                # add the counts to the histogram of the clump
                for binvalue, count in zip(bins, bincount[bins]):
                    histDict[binvalue] = histDict.get(binvalue, 0) + count

    resultList = []
    # each band is summarised from its own dictionary
    for clumpDict in clumpDictList:
        # work out the length of the array and
        # create a blank array
        maxidx = (int(max(clumpDict)) + 1) if clumpDict else 0
        modeArray = numpy.zeros((maxidx,), numpy.uint32)

        # go thru each clump
        for value, histDict in clumpDict.items():
            # find the mode
            maxValue, maxCount = max(histDict.items(), key=lambda x: x[1])
            modeArray[value] = maxValue

        resultList.append((modeArray, clumpDict))

    if isinstance(origdataBands, int):
        return resultList[0]  # only one item
    else:
        return resultList