def _load_data(xName, yName, tileRadius, onlySlices, omitLabels=None):
    """Read the EM volume (and its labels, when given) and preprocess them.

    Args:
        xName: passed to ``emlib.load_cube`` to read the image volume.
        yName: passed to ``emlib.load_cube`` to read the label volume; when
            falsy (e.g. deploy mode) no labels are loaded.
        tileRadius: forwarded to ``emlib.mirror_edges`` for both volumes.
        onlySlices: when truthy, used to index the first axis of both volumes.
        omitLabels: forwarded to ``_omit_labels``; its (possibly updated)
            result is what ``emlib.fix_class_labels`` receives.

    Returns:
        ``(X, Y)`` when ``yName`` is truthy, otherwise just ``X``.
    """
    volume = emlib.load_cube(xName, np.float32)

    # The first axis is expected to be Z, which normally holds fewer slices
    # than there are pixels along either in-plane axis.
    assert(volume.shape[0] < volume.shape[1])
    assert(volume.shape[0] < volume.shape[2])

    if onlySlices:
        volume = volume[onlySlices, :, :]
    print('[emCNN]: data shape: %s' % str(volume.shape))

    volume = emlib.mirror_edges(volume, tileRadius)

    # Bring intensities into [0 1].
    # *** ASSUMPTION *** raw data lives in [0 255]
    if np.max(volume) > 1:
        volume = volume / 255.
    print('[emCNN]: data min/max: %0.2f / %0.2f' % (np.min(volume), np.max(volume)))

    # No labels file (e.g. deploy mode): only the image volume is returned.
    if not yName:
        return volume

    labels = emlib.load_cube(yName, np.float32)
    if onlySlices:
        labels = labels[onlySlices, :, :]
    print('[emCNN]: labels shape: %s' % str(labels.shape))

    # ** ASSUMPTION **: special case for membrane detection / ISBI volume,
    # recognised by the label set being exactly {0, 255}.
    yClasses = np.unique(labels)
    yClasses.sort()
    isIsbi = (len(yClasses) == 2) and (yClasses[0] == 0) and (yClasses[1] == 255)
    if isIsbi:
        print('[emCNN]: ISBI-style labels detected.  converting 0->1, 255->0')
        wasZero = (labels == 0)
        was255 = (labels == 255)
        labels[wasZero] = 1    # membrane
        labels[was255] = 0     # non-membrane

    # The network maps labels to output indices, so they must be contiguous
    # integers starting at 0; remap the native label values accordingly.
    omitLabels, pctOmitted = _omit_labels(labels, omitLabels)
    labels = emlib.fix_class_labels(labels, omitLabels).astype(np.int32)

    print('[emCNN]: yAll is %s' % str(np.unique(labels)))
    print('[emCNN]: will use %0.2f%% of volume' % (100.0 - pctOmitted))

    labels = emlib.mirror_edges(labels, tileRadius)
    return volume, labels
def _load_data(xName, yName, args, tileSize):
    """Read the EM volume (and its labels, when given) and preprocess them.

    Args:
        xName: passed to ``emlib.load_cube`` to read the image volume.
        yName: passed to ``emlib.load_cube`` to read the label volume; when
            falsy (e.g. deploy mode) no labels are loaded.
        args: object providing ``onlySlices`` (when truthy, used to index the
            first axis of both volumes) and ``omitLabels`` (forwarded to
            ``emlib.fix_class_labels``).
        tileSize: forwarded to ``emlib.mirror_edges`` for both volumes.

    Returns:
        ``(X, Y)``; ``Y`` is ``None`` when ``yName`` is falsy.
    """
    volume = emlib.load_cube(xName, np.float32)

    # The first axis is expected to be Z, which normally holds fewer slices
    # than there are pixels along either in-plane axis.
    assert(volume.shape[0] < volume.shape[1])
    assert(volume.shape[0] < volume.shape[2])

    # NOTE: the shape is reported before any slice selection is applied.
    print('[emCNN]: data shape: %s' % str(volume.shape))

    if args.onlySlices:
        volume = volume[args.onlySlices, :, :]
    volume = emlib.mirror_edges(volume, tileSize)

    # Bring intensities into [0 1], assuming the raw data lives in [0 255].
    if np.max(volume) > 1:
        volume = volume / 255.

    # No labels file (e.g. deploy mode): hand back the image volume alone.
    if not yName:
        return volume, None

    labels = emlib.load_cube(yName, np.float32)
    if args.onlySlices:
        labels = labels[args.onlySlices, :, :]

    # The network maps labels to output indices, so they must be contiguous
    # integers starting at 0; remap the native label values accordingly.
    labels = emlib.fix_class_labels(labels, args.omitLabels)
    print('[emCNN]: yAll is %s' % str(np.unique(labels)))

    # Voxels with a non-negative label are the ones counted as "used".
    pctUsed = 100. * np.sum(labels >= 0) / numel(labels)
    print('[emCNN]: will use %0.2f%% of volume' % pctUsed)

    labels = emlib.mirror_edges(labels, tileSize)
    return volume, labels
def _load_data(xName, yName, args, tileSize):
    """Read the EM volume (and its labels, when given) and preprocess them.

    Args:
        xName: passed to ``emlib.load_cube`` to read the image volume.
        yName: passed to ``emlib.load_cube`` to read the label volume; when
            falsy (e.g. deploy mode) no labels are loaded.
        args: object providing ``onlySlices`` (when truthy, used to index the
            first axis of both volumes) and ``omitLabels`` (forwarded to
            ``emlib.fix_class_labels``).
        tileSize: forwarded to ``emlib.mirror_edges`` for both volumes.

    Returns:
        ``(X, Y)``; ``Y`` is ``None`` when ``yName`` is falsy.
    """
    volume = emlib.load_cube(xName, np.float32)

    # The first axis is expected to be Z, which normally holds fewer slices
    # than there are pixels along either in-plane axis.
    assert(volume.shape[0] < volume.shape[1])
    assert(volume.shape[0] < volume.shape[2])

    # NOTE: the shape is reported before any slice selection is applied.
    print('[emCNN]: data shape: %s' % str(volume.shape))

    if args.onlySlices:
        volume = volume[args.onlySlices, :, :]
    volume = emlib.mirror_edges(volume, tileSize)

    # Bring intensities into [0 1], assuming the raw data lives in [0 255].
    if np.max(volume) > 1:
        volume = volume / 255.

    # No labels file (e.g. deploy mode): hand back the image volume alone.
    if not yName:
        return volume, None

    labels = emlib.load_cube(yName, np.float32)
    if args.onlySlices:
        labels = labels[args.onlySlices, :, :]

    # The network maps labels to output indices, so they must be contiguous
    # integers starting at 0; remap the native label values accordingly.
    labels = emlib.fix_class_labels(labels, args.omitLabels)
    print('[emCNN]: yAll is %s' % str(np.unique(labels)))

    # Voxels with a non-negative label are the ones counted as "used".
    pctUsed = 100. * np.sum(labels >= 0) / numel(labels)
    print('[emCNN]: will use %0.2f%% of volume' % pctUsed)

    labels = emlib.mirror_edges(labels, tileSize)
    return volume, labels
def main(args):
    """Extract tiles from an EM volume and write them to a new LMDB database.

    Each database record is a serialized Caffe ``Datum`` holding one
    ``tileSize x tileSize`` uint8 tile and the label of its center pixel,
    keyed by a zero-padded running tile id.

    Args:
        args: object providing ``tileSize``, ``outDir``, ``emFileName``,
            ``labelsFileName`` (falsy for an unlabeled test volume),
            ``omitLabels`` and ``slicesExpr`` (Python expressions given as
            strings) and ``maxNumExamples``.

    Raises:
        RuntimeError: if ``args.outDir`` already exists.
    """
    # The radius is used to build slice bounds below, so it must be an int:
    # numpy rejects float slice indices.
    tileRadius = int(args.tileSize // 2)
    nMiniBatch = 1000  # here, a "mini-batch" specifies LMDB transaction size

    # make sure we don't clobber an existing output
    if os.path.exists(args.outDir):
        raise RuntimeError('Output path "%s" already exists; please move out of the way and try again' % args.outDir)

    # load the data volumes (EM image and labels, if any)
    print('[make_lmdb]: loading EM data file: %s' % args.emFileName)
    X = emlib.load_cube(args.emFileName, np.float32)

    if args.labelsFileName:
        print('[make_lmdb]: loading labels file: %s' % args.labelsFileName)
        Y = emlib.load_cube(args.labelsFileName, np.float32)
        # NOTE(security): eval() runs arbitrary code taken from the command
        # line; only invoke this script with trusted arguments.
        Y = emlib.fix_class_labels(Y, eval(args.omitLabels))
        assert(Y.shape == X.shape)
    else:
        print('[make_lmdb]: no labels file; assuming this is a test volume')
        Y = np.zeros(X.shape)

    # usually we expect fewer slices in Z than pixels in X or Y.
    # Make sure the dimensions look ok before proceeding.
    assert(X.shape[0] < X.shape[1])
    assert(X.shape[0] < X.shape[2])

    # Identify the subset of the data to use for training.
    # (default is to use it all)
    if args.slicesExpr:
        # NOTE(security): see the eval() remark above.
        sliceIdx = eval(args.slicesExpr)
        X = X[sliceIdx, :, :]  # python puts the z dimension first...
        Y = Y[sliceIdx, :, :]

    X = X.astype(np.uint8)  # critical!! otherwise, Caffe just flails...

    print('[make_lmdb]: EM volume shape: %s' % str(X.shape))
    print('[make_lmdb]: yAll is %s' % np.unique(Y))
    print('[make_lmdb]: %0.2f%% pixels will be omitted' % (100.0*np.sum(Y==-1)/numel(Y)))
    print('[make_lmdb]: writing results to: %s' % args.outDir)
    print('')
    sys.stdout.flush()

    # Create the output database.
    # Multiply the actual size by a fudge factor to get a safe upper bound
    dbSize = (X.nbytes * args.tileSize * args.tileSize + Y.nbytes) * 10
    env = lmdb.open(args.outDir, map_size=dbSize)

    try:
        # Extract all possible tiles.
        # This corresponds to extracting one "epoch" worth of tiles.
        tileId = 0
        tic = time.time()
        # one counter per non-negative class label
        yCnt = np.zeros(int(np.sum(np.unique(Y) >= 0)))

        if np.any(Y > 0):
            # generates a balanced training data set (subsamples and shuffles)
            it = emlib.stratified_interior_pixel_generator(Y, tileRadius, nMiniBatch, omitLabels=[-1])
        else:
            # enumerates all possible tiles in order (no shuffling)
            it = emlib.interior_pixel_generator(X, tileRadius, nMiniBatch)

        for Idx, epochPct in it:
            # respect upper bound on number of examples; tileId is the number
            # of tiles written so far, so stop once it reaches the limit
            if tileId >= args.maxNumExamples:
                print('[make_lmdb]: stopping at %d (max number of examples reached\n)' % (tileId-1))
                break

            # Each mini-batch will be added to the database as a single transaction.
            with env.begin(write=True) as txn:
                # Translate indices Idx -> tiles Xi and labels yi.
                for z, row, col in Idx[:, :3]:
                    yi = int(Y[z, row, col])
                    Xi = X[z,
                           row - tileRadius:row + tileRadius + 1,
                           col - tileRadius:col + tileRadius + 1]
                    assert(Xi.shape == (args.tileSize, args.tileSize))

                    datum = caffe.proto.caffe_pb2.Datum()
                    datum.channels = 1
                    datum.height = Xi.shape[0]
                    datum.width = Xi.shape[1]
                    # tobytes() supersedes the deprecated tostring()
                    datum.data = Xi.tobytes()
                    datum.label = yi
                    strId = '{:08}'.format(tileId)

                    txn.put(strId.encode('ascii'), datum.SerializeToString())
                    tileId += 1
                    yCnt[yi] += 1

                    # check early termination conditions
                    if tileId >= args.maxNumExamples:
                        break

            print('[make_lmdb] %% %0.2f done (%0.2f min; yCnt=%s)' % ((100*epochPct), (time.time() - tic)/60, str(yCnt)))
    finally:
        # always release the LMDB environment, even if tile extraction fails
        env.close()