Beispiel #1
0
def _load_data(xName, yName, tileRadius, onlySlices, omitLabels=None):
    """Load the image volume (and labels, when given) and preprocess them.

    The image volume is read with ``emlib.load_cube``, optionally restricted
    to ``onlySlices`` along the first axis, passed through
    ``emlib.mirror_edges(X, tileRadius)`` and divided by 255 when its maximum
    exceeds 1.

    When ``yName`` is truthy the label volume is loaded, restricted to the
    same slices, converted from the {0, 255} ISBI convention if that is what
    it contains, remapped via ``_omit_labels`` / ``emlib.fix_class_labels``,
    cast to int32 and passed through ``emlib.mirror_edges`` as well.

    Returns ``(X, Y)`` when ``yName`` is truthy, otherwise only ``X``.
    """
    X = emlib.load_cube(xName, np.float32)

    # The first axis (Z) is expected to be the shortest one; stop early
    # if the volume does not look like that.
    nSlices = X.shape[0]
    assert nSlices < X.shape[1]
    assert nSlices < X.shape[2]

    if onlySlices:
        X = X[onlySlices, :, :]
    print('[emCNN]:    data shape: %s' % str(X.shape))

    X = emlib.mirror_edges(X, tileRadius)

    # Bring the intensities into [0 1].
    # *** ASSUMPTION *** original data is in [0 255]
    if np.max(X) > 1:
        X = X / 255.
    xMin, xMax = np.min(X), np.max(X)
    print('[emCNN]:    data min/max: %0.2f / %0.2f' % (xMin, xMax))

    # No labels file (e.g. deploy mode): only the image volume is returned.
    if not yName:
        return X

    Y = emlib.load_cube(yName, np.float32)
    if onlySlices:
        Y = Y[onlySlices, :, :]
    print('[emCNN]:    labels shape: %s' % str(Y.shape))

    # ** ASSUMPTION **: special-case code for membrane detection / ISBI volume
    labelValues = np.sort(np.unique(Y))
    isIsbiStyle = (len(labelValues) == 2
                   and labelValues[0] == 0
                   and labelValues[1] == 255)
    if isIsbiStyle:
        print('[emCNN]:    ISBI-style labels detected.  converting 0->1, 255->0')
        # Both masks are taken before writing so the two assignments
        # cannot interfere with each other.
        membraneMask = (Y == 0)
        nonMembraneMask = (Y == 255)
        Y[membraneMask] = 1       # membrane
        Y[nonMembraneMask] = 0    # non-membrane

    # Labels must be natural numbers (contiguous integers starting at 0)
    # because they are mapped to indices at the output of the network;
    # the next two calls remap the native y values to such indices.
    omitLabels, pctOmitted = _omit_labels(Y, omitLabels)
    Y = emlib.fix_class_labels(Y, omitLabels).astype(np.int32)

    print('[emCNN]:    yAll is %s' % str(np.unique(Y)))
    print('[emCNN]:    will use %0.2f%% of volume' % (100.0 - pctOmitted))

    return X, emlib.mirror_edges(Y, tileRadius)
Beispiel #2
0
def _load_data(xName, yName, args, tileSize):
    """Load the image volume and, if named, the label volume.

    The image volume is read with ``emlib.load_cube``, optionally restricted
    to ``args.onlySlices`` along the first axis, passed through
    ``emlib.mirror_edges(X, tileSize)`` and divided by 255 when its maximum
    exceeds 1.  When ``yName`` is truthy the labels get the same slice
    restriction, are remapped by ``emlib.fix_class_labels`` using
    ``args.omitLabels`` and are passed through ``emlib.mirror_edges`` too.

    Returns ``(X, Y)``; ``Y`` is ``None`` when ``yName`` is falsy.
    """
    X = emlib.load_cube(xName, np.float32)

    # The first axis (Z) is expected to be shorter than the other two;
    # verify that before doing any further work.
    nSlices = X.shape[0]
    assert nSlices < X.shape[1]
    assert nSlices < X.shape[2]
    print('[emCNN]:    data shape: %s' % str(X.shape))

    sliceIdx = args.onlySlices
    if sliceIdx:
        X = X[sliceIdx, :, :]

    X = emlib.mirror_edges(X, tileSize)

    # Bring the intensities into [0 1], assuming the original data
    # is in [0 255].
    if np.max(X) > 1:
        X = X / 255.

    # Without a labels file (e.g. deploy mode) there is nothing more to do.
    if not yName:
        return X, None

    Y = emlib.load_cube(yName, np.float32)
    if sliceIdx:
        Y = Y[sliceIdx, :, :]

    # Labels must be natural numbers (contiguous integers starting at 0)
    # because they are mapped to indices at the output of the network;
    # fix_class_labels remaps the native y values accordingly.
    Y = emlib.fix_class_labels(Y, args.omitLabels)

    print('[emCNN]:    yAll is %s' % str(np.unique(Y)))
    nKept = np.sum(Y >= 0)
    pctKept = 100. * nKept / numel(Y)
    print('[emCNN]:    will use %0.2f%% of volume' % pctKept)

    return X, emlib.mirror_edges(Y, tileSize)
Beispiel #3
0
def _load_data(xName, yName, args, tileSize):
    """Read the data volume (and optional labels) and apply preprocessing.

    Steps for the data volume: load with ``emlib.load_cube``, keep only
    ``args.onlySlices`` along the first axis when that option is set, apply
    ``emlib.mirror_edges`` with ``tileSize``, and divide by 255 when the
    maximum value exceeds 1.

    Steps for the labels (only when ``yName`` is truthy): load, apply the
    same slice selection, remap with ``emlib.fix_class_labels`` and
    ``args.omitLabels``, then apply ``emlib.mirror_edges``.

    Returns the pair ``(X, Y)`` where ``Y`` is ``None`` if no labels file
    was given.
    """

    def _select_slices(volume):
        # Restrict to the requested slices along axis 0, if any were requested.
        if args.onlySlices:
            return volume[args.onlySlices, :, :]
        return volume

    X = emlib.load_cube(xName, np.float32)

    # Usually there are fewer slices in Z than pixels in X or Y;
    # check that the dimensions look plausible before proceeding.
    zExtent = X.shape[0]
    assert zExtent < X.shape[1]
    assert zExtent < X.shape[2]
    print('[emCNN]:    data shape: %s' % str(X.shape))

    X = emlib.mirror_edges(_select_slices(X), tileSize)

    # Rescale to [0 1] under the assumption that raw values are in [0 255].
    if np.max(X) > 1:
        X = X / 255.

    Y = None
    if yName:
        # A labels file was provided (it may be absent, e.g. in deploy mode).
        Y = _select_slices(emlib.load_cube(yName, np.float32))

        # Labels must be natural numbers (contiguous integers starting at 0)
        # because they are mapped to indices at the output of the network.
        Y = emlib.fix_class_labels(Y, args.omitLabels)

        print('[emCNN]:    yAll is %s' % str(np.unique(Y)))
        usedPct = 100. * np.sum(Y >= 0) / numel(Y)
        print('[emCNN]:    will use %0.2f%% of volume' % (usedPct))

        Y = emlib.mirror_edges(Y, tileSize)

    return X, Y
Beispiel #4
0
def main(args):
    """Extract image tiles from an EM volume and write them to a new LMDB.

    Each tile is a square ``args.tileSize`` x ``args.tileSize`` window cut
    from a single slice of the volume and stored as a serialized Caffe
    ``Datum`` under an 8-digit, zero-padded ASCII key.

    Attributes read from ``args``:
        tileSize:        edge length of a tile in pixels.
        outDir:          path of the LMDB to create; must not exist yet.
        emFileName:      file holding the EM image volume.
        labelsFileName:  file holding the labels; falsy for a test volume.
        omitLabels:      Python expression (string) passed through ``eval``
                         and handed to ``emlib.fix_class_labels``.
        slicesExpr:      Python expression (string) passed through ``eval``
                         and used to index the first axis; empty/None keeps
                         every slice.
        maxNumExamples:  upper bound on the number of tiles written.

    Raises:
        RuntimeError: if ``args.outDir`` already exists.
    """
    # Integer radius: it is used in slice bounds below, and NumPy rejects
    # float slice indices (which is what np.floor would produce).
    tileRadius = int(args.tileSize) // 2
    nMiniBatch = 1000  # here, a "mini-batch" specifies LMDB transaction size

    # make sure we don't clobber an existing output
    if os.path.exists(args.outDir):
        raise RuntimeError('Output path "%s" already exists; please move out of the way and try again' % args.outDir)

    # load the data volumes (EM image and labels, if any)
    print('[make_lmdb]: loading EM data file: %s' % args.emFileName)
    X = emlib.load_cube(args.emFileName, np.float32)

    if args.labelsFileName:
        print('[make_lmdb]: loading labels file: %s' % args.labelsFileName)
        Y = emlib.load_cube(args.labelsFileName, np.float32)
        # NOTE(review): eval() executes arbitrary code taken from the
        # command line; only run this with trusted arguments.
        Y = emlib.fix_class_labels(Y, eval(args.omitLabels))
        assert Y.shape == X.shape
    else:
        print('[make_lmdb]: no labels file; assuming this is a test volume')
        Y = np.zeros(X.shape)

    # usually we expect fewer slices in Z than pixels in X or Y.
    # Make sure the dimensions look ok before proceeding.
    assert X.shape[0] < X.shape[1]
    assert X.shape[0] < X.shape[2]

    # Identify the subset of the data to use for training.
    # (default is to use it all; a None/empty expression means "all")
    if args.slicesExpr:
        # NOTE(review): eval() on a command-line string, see above.
        sliceIdx = eval(args.slicesExpr)
        X = X[sliceIdx, :, :]  # python puts the z dimension first...
        Y = Y[sliceIdx, :, :]
    X = X.astype(np.uint8)  # critical!! otherwise, Caffe just flails...

    print('[make_lmdb]: EM volume shape: %s' % str(X.shape))
    print('[make_lmdb]: yAll is %s' % str(np.unique(Y)))
    print('[make_lmdb]: %0.2f%% pixels will be omitted' % (100.0*np.sum(Y==-1)/numel(Y)))
    print('[make_lmdb]: writing results to: %s' % args.outDir)
    print('')
    sys.stdout.flush()

    # Create the output database.
    # Multiply the actual size by a fudge factor to get a safe upper bound
    dbSize = (X.nbytes * args.tileSize * args.tileSize + Y.nbytes) * 10
    env = lmdb.open(args.outDir, map_size=dbSize)

    try:
        # Extract all possible tiles.
        # This corresponds to extracting one "epoch" worth of tiles.
        tileId = 0
        tic = time.time()
        # one counter per non-negative class label
        yCnt = np.zeros(int(np.sum(np.unique(Y) >= 0)))

        if np.any(Y > 0):
            # generates a balanced training data set (subsamples and shuffles)
            it = emlib.stratified_interior_pixel_generator(Y, tileRadius, nMiniBatch, omitLabels=[-1])
        else:
            # enumerates all possible tiles in order (no shuffling)
            it = emlib.interior_pixel_generator(X, tileRadius, nMiniBatch)

        for Idx, epochPct in it:
            # respect upper bound on number of examples; tileId equals the
            # number of tiles written so far, hence >= rather than >
            if tileId >= args.maxNumExamples:
                print('[make_lmdb]: stopping at %d (max number of examples reached\n)' % (tileId-1))
                break

            # Each mini-batch will be added to the database as a single transaction.
            with env.begin(write=True) as txn:
                # Translate indices Idx -> tiles Xi and labels yi.
                for jj in range(Idx.shape[0]):
                    z = int(Idx[jj, 0])
                    row = int(Idx[jj, 1])
                    col = int(Idx[jj, 2])

                    yi = int(Y[z, row, col])
                    Xi = X[z,
                           row - tileRadius:row + tileRadius + 1,
                           col - tileRadius:col + tileRadius + 1]
                    assert Xi.shape == (args.tileSize, args.tileSize)

                    datum = caffe.proto.caffe_pb2.Datum()
                    datum.channels = 1
                    datum.height = Xi.shape[0]
                    datum.width = Xi.shape[1]
                    # tobytes() replaces the deprecated tostring(), which
                    # was removed in NumPy 2.0
                    datum.data = Xi.tobytes()
                    datum.label = yi
                    strId = '{:08}'.format(tileId)

                    txn.put(strId.encode('ascii'), datum.SerializeToString())
                    tileId += 1
                    yCnt[yi] += 1

                    # check early termination conditions
                    if tileId >= args.maxNumExamples:
                        break

            print('[make_lmdb] %% %0.2f done (%0.2f min;   yCnt=%s)' % ((100*epochPct), (time.time() - tic)/60, str(yCnt)))
    finally:
        # release the environment even if tile extraction fails part-way
        env.close()