def write2dvid(vdata):
    """Write one downsampled sub-volume to the ``currsource`` DVID instance.

    ``vdata`` is a ``(coords, data)`` pair.  ``coords`` is
    ``(xiter, yiter, ziter)`` and ``data`` is a 3-D array whose shape is
    unpacked as (z, y, x).

    * Label data (``islabelblk``) is written with one ``put_labels3D`` call,
      unless every voxel is 0, in which case nothing is written.
    * Other data is cut into ``BLKSIZE``-wide pieces along x.  Pieces made up
      solely of ``delimiter`` are skipped; each run of consecutive remaining
      pieces is POSTed to ``<currsource>/blocks/<x>_<y>_<z>/<count>``.

    The server settings, ``maxxrun``, ``BLKSIZE``, ``delimiter`` and
    ``currsource`` are read from the enclosing scope.
    """
    from libdvid import ConnectionMethod
    import numpy
    node_service = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

    coords, data = vdata
    xiter, yiter, ziter = coords

    # set block indices
    zbindex = ziter
    ybindex = yiter
    # the volume was halved along x, so its first block index is halved too
    xbase = xiter*maxxrun // 2

    _, _, xsize = data.shape

    if islabelblk:
        vals = numpy.unique(data)
        # TODO: ignore blank blocks within an x line
        if not (len(vals) == 1 and vals[0] == 0):
            offset_zyx = (zbindex*BLKSIZE, ybindex*BLKSIZE, xbase*BLKSIZE)
            if resource_server != "":
                node_service.put_labels3D(currsource, data, offset_zyx, compress=True, throttle=False)
            else:
                node_service.put_labels3D(currsource, data, offset_zyx, compress=True)
        return

    def post_run(first_block, chunks):
        # POST one run of consecutive blocks starting at x block index first_block
        endpoint = str((currsource + "/blocks/%d_%d_%d/%d") % (first_block, ybindex, zbindex, len(chunks)))
        node_service.custom_request(endpoint, b"".join(chunks), ConnectionMethod.POST)

    # Serialized blocks of the run currently being accumulated.  These are
    # bytes objects (ndarray.tobytes() returns bytes), so they must be joined
    # with b"" -- concatenating them onto a str raises TypeError on Python 3.
    run = []
    run_start = xbase

    for iterx in range(0, xsize, BLKSIZE):
        block = data[:, :, iterx:iterx+BLKSIZE]
        vals = numpy.unique(block)
        if len(vals) == 1 and vals[0] == delimiter:
            # blank block: it ends the current run (if any) and is not written
            if run:
                post_run(run_start, run)
                run = []
        else:
            if not run:
                # first block of a new run: remember where the run starts
                run_start = xbase + iterx // BLKSIZE
            run.append(block.tobytes())

    # write-out leftover blocks
    if run:
        post_run(run_start, run)
        def writeimagepyramid(image):
            """Write a multi-resolution tile pyramid for one image slice to DVID.

            ``image`` is a ``(slicenum, imnpy)`` pair, where ``imnpy`` is a 2-D
            array.  Level 0 is ``imnpy`` itself; each of the ``maxlevel``
            further levels is the previous one zoomed by 0.5.  Every level is
            cut into ``TILESIZE`` x ``TILESIZE`` tiles (edge tiles are
            zero-padded), each tile is encoded in ``imformat`` and POSTed to
            ``<tilename>/tile/xy/<level>/<iter2>_<iter1>_<slicenum>``.

            The server settings, ``maxlevel``, ``TILESIZE``, ``imformat`` and
            ``tilename`` are read from the enclosing scope.
            """
            slicenum, imnpy = image

            from PIL import Image
            from scipy import ndimage
            import io
            import numpy

            from libdvid import ConnectionMethod
            node_service = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

            # use generic downsample algorithm
            imlevels = [imnpy]
            for level in range(1, maxlevel+1):
                # ndimage.zoom is the public name of the function that the
                # deprecated ndimage.interpolation namespace re-exports
                imlevels.append(ndimage.zoom(imlevels[level-1], 0.5))

            # PIL expects "jpeg" rather than "jpg" as the format name
            imformatpil = "jpeg" if imformat == "jpg" else imformat

            # write pyramid for each slice using custom request
            for levelnum, levelslice in enumerate(imlevels):
                dim1, dim2 = levelslice.shape

                # number of tiles needed to cover each dimension (rounded up)
                num1tiles = (dim1-1) // TILESIZE + 1
                num2tiles = (dim2-1) // TILESIZE + 1

                for iter1 in range(num1tiles):
                    for iter2 in range(num2tiles):
                        # extract tile; tiles on the far edges may be smaller
                        # than TILESIZE, so copy into a zero-filled holder
                        tileholder = numpy.zeros((TILESIZE, TILESIZE), numpy.uint8)
                        min1 = iter1*TILESIZE
                        min2 = iter2*TILESIZE
                        tileslice = levelslice[min1:min1+TILESIZE, min2:min2+TILESIZE]
                        t1, t2 = tileslice.shape
                        tileholder[0:t1, 0:t2] = tileslice

                        # encode the tile; tobytes() replaces the removed
                        # ndarray.tostring() alias
                        img = Image.frombuffer('L', (TILESIZE, TILESIZE), tileholder.tobytes(), 'raw', 'L', 0, 1)
                        with io.BytesIO() as buf:
                            img.save(buf, format=imformatpil)
                            encoded = buf.getvalue()

                        # write tile to dvid
                        urlreq = tilename + "/tile/xy/" + str(levelnum) + "/" + str(iter2) + "_" + str(iter1) + "_" + str(slicenum)
                        node_service.custom_request(urlreq, encoded, ConnectionMethod.POST)
 def node_service(self):
     """Return the node service for this object, creating it on first use.

     The service is stored in ``self._node_service`` and reused afterwards.
     Raises RuntimeError (chained to the original exception) when the
     service cannot be created.
     """
     # Fast path: a service has already been created.
     if self._node_service is not None:
         return self._node_service

     try:
         # We don't pass the resource manager details here
         # because we use the resource manager from python.
         self._node_service = retrieve_node_service(self._server, self._uuid, "", "")
     except Exception as ex:
         # Include the local host name so the failing machine can be identified.
         host = socket.gethostname()
         msg = f"Host {host}: Failed to connect to {self._server} / {self._uuid}"
         raise RuntimeError(msg) from ex

     return self._node_service
                def retrievedata(coord):
                    """Fetch one sub-volume of ``prevsource`` from DVID.

                    ``coord`` is ``(xiter, yiter, ziter)``.  The request is two
                    blocks deep in z and y and ``maxxrun`` blocks long in x.
                    Returns ``(coord, volume)``.
                    """
                    xiter, yiter, ziter = coord
                    node_service = retrieve_node_service(server, uuid, resource_server, resource_port)

                    # libdvid takes shape and offset in (z, y, x) order
                    double_blk = BLKSIZE*2
                    shape_zyx = (double_blk, double_blk, maxxrun*BLKSIZE)
                    offset_zyx = (ziter*double_blk, yiter*double_blk, xiter*BLKSIZE*maxxrun)

                    # same call signature for both readers; only the method differs
                    fetch = node_service.get_labels3D if islabelblk else node_service.get_gray3D
                    return (coord, fetch(str(prevsource), shape_zyx, offset_zyx, throttle=False))
                def write2dvid(yblocks):
                    """Write one row of blocks to the ``grayname`` DVID instance.

                    ``yblocks`` is ``((ybindex, layer), blocks)`` where
                    ``blocks`` is a 3-D array whose shape is unpacked as
                    (z, y, x).  The array is cut into ``blocksize``-wide pieces
                    along x.  Pieces made up solely of ``delimiter`` are
                    skipped; each run of consecutive remaining pieces is POSTed
                    to ``<grayname>/blocks/<x>_<y>_<z>/<count>``.  When
                    ``blocklimit`` is positive a run is also flushed as soon as
                    it reaches ``blocklimit`` pieces.

                    ``slice`` (bound in the enclosing scope, where it shadows
                    the builtin), ``blocksize``, the x/y/z offsets and the
                    server settings are read from the enclosing scope.
                    """
                    from libdvid import ConnectionMethod
                    import numpy
                    node_service = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

                    # get block coordinates
                    zbindex = slice // blocksize
                    (ybindex, layer), blocks = yblocks
                    zbindex += layer
                    _, _, xsize = blocks.shape

                    def post_run(first_block, chunks):
                        # POST one run of consecutive blocks starting at x block index first_block
                        endpoint = str((grayname + "/blocks/%d_%d_%d/%d") % (first_block+xoffset, ybindex+yoffset, zbindex+zoffset, len(chunks)))
                        node_service.custom_request(endpoint, b"".join(chunks), ConnectionMethod.POST)

                    # Serialized blocks of the run being accumulated.  They are
                    # bytes (ndarray.tobytes() returns bytes), so they are joined
                    # with b"" -- appending them to a str raises TypeError on
                    # Python 3.
                    run = []
                    run_start = 0 # assume x starts at 0!!

                    for iterx in range(0, xsize, blocksize):
                        # tobytes() already yields a C-ordered copy, so no
                        # explicit .copy() of the slice is needed
                        block = blocks[:,:,iterx:iterx+blocksize]
                        vals = numpy.unique(block)
                        if len(vals) == 1 and vals[0] == delimiter:
                            # blank block: it ends the current run (if any) and is not written
                            if run:
                                post_run(run_start, run)
                                run = []
                            continue

                        if not run:
                            # first block of a new run: remember where the run starts
                            run_start = iterx // blocksize
                        run.append(block.tobytes())

                        # cap the size of a single request
                        if blocklimit > 0 and len(run) >= blocklimit:
                            post_run(run_start, run)
                            run = []

                    # write-out leftover blocks
                    if run:
                        post_run(run_start, run)
 def get_seg():
     """Fetch the label volume for ``subvolume`` (expanded by ``border``).

     Reads ``(size_z, size_y, size_x)`` voxels of the instance named by
     ``pdconf["segmentation-name"]``, starting ``border`` voxels before the
     box's start corner.  All names are taken from the enclosing scope.
     Returns whatever ``get_labels3D`` returns.
     """
     node_service = retrieve_node_service(pdconf["dvid-server"],
             pdconf["uuid"], resource_server, resource_port)

     # Note: libdvid uses zyx order for python functions
     shape_zyx = (size_z, size_y, size_x)
     # retrieve data from box start position; the z coordinate must be the
     # start corner (z1) like y1/x1 -- it previously used the end corner z2
     offset_zyx = (subvolume.box.z1-border, subvolume.box.y1-border, subvolume.box.x1-border)

     if resource_server != "":
         # As with the other DVID reads in this file, disable throttling when a
         # resource server is configured (the two branches used to be identical).
         return node_service.get_labels3D(str(pdconf["segmentation-name"]),
             shape_zyx, offset_zyx, throttle=False)
     return node_service.get_labels3D(str(pdconf["segmentation-name"]),
         shape_zyx, offset_zyx)
    def _init_skeletons_instance(self):
        """Create the key-value instances that skeleton/mesh output is written to.

        An instance is created for each requested output type
        ("neutube-skeleton" and/or "mesh").  Raises RuntimeError when
        ``is_node_locked`` reports the configured node as locked.
        """
        dvid_info = self.config_data["dvid-info"]
        options = self.config_data["options"]

        node_locked = is_node_locked(dvid_info["dvid"]["server"], dvid_info["dvid"]["uuid"])
        if node_locked:
            raise RuntimeError(f"Can't write skeletons/meshes: The node you specified ({dvid_info['dvid']['server']} / {dvid_info['dvid']['uuid']}) is locked.")

        node_service = retrieve_node_service(
            dvid_info["dvid"]["server"],
            dvid_info["dvid"]["uuid"],
            options["resource-server"],
            options["resource-port"],
        )

        # (requested output type, config key that names its destination instance)
        destinations = (
            ("neutube-skeleton", "skeletons-destination"),
            ("mesh", "meshes-destination"),
        )
        for output_type, destination_key in destinations:
            if output_type in options["output-types"]:
                node_service.create_keyvalue(dvid_info["dvid"][destination_key])
    def _init_meshes_instances(self):
        """Create one key-value instance per mesh simplification ratio.

        The created instance names are collected in ``self.mesh_instances``.
        With a single ratio the configured destination name is used as-is;
        with several, each name gets a ``_dec<ratio>`` suffix.  Raises
        RuntimeError when ``is_node_locked`` reports the node as locked.
        """
        dvid_info = self.config_data["dvid-info"]
        options = self.config_data["options"]

        node_locked = is_node_locked(dvid_info["dvid"]["server"], dvid_info["dvid"]["uuid"])
        if node_locked:
            raise RuntimeError(f"Can't write meshes: The node you specified ({dvid_info['dvid']['server']} / {dvid_info['dvid']['uuid']}) is locked.")

        node_service = retrieve_node_service(
            dvid_info["dvid"]["server"],
            dvid_info["dvid"]["uuid"],
            options["resource-server"],
            options["resource-port"],
        )

        self.mesh_instances = []
        ratios = self.config_data["mesh-config"]["simplify-ratios"]
        for simplification_ratio in ratios:
            base_name = dvid_info["dvid"]["meshes-destination"]
            # only disambiguate the names when more than one ratio is requested
            if len(ratios) > 1:
                instance_name = base_name + f"_dec{simplification_ratio:.2f}"
            else:
                instance_name = base_name

            node_service.create_keyvalue(instance_name)
            self.mesh_instances.append(instance_name)
                def get_seg():
                    """Fetch the label volume for ``subvolume`` (expanded by ``border``).

                    Reads ``(size_z, size_y, size_x)`` voxels of the instance
                    named by ``pdconf["segmentation-name"]``, starting
                    ``border`` voxels before the box's start corner.  All names
                    are taken from the enclosing scope.  Returns whatever
                    ``get_labels3D`` returns.
                    """
                    node_service = retrieve_node_service(
                        pdconf["dvid-server"], pdconf["uuid"], resource_server,
                        resource_port)

                    # Note: libdvid uses zyx order for python functions
                    shape_zyx = (size_z, size_y, size_x)
                    # retrieve data from box start position; the z coordinate
                    # must be the start corner (z1) like y1/x1 -- it previously
                    # used the end corner z2
                    offset_zyx = (subvolume.box.z1 - border,
                                  subvolume.box.y1 - border,
                                  subvolume.box.x1 - border)

                    if resource_server != "":
                        # As with the other DVID reads in this file, disable
                        # throttling when a resource server is configured (the
                        # two branches used to be identical).
                        return node_service.get_labels3D(
                            str(pdconf["segmentation-name"]),
                            shape_zyx,
                            offset_zyx,
                            throttle=False)
                    return node_service.get_labels3D(
                        str(pdconf["segmentation-name"]), shape_zyx,
                        offset_zyx)
        def retrieveslices(blknum):
            """Fetch a one-block-thick slab of ``grayname`` for block index ``blknum``.

            ``axis`` selects the slab orientation: "xy" is one block thick in
            z, "xz" one block thick in y, anything else one block thick in x.
            The slab spans the full min..max block range in the other two
            dimensions.  Returns ``(blknum, vol)`` where ``vol`` has been
            transposed so the thin dimension comes first.
            """
            # grab slice with 3d volume call
            node_service = retrieve_node_service(server, uuid, resource_server, resource_port)

            # Note: libdvid uses zyx order for python functions
            if axis == "xy":
                shape_zyx = ( BLKSIZE, (ymax+1)*BLKSIZE-ymin*BLKSIZE, (xmax+1)*BLKSIZE-xmin*BLKSIZE )
                offset_zyx = (blknum*BLKSIZE, ymin*BLKSIZE, xmin*BLKSIZE)
                reorder = None          # already z-first
            elif axis == "xz":
                shape_zyx = ( (zmax+1)*BLKSIZE-zmin*BLKSIZE, BLKSIZE, (xmax+1)*BLKSIZE-xmin*BLKSIZE )
                offset_zyx = (zmin*BLKSIZE, blknum*BLKSIZE, xmin*BLKSIZE)
                reorder = (1,0,2)       # zyx -> yzx
            else:
                shape_zyx = ( (zmax+1)*BLKSIZE-zmin*BLKSIZE, (ymax+1)*BLKSIZE-ymin*BLKSIZE, BLKSIZE )
                offset_zyx = ( zmin*BLKSIZE, ymin*BLKSIZE, blknum*BLKSIZE )
                reorder = (2,0,1)       # zyx -> xzy

            # throttle=False is only passed when a resource server is configured
            request_options = {"throttle": False} if resource_server != "" else {}
            vol = node_service.get_gray3D( str(grayname), shape_zyx, offset_zyx, **request_options )
            if reorder is not None:
                vol = vol.transpose(reorder)

            return (blknum, vol)
    def _init_skeletons_instance(self):
        """Create the key-value instances that skeleton/mesh output is written to.

        An instance is created for each requested output type
        ("neutube-skeleton" and/or "mesh").  Raises RuntimeError when
        ``is_node_locked`` reports the configured node as locked.
        """
        dvid_info = self.config_data["dvid-info"]
        options = self.config_data["options"]

        node_locked = is_node_locked(dvid_info["dvid"]["server"],
                                     dvid_info["dvid"]["uuid"])
        if node_locked:
            raise RuntimeError(
                f"Can't write skeletons/meshes: The node you specified ({dvid_info['dvid']['server']} / {dvid_info['dvid']['uuid']}) is locked."
            )

        node_service = retrieve_node_service(
            dvid_info["dvid"]["server"],
            dvid_info["dvid"]["uuid"],
            options["resource-server"],
            options["resource-port"],
        )

        # (requested output type, config key that names its destination instance)
        destinations = (
            ("neutube-skeleton", "skeletons-destination"),
            ("mesh", "meshes-destination"),
        )
        for output_type, destination_key in destinations:
            if output_type in options["output-types"]:
                node_service.create_keyvalue(
                    dvid_info["dvid"][destination_key])
                def retrievedata(coord):
                    """Fetch one sub-volume of ``prevsource`` from DVID.

                    ``coord`` is ``(xiter, yiter, ziter)``.  The request is two
                    blocks deep in z and y and ``maxxrun`` blocks long in x.
                    Returns ``(coord, volume)``.
                    """
                    xiter, yiter, ziter = coord
                    node_service = retrieve_node_service(
                        server, uuid, resource_server, resource_port)

                    # libdvid takes shape and offset in (z, y, x) order
                    double_blk = BLKSIZE * 2
                    shape_zyx = (double_blk, double_blk, maxxrun * BLKSIZE)
                    offset_zyx = (ziter * double_blk, yiter * double_blk,
                                  xiter * BLKSIZE * maxxrun)

                    # same call signature for both readers; only the method differs
                    if islabelblk:
                        fetch = node_service.get_labels3D
                    else:
                        fetch = node_service.get_gray3D
                    vol_zyx = fetch(str(prevsource),
                                    shape_zyx,
                                    offset_zyx,
                                    throttle=False)
                    return (coord, vol_zyx)
    def _init_meshes_instances(self):
        """Create one key-value instance per mesh simplification ratio.

        The created instance names are collected in ``self.mesh_instances``.
        With a single ratio the configured destination name is used as-is;
        with several, each name gets a ``_dec<ratio>`` suffix.  Raises
        RuntimeError when ``is_node_locked`` reports the node as locked.
        """
        dvid_info = self.config_data["dvid-info"]
        options = self.config_data["options"]

        node_locked = is_node_locked(dvid_info["dvid"]["server"],
                                     dvid_info["dvid"]["uuid"])
        if node_locked:
            raise RuntimeError(
                f"Can't write meshes: The node you specified ({dvid_info['dvid']['server']} / {dvid_info['dvid']['uuid']}) is locked."
            )

        node_service = retrieve_node_service(
            dvid_info["dvid"]["server"],
            dvid_info["dvid"]["uuid"],
            options["resource-server"],
            options["resource-port"],
        )

        self.mesh_instances = []
        ratios = self.config_data["mesh-config"]["simplify-ratios"]
        for simplification_ratio in ratios:
            base_name = dvid_info["dvid"]["meshes-destination"]
            # only disambiguate the names when more than one ratio is requested
            if len(ratios) > 1:
                instance_name = base_name + f"_dec{simplification_ratio:.2f}"
            else:
                instance_name = base_name

            node_service.create_keyvalue(instance_name)
            self.mesh_instances.append(instance_name)
    def execute(self):
        """Build a downsampled pyramid of the configured DVID source instance.

        The source's metadata is read over HTTP to get its block extents,
        block size and whether it is a ``labelblk`` instance.  Then, for each
        level from 1 to ``maxlevel``, a new instance named ``<source>_<level>``
        is created and filled: Spark workers read the previous level in
        2x2-block (z, y) strips, halve the data along every axis, and write
        the result to the new instance.

        Configuration is read from ``self.config_data`` ("dvid-info" and
        "options"); ``self.sc`` is used to parallelize the work.
        """
        server = str(self.config_data["dvid-info"]["dvid-server"])
        uuid = str(self.config_data["dvid-info"]["uuid"])
        source = str(self.config_data["dvid-info"]["source"])

        session = default_dvid_session()
        # determine grayscale blk extants
        if not server.startswith("http://"):
            server = "http://" + server

        req = session.get(server + "/api/node/" + uuid + "/" + source +
                          "/info")
        sourcemeta = req.json()

        # xmin, ymin, zmin not being used explicitly yet
        #xmin, ymin, zmin = sourcemeta["Extended"]["MinIndex"]
        xmin, ymin, zmin = 0, 0, 0
        xmax, ymax, zmax = sourcemeta["Extended"]["MaxIndex"]

        islabelblk = False
        datatype = sourcemeta["Extended"]["Values"][0]["Label"]
        if str(datatype) == "labelblk":
            islabelblk = True

        # !! always assume isotropic block
        BLKSIZE = int(sourcemeta["Extended"]["BlockSize"][0])

        maxdim = max(xmax, ymax, zmax)
        # build pyramid until BLKSIZE * 4
        import math
        maxlevel = int(math.log(maxdim + 1) / math.log(2)) - 2

        # assume 0,0,0 start for now
        xspan, yspan, zspan = xmax + 1, ymax + 1, zmax + 1

        xrunlimit = self.config_data["options"]["xrunlimit"]
        xrunlimit = xrunlimit + (xrunlimit % 2)  # should be even

        currsource = source

        # create source pyramid and append _level to name
        for level in range(1, maxlevel + 1):
            node_service = retrieve_node_service(server, uuid,
                                                 self.resource_server,
                                                 self.resource_port,
                                                 self.APPNAME)
            # !! limit to grayscale now
            prevsource = currsource
            currsource = source + ("_%d" % level)

            # TODO: set voxel resolution to base dataset (not too important in current workflows)
            if islabelblk:
                node_service.create_labelblk(currsource, None, BLKSIZE)
            else:
                node_service.create_grayscale8(currsource, BLKSIZE)
                # set extents for new volume (only need to do for grayscale)
                newsourceext = {}
                newsourceext["MinPoint"] = [0, 0, 0]  # for now no offset
                newsourceext["MaxPoint"] = [
                    ((xspan - 1) // 2 + 1) * BLKSIZE - 1,
                    ((yspan - 1) // 2 + 1) * BLKSIZE - 1,
                    ((zspan - 1) // 2 + 1) * BLKSIZE - 1
                ]
                session.post(server + "/api/node/" + uuid + "/" + currsource +
                             "/extents",
                             json=newsourceext)

            # determine number of requests
            # maxxrun: number of source blocks read along x per request; kept
            # even so that it halves to a whole number of destination blocks
            maxxrun = xspan
            if xrunlimit > 0 and xrunlimit < xspan:
                maxxrun = xrunlimit
            if maxxrun % 2:
                maxxrun += 1

            # number of requests along x (rounded up); y and z requests each
            # cover two source blocks
            xsize = xspan // maxxrun
            if xspan % maxxrun:
                xsize += 1
            ysize = (yspan + 1) // 2
            zsize = (zspan + 1) // 2
            resource_server = self.resource_server
            resource_port = self.resource_port

            for ziter2 in range(0, zsize, 2):
                # one batch of work items per pair of z positions
                workqueue = []
                for yiter in range(0, ysize):
                    for xiter in range(0, xsize):
                        for miniz in range(ziter2, ziter2 + 2):
                            workqueue.append((xiter, yiter, miniz))

                # parallelize jobs
                pieces = self.sc.parallelize(workqueue, len(workqueue))

                # grab data corresponding to xrun
                def retrievedata(coord):
                    """Read one strip of ``prevsource``; returns ``(coord, volume)``."""
                    xiter, yiter, ziter = coord
                    node_service = retrieve_node_service(
                        server, uuid, resource_server, resource_port)

                    # libdvid takes shape and offset in (z, y, x) order
                    shape_zyx = (BLKSIZE * 2, BLKSIZE * 2, maxxrun * BLKSIZE)
                    offset_zyx = (ziter * BLKSIZE * 2, yiter * BLKSIZE * 2,
                                  xiter * BLKSIZE * maxxrun)
                    vol_zyx = None
                    if islabelblk:
                        vol_zyx = node_service.get_labels3D(str(prevsource),
                                                            shape_zyx,
                                                            offset_zyx,
                                                            throttle=False)
                    else:
                        vol_zyx = node_service.get_gray3D(str(prevsource),
                                                          shape_zyx,
                                                          offset_zyx,
                                                          throttle=False)

                    return (coord, vol_zyx)

                volumedata = pieces.map(retrievedata)

                # downsample gray data
                def downsamplegray(vdata):
                    """Halve a volume along every axis with ``ndimage.zoom``."""
                    coords, data = vdata
                    from scipy import ndimage
                    # ndimage.zoom is the public name of the function that the
                    # deprecated ndimage.interpolation namespace re-exports
                    data = ndimage.zoom(data, 0.5)
                    return (coords, data)

                # downsample label data (TODO: make faster)
                def downsamplelabels(vdata):
                    """Halve a label volume by taking the most frequent label
                    of each 2x2x2 neighbourhood."""
                    coords, data = vdata
                    import numpy
                    zmax, ymax, xmax = data.shape
                    data2 = numpy.zeros(
                        (zmax // 2, ymax // 2, xmax // 2)).astype(numpy.uint64)

                    for ziter in range(0, zmax, 2):
                        for yiter in range(0, ymax, 2):
                            for xiter in range(0, xmax, 2):
                                v1 = data[ziter, yiter, xiter]
                                v2 = data[ziter, yiter, xiter + 1]
                                v3 = data[ziter, yiter + 1, xiter]
                                v4 = data[ziter, yiter + 1, xiter + 1]
                                v5 = data[ziter + 1, yiter, xiter]
                                v6 = data[ziter + 1, yiter, xiter + 1]
                                v7 = data[ziter + 1, yiter + 1, xiter]
                                v8 = data[ziter + 1, yiter + 1, xiter + 1]

                                # count occurrences of each label among the
                                # eight voxels: zero v2..v8, count v1 once,
                                # then add one for each of v2..v8
                                freqs = {}
                                freqs[v2] = 0
                                freqs[v3] = 0
                                freqs[v4] = 0
                                freqs[v5] = 0
                                freqs[v6] = 0
                                freqs[v7] = 0
                                freqs[v8] = 0

                                freqs[v1] = 1
                                freqs[v2] += 1
                                freqs[v3] += 1
                                freqs[v4] += 1
                                freqs[v5] += 1
                                freqs[v6] += 1
                                freqs[v7] += 1
                                freqs[v8] += 1

                                # pick the label with the highest count; on a
                                # tie the first one in dict order wins
                                maxval = 0
                                freqkey = 0
                                for key, val in freqs.items():
                                    if val > maxval:
                                        maxval = val
                                        freqkey = key

                                data2[ziter // 2, yiter // 2,
                                      xiter // 2] = freqkey

                    return (coords, data2)

                downsampleddata = None
                if islabelblk:
                    downsampleddata = volumedata.map(downsamplelabels)
                else:
                    downsampleddata = volumedata.map(downsamplegray)

                appname = self.APPNAME
                delimiter = self.config_data["options"]["blankdelimiter"]

                # write results ?!
                def write2dvid(vdata):
                    """Write one downsampled strip to ``currsource``.

                    Label data is written with one ``put_labels3D`` call unless
                    it is all zeros.  Other data is cut into blocks along x;
                    blocks made up solely of ``delimiter`` are skipped and each
                    run of consecutive remaining blocks is POSTed at once.
                    """
                    from libdvid import ConnectionMethod
                    import numpy
                    node_service = retrieve_node_service(
                        server, uuid, resource_server, resource_port, appname)

                    coords, data = vdata
                    xiter, yiter, ziter = coords

                    # set block indices
                    zbindex = ziter
                    ybindex = yiter

                    _, _, xsize = data.shape
                    # the strip was halved along x, so its first block index
                    # is halved too
                    xbindex = xiter * maxxrun // 2

                    if islabelblk:
                        vals = numpy.unique(data)
                        # TODO: ignore blank blocks within an x line
                        if not (len(vals) == 1 and vals[0] == 0):
                            if resource_server != "":
                                node_service.put_labels3D(
                                    currsource,
                                    data,
                                    (zbindex * BLKSIZE, ybindex * BLKSIZE,
                                     xbindex * BLKSIZE),
                                    compress=True,
                                    throttle=False)
                            else:
                                node_service.put_labels3D(
                                    currsource,
                                    data,
                                    (zbindex * BLKSIZE, ybindex * BLKSIZE,
                                     xbindex * BLKSIZE),
                                    compress=True)
                    else:
                        # Serialized blocks of the run being accumulated.
                        # They are bytes (ndarray.tobytes() returns bytes), so
                        # they are joined with b"" -- appending them to a str
                        # raises TypeError on Python 3.
                        blockchunks = []

                        for iterx in range(0, xsize, BLKSIZE):
                            block = data[:, :, iterx:iterx + BLKSIZE]
                            vals = numpy.unique(block)
                            if len(vals) == 1 and vals[0] == delimiter:
                                # blank block: it ends the current run (if any)
                                # and is not written
                                if blockchunks:
                                    node_service.custom_request(
                                        str((currsource +
                                             "/blocks/%d_%d_%d/%d") %
                                            (xbindex, ybindex, zbindex,
                                             len(blockchunks))),
                                        b"".join(blockchunks),
                                        ConnectionMethod.POST)
                                    blockchunks = []

                            else:
                                if not blockchunks:
                                    # first block of a new run: remember where
                                    # the run starts
                                    xbindex = xiter * maxxrun // 2 + iterx // BLKSIZE

                                blockchunks.append(block.tobytes())

                        # write-out leftover blocks
                        if blockchunks:
                            node_service.custom_request(
                                str((currsource + "/blocks/%d_%d_%d/%d") %
                                    (xbindex, ybindex, zbindex,
                                     len(blockchunks))),
                                b"".join(blockchunks), ConnectionMethod.POST)

                downsampleddata.foreach(write2dvid)

            # adjust max coordinate for new level
            # NOTE(review): the extents above treat (span - 1) // 2 + 1 as the
            # halved block count, but the value stored here is (span - 1) // 2
            # -- confirm whether the next level's span should include the + 1.
            xspan = (xspan - 1) // 2
            yspan = (yspan - 1) // 2
            zspan = (zspan - 1) // 2
Beispiel #15
0
        def retrieveslices(blknum):
            """Fetch the one-block-thick grayscale slab at block index `blknum`.

            The slab is cut perpendicular to the slicing axis selected by the
            enclosing scope's `axis` ("xy", "xz", anything else is treated as
            the remaining orientation) and spans the full min..max block range
            on the other two axes.

            Returns:
                (blknum, vol): `vol` is the volume returned by
                `get_gray3D`, transposed so that the slicing axis comes first.
            """
            node_service = retrieve_node_service(server, uuid, resource_server,
                                                 resource_port)

            # Throttling is only switched off when a resource server is set.
            fetch_options = {}
            if resource_server != "":
                fetch_options["throttle"] = False

            # Note: libdvid uses zyx order for python functions
            if axis == "xy":
                # slab is thin in z; already slice-major, no reordering needed
                shape_zyx = (BLKSIZE,
                             (ymax + 1) * BLKSIZE - ymin * BLKSIZE,
                             (xmax + 1) * BLKSIZE - xmin * BLKSIZE)
                offset_zyx = (blknum * BLKSIZE, ymin * BLKSIZE, xmin * BLKSIZE)
                axes_order = None
            elif axis == "xz":
                # slab is thin in y; bring y to the front -> (y, z, x)
                shape_zyx = ((zmax + 1) * BLKSIZE - zmin * BLKSIZE,
                             BLKSIZE,
                             (xmax + 1) * BLKSIZE - xmin * BLKSIZE)
                offset_zyx = (zmin * BLKSIZE, blknum * BLKSIZE, xmin * BLKSIZE)
                axes_order = (1, 0, 2)
            else:
                # slab is thin in x; bring x to the front -> (x, z, y)
                shape_zyx = ((zmax + 1) * BLKSIZE - zmin * BLKSIZE,
                             (ymax + 1) * BLKSIZE - ymin * BLKSIZE,
                             BLKSIZE)
                offset_zyx = (zmin * BLKSIZE, ymin * BLKSIZE, blknum * BLKSIZE)
                axes_order = (2, 0, 1)

            vol_zyx = node_service.get_gray3D(str(grayname), shape_zyx,
                                              offset_zyx, **fetch_options)

            if axes_order is None:
                vol = vol_zyx
            else:
                vol = vol_zyx.transpose(axes_order)

            return (blknum, vol)
    def execute(self):
        """Build a probabilistic region-adjacency graph for a segmentation.

        Steps performed:
          1. Partition the configured ROI into subvolumes and process them
             in `num_iters` batches.
          2. For each batch: fetch grayscale, run the configured voxel
             predictor, fetch the stored segmentation (with a 1-pixel border)
             and extract NeuroProof edge/vertex features.
          3. Union the per-batch features, merge features that share a key,
             join each edge with the features of its two vertices and compute
             an edge probability with the configured classifier.
          4. Collect the result on the driver into a graph dict with
             "Vertices" and "Edges", create the graph datatype in DVID and,
             if "output-file" is set, write the edge list as JSON.

        Configuration is read from `self.config_data` ("dvid-info" and
        "options" sections).
        """
        # TODO: handle 64 bit segmentation

        import importlib
        import json

        from pyspark import SparkContext
        from pyspark import StorageLevel
        from DVIDSparkServices.reconutils.Segmentor import Segmentor

        self.chunksize = self.config_data["options"]["chunk-size"]

        # create datatype in the beginning
        node_service = retrieve_node_service(self.config_data["dvid-info"]["dvid-server"],
                self.config_data["dvid-info"]["uuid"], self.resource_server, self.resource_port)

        # grab ROI subvolumes and find neighbors
        distsubvolumes = self.sparkdvid_context.parallelize_roi(
                self.config_data["dvid-info"]["roi"],
                self.chunksize, self.contextbuffer, True)

        # copied to a local so the closures below do not capture `self`
        contextbuffer = self.contextbuffer
        # do not recompute ROI for each iteration
        distsubvolumes.persist()

        # instantiate the voxel prediction plugin ("pkg.module.function")
        full_function_name = self.config_data["options"]["predict-voxels"]["function"]
        module_name, _, function_name = full_function_name.rpartition('.')
        module = importlib.import_module(module_name)

        parameters = self.config_data["options"]["predict-voxels"]["parameters"]
        vprediction_function = partial( getattr(module, function_name), **parameters )

        # determine number of iterations (ceil(num_parts / iteration_size));
        # an iteration size of 0 means "everything in one iteration"
        num_parts = len(distsubvolumes.collect())
        iteration_size = self.config_data["options"]["iteration-size"]
        if iteration_size == 0:
            iteration_size = num_parts

        num_iters = num_parts // iteration_size
        if num_parts % iteration_size > 0:
            num_iters += 1

        feature_chunk_list = []

        # enable checkpointing if not empty
        checkpoint_dir = self.config_data["options"]["checkpoint-dir"]

        # enable rollback of iterations if necessary
        rollback = bool(self.config_data["options"]["checkpoint"])

        for iternum in range(0, num_iters):
            # it might make sense to randomly map partitions for selection
            # in case something pathological is happening -- if original partitioner
            # is randomish than this should be fine
            #
            # `iternum` is bound as a default argument: Spark evaluates lazily,
            # so a plain closure could observe a later value of the loop variable.
            def subset_part(sid_data, iternum=iternum):
                (s_id, _data) = sid_data
                return (s_id % num_iters) == iternum

            # should preserve partitioner
            distsubvolumes_part = distsubvolumes.filter(subset_part)

            # get grayscale chunks with specified overlap
            gray_chunks = self.sparkdvid_context.map_grayscale8(distsubvolumes_part,
                    self.config_data["dvid-info"]["grayscale"])

            pred_checkpoint_dir = ""
            if checkpoint_dir:
                pred_checkpoint_dir = checkpoint_dir + "/prediter-" + str(iternum)

            # For now, we always read predictions if available, and always write them if not.
            # TODO: Add config settings to control read/write behavior.
            @Segmentor.use_block_cache(pred_checkpoint_dir, allow_read=True, allow_write=True)
            def predict_voxels( sv_gray ):
                (_subvolume, gray) = sv_gray
                return vprediction_function(gray, None)

            vox_preds = gray_chunks.values().map( predict_voxels ) # predictions only
            vox_preds = distsubvolumes_part.values().zip( vox_preds ) # (subvolume, predictions)

            pdconf = self.config_data["dvid-info"]
            resource_server = self.resource_server
            resource_port = self.resource_port

            # retrieve segmentation and generate features
            def generate_features(vox_pred):
                """Return [((n1, n2), (num_chans, feature_dict)), ...] for one subvolume.

                Edges are keyed by (min_id, max_id); vertices by (id, -1).
                """
                import numpy
                (subvolume, pred) = vox_pred
                pred = numpy.ascontiguousarray(pred)

                # extract labelblks
                border = 1 # only one pixel needed to find edges

                # get sizes of box
                size_z = subvolume.box.z2 + 2*border - subvolume.box.z1
                size_y = subvolume.box.y2 + 2*border - subvolume.box.y1
                size_x = subvolume.box.x2 + 2*border - subvolume.box.x1

                # retrieve data from box start position considering border
                # !! technically ROI is not respected but unwritten segmentation will be ignored since it will have 0-valued pixels.
                @auto_retry(3, pause_between_tries=60.0, logging_name=__name__)
                def get_seg():
                    node_service = retrieve_node_service(pdconf["dvid-server"],
                            pdconf["uuid"], resource_server, resource_port)
                    # Note: libdvid uses zyx order for python functions.
                    # The fetch starts one border before the box origin on every
                    # axis (z1/y1/x1) so that it matches size_z/size_y/size_x.
                    # NOTE(review): the original had two identical branches on
                    # `resource_server != ""`; a throttle override may have been
                    # intended for one of them -- confirm.
                    return node_service.get_labels3D(str(pdconf["segmentation-name"]),
                        (size_z, size_y, size_x),
                        (subvolume.box.z1-border, subvolume.box.y1-border, subvolume.box.x1-border))

                initial_seg = get_seg()

                # !!! potentially dangerous but needed for now
                initial_seg = initial_seg.astype(numpy.uint32)

                # Crop the prediction context down to the 1-pixel border.
                # A trim of 0 must keep the full extent ("0:-0" would be empty).
                trim = contextbuffer - border
                stop = -trim if trim != 0 else None
                pred2 = pred[trim:stop, trim:stop, trim:stop, :].copy()
                z,y,x,num_chans = pred2.shape

                # call neuroproof and generate features
                from neuroproof import FocusedProofreading
                # "edges": [ edge ] where edge = [node1, node2, edgesize, all features...]
                # "vertices": [vertex ] where vertex = [id, size, all features...]
                features = FocusedProofreading.extract_features(initial_seg, pred2)

                element_list = []
                # iterate edges and create ((node1, node2), features)
                if "Edges" in features:
                    # could have only one vertex in a partition and no edges
                    for edge in features["Edges"]:
                        n1 = edge["Id1"]
                        n2 = edge["Id2"]
                        # shift edge locations by the subvolume origin
                        edge["Loc1"][0] += subvolume.box.x1
                        edge["Loc1"][1] += subvolume.box.y1
                        edge["Loc1"][2] += subvolume.box.z1

                        edge["Loc2"][0] += subvolume.box.x1
                        edge["Loc2"][1] += subvolume.box.y1
                        edge["Loc2"][2] += subvolume.box.z1

                        if n1 > n2:
                            n1, n2 = n2, n1
                        element_list.append(((n1,n2), (num_chans, edge)))

                for node in features["Vertices"]:
                    n1 = node["Id"]
                    element_list.append(((n1,-1), (num_chans, node)))

                return element_list

            features = vox_preds.flatMap(generate_features)

            # retrieve previously computed RDD or save current RDD
            if checkpoint_dir != "":
                features = self.sparkdvid_context.checkpointRDD(features,
                        checkpoint_dir + "/featureiter-" + str(iternum), rollback)

            # any forced persistence will result in costly
            # pickling, lz4 compressed numpy array should help
            features.persist(StorageLevel.MEMORY_AND_DISK_SER)

            feature_chunk_list.append(features)

        features = feature_chunk_list[0]

        for chunk in feature_chunk_list[1:]:
            # this could cause a serialization problems if there are a large number of iterations (>100)
            features = features.union(chunk)

        # grab num channels from boundary prediction
        features.persist(StorageLevel.MEMORY_AND_DISK_SER)
        _, (num_channels, _) = features.first()

        # remove num channels from features
        def remove_num_channels(featurepair):
            _num_chans, feature = featurepair
            return feature
        features = features.mapValues(remove_num_channels)

        # merge edge and node features -- does not require reading classifier
        # node features are encoded as (vertex id, -1)
        def combine_edge_features(element1, element2):
            from neuroproof import FocusedProofreading

            if "Id2" in element1:
                # are edges
                return FocusedProofreading.combine_edge_features( json.dumps(element1, cls=NumpyConvertingEncoder),
                                                                  json.dumps(element2, cls=NumpyConvertingEncoder),
                                                                  num_channels )
            else:
                # are vertices
                return FocusedProofreading.combine_vertex_features( json.dumps(element1, cls=NumpyConvertingEncoder),
                                                                    json.dumps(element2, cls=NumpyConvertingEncoder),
                                                                    num_channels )

        features_combined = features.reduceByKey(combine_edge_features)

        #features_combined.persist()
        # TODO: option to serialize features to enable other analyses

        # join node and edge probs
        def retrieve_nodes(val):
            (n1,n2),features = val
            return n2 == -1

        def retrieve_edges(val):
            (n1,n2),features = val
            return n2 != -1

        node_features = features_combined.filter(retrieve_nodes)
        edge_features = features_combined.filter(retrieve_edges)

        # re-key by a single vertex id so edges can be joined with vertices
        node_features = node_features.map(lambda x: (x[0][0], x[1]))
        edge1_features = edge_features.map(lambda x: (x[0][0], x[1]))
        edge2_features = edge_features.map(lambda x: (x[0][1], x[1]))

        # multiple edges with the same key
        edge1_node_features = edge1_features.leftOuterJoin(node_features)
        edge2_node_features = edge2_features.leftOuterJoin(node_features)

        def reset_edgekey(val):
            key, (edge, node) = val
            n1 = edge["Id1"]
            n2 = edge["Id2"]
            if n1 > n2:
                n1, n2 = n2, n1
            return ((n1,n2), (edge, node))

        edge1_node_features = edge1_node_features.map(reset_edgekey)
        edge2_node_features = edge2_node_features.map(reset_edgekey)

        edge_node_features = edge1_node_features.join(edge2_node_features)

        # generate prob for each edge (JSON: body sizes, edge list with prob)
        classifierlocation = self.config_data["options"]["segment-classifier"]
        def compute_prob(edge_node_features):
            from neuroproof import FocusedProofreading
            classifier = FocusedProofreading.ComputeProb(str(classifierlocation), num_channels)

            res_list = []
            for edge_node_edge_node in edge_node_features:
                edge_key, ((edge, node1), (edge_dummy, node2)) = edge_node_edge_node
                weight = classifier.compute_prob( json.dumps(edge, cls=NumpyConvertingEncoder),
                                                  json.dumps(node1, cls=NumpyConvertingEncoder),
                                                  json.dumps(node2, cls=NumpyConvertingEncoder) )
                # node1, node2
                res_list.append((int(node1["Id"]),int(node2["Id"]),int(node1["Weight"]),int(node2["Weight"]),int(edge["Weight"]),weight,edge["Loc1"], edge["Loc2"]))

            return res_list

        # avoid loading large classifier for each small edge
        allprobs = edge_node_features.mapPartitions(compute_prob)

        # collect all edges and send to DVID (TODO: add option to dump to disk)
        allprobs_combined = allprobs.collect()

        bodyinfo = {}
        edges = []

        for edge_info in allprobs_combined:
            node1, node2, node1_size, node2_size, edge_size, weight, loc1, loc2 = edge_info
            bodyinfo[node1] = node1_size
            bodyinfo[node2] = node2_size
            edges.append({"Id1": node1, "Id2": node2, "Weight": weight, "Loc1": loc1, "Loc2": loc2})

        bodies = [{"Id": key, "Weight": val} for (key, val) in bodyinfo.items()]

        graph = {}
        graph["Vertices"] = bodies
        graph["Edges"] = edges

        SAVE_TO_FILE = False
        if SAVE_TO_FILE:
            graph_filepath = '/tmp/graph-output.json'
            with open(graph_filepath, 'w') as f:
                self.workflow_entry_exit_printer.warn("Writing graph json to file:\n{}".format(graph_filepath))
                json.dump(graph, f, indent=4, separators=(',', ': '), cls=NumpyConvertingEncoder)
            self.workflow_entry_exit_printer.write_data("Wrote graph to disk") # write to logger after spark job

        UPLOAD_TO_DVID = True
        if UPLOAD_TO_DVID:
            # load entire graph into DVID
            # NOTE(review): only the graph datatype is created here; the POST of
            # the graph contents below is commented out -- confirm this is intended.
            node_service.create_graph(str(self.config_data["dvid-info"]["graph-name"]))
            #server = str(self.config_data["dvid-info"]["dvid-server"])
            #if not server.startswith("http://"):
            #    server = "http://" + server
            #session = default_dvid_session()
            #session.post(server + "/api/node/" + str(self.config_data["dvid-info"]["uuid"]) + "/" + str(self.config_data["dvid-info"]["graph-name"]) + "/subgraph", json=graph)
            #self.workflow_entry_exit_printer.write_data("Wrote DVID graph") # write to logger after spark job


        if self.config_data["options"]["debug"]:
            print("DEBUG:", json.dumps(graph, cls=NumpyConvertingEncoder))

        # write dvid to specified file (if provided)
        if "output-file" in self.config_data["options"] and self.config_data["options"]["output-file"] != "":
            filename = self.config_data["options"]["output-file"]

            edgelist = [
                {"node1": edge["Id1"], "node2": edge["Id2"], "weight": edge["Weight"],
                 "loc1": edge["Loc1"], "loc2": edge["Loc2"]}
                for edge in graph["Edges"]
            ]

            npgraph = {}
            npgraph["edge_list"] = edgelist
            # context manager guarantees the file is flushed and closed
            with open(filename, 'w') as fout:
                fout.write(json.dumps(npgraph, cls=NumpyConvertingEncoder))
                def write2dvid(vdata):
                    """Write one x-line of downsampled data to DVID.

                    Args:
                        vdata: ``(coords, data)`` where ``coords`` unpacks to
                            ``(xiter, yiter, ziter)`` and ``data`` is a 3D
                            array whose last axis is walked in steps of
                            ``BLKSIZE``.

                    For label data the whole array is written with one
                    ``put_labels3D`` call unless it is entirely 0. Otherwise
                    the array is cut into blocks along x; blocks consisting
                    only of ``delimiter`` are skipped and each run of
                    consecutive non-blank blocks is POSTed in one request.
                    """
                    from libdvid import ConnectionMethod
                    import numpy
                    node_service = retrieve_node_service(
                        server, uuid, resource_server, resource_port, appname)

                    coords, data = vdata
                    xiter, yiter, ziter = coords

                    # set block indices
                    zbindex = ziter
                    ybindex = yiter

                    # unpacking also enforces that data is 3-dimensional
                    _, _, xsize = data.shape
                    # block index of the first block in this x-line
                    line_start = xiter * maxxrun // 2

                    if islabelblk:
                        vals = numpy.unique(data)
                        # TODO: ignore blank blocks within an x line
                        if len(vals) == 1 and vals[0] == 0:
                            return

                        offset_zyx = (zbindex * BLKSIZE, ybindex * BLKSIZE,
                                      line_start * BLKSIZE)
                        if resource_server != "":
                            node_service.put_labels3D(
                                currsource, data, offset_zyx,
                                compress=True, throttle=False)
                        else:
                            node_service.put_labels3D(
                                currsource, data, offset_zyx,
                                compress=True)
                        return

                    def flush(first_block, chunks):
                        # POST a run of consecutive blocks starting at first_block
                        node_service.custom_request(
                            str((currsource + "/blocks/%d_%d_%d/%d") %
                                (first_block, ybindex, zbindex, len(chunks))),
                            b"".join(chunks), ConnectionMethod.POST)

                    # Raw bytes of the current run of non-blank blocks.
                    # Kept as a list of bytes objects: concatenating
                    # ndarray.tobytes() onto a str fails on Python 3.
                    pending = []
                    run_start = line_start

                    for iterx in range(0, xsize, BLKSIZE):
                        block = data[:, :, iterx:iterx + BLKSIZE]
                        vals = numpy.unique(block)
                        if len(vals) == 1 and vals[0] == delimiter:
                            # blank block: it terminates the current run, if any
                            if pending:
                                flush(run_start, pending)
                                pending = []
                        else:
                            if not pending:
                                # first block of a new run
                                run_start = line_start + iterx // BLKSIZE
                            pending.append(block.tobytes())

                    # write-out leftover blocks
                    if pending:
                        flush(run_start, pending)
Beispiel #18
0
        def write_blocks(part_vol):
            """Write the non-blank x-runs of one partition's volume to DVID.

            Args:
                part_vol: ``(part, data)``; ``part`` supplies ``get_offset()``
                    and ``get_reloffset()`` and ``data`` is a 3D array whose
                    last axis must be a multiple of ``blksize``.

            Raises:
                ValueError: if the last axis of ``data`` is not block aligned.
            """
            log = logging.getLogger(__name__)
            part, data = part_vol
            offset = part.get_offset()
            reloffset = part.get_reloffset()
            _, _, x_size = data.shape
            if x_size % blksize != 0:
                # check if padded
                raise ValueError("Data is not block aligned")

            shiftedoffset = (offset.z + reloffset.z,
                             offset.y + reloffset.y,
                             offset.x + reloffset.x)
            log.info("Starting WRITE of partition at: {} size: {}".format(
                shiftedoffset, data.shape))
            node_service = retrieve_node_service(server, uuid, resource_server,
                                                 resource_port, appname)

            def throttle_args():
                # Throttling is disabled when a resource server is configured
                # or the DVID server address starts with http://127.0.0.1.
                if resource_server != "" or dvid_info[
                        "dvid-server"].startswith("http://127.0.0.1"):
                    return {"throttle": False}
                return {}

            # Record the x block index of every block that is not entirely
            # `delimiter` (Z,Y indexes are irrelevant here and fixed at 0).
            block_coords = [
                (0, 0, block_index)
                for block_index, block_x in enumerate(
                    range(0, x_size, blksize))
                if not (data[:, :, block_x:block_x + blksize]
                        == delimiter).all()
            ]

            # Group into runs: [[Z,Y,X1,X2], [Z,Y,X1,X2], ...]
            block_runs = runlength_encode(block_coords, True)

            # Convert stop indexes from inclusive to exclusive
            block_runs[:, -1] += 1

            # Discard Z,Y indexes and convert from indexes to pixels
            ranges = blksize * block_runs[:, 2:4]

            # iterate through contiguous blocks and write to DVID
            # TODO: write compressed data directly into DVID
            for (data_x_start, data_x_end) in ranges:
                with Timer() as copy_timer:
                    datacrop = data[:, :, data_x_start:data_x_end].copy()
                log.info("Copied {}:{} in {:.3f} seconds".format(
                    data_x_start, data_x_end, copy_timer.seconds))

                data_offset_zyx = (shiftedoffset[0], shiftedoffset[1],
                                   shiftedoffset[2] + data_x_start)

                if dataname is not None:
                    with Timer() as put_timer:
                        if israw:
                            log.info("STARTING Put: raw block {}".format(
                                data_offset_zyx))
                            node_service.put_gray3D(dataname,
                                                    datacrop,
                                                    data_offset_zyx,
                                                    compress=False,
                                                    **throttle_args())
                        else:
                            log.info("STARTING Put: labels block {}".format(
                                data_offset_zyx))
                            node_service.put_labels3D(dataname,
                                                      datacrop,
                                                      data_offset_zyx,
                                                      compress=True,
                                                      **throttle_args())
                    log.info("Put block {} in {:.3f} seconds".format(
                        data_offset_zyx, put_timer.seconds))

                if dataname_lossy is not None:
                    log.info(
                        "STARTING Put: lossy block {}".format(data_offset_zyx))
                    with Timer() as put_lossy_timer:
                        node_service.put_gray3D(dataname_lossy,
                                                datacrop,
                                                data_offset_zyx,
                                                compress=False,
                                                **throttle_args())
                    log.info("Put lossy block {} in {:.3f} seconds".format(
                        data_offset_zyx, put_lossy_timer.seconds))
Beispiel #19
0
def neuroproof_agglomerate(grayscale, predictions, supervoxels, classifier, threshold = 0.20, mitochannel = 2):
    """Agglomerate supervoxels with NeuroProof.

    Args:
        grayscale: 3D uint8 (z, y, x) volume -- not used.
        predictions: 4D numpy array (z, y, x, ch).  When it has more than two
            channels, channel 2 and channel ``mitochannel`` are swapped IN
            PLACE so the mito prediction ends up in channel 2.
        supervoxels: 3D numpy label array (z, y, x); cast to uint32 before
            being handed to NeuroProof.
        classifier: mapping with a "path" entry.  If it also contains
            "dvid-server" (and "uuid"), "path" is split on '/' and its first
            two components are passed to ``node_service.get`` to download the
            classifier into a temporary file; otherwise "path" is used as a
            local file name.  The temporary file gets a ".h5" suffix when
            "path" ends with ".h5" and ".xml" otherwise.
        threshold: agglomeration threshold (default 0.20).
        mitochannel: prediction channel holding mito (default 2); ``None``
            leaves the channels untouched.

    Returns:
        The segmentation returned by ``Agglomeration.agglomerate``, or
        ``supervoxels`` itself when it holds at most one distinct label.
    """

    print("neuroproof_agglomerate(): Starting with label data: dtype={}, shape={}".format(str(supervoxels.dtype), supervoxels.shape))

    import numpy
    # return immediately if no segmentation
    if len(numpy.unique(supervoxels)) <= 1:
        return supervoxels

    # imported only after the early return so the trivial case does not
    # require neuroproof to be installed
    from neuroproof import Agglomeration
    import os

    # verify channels
    assert predictions.ndim == 4
    nch = predictions.shape[-1]

    if nch > 2 and mitochannel is not None and mitochannel != 2:
        # Swap along the LAST (channel) axis.  The fancy-indexed right-hand
        # side is a copy, so the two channels are exchanged safely.
        predictions[..., [2, mitochannel]] = predictions[..., [mitochannel, 2]]

    pathname = str(classifier["path"])
    tclassfile = pathname
    remove_classfile = False

    try:
        # write classifier to temporary file if stored on DVID
        if "dvid-server" in classifier:
            # allow user to specify any server and version for the data
            dvidserver = classifier["dvid-server"]
            uuid = classifier["uuid"]

            # extract file and store into temporary location
            node_service = retrieve_node_service(str(dvidserver), str(uuid))
            name_key = pathname.split('/')
            classfile = node_service.get(name_key[0], name_key[1])

            import tempfile
            # keep the extension of the provided file name
            suffix = ".h5" if pathname.endswith('.h5') else ".xml"
            # the payload is binary unless the service hands back text
            mode = 'w' if isinstance(classfile, str) else 'wb'
            with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix, delete=False) as fout:
                tclassfile = fout.name
                remove_classfile = True
                fout.write(classfile)

        # run agglomeration (supervoxels must be 32 uint and predictions must be float32)
        segmentation = Agglomeration.agglomerate(supervoxels.astype(numpy.uint32), predictions.astype(numpy.float32), tclassfile, threshold)
    finally:
        # never leave the downloaded classifier behind, even on failure
        if remove_classfile:
            os.remove(tclassfile)

    return segmentation
    def execute(self):
        """Build a downsampled pyramid of the configured DVID source.

        Reads datatype, block size and maximum block index of
        ``dvid-info/source`` from its ``/info`` endpoint.  For every level a
        new instance ``<source>_<level>`` is created; Spark workers fetch the
        previous level, downsample it by 2 in each dimension (interpolating
        zoom for grayscale, majority vote over 2x2x2 cubes for labelblk) and
        write the result to the new instance.  For grayscale, blocks whose
        voxels all equal ``options/blankdelimiter`` are not written.
        """
        server = str(self.config_data["dvid-info"]["dvid-server"])
        uuid = str(self.config_data["dvid-info"]["uuid"])
        source = str(self.config_data["dvid-info"]["source"])

        session = default_dvid_session()
        # determine grayscale blk extents
        if not server.startswith("http://"):
            server = "http://" + server

        req = session.get(server + "/api/node/" + uuid + "/" + source + "/info")
        sourcemeta = req.json()

        # the minimum index is not used yet: a (0,0,0) start is assumed
        xmax, ymax, zmax = sourcemeta["Extended"]["MaxIndex"]

        datatype = sourcemeta["Extended"]["Values"][0]["Label"]
        islabelblk = str(datatype) == "labelblk"

        # !! always assume isotropic block
        BLKSIZE = int(sourcemeta["Extended"]["BlockSize"][0])

        maxdim = max(xmax,ymax,zmax)
        # build pyramid until BLKSIZE * 4
        import math
        maxlevel = int(math.log(maxdim+1) / math.log(2)) - 2

        # assume 0,0,0 start for now
        xspan, yspan, zspan = xmax+1, ymax+1, zmax+1

        xrunlimit = self.config_data["options"]["xrunlimit"]
        xrunlimit = xrunlimit + (xrunlimit % 2) # should be even

        currsource = source

        # create source pyramid and append _level to name
        for level in range(1, maxlevel+1):
            node_service = retrieve_node_service(server, uuid, self.resource_server, self.resource_port, self.APPNAME)
            # !! limit to grayscale now
            prevsource = currsource
            currsource = source + ("_%d" % level)

            # TODO: set voxel resolution to base dataset (not too important in current workflows)
            if islabelblk:
                node_service.create_labelblk(currsource, None, BLKSIZE)
            else:
                node_service.create_grayscale8(currsource, BLKSIZE)
                # set extents for new volume (only need to do for grayscale)
                newsourceext = {}
                newsourceext["MinPoint"] = [0,0,0] # for now no offset
                newsourceext["MaxPoint"] = [((xspan-1) // 2+1)*BLKSIZE-1,((yspan-1) // 2+1)*BLKSIZE-1,((zspan-1) // 2+1)*BLKSIZE-1]
                session.post(server + "/api/node/" + uuid + "/" + currsource + "/extents", json=newsourceext)

            # determine number of requests
            maxxrun = xspan
            if xrunlimit > 0 and xrunlimit < xspan:
                maxxrun = xrunlimit
            if maxxrun % 2:
                maxxrun += 1

            xsize = xspan // maxxrun
            if xspan % maxxrun:
                xsize += 1
            ysize = (yspan+1) // 2
            zsize = (zspan+1) // 2
            resource_server = self.resource_server
            resource_port = self.resource_port

            for ziter2 in range(0, zsize, 2):
                # two z layers are queued per pass
                workqueue = []
                for yiter in range(0, ysize):
                    for xiter in range(0, xsize):
                        for miniz in range(ziter2, ziter2+2):
                            workqueue.append((xiter,yiter,miniz))

                # parallelize jobs
                pieces = self.sc.parallelize(workqueue, len(workqueue))

                # grab data corresponding to xrun
                def retrievedata(coord):
                    xiter, yiter, ziter = coord
                    node_service = retrieve_node_service(server, uuid, resource_server, resource_port)

                    shape_zyx = ( BLKSIZE*2, BLKSIZE*2, maxxrun*BLKSIZE )
                    offset_zyx = (ziter*BLKSIZE*2, yiter*BLKSIZE*2, xiter*BLKSIZE*maxxrun)
                    if islabelblk:
                        vol_zyx = node_service.get_labels3D( str(prevsource), shape_zyx, offset_zyx, throttle=False)
                    else:
                        vol_zyx = node_service.get_gray3D( str(prevsource), shape_zyx, offset_zyx, throttle=False)

                    return (coord, vol_zyx)

                volumedata = pieces.map(retrievedata)

                # downsample gray data
                def downsamplegray(vdata):
                    coords, data = vdata
                    from scipy import ndimage
                    # ndimage.zoom is the same function as the deprecated
                    # ndimage.interpolation.zoom alias
                    data = ndimage.zoom(data, 0.5)
                    return (coords, data)

                # downsample label data (TODO: make faster)
                def downsamplelabels(vdata):
                    coords, data = vdata
                    import numpy
                    zmax, ymax, xmax = data.shape
                    data2 = numpy.zeros((zmax // 2, ymax // 2, xmax // 2)).astype(numpy.uint64)

                    # each output voxel takes the most frequent label of the
                    # corresponding 2x2x2 input cube
                    for ziter in range(0,zmax,2):
                        for yiter in range(0, ymax,2):
                            for xiter in range(0,xmax,2):
                                v1 = data[ziter, yiter, xiter]
                                v2 = data[ziter, yiter, xiter+1]
                                v3 = data[ziter, yiter+1, xiter]
                                v4 = data[ziter, yiter+1, xiter+1]
                                v5 = data[ziter+1, yiter, xiter]
                                v6 = data[ziter+1, yiter, xiter+1]
                                v7 = data[ziter+1, yiter+1, xiter]
                                v8 = data[ziter+1, yiter+1, xiter+1]

                                freqs = {}
                                freqs[v2] = 0
                                freqs[v3] = 0
                                freqs[v4] = 0
                                freqs[v5] = 0
                                freqs[v6] = 0
                                freqs[v7] = 0
                                freqs[v8] = 0

                                freqs[v1] = 1
                                freqs[v2] += 1
                                freqs[v3] += 1
                                freqs[v4] += 1
                                freqs[v5] += 1
                                freqs[v6] += 1
                                freqs[v7] += 1
                                freqs[v8] += 1

                                maxval = 0
                                freqkey = 0
                                for key, val in freqs.items():
                                    if val > maxval:
                                        maxval = val
                                        freqkey = key

                                data2[ziter // 2, yiter // 2, xiter // 2] = freqkey

                    return (coords, data2)

                if islabelblk:
                    downsampleddata = volumedata.map(downsamplelabels)
                else:
                    downsampleddata = volumedata.map(downsamplegray)

                appname = self.APPNAME
                delimiter = self.config_data["options"]["blankdelimiter"]

                # write results ?!
                def write2dvid(vdata):
                    from libdvid import ConnectionMethod
                    import numpy
                    node_service = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

                    coords, data = vdata
                    xiter, yiter, ziter = coords

                    # set block indices
                    zbindex = ziter
                    ybindex = yiter
                    xbindex = xiter*maxxrun // 2

                    xsize = data.shape[2]

                    if islabelblk:
                        vals = numpy.unique(data)
                        # TODO: ignore blank blocks within an x line
                        if not (len(vals) == 1 and vals[0] == 0):
                            offset_zyx = (zbindex*BLKSIZE, ybindex*BLKSIZE, xbindex*BLKSIZE)
                            if resource_server != "":
                                node_service.put_labels3D(currsource, data, offset_zyx, compress=True, throttle=False)
                            else:
                                node_service.put_labels3D(currsource, data, offset_zyx, compress=True)
                    else:
                        def postblocks(firstxblock, chunks):
                            # one request per run of consecutive non-blank blocks
                            node_service.custom_request(str((currsource + "/blocks/%d_%d_%d/%d") % (firstxblock, ybindex, zbindex, len(chunks))), b"".join(chunks), ConnectionMethod.POST)

                        # Raw bytes of the current run of non-blank blocks.
                        # Collected as bytes chunks: tobytes() returns bytes,
                        # which cannot be appended to a str buffer on Python 3.
                        pending = []
                        for iterx in range(0, xsize, BLKSIZE):
                            block = data[:,:,iterx:iterx+BLKSIZE]
                            vals = numpy.unique(block)
                            if len(vals) == 1 and vals[0] == delimiter:
                                # blank block: flush the run that preceded it, if any
                                if pending:
                                    postblocks(xbindex, pending)
                                    pending = []
                            else:
                                if not pending:
                                    # a new run starts at this block
                                    xbindex = xiter*maxxrun // 2 + iterx // BLKSIZE
                                pending.append(block.tobytes())

                        # write-out leftover blocks
                        if pending:
                            postblocks(xbindex, pending)

                downsampleddata.foreach(write2dvid)

            # adjust max coordinate for new level
            # NOTE(review): the extents above size the new level as
            # (span-1)//2 + 1 blocks while this update uses (span-1)//2 --
            # confirm which one is intended.
            xspan = (xspan-1) // 2
            yspan = (yspan-1) // 2
            zspan = (zspan-1) // 2
Beispiel #21
0
    def execute(self):
        """Compute a label graph over the configured ROI and store it in DVID.

        The ROI is split into chunks with a 1-pixel border, the label volume
        is fetched for each chunk, ``SimpleGraph.build_graph`` turns the
        chunks into graph elements, and the elements are summed per key.  The
        reduced elements are repartitioned by the first component of their
        key, split into vertices and edges, and written to the DVID graph
        instance named by ``dvid-info/graph-name``.
        """
        from DVIDSparkServices.reconutils import SimpleGraph
        from pyspark import SparkContext
        from pyspark import StorageLevel

        options = self.config_data["options"]
        if "chunk-size" in options:
            self.chunksize = options["chunk-size"]

        dvid_info = self.config_data["dvid-info"]

        # distribute the ROI as chunks
        roi_chunks = self.sparkdvid_context.parallelize_roi(
            dvid_info["roi"], chunk_size=self.chunksize, border=1)
        partition_count = roi_chunks.getNumPartitions()

        # fetch the label volume for every chunk (1 pixel overlap)
        labeled = self.sparkdvid_context.map_labels64(
            roi_chunks,
            dvid_info["label-name"],
            border=1,
            roiname=dvid_info["roi"])

        # turn label chunks into vertex / edge records
        builder = SimpleGraph.SimpleGraph(self.config_data["options"])
        graph_elements = labeled.flatMap(builder.build_graph)

        # merge the records that share a key
        merged = graph_elements.reduceByKey(lambda left, right: left + right)

        # keep edges of the same first vertex in the same partition
        merged = merged.partitionBy(partition_count,
                                    lambda key: hash(key[0]))

        merged.persist(StorageLevel.MEMORY_ONLY)
        vertices = merged.filter(builder.is_vertex)
        edges = merged.filter(builder.is_edge)

        # the graph instance has to exist before elements are pushed
        node_service = retrieve_node_service(
            dvid_info["dvid-server"],
            dvid_info["uuid"],
            self.resource_server,
            self.resource_port)
        node_service.create_graph(str(dvid_info["graph-name"]))

        # vertices are written before edges, one partition at a time
        for elements in (vertices, edges):
            self.sparkdvid_context.foreachPartition_graph_elements(
                elements, dvid_info["graph-name"])

        debug_options = self.config_data["options"]
        if "debug" in debug_options and debug_options["debug"]:
            num_elements = graph_elements.count()
            print("DEBUG: ", num_elements)

        merged.unpersist()
Beispiel #22
0
    def execute(self):
        """Ingest a stack of 2D images as grayscale blocks.

        Slices ``minslice``..``maxslice`` are loaded from ``basename`` (a
        %-format pattern, either a local path or ``gs://<bucket>/path``) in
        batches of ``BLKSIZE * options/numblocklayers`` slices.  Each batch is
        cut into block-high lines, regrouped into single-layer rows of blocks
        and then either written to ``output-dir`` as ``.blocks`` files or
        posted to the DVID grayscale instance ``dvid-info/grayname``.  When
        writing to DVID, blocks whose voxels all equal
        ``options/blankdelimiter`` are skipped and the instance extents are
        posted at the end.

        Raises:
            Exception: if ``options/offset`` is not a multiple of BLKSIZE in
                every dimension.
        """
        from PIL import Image
        import numpy
        import os

        iterslices = self.BLKSIZE * self.config_data["options"][
            "numblocklayers"]

        minslice = self.config_data["minslice"]
        # map file to numpy array
        basename = self.config_data["basename"]

        # format should be gs://<bucket>/path
        gbucketname = ""
        gpath = ""
        if basename.startswith('gs://'):
            # parse google bucket names
            tempgs = basename.split('//')
            bucketpath = tempgs[1].split('/')
            gbucketname = bucketpath[0]
            gpath = '/'.join(bucketpath[1:])

        server = None

        xoffset = yoffset = zoffset = 0

        if "offset" in self.config_data["options"]:
            xoffset = self.config_data["options"]["offset"][0]
            yoffset = self.config_data["options"]["offset"][1]
            zoffset = self.config_data["options"]["offset"][2]

            if xoffset % self.BLKSIZE != 0:
                raise Exception("offset not block aligned")
            if yoffset % self.BLKSIZE != 0:
                raise Exception("offset not block aligned")
            if zoffset % self.BLKSIZE != 0:
                raise Exception("offset not block aligned")

            # Convert voxel offsets to block offsets.  Floor division keeps
            # them integers (the offsets were just verified to be aligned);
            # true division would leak floats into block indices and extents.
            xoffset //= self.BLKSIZE
            yoffset //= self.BLKSIZE
            zoffset //= self.BLKSIZE

        # this will start the Z block writing at the specified offset
        # (changes default behavior when loading nonzero starting image slice)
        zoffset -= (minslice // self.BLKSIZE)

        # create metadata before workers start if using DVID
        if "output-dir" not in self.config_data or self.config_data[
                "output-dir"] == "":
            # write to dvid
            server = self.config_data["dvid-info"]["dvid-server"]
            uuid = self.config_data["dvid-info"]["uuid"]
            grayname = self.config_data["dvid-info"]["grayname"]
            resource_server = str(self.resource_server)
            resource_port = self.resource_port

            # create grayscale type
            node_service = retrieve_node_service(server, uuid, resource_server,
                                                 resource_port, self.APPNAME)
            node_service.create_grayscale8(str(grayname), self.BLKSIZE)

        for batchstart in range(self.config_data["minslice"],
                                self.config_data["maxslice"] + 1, iterslices):
            # parallelize images across many machines
            imgs = self.sc.parallelize(
                list(range(batchstart, batchstart + iterslices)), iterslices)

            def img2npy(slicenum):
                try:
                    img = None
                    if gbucketname == "":
                        img = Image.open(basename % slicenum)
                    else:
                        from gcloud import storage
                        from io import BytesIO
                        client = storage.Client()
                        gbucket = client.get_bucket(gbucketname)
                        gblob = gbucket.get_blob(gpath % slicenum)

                        # write to bytes which implements file interface
                        gblobfile = BytesIO()
                        gblob.download_to_file(gblobfile)
                        gblobfile.seek(0)
                        img = Image.open(gblobfile)
                    return slicenum, numpy.array(img)
                except Exception:
                    # deliberate best effort: a missing or unreadable slice
                    # becomes a blank slice that produces no lines downstream
                    return slicenum, numpy.zeros((0, 0), numpy.uint8)

            npy_images = imgs.map(img2npy)

            # map numpy array into y lines of block height
            blocksize = self.BLKSIZE
            blocklimit = self.BLOCKLIMIT

            def npy2lines(arrpair):
                z, arr = arrpair
                ysize, xsize = arr.shape
                npylines = []

                for itery in range(0, ysize, blocksize):
                    # x is padded with zeros up to a whole number of blocks
                    line = numpy.zeros(
                        (blocksize,
                         ((xsize - 1) // blocksize + 1) * blocksize),
                        numpy.uint8)
                    uppery = blocksize
                    if (itery + blocksize) > ysize:
                        uppery = ysize - itery

                    line[0:uppery, 0:xsize] = arr[itery:itery + blocksize,
                                                  0:xsize]

                    npylines.append((itery // blocksize, (z, line)))

                return npylines

            npy_lines = npy_images.flatMap(npy2lines)

            # reduce y lines into DVID blocks
            groupedlines = npy_lines.groupByKey()

            # map y lines => (y, blocks)
            def lines2blocks(linespair):
                y, linesp = linespair

                xsize = None
                blockdata = None
                for z, line in linesp:
                    if xsize is None:
                        _, xsize = line.shape
                        blockdata = numpy.zeros((iterslices, blocksize, xsize),
                                                numpy.uint8)

                    blockdata[(z - minslice) % iterslices, :, :] = line
                return y, blockdata

            yblocks = groupedlines.map(lines2blocks)

            # map multilayer of blocks to an array of single layer blocks
            def multi2single(yblocks):
                ybindex, blocks = yblocks
                blockarr = []
                num_layers = iterslices // blocksize
                for layer in range(0, num_layers):
                    blockarr.append(
                        ((ybindex, layer),
                         blocks[layer * blocksize:(layer * blocksize +
                                                   blocksize), :, :]))

                return blockarr

            yblockssplit = yblocks.flatMap(multi2single)

            if "output-dir" in self.config_data and self.config_data[
                    "output-dir"] != "":
                # write blocks to disk for separate post-process -- write directly to DVID eventually?
                output_dir = self.config_data["output-dir"]

                def write2disk(yblocks):
                    zbindex = batchstart // blocksize
                    (ybindex, layer), blocks = yblocks
                    zbindex += layer

                    xsize = blocks.shape[2]

                    outdir = output_dir
                    outdir += "/" + ("%05d" % zbindex) + ".z/"
                    filename = outdir + ("%05d" % ybindex) + "-" + str(
                        xsize // blocksize) + ".blocks"

                    try:
                        os.makedirs(outdir)
                    except OSError:
                        # typically the directory already exists; any other
                        # failure resurfaces when the file is opened below
                        pass

                    # extract blocks from buffer and write to disk
                    with open(filename, 'wb') as fout:
                        for iterx in range(0, xsize, blocksize):
                            block = blocks[:, :, iterx:iterx + blocksize].copy()
                            fout.write(block)

                yblockssplit.foreach(write2disk)
            else:
                # write to dvid
                server = self.config_data["dvid-info"]["dvid-server"]
                uuid = self.config_data["dvid-info"]["uuid"]
                grayname = self.config_data["dvid-info"]["grayname"]
                appname = self.APPNAME
                delimiter = self.config_data["options"]["blankdelimiter"]

                def write2dvid(yblocks):
                    from libdvid import ConnectionMethod
                    import numpy
                    node_service = retrieve_node_service(
                        server, uuid, resource_server, resource_port, appname)

                    # get block coordinates
                    zbindex = batchstart // blocksize
                    (ybindex, layer), blocks = yblocks
                    zbindex += layer
                    xsize = blocks.shape[2]

                    def postblocks(firstxblock, chunks):
                        # one request per run of consecutive non-blank blocks
                        node_service.custom_request(
                            str((grayname + "/blocks/%d_%d_%d/%d") %
                                (firstxblock + xoffset, ybindex + yoffset,
                                 zbindex + zoffset, len(chunks))),
                            b"".join(chunks), ConnectionMethod.POST)

                    # Raw bytes of the current run of non-blank blocks.
                    # Collected as bytes chunks: tobytes() returns bytes,
                    # which cannot be appended to a str buffer on Python 3.
                    pending = []
                    xbindex = 0  # assume x starts at 0!!

                    for iterx in range(0, xsize, blocksize):
                        block = blocks[:, :, iterx:iterx + blocksize].copy()
                        vals = numpy.unique(block)
                        if len(vals) == 1 and vals[0] == delimiter:
                            # blank block: flush the run that preceded it, if any
                            if pending:
                                postblocks(xbindex, pending)
                                pending = []
                        else:
                            if not pending:
                                # a new run starts at this block
                                xbindex = iterx // blocksize

                            pending.append(block.tobytes())

                            if blocklimit > 0 and len(pending) >= blocklimit:
                                # cap the number of blocks sent per request
                                postblocks(xbindex, pending)
                                pending = []

                    # write-out leftover blocks
                    if pending:
                        postblocks(xbindex, pending)

                yblockssplit.foreach(write2dvid)

            self.workflow_entry_exit_printer.write_data("Ingested %d slices" %
                                                        iterslices)

        # just fetch one image at driver to get dims
        width = height = 1
        try:
            img = None
            if gbucketname == "":
                img = Image.open(basename % minslice)
                width, height = img.width, img.height
            else:
                from gcloud import storage
                from io import BytesIO
                client = storage.Client()
                gbucket = client.get_bucket(gbucketname)
                gblob = gbucket.get_blob(gpath % minslice)

                # write to bytes which implements file interface
                gblobfile = BytesIO()
                gblob.download_to_file(gblobfile)
                gblobfile.seek(0)
                img = Image.open(gblobfile)
                width, height = img.width, img.height
        except Exception:
            # deliberate best effort: keep the 1x1 fallback size
            pass

        if "output-dir" not in self.config_data or self.config_data[
                "output-dir"] == "":
            # update metadata
            grayext = {}
            grayext["MinPoint"] = [
                xoffset * self.BLKSIZE, yoffset * self.BLKSIZE,
                zoffset * self.BLKSIZE + minslice
            ]
            # NOTE(review): the z maximum adds both minslice and maxslice on
            # top of the block offset -- confirm this is not meant to be
            # zoffset * BLKSIZE + maxslice.
            grayext["MaxPoint"] = [
                xoffset * self.BLKSIZE + width - 1,
                yoffset * self.BLKSIZE + height - 1, zoffset * self.BLKSIZE +
                minslice + self.config_data["maxslice"]
            ]
            if not server.startswith("http://"):
                server = "http://" + server
            session = default_dvid_session()
            session.post(server + "/api/node/" + uuid + "/" + grayname +
                         "/extents",
                         json=grayext)
Beispiel #23
0
        def findindexerrors(bodies):
            """Compare expected body-to-block mappings with DVID's coarse index.

            Args:
                bodies: pair ``(index, bodylist)`` where ``bodylist`` iterates
                    over ``(body, blocks)`` pairs.  ``blocks`` must be
                    hashable items comparable to ``(z, y, x)`` block tuples
                    (presumably block coordinates -- verify against caller).

            Returns:
                A list of ``[is_false_negative, body, blocks]`` entries:
                ``True`` entries list blocks expected for ``body`` that DVID
                does not report (all of them when DVID does not know the body
                at all), ``False`` entries list blocks DVID reports that were
                not expected.  Empty when ``bodylist`` is empty.
            """
            _, bodylist = bodies
            bodymappings = {}
            for (body, bids) in bodylist:
                bodymappings[body] = bids

            # nothing to compare; also avoids querying an undefined range
            if not bodymappings:
                return []

            # call block index DVID API for the whole body id range
            from libdvid import ConnectionMethod
            b1 = min(bodymappings)
            b2 = max(bodymappings)

            ns = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

            addr = str(labelname + "/sparsevols-coarse/" + str(b1) + "/" + str(b2))
            res = ns.custom_request(addr, None, ConnectionMethod.GET)

            # The response is read as a flat run of int32 words.  frombuffer
            # replaces the deprecated binary mode of fromstring; the array is
            # read-only, which is fine because it is only read here.
            bodyblockrle = np.frombuffer(res, dtype=np.int32)
            currindex = 0
            numwords = len(bodyblockrle)

            bodymappingsdvid = {}
            while currindex < numwords:
                # The body id is stored as two 32-bit words, low word first.
                # Combine them as Python ints: shifting a numpy int32 by 32
                # loses the high word, and a negative low word would
                # sign-extend into it.
                lowword = int(bodyblockrle[currindex]) & 0xFFFFFFFF
                highword = int(bodyblockrle[currindex+1]) & 0xFFFFFFFF
                currbody = lowword | (highword << 32)
                currindex += 2

                # retrieve runlengths
                numspans = int(bodyblockrle[currindex])
                currindex += 1
                blockarray = []
                for _ in range(numspans):
                    dimx = int(bodyblockrle[currindex])
                    dimy = int(bodyblockrle[currindex+1])
                    dimz = int(bodyblockrle[currindex+2])
                    runx = int(bodyblockrle[currindex+3])
                    currindex += 4

                    # expand the run along x into individual (z, y, x) blocks
                    for xblock in range(dimx, dimx+runx):
                        blockarray.append((dimz, dimy, xblock))
                bodymappingsdvid[currbody] = blockarray

            allerrors = []
            # find differences
            for body, blocklist in bodymappings.items():
                if body not in bodymappingsdvid:
                    allerrors.append([True, body, blocklist])
                    continue

                bset = set(blocklist)
                bsetdvid = set(bodymappingsdvid[body])

                # false negatives
                errors = list(bset - bsetdvid)
                if errors:
                    allerrors.append([True, body, errors])

                # false positives
                errors2 = list(bsetdvid - bset)
                if errors2:
                    allerrors.append([False, body, errors2])
            return allerrors
Beispiel #24
0
    def execute(self):
        """Evaluate a label volume over an ROI and write the stats to DVID.

        The ROI named in ``config_data["dvid-info"]["roi"]`` is partitioned
        into grid-aligned subvolumes.  For every subvolume the label volume
        ``dvid-info/label-name`` (and, when ``dvid-info-comp`` is configured,
        a second label volume) is fetched, disconnected bodies are split into
        separate labels, and volumetric and point-based overlap statistics
        are computed through ``Evaluate``.  The resulting statistics are
        stored as JSON in the key-value instance ``self.writelocation`` under
        the key ``<stats-location>--<user-name>--<unix time>``.

        Configuration read from ``self.config_data``:
            dvid-info: dvid-server, uuid, label-name, roi, point-lists,
                stats-location
            dvid-info-comp (optional): dvid-server, uuid, label-name
            options: chunk-size (optional, overrides ``self.chunksize``),
                downsample-level, important-bodies, enable-sparse, run-cc,
                boundary-size, user-name
            debug (optional): print the stats JSON before writing it

        Raises:
            AssertionError: if ``downsample-level`` is outside 0..5, if a
                label instance is not a labelarray/labelmap with enough
                downres levels, or if sparse mode has no important bodies.
            Exception: if a point-list name has more than one '/'.
        """
        # imports here so that schema can be retrieved without installation
        from DVIDSparkServices.reconutils.metrics import Evaluate
        from pyspark import SparkContext
        from pyspark import StorageLevel
        import time
        import datetime
        import json

        starttime = time.time()

        dvid_info = self.config_data["dvid-info"]
        options = self.config_data["options"]

        node_service = retrieve_node_service(dvid_info["dvid-server"],
                dvid_info["uuid"], self.resource_server, self.resource_port)

        if "chunk-size" in options:
            self.chunksize = options["chunk-size"]

        # check if downsampling possible
        downsample_level = options["downsample-level"]

        # do not allow downsampling by more than 32x in each dim; both bounds
        # have to hold (an `or` between them would accept every value)
        assert 0 <= downsample_level <= 5

        if downsample_level > 0:
            def _assert_downres_supported(service, label_name):
                """Check the instance type and its maximum downres level."""
                datameta = service.get_typeinfo(str(label_name))
                assert datameta["Base"]["TypeName"] in ("labelarray", "labelmap")
                assert datameta["Extended"]["MaxDownresLevel"] >= downsample_level

            _assert_downres_supported(node_service, dvid_info["label-name"])

            if "dvid-info-comp" in self.config_data:
                comp_info = self.config_data["dvid-info-comp"]
                node_service2 = retrieve_node_service(comp_info["dvid-server"],
                        comp_info["uuid"], self.resource_server, self.resource_port)
                _assert_downres_supported(node_service2, comp_info["label-name"])

        # grab ROI (no overlap and no neighbor checking)
        # self.chunksize already holds options["chunk-size"] when that key is
        # given, so it is used for the partition method name as well
        distrois = self.sparkdvid_context.parallelize_roi(dvid_info["roi"],
                self.chunksize, border=1,
                partition_method="grid-aligned-" + str(self.chunksize))

        def setBorderHack(subvolume):
            # the ROI is requested with border=1 above; reset it to 0 here
            subvolume.border = 0
            return subvolume
        distrois = distrois.mapValues(setBorderHack)

        # modify substack extents and roi
        if downsample_level > 0:
            def downsampleROIs(subvolume):
                """Halve the block size and box extents once per level."""
                z1 = subvolume.box.z1
                y1 = subvolume.box.y1
                x1 = subvolume.box.x1
                z2 = subvolume.box.z2
                y2 = subvolume.box.y2
                x2 = subvolume.box.x2
                for level in range(0, downsample_level):
                    subvolume.roi_blocksize = subvolume.roi_blocksize // 2
                    z1 = z1 // 2
                    y1 = y1 // 2
                    x1 = x1 // 2
                    z2 = z2 // 2
                    y2 = y2 // 2
                    x2 = x2 // 2
                subvolume.box = SubvolumeNamedTuple(z1, y1, x1, z2, y2, x2)
                return subvolume

            distrois = distrois.mapValues(downsampleROIs)

        # location of the comparison volume; empty strings when no
        # "dvid-info-comp" section is configured
        dvidserver2 = ""
        dviduuid2 = ""
        dvidlname2 = ""
        if "dvid-info-comp" in self.config_data:
            comp_info = self.config_data["dvid-info-comp"]
            dvidserver2 = comp_info["dvid-server"]
            dviduuid2 = comp_info["uuid"]
            dvidlname2 = comp_info["label-name"]

        # map ROI to two label volumes (0 overlap)
        # this will be used for all volume and point overlaps
        # (preserves partitioner)
        # (key, (subvolume, seggt, seg2)

        # creates a dummy volume if no second server is available
        lpairs = self.sparkdvid_context.map_labels64_pair(
                distrois, dvid_info["label-name"],
                dvidserver2, dviduuid2, dvidlname2,
                dvid_info["roi"], downsample_level)

        # TODO ?? how to handle debug coords

        # filter bodies if there is a body list from GT
        important_bodies = options["important-bodies"]

        if options["enable-sparse"]:
            # if sparse mode is enabled there should be a body list
            assert (len(important_bodies) > 0)
        else:
            # should only filter bodies for non-sparse mode
            # if the bodies densely cover the volume
            def filter_bodies(label_pairs):
                """Zero out every gt label that is not an important body."""
                from DVIDSparkServices.sparkdvid.CompressedNumpyArray import CompressedNumpyArray
                import numpy

                subvolume, labelgtc, label2c = label_pairs

                # extract numpy arrays
                labelgt = labelgtc.deserialize()

                # filter bodies from gt
                bodylist = numpy.unique(labelgt)
                intersecting_bodies = set(bodylist).intersection(set(important_bodies))
                mask = numpy.zeros(labelgt.shape)
                for body in intersecting_bodies:
                    mask[labelgt==body] = 1
                labelgt[mask==0] = 0

                # compress results
                return (subvolume, CompressedNumpyArray(labelgt), label2c)

            if len(important_bodies) > 0:
                lpairs = lpairs.mapValues(filter_bodies)

        def _split_disjoint_labels(label_pairs):
            """Helper function: relabel so disconnected bodies are different labels.

            Used with mapValues, so the partitioner is preserved.

            Args:
                label_pairs (tuple): (subvolume, gt labels, test seg labels)

            Returns:
                (subvolume, gt map, seg map, gt split volume, seg split volume)
                where each map and split volume come from
                split_disconnected_bodies.
            """
            from DVIDSparkServices.reconutils.morpho import split_disconnected_bodies

            subvolume, labelgt, label2 = label_pairs

            # split bodies up
            labelgt_split, labelgt_map = split_disconnected_bodies(labelgt)
            label2_split, label2_map = split_disconnected_bodies(label2)

            return (subvolume, labelgt_map, label2_map, labelgt_split, label2_split)

        # split bodies that are merged outside of the subvolume
        # (preserves partitioner)
        # => (key, (subvolume, seggt-map, seg2-map, seggt-split, seg2-split))
        lpairs_split = lpairs.mapValues(_split_disjoint_labels)

        run_cc = options["run-cc"]
        if run_cc:
            # save current segmentation state
            lpairs_split.persist()

            # apply connected components
            def _extractfaces(label_pairs):
                """Extracts 6 sides from each cube, for gt and for seg.

                Returns a list of 12 entries of the form
                ((face start, face end, isgt),
                 [(face slice, label map, subvolume key, flag, unmapped bodies)]).
                Only the x-min face carries flag=True together with the set
                of bodies that were not remapped; every other face carries
                flag=False and an empty set.
                """
                key, (subvolume, gtmap, segmap, gtvol, segvol) = label_pairs

                # unique bodies that were not remapped, ignoring label 0
                allgt = set(numpy.unique(gtvol)).difference(set(gtmap))
                allgt.discard(0)
                allseg = set(numpy.unique(segvol)).difference(set(segmap))
                allseg.discard(0)

                # the gt shape is used for both volumes
                zmax, ymax, xmax = gtvol.shape
                start = (subvolume.box.z1, subvolume.box.y1, subvolume.box.x1)
                z0, y0, x0 = start

                def _six_faces(vol, labelmap, isgt, unmapped):
                    return [
                        # x faces
                        ((start, (z0+zmax, y0+ymax, x0+1), isgt),
                         [(vol[:,:,0], labelmap, key, True, unmapped)]),
                        (((z0, y0, x0+xmax),
                          (z0+zmax, y0+ymax, x0+xmax+1), isgt),
                         [(vol[:,:,xmax-1], labelmap, key, False, set())]),

                        # y faces
                        ((start, (z0+zmax, y0+1, x0+xmax), isgt),
                         [(vol[:,0,:], labelmap, key, False, set())]),
                        (((z0, y0+ymax, x0),
                          (z0+zmax, y0+ymax+1, x0+xmax), isgt),
                         [(vol[:,ymax-1,:], labelmap, key, False, set())]),

                        # z faces
                        ((start, (z0+1, y0+ymax, x0+xmax), isgt),
                         [(vol[0,:,:], labelmap, key, False, set())]),
                        (((z0+zmax, y0, x0),
                          (z0+zmax+1, y0+ymax, x0+xmax), isgt),
                         [(vol[zmax-1,:,:], labelmap, key, False, set())]),
                    ]

                # 6 faces for gt followed by 6 faces for seg
                return (_six_faces(gtvol, gtmap, True, allgt) +
                        _six_faces(segvol, segmap, False, allseg))

            # assume there could be only one possible match
            def _reducematches(faces1, faces2):
                faces1.extend(faces2)
                return faces1

            def _extractmatches(keyfaces):
                """Finds matching segments that have the same body id.

                Emits ((body, isgt), [((label1, sid1), (label2, sid2))]) for
                every non-zero label pair across the face that maps to the
                same body, plus ((body, isgt), [(label, sid, True)]) entries
                for each flagged face.
                """
                key, faces = keyfaces
                _start, _end, isgt = key

                def _self_entries(labelmap, sid, unmapped):
                    # one entry per remapped label, then one per unmapped body
                    entries = [((body, isgt), [(label, sid, True)])
                               for label, body in labelmap.items()]
                    entries.extend(((body, isgt), [(body, sid, True)])
                                   for body in unmapped)
                    return entries

                # no match found
                if len(faces) == 1:
                    seg1, segmap, sid, hack1, segbodies = faces[0]
                    if hack1:
                        return _self_entries(segmap, sid, segbodies)
                    return []
                assert(len(faces) == 2)

                seg1, segmap, sid, hack1, segbodies = faces[0]
                seg2, segmap2, sid2, hack2, segbodies2 = faces[1]

                # distinct (label in face 1, label in face 2) pairs
                seg1seg2 = numpy.column_stack((seg1.flatten(), seg2.flatten()))
                unique_pairs = numpy.unique(seg1seg2, axis=0)

                bodymatches = []

                for val in unique_pairs:
                    if val[0] == 0 or val[1] == 0:
                        continue

                    # labels missing from a map keep their own value
                    mapped1 = segmap.get(val[0], val[0])
                    mapped2 = segmap2.get(val[1], val[1])

                    if mapped1 == mapped2:
                        bodymatches.append(((mapped1, isgt), [((val[0], sid), (val[1], sid2))]))

                # hack: send all bodies that have new labels
                # assume 1) disjoint bodies will always include implicit identity mapping
                # and 2) each subvolume will be represented at least 6 times
                if hack1:
                    bodymatches.extend(_self_entries(segmap, sid, segbodies))

                if hack2:
                    bodymatches.extend(_self_entries(segmap2, sid2, segbodies2))

                return bodymatches

            def _reduce_bodies(bodies1, bodies2):
                """Group all bodies maps together.
                """
                bodies1.extend(bodies2)
                return bodies1

            flatmatches = lpairs_split.flatMap(_extractfaces).reduceByKey(_reducematches).flatMap(_extractmatches)
            matches = flatmatches.reduceByKey(_reduce_bodies)

            # should be small enough that the list can be global
            def _find_disjoint_bodies(matches):
                """Extract bodies that should be split into more than one piece.

                Groups the (label, sid) pieces of one body by the face matches
                that connect them.  Returns [] when a single group remains,
                otherwise one ((bodyid, isgt), group) entry per group.
                """
                (bodyid, isgt), matchlist = matches

                merges = {}    # (label, sid) -> key of its group in mergeset
                mergeset = {}  # group key -> set of (label, sid) members

                for match in matchlist:
                    # handle original mapping disjoint ids
                    if len(match) == 3:
                        val = (match[0], match[1])
                        if val not in merges:
                            merges[val] = val
                            mergeset[val] = set([val])
                        continue

                    # order the pair so the smaller entry becomes the group key
                    val, val2 = match
                    if val2 < val:
                        val, val2 = val2, val

                    mappedval = val
                    if mappedval in merges:
                        mappedval = merges[mappedval]
                    else:
                        merges[val] = val

                    if mappedval not in mergeset:
                        mergeset[mappedval] = set([val])
                    else:
                        mergeset[mappedval].add(val)

                    mappedval2 = val2
                    if mappedval2 in merges:
                        mappedval2 = merges[mappedval2]

                    if mappedval2 not in mergeset:
                        mergeset[mappedval2] = set([val2])
                    else:
                        mergeset[mappedval2].add(val2)

                    # if the mapped value is equal, no need for further processing
                    if mappedval2 == mappedval:
                        continue

                    # fold the second group into the first one
                    merges[mappedval2] = mappedval

                    for iterval in mergeset[mappedval2]:
                        merges[iterval] = mappedval

                    mergeset[mappedval] = mergeset[mappedval].union(mergeset[mappedval2])
                    del mergeset[mappedval2]

                if len(mergeset) == 1:
                    return []

                return [((bodyid, isgt), group) for group in mergeset.values()]

            # choose very large arbitrary index for simplicity (but below js 2^53 limit)
            ccstartbodyindex = 2**51

            # find disjoint mappings
            disjoint_bodies = matches.flatMap(_find_disjoint_bodies)
            mapped_bodies = disjoint_bodies.zipWithIndex()
            mapped_bodies.persist()

            # send changes to substacks
            def cc2sid(mapped_body):
                """Key every piece of a group by the subvolume id holding it."""
                (((bodyid, isgt), group), rid) = mapped_body
                sidbodies = []
                for (subval, sid) in group:
                    sidbodies.append((sid, [(isgt, subval, rid+ccstartbodyindex)]))
                return sidbodies

            def groupsids(sid1, sid2):
                sid1.extend(sid2)
                return sid1

            sidccbodies = mapped_bodies.flatMap(cc2sid).reduceByKey(groupsids, lpairs_split.getNumPartitions())

            # shuffle mappings to substacks (does this cause a shuffle)
            lpairs_split_j = lpairs_split.leftOuterJoin(sidccbodies, lpairs_split.getNumPartitions())

            # give new ids for subvolumes
            def _insertccmappings(label_pairs):
                """Record the new CC body id of each affected label in its map."""
                ((subvolume, labelgt_map, label2_map, labelgt_split, label2_split), ccbodies) = label_pairs
                # ccbodies is None for subvolumes without a join partner
                if ccbodies is not None:
                    for (isgt, subval, bodyid) in ccbodies:
                        if isgt:
                            labelgt_map[subval] = bodyid
                        else:
                            label2_map[subval] = bodyid

                return (subvolume, labelgt_map, label2_map, labelgt_split, label2_split)
            lpairs_split = lpairs_split_j.mapValues(_insertccmappings)

        # evaluation tool (support RAND, VI, per body, graph, and
        # histogram stats over different sets of points)
        evaluator = Evaluate.Evaluate(self.config_data)

        ### VOLUMETRIC ANALYSIS ###

        # TODO: !! Grab number of intersecting disjoint faces
        # (might need +1 border) for split edit distance

        # grab volumetric body overlap ignoring boundaries as specified
        # and generate overlap stats for substack (compute local)
        # => (key, (subvolume, stats, seggt-split, seg2-split, seggt-map, seg2-map))
        # (preserve partitioner)
        lpairs_proc = evaluator.calcoverlap(lpairs_split, options["boundary-size"])

        point_data = {}
        ### POINT ANALYSIS ###
        for point_list_name in dvid_info["point-lists"]:
            # grab point list from DVID
            keyvalue = point_list_name.split('/')
            pointname = ""

            if len(keyvalue) == 2:
                # "<instance>/<key>": the point list is fetched as JSON
                # is this too large to broadcast?? -- default lz4 should help quite a bit
                # TODO: send only necessary data to each job through join might help
                point_data[keyvalue[1]] = node_service.get_json(str(keyvalue[0]),
                        str(keyvalue[1]))
                pointname = keyvalue[1]
            elif len(keyvalue) == 1:
                # assume dvid annotation datatype and always treat as a synapse type
                # TODO: split this up into many small calls so that it scales
                syndata = node_service.custom_request(str(keyvalue[0]) + "/roi/" + str(dvid_info["roi"]), "".encode(), ConnectionMethod.GET)
                synjson = json.loads(syndata)

                # position -> index of that synapse in the returned list
                synindex = {tuple(synapse["Pos"]): synspot
                            for synspot, synapse in enumerate(synjson)}

                # load point data: position followed by the indices of its
                # "PreSynTo" partners
                pointlist = []
                for synapse in synjson:
                    # NOTE: this is the parsed "Pos" list itself; it is
                    # extended in place
                    pointrel = synapse["Pos"]
                    if synapse["Rels"] is not None:
                        for rel in synapse["Rels"]:
                            # only add relations within ROI
                            if rel["Rel"] == "PreSynTo" and tuple(rel["To"]) in synindex:
                                pointrel.append(synindex[tuple(rel["To"])])
                    pointlist.append(pointrel)
                pointinfo = {"type": "synapse", "sparse": False, "point-list": pointlist}
                point_data[keyvalue[0]] = pointinfo
                pointname = keyvalue[0]
            else:
                raise Exception(str(point_list_name) + "point list key value not properly specified")

            # Generate per substack and global stats for given points.
            # Querying will just be done on the local labels stored.
            # (preserve partitioner)
            lpairs_proc = evaluator.calcoverlap_pts(lpairs_proc, pointname, point_data[pointname])

        # Extract stats by retrieving substacks and stats info and
        # loading into data structures on the driver.
        stats = evaluator.calculate_stats(lpairs_proc)

        # expand subvolume to original size if downsampled
        if downsample_level > 0:
            scale = 2 ** downsample_level
            for sid, subvolumestats in stats["subvolumes"].items():
                for stat in subvolumestats:
                    if stat["name"] == "bbox":
                        bbox = list(stat["val"])
                        for pos in range(6):
                            bbox[pos] = bbox[pos] * scale
                        stat["val"] = bbox

        if run_cc:
            # make a global remap function
            def extract_disjoint_bodies(mapped_body):
                (((bodyid, isgt), group), rid) = mapped_body
                return (bodyid, rid+ccstartbodyindex)
            bodies_remap = mapped_bodies.map(extract_disjoint_bodies).collect()

            # global map of cc bodies to original body (unique across GT and seg)
            cc2body = {}
            for (bodyid, rid) in bodies_remap:
                cc2body[rid] = bodyid

            # dump CC mappings for use in debugging; this method does not
            # rename the temporary CC ids inside the stats themselves
            stats["connected-components"] = cc2body

        # none or false
        debug = False
        if "debug" in self.config_data:
            debug = self.config_data["debug"]

        if debug:
            print("DEBUG:", json.dumps(stats, cls=NumpyConvertingEncoder))

        # TODO: !! maybe generate a summary view from stats, write that back
        # with simplify output, dump the more complicated file to keyvalue as well

        # write stats and config back to DVID with time stamp
        # (@ name + user name + time stamp)
        # client should use '--' delimiter to parse name
        stats["time-analyzed"] = \
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
        stats["runtime"] = time.time() - starttime
        stats["config-file"] = self.config_data
        current_time = int(time.time())

        # '.' is replaced in both name parts before they are joined with '--'
        # NOTE(review): the "******" separator differs from the "__" used for
        # the location below -- confirm it is intended
        username = str(options["user-name"])
        username = "******".join(username.split('.'))

        location = str(dvid_info["stats-location"])
        location = "__".join(location.split('.'))

        fileloc = str(location + "--" + username + "--" + str(current_time))

        node_service.create_keyvalue(self.writelocation)
        node_service.put(self.writelocation, fileloc, json.dumps(stats, cls=NumpyConvertingEncoder).encode('utf-8'))
    def execute(self):
        from pyspark import SparkContext
        from pyspark import StorageLevel
        from DVIDSparkServices.reconutils.Segmentor import Segmentor
        resource_server = self.resource_server
        resource_port = self.resource_port
        self.chunksize = self.config_data["options"]["chunk-size"]

        # create datatype in the beginning
        mutateseg = self.config_data["options"]["mutateseg"]
        node_service = retrieve_node_service(
            self.config_data["dvid-info"]["dvid-server"],
            self.config_data["dvid-info"]["uuid"], resource_server,
            resource_port)
        success = node_service.create_labelblk(
            str(self.config_data["dvid-info"]["segmentation-name"]))
        # check whether seg should be mutated
        if (not success and mutateseg == "auto") or mutateseg == "yes":
            mutateseg = "yes"
        else:
            mutateseg = "no"

        # grab ROI subvolumes and find neighbors
        distsubvolumes = self.sparkdvid_context.parallelize_roi(
            self.config_data["dvid-info"]["roi"], self.chunksize,
            self.overlap // 2, True,
            self.config_data["dvid-info"]["partition-method"],
            self.config_data["dvid-info"]["partition-filter"])

        # do not recompute ROI for each iteration
        distsubvolumes.persist()

        num_parts = len(distsubvolumes.collect())

        # Instantiate the correct Segmentor subclass (must be installed)
        import importlib
        full_segmentor_classname = self.config_data["options"]["segmentor"][
            "class"]
        segmentor_classname = full_segmentor_classname.split('.')[-1]
        module_name = '.'.join(full_segmentor_classname.split('.')[:-1])
        segmentor_mod = importlib.import_module(module_name)
        segmentor_class = getattr(segmentor_mod, segmentor_classname)
        segmentor = segmentor_class(self.sparkdvid_context, self)

        # determine number of iterations
        iteration_size = self.config_data["options"]["iteration-size"]
        if iteration_size == 0:
            iteration_size = num_parts

        num_iters = num_parts // iteration_size
        if num_parts % iteration_size > 0:
            num_iters += 1

        seg_chunks_list = []

        # enable checkpointing if not empty
        checkpoint_dir = self.config_data["options"]["checkpoint-dir"]

        # enable rollback of iterations if necessary
        rollback_seg = (
            self.config_data["options"]["checkpoint"] == "segmentation")

        # enable rollback of boundary prediction if necessary
        rollback_pred = (rollback_seg or
                         self.config_data["options"]["checkpoint"] == "voxel")

        for iternum in range(0, num_iters):
            # Disable rollback by setting checkpoint dirs to empty
            gray_checkpoint_dir = mask_checkpoint_dir = pred_checkpoint_dir = sp_checkpoint_dir = seg_checkpoint_dir = ""
            if checkpoint_dir != "":
                pred_checkpoint_dir = checkpoint_dir + "/prediter-" + str(
                    iternum)
                seg_checkpoint_dir = checkpoint_dir + "/segiter-" + str(
                    iternum)

                # Grayscale and SP caches are only written to as a "debug" feature
                if self.config_data["options"]["debug"]:
                    gray_checkpoint_dir = checkpoint_dir + "/grayiter-" + str(
                        iternum)
                    mask_checkpoint_dir = checkpoint_dir + "/maskiter-" + str(
                        iternum)
                    sp_checkpoint_dir = checkpoint_dir + "/spiter-" + str(
                        iternum)

                    roi = self.config_data["dvid-info"]["roi"]
                    method = self.config_data["dvid-info"]["partition-method"]
                    roi_description = roi
                    if method != "ask-dvid":
                        roi_description += "-" + method
                    roi_filter = self.config_data["dvid-info"][
                        "partition-filter"]
                    if roi_filter != "all":
                        roi_description += "-" + roi_filter

                    # Spit out a JSON of the Subvolume list boxes
                    ids_and_subvols = distsubvolumes.collect()
                    subvols = [v for (_k, v) in ids_and_subvols]
                    subvol_bounds_json = Subvolume.subvol_list_to_json(subvols)
                    mkdir_p(checkpoint_dir)
                    with open(
                            checkpoint_dir +
                            "/{}-subvol-bounds.json".format(roi_description),
                            'w') as f:
                        f.write(subvol_bounds_json)

                    # Also spit out JSON RLE for writing the modified ROI directly to DVID, in case that's useful
                    all_blocks = Subvolume.subvol_list_all_blocks(subvols)
                    rle = runlength_encode(all_blocks, assume_sorted=False)
                    with open(
                            checkpoint_dir +
                            "/{}-dvid-blocks.json".format(roi_description),
                            'w') as f:
                        json.dump(rle.tolist(), f)

            # it might make sense to randomly map partitions for selection
            # in case something pathological is happening -- if original partitioner
            # is randomish than this should be fine
            def subset_part(sid_data):
                (sid, _data) = sid_data
                if (sid % num_iters) == iternum:
                    return True
                return False

            # should preserve partitioner
            distsubvolumes_part = distsubvolumes.filter(subset_part)

            if rollback_seg:
                readable_seg_checkpoint_dir = seg_checkpoint_dir
            else:
                readable_seg_checkpoint_dir = ""

            subvols_with_seg_cache, subvols_without_seg_cache = \
                CreateSegmentation._split_subvols_by_cache_status( readable_seg_checkpoint_dir,
                                                                   distsubvolumes_part.values().collect() )

            ##
            ## CACHED SUBVOLS
            ##
            cached_subvols_rdd = self.sparkdvid_context.sc.parallelize(
                subvols_with_seg_cache,
                len(subvols_with_seg_cache) or None)

            # Load as many seg blocks from cache as possible
            if subvols_with_seg_cache:

                def retrieve_seg_from_cache(subvol):
                    z1, y1, x1, z2, y2, x2 = subvol.box_with_border
                    block_bounds = ((z1, y1, x1), (z2, y2, x2))
                    block_store = H5BlockStore(seg_checkpoint_dir, mode='r')
                    h5_block = block_store.get_block(block_bounds)
                    return h5_block[:]

                cached_seg_chunks = cached_subvols_rdd.map(
                    retrieve_seg_from_cache)
            else:
                cached_seg_chunks = self.sparkdvid_context.sc.parallelize(
                    [])  # empty rdd

            cached_seg_chunks.persist()
            cached_seg_max_ids = cached_seg_chunks.map(np.max)

            # (subvol, (seg, max_id))
            cached_seg_chunks_kv = cached_subvols_rdd.zip(
                cached_seg_chunks.zip(cached_seg_max_ids))

            ##
            ## UNCACHED SUBVOLS
            ##
            uncached_subvols = self.sparkdvid_context.sc.parallelize(
                subvols_without_seg_cache,
                len(subvols_without_seg_cache) or None)
            uncached_subvols.persist()

            def prepend_sv_index(subvol):
                return (subvol.sv_index, subvol)

            uncached_subvols_kv_rdd = uncached_subvols.map(prepend_sv_index)

            # get grayscale chunks with specified overlap
            uncached_sv_and_gray = self.sparkdvid_context.map_grayscale8(
                uncached_subvols_kv_rdd,
                self.config_data["dvid-info"]["grayscale"])

            uncached_gray_vols = select_item(uncached_sv_and_gray, 1, 1)

            # small hack since segmentor is unaware for current iteration
            # perhaps just declare the segment function to have an arbitrary number of parameters
            if type(segmentor) == Segmentor:
                computed_seg_chunks = segmentor.segment(
                    uncached_subvols, uncached_gray_vols, gray_checkpoint_dir,
                    mask_checkpoint_dir, pred_checkpoint_dir,
                    sp_checkpoint_dir, seg_checkpoint_dir, rollback_pred,
                    False, rollback_seg)
            else:
                computed_seg_chunks = segmentor.segment(
                    uncached_subvols, uncached_gray_vols)

            computed_seg_chunks.persist()
            computed_seg_max_ids = computed_seg_chunks.map(np.max)

            # (subvol, (seg, max_id))
            computed_seg_chunks_kv = uncached_subvols.zip(
                computed_seg_chunks.zip(computed_seg_max_ids))

            ##
            ## FINAL LIST: COMBINED CACHED+UNCACHED
            ##

            # (subvol, (seg, max_id))
            seg_chunks = cached_seg_chunks_kv.union(computed_seg_chunks_kv)
            seg_chunks.persist(StorageLevel.MEMORY_AND_DISK)

            seg_chunks_list.append(seg_chunks)

        seg_chunks = seg_chunks_list[0]

        for iter1 in range(1, len(seg_chunks_list)):
            # ?? does this preserve the partitioner (yes, if num partitions is the same)
            # this could cause a serialization problems if there are a large number of iterations (>100)
            seg_chunks = seg_chunks.union(seg_chunks_list[iter1])
        del seg_chunks_list

        # persist through stitch
        # any forced persistence will result in costly
        # pickling, lz4 compressed numpy array should help
        seg_chunks.persist(StorageLevel.MEMORY_AND_DISK)

        # stitch the segmentation chunks
        # (preserves initial partitioning)
        mapped_seg_chunks = segmentor.stitch(seg_chunks)

        def prepend_key(item):
            subvol, _ = item
            return (subvol.sv_index, item)

        mapped_seg_chunks = mapped_seg_chunks.map(prepend_key)

        if self.config_data["options"]["parallelwrites"] > 0:
            # repartition to fewer partition if there is write bandwidth limits to DVID
            # (coalesce() doesn't balance the partitions, so we opt for a full shuffle.)
            mapped_seg_chunks = mapped_seg_chunks.repartition(
                self.config_data["options"]["parallelwrites"])

        # write data to DVID
        self.sparkdvid_context.foreach_write_labels3d(
            self.config_data["dvid-info"]["segmentation-name"],
            mapped_seg_chunks, self.config_data["dvid-info"]["roi"], mutateseg)
        self.workflow_entry_exit_printer.write_data(
            "Wrote DVID labels")  # write to logger after spark job

        if self.config_data["options"]["debug"]:
            # grab 256 cube from ROI
            node_service = retrieve_node_service(
                self.config_data["dvid-info"]["dvid-server"],
                self.config_data["dvid-info"]["uuid"], resource_server,
                resource_port)

            substacks, packing_factor = node_service.get_roi_partition(
                str(self.config_data["dvid-info"]["roi"]),
                256 // self.blocksize)

            if self.resource_server != "":
                label_volume = node_service.get_labels3D(
                    str(self.config_data["dvid-info"]["segmentation-name"]),
                    (256, 256, 256),
                    (substacks[0].z, substacks[0].y, substacks[0].x),
                    compress=True,
                    throttle=False)
            else:
                label_volume = node_service.get_labels3D(
                    str(self.config_data["dvid-info"]["segmentation-name"]),
                    (256, 256, 256),
                    (substacks[0].z, substacks[0].y, substacks[0].x),
                    compress=True)

            # dump checksum
            import hashlib
            md5 = hashlib.md5()
            md5.update(label_volume)
            print("DEBUG: ", md5.hexdigest())
    def execute(self):
        """Ingest a stack of 2D grayscale slices as 3D blocks.

        Slices ``minslice``..``maxslice`` are read from the ``basename``
        filename template (a local path, or a ``gs://<bucket>/path`` location),
        processed ``BLKSIZE * numblocklayers`` slices at a time, and cut into
        rows of blocks.  Each row is either written to ``output-dir`` as a
        ``.blocks`` file or, when no output directory is configured, POSTed to
        the DVID grayscale instance, whose extents metadata is updated at the
        end.

        Raises:
            Exception: if a configured offset is not a multiple of BLKSIZE.
        """
        from PIL import Image
        import numpy
        import os

        iterslices = self.BLKSIZE * self.config_data["options"]["numblocklayers"]

        minslice = self.config_data["minslice"]
        # filename template; formatted with the slice number via the % operator
        basename = self.config_data["basename"]

        # format should be gs://<bucket>/path
        gbucketname = ""
        gpath = ""
        if basename.startswith('gs://'):
            # parse google bucket names
            tempgs = basename.split('//')
            bucketpath = tempgs[1].split('/')
            gbucketname = bucketpath[0]
            gpath = '/'.join(bucketpath[1:])

        server = None

        xoffset = yoffset = zoffset = 0

        if "offset" in self.config_data["options"]:
            xoffset, yoffset, zoffset = self.config_data["options"]["offset"][0:3]

            if any(off % self.BLKSIZE != 0 for off in (xoffset, yoffset, zoffset)):
                raise Exception("offset not block aligned")

            # Floor division: the offsets are block indices and must stay
            # integers (true division would turn them into floats on Python 3
            # and leak floats into the extents metadata written below).
            xoffset //= self.BLKSIZE
            yoffset //= self.BLKSIZE
            zoffset //= self.BLKSIZE

        # this will start the Z block writing at the specified offset
        # (changes default behavior when loading nonzero starting image slice)
        zoffset -= (minslice // self.BLKSIZE)

        # blocks go to DVID unless an output directory is configured
        write_to_dvid = ("output-dir" not in self.config_data
                         or self.config_data["output-dir"] == "")

        # create metadata before workers start if using DVID
        if write_to_dvid:
            server = self.config_data["dvid-info"]["dvid-server"]
            uuid = self.config_data["dvid-info"]["uuid"]
            grayname = self.config_data["dvid-info"]["grayname"]
            resource_server = str(self.resource_server)
            resource_port = self.resource_port

            # create grayscale type
            node_service = retrieve_node_service(server, uuid, resource_server, resource_port, self.APPNAME)
            node_service.create_grayscale8(str(grayname), self.BLKSIZE)

        def open_image(slicenum):
            # Open the image for one slice, from local disk or a google bucket.
            if gbucketname == "":
                return Image.open(basename % slicenum)

            from gcloud import storage
            from io import BytesIO
            client = storage.Client()
            gbucket = client.get_bucket(gbucketname)
            gblob = gbucket.get_blob(gpath % slicenum)

            # write to bytes which implements file interface
            gblobfile = BytesIO()
            gblob.download_to_file(gblobfile)
            gblobfile.seek(0)
            return Image.open(gblobfile)

        def img2npy(slicenum):
            try:
                return slicenum, numpy.array(open_image(slicenum))
            except Exception:
                # just return a blank slice -- will be handled downstream
                # (a zero-height array yields no lines in npy2lines)
                return slicenum, numpy.zeros((0,0), numpy.uint8)

        blocksize = self.BLKSIZE
        blocklimit = self.BLOCKLIMIT

        # map numpy array into y lines of block height
        def npy2lines(arrpair):
            z, arr = arrpair
            ysize, xsize = arr.shape
            # line width is xsize rounded up to a whole number of blocks
            padded_xsize = ((xsize-1) // blocksize + 1)*blocksize
            npylines = []

            for itery in range(0, ysize, blocksize):
                line = numpy.zeros((blocksize, padded_xsize), numpy.uint8)
                # the last line may be shorter than a full block; the rest stays zero
                uppery = min(blocksize, ysize - itery)
                line[0:uppery, 0:xsize] = arr[itery:itery+blocksize, 0:xsize]
                npylines.append((itery // blocksize, (z, line)))

            return npylines

        # map y lines => (y, blocks)
        def lines2blocks(linespair):
            y, linesp = linespair

            blockdata = None
            for z, line in linesp:
                if blockdata is None:
                    # allocate lazily: the width is only known from the first line
                    _, xsize = line.shape
                    blockdata = numpy.zeros((iterslices, blocksize, xsize), numpy.uint8)

                blockdata[(z - minslice)%iterslices, :, :] = line
            return y, blockdata

        # map multilayer of blocks to an array of single layer blocks
        def multi2single(yblocks):
            ybindex, blocks = yblocks
            num_layers = iterslices // blocksize
            return [((ybindex, layer), blocks[layer*blocksize:(layer*blocksize+blocksize),:,:])
                    for layer in range(0, num_layers)]

        for startslice in range(self.config_data["minslice"], self.config_data["maxslice"]+1, iterslices):
            # parallelize images across many machines
            imgs = self.sc.parallelize(list(range(startslice, startslice+iterslices)), iterslices)

            npy_images = imgs.map(img2npy)
            npy_lines = npy_images.flatMap(npy2lines)

            # reduce y lines into DVID blocks
            groupedlines = npy_lines.groupByKey()
            yblocks = groupedlines.map(lines2blocks)
            yblockssplit = yblocks.flatMap(multi2single)

            if not write_to_dvid:
                # write blocks to disk for separate post-process -- write directly to DVID eventually?
                output_dir = self.config_data["output-dir"]

                def write2disk(yblocks):
                    (ybindex, layer), blocks = yblocks
                    zbindex = startslice // blocksize + layer
                    _, _, xsize = blocks.shape

                    outdir = output_dir + "/" + ("%05d" % zbindex) + ".z/"
                    filename = outdir + ("%05d" % ybindex) + "-" + str(xsize // blocksize) + ".blocks"

                    try:
                        os.makedirs(outdir)
                    except OSError:
                        # typically the directory already exists (several rows
                        # share a z directory); a real failure surfaces in open()
                        pass

                    # extract blocks from buffer and write to disk
                    with open(filename, 'wb') as fout:
                        for iterx in range(0, xsize, blocksize):
                            # copy() makes the slice contiguous so it can be written as a buffer
                            fout.write(blocks[:,:,iterx:iterx+blocksize].copy())

                yblockssplit.foreach(write2disk)
            else:
                # write to dvid
                appname = self.APPNAME
                delimiter = self.config_data["options"]["blankdelimiter"]

                def write2dvid(yblocks):
                    from libdvid import ConnectionMethod
                    import numpy
                    node_service = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

                    # get block coordinates
                    (ybindex, layer), blocks = yblocks
                    zbindex = startslice // blocksize + layer
                    _, _, xsize = blocks.shape

                    def post_run(xstart, run):
                        # POST a run of consecutive non-blank blocks starting at block column xstart
                        node_service.custom_request(
                            str((grayname + "/blocks/%d_%d_%d/%d") % (xstart+xoffset, ybindex+yoffset, zbindex+zoffset, len(run))),
                            b"".join(run), ConnectionMethod.POST)

                    xbindex = 0 # assume x starts at 0!!
                    # serialized blocks of the current run; kept as bytes
                    # because tobytes() returns bytes, not str
                    run = []

                    for iterx in range(0, xsize, blocksize):
                        block = blocks[:,:,iterx:iterx+blocksize].copy()
                        vals = numpy.unique(block)
                        if len(vals) == 1 and vals[0] == delimiter:
                            # blank block: skip it, pushing the run that precedes it (if any)
                            if run:
                                post_run(xbindex, run)
                                run = []
                        else:
                            if not run:
                                # first block of a new run
                                xbindex = iterx // blocksize
                            run.append(block.tobytes())

                            if blocklimit > 0 and len(run) >= blocklimit:
                                # run reached the per-request block limit
                                post_run(xbindex, run)
                                run = []

                    # write-out leftover blocks
                    if run:
                        post_run(xbindex, run)

                yblockssplit.foreach(write2dvid)

            self.workflow_entry_exit_printer.write_data("Ingested %d slices" % iterslices)

        # just fetch one image at driver to get dims
        width = height = 1
        try:
            img = open_image(minslice)
            width, height = img.width, img.height
        except Exception:
            # just set size to 1
            pass

        if write_to_dvid:
            # update metadata
            # NOTE(review): MaxPoint z adds both minslice and maxslice to the
            # block offset; kept as in the original -- confirm the intended extent.
            grayext = {}
            grayext["MinPoint"] = [xoffset*self.BLKSIZE,yoffset*self.BLKSIZE,zoffset*self.BLKSIZE+minslice]
            grayext["MaxPoint"] = [xoffset*self.BLKSIZE + width-1, yoffset*self.BLKSIZE + height-1, zoffset*self.BLKSIZE+minslice + self.config_data["maxslice"]]
            if not server.startswith("http://"):
                server = "http://" + server
            session = default_dvid_session()
            session.post(server + "/api/node/" + uuid + "/" + grayname + "/extents", json=grayext)
    def execute(self):
        """Compute a graph of adjacent segments with edge probabilities.

        The ROI is split into subvolumes which are processed in
        ``iteration-size`` batches: grayscale is fetched, the configured
        ``predict-voxels`` function is applied, the existing segmentation is
        read from DVID, and neuroproof features are extracted per subvolume.
        Features for the same edge/vertex are then combined, each edge is
        joined with its two vertices, and an edge probability is computed with
        the ``segment-classifier``.  The resulting graph is collected on the
        driver, the DVID graph instance is created, and the edge list is
        optionally printed (``debug``) and written to ``output-file``.
        """
        # TODO: handle 64 bit segmentation

        import importlib
        import json

        from pyspark import StorageLevel
        from DVIDSparkServices.reconutils.Segmentor import Segmentor

        self.chunksize = self.config_data["options"]["chunk-size"]

        # create datatype in the beginning
        node_service = retrieve_node_service(
            self.config_data["dvid-info"]["dvid-server"],
            self.config_data["dvid-info"]["uuid"], self.resource_server,
            self.resource_port)

        # grab ROI subvolumes and find neighbors
        distsubvolumes = self.sparkdvid_context.parallelize_roi(
            self.config_data["dvid-info"]["roi"], self.chunksize,
            self.contextbuffer, True)

        contextbuffer = self.contextbuffer
        # do not recompute ROI for each iteration
        distsubvolumes.persist()

        # instantiate the voxel prediction plugin
        predict_config = self.config_data["options"]["predict-voxels"]
        full_function_name = predict_config["function"]
        module_name, _, function_name = full_function_name.rpartition('.')
        module = importlib.import_module(module_name)
        vprediction_function = partial(getattr(module, function_name),
                                       **predict_config["parameters"])

        # determine number of iterations
        num_parts = len(distsubvolumes.collect())
        iteration_size = self.config_data["options"]["iteration-size"]
        if iteration_size == 0:
            # 0 means: process everything in a single iteration
            iteration_size = num_parts

        # ceiling division
        num_iters = num_parts // iteration_size
        if num_parts % iteration_size > 0:
            num_iters += 1

        feature_chunk_list = []

        # enable checkpointing if not empty
        checkpoint_dir = self.config_data["options"]["checkpoint-dir"]

        # enable rollback of iterations if necessary
        rollback = bool(self.config_data["options"]["checkpoint"])

        for iternum in range(0, num_iters):
            # it might make sense to randomly map partitions for selection
            # in case something pathological is happening -- if original partitioner
            # is randomish than this should be fine
            def subset_part(sid_data):
                (s_id, _data) = sid_data
                return (s_id % num_iters) == iternum

            # should preserve partitioner
            distsubvolumes_part = distsubvolumes.filter(subset_part)

            # get grayscale chunks with specified overlap
            gray_chunks = self.sparkdvid_context.map_grayscale8(
                distsubvolumes_part,
                self.config_data["dvid-info"]["grayscale"])

            pred_checkpoint_dir = ""
            if checkpoint_dir:
                pred_checkpoint_dir = checkpoint_dir + "/prediter-" + str(
                    iternum)

            # For now, we always read predictions if available, and always write them if not.
            # TODO: Add config settings to control read/write behavior.
            @Segmentor.use_block_cache(pred_checkpoint_dir,
                                       allow_read=True,
                                       allow_write=True)
            def predict_voxels(sv_gray):
                (_subvolume, gray) = sv_gray
                return vprediction_function(gray, None)

            vox_preds = gray_chunks.values().map(
                predict_voxels)  # predictions only
            vox_preds = distsubvolumes_part.values().zip(
                vox_preds)  # (subvolume, predictions)

            pdconf = self.config_data["dvid-info"]
            resource_server = self.resource_server
            resource_port = self.resource_port

            # retrieve segmentation and generate features
            def generate_features(vox_pred):
                import numpy
                (subvolume, pred) = vox_pred
                pred = numpy.ascontiguousarray(pred)

                # extract labelblks
                border = 1  # only one pixel needed to find edges

                # get sizes of box
                size_z = subvolume.box.z2 + 2 * border - subvolume.box.z1
                size_y = subvolume.box.y2 + 2 * border - subvolume.box.y1
                size_x = subvolume.box.x2 + 2 * border - subvolume.box.x1

                # retrieve data from box start position considering border
                # !! technically ROI is not respected but unwritten segmentation will be ignored since it will have 0-valued pixels.
                @auto_retry(3, pause_between_tries=60.0, logging_name=__name__)
                def get_seg():
                    node_service = retrieve_node_service(
                        pdconf["dvid-server"], pdconf["uuid"], resource_server,
                        resource_port)
                    # retrieve data from box start position
                    # Note: libdvid uses zyx order for python functions
                    # The fetch starts at the box minimum (z1/y1/x1) minus the
                    # border, matching the sizes computed above.
                    seg_offset = (subvolume.box.z1 - border,
                                  subvolume.box.y1 - border,
                                  subvolume.box.x1 - border)

                    if resource_server != "":
                        # a resource server is in use, so DVID-side throttling
                        # is disabled, as for the other label requests in this file
                        return node_service.get_labels3D(
                            str(pdconf["segmentation-name"]),
                            (size_z, size_y, size_x),
                            seg_offset,
                            throttle=False)
                    return node_service.get_labels3D(
                        str(pdconf["segmentation-name"]),
                        (size_z, size_y, size_x),
                        seg_offset)

                initial_seg = get_seg()

                # !!! potentially dangerous but needed for now
                initial_seg = initial_seg.astype(numpy.uint32)

                # crop the prediction's context buffer down to the 1-pixel border
                trim = contextbuffer - border
                pred2 = pred[trim:-trim, trim:-trim, trim:-trim, :].copy()
                _, _, _, num_chans = pred2.shape

                # call neuroproof and generate features
                from neuroproof import FocusedProofreading
                # "edges": [ edge ] where edge = [node1, node2, edgesize, all features...]
                # "vertices": [vertex ] where vertex = [id, size, all features...]
                features = FocusedProofreading.extract_features(
                    initial_seg, pred2)

                element_list = []
                # iterate edges and create ((node1, node2), features)
                if "Edges" in features:
                    # could have only one vertex in a partition and no edges
                    for edge in features["Edges"]:
                        n1 = edge["Id1"]
                        n2 = edge["Id2"]
                        # shift edge locations (x, y, z) by the subvolume origin
                        for loc_key in ("Loc1", "Loc2"):
                            edge[loc_key][0] += subvolume.box.x1
                            edge[loc_key][1] += subvolume.box.y1
                            edge[loc_key][2] += subvolume.box.z1

                        if n1 > n2:
                            n1, n2 = n2, n1
                        element_list.append(((n1, n2), (num_chans, edge)))

                # node features are keyed as (vertex id, -1)
                for node in features["Vertices"]:
                    n1 = node["Id"]
                    element_list.append(((n1, -1), (num_chans, node)))

                return element_list

            features = vox_preds.flatMap(generate_features)

            # retrieve previously computed RDD or save current RDD
            if checkpoint_dir != "":
                features = self.sparkdvid_context.checkpointRDD(
                    features, checkpoint_dir + "/featureiter-" + str(iternum),
                    rollback)

            # any forced persistence will result in costly
            # pickling, lz4 compressed numpy array should help
            features.persist(StorageLevel.MEMORY_AND_DISK_SER)

            feature_chunk_list.append(features)

        features = feature_chunk_list[0]

        for feature_chunk in feature_chunk_list[1:]:
            # this could cause a serialization problems if there are a large number of iterations (>100)
            features = features.union(feature_chunk)

        # grab num channels from boundary prediction
        features.persist(StorageLevel.MEMORY_AND_DISK_SER)
        first_feature = features.first()
        (_key1, _key2), (num_channels, _element) = first_feature

        # remove num channels from features
        def remove_num_channels(featurepair):
            _num_chans, feature = featurepair
            return feature

        features = features.mapValues(remove_num_channels)

        # merge edge and node features -- does not require reading classifier
        # node features are encoded as (vertex id, -1)
        def combine_edge_features(element1, element2):
            from neuroproof import FocusedProofreading

            if "Id2" in element1:
                # are edges
                return FocusedProofreading.combine_edge_features(
                    json.dumps(element1, cls=NumpyConvertingEncoder),
                    json.dumps(element2, cls=NumpyConvertingEncoder),
                    num_channels)
            else:
                # are vertices
                return FocusedProofreading.combine_vertex_features(
                    json.dumps(element1, cls=NumpyConvertingEncoder),
                    json.dumps(element2, cls=NumpyConvertingEncoder),
                    num_channels)

        features_combined = features.reduceByKey(combine_edge_features)

        #features_combined.persist()
        # TODO: option to serialize features to enable other analyses

        # join node and edge probs
        def retrieve_nodes(val):
            (_n1, n2), _features = val
            return n2 == -1

        def retrieve_edges(val):
            (_n1, n2), _features = val
            return n2 != -1

        node_features = features_combined.filter(retrieve_nodes)
        edge_features = features_combined.filter(retrieve_edges)

        # re-key nodes by id and edges by each of their two endpoints
        node_features = node_features.map(lambda x: (x[0][0], x[1]))
        edge1_features = edge_features.map(lambda x: (x[0][0], x[1]))
        edge2_features = edge_features.map(lambda x: (x[0][1], x[1]))

        # multiple edges with the same key
        edge1_node_features = edge1_features.leftOuterJoin(node_features)
        edge2_node_features = edge2_features.leftOuterJoin(node_features)

        def reset_edgekey(val):
            _key, (edge, node) = val
            n1 = edge["Id1"]
            n2 = edge["Id2"]
            if n1 > n2:
                n1, n2 = n2, n1
            return ((n1, n2), (edge, node))

        edge1_node_features = edge1_node_features.map(reset_edgekey)
        edge2_node_features = edge2_node_features.map(reset_edgekey)

        # (edge key, ((edge, node1), (edge, node2)))
        edge_node_features = edge1_node_features.join(edge2_node_features)

        # generate prob for each edge (JSON: body sizes, edge list with prob)
        classifierlocation = self.config_data["options"]["segment-classifier"]

        def compute_prob(edge_node_features):
            from neuroproof import FocusedProofreading
            classifier = FocusedProofreading.ComputeProb(
                str(classifierlocation), num_channels)

            res_list = []
            for edge_node_edge_node in edge_node_features:
                _edge_key, ((edge, node1), (_edge_dummy,
                                            node2)) = edge_node_edge_node
                weight = classifier.compute_prob(
                    json.dumps(edge, cls=NumpyConvertingEncoder),
                    json.dumps(node1, cls=NumpyConvertingEncoder),
                    json.dumps(node2, cls=NumpyConvertingEncoder))
                # node1, node2
                res_list.append(
                    (int(node1["Id"]), int(node2["Id"]), int(node1["Weight"]),
                     int(node2["Weight"]), int(edge["Weight"]), weight,
                     edge["Loc1"], edge["Loc2"]))

            return res_list

        # avoid loading large classifier for each small edge
        allprobs = edge_node_features.mapPartitions(compute_prob)

        # collect all edges and send to DVID (TODO: add option to dump to disk)
        allprobs_combined = allprobs.collect()

        bodyinfo = {}
        edges = []

        for edge_info in allprobs_combined:
            node1, node2, node1_size, node2_size, _edge_size, weight, loc1, loc2 = edge_info
            bodyinfo[node1] = node1_size
            bodyinfo[node2] = node2_size
            edges.append({
                "Id1": node1,
                "Id2": node2,
                "Weight": weight,
                "Loc1": loc1,
                "Loc2": loc2
            })

        bodies = [{"Id": key, "Weight": val} for (key, val) in bodyinfo.items()]

        graph = {}
        graph["Vertices"] = bodies
        graph["Edges"] = edges

        SAVE_TO_FILE = False
        if SAVE_TO_FILE:
            graph_filepath = '/tmp/graph-output.json'
            with open(graph_filepath, 'w') as f:
                self.workflow_entry_exit_printer.warn(
                    "Writing graph json to file:\n{}".format(graph_filepath))
                json.dump(graph,
                          f,
                          indent=4,
                          separators=(',', ': '),
                          cls=NumpyConvertingEncoder)
            self.workflow_entry_exit_printer.write_data(
                "Wrote graph to disk")  # write to logger after spark job

        UPLOAD_TO_DVID = True
        if UPLOAD_TO_DVID:
            # load entire graph into DVID
            # NOTE(review): only the graph instance is created here; the POST
            # of the graph contents below is commented out -- confirm intent.
            node_service.create_graph(
                str(self.config_data["dvid-info"]["graph-name"]))
            server = str(self.config_data["dvid-info"]["dvid-server"])
            #if not server.startswith("http://"):
            #    server = "http://" + server
            #session = default_dvid_session()
            #session.post(server + "/api/node/" + str(self.config_data["dvid-info"]["uuid"]) + "/" + str(self.config_data["dvid-info"]["graph-name"]) + "/subgraph", json=graph)
            #self.workflow_entry_exit_printer.write_data("Wrote DVID graph") # write to logger after spark job

        if self.config_data["options"]["debug"]:
            print("DEBUG:", json.dumps(graph, cls=NumpyConvertingEncoder))

        # write dvid to specified file (if provided)
        if "output-file" in self.config_data["options"] and self.config_data[
                "options"]["output-file"] != "":
            filename = self.config_data["options"]["output-file"]

            edgelist = [{
                "node1": edge["Id1"],
                "node2": edge["Id2"],
                "weight": edge["Weight"],
                "loc1": edge["Loc1"],
                "loc2": edge["Loc2"]
            } for edge in graph["Edges"]]

            npgraph = {}
            npgraph["edge_list"] = edgelist
            # context manager guarantees the file is flushed and closed
            with open(filename, 'w') as fout:
                fout.write(json.dumps(npgraph, cls=NumpyConvertingEncoder))
Beispiel #28
0
                def write2dvid(yblocks):
                    """Post the non-blank blocks of one row of blocks to DVID.

                    ``yblocks`` is ``((ybindex, layer), blocks)`` where ``blocks``
                    is a 3D array.  Its last axis is walked in steps of
                    ``blocksize``; a piece whose values are all equal to
                    ``delimiter`` is considered blank and is not sent.
                    Consecutive non-blank pieces form a run that is sent with a
                    single POST to
                    ``<grayname>/blocks/<x>_<y>_<z>/<number of blocks>``.  A run
                    is also cut as soon as it holds ``blocklimit`` blocks (only
                    when ``blocklimit > 0``).

                    The payload is built from ``bytes`` chunks because
                    ``ndarray.tobytes()`` returns ``bytes``; appending that to a
                    ``str`` accumulator raises ``TypeError`` on Python 3.

                    All other names (``server``, ``uuid``, ``slice``,
                    ``blocksize``, ``grayname``, the offsets, ``delimiter``,
                    ``blocklimit``, ...) come from the enclosing scope.
                    """
                    from libdvid import ConnectionMethod
                    import numpy
                    node_service = retrieve_node_service(
                        server, uuid, resource_server, resource_port, appname)

                    # get block coordinates
                    # NOTE: ``slice`` is a variable of the enclosing scope that
                    # shadows the builtin -- presumably the z position of the
                    # current image slice; confirm against the caller.
                    (ybindex, layer), blocks = yblocks
                    zbindex = slice // blocksize + layer
                    _, _, xsize = blocks.shape  # also enforces a 3D input

                    def post_run(run_start, chunks):
                        # send one run of consecutive non-blank blocks that
                        # starts at x block index ``run_start``
                        node_service.custom_request(
                            str((grayname + "/blocks/%d_%d_%d/%d") %
                                (run_start + xoffset, ybindex + yoffset,
                                 zbindex + zoffset, len(chunks))),
                            b"".join(chunks), ConnectionMethod.POST)

                    xbindex = 0   # x block index at which the current run starts
                    pending = []  # serialized blocks of the current run

                    for iterx in range(0, xsize, blocksize):
                        # tobytes() already emits a C-ordered copy of the view,
                        # so no explicit copy of the slice is needed
                        block = blocks[:, :, iterx:iterx + blocksize]
                        vals = numpy.unique(block)
                        if len(vals) == 1 and vals[0] == delimiter:
                            # blank block: it ends the current run, if any
                            if pending:
                                post_run(xbindex, pending)
                                pending = []
                            continue

                        if not pending:
                            # first block of a new run
                            xbindex = iterx // blocksize
                        pending.append(block.tobytes())

                        if blocklimit > 0 and len(pending) >= blocklimit:
                            # run reached the configured maximum length
                            post_run(xbindex, pending)
                            pending = []

                    # write-out leftover blocks
                    if pending:
                        post_run(xbindex, pending)
    def execute(self):
        """Segment the configured ROI with Spark and write the labels to DVID.

        The steps, in the order they appear below:

        1. Create the labelblk instance named by
           ``dvid-info/segmentation-name`` and resolve ``options/mutateseg``
           to ``"yes"`` or ``"no"``.
        2. Partition the ROI into subvolumes with
           ``sparkdvid_context.parallelize_roi``.
        3. Import and instantiate the class named by
           ``options/segmentor/class``.
        4. Process the subvolumes in ``num_iters`` groups (a subvolume with id
           ``sid`` belongs to group ``sid % num_iters``).  In each group,
           segmentation is loaded from the checkpoint block store for the
           subvolumes reported as cached, and computed with
           ``segmentor.segment`` for the others.
        5. Union the results of all groups, stitch them with
           ``segmentor.stitch``, optionally repartition to
           ``options/parallelwrites`` partitions, and write them with
           ``sparkdvid_context.foreach_write_labels3d``.
        6. When ``options/debug`` is set, read back a 256x256x256 label volume
           and print its md5 digest.

        NOTE(review): if the ROI yields no subvolumes, this method fails
        (``ZeroDivisionError`` when ``iteration-size`` is 0, otherwise
        ``IndexError`` on ``seg_chunks_list[0]``).
        """
        from pyspark import SparkContext
        from pyspark import StorageLevel
        from DVIDSparkServices.reconutils.Segmentor import Segmentor
        resource_server = self.resource_server
        resource_port = self.resource_port
        self.chunksize = self.config_data["options"]["chunk-size"]

        # create datatype in the beginning
        mutateseg = self.config_data["options"]["mutateseg"]
        node_service = retrieve_node_service(self.config_data["dvid-info"]["dvid-server"], 
                self.config_data["dvid-info"]["uuid"], resource_server, resource_port)
        success = node_service.create_labelblk(str(self.config_data["dvid-info"]["segmentation-name"]))
        # check whether seg should be mutated:
        # "auto" becomes "yes" only when create_labelblk reported failure
        # (presumably because the instance already exists -- confirm with libdvid)
        if (not success and mutateseg == "auto") or mutateseg == "yes":
            mutateseg = "yes"
        else:
            mutateseg = "no"

        # grab ROI subvolumes and find neighbors
        distsubvolumes = self.sparkdvid_context.parallelize_roi(
                self.config_data["dvid-info"]["roi"],
                self.chunksize, self.overlap // 2,
                True,
                self.config_data["dvid-info"]["partition-method"],
                self.config_data["dvid-info"]["partition-filter"] )

        # do not recompute ROI for each iteration
        distsubvolumes.persist()

        # NOTE(review): this brings every subvolume to the driver only to count them
        num_parts = len(distsubvolumes.collect())

        # Instantiate the correct Segmentor subclass (must be installed)
        # The config value is a dotted path: everything before the last dot is
        # the module, the last component is the class name.
        import importlib
        full_segmentor_classname = self.config_data["options"]["segmentor"]["class"]
        segmentor_classname = full_segmentor_classname.split('.')[-1]
        module_name = '.'.join(full_segmentor_classname.split('.')[:-1])
        segmentor_mod = importlib.import_module(module_name)
        segmentor_class = getattr(segmentor_mod, segmentor_classname)
        segmentor = segmentor_class(self.sparkdvid_context, self)

        # determine number of iterations
        # (an iteration-size of 0 means "everything in a single iteration")
        iteration_size = self.config_data["options"]["iteration-size"]
        if iteration_size == 0:
            iteration_size = num_parts

        # ceiling division: a partial last group still needs its own iteration
        num_iters = num_parts // iteration_size
        if num_parts % iteration_size > 0:
            num_iters += 1

        # one (subvol, (seg, max_id)) RDD per iteration
        seg_chunks_list = []

        # enable checkpointing if not empty
        checkpoint_dir = self.config_data["options"]["checkpoint-dir"]

        # enable rollback of iterations if necessary
        rollback_seg = (self.config_data["options"]["checkpoint"] == "segmentation")
       
        # enable rollback of boundary prediction if necessary
        rollback_pred = (rollback_seg or self.config_data["options"]["checkpoint"] == "voxel")

        for iternum in range(0, num_iters):
            # Disable rollback by setting checkpoint dirs to empty
            gray_checkpoint_dir = mask_checkpoint_dir = pred_checkpoint_dir = sp_checkpoint_dir = seg_checkpoint_dir = ""
            if checkpoint_dir != "":
                pred_checkpoint_dir = checkpoint_dir + "/prediter-" + str(iternum)
                seg_checkpoint_dir = checkpoint_dir + "/segiter-" + str(iternum)

                # Grayscale and SP caches are only written to as a "debug" feature
                if self.config_data["options"]["debug"]:
                    gray_checkpoint_dir = checkpoint_dir + "/grayiter-" + str(iternum)
                    mask_checkpoint_dir = checkpoint_dir + "/maskiter-" + str(iternum)
                    sp_checkpoint_dir = checkpoint_dir + "/spiter-" + str(iternum)

                    # Build a file-name prefix from the ROI name, plus the
                    # partition method and filter when they are not the defaults
                    roi = self.config_data["dvid-info"]["roi"]
                    method = self.config_data["dvid-info"]["partition-method"]
                    roi_description = roi
                    if method != "ask-dvid":
                        roi_description += "-" + method
                    roi_filter = self.config_data["dvid-info"]["partition-filter"]
                    if roi_filter != "all":
                        roi_description += "-" + roi_filter


                    # Spit out a JSON of the Subvolume list boxes
                    # (this block runs, and rewrites the same files, on every iteration)
                    ids_and_subvols = distsubvolumes.collect()
                    subvols = [v for (_k,v) in ids_and_subvols]
                    subvol_bounds_json = Subvolume.subvol_list_to_json( subvols )
                    mkdir_p(checkpoint_dir)
                    with open(checkpoint_dir + "/{}-subvol-bounds.json".format(roi_description), 'w') as f:
                        f.write( subvol_bounds_json )

                    # Also spit out JSON RLE for writing the modified ROI directly to DVID, in case that's useful
                    # NOTE(review): relies on `json` being imported at module level -- not visible here
                    all_blocks = Subvolume.subvol_list_all_blocks(subvols)
                    rle = runlength_encode(all_blocks, assume_sorted=False)
                    with open(checkpoint_dir + "/{}-dvid-blocks.json".format(roi_description), 'w') as f:
                        json.dump(rle.tolist(), f)

            # it might make sense to randomly map partitions for selection
            # in case something pathological is happening -- if original partitioner
            # is randomish than this should be fine
            def subset_part(sid_data):
                # keep only the subvolumes assigned to the current iteration
                (sid, _data) = sid_data
                if (sid % num_iters) == iternum:
                    return True
                return False

            # should preserve partitioner
            distsubvolumes_part = distsubvolumes.filter(subset_part)

            # The segmentation cache is only consulted when rolling back to
            # segmentation checkpoints; an empty directory disables the lookup.
            if rollback_seg:
                readable_seg_checkpoint_dir = seg_checkpoint_dir
            else:
                readable_seg_checkpoint_dir = ""

            subvols_with_seg_cache, subvols_without_seg_cache = \
                CreateSegmentation._split_subvols_by_cache_status( readable_seg_checkpoint_dir,
                                                                   distsubvolumes_part.values().collect() )

            ##
            ## CACHED SUBVOLS
            ##    
            # one partition per subvolume; `or None` falls back to Spark's
            # default partition count when the list is empty
            cached_subvols_rdd = self.sparkdvid_context.sc.parallelize(subvols_with_seg_cache, len(subvols_with_seg_cache) or None)
    
            # Load as many seg blocks from cache as possible
            if subvols_with_seg_cache:
                def retrieve_seg_from_cache(subvol):
                    # read the whole cached block for this subvolume (including its border)
                    z1, y1, x1, z2, y2, x2 = subvol.box_with_border
                    block_bounds = ((z1, y1, x1), (z2, y2, x2))
                    block_store = H5BlockStore(seg_checkpoint_dir, mode='r')
                    h5_block = block_store.get_block( block_bounds )
                    return h5_block[:]
                cached_seg_chunks = cached_subvols_rdd.map(retrieve_seg_from_cache)
            else:
                cached_seg_chunks = self.sparkdvid_context.sc.parallelize([]) # empty rdd

            cached_seg_chunks.persist()
            cached_seg_max_ids = cached_seg_chunks.map(np.max)
            
            # (subvol, (seg, max_id))
            cached_seg_chunks_kv = cached_subvols_rdd.zip( cached_seg_chunks.zip(cached_seg_max_ids) )

            ##
            ## UNCACHED SUBVOLS
            ##    
            uncached_subvols = self.sparkdvid_context.sc.parallelize(subvols_without_seg_cache, len(subvols_without_seg_cache) or None)
            uncached_subvols.persist()

            def prepend_sv_index(subvol):
                return (subvol.sv_index, subvol)
            uncached_subvols_kv_rdd = uncached_subvols.map(prepend_sv_index)

            # get grayscale chunks with specified overlap
            uncached_sv_and_gray = self.sparkdvid_context.map_grayscale8(uncached_subvols_kv_rdd,
                                                                         self.config_data["dvid-info"]["grayscale"])

            # presumably extracts the grayscale volume from each record --
            # confirm against select_item
            uncached_gray_vols = select_item(uncached_sv_and_gray, 1, 1)

            # small hack since segmentor is unaware for current iteration
            # perhaps just declare the segment function to have an arbitrary number of parameters
            # NOTE: this is an exact type comparison on purpose -- only the base
            # Segmentor receives the checkpoint arguments; any subclass is called
            # with the two-argument form.
            if type(segmentor) == Segmentor:
                computed_seg_chunks = segmentor.segment(uncached_subvols, uncached_gray_vols,
                                                        gray_checkpoint_dir, mask_checkpoint_dir, pred_checkpoint_dir, sp_checkpoint_dir, seg_checkpoint_dir,
                                                        rollback_pred, False, rollback_seg)
            else:
                computed_seg_chunks = segmentor.segment(uncached_subvols, uncached_gray_vols)

            computed_seg_chunks.persist()
            computed_seg_max_ids = computed_seg_chunks.map( np.max )
            
            # (subvol, (seg, max_id))
            computed_seg_chunks_kv = uncached_subvols.zip( computed_seg_chunks.zip(computed_seg_max_ids) )
        
            ##
            ## FINAL LIST: COMBINED CACHED+UNCACHED
            ##
        
            # (subvol, (seg, max_id))
            seg_chunks = cached_seg_chunks_kv.union(computed_seg_chunks_kv)
            seg_chunks.persist(StorageLevel.MEMORY_AND_DISK)

            seg_chunks_list.append(seg_chunks)

        # merge the per-iteration RDDs into a single RDD
        seg_chunks = seg_chunks_list[0]

        for iter1 in range(1, len(seg_chunks_list)):
            # ?? does this preserve the partitioner (yes, if num partitions is the same)
            # this could cause a serialization problems if there are a large number of iterations (>100)
            seg_chunks = seg_chunks.union(seg_chunks_list[iter1])
        del seg_chunks_list

        # persist through stitch
        # any forced persistence will result in costly
        # pickling, lz4 compressed numpy array should help
        seg_chunks.persist(StorageLevel.MEMORY_AND_DISK)

        # stitch the segmentation chunks
        # (preserves initial partitioning)
        mapped_seg_chunks = segmentor.stitch(seg_chunks)
        
        def prepend_key(item):
            # key each record by its subvolume index: (sv_index, (subvol, ...))
            subvol, _ = item
            return (subvol.sv_index, item)
        mapped_seg_chunks = mapped_seg_chunks.map(prepend_key)
       
        if self.config_data["options"]["parallelwrites"] > 0:
            # repartition to fewer partition if there is write bandwidth limits to DVID
            # (coalesce() doesn't balance the partitions, so we opt for a full shuffle.)
            mapped_seg_chunks = mapped_seg_chunks.repartition(self.config_data["options"]["parallelwrites"])

        # write data to DVID
        self.sparkdvid_context.foreach_write_labels3d(self.config_data["dvid-info"]["segmentation-name"], mapped_seg_chunks, self.config_data["dvid-info"]["roi"], mutateseg)
        self.workflow_entry_exit_printer.write_data("Wrote DVID labels") # write to logger after spark job

        if self.config_data["options"]["debug"]:
            # grab 256 cube from ROI 
            node_service = retrieve_node_service(self.config_data["dvid-info"]["dvid-server"], 
                    self.config_data["dvid-info"]["uuid"], resource_server, resource_port)
            
            substacks, packing_factor = node_service.get_roi_partition(str(self.config_data["dvid-info"]["roi"]),
                                                                       256 // self.blocksize)

            # throttling is switched off only when a resource server is configured
            if self.resource_server != "":
                label_volume = node_service.get_labels3D( str(self.config_data["dvid-info"]["segmentation-name"]), 
                                                          (256,256,256),
                                                          (substacks[0].z, substacks[0].y, substacks[0].x),
                                                          compress=True, throttle=False )
            else:
                label_volume = node_service.get_labels3D( str(self.config_data["dvid-info"]["segmentation-name"]), 
                                                          (256,256,256),
                                                          (substacks[0].z, substacks[0].y, substacks[0].x),
                                                          compress=True )


            # dump checksum
            import hashlib
            md5 = hashlib.md5()
            md5.update( label_volume )
            print("DEBUG: ", md5.hexdigest())
        def write_blocks(part_vol):
            """Write the non-blank x-ranges of one partition's volume to DVID.

            ``part_vol`` is a ``(partition, data)`` pair; ``data`` is a 3D
            array whose last axis must be a multiple of ``blksize``.  Blocks
            along that axis whose values all equal ``delimiter`` are skipped;
            each run of consecutive remaining blocks is copied out and put to
            ``dataname`` (as labels, or as raw data when ``israw``) and/or to
            ``dataname_lossy``, whichever is not ``None``.

            Raises ``ValueError`` when the last axis is not block aligned.
            """
            logger = logging.getLogger(__name__)
            part, data = part_vol
            offset = part.get_offset()
            reloffset = part.get_reloffset()
            _, _, x_size = data.shape
            if x_size % blksize:
                # check if padded
                raise ValueError("Data is not block aligned")

            start_z = offset.z + reloffset.z
            start_y = offset.y + reloffset.y
            start_x = offset.x + reloffset.x
            shiftedoffset = (start_z, start_y, start_x)
            logger.info("Starting WRITE of partition at: {} size: {}".format(shiftedoffset, data.shape))
            node_service = retrieve_node_service(server, uuid, resource_server, resource_port, appname)

            def put_options():
                # Extra keyword arguments for a put: throttling is switched off
                # when a resource server is in use or DVID runs on localhost.
                # Called right before each put, so the check happens exactly
                # where the inline condition used to be.
                if resource_server != "" or dvid_info["dvid-server"].startswith("http://127.0.0.1"):
                    return {"throttle": False}
                return {}

            # Indexes (along X) of every block that is not entirely `delimiter`.
            # Z and Y are irrelevant here, so they are recorded as 0.
            occupied = [
                (0, 0, index)
                for index, x0 in enumerate(range(0, x_size, blksize))
                if not (data[:, :, x0:x0+blksize] == delimiter).all()
            ]

            # Runs of adjacent occupied blocks: [[Z,Y,X1,X2], [Z,Y,X1,X2], ...]
            runs = runlength_encode(occupied, True)

            # X2 is inclusive; make it exclusive
            runs[:,-1] += 1

            # Keep only (X1, X2) and scale block indexes to pixels
            pixel_ranges = blksize * runs[:, 2:4]

            # iterate through contiguous blocks and write to DVID
            # TODO: write compressed data directly into DVID
            for (x_begin, x_stop) in pixel_ranges:
                with Timer() as copy_timer:
                    datacrop = data[:,:,x_begin:x_stop].copy()
                logger.info("Copied {}:{} in {:.3f} seconds".format(x_begin, x_stop, copy_timer.seconds))

                data_offset_zyx = (start_z, start_y, start_x + x_begin)

                if dataname is not None:
                    with Timer() as put_timer:
                        if israw:
                            logger.info("STARTING Put: raw block {}".format(data_offset_zyx))
                            options = put_options()
                            node_service.put_gray3D(dataname, datacrop, data_offset_zyx, compress=False, **options)
                        else:
                            logger.info("STARTING Put: labels block {}".format(data_offset_zyx))
                            options = put_options()
                            node_service.put_labels3D(dataname, datacrop, data_offset_zyx, compress=True, **options)
                    logger.info("Put block {} in {:.3f} seconds".format(data_offset_zyx, put_timer.seconds))

                if dataname_lossy is not None:
                    logger.info("STARTING Put: lossy block {}".format(data_offset_zyx))
                    with Timer() as put_lossy_timer:
                        options = put_options()
                        node_service.put_gray3D(dataname_lossy, datacrop, data_offset_zyx, compress=False, **options)
                    logger.info("Put lossy block {} in {:.3f} seconds".format(data_offset_zyx, put_lossy_timer.seconds))