Example #1
    def _stop(self):
        pointScanner.PointScanner._stop(self, send_stop=False)
        self.progress.send(self)
        t_ = time.time()

        logger.info('Finished tile acquisition')
        if self._backend == 'cluster':
            logger.info(
                'Waiting for spoolers to empty and for base levels to be built'
            )
        self.P.finish_base_tiles()

        if self._backend == 'cluster':
            logger.info('Base tiles built')

        logger.info('Completing pyramid (dt = %3.2f)' % (time.time() - t_))
        self.P.update_pyramid()

        if self._backend == 'cluster':
            from PYME.IO import clusterIO
            clusterIO.put_file(self.P.base_dir + '/metadata.json',
                               self.P.mdh.to_JSON().encode())
        else:
            with open(os.path.join(self._tiledir, 'metadata.json'), 'w') as f:
                f.write(self.P.mdh.to_JSON())

        logger.info('Pyramid complete (dt = %3.2f)' % (time.time() - t_))

        self.on_stop.send(self)
        self.progress.send(self)
Example #2
    def prepare(self):
        """
        Do any setup work - e.g. uploading metadata required before the rule is triggered

        Returns
        -------

        post_args : dict
            a dictionary with arguments to pass to RulePusher._post_rule() - specifically timeout, max_tasks, release_start, release_end

        """
        #set up results file:
        logging.debug('resultsURI: ' + self.worker_resultsURI)
        clusterResults.fileResults(self.worker_resultsURI + '/MetaData',
                                   self.mdh)

        # defer copying events to after series completion
        #clusterResults.fileResults(self.worker_resultsURI + '/Events', self.ds.getEvents())

        # set up metadata file which is used for deciding how to launch the analysis
        clusterIO.put_file(self.resultsMDFilename,
                           self.mdh.to_JSON().encode(),
                           serverfilter=self.serverfilter)

        #wait until clusterIO caches clear to avoid replicating the results file.
        #time.sleep(1.5) #moved inside polling thread so launches will run quicker

        self._next_release_start = self.start_at
        self.frames_outstanding = self.total_frames - self._next_release_start
        if self.data_complete:
            return dict(max_tasks=self.total_frames)
        return {}
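
The docstring above says the returned dict is forwarded to RulePusher._post_rule(); a minimal sketch of how a caller might consume it, assuming _post_rule accepts those keys as keyword arguments (the pusher object and this call pattern are placeholders, not verified against the actual API):

post_args = pusher.prepare()    # e.g. {'max_tasks': total_frames} once the data is complete
pusher._post_rule(**post_args)  # keyword names taken from the docstring above
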
Example #3
def test_dircache_purge():
    testdata = b'foo bar\n'
    for i in range(1050):
        clusterIO.put_file('_testing/lots_of_folders/test_%d/test.txt' % i,
                           testdata, 'TEST')

        listing = clusterIO.listdir('_testing/lots_of_folders/test_%d/' % i,
                                    'TEST')
Example #4
    def StartSpool(self):
        sp.Spooler.StartSpool(self)

        logger.debug('Starting spooling: %s' % self.seriesName)

        if self._aggregate_h5:
            # NOTE: allow a longer timeout than normal here, as __aggregate with metadata waits for a lock on the
            # server side before actually adding (and is therefore susceptible to longer latencies than most
            # operations). FIXME - remove the server-side lock.
            clusterIO.put_file('__aggregate_h5/' + self.seriesName + '/metadata.json',
                               self.md.to_JSON().encode(),
                               serverfilter=self.clusterFilter, timeout=3)
        else:
            clusterIO.put_file(self.seriesName + '/metadata.json',
                               self.md.to_JSON().encode(),
                               serverfilter=self.clusterFilter)
Example #5
def test_double_put():
    """Trying to put the same file twice should cause an error"""
    testdata = b'foo bar\n'

    clusterIO.put_file('_testing/test_d.txt', testdata, 'TES1')
    
    try:
        clusterIO.put_file('_testing/test_d.txt', testdata, 'TES1')
        raise AssertionError('Second put attempt did not raise an error')
    except RuntimeError:
        #we want to generate this error
        pass
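
The same check reads more directly with pytest.raises; a sketch assuming pytest is available to the test suite and relying on the same clusterIO import as the test above (the filename is a fresh placeholder so it does not collide):

import pytest

def test_double_put_pytest():
    testdata = b'foo bar\n'
    clusterIO.put_file('_testing/test_d2.txt', testdata, 'TES1')

    # the second put of the same object should be rejected by the server
    with pytest.raises(RuntimeError):
        clusterIO.put_file('_testing/test_d2.txt', testdata, 'TES1')
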
Example #6
    def StopSpool(self):
        self._dPoll = False
        sp.Spooler.StopSpool(self)

        logger.debug('Stopping spooling %s' % self.seriesName)

        if self._aggregate_h5:
            clusterIO.put_file('__aggregate_h5/' + self.seriesName +
                               '/final_metadata.json',
                               self.md.to_JSON().encode(),
                               serverfilter=self.clusterFilter)

            #save the acquisition events as json - TODO - consider a binary format as the events
            #can be quite numerous
            clusterIO.put_file('__aggregate_h5/' + self.seriesName +
                               '/events.json',
                               self.evtLogger.to_JSON().encode(),
                               serverfilter=self.clusterFilter)

        else:
            clusterIO.put_file(self.seriesName + '/final_metadata.json',
                               self.md.to_JSON().encode(),
                               serverfilter=self.clusterFilter)

            #save the acquisition events as json - TODO - consider a binary format as the events
            #can be quite numerous
            clusterIO.put_file(self.seriesName + '/events.json',
                               self.evtLogger.to_JSON().encode(),
                               serverfilter=self.clusterFilter)
Example #7
    def StartSpool(self):
        sp.Spooler.StartSpool(self)

        logger.debug('Starting spooling: %s' % self.seriesName)

        if self._aggregate_h5:
            clusterIO.put_file('__aggregate_h5/' + self.seriesName +
                               '/metadata.json',
                               self.md.to_JSON().encode(),
                               serverfilter=self.clusterFilter)
        else:
            clusterIO.put_file(self.seriesName + '/metadata.json',
                               self.md.to_JSON().encode(),
                               serverfilter=self.clusterFilter)
Example #8
def mkdir(request, basedir):
    from PYME.IO import clusterIO
    newDirectory = request.POST.get('newDirectory',
                                    request.GET.get('newDirectory', None))

    if newDirectory is None or newDirectory == '':
        return HttpResponseForbidden('No directory name specified')

    newDirectory = (basedir + newDirectory).rstrip('/') + '/'

    if clusterIO.exists(newDirectory) or clusterIO.exists(newDirectory[:-1]):
        return HttpResponseForbidden('Directory already exists')

    clusterIO.put_file(newDirectory, '')

    return HttpResponse(newDirectory)
Example #9
def upload_files(request, directory):
    from PYME.IO import clusterIO

    files = request.FILES.getlist('file')
    for file in files:
        targetFilename = directory + file.name
        if clusterIO.exists(targetFilename):
            return HttpResponseForbidden(
                'Upload failed [no files uploaded]. %s already exists on cluster'
                % targetFilename)

    for file in files:
        targetFilename = directory + file.name
        clusterIO.put_file(targetFilename, file.read())

    return HttpResponseRedirect(request.META['HTTP_REFERER'])
Example #10
def test_single_put():
    testdata = b'foo bar\n'
    t = time.time()
    clusterIO.put_file('_testing/test.txt', testdata, 'TEST')

    print('putting a small file took %3.5f s' % (time.time() - t))

    t = time.time()
    clusterIO.put_file('_testing/test1.txt', testdata, 'TEST')

    print('putting a second small file took %3.5f s' % (time.time() - t))

    t = time.time()
    retrieved = clusterIO.get_file('_testing/test.txt', 'TEST')

    print('retrieving a small file took %3.5f s' % (time.time() - t))
Example #11
    def recvMember(self, rfile, name, size, req):
        """Receive (save) a member file"""

        fname = os.path.join(self.fsname, urllib.unquote(name))

        if size == 0:
            _dummy_files.append(fname)
            return
        else:
            try:
                _dummy_files.remove(fname)
            except ValueError:
                pass

        #f = file(fname, 'wb')
        f = BytesIO()
        # if size == -2 the request used Transfer-Encoding: chunked (e.g. the OS X Finder puts data this way),
        # so the file size has to be recovered from the chunk-length headers as we read
        if size == -2:
            l = int(rfile.readline(), 16)
            ltotal = 0
            while l > 0:
                buf = rfile.read(l)
                f.write(buf)  #yield buf
                rfile.readline()
                ltotal += l
                l = int(rfile.readline(), 16)
        elif size > 0:  # when size == 0 we skip this and just save an empty file
            writ = 0
            bs = 65536
            while True:
                if size != -1 and (bs > size - writ):
                    bs = size - writ
                buf = rfile.read(bs)
                if len(buf) == 0:
                    break
                f.write(buf)
                writ += len(buf)
                if size != -1 and writ >= size:
                    break

        logger.debug('ClusterIO put: %s' % fname)
        clusterIO.put_file(fname, f.getvalue())

        f.close()
Example #12
    def finalise(self):
        # wait until our input queue is empty rather than immediately stopping saving.
        self._stopping = True
        logger.debug('Stopping spooling %s' % self.seriesName)
        
        
        #join our polling threads
        if config.get('httpspooler-jointhreads', True):
            # Allow this to be switched off via a config option for maximum performance on high-throughput systems.
            # Joining threads is the recommended and safest behaviour, but it forces spooling of the current series to
            # complete before the next series starts, so it could have negative performance implications.
            # The alternative - letting spooling continue during the acquisition of the next series - has the potential
            # to result in runaway memory and thread usage when things go pear-shaped (i.e. spooling is not fast enough).
            # TODO - is there actually a performance impact that justifies this config option, or is it purely theoretical?
            for pt in self._pollThreads:
                pt.join()

        # remove our reference to the polling threads, which hold back-references that would otherwise prevent
        # garbage collection
        del self._pollThreads
        
        # save events and final metadata
        # TODO - use a binary format for saving events - they can be quite
        # numerous, and can trip the standard 1 s clusterIO.put_file timeout.
        # Use long timeouts as a temporary hack because failing these can ruin
        # a dataset
        if self._aggregate_h5:
            clusterIO.put_file('__aggregate_h5/' + self.seriesName + '/final_metadata.json', 
                               self.md.to_JSON().encode(), self.clusterFilter)
            clusterIO.put_file('__aggregate_h5/' + self.seriesName + '/events.json', 
                               self.evtLogger.to_JSON().encode(),
                               self.clusterFilter, timeout=10)
        else:
            clusterIO.put_file(self.seriesName + '/final_metadata.json', 
                               self.md.to_JSON().encode(), self.clusterFilter)
            clusterIO.put_file(self.seriesName + '/events.json', 
                               self.evtLogger.to_JSON().encode(), 
                               self.clusterFilter, timeout=10)
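
The thread-joining behaviour discussed in the comments above hinges on a single config key; a hedged sketch of the lookup (PYME.config is the module used above, while the exact place the key gets set, e.g. a user-level PYME config file, is an assumption):

from PYME import config

# defaults to True (join the polling threads); setting the key to False trades safety
# for letting the next series start before spooling of the current one has finished
join_threads = config.get('httpspooler-jointhreads', True)
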
Example #13
    def __init__(self, dataSourceID, metadata, resultsFilename, queueName = None, startAt = 10, dataSourceModule=None, serverfilter=''):
        """
        Create a pusher and push tasks for each frame in a series. For use with the new cluster distribution architecture

        Parameters
        ----------
        dataSourceID : str
            The URI of the data source - e.g. PYME-CLUSTER://serverfilter/path/to/data
        metadata : PYME.IO.MetaDataHandler object
            The acquisition and analysis metadata
        resultsFilename : str
            The cluster relative path to the results file. e.g. "<username>/analysis/<date>/seriesname.h5r"
        queueName : str
            a name to give the queue. The results filename is used if no name is given.
        startAt : int
            which frame to start at. TODO - read from metadata instead of taking as a parameter.
        dataSourceModule : str [optional]
            The name of the module to use for reading the raw data. If not given, it will be inferred from the dataSourceID
        serverfilter : str
            A cluster filter, for use when multiple PYME clusters are visible on the same network segment.
        """
        if queueName is None:
            queueName = resultsFilename

        self.queueID = queueName
        self.dataSourceID = dataSourceID
        if '~' in self.dataSourceID or '~' in self.queueID or '~' in resultsFilename:
            raise RuntimeError('File, queue and results names must NOT contain the tilde (~) character')

        self.resultsURI = 'PYME-CLUSTER://%s/__aggregate_h5r/%s' % (serverfilter, resultsFilename)

        resultsMDFilename = resultsFilename + '.json'
        self.results_md_uri = 'PYME-CLUSTER://%s/%s' % (serverfilter, resultsMDFilename)

        self.taskQueueURI = _getTaskQueueURI()

        self.mdh = metadata

        #load data source
        if dataSourceModule is None:
            DataSource = DataSources.getDataSourceForFilename(dataSourceID)
        else:
            DataSource = __import__('PYME.IO.DataSources.' + dataSourceModule, fromlist=['PYME', 'io', 'DataSources']).DataSource #import our data source
        self.ds = DataSource(self.dataSourceID)
        
        #set up results file:
        logging.debug('resultsURI: ' + self.resultsURI)
        clusterResults.fileResults(self.resultsURI + '/MetaData', metadata)
        clusterResults.fileResults(self.resultsURI + '/Events', self.ds.getEvents())

        # set up metadata file which is used for deciding how to launch the analysis
        clusterIO.put_file(resultsMDFilename, self.mdh.to_JSON().encode(), serverfilter=serverfilter)
        
        #wait until clusterIO caches clear to avoid replicating the results file.
        #time.sleep(1.5) #moved inside polling thread so launches will run quicker

        self.currentFrameNum = startAt

        self._task_template = None
        
        self.doPoll = True
        
        self.pollT = threading.Thread(target=self._updatePoll)
        self.pollT.start()
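
A hedged sketch of constructing this pusher; the class name TaskPusher, the data URI and the results path are illustrative placeholders, and in practice the metadata handler comes from the acquired series rather than being built by hand:

from PYME.IO import MetaDataHandler

mdh = MetaDataHandler.NestedClassMDHandler()  # placeholder metadata; normally loaded from the series
pusher = TaskPusher(                          # hypothetical name for the class defining __init__ above
    dataSourceID='PYME-CLUSTER:///user/data/series_000.h5',
    metadata=mdh,
    resultsFilename='user/analysis/2020_1_1/series_000.h5r',
    startAt=10,
)
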
Example #14
def test_put():
    testdata = b'foo bar\n'
    clusterIO.put_file('_testing/test.txt', testdata, 'TES1')
    retrieved = clusterIO.get_file('_testing/test.txt', 'TES1')
    
    assert testdata == retrieved
Example #15
    def _save(self, filename, data):
        from PYME.IO import clusterIO, PZFFormat

        clusterIO.put_file(filename, PZFFormat.dumps(data.astype('float32')))
Example #16
def distributed_pyramid(out_folder,
                        ds,
                        xm,
                        ym,
                        mdh,
                        split=False,
                        skipMoveFrames=False,
                        shiftfield=None,
                        mixmatrix=[[1., 0.], [0., 1.]],
                        correlate=False,
                        dark=None,
                        flat=None,
                        pyramid_tile_size=256):
    """Create a distributed pyramid through PYMECluster.

    Parameters
    ----------
    out_folder : str
        directory to save pyramid tiles(/directories). The same folder will be
        created on the cluster servers.
    ds : PYME.IO.DataSources.BaseDataSource, np.ndarray
        array-like image
    xm : np.ndarray or PYME.Analysis.piecewiseMapping.piecewiseMap
        x positions of frames in ds. Raw stage positions in [um]. The ImagePyramid
        origin will be at the minimum x, and the offset to the camera chip origin
        will be handled in the SupertileDatasource tile_coords_um method.
    ym : np.ndarray or PYME.Analysis.piecewiseMapping.piecewiseMap
        y positions of frames in ds. Raw stage positions in [um]. The ImagePyramid
        origin will be at the minimum y, and the offset to the camera chip origin
        will be handled in the SupertileDatasource tile_coords_um method.
    mdh : PYME.IO.MetaDataHandler.MDataHandlerBase
        metadata for ds
    split : bool, optional
        whether this is a splitter datasource and should be treated like one,
        by default False
    skipMoveFrames : bool, optional
        flag to drop frames which are the first frame acquired at a given
        position, by default False
    shiftfield : [type], optional
        required for splitter data, see PYME.Acquire.Hardware.splitter, by 
        default None
    mixmatrix : list, optional
        for splitter data, see PYME.Acquire.Hardware.splitter, by 
        default [[1., 0.], [0., 1.]]
    correlate : bool, optional
        whether to add a 300 pixel padding to the edges, by default False
    dark : ndarray, float, optional
        (appropriately-cropped or scalar) dark frame (analog-digital offset)
        calibration to subtract when adding frames to the pyramid, by default
        None, in which case Camera.ADOffset from metadata will be used, if 
        available
    flat : ndarray, optional
        (appropriately-cropped or scalar) flatfield calibration to apply to 
        frames when adding them to the pyramid, by default None
    pyramid_tile_size : int, optional
        base tile size, by default 256 pixels

    Returns
    -------
    DistributedImagePyramid
        coalesced/averaged/etc multilevel DistributedImagePyramid instance
    
    Notes
    -----
    Code is currently somewhat alpha in that the splitter functionality is 
    more or less untested, and we only get tile orientations right for primary
    cameras (i.e. when the stage is registered with multipliers to match the
    camera, rather than camera registered with orientation metadata to match it
    to the stage).
    
    TODO - this largely duplicates the corresponding function in tile_pyramid => refactor

    """
    frameSizeX, frameSizeY, numFrames = ds.shape[:3]

    if split:
        from PYME.Acquire.Hardware import splitter
        frameSizeY /= 2
        nchans = 2
        unmux = splitter.Unmixer(shiftfield, mdh.voxelsize_nm.x)
    else:
        nchans = 1

    #x & y positions of each frame
    xps = xm(np.arange(numFrames)) if not isinstance(xm, np.ndarray) else xm
    yps = ym(np.arange(numFrames)) if not isinstance(ym, np.ndarray) else ym

    #give some room at the edges
    bufSize = 0
    if correlate:
        bufSize = 300

    # to avoid building extra, empty tiles, the pyramid origin is the minimum
    # x and y position present in the tiles
    x0_pyramid, y0_pyramid = xps.min(), yps.min()
    xps -= x0_pyramid
    yps -= y0_pyramid

    # calculate origin independent of the camera ROI setting to store in
    # metadata for use in e.g. SupertileDatasource.DataSource.tile_coords_um
    x0_cam, y0_cam = get_camera_physical_roi_origin(mdh)
    x0 = x0_pyramid + mdh.voxelsize_nm.x / 1e3 * x0_cam
    y0 = y0_pyramid + mdh.voxelsize_nm.y / 1e3 * y0_cam

    #convert to pixels
    xdp = (bufSize + (xps / (mdh.getEntry('voxelsize.x'))).round()).astype('i')
    ydp = (bufSize + (yps / (mdh.getEntry('voxelsize.y'))).round()).astype('i')

    # get splitter ROI coordinates in units of pixels
    ROIX1 = x0_cam + 1  # TODO - is splitter 1-indexed?
    ROIY1 = y0_cam + 1
    ROIX2 = ROIX1 + mdh.getEntry('Camera.ROIWidth')
    ROIY2 = ROIY1 + mdh.getEntry('Camera.ROIHeight')

    if dark is None:
        dark = float(mdh.getOrDefault('Camera.ADOffset', 0))

    P = DistributedImagePyramid(
        out_folder,
        pyramid_tile_size,
        x0=x0,
        y0=y0,
        pixel_size=mdh.getEntry('voxelsize.x'),
    )

    logger.debug('Updating base tiles ...')

    t1 = time.time()
    for i in range(int(mdh.getEntry('Protocol.DataStartsAt')), numFrames):
        if xdp[i - 1] == xdp[i] or not skipMoveFrames:
            x_i = xdp[i]
            y_i = ydp[i]
            d = ds[:, :, i].astype('f') - dark
            if flat is not None:
                d = d * flat

            if split:
                d = np.concatenate(
                    unmux.Unmix(d, mixmatrix, dark,
                                [ROIX1, ROIY1, ROIX2, ROIY2]), 2)

            # TODO - account for orientation so this works for non-primary cams
            P.update_base_tiles_from_frame(x_i, y_i, d)

    P.finish_base_tiles()

    t2 = time.time()
    logger.debug('Updated base tiles in %fs' % (t2 - t1))
    #P._occ.flush()
    logger.debug(time.time() - t2)
    logger.debug('Updating pyramid ...')
    P.update_pyramid()  # TODO: make cluster-aware
    logger.debug(time.time() - t2)
    logger.debug('Done')

    clusterIO.put_file('/'.join([P.base_dir, 'metadata.json']),
                       P.mdh.to_JSON().encode())

    return P
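
A minimal usage sketch, assuming ds, xm, ym and mdh come from a tiled acquisition opened elsewhere (the output folder name is a placeholder):

pyramid = distributed_pyramid(
    'USER/tiled/series_000',  # cluster-relative output folder (placeholder)
    ds, xm, ym, mdh,          # data source, stage positions and metadata from the acquisition
    correlate=False,
    pyramid_tile_size=256,
)
# pyramid.base_dir now points at the tile directory whose metadata.json was written above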