Example 1
    def __init__(self,
                 data_dir,
                 batch_range=None,
                 init_epoch=1,
                 init_batchnum=None,
                 dp_params=None,
                 test=False):
        LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch,
                                     init_batchnum, dp_params, test)

        if EMDataProvider.data_parser is None:
            assert os.path.isfile(
                data_dir
            )  # this needs to be the full path / file name of EMDataParser config file

            # if the convnet is writing features and an em feature path is provided, then also have the parser write outputs
            EMDataProvider.write_features = dp_params['convnet'].op.get_value(
                'write_features')
            write_outputs = False
            append_features = False  # modes exposed in init by the EMDataParser class
            if EMDataProvider.write_features:
                if dp_params['em_feature_path']:
                    # this command line flag along with write_features enables writing output probabilities
                    EMDataProvider.write_features_type = 'prob'
                    # if the em_feature_path is an hdf file name, then this is a single whole-dataset hdf5 file
                    fn, ext = os.path.splitext(dp_params['em_feature_path'])
                    ext = ext.lower()
                    append_features = (ext == '.h5' or ext == '.hdf5')
                    # if not appending features, then just do normal write outputs
                    write_outputs = not append_features
                else:
                    # if em_feature_path is not specified, then this mode is for initializing data pre-processing
                    EMDataProvider.write_features_type = 'data'
                    assert (dp_params['convnet'].op.get_value('numpy_dump'))

            # instantiate the parser, override some attributes and then initialize
            EMDataProvider.data_parser = EMDataParser(
                data_dir, write_outputs, dp_params['init_load_path'],
                dp_params['save_name'], append_features,
                dp_params['convnet'].op.get_value('chunk_skip_list'),
                dp_params['convnet'].op.get_value('dim_ordering'))
            # if writing any features, override the outpath and force no label lookup
            if EMDataProvider.write_features:
                EMDataProvider.data_parser.outpath = dp_params[
                    'em_feature_path']
                EMDataProvider.data_parser.no_label_lookup = True
            EMDataProvider.data_parser.initBatches()
        self.batch_meta = EMDataProvider.data_parser.batch_meta
        self.batches_generated = 0
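
A pattern worth noting in this example is the class-level cache: data_parser, write_features, and write_features_type live on EMDataProvider itself rather than on the instance, so the parser is constructed and initBatches() is run only once no matter how many providers the framework creates. A minimal sketch of that pattern, with hypothetical names and a lock added as an assumption in case providers are ever constructed from multiple threads:

import os
import threading

class SharedParserProvider(object):
    _parser = None            # class-level cache, like EMDataProvider.data_parser
    _lock = threading.Lock()  # assumption: guard concurrent construction

    def __init__(self, cfg_path):
        with SharedParserProvider._lock:
            if SharedParserProvider._parser is None:
                # must be the full path / file name of the parser config file
                assert os.path.isfile(cfg_path)
                SharedParserProvider._parser = self._make_parser(cfg_path)
        self.parser = SharedParserProvider._parser

    @staticmethod
    def _make_parser(cfg_path):
        # stand-in for EMDataParser(...) followed by initBatches()
        return {'cfg': cfg_path, 'batch_meta': {}}
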
Example 2
    def __init__(self, cfg_file, write_output=None, chunk_skip_list=[], dim_ordering='', batch_range=[1,10], 
                 name='emdata', isTest=False, concatenate_batches=False, NBUF=2, image_in_size=None):
        Thread.__init__(self)
        self.name = name

        # mostly intended for double buffering (NBUF==2) so that data can be pushed to the card simultaneously with training.
        # single buffer (NBUF==1) fetches the next EM batch in parallel but waits until __iter__ to push to the backend buffer.
        # more buffers should work (NBUF > 2) but take more gpu memory with likely no speed improvement
        assert( NBUF > 0 )
        self.NBUF = NBUF

        # batches are numbered starting at 1 and inclusive of end of range.
        # this needs to be done first so that nmacrobatches property works.
        self.batch_range = batch_range; self.batchnum = batch_range[0]

        # previously the parser was agnostic to test vs. train, but this is needed to allow a single ini file in chunk_list_all mode
        self.isTest = isTest

        # if the output is an hdf5 file name, then this is a single whole-dataset hdf5 file.
        # xxx - initializations for writing output features could be cleaned up.
        write_outputs = (write_output is not None); append_features = False
        if write_outputs:
            fn, ext = os.path.splitext(write_output); ext = ext.lower()
            # .conf indicates to write knossos-style outputs
            append_features = (ext == '.h5' or ext == '.hdf5' or ext == '.conf')
            write_outputs = not append_features
        # instantiate the actual em data parser, code shared with cuda-convnets2 em data parser
        self.parser = EMDataParser(cfg_file, write_outputs=write_outputs, append_features=append_features, 
                                    chunk_skip_list=chunk_skip_list, dim_ordering=dim_ordering, isTest=self.isTest,
                                    image_in_size=image_in_size)
        if write_outputs or append_features:
            # force some properties if in mode for writing outputs.
            # xxx - this is not clean, needs some rethinking on how write_outputs modes are initialized
            self.parser.outpath = write_output
            self.parser.no_label_lookup = True
            self.parser.append_features_knossos = append_features and (ext == '.conf')
            if self.parser.append_features_knossos: 
                self.parser.outpath, fn = os.path.split(fn); self.parser.strnetid = re.findall(r'\d+', fn)[0]
        # parser relies on having initBatches called right away, xxx - could revisit this?
        self.parser.initBatches()

        # no need for special code to concatenate if there is only one macrobatch anyway
        self.concatenate_batches = concatenate_batches and (self.nmacrobatches > 1)

        self.nexamples = self.parser.num_cases_per_batch
        if self.concatenate_batches: self.nexamples *= self.nmacrobatches

        # locks and events for synchronizing data loading thread.
        self.init_event = threading.Event()
        if self.NBUF > 1:
            self.lbuf_lock = threading.Lock(); self.cbuf_lock = threading.Lock()
            self.lbuf_event = threading.Event(); self.cbuf_event = threading.Event()
        else:
            self.push_event = threading.Event(); self.push_done_event = threading.Event()

        # set pycuda driver for gpu backend
        # xxx - this is a bit hacky, is there a better way to do this?
        if type(self.be) == NervanaGPU:
            import pycuda.driver as drv
            self.drv = drv
            #self.stream = self.drv.Stream() # xxx - for other synchronize method??? see below
        else:
            self.drv = None
            
        # start the thread and wait for initialization to complete.
        # initialization of backend memory has to occur within the thread.
        self.daemon = True  # so that stop event is not necessary to terminate threads when process completes.
        self.start()
        self.init_event.wait()
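
The write_output handling above keys three output modes off the file extension alone: any non-hdf5 extension (or none) means normal write_outputs, .h5/.hdf5 means appending features to a single whole-dataset hdf5 file, and .conf means knossos-style outputs. A small self-contained sketch of that dispatch; the helper name is mine, not from the source:

import os

def classify_output(write_output):
    # returns (write_outputs, append_features, knossos) for an output path
    if write_output is None:
        return False, False, False
    _, ext = os.path.splitext(write_output)
    ext = ext.lower()
    # .h5/.hdf5 -> single whole-dataset hdf5 file, .conf -> knossos-style
    append_features = ext in ('.h5', '.hdf5', '.conf')
    return not append_features, append_features, ext == '.conf'

# classify_output('feats.h5')  -> (False, True, False)
# classify_output('net1.conf') -> (False, True, True)
# classify_output('outdir')    -> (True, False, False)
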
Example 3
    def __init__(self,
                 cfg_file,
                 write_output=None,
                 chunk_skip_list=[],
                 dim_ordering='',
                 batch_range=[1, 10],
                 name='emdata',
                 isTest=False,
                 concatenate_batches=False,
                 NBUF=2,
                 image_in_size=None):
        Thread.__init__(self)
        self.name = name

        # mostly intended for double buffering (NBUF==2) so that data can be pushed to the card simultaneously with training.
        # single buffer (NBUF==1) fetches the next EM batch in parallel but waits until __iter__ to push to the backend buffer.
        # more buffers should work (NBUF > 2) but take more gpu memory and likely no speed improvement
        assert (NBUF > 0)
        self.NBUF = NBUF

        # batches are numbered starting at 1 and inclusive of end of range.
        # this needs to be done first so that nmacrobatches property works.
        self.batch_range = batch_range
        self.batchnum = batch_range[0]

        # previously the parser was agnostic to test vs. train, but this is needed to allow a single ini file in chunk_list_all mode
        self.isTest = isTest

        # if the output is an hdf5 file name, then this is a single whole-dataset hdf5 file.
        # xxx - initializations for writing output features could be cleaned up.
        write_outputs = (write_output is not None)
        append_features = False
        if write_outputs:
            fn, ext = os.path.splitext(write_output)
            ext = ext.lower()
            # .conf indicates to write knossos-style outputs
            append_features = (ext == '.h5' or ext == '.hdf5'
                               or ext == '.conf')
            write_outputs = not append_features
        # instantiate the actual em data parser, code shared with cuda-convnets2 em data parser
        self.parser = EMDataParser(cfg_file,
                                   write_outputs=write_outputs,
                                   append_features=append_features,
                                   chunk_skip_list=chunk_skip_list,
                                   dim_ordering=dim_ordering,
                                   isTest=self.isTest,
                                   image_in_size=image_in_size)
        if write_outputs or append_features:
            # force some properties if in mode for writing outputs.
            # xxx - this is not clean, needs some rethinking on how write_outputs modes are initialized
            self.parser.outpath = write_output
            self.parser.no_label_lookup = True
            self.parser.append_features_knossos = append_features and (
                ext == '.conf')
            if self.parser.append_features_knossos:
                self.parser.outpath, fn = os.path.split(fn)
                self.parser.strnetid = re.findall(r'\d+', fn)[0]
        # parser relies on having initBatches called right away, xxx - could revisit this?
        self.parser.initBatches()

        # no need for special code to concatenate if there is only one macrobatch anyway
        self.concatenate_batches = concatenate_batches and (self.nmacrobatches
                                                            > 1)

        self.nexamples = self.parser.num_cases_per_batch
        if self.concatenate_batches: self.nexamples *= self.nmacrobatches

        # locks and events for synchronizing data loading thread.
        self.init_event = threading.Event()
        if self.NBUF > 1:
            self.lbuf_lock = threading.Lock()
            self.cbuf_lock = threading.Lock()
            self.lbuf_event = threading.Event()
            self.cbuf_event = threading.Event()
        else:
            self.push_event = threading.Event()
            self.push_done_event = threading.Event()

        # set pycuda driver for gpu backend
        # xxx - this is a bit hacky, is there a better way to do this?
        if type(self.be) == NervanaGPU:
            import pycuda.driver as drv
            self.drv = drv
            #self.stream = self.drv.Stream() # xxx - for other synchronize method??? see below
        else:
            self.drv = None

        # start the thread and wait for initialization to complete.
        # initialization of backend memory has to occur within the thread.
        self.daemon = True  # so that stop event is not necessary to terminate threads when process completes.
        self.start()
        self.init_event.wait()
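
The lbuf/cbuf bookkeeping set up here (and driven by run() and __iter__ in the next example) is a double-buffer handshake: the loader may run at most one buffer ahead of the consumer, blocking on cbuf_event when it would overwrite an unread buffer, while the consumer blocks on lbuf_event when nothing new has been loaded. A toy, backend-free version of that handshake, using the same wait conditions as the original (class and method names are mine):

import threading

class DoubleBuffer(object):
    def __init__(self, nbuf=2):
        self.nbuf = nbuf
        self.buf = [None] * nbuf
        self.lbuf = 0                      # next buffer to load into
        self.cbuf = 0                      # next buffer to consume from
        self.lock = threading.Lock()
        self.loaded = threading.Event()    # like lbuf_event
        self.freed = threading.Event()     # like cbuf_event

    def put(self, item):                   # loader side, mirrors run()
        self.buf[self.lbuf] = item
        with self.lock:
            self.lbuf = (self.lbuf + 1) % self.nbuf
            self.loaded.set()
        while True:                        # block while next target is unread
            with self.lock:
                if (self.cbuf - 1) % self.nbuf != self.lbuf:
                    break
                self.freed.clear()
            self.freed.wait()

    def get(self):                         # consumer side, mirrors __iter__
        while True:                        # block until something is loaded
            with self.lock:
                if self.cbuf != self.lbuf:
                    break
                self.loaded.clear()
            self.loaded.wait()
        item = self.buf[self.cbuf]
        with self.lock:
            self.cbuf = (self.cbuf + 1) % self.nbuf
            self.freed.set()
        return item

In the real iterator the item is already resident in backend (gpu) memory by the time the consumer unblocks, because run() calls _push_be_buffer() before advancing lbuf.
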
Example 4
class EMDataIterator(NervanaEMDataIterator, Thread):

    def __init__(self, cfg_file, write_output=None, chunk_skip_list=[], dim_ordering='', batch_range=[1,10], 
                 name='emdata', isTest=False, concatenate_batches=False, NBUF=2, image_in_size=None):
        Thread.__init__(self)
        self.name = name

        # mostly intended for double buffering (NBUF==2) so that data can be pushed to the card simultaneously with training.
        # single buffer (NBUF==1) fetches the next EM batch in parallel but waits until __iter__ to push to the backend buffer.
        # more buffers should work (NBUF > 2) but take more gpu memory and likely no speed improvement
        assert( NBUF > 0 )
        self.NBUF = NBUF

        # batches are numbered starting at 1 and inclusive of end of range.
        # this needs to be done first so that nmacrobatches property works.
        self.batch_range = batch_range; self.batchnum = batch_range[0]

        # previously the parser was agnostic to test vs. train, but this is needed to allow a single ini file in chunk_list_all mode
        self.isTest = isTest

        # if the output is an hdf5 file name, then this is a single whole-dataset hdf5 file.
        # xxx - initializations for writing output features could be cleaned up.
        write_outputs = (write_output is not None); append_features = False
        if write_outputs:
            fn, ext = os.path.splitext(write_output); ext = ext.lower()
            # .conf indicates to write knossos-style outputs
            append_features = (ext == '.h5' or ext == '.hdf5' or ext == '.conf')
            write_outputs = not append_features
        # instantiate the actual em data parser, code shared with cuda-convnets2 em data parser
        self.parser = EMDataParser(cfg_file, write_outputs=write_outputs, append_features=append_features, 
                                    chunk_skip_list=chunk_skip_list, dim_ordering=dim_ordering, isTest=self.isTest,
                                    image_in_size=image_in_size)
        if write_outputs or append_features:
            # force some properties if in mode for writing outputs.
            # xxx - this is not clean, needs some rethinking on how write_outputs modes are initialized
            self.parser.outpath = write_output
            self.parser.no_label_lookup = True
            self.parser.append_features_knossos = append_features and (ext == '.conf')
            if self.parser.append_features_knossos: 
                self.parser.outpath, fn = os.path.split(fn); self.parser.strnetid = re.findall(r'\d+', fn)[0]
        # parser relies on having initBatches called right away, xxx - could revisit this?
        self.parser.initBatches()

        # no need for special code to concatenate if there is only one macrobatch anyway
        self.concatenate_batches = concatenate_batches and (self.nmacrobatches > 1)

        self.nexamples = self.parser.num_cases_per_batch
        if self.concatenate_batches: self.nexamples *= self.nmacrobatches

        # locks and events for synchronizing data loading thread.
        self.init_event = threading.Event()
        if self.NBUF > 1:
            self.lbuf_lock = threading.Lock(); self.cbuf_lock = threading.Lock()
            self.lbuf_event = threading.Event(); self.cbuf_event = threading.Event()
        else:
            self.push_event = threading.Event(); self.push_done_event = threading.Event()

        # set pycuda driver for gpu backend
        # xxx - this is a bit hacky, is there a better way to do this?
        if type(self.be) == NervanaGPU:
            import pycuda.driver as drv
            self.drv = drv
            #self.stream = self.drv.Stream() # xxx - for other synchronize method??? see below
        else:
            self.drv = None
            
        # start the thread and wait for initialization to complete.
        # initialization of backend memory has to occur within the thread.
        self.daemon = True  # so that stop event is not necessary to terminate threads when process completes.
        self.start()
        self.init_event.wait()

    def run(self):
        # this allows the current running thread to push data to the gpu memory buffer.
        # ArrayIterator constructor has to be called within this thread also,
        #   so that the memory is allocated in this context.
        # xxx - cleanup call to self.ctx.detach() ???
        if self.drv is not None:
            self.ctx = self.drv.Device(self.be.device_id).make_context()

        # the iterator initializes with random batches, which are overwritten by the first batch in __iter__
        super(EMDataIterator, self).__init__(name=self.name, nexamples=self.nexamples)

        # set up multiple buffers (two should be sufficient?).
        # this allows data to be copied to the backend (gpu) memory while the previous macrobatch is running.
        self.iter_buf = [None]*self.NBUF; self.iter_buf[0] = self; self.cbuf = 0; self.lbuf = 0
        for i in range(1,self.NBUF):
            self.iter_buf[i] = NervanaEMDataIterator(name=self.name + str(i), nexamples=self.nexamples,
                parser=self.parser)

        # cpu buffers for storing batches from EM parser before they are written to gpu.
        self.num_data = 1 + self.parser.naug_data
        self.num_labels = 0 if self.parser.no_labels else 1
        self.num_data_labels = self.num_data + self.num_labels
        self.nextdata = [None] * self.num_data_labels
        if self.concatenate_batches:
            # http://stackoverflow.com/questions/2397141/how-to-initialize-a-two-dimensional-array-in-python
            # http://stackoverflow.com/questions/10668341/create-3d-array-using-python
            self.allnextdata = [[None for i in range(self.nmacrobatches)] for j in range(self.num_data_labels)]

        # run loop for loading data continues as long as process is running.
        self.init_event.set()  # initialization completed
        while True:
            # load the next set of batches into system memory
            self._get_EMbatches()

            if self.NBUF > 1:
                # immediately push the data into the current lbuf
                self._push_be_buffer()
                
                # advance the load buffer pointer
                self.lbuf_lock.acquire()
                self.lbuf = (self.lbuf + 1) % self.NBUF
                self.lbuf_event.set()
                self.lbuf_lock.release()
                
                # wait until the next load buffer is free
                self.cbuf_lock.acquire()
                wait = ((self.cbuf - 1) % self.NBUF == self.lbuf)
                self.cbuf_event.clear()
                self.cbuf_lock.release()
                if wait: self.cbuf_event.wait()
            else:
                # wait until backend is ready to push next data.
                self.push_event.wait()
                # push data to backend and then signal push done
                self.push_event.clear()
                self._push_be_buffer()
                self.push_done_event.set()

    def reset_batchnum(self, batchnum):
        # xxx - purpose of this is to start training a model at the batch where it left off.
        #   this is pretty minor in the grand scheme of training, and a pain to implement here.
        pass

    def _get_EMbatches(self):
        if self.concatenate_batches:
            # fetch all the batches into system memory at once
            for n in range(self.nmacrobatches):
                self._get_next_EMbatch()
                for i in range(self.num_data_labels):
                    self.allnextdata[i][n] = self.nextdata[i]
            self.pushdata = [np.concatenate(self.allnextdata[i], axis=0) for i in range(self.num_data_labels)]
        else:
            # fetch a single batch into system memory
            self._get_next_EMbatch()
            self.pushdata = self.nextdata

    def _push_be_buffer(self):
        # push batch onto backend buffer
        for i in range(self.num_data_labels):
            self.iter_buf[self.lbuf].dbuf[i].set(self.pushdata[i])

        if self.drv is not None:
            # xxx - does it matter which synchronize method is used here???
            #end = self.drv.Event()
            #end.record(self.stream)
            #end.synchronize()
            self.ctx.synchronize()
            
    def _get_next_EMbatch(self):
        p = self.parser
        nextdata = p.getBatch(self.batchnum)

        # need to manipulate data and labels returned by EM parser to be congruent with neon
        assert( len(nextdata) == self.num_data_labels )
        # re-arrange so that labels are last
        if self.num_labels > 0:
            nextdata = [nextdata[i] for i in ([0] + range(2,p.naug_data+2) + [1])]
        # order from the EM data parser is the transpose of neon data, so switch nexamples (num_cases_per_batch) to first dim
        for i in range(self.num_data):
            # image dimensions and pixels / examples dimensions are transposed relative to cc2 input
            #self.nextdata[i] = nextdata[i].reshape((p.nzslices, p.image_size, p.image_size, p.num_cases_per_batch)).\
            #    transpose((3,0,2,1)).reshape((p.num_cases_per_batch, p.pixels_per_image)).copy(order='C')
            # xxx - decided above was a poor choice, transpose should not matter as long as input/output are in same
            #   orientation relative to each other. swap the image and samples dimensions only
            self.nextdata[i] = nextdata[i].T.copy(order='C')
                
        if self.num_labels > 0:
            # convert labels that are not onehot (independent_labels) to int
            if self.make_onehot:
                self.nextdata[-1] = nextdata[-1].T.astype(np.int32, order='C')
            else:
                self.nextdata[-1] = nextdata[-1].T.copy(order='C')

        # advance to next batch, roll around at end of batch range
        self.batchnum += 1
        if self.batchnum > self.batch_range[1]: self.batchnum = self.batch_range[0]

    @property
    def nmacrobatches(self):
        return self.batch_range[1] - self.batch_range[0] + 1

    def __iter__(self):
        if self.NBUF > 1:
            # wait until the next current buffer is available
            self.lbuf_lock.acquire()
            wait = (self.cbuf == self.lbuf)
            self.lbuf_event.clear()
            self.lbuf_lock.release()
            if wait: self.lbuf_event.wait()
        else:
            # signal to push data to backend and wait until push done
            self.push_event.set()
            self.push_done_event.wait()
            self.push_done_event.clear()

        # generate next batch from current buffer
        _iter = super(NervanaEMDataIterator, self.iter_buf[self.cbuf]).__iter__()

        if self.NBUF > 1:
            # advance current buffer pointer
            self.cbuf_lock.acquire()
            self.cbuf = (self.cbuf + 1) % self.NBUF
            self.cbuf_event.set()
            self.cbuf_lock.release()

        return _iter
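
For the single-buffer case (NBUF == 1) the constructor only creates push_event and push_done_event, and run() degenerates to a prefetch-then-handshake loop: the next EM batch is fetched into host memory in parallel with training, but the device copy is deferred until __iter__ requests it. A stripped-down sketch of just that path, where fetch and push are hypothetical callables standing in for _get_EMbatches and _push_be_buffer:

import threading

class SingleBufferLoader(threading.Thread):
    def __init__(self, fetch, push):
        threading.Thread.__init__(self)
        self.fetch, self.push = fetch, push    # hypothetical callables
        self.push_event = threading.Event()
        self.push_done_event = threading.Event()
        self.daemon = True

    def run(self):
        while True:
            batch = self.fetch()               # overlaps with the training step
            self.push_event.wait()             # wait for the consumer's signal
            self.push_event.clear()
            self.push(batch)                   # copy into the backend buffer
            self.push_done_event.set()

    def next_batch(self):                      # consumer side, mirrors __iter__
        self.push_event.set()                  # request the push...
        self.push_done_event.wait()            # ...and block until it completes
        self.push_done_event.clear()
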
Example 5
class EMDataIterator(NervanaEMDataIterator, Thread):
    def __init__(self,
                 cfg_file,
                 write_output=None,
                 chunk_skip_list=[],
                 dim_ordering='',
                 batch_range=[1, 10],
                 name='emdata',
                 isTest=False,
                 concatenate_batches=False,
                 NBUF=2,
                 image_in_size=None):
        Thread.__init__(self)
        self.name = name

        # mostly intended for double buffering (NBUF==2) so that data can be pushed to the card simultaneously with training.
        # single buffer (NBUF==1) fetches the next EM batch in parallel but waits until __iter__ to push to the backend buffer.
        # more buffers should work (NBUF > 2) but take more gpu memory and likely no speed improvement
        assert (NBUF > 0)
        self.NBUF = NBUF

        # batches are numbered starting at 1 and inclusive of end of range.
        # this needs to be done first so that nmacrobatches property works.
        self.batch_range = batch_range
        self.batchnum = batch_range[0]

        # previously the parser was agnostic to test vs. train, but this is needed to allow a single ini file in chunk_list_all mode
        self.isTest = isTest

        # if the output is an hdf5 file name, then this is a single whole-dataset hdf5 file.
        # xxx - initializations for writing output features could be cleaned up.
        write_outputs = (write_output is not None)
        append_features = False
        if write_outputs:
            fn, ext = os.path.splitext(write_output)
            ext = ext.lower()
            # .conf indicates to write knossos-style outputs
            append_features = (ext == '.h5' or ext == '.hdf5'
                               or ext == '.conf')
            write_outputs = not append_features
        # instantiate the actual em data parser, code shared with cuda-convnets2 em data parser
        self.parser = EMDataParser(cfg_file,
                                   write_outputs=write_outputs,
                                   append_features=append_features,
                                   chunk_skip_list=chunk_skip_list,
                                   dim_ordering=dim_ordering,
                                   isTest=self.isTest,
                                   image_in_size=image_in_size)
        if write_outputs or append_features:
            # force some properties if in mode for writing outputs.
            # xxx - this is not clean, needs some rethinking on how write_outputs modes are initialized
            self.parser.outpath = write_output
            self.parser.no_label_lookup = True
            self.parser.append_features_knossos = append_features and (
                ext == '.conf')
            if self.parser.append_features_knossos:
                self.parser.outpath, fn = os.path.split(fn)
                self.parser.strnetid = re.findall(r'\d+', fn)[0]
        # parser relies on having initBatches called right away, xxx - could revisit this?
        self.parser.initBatches()

        # no need for special code to concatenate if there is only one macrobatch anyway
        self.concatenate_batches = concatenate_batches and (self.nmacrobatches
                                                            > 1)

        self.nexamples = self.parser.num_cases_per_batch
        if self.concatenate_batches: self.nexamples *= self.nmacrobatches

        # locks and events for synchronizing data loading thread.
        self.init_event = threading.Event()
        if self.NBUF > 1:
            self.lbuf_lock = threading.Lock()
            self.cbuf_lock = threading.Lock()
            self.lbuf_event = threading.Event()
            self.cbuf_event = threading.Event()
        else:
            self.push_event = threading.Event()
            self.push_done_event = threading.Event()

        # set pycuda driver for gpu backend
        # xxx - this is a bit hacky, is there a better way to do this?
        if type(self.be) == NervanaGPU:
            import pycuda.driver as drv
            self.drv = drv
            #self.stream = self.drv.Stream() # xxx - for other synchronize method??? see below
        else:
            self.drv = None

        # start the thread and wait for initialization to complete.
        # initialization of backend memory has to occur within the thread.
        self.daemon = True  # so that stop event is not necessary to terminate threads when process completes.
        self.start()
        self.init_event.wait()

    def run(self):
        # this allows the current running thread to push data to the gpu memory buffer.
        # ArrayIterator constructor has to be called within this thread also,
        #   so that the memory is allocated in this context.
        # xxx - cleanup call to self.ctx.detach() ???
        if self.drv is not None:
            self.ctx = self.drv.Device(self.be.device_id).make_context()

        # the iterator initializes with random batches, which are overwritten by the first batch in __iter__
        super(EMDataIterator, self).__init__(name=self.name,
                                             nexamples=self.nexamples)

        # set up multiple buffers (two should be sufficient?).
        # this allows data to be copied to the backend (gpu) memory while the previous macrobatch is running.
        self.iter_buf = [None] * self.NBUF
        self.iter_buf[0] = self
        self.cbuf = 0
        self.lbuf = 0
        for i in range(1, self.NBUF):
            self.iter_buf[i] = NervanaEMDataIterator(name=self.name + str(i),
                                                     nexamples=self.nexamples,
                                                     parser=self.parser)

        # cpu buffers for storing batches from EM parser before they are written to gpu.
        self.num_data = 1 + self.parser.naug_data
        self.num_labels = 0 if self.parser.no_labels else 1
        self.num_data_labels = self.num_data + self.num_labels
        self.nextdata = [None] * self.num_data_labels
        if self.concatenate_batches:
            # http://stackoverflow.com/questions/2397141/how-to-initialize-a-two-dimensional-array-in-python
            # http://stackoverflow.com/questions/10668341/create-3d-array-using-python
            self.allnextdata = [[None for i in range(self.nmacrobatches)]
                                for j in range(self.num_data_labels)]

        # run loop for loading data continues as long as process is running.
        self.init_event.set()  # initialization completed
        while True:
            # load the next set of batches into system memory
            self._get_EMbatches()

            if self.NBUF > 1:
                # immediately push the data into the current lbuf
                self._push_be_buffer()

                # advance the load buffer pointer
                self.lbuf_lock.acquire()
                self.lbuf = (self.lbuf + 1) % self.NBUF
                self.lbuf_event.set()
                self.lbuf_lock.release()

                # wait until the next load buffer is free
                self.cbuf_lock.acquire()
                wait = ((self.cbuf - 1) % self.NBUF == self.lbuf)
                self.cbuf_event.clear()
                self.cbuf_lock.release()
                if wait: self.cbuf_event.wait()
            else:
                # wait until backend is ready to push next data.
                self.push_event.wait()
                # push data to backend and then signal push done
                self.push_event.clear()
                self._push_be_buffer()
                self.push_done_event.set()

    def reset_batchnum(self, batchnum):
        # xxx - purpose of this is to start training a model at the batch where it left off.
        #   this is pretty minor in the grand scheme of training, and a pain to implement here.
        pass

    def _get_EMbatches(self):
        if self.concatenate_batches:
            # fetch all the batches into system memory at once
            for n in range(self.nmacrobatches):
                self._get_next_EMbatch()
                for i in range(self.num_data_labels):
                    self.allnextdata[i][n] = self.nextdata[i]
            self.pushdata = [
                np.concatenate(self.allnextdata[i], axis=0)
                for i in range(self.num_data_labels)
            ]
        else:
            # fetch a single batch into system memory
            self._get_next_EMbatch()
            self.pushdata = self.nextdata

    def _push_be_buffer(self):
        # push batch onto backend buffer
        for i in range(self.num_data_labels):
            self.iter_buf[self.lbuf].dbuf[i].set(self.pushdata[i])

        if self.drv is not None:
            # xxx - does it matter which synchronize method is used here???
            #end = self.drv.Event()
            #end.record(self.stream)
            #end.synchronize()
            self.ctx.synchronize()

    def _get_next_EMbatch(self):
        p = self.parser
        nextdata = p.getBatch(self.batchnum)

        # need to manipulate data and labels returned by EM parser to be congruent with neon
        assert (len(nextdata) == self.num_data_labels)
        # re-arrange so that labels are last
        if self.num_labels > 0:
            nextdata = [
                nextdata[i] for i in ([0] + range(2, p.naug_data + 2) + [1])
            ]
        # order from the EM data parser is the transpose of neon data, so switch nexamples (num_cases_per_batch) to first dim
        for i in range(self.num_data):
            # image dimensions and pixels / examples dimensions are transposed relative to cc2 input
            #self.nextdata[i] = nextdata[i].reshape((p.nzslices, p.image_size, p.image_size, p.num_cases_per_batch)).\
            #    transpose((3,0,2,1)).reshape((p.num_cases_per_batch, p.pixels_per_image)).copy(order='C')
            # xxx - decided above was a poor choice, transpose should not matter as long as input/output are in same
            #   orientation relative to each other. swap the image and samples dimensions only
            self.nextdata[i] = nextdata[i].T.copy(order='C')

        if self.num_labels > 0:
            # convert labels that are not onehot (independent_labels) to int
            if self.make_onehot:
                self.nextdata[-1] = nextdata[-1].T.astype(np.int32, order='C')
            else:
                self.nextdata[-1] = nextdata[-1].T.copy(order='C')

        # advance to next batch, roll around at end of batch range
        self.batchnum += 1
        if self.batchnum > self.batch_range[1]:
            self.batchnum = self.batch_range[0]

    @property
    def nmacrobatches(self):
        return self.batch_range[1] - self.batch_range[0] + 1

    def __iter__(self):
        if self.NBUF > 1:
            # wait until the next current buffer is available
            self.lbuf_lock.acquire()
            wait = (self.cbuf == self.lbuf)
            self.lbuf_event.clear()
            self.lbuf_lock.release()
            if wait: self.lbuf_event.wait()
        else:
            # signal to push data to backend and wait until push done
            self.push_event.set()
            self.push_done_event.wait()
            self.push_done_event.clear()

        # generate next batch from current buffer
        _iter = super(NervanaEMDataIterator,
                      self.iter_buf[self.cbuf]).__iter__()

        if self.NBUF > 1:
            # advance current buffer pointer
            self.cbuf_lock.acquire()
            self.cbuf = (self.cbuf + 1) % self.NBUF
            self.cbuf_event.set()
            self.cbuf_lock.release()

        return _iter
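
The one layout change in _get_next_EMbatch that the backend actually depends on is the final transpose: the EM parser returns (pixels, examples) arrays, while neon expects examples on the leading axis in contiguous C order, hence .T.copy(order='C') (and .T.astype(np.int32, order='C') for onehot labels, which casts during the same copy). A tiny numpy demonstration with assumed toy sizes:

import numpy as np

pixels_per_image, num_cases = 32 * 32, 8                 # assumed toy sizes
parser_out = np.random.rand(pixels_per_image, num_cases).astype(np.float32)

batch = parser_out.T.copy(order='C')                     # (examples, pixels)
assert batch.shape == (num_cases, pixels_per_image)
assert batch.flags['C_CONTIGUOUS']

labels = np.random.rand(1, num_cases)
labels_int = labels.T.astype(np.int32, order='C')        # cast while copying
assert labels_int.shape == (num_cases, 1)
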