def initialize(self, device, **kwargs):
    if device is None:
        raise ValueError("device may not be None")
    if not isinstance(device, Device):
        raise TypeError("device must be of type %s" % Device)
    if self._force_numpy:
        device = NumpyDevice()
    # Scan the class hierarchy for a supported backend
    checked = []  # only used for the exception message
    for cls in type(device).mro():
        if not hasattr(cls, "BACKEND"):
            continue
        checked.append(cls)
        try:
            self.verify_interface(INTERFACE_MAPPING[cls])
            break
        except NotImplementedError:
            pass
    else:
        raise NotImplementedError("%s does not implement any of %s" %
                                  (type(self), checked))
    if not device.is_attached(self.thread_pool):
        device.thread_pool_attach(self.thread_pool)
    try:
        super(AcceleratedUnit, self).initialize(**kwargs)
    except AttributeError:
        pass
    self.device = device
    self.intel_opencl_workaround = (
        self.intel_opencl_workaround and
        isinstance(device, OpenCLDevice) and device.device_info.is_cpu)
    if isinstance(self.device, NumpyDevice) and \
            not self._numpy_run_jitted_ and \
            not root.common.engine.disable_numba:
        if jit is None:
            if root.common.warnings.numba:
                self.warning(
                    "Numba (http://numba.pydata.org) was not found, "
                    "numpy_run() is going to be slower. Ignore this "
                    "warning by setting root.common.warnings.numba "
                    "to False.")
        else:
            self.numpy_run = jit(nopython=True, nogil=True)(self.numpy_run)
            self.debug("Jitted numpy_run() with numba")
            self._numpy_run_jitted_ = True
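# The MRO walk above is a generic backend-dispatch pattern: go from the most
# specific device class to the least specific one and pick the first backend
# the unit implements. A minimal, self-contained sketch of that pattern; the
# class names and the DISPATCH table below are illustrative stand-ins, not
# the real VELES types or INTERFACE_MAPPING.
class Backend:
    BACKEND = "base"


class OpenCLBackend(Backend):
    BACKEND = "ocl"


class CUDABackend(Backend):
    BACKEND = "cuda"


# Maps a device class to the method a unit must implement to support it.
DISPATCH = {CUDABackend: "cuda_run", OpenCLBackend: "ocl_run",
            Backend: "numpy_run"}


def pick_implementation(unit, device):
    """Walk the device's MRO from most to least specific and return the
    first backend method the unit actually implements."""
    checked = []
    for cls in type(device).mro():
        if not hasattr(cls, "BACKEND"):
            continue  # not a backend class (e.g. object)
        checked.append(cls)
        method = getattr(unit, DISPATCH[cls], None)
        if method is not None:
            return method
    raise NotImplementedError(
        "%s does not implement any of %s" % (type(unit), checked))


class MyUnit:
    def numpy_run(self):
        return "numpy path"


# A CUDA device with a numpy-only unit falls back to numpy_run():
print(pick_implementation(MyUnit(), CUDABackend())())  # -> numpy path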
def __init__(self, workflow, **kwargs):
    super(AcceleratedUnit, self).__init__(workflow, **kwargs)
    self._device = NumpyDevice()
    self._cache = kwargs.get("cache", True)
    # Yup, this is right - self._force_numpy is initialized in
    # init_unpickled
    self._force_numpy = kwargs.get("force_numpy", self._force_numpy)
    self.intel_opencl_workaround = \
        root.common.engine.force_numpy_run_on_intel_opencl
def __init__(self, interactive=False, **kwargs):
    super(Launcher, self).__init__()
    self._initialized = False
    self._running = False
    parser = Launcher.init_parser(**kwargs)
    self.args, _ = parser.parse_known_args(self.argv)
    self.args.master_address = self.args.master_address.strip()
    self.args.listen_address = self.args.listen_address.strip()
    self.testing = self.args.test
    self.args.matplotlib_backend = self.args.matplotlib_backend.strip()
    self._slaves = [x.strip() for x in self.args.nodes.split(',')
                    if x.strip() != ""]
    self._slave_launch_transform = self.args.slave_launch_transform
    if self._slave_launch_transform.find("%s") < 0:
        raise ValueError("Slave launch command transform must contain %s")
    if self.args.log_file != "":
        log_file = self.args.log_file
        if self.args.log_file_pid:
            log_base_name = os.path.splitext(os.path.basename(log_file))
            log_file = os.path.join(
                os.path.dirname(log_file),
                "%s.%d%s" % (log_base_name[0], os.getpid(),
                             log_base_name[1]))
        logger.Logger.redirect_all_logging_to_file(log_file)
    self._result_file = self.args.result_file
    self.info("My Python is %s %s", platform.python_implementation(),
              platform.python_version())
    self.info("My PID is %d", os.getpid())
    self.info("My time is %s", datetime.datetime.now())
    self.id = str(uuid.uuid4()) if not self.is_slave else None
    self.log_id = self.args.log_id or self.id
    if self.logs_to_mongo:
        if self.mongo_log_addr == "":
            self.args.log_mongo = root.common.mongodb_logging_address
        if not self.is_slave:
            logger.Logger.duplicate_all_logging_to_mongo(
                self.args.log_mongo, self.log_id, "master")
    self._monkey_patch_twisted_failure()
    self._lock = threading.Lock()
    self._webagg_port = 0
    self._agent = None
    self._workflow = None
    self._start_time = None
    self._device = NumpyDevice()
    self._interactive = interactive
    self._reactor_thread = None
    self._notify_update_interval = kwargs.get(
        "status_update_interval",
        root.common.web.notification_interval)
    if self.args.yarn_nodes is not None and self.is_master:
        self._discover_nodes_from_yarn(self.args.yarn_nodes)
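# The "%s" check above guards the slave launch-command transform: each
# slave command is later produced by substituting the raw command into the
# template. A hypothetical illustration of how such a "%s"-style transform
# is applied; the actual substitution site lives elsewhere in the Launcher,
# and the values of transform and command below are invented.
transform = "ssh node01 %s"  # must contain "%s"
command = "veles --slave workflow.py"
if transform.find("%s") < 0:
    raise ValueError("Slave launch command transform must contain %s")
print(transform % command)   # -> ssh node01 veles --slave workflow.py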
def __init__(self, data=None, shallow_pickle=False):
    super(Array, self).__init__()
    self._device = NumpyDevice()
    self.mem = data
    self._max_value = 1.0
    self._shallow_pickle = shallow_pickle
    if six.PY3:
        # Workaround for the unspecified destructor call order
        # (the bug is too hard to reduce to a reportable test case)
        Array.__vectors__.add(weakref.ref(self))
        if not Array.__registered:
            atexit.register(Array.reset_all)
            Array.__registered = True
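# Self-contained sketch of the weakref + atexit registry pattern used above
# (all names here are illustrative). Tracking instances through weak
# references lets a single atexit hook release them all at a deterministic
# point, without the registry itself keeping the objects alive.
import atexit
import weakref


class Tracked:
    _instances = set()
    _registered = False

    def __init__(self):
        Tracked._instances.add(weakref.ref(self))
        if not Tracked._registered:
            atexit.register(Tracked.reset_all)
            Tracked._registered = True

    def reset(self):
        pass  # release device buffers, file handles, etc.

    @staticmethod
    def reset_all():
        for ref in Tracked._instances:
            obj = ref()           # None if already garbage-collected
            if obj is not None:
                obj.reset()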
def init_unpickled(self):
    super(AcceleratedWorkflow, self).init_unpickled()
    self._power_ = 0
    self._last_power_measurement_time = 0
    self.device = NumpyDevice()
    self._power_measure_time_interval = 120
class FullBatchLoader(AcceleratedUnit, FullBatchLoaderBase):
    """Loads the entire dataset into memory.

    Attributes:
        validation_ratio: specifies which part of the merged
                          train + validation set goes to validation.
        original_data: original data (Array).
        original_labels: original labels (Array, dtype=Loader.LABEL_DTYPE)
                         (in case of classification).

    Should be overridden in child class:
        load_data()
    """

    def __init__(self, workflow, **kwargs):
        super(FullBatchLoader, self).__init__(workflow, **kwargs)
        self.verify_interface(IFullBatchLoader)

    def init_unpickled(self):
        super(FullBatchLoader, self).init_unpickled()
        self._original_data_ = memory.Array()
        self._original_labels_ = []
        self._mapped_original_labels_ = memory.Array()
        self.sources_["fullbatch_loader"] = {}
        self._global_size = None
        self._krn_const = numpy.zeros(2, dtype=Loader.LABEL_DTYPE)

    @Loader.shape.getter
    def shape(self):
        """
        Takes the shape from original_data.

        :return: Sample's shape.
        """
        if not self.original_data:
            raise AttributeError("Must first initialize original_data")
        return self.original_data[0].shape

    @property
    def on_device(self):
        return not self.force_numpy

    @on_device.setter
    def on_device(self, value):
        if not isinstance(value, bool):
            raise TypeError("on_device must be boolean (got %s)" %
                            type(value))
        self.force_numpy = not value

    @property
    def original_data(self):
        return self._original_data_

    @property
    def original_labels(self):
        return self._original_labels_

    @property
    def validation_ratio(self):
        return getattr(self, "_validation_ratio", None)

    @validation_ratio.setter
    def validation_ratio(self, value):
        if value is None:
            self._validation_ratio = None
            return
        if isinstance(value, int):
            if value != 0:
                raise ValueError("validation_ratio must be in [0, 1).")
            self._validation_ratio = 0.0
            return
        if not isinstance(value, float):
            raise TypeError("validation_ratio must be a float")
        if value < 0 or value >= 1:
            raise ValueError("validation_ratio must be in [0, 1).")
        self._validation_ratio = value

    def get_ocl_defines(self):
        """Add definitions before building the kernel during initialize().
        """
        return {}

    def initialize(self, device, **kwargs):
        super(FullBatchLoader, self).initialize(device=device, **kwargs)
        assert self.total_samples > 0
        self.analyze_original_dataset()
        self._map_original_labels()
        if isinstance(self.device, NumpyDevice):
            return
        self.info("Will try to store the entire dataset on the device")
        try:
            self.init_vectors(self.original_data, self.minibatch_data)
        except CLRuntimeError as e:
            if e.code == CL_MEM_OBJECT_ALLOCATION_FAILURE:
                self.warning("Failed to store the entire dataset on the "
                             "device")
                self.force_numpy = True
                self.device = NumpyDevice()
                return
            else:
                raise from_none(e)
        except CUDARuntimeError as e:
            if e.code == CUDA_ERROR_OUT_OF_MEMORY:
                self.warning("Failed to store the entire dataset on the "
                             "device")
                self.force_numpy = True
                self.device = NumpyDevice()
                return
            else:
                raise from_none(e)
        if self.has_labels:
            self.init_vectors(self._mapped_original_labels_,
                              self.minibatch_labels)
        if not self.shuffled_indices:
            self.shuffled_indices.mem = numpy.arange(
                self.total_samples, dtype=Loader.LABEL_DTYPE)
        self.init_vectors(self.shuffled_indices, self.minibatch_indices)

    def _gpu_init(self):
        defines = {
            "LABELS": int(self.has_labels),
            "SAMPLE_SIZE": self.original_data.sample_size,
            "MAX_MINIBATCH_SIZE": self.max_minibatch_size,
            "original_data_dtype": numpy_dtype_to_opencl(
                self.original_data.dtype),
            "minibatch_data_dtype": numpy_dtype_to_opencl(
                self.minibatch_data.dtype)
        }
        defines.update(self.get_ocl_defines())

        self.build_program(defines, "fullbatch_loader",
                           dtype=self.minibatch_data.dtype)
        self.assign_kernel("fill_minibatch_data_labels")

        if not self.has_labels:
            self.set_args(self.original_data, self.minibatch_data,
                          self.device.skip(2), self.shuffled_indices,
                          self.minibatch_indices)
        else:
            self.set_args(self.original_data, self.minibatch_data,
                          self.device.skip(2), self._mapped_original_labels_,
                          self.minibatch_labels, self.shuffled_indices,
                          self.minibatch_indices)

    def _after_backend_init(self):
        try:
            self.fill_indices(0, min(self.max_minibatch_size,
                                     self.total_samples))
        except CLRuntimeError as e:
            if e.code == CL_MEM_OBJECT_ALLOCATION_FAILURE:
                self.warning("Failed to store the entire dataset on the "
                             "device")
                self.force_numpy = True
                self.device = NumpyDevice()
            else:
                raise from_none(e)
        except CUDARuntimeError as e:
            if e.code == CUDA_ERROR_OUT_OF_MEMORY:
                self.warning("Failed to store the entire dataset on the "
                             "device")
                self.force_numpy = True
                self.device = NumpyDevice()
            else:
                raise from_none(e)

    def numpy_run(self):
        Loader.run(self)

    def ocl_run(self):
        self.numpy_run()

    def cuda_run(self):
        self.numpy_run()

    def ocl_init(self):
        self._gpu_init()
        self._global_size = (self.max_minibatch_size,
                             self.minibatch_data.sample_size)
        self._local_size = None

    def cuda_init(self):
        self._gpu_init()
        block_size = self.device.suggest_block_size(self._kernel_)
        self._global_size = (
            int(numpy.ceil(self.minibatch_data.size / block_size)), 1, 1)
        self._local_size = (block_size, 1, 1)

    def on_before_create_minibatch_data(self):
        self._has_labels = len(self.original_labels) > 0
        try:
            super(FullBatchLoader, self).on_before_create_minibatch_data()
        except AttributeError:
            pass
        self._resize_validation()

    def create_minibatch_data(self):
        self.minibatch_data.reset(numpy.zeros(
            (self.max_minibatch_size,) + self.shape, dtype=self.dtype))

    def create_originals(self, dshape, labels=True):
        """
        Create original_data.mem and original_labels.mem.

        :param dshape: Future original_data.shape[1:]
        """
        self.original_data.reset(numpy.zeros(
            (self.total_samples,) + dshape, self.dtype))
        if not labels:
            return
        self._mapped_original_labels_.reset(numpy.zeros(
            self.total_samples, Loader.LABEL_DTYPE))
        del self.original_labels[:]
        self.original_labels.extend(None for _ in range(self.total_samples))

    def fill_indices(self, start_offset, count):
        if isinstance(self.device, NumpyDevice):
            return super(FullBatchLoader, self).fill_indices(
                start_offset, count)
        self.unmap_vectors(self.original_data, self.minibatch_data,
                           self.shuffled_indices, self.minibatch_indices)
        if self.has_labels:
            self.unmap_vectors(self._mapped_original_labels_,
                               self.minibatch_labels)
        self._krn_const[0:2] = start_offset, count
        self._kernel_.set_arg(2, self._krn_const[0:1])
        self._kernel_.set_arg(3, self._krn_const[1:2])
        self.execute_kernel(self._global_size, self._local_size)
        # No further processing needed, so return True
        return True

    def fill_minibatch(self):
        for i, sample_index in enumerate(
                self.minibatch_indices.mem[:self.minibatch_size]):
            # int() is required by (guess what...) PyPy
            self.minibatch_data[i] = self.original_data[int(sample_index)]
            if self.has_labels:
                self.minibatch_labels[i] = \
                    self._mapped_original_labels_[int(sample_index)]

    def map_minibatch_labels(self):
        pass

    def analyze_dataset(self):
        """
        Override.
        """
        pass

    def normalize_minibatch(self):
        """
        Override.
        """
        pass

    def analyze_original_dataset(self):
        self.info("Normalizing to %s...", self.normalization_type)
        self.debug("Data range: (%.6f, %.6f)",
                   self.original_data.min(), self.original_data.max())
        if self.class_lengths[TRAIN] > 0:
            self.normalizer.analyze(
                self.original_data[self.class_end_offsets[VALID]:])
        self.normalizer.normalize(self.original_data.mem)
        self.debug("Normalized data range: (%.6f, %.6f)",
                   self.original_data.min(), self.original_data.max())

    def _resize_validation(self):
        """Randomly extracts the validation dataset from the joined
        validation and train datasets.

        Only the indexes are rearranged.
        """
        rand = self.prng
        ratio = self.validation_ratio
        if ratio is None:
            return
        if ratio <= 0:  # Dispose of the validation set
            self.class_lengths[TRAIN] += self.class_lengths[VALID]
            self.class_lengths[VALID] = 0
            if self.shuffled_indices.mem is None:
                self.shuffled_indices.mem = numpy.arange(
                    self.total_samples, dtype=Loader.LABEL_DTYPE)
            return
        offs_test = self.class_lengths[TEST]
        offs = offs_test
        train_samples = self.class_lengths[VALID] + self.class_lengths[TRAIN]
        total_samples = train_samples + offs
        original_labels = self.original_labels

        if self.shuffled_indices.mem is None:
            self.shuffled_indices.mem = numpy.arange(
                total_samples, dtype=Loader.LABEL_DTYPE)
        self.shuffled_indices.map_write()
        shuffled_indices = self.shuffled_indices.mem

        # If there are no labels
        if not self.has_labels:
            n = int(numpy.round(ratio * train_samples))
            while n > 0:
                i = rand.randint(offs, offs + train_samples)
                # Swap indexes
                shuffled_indices[offs], shuffled_indices[i] = \
                    shuffled_indices[i], shuffled_indices[offs]
                offs += 1
                n -= 1
            self.class_lengths[VALID] = offs - offs_test
            self.class_lengths[TRAIN] = \
                total_samples - self.class_lengths[VALID] - offs_test
            return

        # If there are labels
        nn = {}
        for i in shuffled_indices[offs:]:
            l = original_labels[i]
            nn[l] = nn.get(l, 0) + 1
        n = 0
        for l in nn.keys():
            n_train = nn[l]
            nn[l] = max(int(numpy.round(ratio * nn[l])), 1)
            if nn[l] >= n_train:
                raise ValueError("There are too few labels for class %s: %s"
                                 % (l, n_train))
            n += nn[l]
        while n > 0:
            i = rand.randint(offs, offs_test + train_samples)
            l = original_labels[shuffled_indices[i]]
            if nn[l] <= 0:
                # Move the unused label to the end by swapping indexes
                ii = shuffled_indices[offs_test + train_samples - 1]
                shuffled_indices[offs_test + train_samples - 1] = \
                    shuffled_indices[i]
                shuffled_indices[i] = ii
                train_samples -= 1
                continue
            # Swap indexes
            ii = shuffled_indices[offs]
            shuffled_indices[offs] = shuffled_indices[i]
            shuffled_indices[i] = ii
            nn[l] -= 1
            n -= 1
            offs += 1
        self.class_lengths[VALID] = offs - offs_test
        self.class_lengths[TRAIN] = (total_samples -
                                     self.class_lengths[VALID] - offs_test)

    def _map_original_labels(self):
        self._has_labels = len(self.original_labels) > 0
        if not self.has_labels:
            return
        if len(self.labels_mapping) > 0:
            self._init_mapped_original_labels()
            return
        if len(self.original_labels) != self.original_data.shape[0]:
            raise ValueError(
                "original_labels and original_data must have the same "
                "length (%d vs %d)" % (len(self.original_labels),
                                       self.original_data.shape[0]))
        for ind, lbl in enumerate(self.original_labels):
            self._samples_mapping[lbl].add(ind)
        different_labels = tuple(Counter(
            self.original_labels[i] for i in self.shuffled_indices[
                self.class_end_offsets[c] - self.class_lengths[c]:
                self.class_end_offsets[c]])
            for c in range(3))
        self._setup_labels_mapping(different_labels)
        self._init_mapped_original_labels()

    def _init_mapped_original_labels(self):
        self._mapped_original_labels_.reset(
            numpy.zeros(self.total_samples, Loader.LABEL_DTYPE))
        for i, label in enumerate(self.original_labels):
            self._mapped_original_labels_[i] = self.labels_mapping[label]