def _transform_multi_channel_data(self, X, y):
    """Partition the data into windows and convert it to a design matrix.

    Parameters
    ----------
    X, y
        Passed straight to `self._partition_data` together with
        `self.window_size`.

    Returns
    -------
    tuple
        `(design_matrix, one_hot_targets, view_converter)`.
    """
    # Cut X and y into windows of `self.window_size`.
    windows_X, windows_y = self._partition_data(
        X=X, y=y, partition_size=self.window_size)

    # Swap the last two axes, then insert a singleton axis at position 2 so
    # the array follows the ('b', 0, 1, 'c') layout used by the converter.
    swapped = np.transpose(windows_X, [0, 2, 1])
    n_windows, n_rows, n_channels = swapped.shape
    topo_X = np.reshape(swapped, (n_windows, n_rows, 1, n_channels))

    converter = DefaultViewConverter(shape=self.sample_shape,
                                     axes=('b', 0, 1, 'c'))
    design_X = converter.topo_view_to_design_mat(topo_X)

    # Sanity check: the conversion must round-trip without loss.
    restored = converter.design_mat_to_topo_view(design_X)
    assert np.all(topo_X == restored)

    # Sum the labels of each window along axis 1; any positive sum is
    # clamped to 1 before one-hot encoding.
    window_labels = np.sum(windows_y, axis=1)
    window_labels[window_labels > 0] = 1
    targets = OneHotFormatter(max_labels=self.n_classes).format(window_labels)

    return design_X, targets, converter
def get_features(path, split, standardize):
    """Load features from disk and return them as a 2-D design matrix.

    Parameters
    ----------
    path : str
        File to load. If it contains commas it is treated as a
        comma-separated list of paths: each one is loaded recursively and
        the results are concatenated along axis 1. Paths ending in
        '.npy' are read with `np.load`, everything else with
        `serial.load`.
    split : bool
        If true, the result is the concatenation along axis 1 of
        `np.abs(X)` and `np.abs(-X)`.
    standardize : bool
        Not supported, must be false (see Raises).

    Returns
    -------
    X : ndarray
        The loaded data if it is already 2-D, otherwise the design matrix
        produced by a `DefaultViewConverter` built from the trailing
        dimensions of the loaded array.

    Raises
    ------
    AssertionError
        If `standardize` is true. Standardizing here would use the
        statistics of the loaded data itself, which is wrong when the
        data is a test set (the training mean / std must be used).
    """
    if ',' in path:
        parts = [get_features(subpath, split, standardize)
                 for subpath in path.split(',')]
        return np.concatenate(parts, axis=1)

    if path.endswith('.npy'):
        topo_view = np.load(path)
    else:
        topo_view = serial.load(path)

    if 'h5py' in str(type(topo_view)):
        # The container must hold exactly one entry; unpacking fails
        # otherwise.
        name, = topo_view.keys()
        # `Dataset.value` was removed in h5py 3.0; `[()]` reads the whole
        # dataset on both old and new h5py versions.
        topo_view = topo_view[name][()].T

    if len(topo_view.shape) == 2:
        X = topo_view
    else:
        view_converter = DefaultViewConverter(topo_view.shape[1:])
        print('converting data')
        X = view_converter.topo_view_to_design_mat(topo_view)

    if split:
        # NOTE(review): abs(X) == abs(-X), so both halves are identical.
        # Possibly max(X, 0) / max(-X, 0) was intended -- confirm before
        # changing, existing models may depend on the current features.
        X = np.concatenate((np.abs(X), np.abs(-X)), axis=1)

    if standardize:
        # Raised explicitly (instead of `assert False`) so the guard is not
        # stripped when Python runs with -O.
        raise AssertionError(
            'standardize is not supported: if X is a test set, the train '
            'mean must be subtracted and the train std used for division')

    return X
def get_features(path, split, standardize):
    """Load features from disk and return them as a 2-D design matrix.

    Parameters
    ----------
    path : str
        File to load. If it contains commas it is treated as a
        comma-separated list of paths: each one is loaded recursively and
        the results are concatenated along axis 1. Paths ending in
        '.npy' are read with `np.load`, everything else with
        `serial.load`.
    split : bool
        If true, the result is the concatenation along axis 1 of
        `np.abs(X)` and `np.abs(-X)`.
    standardize : bool
        Not supported, must be false (see Raises).

    Returns
    -------
    X : ndarray
        The loaded data if it is already 2-D, otherwise the design matrix
        produced by a `DefaultViewConverter` built from the trailing
        dimensions of the loaded array.

    Raises
    ------
    AssertionError
        If `standardize` is true. Standardizing here would use the
        statistics of the loaded data itself, which is wrong when the
        data is a test set (the training mean / std must be used).
    """
    if ',' in path:
        parts = [get_features(subpath, split, standardize)
                 for subpath in path.split(',')]
        return np.concatenate(parts, axis=1)

    if path.endswith('.npy'):
        topo_view = np.load(path)
    else:
        topo_view = serial.load(path)

    if 'h5py' in str(type(topo_view)):
        # The container must hold exactly one entry; unpacking fails
        # otherwise.
        name, = topo_view.keys()
        # `Dataset.value` was removed in h5py 3.0; `[()]` reads the whole
        # dataset on both old and new h5py versions.
        topo_view = topo_view[name][()].T

    if len(topo_view.shape) == 2:
        X = topo_view
    else:
        view_converter = DefaultViewConverter(topo_view.shape[1:])
        print('converting data')
        X = view_converter.topo_view_to_design_mat(topo_view)

    if split:
        # NOTE(review): abs(X) == abs(-X), so both halves are identical.
        # Possibly max(X, 0) / max(-X, 0) was intended -- confirm before
        # changing, existing models may depend on the current features.
        X = np.concatenate((np.abs(X), np.abs(-X)), axis=1)

    if standardize:
        # Raised explicitly (instead of `assert False`) so the guard is not
        # stripped when Python runs with -O.
        raise AssertionError(
            'standardize is not supported: if X is a test set, the train '
            'mean must be subtracted and the train std used for division')

    return X
def _transform_multi_channel_data(self, X, y):
    """Partition the data into windows and convert it to a design matrix.

    Parameters
    ----------
    X, y
        Passed straight to `self._partition_data` together with
        `self.window_size`.

    Returns
    -------
    tuple
        `(design_matrix, one_hot_targets, view_converter)`.
    """
    # Cut X and y into windows of `self.window_size`.
    windows_X, windows_y = self._partition_data(
        X=X, y=y, partition_size=self.window_size)

    # Swap the last two axes, then insert a singleton axis at position 2 so
    # the array follows the ('b', 0, 1, 'c') layout used by the converter.
    swapped = np.transpose(windows_X, [0, 2, 1])
    n_windows, n_rows, n_channels = swapped.shape
    topo_X = np.reshape(swapped, (n_windows, n_rows, 1, n_channels))

    converter = DefaultViewConverter(shape=self.sample_shape,
                                     axes=('b', 0, 1, 'c'))
    design_X = converter.topo_view_to_design_mat(topo_X)

    # Sanity check: the conversion must round-trip without loss.
    restored = converter.design_mat_to_topo_view(design_X)
    assert np.all(topo_X == restored)

    # Sum the labels of each window along axis 1; any positive sum is
    # clamped to 1 before one-hot encoding.
    window_labels = np.sum(windows_y, axis=1)
    window_labels[window_labels > 0] = 1
    targets = OneHotFormatter(max_labels=self.n_classes).format(window_labels)

    return design_X, targets, converter
class TemporalDenseDesignMatrix(DenseDesignMatrix):
    '''
    A class for representing datasets that can be stored as a dense design
    matrix, but whose examples are slices of width >= 2 rows each.

    The design matrix is kept in `self._X`; `self.X` is set to None outside
    of `iterator` to prevent other code from accessing it directly.
    '''
    _default_seed = (17, 2, 946)

    def __init__(self, X=None, topo_view=None, y=None,
                 view_converter=None, axes=('b', 0, 1, 2, 'c'),
                 rng=_default_seed, preprocessor=None,
                 fit_preprocessor=False):
        '''
        TODO: rewrite or just inherit... same as DenseDesignMatrix...???

        Parameters
        ----------
        X : ndarray, 2-dimensional, optional
            Forwarded to `DenseDesignMatrix.__init__`. Supplying only `X`
            is not allowed: `topo_view` is required.
        topo_view : ndarray
            Required. An array whose first dimension is treated as the
            batch axis; the remaining dimensions are rows, columns and
            channels.
            TODO: time is 0, ii is 1, jj is 2
        y : ndarray, optional
            Labels or targets. `set_topological_view` currently asserts
            that no targets are set.
        view_converter : object, optional
            An object for converting between the design matrix stored
            internally and the data that will be returned by iterators.
        axes : tuple
            Must be ('b', 0, 1, 2, 'c'); the parent class is initialised
            with the reduced axes ('b', 0, 1, 'c').
        rng : object, optional
            A random number generator used for picking random indices into
            the design matrix when choosing minibatches.
        preprocessor, fit_preprocessor
            Forwarded to `DenseDesignMatrix.__init__`.
        '''
        assert topo_view is not None, (
            'For TemporalDenseDesignMatrix, must provide topo_view (not X)'
        )
        assert axes == ('b', 0, 1, 2, 'c')
        reduced_axes = ('b', 0, 1, 'c')
        super(TemporalDenseDesignMatrix, self).__init__(
            X=X,
            topo_view=topo_view,
            y=y,
            view_converter=view_converter,
            axes=reduced_axes,
            rng=rng,
            preprocessor=preprocessor,
            fit_preprocessor=fit_preprocessor
        )
        self._X = self.X
        self.X = None  # prevent other access

    def set_topological_view(self, topo_view, axes=('b', 0, 1, 'c')):
        '''
        Sets the dataset to represent topo_view, where topo_view is a batch
        of topological views of examples.

        Parameters
        ----------
        topo_view : ndarray
            An array of topological views laid out according to `axes`.
            Must not contain NaN.
        axes : tuple
            Order of the batch ('b'), row (0), column (1) and channel ('c')
            axes in `topo_view`.
        '''
        assert not np.any(np.isnan(topo_view))
        # pretend frames come in as batch dim
        frames = topo_view.shape[axes.index('b')]
        rows = topo_view.shape[axes.index(0)]
        cols = topo_view.shape[axes.index(1)]
        channels = topo_view.shape[axes.index('c')]

        # leave out frames...
        self.view_converter = DefaultViewConverter([rows, cols, channels],
                                                   axes=axes)
        self.X = self.view_converter.topo_view_to_design_mat(topo_view)
        # self.X_topo_space stores a "default" topological space that
        # will be used only when self.iterator is called without a
        # data_specs, and with "topo=True", which is deprecated.
        self.X_topo_space = self.view_converter.topo_space
        assert not np.any(np.isnan(self.X))

        # Update data specs
        X_space = VectorSpace(dim=frames * rows * cols * channels)
        X_source = 'features'

        assert self.y is None, 'y not supported now'
        space = X_space
        source = X_source

        self.data_specs = (space, source)
        self.X_space = X_space
        self._iter_data_specs = (X_space, X_source)

    @functools.wraps(Dataset.iterator)
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):
        '''thin wrapper... TODO: doc'''
        assert mode == 'shuffled_sequential', (
            'Only shuffled_sequential mode is supported'
        )
        assert data_specs is not None, 'Must provide data_specs'
        assert len(data_specs) == 2, (
            'data_specs must include only one tuple for "features"')
        assert type(data_specs[0]) is CompositeSpace, (
            'must be composite space...??')
        assert data_specs[0].num_components == 1, (
            'must only have one component, features')
        assert data_specs[1][0] == 'features', (
            'data_specs must include only one tuple for "features"')

        output_space = data_specs[0].components[0]
        num_frames = output_space.shape[0]

        if num_batches is None:
            # another hack... just determines how often new iterators will
            # be created?
            num_batches = 10

        # Iterates through ONE example at a time
        base_num_batches = num_batches * batch_size

        # BEGIN HUGE HACK (enable self.X access just for this function)
        self.X = self._X
        try:
            base_iterator = super(TemporalDenseDesignMatrix, self).iterator(
                mode='random_slice',  # to return continguous bits
                batch_size=num_frames,
                num_batches=base_num_batches,
                topo=topo,
                targets=targets,
                rng=rng,
                data_specs=data_specs,
                return_tuple=False)
        finally:
            # Hide the design matrix again even if the parent iterator
            # could not be created.
            self.X = None
        # END HUGE HACK

        return CopyingConcatenatingIterator(base_iterator,
                                            how_many=batch_size)
class myDenseDesignMatrix(dense_design_matrix.DenseDesignMatrix):
    """
    Dense design matrix that carries a `latent` array next to the features
    `X` and the targets `y`.

    When targets are present, the data specs are the composite
    (features, targets, latents).
    """
    _default_seed = (17, 2, 946)

    def __init__(self, X=None, topo_view=None, y=None, latent=None,
                 view_converter=None, axes=('b', 0, 1, 'c'),
                 rng=_default_seed, preprocessor=None, fit_preprocessor=False,
                 X_labels=None, y_labels=None):
        """
        Parameters
        ----------
        X : ndarray, optional
            Design matrix. Required if `topo_view` is not given.
        topo_view : ndarray, optional
            Topological view of the examples; if given, `view_converter`
            must be None and `set_topological_view` is used.
        y : ndarray, optional
            Targets.
        latent : ndarray, optional
            Latent values. Its last dimension is read whenever `y` is not
            None, so it has to be supplied together with `y`.
        view_converter : object, optional
            Must expose a `topo_space` attribute when given.
        axes : tuple
            Axis order of `topo_view`.
        rng : object, optional
            Seed or generator handed to `make_np_rng`.
        preprocessor : object, optional
            If given, applied to this dataset at the end of construction.
        fit_preprocessor : bool
            Passed to the preprocessor as `can_fit`.
        X_labels, y_labels : int, optional
            If given, the corresponding space is an `IndexSpace` with this
            many labels instead of a `VectorSpace`.
        """
        self.latent = latent
        self.X = X
        self.y = y
        self.view_converter = view_converter
        self.X_labels = X_labels
        self.y_labels = y_labels

        self._check_labels()

        if topo_view is not None:
            assert view_converter is None
            self.set_topological_view(topo_view, axes)
        else:
            assert X is not None, ("DenseDesignMatrix needs to be provided "
                                   "with either topo_view, or X")
            if view_converter is not None:

                # Get the topo_space (usually Conv2DSpace) from the
                # view_converter
                if not hasattr(view_converter, 'topo_space'):
                    raise NotImplementedError("Not able to get a topo_space "
                                              "from this converter: %s"
                                              % view_converter)

                # self.X_topo_space stores a "default" topological space that
                # will be used only when self.iterator is called without a
                # data_specs, and with "topo=True", which is deprecated.
                self.X_topo_space = view_converter.topo_space
            else:
                self.X_topo_space = None

            # Update data specs, if not done in set_topological_view
            X_source = 'features'
            if X_labels is None:
                X_space = VectorSpace(dim=X.shape[1])
            else:
                if X.ndim == 1:
                    dim = 1
                else:
                    dim = X.shape[-1]
                X_space = IndexSpace(dim=dim, max_labels=X_labels)
            if y is None:
                space = X_space
                source = X_source
            else:
                if y.ndim == 1:
                    dim = 1
                else:
                    dim = y.shape[-1]
                if y_labels is not None:
                    y_space = IndexSpace(dim=dim, max_labels=y_labels)
                else:
                    y_space = VectorSpace(dim=dim)
                y_source = 'targets'

                Latent_space = VectorSpace(dim=latent.shape[-1])
                Latent_source = 'latents'
                space = CompositeSpace((X_space, y_space, Latent_space))
                source = (X_source, y_source, Latent_source)

            self.data_specs = (space, source)
            self.X_space = X_space

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method="random_integers")
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor

    def get_data(self):
        """
        Returns all the data, as it is internally stored.
        The definition and format of these data are described in
        `self.get_data_specs()`.

        Returns
        -------
        data : numpy matrix or 3-tuple of matrices
            `X` alone when there are no targets, otherwise
            `(X, y, latent)`.
        """
        if self.y is None:
            return self.X
        else:
            return (self.X, self.y, self.latent)

    def set_topological_view(self, V, axes=('b', 0, 1, 'c')):
        """
        Sets the dataset to represent V, where V is a batch
        of topological views of examples.

        .. todo::

            Why is this parameter named 'V'?

        Parameters
        ----------
        V : ndarray
            A batch of topological views laid out according to `axes`.
            Must not contain NaN.
        axes : tuple
            Order of the batch ('b'), row (0), column (1) and channel ('c')
            axes in `V`.
        """
        assert not contains_nan(V)
        rows = V.shape[axes.index(0)]
        cols = V.shape[axes.index(1)]
        channels = V.shape[axes.index('c')]
        self.view_converter = DefaultViewConverter([rows, cols, channels],
                                                   axes=axes)
        self.X = self.view_converter.topo_view_to_design_mat(V)
        # self.X_topo_space stores a "default" topological space that
        # will be used only when self.iterator is called without a
        # data_specs, and with "topo=True", which is deprecated.
        self.X_topo_space = self.view_converter.topo_space
        assert not contains_nan(self.X)

        # Update data specs
        X_space = VectorSpace(dim=self.X.shape[1])
        X_source = 'features'
        if self.y is None:
            space = X_space
            source = X_source
        else:
            if self.y.ndim == 1:
                dim = 1
            else:
                dim = self.y.shape[-1]
            # This is to support old pickled models
            if getattr(self, 'y_labels', None) is not None:
                y_space = IndexSpace(dim=dim, max_labels=self.y_labels)
            elif getattr(self, 'max_labels', None) is not None:
                y_space = IndexSpace(dim=dim, max_labels=self.max_labels)
            else:
                y_space = VectorSpace(dim=dim)
            y_source = 'targets'

            Latent_space = VectorSpace(dim=self.latent.shape[-1])
            Latent_source = 'latents'
            space = CompositeSpace((X_space, y_space, Latent_space))
            source = (X_source, y_source, Latent_source)

        self.data_specs = (space, source)
        self.X_space = X_space
        self._iter_data_specs = (X_space, X_source)

    def get_targets(self):
        """Return the targets `y` as stored."""
        return self.y

    def get_latents(self):
        """Return the latent values as stored."""
        return self.latent

    def get_batch_design(self, batch_size, include_labels=False):
        """
        Return a random contiguous slice of `batch_size` rows.

        Parameters
        ----------
        batch_size : int
            Number of consecutive rows to return.
        include_labels : bool
            If true, targets and latents of the same rows are returned too.

        Returns
        -------
        ndarray or tuple
            Without labels: the rows of `X`, cast to `config.floatX`.
            With labels: `(rows, targets, latents)`, or `(rows, None)` when
            the dataset has no targets. The rows are not cast in this case.

        Raises
        ------
        ValueError
            If more examples are requested than the dataset contains.
        """
        try:
            idx = self.rng.randint(self.X.shape[0] - batch_size + 1)
        except ValueError:
            if batch_size > self.X.shape[0]:
                reraise_as(ValueError("Requested %d examples from a dataset "
                                      "containing only %d."
                                      % (batch_size, self.X.shape[0])))
            raise
        rx = self.X[idx:idx + batch_size, :]
        if include_labels:
            if self.y is None:
                return rx, None
            ry = self.y[idx:idx + batch_size]
            rlatent = self.latent[idx:idx + batch_size]
            return rx, ry, rlatent
        rx = np.cast[config.floatX](rx)
        return rx

    def get_batch_topo(self, batch_size, include_labels=False):
        """
        Return a random contiguous batch as a topological view.

        Parameters
        ----------
        batch_size : int
            Number of consecutive examples to return.
        include_labels : bool
            If true, whatever `get_batch_design` returns after the design
            matrix is returned after the topological view as well.

        Returns
        -------
        ndarray or tuple
            The topological view alone, or `(view, targets, latents)`
            (`(view, None)` when the dataset has no targets).
        """
        if not include_labels:
            batch_design = self.get_batch_design(batch_size)
            return self.view_converter.design_mat_to_topo_view(batch_design)

        # get_batch_design returns a 2-tuple for datasets without targets
        # and a 3-tuple otherwise; mirror its arity instead of assuming
        # three values, which failed on unlabeled datasets.
        batch = self.get_batch_design(batch_size, True)
        rval = self.view_converter.design_mat_to_topo_view(batch[0])
        return (rval,) + tuple(batch[1:])
class TemporalDenseDesignMatrix(DenseDesignMatrix):
    '''
    A class for representing datasets that can be stored as a dense design
    matrix, but whose examples are slices of width >= 2 rows each.

    The design matrix is kept in `self._X`; `self.X` is set to None outside
    of `iterator` to prevent other code from accessing it directly.
    '''
    _default_seed = (17, 2, 946)

    def __init__(self, X=None, topo_view=None, y=None,
                 view_converter=None, axes=('b', 0, 1, 2, 'c'),
                 rng=_default_seed, preprocessor=None,
                 fit_preprocessor=False):
        '''
        TODO: rewrite or just inherit... same as DenseDesignMatrix...???

        Parameters
        ----------
        X : ndarray, 2-dimensional, optional
            Forwarded to `DenseDesignMatrix.__init__`. Supplying only `X`
            is not allowed: `topo_view` is required.
        topo_view : ndarray
            Required. An array whose first dimension is treated as the
            batch axis; the remaining dimensions are rows, columns and
            channels.
            TODO: time is 0, ii is 1, jj is 2
        y : ndarray, optional
            Labels or targets. `set_topological_view` currently asserts
            that no targets are set.
        view_converter : object, optional
            An object for converting between the design matrix stored
            internally and the data that will be returned by iterators.
        axes : tuple
            Must be ('b', 0, 1, 2, 'c'); the parent class is initialised
            with the reduced axes ('b', 0, 1, 'c').
        rng : object, optional
            A random number generator used for picking random indices into
            the design matrix when choosing minibatches.
        preprocessor, fit_preprocessor
            Forwarded to `DenseDesignMatrix.__init__`.
        '''
        assert topo_view is not None, (
            'For TemporalDenseDesignMatrix, must provide topo_view (not X)')
        assert axes == ('b', 0, 1, 2, 'c')
        reduced_axes = ('b', 0, 1, 'c')
        super(TemporalDenseDesignMatrix, self).__init__(
            X=X,
            topo_view=topo_view,
            y=y,
            view_converter=view_converter,
            axes=reduced_axes,
            rng=rng,
            preprocessor=preprocessor,
            fit_preprocessor=fit_preprocessor)
        self._X = self.X
        self.X = None  # prevent other access

    def set_topological_view(self, topo_view, axes=('b', 0, 1, 'c')):
        '''
        Sets the dataset to represent topo_view, where topo_view is a batch
        of topological views of examples.

        Parameters
        ----------
        topo_view : ndarray
            An array of topological views laid out according to `axes`.
            Must not contain NaN.
        axes : tuple
            Order of the batch ('b'), row (0), column (1) and channel ('c')
            axes in `topo_view`.
        '''
        assert not np.any(np.isnan(topo_view))
        # pretend frames come in as batch dim
        frames = topo_view.shape[axes.index('b')]
        rows = topo_view.shape[axes.index(0)]
        cols = topo_view.shape[axes.index(1)]
        channels = topo_view.shape[axes.index('c')]

        # leave out frames...
        self.view_converter = DefaultViewConverter([rows, cols, channels],
                                                   axes=axes)
        self.X = self.view_converter.topo_view_to_design_mat(topo_view)
        # self.X_topo_space stores a "default" topological space that
        # will be used only when self.iterator is called without a
        # data_specs, and with "topo=True", which is deprecated.
        self.X_topo_space = self.view_converter.topo_space
        assert not np.any(np.isnan(self.X))

        # Update data specs
        X_space = VectorSpace(dim=frames * rows * cols * channels)
        X_source = 'features'

        assert self.y is None, 'y not supported now'
        space = X_space
        source = X_source

        self.data_specs = (space, source)
        self.X_space = X_space
        self._iter_data_specs = (X_space, X_source)

    @functools.wraps(Dataset.iterator)
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):
        '''thin wrapper... TODO: doc'''
        assert mode == 'shuffled_sequential', (
            'Only shuffled_sequential mode is supported')
        assert data_specs is not None, 'Must provide data_specs'
        assert len(data_specs) == 2, (
            'data_specs must include only one tuple for "features"')
        assert type(data_specs[0]) is CompositeSpace, (
            'must be composite space...??')
        assert data_specs[0].num_components == 1, (
            'must only have one component, features')
        assert data_specs[1][0] == 'features', (
            'data_specs must include only one tuple for "features"')

        output_space = data_specs[0].components[0]
        num_frames = output_space.shape[0]

        if num_batches is None:
            # another hack... just determines how often new iterators will
            # be created?
            num_batches = 10

        # Iterates through ONE example at a time
        base_num_batches = num_batches * batch_size

        # BEGIN HUGE HACK (enable self.X access just for this function)
        self.X = self._X
        try:
            base_iterator = super(TemporalDenseDesignMatrix, self).iterator(
                mode='random_slice',  # to return continguous bits
                batch_size=num_frames,
                num_batches=base_num_batches,
                topo=topo,
                targets=targets,
                rng=rng,
                data_specs=data_specs,
                return_tuple=False)
        finally:
            # Hide the design matrix again even if the parent iterator
            # could not be created.
            self.X = None
        # END HUGE HACK

        return CopyingConcatenatingIterator(base_iterator,
                                            how_many=batch_size)
def __init__(self, patient_id, which_set, leave_out_seizure_idx_valid,
             leave_out_seizure_idx_test, data_dir, preprocessor_dir,
             batch_size=None, balance_class=True, decompose_subbands=False,
             axes=('b', 0, 1, 'c'), default_seed=0):
    """Load, scale and reshape the EEG data, then build the design matrix.

    Parameters
    ----------
    patient_id, which_set, leave_out_seizure_idx_valid,
    leave_out_seizure_idx_test, data_dir
        Forwarded to `EpilepsiaeEEGLoader.__init__`.
    preprocessor_dir : str
        Directory of the pickled scaler. The scaler is fitted and written
        there when `which_set` is 'train', and read from there otherwise.
    batch_size : int, optional
        If not None and no class balancing takes place, the data is passed
        through `self.zero_pad` with this batch size.
    balance_class : bool
        If true and the set is 'train' or 'valid_train', the full data is
        kept in `self.X_full` / `self.y_full` and the data handed to the
        design matrix comes from `self.get_data()`.
    decompose_subbands : bool
        If true, every segment is band-pass filtered into 5 sub-bands that
        become axis 2 of the topological view; otherwise that axis has
        size 1.
    axes : tuple
        Passed to `DenseDesignMatrix.__init__`.
    default_seed : int
        Not used by this method.
    """
    self.balance_class = balance_class
    self.batch_size = batch_size

    EpilepsiaeEEGLoader.__init__(
        self,
        patient_id=patient_id,
        which_set=which_set,
        leave_out_seizure_idx_valid=leave_out_seizure_idx_valid,
        leave_out_seizure_idx_test=leave_out_seizure_idx_test,
        data_dir=data_dir)

    print('Load signal ...')
    t = time.time()
    # (# of segments, # of samples, # of channels)
    raw_X, y = self.load_data()
    elapsed = time.time() - t
    print(' Elapsed time: ' + str(elapsed) + ' seconds')

    # Preprocessing
    print('Scaling signal ...')
    t = time.time()
    scaler_path = os.path.join(
        preprocessor_dir,
        self.patient_id + '_scaler_eeg_' +
        str(self.leave_out_seizure_idx_valid) + '_' +
        str(self.leave_out_seizure_idx_test) + '.pkl')
    if which_set == 'train':
        # Reshape the data back to (number of samples x number of
        # channels) for pre-processing
        unrolled_X = np.reshape(raw_X, (-1, self.scalp_channel_labels.size))
        scaler = preprocessing.StandardScaler()
        # scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(unrolled_X)
        # Pickle data is binary: open the file in binary mode.
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)
    else:
        # NOTE: unpickling executes code from the file; the scaler file
        # must come from a trusted training run.
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)

    # Scale every segment with the (fitted or loaded) scaler.
    scaled_X = raw_X.copy()
    for seg_idx in range(scaled_X.shape[0]):
        scaled_X[seg_idx, :, :] = scaler.transform(scaled_X[seg_idx, :, :])
    elapsed = time.time() - t
    print(' Elapsed time: ' + str(elapsed) + ' seconds')

    raw_X = None

    if decompose_subbands:
        def bandpass_fir(data, lowcut_f, highcut_f, sampling_rate,
                         window='hamming'):
            '''
            Bandpass filtering using a FIR filter.

            Parameters
            ----------
            data : numpy array
                Input data with shape [n_samples, n_channels].
            lowcut_f, highcut_f
                Band edges handed to `firwin`.
            sampling_rate
                Sampling rate; half of it is used as Nyquist frequency.
            window : str
                Window passed to `firwin`.

            Returns
            -------
            numpy array
                Filtered data with the same number of rows as `data`.
            '''
            nyq_f = sampling_rate * 0.5
            # Filter length
            n_taps = max(3 * (sampling_rate / (lowcut_f * 1.0)), 3 * nyq_f)
            # The filter length must be even if a passband includes the
            # Nyquist frequency.
            # NOTE(review): scipy's firwin documentation states the
            # opposite (numtaps must be odd in that case) -- confirm.
            if n_taps % 2 == 1:
                n_taps = n_taps + 1
            taps = firwin(n_taps, [lowcut_f, highcut_f], nyq=nyq_f,
                          pass_zero=False, window=window, scale=False)

            # If the data is too short, zero-padding
            extra = (3 * taps.size) - data.shape[0]
            half_extra = int(np.ceil(extra / 2.0)) + 1
            if half_extra > 0:
                padded_data = np.lib.pad(data,
                                         ((half_extra, half_extra), (0, 0)),
                                         'constant', constant_values=0)
            else:
                padded_data = data

            filtered_data = filtfilt(taps, 1.0, padded_data, axis=0)
            if half_extra > 0:
                return filtered_data[half_extra:-half_extra, :]
            else:
                return filtered_data

        print('Decompose EEG signals into 5 sub-bands ...')

        # Decompose EEG data in each segment in to 5 sub-bands
        preprocessed_X = np.zeros((
            scaled_X.shape[0],   # Number of segments
            scaled_X.shape[1],   # Segment samples
            5,                   # Number of sub-bands
            scaled_X.shape[2]))  # Number of channels
        # (low cut, high cut) of each sub-band, in sub-band axis order
        sub_bands = (
            (0.5, 4),   # Delta 0.5-4 Hz
            (4, 8),     # Theta 4-8 Hz
            (8, 15),    # Alpha 8-15 Hz
            (15, 30),   # Beta 15-30 Hz
            (30, (self.sampling_rate * 0.5) - 0.1),  # Gamma 30-Nyquist Hz
        )
        t = time.time()
        n_segments = preprocessed_X.shape[0]
        for seg_idx in range(n_segments):
            for band_idx, (lowcut_f, highcut_f) in enumerate(sub_bands):
                # All channels of one band are written at once.
                preprocessed_X[seg_idx, :, band_idx, :] = bandpass_fir(
                    scaled_X[seg_idx], lowcut_f, highcut_f,
                    self.sampling_rate)
            if seg_idx % 20 == 0 or seg_idx == n_segments - 1:
                print(' {0} segments {1} seconds ...'.format(
                    seg_idx + 1, time.time() - t))
        elapsed = time.time() - t
        print(' Elapsed time: ' + str(elapsed) + ' seconds')
    else:
        # Reshape the preprocessed EEG data into a compatible format for
        # CNN in pylearn2
        preprocessed_X = np.reshape(
            scaled_X,
            (scaled_X.shape[0],   # Number of segments
             scaled_X.shape[1],   # Segment samples
             1,   # EEG data are time-series data (i.e., 1 dimension)
             scaled_X.shape[2]))  # Number of channels

    scaled_X = None

    # Print shape of input data
    print('------------------------------')
    print('Dataset: {0}'.format(self.which_set))
    print('Number of samples: {0}'.format(preprocessed_X.shape[0]))
    print(' Preictal samples: {0}'.format(self.preictal_samples))
    print(' Nonictal samples: {0}'.format(self.nonictal_samples))
    print('Shape of each sample: ({0}, {1})'.format(
        preprocessed_X.shape[1], preprocessed_X.shape[2]))
    print('Number of channels: {0}'.format(preprocessed_X.shape[3]))
    print('------------------------------')

    # Create a view converter
    view_converter = DefaultViewConverter(
        shape=[
            preprocessed_X.shape[1],  # Segment samples
            preprocessed_X.shape[2],  # Number of sub-bands
            preprocessed_X.shape[3]   # Number of channels
        ],
        axes=('b', 0, 1, 'c'))

    # Sanity check
    view_converted_X = view_converter.topo_view_to_design_mat(preprocessed_X)
    assert np.all(preprocessed_X ==
                  view_converter.design_mat_to_topo_view(view_converted_X))

    preprocessed_X = None

    if self.balance_class and (self.which_set == 'train' or
                               self.which_set == 'valid_train'):
        self.X_full = view_converted_X
        self.y_full = y

        (X, y) = self.get_data()
    else:
        # Zero-padding (if necessary)
        if not (self.batch_size is None):
            view_converted_X, y = self.zero_pad(view_converted_X, y,
                                                self.batch_size)
        X = view_converted_X

    # Initialize DenseDesignMatrix
    DenseDesignMatrix.__init__(self, X=X, y=y,
                               view_converter=view_converter,
                               axes=axes)
class FaceBBoxDDMPytables(dense_design_matrix.DenseDesignMatrix):
    """
    DenseDesignMatrix based on PyTables for face bounding boxes.
    """
    # Compression settings shared by every array / table created here.
    filters = tables.Filters(complib='blosc', complevel=1)
    # Class-level fallback file; set the first time a file is filled or
    # resized, used by init_hdf5 when no path is given.
    h5file = None

    def __init__(self, X=None, h5file=None, topo_view=None, y=None,
                 view_converter=None, axes=('b', 0, 1, 'c'),
                 image_shape=None, receptive_field_shape=None,
                 bbox_conversion_type=ConversionType.GUID,
                 area_ratio=None, stride=None, use_output_map=True,
                 rng=None):
        """
        Parameters
        ----------
        X : ndarray, 2-dimensional, optional
            Should be supplied if `topo_view` is not. A design
            matrix of shape (number examples, number features)
            that defines the dataset.
        h5file : object, optional
            PyTables file backing this dataset.
        topo_view : ndarray, optional
            Should be supplied if X is not. An array whose first
            dimension is of length number examples. The remaining
            dimensions are examples with topological significance,
            e.g. for images the remaining axes are rows, columns,
            and channels.
        y : ndarray, 1-dimensional(?), optional
            Labels or targets for each example. The semantics here
            are not quite nailed down for this yet.
        view_converter : object, optional
            An object for converting between design matrices and
            topological views. Currently DefaultViewConverter is
            the only type available but later we may want to add
            one that uses the retina encoding that the U of T group
            uses.
        image_shape : list
            Shape of the images that we are processing. Required.
        receptive_field_shape : list
            Size of the receptive field of the convolutional neural
            network. Required.
        bbox_conversion_type, area_ratio, use_output_map
            Stored and handed to `FaceBBoxDDMIterator` by `iterator`.
        stride : integer
            The stride that we have used for the convolution operation.
            Required.
        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.
            Defaults to the seed (17, 2, 946).
        """
        if rng is None:
            rng = (17, 2, 946)

        assert image_shape is not None
        assert receptive_field_shape is not None
        assert stride is not None

        self.image_shape = image_shape
        self.receptive_field_shape = receptive_field_shape
        self.stride = stride
        self.use_output_map = use_output_map
        self.bbox_conversion_type = bbox_conversion_type
        self.h5file = h5file
        self.area_ratio = area_ratio
        self._deprecated_interface = True

        FaceBBoxDDMPytables.filters = tables.Filters(complib='blosc',
                                                     complevel=1)
        super(FaceBBoxDDMPytables, self).__init__(
            X=X,
            topo_view=topo_view,
            y=y,
            view_converter=view_converter,
            axes=axes,
            rng=rng)

    def set_design_matrix(self, X, start=0):
        """
        Write a design matrix into the backing file.

        Parameters
        ----------
        X : ndarray, 2-dimensional
            Images. Must not contain NaN.
        start : int
            Row of the stored array at which writing starts.

        Raises
        ------
        ValueError
            If the file is closed or not opened in "w" / "r+" mode.
        """
        assert (len(X.shape) == 2)
        assert self.h5file is not None
        assert not numpy.any(numpy.isnan(X))
        if self.h5file.isopen and (self.h5file.mode == "w" or
                                   self.h5file.mode == "r+"):
            self.fill_hdf5(h5file=self.h5file,
                           data_x=X,
                           start=start)
        else:
            raise ValueError("H5File is not open or not in the writable mode!")

    def set_topological_view(self, V, axes=('b', 0, 1, 'c'), start=0):
        """
        Sets the dataset to represent V, where V is a batch
        of topological views of examples.

        Parameters
        ----------
        V : ndarray
            A batch of topological views laid out according to `axes`.
            Must not contain NaN.
            TODO: why is this parameter named 'V'?
        axes : tuple
            Order of the batch, row, column and channel axes in `V`.
        start : int
            Row of the stored array at which writing starts.
        """
        assert not numpy.any(numpy.isnan(V))
        rows = V.shape[axes.index(0)]
        cols = V.shape[axes.index(1)]
        channels = V.shape[axes.index('c')]
        self.view_converter = DefaultViewConverter([rows, cols, channels],
                                                   axes=axes)
        X = self.view_converter.topo_view_to_design_mat(V)
        assert not numpy.any(numpy.isnan(X))
        FaceBBoxDDMPytables.fill_hdf5(h5file=self.h5file,
                                      data_x=X,
                                      start=start)

    @functools.wraps(Dataset.iterator)
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):
        # build data_specs from topo and targets if needed
        if topo is None:
            topo = getattr(self, '_iter_topo', False)
            # data_specs is optional: only inspect it when it was given.
            if data_specs is not None and data_specs[0] is not None:
                if (isinstance(data_specs[0], Conv2DSpace) or
                        isinstance(data_specs[0].components[0], Conv2DSpace)):
                    topo = True

        if topo:
            # self.iterator is called without a data_specs, and with
            # "topo=True", so we use the default topological space
            # stored in self.X_topo_space
            assert self.X_topo_space is not None
            X_space = self.X_topo_space
        else:
            X_space = self.X_space

        if targets is None:
            if data_specs is None:
                # Nothing to infer from: fall back to the stored default,
                # the same way topo falls back to _iter_topo.
                targets = getattr(self, '_iter_targets', False)
            else:
                targets = "targets" in data_specs[1]

        if data_specs is None:
            if targets:
                assert self.y is not None
                # The caller gave no data_specs, so the target space is
                # taken from the dataset's own specs.
                # NOTE(review): assumes self.data_specs is the composite
                # (features, targets) built by the parent class -- confirm.
                y_space = self.data_specs[0].components[1]
                space = CompositeSpace(components=(X_space, y_space))
                source = ('features', 'targets')
            else:
                space = X_space
                source = 'features'

            print(space)
            data_specs = (space, source)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng

        return FaceBBoxDDMIterator(
            self,
            mode(self.X.shape[0], batch_size, num_batches, rng),
            img_shape=self.image_shape,
            receptive_field_shape=self.receptive_field_shape,
            stride=self.stride,
            bbox_conversion_type=self.bbox_conversion_type,
            topo=topo,
            targets=targets,
            area_ratio=self.area_ratio,
            use_output_map=self.use_output_map,
            data_specs=data_specs,
            return_tuple=return_tuple)

    @staticmethod
    def init_hdf5(path=None, shapes=None):
        """
        Initialize hdf5 file to be used as a dataset.

        Parameters
        ----------
        path : str, optional
            File to create (opened in "w" mode). If None, the class-level
            `h5file` is reused.
        shapes : tuple
            `(x_shape, y_shape)`; only `x_shape` is used, as the shape of
            the image array.

        Returns
        -------
        tuple
            `(h5file, group)` where `group` is the created "Data" group.

        Raises
        ------
        ValueError
            If `path` is None and no class-level file is set.
        """
        assert shapes is not None

        x_shape, y_shape = shapes
        print("init_hdf5")

        # make pytables
        if path is None:
            if FaceBBoxDDMPytables.h5file is None:
                raise ValueError("path variable should not be empty.")
            else:
                h5file = FaceBBoxDDMPytables.h5file
        else:
            h5file = tables.openFile(
                path, mode="w", title="Google Face bounding boxes Dataset.")

        gcolumns = h5file.createGroup(h5file.root, "Data", "Data")
        atom = (tables.Float32Atom() if config.floatX == 'float32'
                else tables.Float64Atom())

        filters = FaceBBoxDDMPytables.filters

        h5file.createCArray(gcolumns, 'X', atom=atom, shape=x_shape,
                            title="Images", filters=filters)

        h5file.createTable(gcolumns, 'bboxes', BoundingBox,
                           title="Face bounding boxes", filters=filters)

        return h5file, gcolumns

    @staticmethod
    def fill_hdf5(h5file, data_x, data_y=None, node=None, start=0,
                  batch_size=5000):
        """
        PyTables tends to crash if you write large data on them at once.
        This function write data on file in batches.

        Parameters
        ----------
        h5file : object
            Open PyTables file; flushed after every batch.
        data_x : ndarray
            Rows to write into `node.X`.
        data_y : ndarray, optional
            Rows to write into `node.y`.
        node : object, optional
            Group holding the arrays; defaults to `h5file.root.Data`.
        start : int
            The start index to write data.
        batch_size : int
            Number of rows written per batch.
        """
        if node is None:
            node = h5file.root.Data
        if FaceBBoxDDMPytables.h5file is None:
            FaceBBoxDDMPytables.h5file = h5file

        data_size = data_x.shape[0]
        for i in range(0, data_size, batch_size):
            # The last batch may be shorter than batch_size.
            stop = min(i + batch_size, data_size)
            assert (start + stop) <= (node.X.shape[0])

            node.X[start + i: start + stop, :] = data_x[i:stop, :]
            if data_y is not None:
                node.y[start + i: start + stop, :] = data_y[i:stop, :]

            h5file.flush()

    @staticmethod
    def resize(h5file, start, stop, remove_old_node=False):
        """
        Copy the rows [start, stop) of the "Data" group into a new group.

        Parameters
        ----------
        h5file : object
            Open PyTables file containing a "Data" group.
        start : int or None
            First row to copy; None means 0.
        stop : int or None
            One past the last row to copy; None means all rows of the
            source image array.
        remove_old_node : bool
            If true, an existing group of the same name is replaced and,
            once the copy is done, the old "Data" group is removed and the
            new group is renamed to "Data". If false and the group already
            exists, it is returned unchanged.

        Returns
        -------
        tuple
            `(h5file, group)`.

        Raises
        ------
        ValueError
            If `h5file` is None.
        """
        if h5file is None:
            raise ValueError("h5file should not be None.")

        data = h5file.root.Data
        node_name = "Data_%s_%s" % (start, stop)

        if remove_old_node:
            try:
                gcolumns = h5file.createGroup('/', node_name,
                                              "Data %s" % node_name)
            except tables.exceptions.NodeError:
                h5file.removeNode('/', node_name, 1)
                gcolumns = h5file.createGroup('/', node_name,
                                              "Data %s" % node_name)
        elif node_name in h5file.root:
            return h5file, getattr(h5file.root, node_name)
        else:
            gcolumns = h5file.createGroup('/', node_name,
                                          "Data %s" % node_name)

        if FaceBBoxDDMPytables.h5file is None:
            FaceBBoxDDMPytables.h5file = h5file

        start = 0 if start is None else start
        # The new group is still empty here, so the default end has to come
        # from the source array.
        stop = data.X.nrows if stop is None else stop

        atom = (tables.Float32Atom() if config.floatX == 'float32'
                else tables.Float64Atom())
        filters = FaceBBoxDDMPytables.filters

        x = h5file.createCArray(gcolumns, 'X', atom=atom,
                                shape=((stop - start, data.X.shape[1])),
                                title="Images", filters=filters)

        y = h5file.createTable(gcolumns, 'bboxes', BoundingBox,
                               title="Face bounding boxes", filters=filters)

        x[:] = data.X[start:stop]
        bboxes = get_image_bboxes(slice(start, stop), data.bboxes)
        y.append(bboxes)

        if remove_old_node:
            h5file.removeNode('/', "Data", 1)
            h5file.renameNode('/', "Data", node_name)

        h5file.flush()
        return h5file, gcolumns
class FaceBBoxDDMPytables(dense_design_matrix.DenseDesignMatrix):
    """
    DenseDesignMatrix based on PyTables for face bounding boxes.

    The design matrix lives in an HDF5 file under a group (``Data`` by
    default) holding a CArray ``X`` with the images and a table ``bboxes``
    with the face bounding boxes.
    """

    # Compression settings used for every array/table this class creates.
    filters = tables.Filters(complib='blosc', complevel=1)

    # Class-level handle to an HDF5 file. `fill_hdf5` and `resize` store the
    # file they were given here when it is still None; `init_hdf5` falls back
    # to it when called without a path.
    h5file = None

    def __init__(self, X=None, h5file=None, topo_view=None, y=None,
                 view_converter=None, axes=('b', 0, 1, 'c'),
                 image_shape=None, receptive_field_shape=None,
                 bbox_conversion_type=ConversionType.GUID,
                 area_ratio=None, stride=None, use_output_map=True,
                 rng=None):
        """
        Parameters
        ----------
        X : ndarray, 2-dimensional, optional
            Should be supplied if `topo_view` is not. A design matrix of
            shape (number examples, number features) that defines the
            dataset.
        h5file : object, optional
            The HDF5 file handle backing this dataset. It is stored on the
            instance and used by `set_design_matrix` and
            `set_topological_view`.
        topo_view : ndarray, optional
            Should be supplied if X is not. An array whose first dimension
            is of length number examples. The remaining dimensions are
            examples with topological significance, e.g. for images the
            remaining axes are rows, columns, and channels.
        y : ndarray, 1-dimensional(?), optional
            Labels or targets for each example. The semantics here are not
            quite nailed down for this yet.
        view_converter : object, optional
            An object for converting between design matrices and
            topological views. Currently DefaultViewConverter is the only
            type available but later we may want to add one that uses the
            retina encoding that the U of T group uses.
        axes : tuple, optional
            Axis order of the topological view, forwarded to the base
            class.
        image_shape : list
            Shape of the images that we are processing. Required.
        receptive_field_shape : list
            Shape of the receptive field of the convolutional neural
            network. Required.
        bbox_conversion_type : object, optional
            Forwarded unchanged to the iterator built by `iterator`.
        area_ratio : object, optional
            Forwarded unchanged to the iterator built by `iterator`.
        stride : integer
            The stride that we have used for the convolution operation.
            Required.
        use_output_map : bool, optional
            Forwarded unchanged to the iterator built by `iterator`.
        rng : object, optional
            A random number generator used for picking random indices into
            the design matrix when choosing minibatches. When None, the
            seed ``(17, 2, 946)`` is handed to the base class instead.
        """
        if rng is None:
            rng = (17, 2, 946)

        assert image_shape is not None, "image_shape is required"
        assert receptive_field_shape is not None, \
            "receptive_field_shape is required"
        assert stride is not None, "stride is required"

        self.image_shape = image_shape
        self.receptive_field_shape = receptive_field_shape
        self.stride = stride
        self.use_output_map = use_output_map
        self.bbox_conversion_type = bbox_conversion_type
        self.h5file = h5file
        self.area_ratio = area_ratio
        self._deprecated_interface = True

        # Constructing an instance resets the shared compression settings;
        # kept so that existing callers observe the same class state.
        FaceBBoxDDMPytables.filters = tables.Filters(complib='blosc',
                                                     complevel=1)

        super(FaceBBoxDDMPytables, self).__init__(
            X=X,
            topo_view=topo_view,
            y=y,
            view_converter=view_converter,
            axes=axes,
            rng=rng)

    def set_design_matrix(self, X, start=0):
        """
        Write the design matrix `X` into the backing HDF5 file.

        Parameters
        ----------
        X : ndarray, 2-dimensional
            Images, one example per row. Must not contain NaN.
        start : int, optional
            Row of the on-disk array at which writing begins.

        Raises
        ------
        ValueError
            If the HDF5 file is closed or was not opened in mode "w" or
            "r+".
        """
        assert len(X.shape) == 2
        assert self.h5file is not None
        assert not numpy.any(numpy.isnan(X))

        writable = self.h5file.isopen and self.h5file.mode in ("w", "r+")
        if not writable:
            raise ValueError("H5File is not open or not in the writable mode!")
        self.fill_hdf5(h5file=self.h5file, data_x=X, start=start)

    def set_topological_view(self, V, axes=('b', 0, 1, 'c'), start=0):
        """
        Sets the dataset to represent V, where V is a batch of topological
        views of examples.

        A new DefaultViewConverter is built from the shape of `V`, `V` is
        flattened into a design matrix with it, and the result is written
        to the backing HDF5 file.

        Parameters
        ----------
        V : ndarray
            An array containing a batch of topological views of examples,
            laid out according to `axes`. Must not contain NaN.
        axes : tuple, optional
            Axis order of `V`; must contain 0, 1 and 'c'.
        start : int, optional
            Row of the on-disk array at which writing begins.
        """
        assert not numpy.any(numpy.isnan(V))

        rows = V.shape[axes.index(0)]
        cols = V.shape[axes.index(1)]
        channels = V.shape[axes.index('c')]
        self.view_converter = DefaultViewConverter([rows, cols, channels],
                                                   axes=axes)

        X = self.view_converter.topo_view_to_design_mat(V)
        assert not numpy.any(numpy.isnan(X))
        FaceBBoxDDMPytables.fill_hdf5(h5file=self.h5file,
                                      data_x=X,
                                      start=start)

    @functools.wraps(Dataset.iterator)
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):
        # functools.wraps replaces this method's docstring with the one of
        # Dataset.iterator, so implementation notes are kept as comments.
        #
        # `data_specs` defaults to None, so it must never be indexed before
        # that case has been ruled out.
        requested_space = None
        requested_source = None
        if data_specs is not None:
            requested_space = data_specs[0]
            requested_source = data_specs[1]

        # Build data_specs from topo and targets if needed. `topo` is only
        # inferred when the caller did not pass it explicitly.
        if topo is None:
            topo = getattr(self, '_iter_topo', False)
            if requested_space is not None:
                # Only composite spaces carry `components`; a plain space
                # is simply not topological unless it is a Conv2DSpace.
                components = getattr(requested_space, 'components', None)
                if isinstance(requested_space, Conv2DSpace) or (
                        components and
                        isinstance(components[0], Conv2DSpace)):
                    topo = True

        if topo:
            # self.iterator is called without a data_specs, and with
            # "topo=True", so we use the default topological space
            # stored in self.X_topo_space
            assert self.X_topo_space is not None
            X_space = self.X_topo_space
        else:
            X_space = self.X_space

        if targets is None:
            if data_specs is not None:
                targets = "targets" in requested_source
            else:
                targets = getattr(self, '_iter_targets', False)

        if data_specs is None:
            if targets:
                assert self.y is not None
                # The caller supplied no specs, so the target space has to
                # come from the dataset's own data_specs.
                y_space = self.data_specs[0].components[1]
                space = CompositeSpace(components=(X_space, y_space))
                source = ('features', 'targets')
            else:
                space = X_space
                source = 'features'
            # Diagnostic output kept from the original implementation.
            print(space)
            data_specs = (space, source)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng

        subset_iterator = mode(self.X.shape[0], batch_size, num_batches, rng)
        return FaceBBoxDDMIterator(
            self,
            subset_iterator,
            img_shape=self.image_shape,
            receptive_field_shape=self.receptive_field_shape,
            stride=self.stride,
            bbox_conversion_type=self.bbox_conversion_type,
            topo=topo,
            targets=targets,
            area_ratio=self.area_ratio,
            use_output_map=self.use_output_map,
            data_specs=data_specs,
            return_tuple=return_tuple)

    @staticmethod
    def _float_atom():
        """Return the PyTables atom matching `config.floatX`."""
        if config.floatX == 'float32':
            return tables.Float32Atom()
        return tables.Float64Atom()

    @staticmethod
    def init_hdf5(path=None, shapes=None):
        """
        Initialize hdf5 file to be used as a dataset.

        Parameters
        ----------
        path : str, optional
            File to create (opened in mode "w"). When None, the class-level
            `h5file` is reused.
        shapes : tuple
            Pair ``(x_shape, y_shape)``; only ``x_shape`` is used, as the
            shape of the ``X`` array.

        Returns
        -------
        tuple
            ``(h5file, gcolumns)``: the file handle and the newly created
            ``Data`` group.

        Raises
        ------
        ValueError
            If `path` is None and no class-level `h5file` has been set.
        """
        assert shapes is not None
        # Unpacking also checks that `shapes` really is a pair.
        x_shape, _ = shapes
        print("init_hdf5")

        # make pytables
        if path is not None:
            h5file = tables.openFile(
                path,
                mode="w",
                title="Google Face bounding boxes Dataset.")
        elif FaceBBoxDDMPytables.h5file is not None:
            h5file = FaceBBoxDDMPytables.h5file
        else:
            raise ValueError("path variable should not be empty.")

        gcolumns = h5file.createGroup(h5file.root, "Data", "Data")
        filters = FaceBBoxDDMPytables.filters
        h5file.createCArray(gcolumns,
                            'X',
                            atom=FaceBBoxDDMPytables._float_atom(),
                            shape=x_shape,
                            title="Images",
                            filters=filters)
        h5file.createTable(gcolumns,
                           'bboxes',
                           BoundingBox,
                           title="Face bounding boxes",
                           filters=filters)
        return h5file, gcolumns

    @staticmethod
    def fill_hdf5(h5file, data_x, data_y=None, node=None, start=0,
                  batch_size=5000):
        """
        Write `data_x` (and optionally `data_y`) to the file in batches.

        PyTables tends to crash if you write large data on them at once,
        so the rows are copied `batch_size` at a time and the file is
        flushed after every batch.

        Parameters
        ----------
        h5file : object
            Open, writable HDF5 file handle.
        data_x : ndarray, 2-dimensional
            Rows to write into ``node.X``.
        data_y : ndarray, optional
            Rows to write into ``node.y``.
            NOTE(review): the groups created by `init_hdf5` and `resize`
            hold a ``bboxes`` table and no ``y`` node -- confirm that
            `node` has a ``y`` array before passing `data_y`.
        node : object, optional
            Group to write into; defaults to ``h5file.root.Data``.
        start : int, optional
            The start index to write data.
        batch_size : int, optional
            Number of rows written per batch.
        """
        if node is None:
            node = h5file.root.Data
        if FaceBBoxDDMPytables.h5file is None:
            FaceBBoxDDMPytables.h5file = h5file

        data_size = data_x.shape[0]
        for i in xrange(0, data_size, batch_size):
            # The last batch is shorter when data_size is not a multiple
            # of batch_size.
            stop = min(i + batch_size, data_size)
            assert (start + stop) <= (node.X.shape[0])

            node.X[start + i:start + stop, :] = data_x[i:stop, :]
            if data_y is not None:
                node.y[start + i:start + stop, :] = data_y[i:stop, :]
            h5file.flush()

    @staticmethod
    def resize(h5file, start, stop, remove_old_node=False):
        """
        Copy rows ``[start, stop)`` of the ``Data`` group into a new group.

        The new group is named ``Data_<start>_<stop>`` (using the argument
        values as passed, including None). Without `remove_old_node`, an
        already existing group of that name is returned as is.

        Parameters
        ----------
        h5file : object
            Open, writable HDF5 file handle containing a ``Data`` group.
        start : int or None
            First row to keep; None means 0.
        stop : int or None
            One past the last row to keep; None means the number of rows
            of ``Data.X``.
        remove_old_node : bool, optional
            When True, any existing group of the new name is replaced, and
            afterwards the original ``Data`` group is removed and the new
            group is renamed to ``Data``.

        Returns
        -------
        tuple
            ``(h5file, gcolumns)``: the file handle and the resized group.

        Raises
        ------
        ValueError
            If `h5file` is None.
        """
        if h5file is None:
            raise ValueError("h5file should not be None.")

        data = h5file.root.Data
        node_name = "Data_%s_%s" % (start, stop)
        group_title = "Data %s" % node_name

        if remove_old_node:
            try:
                gcolumns = h5file.createGroup('/', node_name, group_title)
            except tables.exceptions.NodeError:
                # A stale group of that name exists: drop it recursively
                # and start over.
                h5file.removeNode('/', node_name, 1)
                gcolumns = h5file.createGroup('/', node_name, group_title)
        elif node_name in h5file.root:
            return h5file, getattr(h5file.root, node_name)
        else:
            gcolumns = h5file.createGroup('/', node_name, group_title)

        if FaceBBoxDDMPytables.h5file is None:
            FaceBBoxDDMPytables.h5file = h5file

        if start is None:
            start = 0
        if stop is None:
            # The new group is still empty at this point, so the default
            # upper bound has to come from the source array.
            stop = data.X.nrows

        filters = FaceBBoxDDMPytables.filters
        x = h5file.createCArray(gcolumns,
                                'X',
                                atom=FaceBBoxDDMPytables._float_atom(),
                                shape=(stop - start, data.X.shape[1]),
                                title="Images",
                                filters=filters)
        y = h5file.createTable(gcolumns,
                               'bboxes',
                               BoundingBox,
                               title="Face bounding boxes",
                               filters=filters)

        x[:] = data.X[start:stop]
        bboxes = get_image_bboxes(slice(start, stop), data.bboxes)
        y.append(bboxes)

        if remove_old_node:
            h5file.removeNode('/', "Data", 1)
            h5file.renameNode('/', "Data", node_name)

        h5file.flush()
        return h5file, gcolumns