Example #1
    def _transform_multi_channel_data(self, X, y):
        # Data partitioning
        parted_X, parted_y = self._partition_data(
            X=X, y=y, partition_size=self.window_size)
        transposed_X = np.transpose(parted_X, [0, 2, 1])
        converted_X = np.reshape(transposed_X,
                                 (transposed_X.shape[0], transposed_X.shape[1],
                                  1, transposed_X.shape[2]))

        # Create view converter
        view_converter = DefaultViewConverter(shape=self.sample_shape,
                                              axes=('b', 0, 1, 'c'))

        # Convert data into a design matrix
        view_converted_X = view_converter.topo_view_to_design_mat(converted_X)
        assert np.all(converted_X == view_converter.design_mat_to_topo_view(
            view_converted_X))

        # Format the targets as one-hot vectors
        sum_y = np.sum(parted_y, axis=1)
        sum_y[sum_y > 0] = 1
        one_hot_formatter = OneHotFormatter(max_labels=self.n_classes)
        hot_y = one_hot_formatter.format(sum_y)

        return view_converted_X, hot_y, view_converter
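
A note on the reshape above: the transpose/reshape pair turns each partition of shape (window_size, n_channels) into a four-dimensional topological view with a singleton axis, matching the ('b', 0, 1, 'c') layout expected by DefaultViewConverter. A minimal numpy-only sketch of that shape flow, with hypothetical sizes standing in for self.window_size and the channel count:

import numpy as np

n_partitions, window_size, n_channels = 4, 8, 3  # hypothetical sizes
parted_X = np.random.randn(n_partitions, window_size, n_channels)

# (partitions, samples, channels) -> (partitions, channels, samples)
transposed_X = np.transpose(parted_X, [0, 2, 1])
# Insert a singleton axis: (partitions, channels, 1, samples)
converted_X = np.reshape(transposed_X,
                         (transposed_X.shape[0], transposed_X.shape[1],
                          1, transposed_X.shape[2]))
assert converted_X.shape == (n_partitions, n_channels, 1, window_size)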
Example #2
def get_features(path, split, standardize):
    if path.find(',') != -1:
        paths = path.split(',')
        Xs = [get_features(subpath, split, standardize) for subpath in paths]
        X = np.concatenate(Xs, axis=1)
        return X

    if path.endswith('.npy'):
        topo_view = np.load(path)
    else:
        topo_view = serial.load(path)

        if str(type(topo_view)).find('h5py') != -1:
            name, = topo_view.keys()
            topo_view = topo_view[name].value.T

    if len(topo_view.shape) == 2:
        X = topo_view
    else:
        view_converter = DefaultViewConverter(topo_view.shape[1:])

        print 'converting data'
        X = view_converter.topo_view_to_design_mat(topo_view)

    if split:
        X = np.concatenate((np.abs(X), np.abs(-X)), axis=1)

    if standardize:
        # bug: if X is the test set, we need to subtract the train mean
        # and divide by the train std
        assert False
        X -= X.mean(axis=0)
        X /= np.sqrt(.01 + np.var(X, axis=0))

    return X
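
The standardize branch above is deliberately disabled: as its comment notes, a test set must be standardized with statistics computed on the training set, not on itself. A minimal sketch of the corrected behaviour, assuming a separate training matrix X_train is available:

import numpy as np

def standardize_with_train_stats(X, X_train):
    # Compute the statistics on the training set only ...
    mean = X_train.mean(axis=0)
    std = np.sqrt(.01 + np.var(X_train, axis=0))
    # ... then apply them to whichever split X happens to be.
    return (X - mean) / std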
Example #4
    def _transform_multi_channel_data(self, X, y):
        # Data partitioning
        parted_X, parted_y = self._partition_data(X=X, y=y, partition_size=self.window_size)
        transposed_X = np.transpose(parted_X, [0, 2, 1])
        converted_X = np.reshape(transposed_X, (transposed_X.shape[0],
                                                transposed_X.shape[1],
                                                1,
                                                transposed_X.shape[2]))

        # Create view converter
        view_converter = DefaultViewConverter(shape=self.sample_shape,
                                              axes=('b', 0, 1, 'c'))

        # Convert data into a design matrix
        view_converted_X = view_converter.topo_view_to_design_mat(converted_X)
        assert np.all(converted_X == view_converter.design_mat_to_topo_view(view_converted_X))

        # Format the target into proper format
        sum_y = np.sum(parted_y, axis=1)
        sum_y[sum_y > 0] = 1
        one_hot_formatter = OneHotFormatter(max_labels=self.n_classes)
        hot_y = one_hot_formatter.format(sum_y)

        return view_converted_X, hot_y, view_converter
class TemporalDenseDesignMatrix(DenseDesignMatrix):
    '''
    A class for representing datasets that can be stored as a dense design
    matrix, but whose examples are slices of width >= 2 rows each.
    '''

    _default_seed = (17, 2, 946)

    def __init__(self, X=None, topo_view=None, y=None,
                 view_converter=None, axes=('b', 0, 1, 2, 'c'),
                 rng=_default_seed, preprocessor=None, fit_preprocessor=False):
        '''
        TODO: rewrite or just inherit...
        same as DenseDesignMatrix...???
        
        Parameters
        ----------

        X : ndarray, 2-dimensional, optional
            Should be supplied if `topo_view` is not. A design
            matrix of shape (number examples, number features)
            that defines the dataset.
            Note: not allowed for TemporalDenseDesignMatrix;
            provide `topo_view` instead (see assertion below).
        topo_view : ndarray, optional
            Should be supplied if X is not.  An array whose first
            dimension is of length number examples. The remaining
            dimensions are axes with topological significance,
            e.g. for images the remaining axes are rows, columns,
            and channels.
            TODO: time is 0, ii is 1, jj is 2
        y : ndarray, 1-dimensional(?), optional
            Labels or targets for each example. The semantics here
            are not quite nailed down for this yet.
        view_converter : object, optional
            An object for converting between the design matrix
            stored internally and the data that will be returned
            by iterators.
        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.
        '''

        assert topo_view is not None, (
            'For TemporalDenseDesignMatrix, must provide topo_view (not X)'
        )

        assert axes == ('b', 0, 1, 2, 'c')

        reduced_axes = ('b', 0, 1, 'c')
        
        super(TemporalDenseDesignMatrix, self).__init__(
            X=X,
            topo_view=topo_view,
            y=y,
            view_converter=view_converter,
            axes=reduced_axes,
            rng=rng,
            preprocessor=preprocessor,
            fit_preprocessor=fit_preprocessor
        )

        self._X = self.X
        self.X = None   # prevent other access

    def set_topological_view(self, topo_view, axes=('b', 0, 1, 'c')):
        '''
        Sets the dataset to represent topo_view, where topo_view is a batch
        of topological views of examples.

        Parameters
        ----------
        topo_view : ndarray
            An array containing a batch of topological views of training
            examples.
        '''
        
        assert not np.any(np.isnan(topo_view))
        frames = topo_view.shape[axes.index('b')]    # pretend frames come in as batch dim
        rows = topo_view.shape[axes.index(0)]
        cols = topo_view.shape[axes.index(1)]
        channels = topo_view.shape[axes.index('c')]

        # leave out frames...
        self.view_converter = DefaultViewConverter([rows, cols, channels], axes=axes)
        
        self.X = self.view_converter.topo_view_to_design_mat(topo_view)
        # self.X_topo_space stores a "default" topological space that
        # will be used only when self.iterator is called without a
        # data_specs, and with "topo=True", which is deprecated.
        self.X_topo_space = self.view_converter.topo_space
        assert not np.any(np.isnan(self.X))

        # Update data specs
        X_space = VectorSpace(dim=frames * rows * cols * channels)
        X_source = 'features'

        assert self.y is None, 'y not supported now'
        space = X_space
        source = X_source

        self.data_specs = (space, source)
        self.X_space = X_space
        self._iter_data_specs = (X_space, X_source)

    @functools.wraps(Dataset.iterator)
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):
        '''thin wrapper... TODO: doc'''

        assert mode == 'shuffled_sequential', (
            'Only shuffled_sequential mode is supported'
        )
        assert data_specs is not None, 'Must provide data_specs'
        assert len(data_specs) == 2, 'data_specs must include only one tuple for "features"'
        assert type(data_specs[0]) is CompositeSpace, 'must be composite space...??'
        assert data_specs[0].num_components == 1, 'must only have one component, features'
        assert data_specs[1][0] == 'features', 'data_specs must include only one tuple for "features"'

        output_space = data_specs[0].components[0]
        num_frames = output_space.shape[0]

        if num_batches is None:
            num_batches = 10  # another hack... just determines how often new iterators will be created?
        base_num_batches = num_batches * batch_size

        # Iterates through ONE example at a time
        # BEGIN HUGE HACK  (enable self.X access just for this function)
        self.X = self._X
        base_iterator = super(TemporalDenseDesignMatrix, self).iterator(
            mode='random_slice',  # to return contiguous bits
            batch_size=num_frames,
            num_batches=base_num_batches,
            topo=topo,
            targets=targets,
            rng=rng,
            data_specs=data_specs,
            return_tuple=False)
        self.X = None
        # END HUGE HACK
        
        return CopyingConcatenatingIterator(base_iterator, how_many=batch_size)
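
The iterator above asks the base class for num_frames contiguous rows at a time (mode 'random_slice') and relies on CopyingConcatenatingIterator to assemble batch_size such slices into one batch. A numpy-only sketch of that contract, with hypothetical sizes:

import numpy as np

rng = np.random.RandomState(0)
design_matrix = rng.randn(100, 6)  # 100 frames, 6 features each
num_frames, batch_size = 5, 4      # hypothetical sizes

def random_slice(X, length):
    # A contiguous slice of `length` rows, like pylearn2's 'random_slice' mode.
    start = rng.randint(X.shape[0] - length + 1)
    return X[start:start + length]

# One batch: batch_size examples, each the concatenation of num_frames rows.
batch = np.stack([random_slice(design_matrix, num_frames).ravel()
                  for _ in range(batch_size)])
assert batch.shape == (batch_size, num_frames * design_matrix.shape[1])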
Example #6
class myDenseDesignMatrix(dense_design_matrix.DenseDesignMatrix):

    _default_seed = (17, 2, 946)

    def __init__(self, X=None, topo_view=None, y=None, latent=None,
                 view_converter=None, axes=('b', 0, 1, 'c'),
                 rng=_default_seed, preprocessor=None, fit_preprocessor=False,
                 X_labels=None, y_labels=None):

        self.latent = latent
        self.X = X
        self.y = y
        self.view_converter = view_converter
        self.X_labels = X_labels
        self.y_labels = y_labels

        self._check_labels()

        if topo_view is not None:
            assert view_converter is None
            self.set_topological_view(topo_view, axes)
        else:
            assert X is not None, ("DenseDesignMatrix needs to be provided "
                                   "with either topo_view, or X")
            if view_converter is not None:

                # Get the topo_space (usually Conv2DSpace) from the
                # view_converter
                if not hasattr(view_converter, 'topo_space'):
                    raise NotImplementedError("Not able to get a topo_space "
                                              "from this converter: %s"
                                              % view_converter)

                # self.X_topo_space stores a "default" topological space that
                # will be used only when self.iterator is called without a
                # data_specs, and with "topo=True", which is deprecated.
                self.X_topo_space = view_converter.topo_space
            else:
                self.X_topo_space = None

            # Update data specs, if not done in set_topological_view
            X_source = 'features'
            if X_labels is None:
                X_space = VectorSpace(dim=X.shape[1])
            else:
                if X.ndim == 1:
                    dim = 1
                else:
                    dim = X.shape[-1]
                X_space = IndexSpace(dim=dim, max_labels=X_labels)

            if y is None:
                space = X_space
                source = X_source
            else:
                if y.ndim == 1:
                    dim = 1
                else:
                    dim = y.shape[-1]
                if y_labels is not None:
                    y_space = IndexSpace(dim=dim, max_labels=y_labels)
                else:
                    y_space = VectorSpace(dim=dim)
                y_source = 'targets'

                Latent_space = VectorSpace(dim=latent.shape[-1])
                Latent_source = 'latents'
                space = CompositeSpace((X_space, y_space, Latent_space))
                source = (X_source, y_source, Latent_source)

            self.data_specs = (space, source)
            self.X_space = X_space

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method="random_integers")
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor

    def get_data(self):
        """
        Returns all the data, as it is internally stored.
        The definition and format of these data are described in
        `self.get_data_specs()`.

        Returns
        -------
        data : numpy matrix or 3-tuple of matrices
            The data: X alone, or (X, y, latent) when targets are present.
        """
        if self.y is None:
            return self.X
        else:
            return (self.X, self.y, self.latent)

    def set_topological_view(self, V, axes=('b', 0, 1, 'c')):
        """
        Sets the dataset to represent V, where V is a batch
        of topological views of examples.

        .. todo::

            Why is this parameter named 'V'?

        Parameters
        ----------
        V : ndarray
            An array containing a design matrix representation of
            training examples.
        axes : WRITEME
        """
        assert not contains_nan(V)
        rows = V.shape[axes.index(0)]
        cols = V.shape[axes.index(1)]
        channels = V.shape[axes.index('c')]
        self.view_converter = DefaultViewConverter([rows, cols, channels],
                                                   axes=axes)
        self.X = self.view_converter.topo_view_to_design_mat(V)
        # self.X_topo_space stores a "default" topological space that
        # will be used only when self.iterator is called without a
        # data_specs, and with "topo=True", which is deprecated.
        self.X_topo_space = self.view_converter.topo_space
        assert not contains_nan(self.X)

        # Update data specs
        X_space = VectorSpace(dim=self.X.shape[1])
        X_source = 'features'
        if self.y is None:
            space = X_space
            source = X_source
        else:
            if self.y.ndim == 1:
                dim = 1
            else:
                dim = self.y.shape[-1]
            # This is to support old pickled models
            if getattr(self, 'y_labels', None) is not None:
                y_space = IndexSpace(dim=dim, max_labels=self.y_labels)
            elif getattr(self, 'max_labels', None) is not None:
                y_space = IndexSpace(dim=dim, max_labels=self.max_labels)
            else:
                y_space = VectorSpace(dim=dim)
            y_source = 'targets'

            Latent_space = VectorSpace(dim=self.latent.shape[-1])
            Latent_source = 'latents'

            space = CompositeSpace((X_space, y_space, Latent_space))
            source = (X_source, y_source, Latent_source)

        self.data_specs = (space, source)
        self.X_space = X_space
        self._iter_data_specs = (X_space, X_source)

    def get_targets(self):
        """
        .. todo::

            WRITEME
        """
        return self.y

    def get_latents(self):
        """
        .. todo::

            WRITEME
        """
        return self.latent

    def get_batch_design(self, batch_size, include_labels=False):

        try:
            idx = self.rng.randint(self.X.shape[0] - batch_size + 1)
        except ValueError:
            if batch_size > self.X.shape[0]:
                reraise_as(ValueError("Requested %d examples from a dataset "
                                      "containing only %d." %
                                      (batch_size, self.X.shape[0])))
            raise
        rx = self.X[idx:idx + batch_size, :]
        if include_labels:
            if self.y is None:
                return rx, None
            ry = self.y[idx:idx + batch_size]
            rlatent = self.latent[idx:idx + batch_size]
            return rx, ry, rlatent
        rx = np.cast[config.floatX](rx)
        return rx

    def get_batch_topo(self, batch_size, include_labels=False):
        """
        .. todo::

            WRITEME

        Parameters
        ----------
        batch_size : int
            WRITEME
        include_labels : bool
            WRITEME
        """

        if include_labels:
            batch_design, labels, latents = self.get_batch_design(batch_size, True)
        else:
            batch_design = self.get_batch_design(batch_size)

        rval = self.view_converter.design_mat_to_topo_view(batch_design)

        if include_labels:
            return rval, labels, latents

        return rval
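
The main extension over DenseDesignMatrix here is the third 'latents' source: get_data() returns an (X, y, latent) triple, described by a three-component CompositeSpace. A short sketch of what the resulting data_specs look like, assuming pylearn2 is importable and using hypothetical dimensions:

from pylearn2.space import VectorSpace, CompositeSpace

n_features, n_targets, n_latents = 784, 10, 32  # hypothetical dimensions
space = CompositeSpace((VectorSpace(dim=n_features),
                        VectorSpace(dim=n_targets),
                        VectorSpace(dim=n_latents)))
source = ('features', 'targets', 'latents')
data_specs = (space, source)
# Iterators built against these data_specs yield (X, y, latent) batches.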
Example #8
    def __init__(self,
                 patient_id,
                 which_set,
                 leave_out_seizure_idx_valid,
                 leave_out_seizure_idx_test,
                 data_dir,
                 preprocessor_dir,
                 batch_size=None,
                 balance_class=True,
                 decompose_subbands=False,
                 axes=('b', 0, 1, 'c'),
                 default_seed=0):

        self.balance_class = balance_class
        self.batch_size = batch_size

        EpilepsiaeEEGLoader.__init__(
            self,
            patient_id=patient_id,
            which_set=which_set,
            leave_out_seizure_idx_valid=leave_out_seizure_idx_valid,
            leave_out_seizure_idx_test=leave_out_seizure_idx_test,
            data_dir=data_dir)

        print 'Load signal ...'
        t = time.time()
        # (# of segments, # of samples, # of channels)
        raw_X, y = self.load_data()
        elapsed = time.time() - t
        print(' Elapsed time: ' + str(elapsed) + ' seconds')

        # Preprocessing
        print 'Scaling signal ...'
        t = time.time()
        if which_set == 'train':
            # Reshape the data back to (number of samples x number of channels) for pre-processing
            unrolled_X = np.reshape(raw_X,
                                    (-1, self.scalp_channel_labels.size))

            scaler = preprocessing.StandardScaler()
            # scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
            scaler = scaler.fit(unrolled_X)

            with open(
                    os.path.join(
                        preprocessor_dir, self.patient_id + '_scaler_eeg_' +
                        str(self.leave_out_seizure_idx_valid) + '_' +
                        str(self.leave_out_seizure_idx_test) + '.pkl'),
                    'w') as f:
                pickle.dump(scaler, f)

            scaled_X = raw_X.copy()
            for seg_idx in range(scaled_X.shape[0]):
                scaled_X[seg_idx, :, :] = scaler.transform(
                    scaled_X[seg_idx, :, :])
        else:
            with open(
                    os.path.join(
                        preprocessor_dir, self.patient_id + '_scaler_eeg_' +
                        str(self.leave_out_seizure_idx_valid) + '_' +
                        str(self.leave_out_seizure_idx_test) + '.pkl')) as f:
                scaler = pickle.load(f)

            scaled_X = raw_X.copy()
            for seg_idx in range(scaled_X.shape[0]):
                scaled_X[seg_idx, :, :] = scaler.transform(
                    scaled_X[seg_idx, :, :])
        elapsed = time.time() - t
        print(' Elapsed time: ' + str(elapsed) + ' seconds')

        raw_X = None

        if decompose_subbands:

            def bandpass_fir(data,
                             lowcut_f,
                             highcut_f,
                             sampling_rate,
                             window='hamming'):
                '''
                Bandpass filtering using a FIR filter.

                Parameters
                ----------
                data : numpy array
                    Input data with shape [n_samples, n_channels].
                lowcut_f : float
                    Low cutoff frequency in Hz.
                highcut_f : float
                    High cutoff frequency in Hz.
                sampling_rate : float
                    Sampling rate of the data in Hz.
                window : str
                    Window function used by `firwin`.
                '''

                nyq_f = sampling_rate * 0.5
                n_taps = int(max(3 * (sampling_rate / (lowcut_f * 1.0)),
                                 3 * nyq_f))  # Filter length (firwin needs an int)

                # The filter length must be even if a passband includes the Nyquist frequency.
                if n_taps % 2 == 1:
                    n_taps = n_taps + 1

                taps = firwin(n_taps, [lowcut_f, highcut_f],
                              nyq=nyq_f,
                              pass_zero=False,
                              window=window,
                              scale=False)

                # If the data is too short, zero-padding
                extra = (3 * taps.size) - data.shape[0]
                half_extra = int(np.ceil(extra / 2.0)) + 1
                if half_extra > 0:
                    padded_data = np.lib.pad(data, ((half_extra, half_extra),
                                                    (0, 0)),
                                             'constant',
                                             constant_values=0)
                else:
                    padded_data = data

                filtered_data = filtfilt(taps, 1.0, padded_data, axis=0)

                if half_extra > 0:
                    return filtered_data[half_extra:-half_extra, :]
                else:
                    return filtered_data

            print 'Decompose EEG signals into 5 sub-bands ...'

            # Decompose EEG data in each segment in to 5 sub-bands
            preprocessed_X = np.zeros((
                scaled_X.shape[0],  # Number of segments
                scaled_X.shape[1],  # Segment samples
                5,  # Number of sub-bands
                scaled_X.shape[2]))  # Number of channels

            t = time.time()
            for seg_idx in range(preprocessed_X.shape[0]):
                delta_X = bandpass_fir(scaled_X[seg_idx], 0.5, 4,
                                       self.sampling_rate)  # Delta 0.5-4 Hz
                theta_X = bandpass_fir(scaled_X[seg_idx], 4, 8,
                                       self.sampling_rate)  # Theta 4-8 Hz
                alpha_X = bandpass_fir(scaled_X[seg_idx], 8, 15,
                                       self.sampling_rate)  # Alpha 8-15 Hz
                beta_X = bandpass_fir(scaled_X[seg_idx], 15, 30,
                                      self.sampling_rate)  # Beta 15-30 Hz
                gamma_X = bandpass_fir(
                    scaled_X[seg_idx], 30, (self.sampling_rate * 0.5) - 0.1,
                    self.sampling_rate)  # Gamma 30-Nyquist Hz

                for ch_idx in range(preprocessed_X.shape[3]):
                    preprocessed_X[seg_idx][:, 0, ch_idx] = delta_X[:, ch_idx]
                    preprocessed_X[seg_idx][:, 1, ch_idx] = theta_X[:, ch_idx]
                    preprocessed_X[seg_idx][:, 2, ch_idx] = alpha_X[:, ch_idx]
                    preprocessed_X[seg_idx][:, 3, ch_idx] = beta_X[:, ch_idx]
                    preprocessed_X[seg_idx][:, 4, ch_idx] = gamma_X[:, ch_idx]

                if seg_idx % 20 == 0 or seg_idx == preprocessed_X.shape[0] - 1:
                    print ' {0} segments {1} seconds ...'.format(
                        seg_idx + 1,
                        time.time() - t)

            elapsed = time.time() - t
            print ' Elapsed time: ' + str(elapsed) + ' seconds'

        else:
            # Reshape the preprocessed EEG data into a compatible format for CNN in pylearn2
            preprocessed_X = np.reshape(
                scaled_X,
                (
                    scaled_X.shape[0],  # Number of segments
                    scaled_X.shape[1],  # Segment samples
                    1,  # EEG data are time-series data (i.e., 1 dimension)
                    scaled_X.shape[2]))  # Number of channels

        scaled_X = None

        # Print shape of input data
        print '------------------------------'
        print 'Dataset: {0}'.format(self.which_set)
        print 'Number of samples: {0}'.format(preprocessed_X.shape[0])
        print ' Preictal samples: {0}'.format(self.preictal_samples)
        print ' Nonictal samples: {0}'.format(self.nonictal_samples)
        print 'Shape of each sample: ({0}, {1})'.format(
            preprocessed_X.shape[1], preprocessed_X.shape[2])
        print 'Number of channels: {0}'.format(preprocessed_X.shape[3])
        print '------------------------------'

        # Create a view converter
        view_converter = DefaultViewConverter(
            shape=[
                preprocessed_X.shape[1],  # Segment samples
                preprocessed_X.shape[2],  # Number of sub-bands
                preprocessed_X.shape[3]
            ],  # Number of channels
            axes=('b', 0, 1, 'c'))

        # Sanity check
        view_converted_X = view_converter.topo_view_to_design_mat(
            preprocessed_X)
        assert np.all(preprocessed_X == view_converter.design_mat_to_topo_view(
            view_converted_X))

        preprocessed_X = None

        if self.balance_class and (self.which_set == 'train'
                                   or self.which_set == 'valid_train'):
            self.X_full = view_converted_X
            self.y_full = y

            (X, y) = self.get_data()
        else:
            # Zero-padding (if necessary)
            if not (self.batch_size is None):
                view_converted_X, y = self.zero_pad(view_converted_X, y,
                                                    self.batch_size)

            X = view_converted_X

        # Initialize DenseDesignMatrix
        DenseDesignMatrix.__init__(self,
                                   X=X,
                                   y=y,
                                   view_converter=view_converter,
                                   axes=axes)
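
Each of the five sub-bands above comes from zero-phase FIR bandpass filtering. A self-contained sketch of one such filter on synthetic data, calling scipy's firwin and filtfilt directly (the band edges follow the example; the tap count here is an ad-hoc choice, not the length heuristic used above):

import numpy as np
from scipy.signal import firwin, filtfilt

fs = 256.0                              # hypothetical sampling rate in Hz
data = np.random.randn(int(4 * fs), 2)  # 4 s of 2-channel noise

def bandpass(data, low_f, high_f, fs, n_taps=255):
    # Odd tap count keeps the filter type I, valid for any passband.
    taps = firwin(n_taps, [low_f, high_f], fs=fs, pass_zero=False)
    # filtfilt runs the filter forward and backward: zero phase shift.
    return filtfilt(taps, 1.0, data, axis=0)

theta = bandpass(data, 4.0, 8.0, fs)  # Theta band, 4-8 Hz
assert theta.shape == data.shape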
Example #9
class FaceBBoxDDMPytables(dense_design_matrix.DenseDesignMatrix):
    """
    DenseDesignMatrix based on PyTables for face bounding boxes.
    """
    filters = tables.Filters(complib='blosc', complevel=1)
    h5file = None

    def __init__(self, X=None, h5file=None, topo_view=None, y=None,
                 view_converter=None, axes=('b', 0, 1, 'c'),
                 image_shape=None, receptive_field_shape=None,
                 bbox_conversion_type=ConversionType.GUID,
                 area_ratio=None,
                 stride=None, use_output_map=True, rng=None):
        """
        Parameters
        ----------

        X : ndarray, 2-dimensional, optional
            Should be supplied if `topo_view` is not. A design
            matrix of shape (number examples, number features)
            that defines the dataset.
        topo_view : ndarray, optional
            Should be supplied if X is not.  An array whose first
            dimension is of length number examples. The remaining
            dimensions are axes with topological significance,
            e.g. for images the remaining axes are rows, columns,
            and channels.
        y : ndarray, 1-dimensional(?), optional
            Labels or targets for each example. The semantics here
            are not quite nailed down for this yet.
        view_converter : object, optional
            An object for converting between design matrices and
            topological views. Currently DefaultViewConverter is
            the only type available but later we may want to add
            one that uses the retina encoding that the U of T group
            uses.
        image_shape: list
            Shape of the images that we are processing.
        receptive_field_shape: list
            Size of the receptive field of the convolutional neural network.
        stride: integer
            The stride that we have used for the convolution operation.
        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.
        """

        if rng is None:
            rng = (17, 2, 946)

        assert image_shape is not None
        assert receptive_field_shape is not None
        assert stride is not None

        self.image_shape = image_shape
        self.receptive_field_shape = receptive_field_shape
        self.stride = stride
        self.use_output_map = use_output_map
        self.bbox_conversion_type = bbox_conversion_type
        self.h5file = h5file
        self.area_ratio = area_ratio
        self._deprecated_interface = True
        FaceBBoxDDMPytables.filters = tables.Filters(complib='blosc',
                                                     complevel=1)

        super(FaceBBoxDDMPytables, self).__init__(X=X,
                                                  topo_view=topo_view,
                                                  y=y,
                                                  view_converter=view_converter,
                                                  axes=axes,
                                                  rng=rng)

    def set_design_matrix(self, X, start=0):
        """
        Parameters
        ----------
        X : ndarray, 2-dimensional
            Design matrix of images to write to the HDF5 file.
        """
        assert (len(X.shape) == 2)
        assert self.h5file is not None
        assert not numpy.any(numpy.isnan(X))

        if self.h5file.isopen and (self.h5file.mode == "w"
                                   or self.h5file.mode == "r+"):
            self.fill_hdf5(h5file=self.h5file, data_x=X, start=start)
        else:
            raise ValueError("H5File is not open or not in the writable mode!")

    def set_topological_view(self, V, axes=('b', 0, 1, 'c'), start=0):
        """
        Sets the dataset to represent V, where V is a batch
        of topological views of examples.

        Parameters
        ----------
        V : ndarray
            An array containing a design matrix representation of training
            examples. If unspecified, the entire dataset (`self.X`) is used
            instead.
        TODO: why is this parameter named 'V'?
        """
        assert not numpy.any(numpy.isnan(V))
        rows = V.shape[axes.index(0)]
        cols = V.shape[axes.index(1)]
        channels = V.shape[axes.index('c')]
        self.view_converter = DefaultViewConverter([rows, cols, channels], axes=axes)
        X = self.view_converter.topo_view_to_design_mat(V)
        assert not numpy.any(numpy.isnan(X))

        FaceBBoxDDMPytables.fill_hdf5(h5file=self.h5file,
                                      data_x=X,
                                      start=start)

    @functools.wraps(Dataset.iterator)
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):

        # Build data_specs from topo and targets if needed.
        if topo is None:
            topo = getattr(self, '_iter_topo', False)

        if data_specs is not None and data_specs[0] is not None:
            if isinstance(data_specs[0], Conv2DSpace) or isinstance(
                    data_specs[0].components[0], Conv2DSpace):
                topo = True

        if topo:
            # self.iterator is called without a data_specs, and with
            # "topo=True", so we use the default topological space
            # stored in self.X_topo_space
            assert self.X_topo_space is not None
            X_space = self.X_topo_space
        else:
            X_space = self.X_space

        if targets is None:
            targets = data_specs is not None and "targets" in data_specs[1]

        if data_specs is None:
            if targets:
                assert self.y is not None
                # Fall back to the y space from the dataset's own data specs.
                y_space = self.data_specs[0].components[1]
                space = CompositeSpace(components=(X_space, y_space))
                source = ('features', 'targets')
            else:
                space = X_space
                source = 'features'

            print space
            data_specs = (space, source)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)

        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)

        if rng is None and mode.stochastic:
            rng = self.rng

        if data_specs is None:
            data_specs = self._iter_data_specs

        return FaceBBoxDDMIterator(self,
                                   mode(self.X.shape[0], batch_size, num_batches, rng),
                                   img_shape=self.image_shape,
                                   receptive_field_shape=self.receptive_field_shape,
                                   stride=self.stride,
                                   bbox_conversion_type=self.bbox_conversion_type,
                                   topo=topo,
                                   targets=targets,
                                   area_ratio=self.area_ratio,
                                   use_output_map=self.use_output_map,
                                   data_specs=data_specs,
                                   return_tuple=return_tuple)

    @staticmethod
    def init_hdf5(path=None, shapes=None):
        """
        Initialize hdf5 file to be used as a dataset
        """
        assert shapes is not None

        x_shape, y_shape = shapes
        print "init_hdf5"

        # make pytables
        if path is None:
            if FaceBBoxDDMPytables.h5file is None:
                raise ValueError("path variable should not be empty.")
            else:
                h5file = FaceBBoxDDMPytables.h5file
        else:
            h5file = tables.openFile(path, mode="w",
                                     title="Google Face bounding boxes Dataset.")

        gcolumns = h5file.createGroup(h5file.root, "Data", "Data")
        atom = tables.Float32Atom() if config.floatX == 'float32' else tables.Float64Atom()

        filters = FaceBBoxDDMPytables.filters

        h5file.createCArray(gcolumns, 'X', atom=atom, shape=x_shape,
                            title="Images", filters=filters)

        h5file.createTable(gcolumns, 'bboxes', BoundingBox,
                           title="Face bounding boxes", filters=filters)

        return h5file, gcolumns

    @staticmethod
    def fill_hdf5(h5file, data_x, data_y=None, node=None, start=0,
                  batch_size=5000):
        """
        PyTables tends to crash if you write large amounts of data to a
        file at once, so this function writes the data in batches.

        start: the start index at which to begin writing
        """

        if node is None:
            node = h5file.root.Data
        if FaceBBoxDDMPytables.h5file is None:
            FaceBBoxDDMPytables.h5file = h5file

        data_size = data_x.shape[0]
        last = numpy.floor(data_size / float(batch_size)) * batch_size
        for i in xrange(0, data_size, batch_size):
            stop = i + numpy.mod(data_size, batch_size) if i >= last else i + batch_size
            assert len(range(start + i, start + stop)) == len(range(i, stop))
            assert (start + stop) <= (node.X.shape[0])

            node.X[start + i: start + stop, :] = data_x[i:stop, :]

            if data_y is not None:
                node.y[start + i: start + stop, :] = data_y[i:stop, :]

            h5file.flush()

    @staticmethod
    def resize(h5file, start, stop, remove_old_node=False):
        if h5file is None:
            raise ValueError("h5file should not be None.")

        data = h5file.root.Data
        node_name = "Data_%s_%s" % (start, stop)
        if remove_old_node:
            try:
                gcolumns = h5file.createGroup('/', node_name, "Data %s" % node_name)
            except tables.exceptions.NodeError:
                h5file.removeNode('/', node_name, 1)
                gcolumns = h5file.createGroup('/', node_name, "Data %s" % node_name)
        elif node_name in h5file.root:
            return h5file, getattr(h5file.root, node_name)
        else:
            gcolumns = h5file.createGroup('/', node_name, "Data %s" % node_name)

        if FaceBBoxDDMPytables.h5file is None:
            FaceBBoxDDMPytables.h5file = h5file

        start = 0 if start is None else start
        stop = gcolumns.X.nrows if stop is None else stop

        atom = tables.Float32Atom() if config.floatX == 'float32' else tables.Float64Atom()
        filters = FaceBBoxDDMPytables.filters

        x = h5file.createCArray(gcolumns, 'X', atom=atom,
                                shape=(stop - start, data.X.shape[1]),
                                title="Images", filters=filters)

        y = h5file.createTable(gcolumns, 'bboxes', BoundingBox,
                               title="Face bounding boxes", filters=filters)

        x[:] = data.X[start:stop]
        bboxes = get_image_bboxes(slice(start, stop), data.bboxes)
        y.append(bboxes)

        if remove_old_node:
            h5file.removeNode('/', "Data", 1)
            h5file.renameNode('/', "Data", node_name)

        h5file.flush()
        return h5file, gcolumns
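
fill_hdf5 exists because handing PyTables one huge array can exhaust memory or crash; writing slice by slice keeps the working set bounded. A minimal standalone sketch of the same pattern, using the modern snake_case PyTables API (the example above uses the older camelCase names such as createCArray):

import numpy as np
import tables

data = np.random.randn(10000, 64).astype('float32')
batch_size = 2500

h5file = tables.open_file('example.h5', mode='w')
X = h5file.create_carray(h5file.root, 'X', atom=tables.Float32Atom(),
                         shape=data.shape,
                         filters=tables.Filters(complib='blosc', complevel=1))
# Write in batches so PyTables never has to ingest the whole array at once.
for i in range(0, data.shape[0], batch_size):
    X[i:i + batch_size, :] = data[i:i + batch_size, :]
    h5file.flush()
h5file.close()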