Beispiel #1
0
 def create_input_space(self):
     """
     Build the composite input space for this model.

     Returns
     -------
     CompositeSpace
         Three components, in order: an IndexSpace over the vocabulary
         with ``self.ws * 2 + 1`` indices, an IndexSpace over
         ``self.total_feats`` labels with ``self.feat_num`` indices, and
         a VectorSpace of size ``self.extender_dim * (self.ws * 2 + 1)``.
     """
     window = self.ws * 2 + 1
     word_space = IndexSpace(max_labels=self.vocab_size, dim=window)
     feature_space = IndexSpace(max_labels=self.total_feats,
                                dim=self.feat_num)
     extender_space = VectorSpace(dim=self.extender_dim * window)
     return CompositeSpace([word_space, feature_space, extender_space])
Beispiel #2
0
    def set_topological_view(self, V, axes=('b', 0, 1, 'c')):
        """
        Make the dataset represent `V`, a batch of topological views.

        Rebuilds the view converter, the design matrix ``self.X`` and the
        data specs from `V`. When targets are present the data specs also
        carry a 'latents' source sized from ``self.latent``.

        .. todo::

            Why is this parameter named 'V'?

        Parameters
        ----------
        V : ndarray
            Batch of examples in topological form, laid out as `axes`.
            Must not contain NaN.
        axes : tuple, optional
            Axis order of `V`; the entries 0, 1 and 'c' are looked up to
            find the row, column and channel extents.
        """
        assert not contains_nan(V)
        topo_shape = [V.shape[axes.index(key)] for key in (0, 1, 'c')]
        self.view_converter = DefaultViewConverter(topo_shape, axes=axes)
        self.X = self.view_converter.topo_view_to_design_mat(V)
        # Default topological space, only used when self.iterator is called
        # without data_specs and with the deprecated "topo=True".
        self.X_topo_space = self.view_converter.topo_space
        assert not contains_nan(self.X)

        # Refresh the data specs to match the new design matrix.
        feature_space = VectorSpace(dim=self.X.shape[1])
        feature_source = 'features'

        if self.y is None:
            self.data_specs = (feature_space, feature_source)
        else:
            target_dim = 1 if self.y.ndim == 1 else self.y.shape[-1]
            # getattr keeps old pickled models (lacking these attributes)
            # working; y_labels takes precedence over max_labels.
            label_count = getattr(self, 'y_labels', None)
            if label_count is None:
                label_count = getattr(self, 'max_labels', None)
            if label_count is None:
                target_space = VectorSpace(dim=target_dim)
            else:
                target_space = IndexSpace(dim=target_dim,
                                          max_labels=label_count)
            latent_space = VectorSpace(dim=self.latent.shape[-1])
            self.data_specs = (
                CompositeSpace((feature_space, target_space, latent_space)),
                (feature_source, 'targets', 'latents'))

        self.X_space = feature_space
        self._iter_data_specs = (feature_space, feature_source)
Beispiel #3
0
 def set_spaces(self, dataset, dim_X, dim_Y, m):
     """
     Attach index-valued feature/target spaces and data specs to `dataset`.

     Parameters
     ----------
     dataset : object
         Receives the attributes ``X_space``, ``y_space``,
         ``_iter_data_specs`` and ``data_specs``.
     dim_X : int
         Number of indices per example in the feature space.
     dim_Y : int
         Number of indices per example in the target space.
     m : int
         ``max_labels`` shared by both spaces.
     """
     feature_space = IndexSpace(dim=dim_X, max_labels=m)
     dataset.X_space = feature_space
     target_space = IndexSpace(dim=dim_Y, max_labels=m)
     dataset.y_space = target_space

     # By default iterate over the features only.
     dataset._iter_data_specs = (feature_space, 'features')
     dataset.data_specs = (CompositeSpace((feature_space, target_space)),
                           ('features', 'targets'))
Beispiel #4
0
    def set_topological_view(self, V, axes=('b', 0, 1, 'c')):
        """
        Install a topological view without building an in-memory design
        matrix.

        Mostly mirrors DenseDesignMatrix, with these differences:
        * HDF5ViewConverter replaces DefaultViewConverter
        * data specs are derived from the topological view, not from X
        * NaN checks have been moved to HDF5DatasetIterator.next

        Note that y may be loaded into memory for reshaping if y.ndim != 2.

        Parameters
        ----------
        V : ndarray
            Topological view.
        axes : tuple, optional (default ('b', 0, 1, 'c'))
            Order of axes in topological view.
        """
        batch, rows, cols, channels = [
            V.shape[axes.index(key)] for key in ('b', 0, 1, 'c')
        ]
        self.view_converter = HDF5ViewConverter([rows, cols, channels],
                                                axes=axes)
        self.X = self.view_converter.topo_view_to_design_mat(V)
        # Default topological space, only used when self.iterator is called
        # without data_specs and with the deprecated "topo=True".
        self.X_topo_space = self.view_converter.topo_space

        # Refresh the data specs.
        # NOTE(review): the feature space is sized by the length of the
        # batch axis, not by rows * cols * channels -- confirm this is
        # intended.
        feature_space = VectorSpace(dim=batch)
        feature_source = 'features'

        if self.y is None:
            self.data_specs = (feature_space, feature_source)
        else:
            target_dim = 1 if self.y.ndim == 1 else self.y.shape[-1]
            # y_labels takes precedence over max_labels; with neither set
            # the targets live in a plain vector space.
            label_count = getattr(self, 'y_labels', None)
            if label_count is None:
                label_count = getattr(self, 'max_labels', None)
            if label_count is None:
                target_space = VectorSpace(dim=target_dim)
            else:
                target_space = IndexSpace(dim=target_dim,
                                          max_labels=label_count)
            self.data_specs = (
                CompositeSpace((feature_space, target_space)),
                (feature_source, 'targets'))

        self.X_space = feature_space
        self._iter_data_specs = (feature_space, feature_source)
Beispiel #5
0
def test_np_format_as_index2index():
    """
    Formatting IndexSpace data as another IndexSpace must keep the values
    and adopt the dtype of the target space.
    """
    source_space = IndexSpace(max_labels=10, dim=1)
    labels = np.array([[0], [2], [1], [3], [5], [8], [1]])

    # Target identical to the source: spaces compare equal, data unchanged.
    same_space = IndexSpace(max_labels=10, dim=1)
    converted = source_space.np_format_as(labels, same_space)
    assert source_space == same_space
    assert np.all(converted == labels)

    # Target with a narrower dtype: spaces differ, values survive the cast.
    int32_space = IndexSpace(max_labels=10, dim=1, dtype='int32')
    converted = source_space.np_format_as(labels, int32_space)
    assert source_space != int32_space
    assert np.all(converted == labels)
    assert converted.dtype == 'int32'
    assert labels.dtype == 'int64'
Beispiel #6
0
    def __init__(self, which_set, which_day, path=None):
        """
        Open the click dataset for one day of training data or the test set.

        Parameters
        ----------
        which_set : str
            One of 'train', 'valid' or 'test'. 'test' reads the unlabeled
            test table; any other accepted value reads the training table
            and selects the rows belonging to `which_day`.
        which_day : int
            Day to select; must lie in ``range(21, 32)``.
        path : None
            Must be left as None; the HDF5 file is opened relative to the
            current directory.

        Raises
        ------
        NotImplementedError
            If `path` is not None.
        """
        self.daylist = range(21, 32)
        self.mapper = {'train': 0, 'valid': 1, 'test': 2}
        assert which_set in self.mapper
        assert which_day in self.daylist

        # Per-day row counts, used below to slice one day out of the
        # training table.
        # NOTE(review): hard-coded absolute path -- confirm it should not
        # be configurable.
        with open('/home/whale/Documents/click/dayrows.pkl', 'rb') as f:
            self.dayrows = cPickle.load(f)

        # Keep the constructor arguments as attributes (explicitly, rather
        # than dumping locals() which also stored the closed file handle).
        self.which_set = which_set
        self.which_day = which_day
        self.path = path

        if path is not None:
            raise NotImplementedError("Data path is the current directory.")

        # load data
        file_n = "click_data.h5"
        self.h5file = tables.open_file(file_n, mode='r')

        if which_set == 'test':
            test_group = self.h5file.root.test.test_raw
            self.X = test_group.X_t
            self.y = None
            self.samples = slice(0, self.X.shape[0])
            self.examples = self.X.shape[0]
        else:
            train_group = self.h5file.root.train.train_raw
            self.X = train_group.X
            self.y = train_group.y
            day_offset = which_day - 21
            self.samples = slice(sum(self.dayrows[:day_offset]),
                                 sum(self.dayrows[:day_offset + 1]))
            self.examples = self.dayrows[day_offset]
        self.sample_index = self.samples.start

        max_labels = 2

        X_source = 'features'
        X_space = VectorSpace(dim=23)
        if self.y is None:
            # Test set: features only. Previously this path fell through
            # to the composite construction and hit an undefined y_space.
            space = X_space
            source = X_source
        else:
            y_space = IndexSpace(dim=1, max_labels=max_labels)
            y_source = 'targets'
            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)
        self.X_space = X_space

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')
Beispiel #7
0
    def __init__(self, which_set, path=None):
        """
        Open one split of the click dataset stored in ``click_data.h5``.

        Parameters
        ----------
        which_set : str
            One of 'train', 'valid' or 'test'. 'test' has no targets.
        path : None
            Must be left as None; the HDF5 file is opened relative to the
            current directory.

        Raises
        ------
        NotImplementedError
            If `path` is not None.
        """
        self.mapper = {'train': 0, 'valid': 1, 'test': 2}
        assert which_set in self.mapper

        # Keep the constructor arguments as attributes.
        self.which_set = which_set
        self.path = path

        if path is not None:
            raise NotImplementedError("Data path is the current directory.")

        # load data
        file_n = "click_data.h5"
        self.h5file = tables.open_file(file_n, mode='r')

        if which_set == 'test':
            test_group = self.h5file.root.test.test_raw
            self.X = test_group.X_t
            self.y = None
        else:
            train_group = self.h5file.root.train.train_raw
            if which_set == 'train':
                self.X = train_group.X_train
                self.y = train_group.y_train
            else:
                self.X = train_group.X_valid
                self.y = train_group.y_valid

        self.samples = slice(0, self.X.shape[0])
        self.sample_index = self.samples.start
        self.examples = self.X.shape[0]

        max_labels = 2

        X_source = 'features'
        X_space = VectorSpace(dim=23)
        if self.y is None:
            # Test set: features only. Previously this path fell through
            # to the composite construction and hit an undefined y_space.
            space = X_space
            source = X_source
        else:
            y_space = IndexSpace(dim=1, max_labels=max_labels)
            y_source = 'targets'
            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)
        self.X_space = X_space

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')
        self._iter_subset_class = resolve_iterator_class('even_sequential')
Beispiel #8
0
 def test_y_index_space(self):
     """
     Requesting the targets in an IndexSpace and walking the whole
     iterator must complete without raising.
     """
     target_specs = (IndexSpace(max_labels=10, dim=1), 'targets')
     batches = self.test.iterator(mode='sequential',
                                  data_specs=target_specs,
                                  batch_size=100)
     # Exhaust the iterator; the batch contents are not inspected.
     for _batch in batches:
         pass
Beispiel #9
0
class BowmanWordnetDataset(object):
    """
    Class-level description of a dataset with two index-valued inputs
    ('left_input', 'right_input') and one index-valued 'target'.

    All spaces are defined as class attributes; the constructor does
    nothing.
    """
    dtype = 'int32'
    chromaticity = 3  # number of target labels

    # Both inputs index into the same set of BWD_vertex_count labels.
    input_components = (
        IndexSpace(dim=1, max_labels=BWD_vertex_count, dtype=dtype),
        IndexSpace(dim=1, max_labels=BWD_vertex_count, dtype=dtype),
    )
    input_source = ('left_input', 'right_input')
    input_space = CompositeSpace(components=input_components)

    target_component = IndexSpace(dim=1, max_labels=chromaticity, dtype=dtype)
    target_source = ('target', )

    # Full specs: the two inputs followed by the target.
    data_specs = (
        CompositeSpace(components=input_components + (target_component, )),
        input_source + target_source,
    )

    def __init__(self):
        pass
Beispiel #10
0
def test_np_format_as_sequence2other():
    """
    Formatting sequence-space data as its non-sequence counterpart must
    raise ValueError, for both vector and index sequences.
    """
    # Vector sequence -> plain vector space
    seq_vectors = VectorSequenceSpace(dim=3)
    plain_vectors = VectorSpace(dim=3)
    samples = np.random.uniform(low=0.0, high=1.0, size=(10, 3))
    with np.testing.assert_raises(ValueError):
        seq_vectors.np_format_as(samples, plain_vectors)

    # Index sequence -> plain index space
    seq_indices = IndexSequenceSpace(max_labels=6, dim=1)
    plain_indices = IndexSpace(max_labels=6, dim=1)
    samples = np.random.randint(low=0, high=5, size=(10, 1))
    with np.testing.assert_raises(ValueError):
        seq_indices.np_format_as(samples, plain_indices)
    def set_topological_view(self, V, axes=('b', 0, 1, 'c')):
        """
        Make the dataset represent `V`, a batch of topological views.

        Rebuilds the view converter, the design matrix ``self.X`` and the
        data specs from `V`.

        Parameters
        ----------
        V : ndarray
            Batch of examples in topological form, laid out as `axes`.
            Must not contain NaN.
        axes : tuple, optional
            Axis order of `V`; the entries 0, 1 and 'c' are looked up to
            find the row, column and channel extents.

        .. todo::

            Why is this parameter named 'V'?
        """
        assert not np.any(np.isnan(V))
        topo_shape = [V.shape[axes.index(key)] for key in (0, 1, 'c')]
        self.view_converter = DefaultViewConverter(topo_shape, axes=axes)
        self.X = self.view_converter.topo_view_to_design_mat(V)
        # Default topological space, only used when self.iterator is called
        # without data_specs and with the deprecated "topo=True".
        self.X_topo_space = self.view_converter.topo_space
        assert not np.any(np.isnan(self.X))

        # Refresh the data specs to match the new design matrix.
        feature_space = VectorSpace(dim=self.X.shape[1])
        feature_source = 'features'

        if self.y is None:
            self.data_specs = (feature_space, feature_source)
        else:
            if self.y.ndim == 2:
                target_space = VectorSpace(dim=self.y.shape[-1])
            else:
                # Non-matrix targets are treated as single integer labels,
                # which requires max_labels to be set.
                assert self.max_labels
                target_space = IndexSpace(max_labels=self.max_labels, dim=1)
            self.data_specs = (
                CompositeSpace((feature_space, target_space)),
                (feature_source, 'targets'))

        self.X_space = feature_space
        self._iter_data_specs = (feature_space, feature_source)
Beispiel #12
0
    def __init__(self, which_set, data_mode, context_len=None, shuffle=True):
        """
        Cut the raw data into (input, next-token target) sequence pairs.

        Parameters
        ----------
        which_set : str
            Forwarded to ``self._load_data``.
        data_mode : str
            Forwarded to ``self._load_data``.
        context_len : int, optional
            Length of each sequence chunk. Defaults to the whole data
            minus one element, i.e. a single chunk.
        shuffle : bool, optional
            Not used by this constructor.
        """
        self._load_data(which_set, context_len, data_mode)

        def index_sequences():
            return SequenceDataSpace(
                IndexSpace(dim=1, max_labels=self._max_labels))

        specs = (CompositeSpace([index_sequences(), index_sequences()]),
                 ('features', 'targets'))

        n_tokens = len(self._raw_data) - 1
        if context_len is None:
            context_len = n_tokens
        n_chunks = int(np.ceil(n_tokens / float(context_len)))

        def chunked(tokens):
            # Consecutive slices of `context_len` items, each with a
            # trailing singleton axis added.
            return np.asarray([
                tokens[k * context_len:(k + 1) * context_len, np.newaxis]
                for k in range(n_chunks)
            ])

        # Targets are the inputs shifted by one position.
        X = chunked(self._raw_data[:-1])
        y = chunked(self._raw_data[1:])
        super(PennTreebankSequences, self).__init__(data=(X, y),
                                                    data_specs=specs)
Beispiel #13
0
    def create_model(self):
        """
        Build a three-layer MLP (projection, tanh, softmax) over a window
        of word indices and store it in ``self.model``.
        """
        layers = [
            # input will be projected. ProjectionLayer? MatrixMul? IndexSpace?
            ProjectionLayer(layer_name='X', dim=self.edim, irange=0.),
            # sparse_init=15?
            Tanh(layer_name='h0', dim=self.hdim, irange=.01),
            Softmax(layer_name='softmax',
                    n_classes=self.vocab_size,
                    irange=0.,
                    binary_target_dim=1),
        ]
        window_space = IndexSpace(max_labels=self.vocab_size,
                                  dim=self.window_size)
        self.model = MLP(layers=layers, input_space=window_space)
Beispiel #14
0
def test_compare_index():
    """
    IndexSpaces are equal only when both dim and max_labels match, and
    never equal to a space of another kind.
    """
    # (dim, max_labels) for each space; only the first two are identical.
    specs = [(5, 10), (5, 10), (5, 9), (6, 10)]
    index_spaces = [IndexSpace(dim=dim, max_labels=max_label)
                    for dim, max_label in specs]

    assert index_spaces[0] == index_spaces[1]
    for i, j in itertools.combinations([1, 2, 3], 2):
        assert not index_spaces[i] == index_spaces[j]

    other_spaces = (
        VectorSpace(dim=5),
        CompositeSpace((index_spaces[0], )),
        Conv2DSpace(shape=(8, 8), num_channels=3, axes=('b', 'c', 0, 1)),
    )
    for other in other_spaces:
        for index_space in index_spaces:
            assert not index_space == other
Beispiel #15
0
def test_np_format_as_index2index():
    """
    Formatting IndexSpace data as another IndexSpace must keep the values
    and adopt the dtype of the target space.
    """
    source_space = IndexSpace(max_labels=10, dim=1)
    labels = np.array([[0], [2], [1], [3], [5], [8], [1]])

    # Target identical to the source: spaces compare equal, data unchanged.
    same_space = IndexSpace(max_labels=10, dim=1)
    converted = source_space.np_format_as(labels, same_space)
    assert source_space == same_space
    assert np.all(converted == labels)

    # Target with a narrower dtype: spaces differ, values survive the cast.
    int32_space = IndexSpace(max_labels=10, dim=1, dtype='int32')
    converted = source_space.np_format_as(labels, int32_space)
    assert source_space != int32_space
    assert np.all(converted == labels)
    assert converted.dtype == 'int32'
    assert labels.dtype == 'int64'
Beispiel #16
0
class ChainDataset(Dataset):
    """Generator of training sequences sampled from a 3-state chain.

    Implements the parts of the PyLearn2 dataset interface defined below.

    """
    num_states = 3
    # Row i holds the probabilities of moving from state i to each state.
    trans_prob = numpy.array([[0.1, 0.5, 0.4],
                              [0.1, 0.9, 0.0],
                              [0.3, 0.3, 0.4]])
    # Eigenvector of the transposed transition matrix belonging to the
    # largest eigenvalue, normalised to sum to one.
    values, vectors = numpy.linalg.eig(trans_prob.T)
    equilibrium = vectors[:, values.argmax()]
    equilibrium = equilibrium / equilibrium.sum()
    # Element-wise p * log(p) (1e-6 guards log(0)), then the
    # equilibrium-weighted total over all transitions.
    trans_entropy = trans_prob * numpy.log(trans_prob + 1e-6)
    entropy = equilibrium.dot(trans_entropy).sum()

    data_specs = (
        SequenceDataSpace(IndexSpace(max_labels=num_states, dim=1)),
        'x',
    )

    def __init__(self, rng, seq_len):
        update_instance(self, locals())

    def iterator(self, batch_size, data_specs,
                 return_tuple, mode, num_batches, rng=None):
        """Return a PyLearn2 compatible iterator yielding `num_batches`
        freshly sampled one-element tuples."""
        assert return_tuple

        owner = self

        class Iterator(six.Iterator):
            # Not actually true, but PyLearn2 is told that this iterator
            # is not stochastic; makes life easier for now.
            stochastic = False
            num_examples = num_batches * batch_size

            def __init__(self, **kwargs):
                self.batches_retrieved = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.batches_retrieved >= num_batches:
                    raise StopIteration()
                self.batches_retrieved += 1
                batch = owner._next_batch(batch_size)
                # Append a trailing axis of length one to the batch.
                return (batch[..., None],)

        return Iterator()

    def get_num_examples(self):
        """Part of the PyLearn2 Dataset interface."""
        return float('inf')

    def _next_single(self):
        """Sample one state sequence of length `seq_len`, starting at 0."""
        chain = [0]
        while len(chain) != self.seq_len:
            draw = numpy.random.multinomial(1, self.trans_prob[chain[-1]])
            chain.append(draw.argmax())
        return chain

    def _next_batch(self, batch_size):
        """Generate random sequences from the family."""
        batch = numpy.zeros((self.seq_len, batch_size), dtype='int64')
        for column in range(batch_size):
            batch[:, column] = self._next_single()
        return batch
    def __init__(self,
                 path,
                 data_node,
                 transformer,
                 X_str,
                 s_str,
                 y_str=None,
                 y_labels=None,
                 start=0,
                 stop=None,
                 axes=('b', 0, 1, 'c'),
                 rescale=None,
                 rng=_default_seed):
        """
        Open an HDF5-backed image dataset and set up its data specs.

        Parameters
        ----------
        path : str
            Location of the HDF5 file; preprocessed and cached locally
            before being opened.
        data_node : str
            Name of the node under '/' that holds the arrays.
        transformer : object
            Must provide ``get_shape()``, which gives the image shape of
            the feature space.
        X_str : str
            Attribute name of the features on the node.
        s_str : str
            Attribute name of the per-example shapes on the node; the last
            entry of the first shape is used as the channel count.
        y_str : str, optional
            Attribute name of the targets; None means no targets.
        y_labels : int, optional
            When given, targets live in an IndexSpace with this many
            labels, otherwise in a VectorSpace.
        start : int, optional
            First example to use.
        stop : int, optional
            One past the last example to use; defaults to all examples.
        axes : tuple, optional
            Axis order of the feature space.
        rescale : float, optional
            Stored as ``self.rescale`` after conversion to float; None is
            stored unchanged.
        rng : object, optional
            Seed or generator handed to ``make_np_rng``.

        Raises
        ------
        ValueError
            If any feature row has values outside [0, 1].
        """
        # Locally cache the files before reading them
        path = preprocess(path)
        path = cache.datasetCache.cache_file(path)

        self.h5file = tables.openFile(path, mode="r")
        node = self.h5file.getNode('/', data_node)

        # float(None) raises TypeError, which made the default unusable.
        # NOTE(review): confirm that consumers of self.rescale accept None.
        self.rescale = None if rescale is None else float(rescale)

        self.rng = make_np_rng(rng, which_method="random_integers")

        self.X = getattr(node, X_str)
        # Make sure images have values in [0, 1]. This is needed for
        # self.adjust_for_viewer, amongst other things.
        # The previous numpy.all(<generator>) was always truthy, so the
        # check never ran; this one reads every row once.
        # Assumes iterrows() yields numeric arrays -- TODO confirm.
        rows_in_range = all(numpy.all((row >= 0) & (row <= 1))
                            for row in self.X.iterrows())
        if not rows_in_range:
            raise ValueError("features must be normalized between 0 and 1")
        self.axes = axes
        self.s = getattr(node, s_str)
        self.y = getattr(node, y_str) if y_str is not None else None

        self.y_labels = y_labels
        self._check_labels()

        self.transformer = transformer

        X_source = 'features'
        shape = self.transformer.get_shape()
        channels = self.s[0][-1]
        X_space = Conv2DSpace(shape=shape, num_channels=channels, axes=axes)

        if self.y is None:
            space = X_space
            source = X_source
        else:
            dim = 1 if self.y.ndim == 1 else self.y.shape[-1]
            if self.y_labels is not None:
                y_space = IndexSpace(dim=dim, max_labels=self.y_labels)
            else:
                y_space = VectorSpace(dim=dim)
            y_source = 'targets'
            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class(
            'batchwise_shuffled_sequential')
        self._iter_data_specs = self.data_specs

        self.start = start
        self.stop = self.X.shape[0] if stop is None else stop
        assert (self.start >= 0 and self.start < self.stop
                and self.stop <= self.X.shape[0])
        self.num_examples = self.stop - self.start

        # Data buffers
        self.X_buffer = None
        self.y_buffer = None
Beispiel #18
0
def test_np_format_as_index2vector():
    """
    Formatting IndexSpace batches as VectorSpace must give one-hot style
    outputs of the right shape, and the Theano and NumPy implementations
    must agree.
    """
    # Test 5 random batches for shape, number of non-zeros
    for _ in xrange(5):
        n_classes = np.random.randint(2, 10)
        n_rows = np.random.randint(1, 10)
        n_labels = np.random.randint(1, 10)
        raw = np.random.random_integers(n_classes - 1,
                                        size=(n_rows, n_labels))
        index_space = IndexSpace(dim=n_labels, max_labels=n_classes)
        merge_space = VectorSpace(dim=n_classes)
        concat_space = VectorSpace(dim=n_classes * n_labels)

        merged = index_space.np_format_as(raw, merge_space)
        concatenated = index_space.np_format_as(raw, concat_space)

        assert merged.shape == (n_rows, n_classes)
        assert concatenated.shape == (n_rows, n_classes * n_labels)
        assert np.count_nonzero(merged) <= raw.size
        assert np.count_nonzero(concatenated) == raw.size
        assert np.all(np.unique(concatenated) == np.array([0, 1]))

    # Make sure Theano variables give the same result. The spaces and
    # sizes left over from the last loop iteration are reused here.
    sym_batch = tensor.lmatrix('batch')
    sym_single = tensor.lvector('single')
    n_rows = np.random.randint(1, 10)
    np_batch = np.random.random_integers(n_classes - 1,
                                         size=(n_rows, n_labels))
    np_single = np.random.random_integers(n_classes - 1,
                                          size=(n_labels))

    for symbolic, numeric in ((sym_batch, np_batch),
                              (sym_single, np_single)):
        for target in (merge_space, concat_space):
            compiled = theano.function(
                [symbolic],
                index_space._format_as_impl(False, symbolic, target)
            )
            np.testing.assert_allclose(
                compiled(numeric),
                index_space._format_as_impl(True, numeric, target)
            )
    # Number of examples to hold out: args.devpercent percent of the rows
    # of X, truncated to an integer.
    num_dev = int(X.shape[0] * 0.01 * args.devpercent)
    log.info('.. Using %f percent of data (%d examples) as development data',
             args.devpercent, num_dev)

    # The development split is taken from the end of each array; the
    # remaining leading rows stay as training data.
    if num_dev > 0:
        X_dev = X[-num_dev:]
        X = X[:-num_dev]
        speakers_dev = speakers[-num_dev:]
        speakers = speakers[:-num_dev]
        y_dev = y[-num_dev:]
        y = y[:-num_dev]

    # Three sources per example: a feature vector, one speaker index and a
    # one-dimensional target.
    space = CompositeSpace([
        VectorSpace(dim=len(feature_dict)),
        IndexSpace(dim=1, max_labels=num_speakers),
        VectorSpace(dim=1)
    ])
    source = ('features', 'speakers', 'targets')
    final_dataset = vector_spaces_dataset.VectorSpacesDataset(
        data=(X, speakers, y), data_specs=(space, source))

    if args.save_filename:
        log.info('.. Writing data to %s', args.save_filename)
        serial.save(args.save_filename, final_dataset)

    # NOTE(review): X_dev, speakers_dev and y_dev are only bound when
    # num_dev > 0, so this branch raises NameError if a dev filename is
    # given with a zero-sized split -- confirm against the caller.
    if args.savedev_filename:
        log.info('.. Writing dev data to %s', args.savedev_filename)
        final_dataset_dev = vector_spaces_dataset.VectorSpacesDataset(
            data=(X_dev, speakers_dev, y_dev), data_specs=(space, source))
Beispiel #20
0
    def __init__(self, which_set, center=False, rescale=False, gcn=None,
                 one_hot=None, start=None, stop=None, axes=('b', 'c', 0, 1),
                 preprocessor=None, noise_v=0., noise_a=0.):
        """
        Load the video and audio parts of the VidTIMIT data for one split.

        Parameters
        ----------
        which_set : str
            'train' or 'test'; any other value raises KeyError.
        center : bool, optional
            Subtract fixed offsets from the video and audio features.
        rescale : bool, optional
            Divide the video and audio features by fixed constants.
        gcn : object, optional
            Not used by this constructor.
        one_hot : object, optional
            Deprecated. Any value other than None replaces the integer
            targets with a one-hot matrix and emits a warning.
        start, stop : int, optional
            When `start` is given, keep only examples ``start:stop``;
            `stop` must then satisfy ``start < stop <= number of examples``.
        axes : tuple, optional
            Stored as ``self.axes``.
        preprocessor : object, optional
            If given, ``preprocessor.apply(self)`` is called at the end.
        noise_v, noise_a : float, optional
            Standard deviation of zero-mean Gaussian noise added to the
            video / audio features; 0 disables the noise.
        """
        self.noise_a = noise_a
        self.noise_v = noise_v

        self.axes = axes

        # we also expose the following details:
        self.img_shape = (1, 32, 32)
        self.img_size = np.prod(self.img_shape)
        self.n_classes = 36
        self.label_names = ['fadg0', 'fcft0', 'bird', 'cat', 'deer',
                            'dog', 'frog', 'horse', 'ship', 'truck']

        # NOTE(review): the pickles are assumed to come from a trusted
        # location; cPickle must not be used on untrusted data.
        data_dir = os.path.join(serial.preprocess('${VIDTIMIT}'), 'data')
        with open(os.path.join(data_dir, 'cut_dataset.pkl'), 'rb') as f:
            video = cPickle.load(f)
        with open(os.path.join(data_dir, 'audio_dataset.pkl'), 'rb') as f:
            audio = cPickle.load(f)

        # Boolean masks selecting the rows of each split; the video masks
        # are applied to the audio data as well.
        train_idx = video['train_idx'] == 1
        test_idx = video['test_idx'] == 1

        def gaussian_noise(std, shape):
            # A std of exactly 0 means "no noise" and draws nothing.
            if std == 0.:
                return 0.
            return np.random.normal(0., std, size=shape)

        # Noise is drawn for both splits, video first, so the random
        # stream does not depend on which_set.
        v_noise_tr = gaussian_noise(noise_v, video['data'][train_idx].shape)
        v_noise_te = gaussian_noise(noise_v, video['data'][test_idx].shape)
        a_noise_tr = gaussian_noise(noise_a, audio['data'][train_idx].shape)
        a_noise_te = gaussian_noise(noise_a, audio['data'][test_idx].shape)

        vXs = {'train': video['data'][train_idx] + v_noise_tr,
               'test': video['data'][test_idx] + v_noise_te}
        aXs = {'train': audio['data'][train_idx] + a_noise_tr,
               'test': audio['data'][test_idx] + a_noise_te}
        Ys = {'train': video['labels'][train_idx],
              'test': video['labels'][test_idx]}

        vX = np.asarray(vXs[which_set]).astype('float32')
        aX = np.asarray(aXs[which_set]).astype('float32')
        # Labels become a column of int32 values.
        y = Ys[which_set][np.newaxis].T.astype('int32')

        if which_set == 'test':
            assert y.shape[0] == 72
            y = y.reshape((y.shape[0], 1))

        if one_hot is not None:
            # Set exactly one entry per row: column = label of that row.
            # (The previous loop indexed rows by label value and filled
            # whole rows with ones.)
            ynew = np.zeros((y.shape[0], self.n_classes))
            ynew[np.arange(y.shape[0]), y.ravel()] = 1
            warnings.warn("the `one_hot` parameter is deprecated. To get "
                          "one-hot encoded targets, request that they "
                          "live in `VectorSpace` through the `data_specs` "
                          "parameter of MNIST's iterator method. "
                          "`one_hot` will be removed on or after "
                          "September 20, 2014.", stacklevel=2)
            y = ynew.astype('int32')

        if center:
            vX -= 0.2845
            aX -= -0.00003
        self.center = center

        if rescale:
            vX /= .1644
            aX /= .02
        self.rescale = rescale

        if start is not None:
            # Slicing comes after centering/rescaling so that it does not
            # affect the preprocessing applied above.
            assert start >= 0
            assert stop > start
            assert stop <= vX.shape[0]
            vX = vX[start:stop, :]
            aX = aX[start:stop, :]
            y = y[start:stop, :]

            assert vX.shape[0] == y.shape[0]

        if which_set == 'test':
            assert vX.shape[0] == 72

        space = CompositeSpace([
            Conv2DSpace(shape=[32, 32], num_channels=54,
                        axes=['b', 'c', 0, 1]),
            Conv2DSpace(shape=[1000, 1], num_channels=54,
                        axes=['b', 'c', 0, 1]),
            IndexSpace(36, 1)])
        source = ('video', 'audio', 'targets')
        super(FULLVIDTIMIT, self).__init__((vX, aX, y), (space, source))

        if preprocessor:
            preprocessor.apply(self)
    def __init__(self,
                 which_set,
                 start=None,
                 stop=None,
                 center=False,
                 rescale=False,
                 path='./',
                 axes=('b', 'c')):
        """
        Load 128-dimensional SIFT vectors and their binary labels.

        Parameters
        ----------
        which_set : str
            'train' or 'test'.
        start, stop : int, optional
            When `start` is given, keep only examples ``start:stop``.
        center, rescale : bool, optional
            Not used by this constructor.
        path : str, optional
            Prefix prepended verbatim to the ``.npy`` file names, so a
            directory must end with a path separator.
        axes : tuple, optional
            Not used by this constructor.

        Raises
        ------
        ValueError
            If `which_set` is not recognised, if `stop` exceeds the number
            of examples, or if the slice has an unexpected length.
        """
        # we also expose the following details:
        # NOTE(review): (128) is the integer 128, not a 1-tuple; kept as
        # is because callers may rely on it.
        self.shape = (128)
        self.size = np.prod(self.shape)
        self.n_classes = 2

        if which_set not in ['train', 'test']:
            raise ValueError('Unrecognized which_set value "%s". '
                             'Valid values are ["train","test"].' %
                             (which_set, ))

        im_path = path + which_set + 'SIFT_vectors.npy'
        y_path = path + which_set + 'SIFT_labels.npy'

        time1 = time.time()
        X = np.load(im_path).reshape((-1, 128))
        Y = np.load(y_path).reshape((-1, 1))
        time2 = time.time()
        # Single-argument call form prints the same on Python 2 and 3.
        print('Loading data took %0.3f ms' % ((time2 - time1) * 1000.0))

        assert X.shape[1] == 128

        source = ('features', 'targets')
        space = CompositeSpace(
            [VectorSpace(128),
             IndexSpace(dim=1, max_labels=2)])

        self.X = X
        self.y = Y

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' + 'm=' +
                                 str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d" %
                                 (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start

        super(FoodData, self).__init__(data=(self.X, self.y),
                                       data_specs=(space, source))
Beispiel #22
0
    def __init__(self,
                 X_load_path=None,
                 X_from_scipy_sparse_dataset=None,
                 X_zipped_npy=False,
                 y_path=None,
                 y_labels=None,
                 y_part=None,
                 rng=_default_seed):
        """
        Build a sparse dataset from a file or from an in-memory matrix,
        optionally attaching targets read from an HDF5 file.

        Parameters
        ----------
        X_load_path : str, optional
            File to load the features from. When `None`, the features are
            taken from `X_from_scipy_sparse_dataset`.
        X_from_scipy_sparse_dataset : scipy sparse matrix, optional
            Features to use when `X_load_path` is `None`.
        X_zipped_npy : bool, optional
            If `True`, `X_load_path` is opened with gzip and loaded as a
            single array. Otherwise it is loaded with `numpy.load` and must
            provide the entries 'data', 'indices', 'indptr' and 'shape'.
        y_path : str, optional
            HDF5 file whose node `/train/train_raw/y` holds the targets.
        y_labels : int, optional
            Number of labels. When given, targets are described by an
            `IndexSpace`, otherwise by a `VectorSpace`.
        y_part : int, optional
            Which part of the targets to keep; row counts per part are read
            from 'dayrows.pkl' in the current working directory. Required
            when `y_path` is given.
        rng : object, optional
            Accepted but not used by this constructor.

        Raises
        ------
        TypeError
            If the features come from `X_from_scipy_sparse_dataset` and
            that object is not a scipy sparse matrix.
        """
        if X_load_path is not None:
            if X_zipped_npy is True:
                logger.info('... loading sparse data set from a zip npy file')
                zipped = gzip.open(X_load_path)
                try:
                    dense = numpy.load(zipped)
                finally:
                    # Release the gzip handle even when loading fails.
                    zipped.close()
                self.X = scipy.sparse.csr_matrix(dense, dtype=floatX)
            else:
                logger.info('... loading sparse data set from a npy file')
                loader = numpy.load(X_load_path)
                try:
                    self.X = scipy.sparse.csr_matrix(
                        (loader['data'], loader['indices'],
                         loader['indptr']),
                        shape=loader['shape'], dtype=floatX)
                finally:
                    # Archive objects returned by numpy.load hold the file
                    # open; plain arrays have no close(), hence the guard.
                    close_loader = getattr(loader, 'close', None)
                    if close_loader is not None:
                        close_loader()
        else:
            logger.info('... building from given sparse dataset')
            self.X = X_from_scipy_sparse_dataset
            if not scipy.sparse.issparse(X_from_scipy_sparse_dataset):
                msg = "from_scipy_sparse_dataset is not sparse : %s" \
                      % type(self.X)
                raise TypeError(msg)

        if y_path is not None:
            logger.info('... loading y data set from a hdf5 file')
            # Checked before any file is opened so a failure leaks nothing.
            assert y_part is not None
            # NOTE: unpickling can execute arbitrary code; 'dayrows.pkl'
            # must come from a trusted source.
            with open('dayrows.pkl', 'rb') as f:
                dayrows = cPickle.load(f)
            file_handler = tables.open_file(y_path, mode="r")
            try:
                y = file_handler.root.train.train_raw.y
                # Slicing copies the selected rows into memory, so the
                # file can be closed afterwards.
                self.y = y[sum(dayrows[:y_part - 1]):sum(dayrows[:y_part])]
            finally:
                file_handler.close()
        self.y_labels = y_labels

        X_source = 'features'
        X_space = VectorSpace(dim=self.X.shape[1], sparse=True)

        if y_path is None:
            space = X_space
            source = X_source
        else:
            if self.y.ndim == 1:
                dim = 1
            else:
                dim = self.y.shape[-1]
            if self.y_labels is not None:
                y_space = IndexSpace(dim=dim, max_labels=self.y_labels)
            else:
                y_space = VectorSpace(dim=dim)
            y_source = 'targets'

            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)
        self.X_space = X_space

        # Defaults used when an iterator is requested without arguments.
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')
Beispiel #23
0
    def __init__(self,
                 which_set,
                 frame_length,
                 overlap=0,
                 frames_per_example=1,
                 start=0,
                 stop=None,
                 audio_only=False,
                 rng=_default_seed):
        """
        Load one subset of the data, cut every sequence into frames and
        set up the data specs together with one batch-building function
        per source.

        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of acoustic samples shared by two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Index of the first sequence to use. Defaults to 0.
        stop : int, optional
            Index one past the last sequence to use. Defaults to `None`,
            meaning sequences are used all the way to the end.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary
            information. Defaults to `False`.
        rng : object, optional
            A random number generator (anything exposing
            `random_integers`) or a seed for a new
            `numpy.random.RandomState`.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only

        # RNG initialization: reuse a generator, otherwise treat `rng`
        # as a seed.
        self.rng = (rng if hasattr(rng, 'random_integers')
                    else numpy.random.RandomState(rng))

        # Load data from disk, then standardize every sequence
        self._load_data(which_set)
        for wav_index, wav in enumerate(self.raw_wav):
            self.raw_wav[wav_index] = (wav - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            # Label counts are measured on the loaded subset, before it
            # is sliced below.
            for count_attr, label_sequences in (
                    ('num_phones', self.phones),
                    ('num_phonemes', self.phonemes),
                    ('num_words', self.words)):
                setattr(self, count_attr,
                        numpy.max([numpy.max(labels)
                                   for labels in label_sequences]) + 1)
            # The speaker count is hard coded: measuring it as above would
            # be wrong whenever the largest id (over train + valid + test)
            # is missing from the loaded subset, which happens for
            # speakers.
            self.num_speakers = 630

        # Slice data; a `stop` of None leaves the range open-ended.
        window = slice(start, stop)
        self.raw_wav = self.raw_wav[window]
        if not self.audio_only:
            self.phones = self.phones[window]
            self.phonemes = self.phonemes[window]
            self.words = self.words[window]
            self.speaker_id = self.speaker_id[window]

        # Label streams that are framed alongside the audio.
        if self.audio_only:
            label_streams = ()
        else:
            label_streams = (self.phones, self.phonemes, self.words)

        examples_per_sequence = [0]
        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            for stream in label_streams:
                stream[sequence_id] = segment_axis(stream[sequence_id],
                                                   frame_length, overlap)

            # TODO: look at this, does it force copying the data?
            framed_samples = segment_axis(samples_sequence, frame_length,
                                          overlap)
            self.raw_wav[sequence_id] = framed_samples

            # TODO: change me
            # An example needs `frames_per_example` input frames plus the
            # frame right after them, which serves as the target.
            # NOTE(review): a sequence with fewer frames than
            # `frames_per_example` gives a negative count here -- confirm
            # this cannot happen for the frame lengths in use.
            examples_per_sequence.append(framed_samples.shape[0] -
                                         self.frames_per_example)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        def features_map_fn(indexes):
            # The `frames_per_example` consecutive frames, flattened.
            return [self.samples_sequences[seq]
                    [ex:ex + self.frames_per_example].ravel()
                    for seq, ex in self._fetch_index(indexes)]

        def next_frame_map_fn(sequences_attr):
            # Build a map function returning, for each example, the frame
            # that follows it in the sequences stored on `sequences_attr`.
            def map_fn(indexes):
                return [getattr(self, sequences_attr)[seq]
                        [ex + self.frames_per_example].ravel()
                        for seq, ex in self._fetch_index(indexes)]
            return map_fn

        features_space = VectorSpace(dim=self.frame_length *
                                     self.frames_per_example)
        features_source = 'features'
        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'

        # One (space, source, map function) triple per data source.
        components = [
            (features_space, features_source, features_map_fn),
            (targets_space, targets_source,
             next_frame_map_fn('samples_sequences')),
        ]

        if not self.audio_only:
            for source_name, sequences_attr, max_labels in (
                    ('phones', 'phones_sequences', self.num_phones),
                    ('phonemes', 'phonemes_sequences', self.num_phonemes),
                    ('words', 'words_sequences', self.num_words)):
                label_space = IndexSpace(
                    max_labels=max_labels,
                    dim=1,
                    dtype=str(getattr(self, sequences_attr)[0].dtype))
                components.append((label_space, source_name,
                                   next_frame_map_fn(sequences_attr)))

            def speaker_id_map_fn(indexes):
                return [self.speaker_id[seq].ravel()
                        for seq, ex in self._fetch_index(indexes)]

            components.append(
                (IndexSpace(max_labels=self.num_speakers,
                            dim=1,
                            dtype=str(self.speaker_id.dtype)),
                 'speaker_id', speaker_id_map_fn))

            def speaker_info_map_fn(first, last):
                # Build a map function decoding the one-hot slice
                # [first:last] of the speaker's info record.
                def map_fn(indexes):
                    return [index_from_one_hot(
                                self.speaker_info_list[
                                    self.speaker_id[seq]][first:last])
                            for seq, ex in self._fetch_index(indexes)]
                return map_fn

            # NOTE(review): position 15 of the info record is covered by
            # none of these slices -- confirm the record layout.
            for source_name, max_labels, first, last in (
                    ('dialect', 8, 1, 9),
                    ('education', 6, 9, 15),
                    ('race', 8, 16, 24),
                    ('gender', 2, 24, None)):
                components.append(
                    (IndexSpace(max_labels=max_labels, dim=1, dtype='int32'),
                     source_name, speaker_info_map_fn(first, last)))

        space_components = [entry[0] for entry in components]
        source_components = [entry[1] for entry in components]
        map_fn_components = [entry[2] for entry in components]

        self.data_specs = (CompositeSpace(space_components),
                           tuple(source_components))
        self.map_functions = tuple(map_fn_components)
        # One empty slot per source.
        self.batch_buffers = [None] * len(components)

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))
Beispiel #24
0
                     args.read_features_filename)
            feature_dict = OrderedDict(
                durmodel_utils.get_features(args.read_features_filename))
        else:
            log.info('.. Reading features from full_features_and_durs')
            feature_dict = OrderedDict()
            for (_features, _speaker_id, d) in full_features_and_durs:
                for f in _features:
                    feature_name = f[0]
                    feature_dict.setdefault(feature_name, len(feature_dict))

        if args.write_features_filename:
            log.info('.. Writing features to %s', args.write_features_filename)
            durmodel_utils.write_features(
                args.write_features_filename, feature_dict)

        matrices = create_matrices(feature_dict, full_features_and_durs)

        space = CompositeSpace([VectorSpace(dim=len(feature_dict)),
                                IndexSpace(dim=1, max_labels=len(speaker_ids)),
                                VectorSpace(dim=1)])
        source = ('features', 'speakers', 'targets')
        dataset = vector_spaces_dataset.VectorSpacesDataset(
            data=matrices,
            data_specs=(space, source)
        )

    if args.save_filename:
        log.info('.. Writing data to %s', args.save_filename)
        serial.save(args.save_filename, dataset)
Beispiel #25
0
def test_np_format_as_index2vector():
    """
    Check that IndexSpace formats index batches into VectorSpaces of size
    `max_labels` (merged) and `max_labels * labels` (concatenated), and
    that the symbolic `_format_as` agrees with `np_format_as`.
    """
    # Test 5 random batches for shape, number of non-zeros
    for _ in range(5):
        max_labels = np.random.randint(2, 10)
        batch_size = np.random.randint(1, 10)
        labels = np.random.randint(1, 10)
        batch = np.random.random_integers(max_labels - 1,
                                          size=(batch_size, labels))
        index_space = IndexSpace(dim=labels, max_labels=max_labels)
        vector_space_merge = VectorSpace(dim=max_labels)
        vector_space_concatenate = VectorSpace(dim=max_labels * labels)
        merged = index_space.np_format_as(batch, vector_space_merge)
        concatenated = index_space.np_format_as(batch,
                                                vector_space_concatenate)

        # A batch of one example is expected without its batch axis.
        leading = (batch_size, ) if batch_size > 1 else ()
        assert merged.shape == leading + (max_labels, )
        assert concatenated.shape == leading + (max_labels * labels, )

        assert np.count_nonzero(merged) <= batch.size
        assert np.count_nonzero(concatenated) == batch.size
        assert np.all(np.unique(concatenated) == np.array([0, 1]))

    # Make sure Theano variables give the same result. This part reuses
    # the spaces and sizes left over from the last loop iteration.
    batch = tensor.lmatrix('batch')
    single = tensor.lvector('single')
    batch_size = np.random.randint(2, 10)
    np_batch = np.random.random_integers(max_labels - 1,
                                         size=(batch_size, labels))
    np_single = np.random.random_integers(max_labels - 1, size=(labels))

    # (symbolic input, numeric input, target space) for every combination
    cases = [
        (batch, np_batch, vector_space_merge),
        (batch, np_batch, vector_space_concatenate),
        (single, np_single, vector_space_merge),
        (single, np_single, vector_space_concatenate),
    ]
    # Compile all functions first, then compare, in the same order.
    compiled = [theano.function([symbolic],
                                index_space._format_as(symbolic, target))
                for symbolic, _, target in cases]
    for formatter, (_, numeric, target) in zip(compiled, cases):
        np.testing.assert_allclose(
            formatter(numeric),
            index_space.np_format_as(numeric, target))
Beispiel #26
0
    def __init__(
        self,
        path,
        name='',  # optional name

        # selectors
        subjects='all',  # optional selector (list) or 'all'
        trial_types='all',  # optional selector (list) or 'all'
        trial_numbers='all',  # optional selector (list) or 'all'
        conditions='all',  # optional selector (list) or 'all'     
        partitioner=None,
        channel_filter=NoChannelFilter(
        ),  # optional channel filter, default: keep all
        channel_names=None,  # optional channel names (for metadata)
        label_map=None,  # optional conversion of labels
        remove_dc_offset=False,  # optional subtraction of channel mean, usually done already earlier
        resample=None,  # optional down-sampling

        # optional sub-sequences selection
        start_sample=0,
        stop_sample=None,  # optional for selection of sub-sequences

        # optional signal filter to by applied before spitting the signal
        signal_filter=None,

        # windowing parameters
        frame_size=-1,
        hop_size=-1,  # values > 0 will lead to windowing
        hop_fraction=None,  # alternative to specifying absolute hop_size

        # # optional spectrum parameters, n_fft = 0 keeps raw data
        # n_fft = 0,
        # n_freq_bins = None,
        # spectrum_log_amplitude = False,
        # spectrum_normalization_mode = None,
        # include_phase = False,
        flatten_channels=False,
        # layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

        # save_matrix_path = None,
        keep_metadata=False,
        target_mode='label',
    ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print self.params

        # TODO: get the whole filtering into an extra class

        datafiles_metadata, metadb = load_datafiles_metadata(path)

        #         print datafiles_metadata

        def apply_filters(filters, node):
            if isinstance(node, dict):
                filtered = []
                keepkeys = filters[0]
                for key, value in node.items():
                    if keepkeys == 'all' or key in keepkeys:
                        filtered.extend(apply_filters(filters[1:], value))
                return filtered
            else:
                return node  # [node]

        # keep only files that match the metadata filters
        self.datafiles = apply_filters(
            [subjects, trial_types, trial_numbers, conditions],
            datafiles_metadata)

        # copy metadata for retained files
        self.metadb = {}
        for datafile in self.datafiles:
            self.metadb[datafile] = metadb[datafile]

#         print self.datafiles
#         print self.metadb

        self.name = name

        if partitioner is not None:
            self.datafiles = partitioner.get_partition(self.name, self.metadb)

        # self.include_phase = include_phase
        # self.spectrum_normalization_mode = spectrum_normalization_mode
        # self.spectrum_log_amplitude = spectrum_log_amplitude

        self.sequence_partitions = [
        ]  # used to keep track of original sequences

        # metadata: [subject, trial_no, stimulus, channel, start, ]
        self.metadata = []

        sequences = []
        labels = []
        targets = []
        n_sequences = 0

        print(hop_size)
        if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
            hop_size = np.ceil(frame_size / hop_fraction)
        print(hop_size)

        if target_mode == 'next':
            # get 1 more value per frame as target
            frame_size += 1
        # print 'frame size: {}'.format(frame_size)

        for i in xrange(len(self.datafiles)):
            with log_timing(log,
                            'loading data from {}'.format(self.datafiles[i])):

                # save start of next sequence
                self.sequence_partitions.append(n_sequences)

                data, metadata = load(os.path.join(path, self.datafiles[i]))
                # data, metadata = self.generate_test_data()

                label = metadata['label']
                if label_map is not None:
                    label = label_map[label]

                multi_channel_frames = []
                multi_channel_targets = []

                # process 1 channel at a time
                for channel in xrange(data.shape[1]):
                    # filter channels
                    if not channel_filter.keep_channel(channel):
                        continue

                    samples = data[:, channel]
                    # print samples

                    # subtract channel mean
                    #FIXME
                    if remove_dc_offset:
                        samples -= samples.mean()

                    # down-sample if requested
                    if resample is not None and resample[0] != resample[1]:
                        samples = librosa.resample(samples, resample[0],
                                                   resample[1])

                    # apply optional signal filter after down-sampling -> requires lower order
                    if signal_filter is not None:
                        samples = signal_filter.process(samples)

                    # get sub-sequence in resampled space
                    # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                    samples = samples[start_sample:stop_sample]
                    # print start_sample, stop_sample, samples.shape

                    # if n_fft is not None and n_fft > 0: # Optionally:
                    #     ### frequency spectrum branch ###
                    #
                    #     # transform to spectogram
                    #     hop_length = n_fft / 4;
                    #
                    #     '''
                    #     from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                    #     >>> # Get a power spectrogram from a waveform y
                    #     >>> S       = np.abs(librosa.stft(y)) ** 2
                    #     >>> log_S   = librosa.logamplitude(S)
                    #     '''
                    #
                    #     S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length)
                    #     # mag = np.abs(S)        # magnitude spectrum
                    #     mag = np.abs(S)**2       # power spectrum
                    #
                    #     # include phase information if requested
                    #     if self.include_phase:
                    #         # phase = np.unwrap(np.angle(S))
                    #         phase = np.angle(S)
                    #
                    #     # Optionally: cut off high bands
                    #     if n_freq_bins is not None:
                    #         mag = mag[0:n_freq_bins, :]
                    #         if self.include_phase:
                    #             phase = phase[0:n_freq_bins, :]
                    #
                    #     if self.spectrum_log_amplitude:
                    #         mag = librosa.logamplitude(mag)
                    #
                    #     s = mag # for normalization
                    #
                    #     '''
                    #     NOTE on normalization:
                    #     It depends on the structure of a neural network and (even more)
                    #     on the properties of data. There is no best normalization algorithm
                    #     because if there would be one, it would be used everywhere by default...
                    #
                    #     In theory, there is no requirement for the data to be normalized at all.
                    #     This is a purely practical thing because in practice convergence could
                    #     take forever if your input is spread out too much. The simplest would be
                    #     to just normalize it by scaling your data to (-1,1) (or (0,1) depending
                    #     on activation function), and in most cases it does work. If your
                    #     algorithm converges well, then this is your answer. If not, there are
                    #     too many possible problems and methods to outline here without knowing
                    #     the actual data.
                    #     '''
                    #
                    #     ## normalize to mean 0, std 1
                    #     if self.spectrum_normalization_mode == 'mean0_std1':
                    #         # s = preprocessing.scale(s, axis=0);
                    #         mean = np.mean(s)
                    #         std = np.std(s)
                    #         s = (s - mean) / std
                    #
                    #     ## normalize by linear transform to [0,1]
                    #     elif self.spectrum_normalization_mode == 'linear_0_1':
                    #         s = s / np.max(s)
                    #
                    #     ## normalize by linear transform to [-1,1]
                    #     elif self.spectrum_normalization_mode == 'linear_-1_1':
                    #         s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))
                    #
                    #     elif self.spectrum_normalization_mode is not None:
                    #         raise ValueError(
                    #             'unsupported spectrum normalization mode {}'.format(
                    #                 self.spectrum_normalization_mode)
                    #          )
                    #
                    #     #print s.mean(axis=0)
                    #     #print s.std(axis=0)
                    #
                    #     # include phase information if requested
                    #     if self.include_phase:
                    #         # normalize phase to [-1.1]
                    #         phase = phase / np.pi
                    #         s = np.vstack([s, phase])
                    #
                    #     # transpose to fit pylearn2 layout
                    #     s = np.transpose(s)
                    #     # print s.shape
                    #
                    #     ### end of frequency spectrum branch ###
                    # else:
                    ### raw waveform branch ###

                    # normalize to max amplitude 1

                    s = librosa.util.normalize(samples)

                    # add 2nd data dimension
                    # s = s.reshape(s.shape[0], 1)
                    # print s.shape

                    ### end of raw waveform branch ###

                    s = np.asfarray(s, dtype='float32')

                    if frame_size > 0 and hop_size > 0:
                        # print 'frame size: {}'.format(frame_size)
                        s = s.copy(
                        )  # FIXME: THIS IS NECESSARY - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!!
                        frames = compute_frames(s,
                                                frame_length=frame_size,
                                                hop_length=hop_size)
                        # frames = librosa.util.frame(s, frame_length=frame_size, hop_length=hop_size)
                    else:
                        frames = s
                    del s
                    # print frames.shape

                    if target_mode == 'next':
                        frame_targets = np.empty(len(frames))
                        tmp = []
                        for f, frame in enumerate(frames):
                            tmp.append(frame[:-1])
                            frame_targets[f] = frame[-1]
                        frames = np.asarray(tmp)

                        # print frames.shape
                        # for f, frm in enumerate(frames):
                        #     print frm, frame_targets[f]
                        # # FIXME: OK so far

                    if flatten_channels:
                        # add artificial channel dimension
                        frames = frames.reshape(
                            (frames.shape[0], frames.shape[1], frames.shape[2],
                             1))
                        # print frames.shape

                        sequences.append(frames)

                        # increment counter by new number of frames
                        n_sequences += frames.shape[0]

                        if keep_metadata:
                            # determine channel name
                            channel_name = None
                            if channel_names is not None:
                                channel_name = channel_names[channel]
                            elif 'channels' in metadata:
                                channel_name = metadata['channels'][channel]

                            self.metadata.append({
                                'subject':
                                metadata['subject'],  # subject
                                'trial_type':
                                metadata['trial_type'],  # trial_type
                                'trial_no':
                                metadata['trial_no'],  # trial_no
                                'condition':
                                metadata['condition'],  # condition
                                'channel':
                                channel,  # channel
                                'channel_name':
                                channel_name,
                                'start':
                                self.sequence_partitions[-1],  # start
                                'stop':
                                n_sequences  # stop
                            })

                        for _ in xrange(frames.shape[0]):
                            labels.append(label)

                        if target_mode == 'next':
                            for next in frame_targets:
                                targets.append(next)
                    else:
                        multi_channel_frames.append(frames)
                        if target_mode == 'next':
                            multi_channel_targets.append(frame_targets)

                    ### end of channel iteration ###

                    # print np.asarray(multi_channel_frames, dtype=np.int)
                    # # FIXME: OK so far

                if not flatten_channels:
                    # turn list into array
                    multi_channel_frames = np.asfarray(multi_channel_frames,
                                                       dtype='float32')
                    # [channels x frames x time x freq] -> cb01
                    # [channels x frames x time x 1] -> cb0.

                    # move channel dimension to end
                    multi_channel_frames = np.rollaxis(
                        multi_channel_frames, 0,
                        len(multi_channel_frames.shape))
                    # print multi_channel_frames.shape
                    log.info(multi_channel_frames.shape)

                    sequences.append(multi_channel_frames)

                    # increment counter by new number of frames
                    n_sequences += multi_channel_frames.shape[0]

                    if keep_metadata:
                        self.metadata.append({
                            'subject':
                            metadata['subject'],  # subject
                            'trial_type':
                            metadata['trial_type'],  # trial_type
                            'trial_no':
                            metadata['trial_no'],  # trial_no
                            'condition':
                            metadata['condition'],  # condition
                            'channel':
                            'all',  # channel
                            'start':
                            self.sequence_partitions[-1],  # start
                            'stop':
                            n_sequences  # stop
                        })

                    for _ in xrange(multi_channel_frames.shape[0]):
                        labels.append(label)

                    if target_mode == 'next':
                        multi_channel_targets = np.asfarray(
                            multi_channel_targets, dtype='float32')
                        targets.append(multi_channel_targets.T)

                ### end of datafile iteration ###

        # print sequences[0].shape
        # print np.asarray(sequences[0], dtype=np.int)
        # # FIXME: looks OK

        # turn into numpy arrays
        sequences = np.vstack(sequences)
        # sequences = np.asarray(sequences).squeeze()

        # sequences = sequences.reshape(sequences.shape[0]*sequences.shape[1], sequences.shape[2])

        print('sequences: {}'.format(sequences.shape))

        labels = np.hstack(labels)
        self.labels = labels
        print('labels: {}'.format(labels.shape))

        if target_mode == 'label':
            targets = labels.copy()

            ## copy targets to fit SequenceDataSpace(VectorSpace) structure (*, frame_size, 12)
            # targets = targets.reshape((targets.shape[0], 1))
            # targets = np.repeat(targets, frame_size, axis=1)
            # print targets.shape
            # one_hot_formatter = OneHotFormatter(max(targets.max() + 1, len(label_map)), dtype=np.int)
            # one_hot_y = one_hot_formatter.format(targets)
            # print one_hot_y.shape

            ## copy targets to fit SequenceDataSpace(IndexSpace) structure -> (*, frame_size, 1)
            targets = targets.reshape((targets.shape[0], 1))
            targets = np.repeat(targets, frame_size, axis=1)
            targets = targets.reshape((targets.shape[0], targets.shape[1], 1))
            print(targets.shape)

        elif target_mode == 'next':
            targets = np.concatenate(targets)
            targets = targets.reshape((targets.shape[0], 1, targets.shape[1]))
        print('targets: {}'.format(targets.shape))

        n_channels = sequences.shape[2]
        print('number of channels: {}'.format(n_channels))

        # if layout == 'ft': # swap axes to (batch, feature, time, channels)
        #     sequences = sequences.swapaxes(1, 2)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))

        source = ('features', 'targets')
        # space = CompositeSpace([
        #     # VectorSequenceSpace(dim=64),
        #     SequenceSpace(VectorSpace(dim=64)),
        #     VectorSpace(dim=12),
        # ])

        if target_mode == 'label':
            space = CompositeSpace([
                SequenceDataSpace(VectorSpace(dim=n_channels)),
                # SequenceDataSpace(VectorSpace(dim=12)),
                SequenceDataSpace(IndexSpace(dim=1, max_labels=12)),
                # SequenceDataSpace(IndexSpace(dim=512, max_labels=12)),
            ])
        elif target_mode == 'next':
            space = CompositeSpace([
                # does not work with VectorSpacesDataset
                # SequenceSpace(VectorSpace(dim=64)),
                # SequenceSpace(VectorSpace(dim=64))
                SequenceDataSpace(VectorSpace(dim=n_channels)),
                SequenceDataSpace(VectorSpace(dim=n_channels))
                # VectorSpace(dim=n_channels)
            ])

        # source = ('features')
        # space = SequenceSpace(VectorSpace(dim=64))

        print('sequences: {}'.format(sequences.shape))
        print('targets: {}'.format(targets.shape))

        # for i, seq in enumerate(sequences):
        #     print np.asarray(seq, dtype=np.int)
        #     print np.asarray(targets[i], dtype=np.int)
        #     break
        #     # FIXME: looks OK

        # SequenceDataSpace(IndexSpace(dim=1, max_labels=self._max_labels)),
        if target_mode == 'label':
            super(MultiChannelEEGSequencesDataset, self).__init__(
                # data=(sequences, one_hot_y),  # works with vectorspace-target
                data=(sequences, targets),  # works with indexspace-target
                # data=sequences,
                data_specs=(space, source))
        elif target_mode == 'next':
            super(MultiChannelEEGSequencesDataset, self).__init__(
                # data=(sequences, one_hot_y),  # works with vectorspace-target
                data=(sequences, targets),  # works with indexspace-target
                # data=sequences,
                data_specs=(space, source))
Beispiel #27
0
 def get_target_space(self):
     """
     Build the space describing this object's targets.

     Returns
     -------
     IndexSpace
         An index space constructed with ``dim=1`` and ``max_labels=11``.
     """
     target_space = IndexSpace(dim=1, max_labels=11)
     return target_space
Beispiel #28
0
    def __init__(self,
                 chromosomes="ALL",
                 dataset_name="snp",
                 read_only=False, balance_classes=False,
                 start=None, stop=None, shuffle=False,
                 add_noise=False, rng=_default_seed, flip_labels=False):
        """
        Load per-chromosome data matrices plus labels and build data specs.

        Files are looked up in the directory returned by
        ``serial.preprocess("${PYLEARN2_NI_PATH}/" + dataset_name)``:
        one ``chr*.npy`` file per chromosome and a ``labels.npy`` file.

        Parameters
        ----------
        chromosomes : int or "ALL"
            Number of chromosome files to use (in numeric file order).
            "ALL", or a number larger than the number of files found,
            selects every file.
        dataset_name : str
            Name of the dataset sub-directory; also used in log messages
            and in the name of the HDF5 file of the read-only mode.
        read_only : bool
            If True, the data is read from an HDF5 file (created with
            ``self.make_h5`` when it does not exist yet) instead of
            being loaded into memory from the ``.npy`` files.
        balance_classes : bool
            If True, every class is truncated to the size of the
            smallest class (keeping the first rows of each class).
        start, stop : int or None
            Row slice applied to the labels and to each in-memory matrix.
        shuffle : bool
            Not referenced by this constructor.
        add_noise : bool or float
            If truthy, one ``RandomizeSNPs`` converter is created per
            chromosome space, with this value as corruption probability
            (``True`` means 0.05). Otherwise ``self.convert`` is None.
        rng : object
            Seed / generator forwarded to ``make_np_rng``.
        flip_labels : bool
            If True, labels are replaced by ``(y + 1) % 2``.
        """
        # Single-argument print(...) behaves identically as a Python 2
        # print statement and as a Python 3 function call.
        print("Loading %r chromosomes for %s" % (chromosomes, dataset_name))
        if start is not None and stop is not None:
            print("Start: %d, stop: %d" % (start, stop))
        assert isinstance(chromosomes, int) or chromosomes == "ALL",\
            "Can only set chromosomes to be an integer or ALL"

        p = serial.preprocess("${PYLEARN2_NI_PATH}/" + dataset_name)
        data_files = glob(path.join(p, "chr*.npy"))
        label_file = path.join(p, "labels.npy")

        def _digits_as_int(file_path):
            # Sort key: all digit characters of the path, read as one int.
            return int(''.join(ch for ch in file_path if ch.isdigit()))

        data_files.sort(key=_digits_as_int)

        if (chromosomes == "ALL" or chromosomes > len(data_files)):
            chromosomes = len(data_files)

        # Labels are stored as a column: one row per sample.
        self.y = np.atleast_2d(np.load(label_file)).T[start:stop]
        if flip_labels:
            self.y = (self.y + 1) % 2
        self.Xs = ()

        balanced_idx = None
        if balance_classes:
            num_classes = np.amax(self.y) + 1
            # Row indices of each class, computed once and reused below.
            per_class_idx = [np.where(self.y == i)[0].tolist()
                             for i in range(num_classes)]
            min_count = min(len(idx) for idx in per_class_idx)
            balanced_idx = []
            for idx in per_class_idx:
                balanced_idx += idx[:min_count]
            balanced_idx.sort()
            assert len(balanced_idx) / min_count == num_classes
            assert len(balanced_idx) % min_count == 0

            self.y = self.y[balanced_idx]
            for i in range(num_classes):
                assert len(np.where(self.y == i)[0].tolist()) == min_count

        if read_only:
            # The original code referenced an undefined name ``which_set``
            # here, which raised NameError; ``dataset_name`` is the
            # identifier this constructor actually receives.
            # NOTE(review): balanced_idx is not applied to the HDF5-backed
            # data in this branch - confirm whether balance_classes is
            # meant to be supported together with read_only.
            print("Format is read-only for %s" % dataset_name)
            h5_path = path.join(p, "gen." + dataset_name + ".h5")

            if not path.isfile(h5_path):
                self.make_h5(data_files,
                             h5_path,
                             start=start,
                             stop=stop)

            h5file = tables.openFile(h5_path)
            datas = [h5file.getNode("/", "Chr%d" % (c + 1))
                     for c in range(chromosomes)]
            self.Xs = tuple(data.X for data in datas)
            sizes = [h5file.getNode("/", "Sizes")[c]
                     for c in range(chromosomes)]

        else:
            print("Format is on-memory for %s" % dataset_name)
            sizes = []
            for c in range(0, chromosomes):
                X = np.load(data_files[c])[start:stop, :]

                # Sanity check that the sorted file order matches the
                # chromosome numbering.
                assert "%d" % (c+1) in data_files[c]

                if balanced_idx is not None:
                    X = X[balanced_idx]

                assert X.shape[0] == self.y.shape[0],\
                    "Data and labels have different number of samples (%d vs %d)" %\
                    (X.shape[0], self.y.shape[0])

                # Values are stored halved.
                self.Xs = self.Xs + (X / 2.0,)
                sizes.append(X.shape[1])

        print("%s samples are %d" % (dataset_name, self.y.shape[0]))

        # One VectorSpace / source name per chromosome.
        space = tuple(VectorSpace(dim=size) for size in sizes)
        source = tuple("chromosomes_%d" % (c + 1) for c in range(chromosomes))

        self.X_space = CompositeSpace(space)
        self.X_source = source

        # Full specs: all chromosome inputs followed by the targets.
        space = space + (IndexSpace(dim=1, max_labels=2),)
        source = source + ("targets",)
        space = CompositeSpace(space)

        self.data_specs = (space, source)
        self.rng = make_np_rng(rng, which_method="random_integers")
        assert self.rng is not None

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class("shuffled_sequential")
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = self.data_specs

        if add_noise:
            if add_noise is True:
                add_noise = 0.05
            # One converter per input space; the trailing None lines up
            # with the "targets" source, which gets no converter.
            self.convert = [
                randomize_snps.RandomizeSNPs(input_space=x_space,
                                             corruption_prob=add_noise)
                for x_space in self.X_space.components
            ] + [None]
        else:
            self.convert = None
Beispiel #29
0
    def __init__(self, X=None, topo_view=None, y=None, latent=None,
                 view_converter=None, axes=('b', 0, 1, 'c'),
                 rng=_default_seed, preprocessor=None, fit_preprocessor=False,
                 X_labels=None, y_labels=None):
        """
        Set up a design-matrix dataset, optionally with targets and latents.

        Parameters
        ----------
        X : ndarray, optional
            Design matrix. Required unless `topo_view` is given.
        topo_view : ndarray, optional
            Batch of topological views; if given, it is passed to
            `set_topological_view` and `view_converter` must be None.
        y : ndarray, optional
            Targets. If given, a 'targets' source is added to the data
            specs.
        latent : ndarray, optional
            Latent values. If given together with `y`, a 'latents'
            source backed by a VectorSpace of size `latent.shape[-1]` is
            appended to the data specs.
        view_converter : object, optional
            Converter exposing a `topo_space` attribute.
        axes : tuple
            Axis order forwarded to `set_topological_view`.
        rng : object
            Seed / generator forwarded to `make_np_rng`.
        preprocessor : object, optional
            If truthy, `preprocessor.apply(self, can_fit=fit_preprocessor)`
            is called at the end of construction.
        fit_preprocessor : bool
            Forwarded as `can_fit` to the preprocessor.
        X_labels : int, optional
            If given, features are described by an IndexSpace with this
            many labels instead of a VectorSpace.
        y_labels : int, optional
            If given, targets are described by an IndexSpace with this
            many labels instead of a VectorSpace.

        Raises
        ------
        NotImplementedError
            If `view_converter` is given but has no `topo_space`.
        """
        self.latent = latent
        self.X = X
        self.y = y
        self.view_converter = view_converter
        self.X_labels = X_labels
        self.y_labels = y_labels

        self._check_labels()

        if topo_view is not None:
            assert view_converter is None
            self.set_topological_view(topo_view, axes)
        else:
            assert X is not None, ("DenseDesignMatrix needs to be provided "
                                   "with either topo_view, or X")
            if view_converter is not None:

                # Get the topo_space (usually Conv2DSpace) from the
                # view_converter
                if not hasattr(view_converter, 'topo_space'):
                    raise NotImplementedError("Not able to get a topo_space "
                                              "from this converter: %s"
                                              % view_converter)

                # self.X_topo_space stores a "default" topological space that
                # will be used only when self.iterator is called without a
                # data_specs, and with "topo=True", which is deprecated.
                self.X_topo_space = view_converter.topo_space
            else:
                self.X_topo_space = None

            # Update data specs, if not done in set_topological_view
            X_source = 'features'
            if X_labels is None:
                X_space = VectorSpace(dim=X.shape[1])
            else:
                dim = 1 if X.ndim == 1 else X.shape[-1]
                X_space = IndexSpace(dim=dim, max_labels=X_labels)

            if y is None:
                space = X_space
                source = X_source
            else:
                dim = 1 if y.ndim == 1 else y.shape[-1]
                if y_labels is not None:
                    y_space = IndexSpace(dim=dim, max_labels=y_labels)
                else:
                    y_space = VectorSpace(dim=dim)

                spaces = [X_space, y_space]
                sources = [X_source, 'targets']

                # `latent` is optional: reading latent.shape on the
                # default None used to raise AttributeError for every
                # plain (X, y) dataset, so only add it when supplied.
                if latent is not None:
                    spaces.append(VectorSpace(dim=latent.shape[-1]))
                    sources.append('latents')

                space = CompositeSpace(tuple(spaces))
                source = tuple(sources)

            self.data_specs = (space, source)
            self.X_space = X_space

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method="random_integers")
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor