def __init__(self, inputs, targets=None, name="dataset", keep_on_cpu=False): """ Parameters ---------- inputs : list of ndarray Training examples (can be variable length sequences). targets : ndarray (optional) Target for each training example (can be variable length sequences). name : str (optional) The name of the dataset is used to name Theano variables. Default: 'dataset'. """ self.keep_on_cpu = keep_on_cpu self.name = name self.inputs = inputs self.targets = targets self.symb_inputs = T.TensorVariable(type=T.TensorType( "floatX", [False] * (inputs[0].ndim + 1)), name=self.name + '_symb_inputs') self.symb_inputs.tag.test_value = inputs[0][ None, ...] # For debugging Theano graphs. self.symb_targets = None if self.has_targets: self.symb_targets = T.TensorVariable( type=T.TensorType("floatX", [False] * (targets[0].ndim + 1)), name=self.name + '_symb_targets') self.symb_targets.tag.test_value = targets[0][ None, ...] # For debugging Theano graphs.
def __init__(self, inputs, targets=None, name="dataset"): """ Parameters ---------- inputs : ndarray Training examples targets : ndarray (optional) Target for each training example. name : str (optional) The name of the dataset is used to name Theano variables. Default: 'dataset'. """ self.name = name self.inputs = inputs self.targets = targets self.symb_inputs = T.TensorVariable(type=T.TensorType( "floatX", [False] * self.inputs.ndim), name=self.name + '_symb_inputs') self.symb_inputs.tag.test_value = self.inputs.get_value( ) # For debugging Theano graphs. self.symb_targets = None if self.has_targets: self.symb_targets = T.TensorVariable( type=T.TensorType("floatX", [False] * self.targets.ndim), name=self.name + '_symb_targets') self.symb_targets.tag.test_value = self.targets.get_value( ) # For debugging Theano graphs.
def expr(self, model, data):
    weights = model.get_weights()
    error = weights - weights.mean()
    kurtosis = (weights.shape[0] * weights.shape[1] * N.sum(N.power(error, 4))
                / N.power(N.sum(N.power(error, 2)), 2) - 3)
    return T.TensorVariable(kurtosis)

def test_tensorvariable(self):
    ## Re-init counter
    Variable.__count__ = count(0)
    r1 = tensor.TensorType(dtype='int32', broadcastable=())('myvar')
    r2 = tensor.TensorVariable(tensor.TensorType(dtype='int32', broadcastable=()))
    r3 = shared(numpy.random.randn(3, 4))
    assert r1.auto_name == "auto_0"
    assert r2.auto_name == "auto_1"
    assert r3.auto_name == "auto_2"

def test_tensorvariable(self):
    ## Get counter value
    autoname_id = next(Variable.__count__)
    Variable.__count__ = count(autoname_id)
    r1 = tensor.TensorType(dtype='int32', broadcastable=())('myvar')
    r2 = tensor.TensorVariable(tensor.TensorType(dtype='int32', broadcastable=()))
    r3 = shared(numpy.random.randn(3, 4))
    assert r1.auto_name == "auto_" + str(autoname_id)
    assert r2.auto_name == "auto_" + str(autoname_id + 1)
    assert r3.auto_name == "auto_" + str(autoname_id + 2)

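# A quick illustration (assumes a working Theano install) of the auto_name behaviour exercised by the
# two tests above: every Variable draws a fresh id from the global Variable.__count__ counter, even when
# an explicit name is given, so tests must read the counter first instead of assuming it starts at zero.
import numpy
from theano import shared, tensor

a = tensor.TensorType(dtype='int32', broadcastable=())('myvar')  # named, but still gets an auto_name
b = shared(numpy.random.randn(3, 4))
print(a.auto_name, b.auto_name)  # e.g. "auto_12 auto_13"; the exact ids depend on prior allocations
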
def __init__(self, dataset, batch_size, seed=1234):
    """
    Parameters
    ----------
    dataset : :class:`MaskClassifierDataset`
        Dataset from which to get the examples.
    batch_size : int
        Nb. of examples per batch.
    seed : int, optional
        Seed for the random generator when shuffling streamlines or adding noise to the streamlines.
    """
    self.dataset = dataset
    self.batch_size = batch_size
    self.indices = np.arange(len(self.dataset))

    self.seed = seed
    self.rng = np.random.RandomState(self.seed)

    # Shared variables
    self._shared_batch_inputs = sharedX(np.ndarray((0, 0)))
    self._shared_batch_targets = sharedX(np.ndarray((0,)))

    # Test value
    batch_inputs, batch_targets = self._next_batch(0)

    # Redefine symbolic variables for single input model
    self.dataset.symb_inputs = T.TensorVariable(type=T.TensorType("floatX", [False] * batch_inputs.ndim),
                                                name=self.dataset.name + '_symb_inputs')
    self.dataset.symb_inputs.tag.test_value = batch_inputs

    # Since this batch scheduler creates its own targets.
    if self.dataset.symb_targets is None:
        self.dataset.symb_targets = T.TensorVariable(type=T.TensorType("floatX", [False] * batch_targets.ndim),
                                                     name=self.dataset.name + '_symb_targets')

    self.dataset.symb_targets.tag.test_value = batch_targets

def __init__(self, dataset, batch_size, use_mask_as_input=False, keep_mask=False, seed=1234):
    """
    Parameters
    ----------
    dataset : `SequenceDataset` object
        Dataset of datasets (one for each bundle).
    batch_size : int
        Number of examples per batch. *Must be greater than the number of
        bundles in `bundles_dataset`.*
    seed : int (optional)
        Seed of the random numbers generator used to sample a different
        regressive mask for each example.
    """
    super().__init__(dataset, batch_size)
    self.use_mask_as_input = use_mask_as_input
    self.seed = seed
    self.rng = np.random.RandomState(self.seed)
    self.keep_mask = keep_mask

    # Allocate memory for the autoregressive mask.
    self.mask_shape = (len(dataset),) + self.dataset.input_shape
    self._shared_mask_o_lt_d = sharedX(np.zeros(self.mask_shape), name='autoregressive_mask', keep_on_cpu=True)

    # Add a new attribute: a symbolic variable representing the auto regressive mask.
    self._shared_mask_o_lt_d.set_value(self.generate_autoregressive_mask())
    self.dataset.mask_o_lt_d = T.TensorVariable(type=T.TensorType("floatX", [False] * dataset.inputs.ndim),
                                                name=dataset.name + '_symb_mask')

    # Keep only `batch_size` masks as test values.
    self.dataset.mask_o_lt_d.tag.test_value = self._shared_mask_o_lt_d.get_value()[:batch_size]  # For debugging Theano graphs.

    if self.use_mask_as_input:
        self.dataset.symb_inputs.tag.test_value = np.concatenate(
            [self.dataset.symb_inputs.tag.test_value * self.dataset.mask_o_lt_d.tag.test_value,
             self.dataset.mask_o_lt_d.tag.test_value], axis=1)

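# A hedged sketch (the actual generate_autoregressive_mask called above is not shown in this snippet)
# of how an "o < d" autoregressive mask is typically built for orderless NADE-style training: for each
# example, sample a random ordering of the D input dimensions and a split point d, then mark the
# dimensions that come before d in that ordering as observed.
import numpy as np

def sample_mask_o_lt_d(nb_examples, D, rng):
    masks = np.zeros((nb_examples, D), dtype='float32')
    for i in range(nb_examples):
        ordering = rng.permutation(D)  # a random ordering of the input dimensions
        d = rng.randint(D)             # how many dimensions are treated as already observed
        masks[i, ordering[:d]] = 1.    # 1 for observed dims (o < d), 0 for the ones to predict
    return masks

rng = np.random.RandomState(1234)
print(sample_mask_o_lt_d(2, 5, rng))
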
def __init__(self, dataset, batch_size, k, noisy_streamlines_sigma=None, nb_updates_per_epoch=None, seed=1234,
             include_last_point=False):
    self.dataset = dataset
    self.batch_size = batch_size
    self.k = k
    self.include_last_point = include_last_point

    self.use_augment_by_flipping = True

    self._nb_updates_per_epoch = nb_updates_per_epoch
    self.use_sample_from_bundle = self._nb_updates_per_epoch is not None

    self.noisy_streamlines_sigma = noisy_streamlines_sigma
    self.use_noisy_streamlines = self.noisy_streamlines_sigma is not None

    self.seed = seed
    self.rng = np.random.RandomState(self.seed)
    self.rng_noise = np.random.RandomState(self.seed + 1)

    # No need for a mask since streamlines are going to be resampled.
    self.dataset.symb_mask = None

    # Shared variables
    self._shared_batch_inputs = sharedX(np.ndarray((0, 0, 0)))
    self._shared_batch_targets = sharedX(np.ndarray((0, 0, 0, 0)))

    # Test value
    batch_inputs, batch_targets = self._next_batch(0)
    self.dataset.symb_inputs.tag.test_value = batch_inputs

    # Since this batch scheduler creates its own targets.
    if self.dataset.symb_targets is None:
        self.dataset.symb_targets = T.TensorVariable(type=T.TensorType("floatX", [False] * batch_targets.ndim),
                                                     name=self.dataset.name + '_symb_targets')

    self.dataset.symb_targets.tag.test_value = batch_targets

def __init__(self, inputs, targets=None, name="dataset", keep_on_cpu=False):
    """
    Parameters
    ----------
    inputs : list of ndarray
        Training examples (can be variable length sequences).
    targets : ndarray (optional)
        Target for each training example (can be variable length sequences).
    name : str (optional)
        The name of the dataset is used to name Theano variables. Default: 'dataset'.
    keep_on_cpu : bool (optional)
        Whether to keep the data on the CPU. Default: False.
    """
    super().__init__(inputs, targets, name, keep_on_cpu)
    self.symb_mask = T.TensorVariable(type=T.TensorType("floatX", [False] * inputs[0].ndim),
                                      name=self.name + '_symb_mask')
    self.symb_mask.tag.test_value = (inputs[0][:, 0] > 0.5).astype(floatX)[None, ...]  # For debugging Theano graphs.

def test_th_matmul():
    vlist = []
    flist = []
    ndlist = []

    for i in range(2, 30):
        dims = int(np.random.random() * 4 + 2)

        # Create a tuple of tensors with potentially different broadcastability.
        vs = tuple(
            tt.TensorVariable(
                tt.TensorType('float64',
                              tuple((p < .3) for p in np.random.ranf(dims - 2))  # Make full matrices
                              + (False, False)))
            for _ in range(2))
        vs = tuple(tt.swapaxes(v, -2, -1) if j % 2 == 0 else v
                   for j, v in enumerate(vs))

        f = th.function([*vs], [matmul(*vs)])

        # Create the default shape for the test ndarrays
        defshape = tuple(int(np.random.random() * 5 + 1) for _ in range(dims))

        # Create a test array matching the broadcastability of each v, for each v.
        nds = tuple(np.random.ranf(tuple(s if not v.broadcastable[j] else 1
                                         for j, s in enumerate(defshape)))
                    for v in vs)
        nds = tuple(np.swapaxes(nd, -2, -1) if j % 2 == 0 else nd
                    for j, nd in enumerate(nds))

        ndlist.append(nds)
        vlist.append(vs)
        flist.append(f)

    for i in range(len(ndlist)):
        assert np.allclose(flist[i](*ndlist[i]), np.matmul(*ndlist[i]))

def fit(self, X, bounds=None, constraints=None, use_gradient=True, **kwargs):
    # Map parameters to placeholders
    param_to_placeholder = []
    param_to_index = {}

    for i, v in enumerate(self.parameters_):
        w = T.TensorVariable(v.type)
        param_to_placeholder.append((v, w))
        param_to_index[v] = i

    # Build bounds
    mapped_bounds = None

    if bounds is not None:
        mapped_bounds = [(None, None) for v in param_to_placeholder]

        for b in bounds:
            mapped_bounds[param_to_index[b["param"]]] = b["bounds"]

    # Build constraints
    mapped_constraints = None

    if constraints is not None:
        mapped_constraints = []

        for c in constraints:
            args = c["param"]
            if isinstance(args, SharedVariable):
                args = (args,)

            m_c = {
                "type": c["type"],
                "fun": lambda x: c["fun"](*[x[param_to_index[a]] for a in args])
            }

            if "jac" in c:
                m_c["jac"] = lambda x: c["jac"](*[x[param_to_index[a]] for a in args])

            mapped_constraints.append(m_c)

    # Derive objective and gradient
    objective_ = theano.function(
        [self.X] + [w for _, w in param_to_placeholder] +
        [theano.In(v, name=v.name) for v in self.observeds_],
        T.sum(self.nnlf_),
        givens=param_to_placeholder,
        allow_input_downcast=True)

    def objective(x):
        return objective_(X, *x, **kwargs) / len(X)

    if use_gradient:
        gradient_ = theano.function(
            [self.X] + [w for _, w in param_to_placeholder] +
            [theano.In(v, name=v.name) for v in self.observeds_],
            theano.grad(T.sum(self.nnlf_),
                        [v for v, _ in param_to_placeholder]),
            givens=param_to_placeholder,
            allow_input_downcast=True)

        def gradient(x):
            return np.array(gradient_(X, *x, **kwargs)) / len(X)

    # Solve!
    x0 = np.array([v.get_value() for v, _ in param_to_placeholder])
    r = minimize(objective,
                 jac=gradient if use_gradient else None,
                 x0=x0,
                 method=self.optimizer,
                 bounds=mapped_bounds,
                 constraints=mapped_constraints)

    if r.success:
        # Assign the solution
        for i, value in enumerate(r.x):
            param_to_placeholder[i][0].set_value(value)

    else:
        print("Parameter fitting failed!")
        print(r)

    return self

def fit(self, X, bounds=None, constraints=None, use_gradient=True, optimizer=None, **kwargs):
    """Fit the distribution parameters to data by minimizing the negative
    log-likelihood of the data.

    Parameters
    ----------
    * `X` [array-like, shape=(n_samples, n_features)]:
        The samples.

    * `bounds` [list of dicts]:
        The parameter bounds, each given as `{"param": parameter, "bounds": (low, high)}`.

    * `constraints` [list of dicts]:
        The constraints on the parameters, each given as a dict with a `"param"` key (a parameter
        or a tuple of parameters) plus the `"type"`, `"fun"` and optional `"jac"` keys expected by
        `scipy.optimize.minimize`.

    * `use_gradient` [boolean, default=True]:
        Whether to use exact gradients (if `True`) or numerical gradients (if `False`).

    * `optimizer` [string]:
        The optimization method.

    Returns
    -------
    * `self` [object]:
        `self`.
    """
    # Map parameters to placeholders
    param_to_placeholder = []
    param_to_index = {}

    for i, v in enumerate(self.parameters_):
        w = T.TensorVariable(v.type)
        param_to_placeholder.append((v, w))
        param_to_index[v] = i

    # Build bounds
    mapped_bounds = None

    if bounds is not None:
        mapped_bounds = [(None, None) for v in param_to_placeholder]

        for b in bounds:
            mapped_bounds[param_to_index[b["param"]]] = b["bounds"]

    # Build constraints
    mapped_constraints = None

    if constraints is not None:
        mapped_constraints = []

        for c in constraints:
            args = c["param"]
            if isinstance(args, SharedVariable):
                args = (args,)

            m_c = {
                "type": c["type"],
                "fun": lambda x: c["fun"](*[x[param_to_index[a]] for a in args])
            }

            if "jac" in c:
                m_c["jac"] = lambda x: c["jac"](*[x[param_to_index[a]] for a in args])

            mapped_constraints.append(m_c)

    # Derive objective and gradient
    objective_ = theano.function(
        [self.X] + [w for _, w in param_to_placeholder] +
        [theano.In(v, name=v.name) for v in self.observeds_],
        T.sum(self.nll_),
        givens=param_to_placeholder,
        allow_input_downcast=True)

    def objective(x):
        return objective_(X, *x, **kwargs) / len(X)

    if use_gradient:
        gradient_ = theano.function(
            [self.X] + [w for _, w in param_to_placeholder] +
            [theano.In(v, name=v.name) for v in self.observeds_],
            theano.grad(T.sum(self.nll_),
                        [v for v, _ in param_to_placeholder]),
            givens=param_to_placeholder,
            allow_input_downcast=True)

        def gradient(x):
            return np.array(gradient_(X, *x, **kwargs)) / len(X)

    # Solve!
    x0 = np.array([v.get_value() for v, _ in param_to_placeholder])
    r = minimize(objective,
                 jac=gradient if use_gradient else None,
                 x0=x0,
                 method=optimizer,
                 bounds=mapped_bounds,
                 constraints=mapped_constraints)

    if r.success:
        # Assign the solution
        for i, value in enumerate(r.x):
            param_to_placeholder[i][0].set_value(value)

    else:
        print("Parameter fitting failed!")
        print(r)

    return self

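# Hypothetical usage sketch (the parameter names `mu`/`sigma` and the `distribution` object are
# illustrative, not from the source) showing the structure fit() above expects for `bounds` and
# `constraints`: each entry is a dict whose "param" key is the Theano shared variable holding a
# distribution parameter, and the remaining keys follow scipy.optimize.minimize conventions.
import theano

mu = theano.shared(0.0, name="mu")
sigma = theano.shared(1.0, name="sigma")

bounds = [{"param": sigma, "bounds": (1e-6, None)}]  # keep sigma strictly positive
constraints = [{"param": sigma,                      # the constraint acts on sigma's value
                "type": "ineq",
                "fun": lambda sigma_value: sigma_value - 1e-6}]

# distribution.fit(X, bounds=bounds, constraints=constraints)  # assuming `distribution` exposes fit()
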
def __init__(self, dataset, batch_size, noisy_streamlines_sigma=None, seed=1234, use_data_augment=True,
             normalize_target=False, shuffle_streamlines=True, resample_streamlines=True,
             feed_previous_direction=False):
    """
    Parameters
    ----------
    dataset : :class:`TractographyDataset`
        Dataset from which to get the examples.
    batch_size : int
        Nb. of examples per batch.
    seed : int, optional
        Seed for the random generator when shuffling streamlines or adding noise to the streamlines.
    use_data_augment : bool
        If true, perform data augmentation by flipping streamlines.
    normalize_target : bool
        If true, targets will have a norm of one (usually used by the GruRegression model).
    shuffle_streamlines : bool
        Shuffle streamlines in the dataset between each epoch.
    resample_streamlines : bool
        Streamlines in the same batch will all have the same number of points.
        Should always be set to True for now (until the method _process_batch supports it).
    feed_previous_direction : bool
        Should the previous direction be appended to the input when making a prediction?
    """
    self.dataset = dataset
    self.batch_size = batch_size
    self.normalize_target = normalize_target

    self.noisy_streamlines_sigma = noisy_streamlines_sigma
    self.use_noisy_streamlines = self.noisy_streamlines_sigma is not None

    # Parameter use_data_augment cannot be used with a FFNN model (or any other non-recurrent model)
    # without feed_previous_direction, because the targets are flipped but the inputs stay the same.
    self.use_augment_by_flipping = feed_previous_direction and use_data_augment

    self.seed = seed
    self.rng = np.random.RandomState(self.seed)
    self.rng_noise = np.random.RandomState(self.seed + 1)
    self.shuffle_streamlines = shuffle_streamlines
    self.resample_streamlines = resample_streamlines
    self.indices = np.arange(len(self.dataset))

    self.feed_previous_direction = feed_previous_direction

    # Shared variables
    self._shared_batch_inputs = sharedX(np.ndarray((0, 0)))
    self._shared_batch_targets = sharedX(np.ndarray((0, 0)))

    # Test value
    batch_inputs, batch_targets = self._next_batch(0)

    # Redefine symbolic variables for single input model
    self.dataset.symb_inputs = T.TensorVariable(type=T.TensorType("floatX", [False] * batch_inputs.ndim),
                                                name=self.dataset.name + '_symb_inputs')
    self.dataset.symb_inputs.tag.test_value = batch_inputs

    # Since this batch scheduler creates its own targets.
    if self.dataset.symb_targets is None:
        self.dataset.symb_targets = T.TensorVariable(type=T.TensorType("floatX", [False] * batch_targets.ndim),
                                                     name=self.dataset.name + '_symb_targets')

    self.dataset.symb_targets.tag.test_value = batch_targets

def __init__(self, dataset, batch_size, noisy_streamlines_sigma=None, seed=1234, use_data_augment=True,
             normalize_target=False, shuffle_streamlines=True, resample_streamlines=True,
             feed_previous_direction=False, sort_streamlines_by_length=False, learn_to_stop=False):
    """
    Parameters
    ----------
    dataset : :class:`TractographyDataset`
        Dataset from which to get the examples.
    batch_size : int
        Nb. of examples per batch.
    seed : int, optional
        Seed for the random generator when shuffling streamlines or adding noise to the streamlines.
    use_data_augment : bool
        If true, perform data augmentation by flipping streamlines.
    normalize_target : bool
        If true, targets will have a norm of one (usually used by the GruRegression model).
    shuffle_streamlines : bool
        Shuffle streamlines in the dataset between each epoch.
    resample_streamlines : bool
        Streamlines in the same batch will all have the same number of points.
        Should always be set to True for now (until the method _process_batch supports it).
    feed_previous_direction : bool
        Should the previous direction be appended to the input when making a prediction?
    sort_streamlines_by_length : bool
        Streamlines will be approximately regrouped according to their length.
    learn_to_stop : bool
        Predict whether the streamline being generated should stop or not.
    """
    self.dataset = dataset
    self.batch_size = batch_size
    self.use_augment_by_flipping = use_data_augment
    self.normalize_target = normalize_target

    self.noisy_streamlines_sigma = noisy_streamlines_sigma
    self.use_noisy_streamlines = self.noisy_streamlines_sigma is not None

    self.seed = seed
    self.rng = np.random.RandomState(self.seed)
    self.rng_noise = np.random.RandomState(self.seed + 1)
    self.shuffle_streamlines = shuffle_streamlines
    self.resample_streamlines = resample_streamlines
    self.sort_streamlines_by_length = sort_streamlines_by_length
    self.feed_previous_direction = feed_previous_direction
    self.learn_to_stop = learn_to_stop

    # Sort streamlines according to their length by default.
    # This should speed up validation.
    self.indices = np.argsort(self.dataset.streamlines._lengths)

    # Shared variables
    self._shared_batch_inputs = sharedX(np.ndarray((0, 0, 0)))
    self._shared_batch_targets = sharedX(np.ndarray((0, 0, 0)))
    self._shared_batch_mask = sharedX(np.ndarray((0, 0)))

    # Test value
    batch_inputs, batch_targets, batch_mask = self._next_batch(0)
    self.dataset.symb_inputs.tag.test_value = batch_inputs
    self.dataset.symb_mask.tag.test_value = batch_mask

    # Since this batch scheduler creates its own targets.
    if self.dataset.symb_targets is None:
        self.dataset.symb_targets = T.TensorVariable(type=T.TensorType("floatX", [False] * batch_targets.ndim),
                                                     name=self.dataset.name + '_symb_targets')

    self.dataset.symb_targets.tag.test_value = batch_targets

def __init__(self, dataset, batch_size, batch_id, ordering_id, use_mask_as_input=False, seed=1234):
    """
    Parameters
    ----------
    dataset : `SequenceDataset` object
        Dataset of datasets (one for each bundle).
    batch_size : int
        Number of examples per batch. *Must be greater than the number of
        bundles in `bundles_dataset`.*
    seed : int (optional)
        Seed of the random numbers generator used to sample a different
        regressive mask for each example.
    """
    super().__init__(dataset)
    self.use_mask_as_input = use_mask_as_input
    self.seed = seed
    self.rng = np.random.RandomState(self.seed)
    self.batch_size = batch_size
    self.batch_id = batch_id
    self.ordering_id = ordering_id

    # Determine the start and the end of the batch that will be used by this batch scheduler.
    assert batch_id * self.batch_size < len(self.dataset)
    self.batch_start = batch_id * self.batch_size
    self.batch_end = min((batch_id + 1) * self.batch_size, len(dataset))

    # Determine the ordering that will be used by this batch scheduler.
    self.d = 0
    self.D = self.dataset.input_shape[0]
    self.ordering = np.arange(self.D)
    for _ in range(ordering_id + 1):
        self.rng.shuffle(self.ordering)

    # Matrix mask that will be used when concatenating the mask.
    self._shared_Moltd = sharedX(np.zeros((self.batch_end - self.batch_start, self.D)), name='Moltd')

    # Vector mask that will be broadcasted across all inputs.
    # self._shared_mod = sharedX(np.zeros((1, self.D)), name='mod')
    self._shared_mod = sharedX(np.zeros((self.D,)), name='mod')

    # Add new attributes: symbolic variables representing the auto regressive mask.
    self.change_masks(self.d)
    self.Moltd = T.TensorVariable(type=T.TensorType("floatX", [False] * dataset.inputs.ndim),
                                  name="symb_Moltd")
    self.mod = T.TensorVariable(type=T.TensorType("floatX", [True, False]),
                                name="symb_mod")

    # Keep only `(self.batch_end - self.batch_start)` examples as test values.
    self.dataset.symb_inputs.tag.test_value = self.dataset.inputs.get_value()[:(self.batch_end - self.batch_start)]
    if self.dataset.has_targets:
        self.dataset.symb_targets.tag.test_value = self.dataset.targets.get_value()[:(self.batch_end - self.batch_start)]

    self.Moltd.tag.test_value = self._shared_Moltd.get_value()[:(self.batch_end - self.batch_start)]
    self.mod.tag.test_value = self._shared_mod.get_value()[None, :]

    if self.use_mask_as_input:
        self.dataset.symb_inputs.tag.test_value = np.concatenate(
            [self.dataset.symb_inputs.tag.test_value * self.Moltd.tag.test_value,
             self.Moltd.tag.test_value], axis=1)