def __init__(self):
    self.W1 = [sharedX(rng.randn(num_features, chunk_width))
               for i in xrange(num_chunks)]
    disturb_mem.disturb_mem()
    self.W2 = [sharedX(rng.randn(chunk_width))
               for i in xrange(num_chunks)]
    self._params = safe_union(self.W1, self.W2)
    self.input_space = VectorSpace(num_features)
    self.output_space = VectorSpace(1)
def __call__(self, model, X, Y=None, **kwargs):
    disturb_mem.disturb_mem()

    def mlp_pred(non_linearity):
        Z = [T.dot(X, W) for W in model.W1]
        H = map(non_linearity, Z)
        Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
        pred = sum(Z)
        return pred

    nonlinearity_predictions = map(mlp_pred,
                                   [T.nnet.sigmoid, T.nnet.softplus,
                                    T.sqr, T.sin])
    pred = sum(nonlinearity_predictions)
    disturb_mem.disturb_mem()

    return abs(pred - Y[:, 0]).sum()
def make_dataset(num_batches):
    disturb_mem.disturb_mem()
    m = num_batches * batch_size
    X = rng.randn(m, num_features)
    y = np.zeros((m, 1))
    y[:, 0] = np.dot(X, w) > 0.

    rval = DenseDesignMatrix(X=X, y=y)

    rval.yaml_src = ""  # suppress no yaml_src warning

    X = rval.get_batch_design(batch_size)
    assert X.shape == (batch_size, num_features)

    return rval
def expr(self, model, data, **kwargs):
    self.get_data_specs(model)[0].validate(data)
    X, Y = data
    disturb_mem.disturb_mem()

    def mlp_pred(non_linearity):
        Z = [T.dot(X, W) for W in model.W1]
        H = map(non_linearity, Z)
        Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
        pred = sum(Z)
        return pred

    nonlinearity_predictions = map(mlp_pred,
                                   [T.nnet.sigmoid, T.nnet.softplus,
                                    T.sqr, T.sin])
    pred = sum(nonlinearity_predictions)
    disturb_mem.disturb_mem()

    return abs(pred - Y[:, 0]).sum()
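# Companion method this expr signature assumes (a sketch: the same
# definition is used by the expr-based costs in run_sgd/run_bgd below,
# where the Cost declares which spaces and sources it consumes).
def get_data_specs(self, model):
    data = CompositeSpace((model.get_input_space(),
                           model.get_output_space()))
    source = (model.get_input_source(), model.get_target_source())
    return (data, source)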
def run(replay, log=None):
    if not replay:
        log = StringIO()
    else:
        log = StringIO(log)
    record = Record(replay=replay, file_object=log)

    disturb_mem.disturb_mem()

    mode = RecordMode(record=record)

    b = sharedX(np.zeros((2,)), name='b')
    channels = OrderedDict()

    disturb_mem.disturb_mem()

    v_max = b.max(axis=0)
    v_min = b.min(axis=0)
    v_range = v_max - v_min

    updates = []
    for i, val in enumerate([
            v_max.max(),
            v_max.min(),
            v_range.max(),
            ]):
        disturb_mem.disturb_mem()
        s = sharedX(0., name='s_' + str(i))
        updates.append((s, val))

    # Strip automatically generated names so only 'b' and the 's_i'
    # shared variables keep theirs.
    for var in theano.gof.graph.ancestors(update for _, update in updates):
        if var.name is not None and var.name != 'b':
            if var.name[0] != 's' or len(var.name) != 2:
                var.name = None

    # channels is empty here, so this loop is a no-op; it is kept from the
    # original test.
    for key in channels:
        updates.append((s, channels[key]))
    f = theano.function([], mode=mode, updates=updates,
                        on_unused_input='ignore', name='f')
    for output in f.maker.fgraph.outputs:
        mode.record.handle_line(var_descriptor(output) + '\n')
    disturb_mem.disturb_mem()
    f()
    mode.record.f.flush()

    if not replay:
        return log.getvalue()
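# Usage sketch (assumed, not part of the original code): `run` records a log
# of the computation on the first pass and replays it on the second, so a
# determinism check looks roughly like this.  RecordMode is expected to
# signal a mismatch if the replayed run diverges from the recording.
recorded_log = run(replay=False)
run(replay=True, log=recorded_log)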
def run_sgd(mode):
    # Must be seeded the same both times run_sgd is called
    disturb_mem.disturb_mem()
    rng = np.random.RandomState([2012, 11, 27])

    batch_size = 5
    train_batches = 3
    valid_batches = 4
    num_features = 2

    # Synthesize dataset with a linear decision boundary
    w = rng.randn(num_features)

    def make_dataset(num_batches):
        disturb_mem.disturb_mem()
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = np.zeros((m, 1))
        y[:, 0] = np.dot(X, w) > 0.

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        X = rval.get_batch_design(batch_size)
        assert X.shape == (batch_size, num_features)

        return rval

    train = make_dataset(train_batches)
    valid = make_dataset(valid_batches)

    num_chunks = 10
    chunk_width = 2

    class ManyParamsModel(Model):
        """
        Make a model with lots of parameters, so that there are many
        opportunities for their updates to get accidentally re-ordered
        non-deterministically. This makes non-determinism bugs manifest
        more frequently.
        """

        def __init__(self):
            self.W1 = [sharedX(rng.randn(num_features, chunk_width))
                       for i in xrange(num_chunks)]
            disturb_mem.disturb_mem()
            self.W2 = [sharedX(rng.randn(chunk_width))
                       for i in xrange(num_chunks)]
            self._params = safe_union(self.W1, self.W2)
            self.input_space = VectorSpace(num_features)
            self.output_space = VectorSpace(1)

    disturb_mem.disturb_mem()
    model = ManyParamsModel()
    disturb_mem.disturb_mem()

    class LotsOfSummingCost(Cost):
        """
        Make a cost whose gradient on the parameters involves summing many
        terms together, so that T.grad is more likely to sum things in a
        random order.
        """

        supervised = True

        def __call__(self, model, X, Y=None, **kwargs):
            disturb_mem.disturb_mem()

            def mlp_pred(non_linearity):
                Z = [T.dot(X, W) for W in model.W1]
                H = map(non_linearity, Z)
                Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
                pred = sum(Z)
                return pred

            nonlinearity_predictions = map(mlp_pred,
                                           [T.nnet.sigmoid, T.nnet.softplus,
                                            T.sqr, T.sin])
            pred = sum(nonlinearity_predictions)
            disturb_mem.disturb_mem()

            return abs(pred - Y[:, 0]).sum()

    cost = LotsOfSummingCost()

    disturb_mem.disturb_mem()

    algorithm = SGD(cost=cost,
                    batch_size=batch_size,
                    init_momentum=.5,
                    learning_rate=1e-3,
                    monitoring_dataset={'train': train, 'valid': valid},
                    update_callbacks=[ExponentialDecay(decay_factor=2.,
                                                       min_lr=.0001)],
                    termination_criterion=EpochCounter(max_epochs=5))

    disturb_mem.disturb_mem()

    train_object = Train(
        dataset=train,
        model=model,
        algorithm=algorithm,
        extensions=[
            PolyakAveraging(start=0),
            MomentumAdjustor(final_momentum=.9, start=1, saturate=5),
        ],
        save_freq=0)

    disturb_mem.disturb_mem()

    train_object.main_loop()
def run_bgd(mode):
    # Must be seeded the same both times run_bgd is called
    disturb_mem.disturb_mem()
    rng = np.random.RandomState([2012, 11, 27, 8])

    batch_size = 5
    train_batches = 3
    valid_batches = 4
    num_features = 2

    # Synthesize dataset with a linear decision boundary
    w = rng.randn(num_features)

    def make_dataset(num_batches):
        disturb_mem.disturb_mem()
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = np.zeros((m, 1))
        y[:, 0] = np.dot(X, w) > 0.

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        X = rval.get_batch_design(batch_size)
        assert X.shape == (batch_size, num_features)

        return rval

    train = make_dataset(train_batches)
    valid = make_dataset(valid_batches)

    num_chunks = 10
    chunk_width = 2

    class ManyParamsModel(Model):
        """
        Make a model with lots of parameters, so that there are many
        opportunities for their updates to get accidentally re-ordered
        non-deterministically. This makes non-determinism bugs manifest
        more frequently.
        """

        def __init__(self):
            super(ManyParamsModel, self).__init__()
            self.W1 = [sharedX(rng.randn(num_features, chunk_width))
                       for i in xrange(num_chunks)]
            disturb_mem.disturb_mem()
            self.W2 = [sharedX(rng.randn(chunk_width))
                       for i in xrange(num_chunks)]
            self._params = safe_union(self.W1, self.W2)
            self.input_space = VectorSpace(num_features)
            self.output_space = VectorSpace(1)

    disturb_mem.disturb_mem()
    model = ManyParamsModel()
    disturb_mem.disturb_mem()

    class LotsOfSummingCost(Cost):
        """
        Make a cost whose gradient on the parameters involves summing many
        terms together, so that T.grad is more likely to sum things in a
        random order.
        """

        supervised = True

        def expr(self, model, data, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            X, Y = data
            disturb_mem.disturb_mem()

            def mlp_pred(non_linearity):
                Z = [T.dot(X, W) for W in model.W1]
                H = [non_linearity(z) for z in Z]
                Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
                pred = sum(Z)
                return pred

            nonlinearity_predictions = map(mlp_pred,
                                           [T.nnet.sigmoid, T.nnet.softplus,
                                            T.sqr, T.sin])
            pred = sum(nonlinearity_predictions)
            disturb_mem.disturb_mem()

            return abs(pred - Y[:, 0]).sum()

        def get_data_specs(self, model):
            data = CompositeSpace((model.get_input_space(),
                                   model.get_output_space()))
            source = (model.get_input_source(), model.get_target_source())
            return (data, source)

    cost = LotsOfSummingCost()

    disturb_mem.disturb_mem()

    algorithm = BGD(cost=cost,
                    batch_size=batch_size,
                    updates_per_batch=5,
                    scale_step=.5,
                    conjugate=1,
                    reset_conjugate=0,
                    monitoring_dataset={'train': train, 'valid': valid},
                    termination_criterion=EpochCounter(max_epochs=5))

    disturb_mem.disturb_mem()

    train_object = Train(
        dataset=train,
        model=model,
        algorithm=algorithm,
        save_freq=0)

    disturb_mem.disturb_mem()

    train_object.main_loop()
def run_sgd(mode):
    # Must be seeded the same both times run_sgd is called
    disturb_mem.disturb_mem()
    rng = np.random.RandomState([2012, 11, 27])

    batch_size = 5
    train_batches = 3
    valid_batches = 4
    num_features = 2

    # Synthesize dataset with a linear decision boundary
    w = rng.randn(num_features)

    def make_dataset(num_batches):
        disturb_mem.disturb_mem()
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = np.zeros((m, 1))
        y[:, 0] = np.dot(X, w) > 0.

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        X = rval.get_batch_design(batch_size)
        assert X.shape == (batch_size, num_features)

        return rval

    train = make_dataset(train_batches)
    valid = make_dataset(valid_batches)

    num_chunks = 10
    chunk_width = 2

    class ManyParamsModel(Model):
        """
        Make a model with lots of parameters, so that there are many
        opportunities for their updates to get accidentally re-ordered
        non-deterministically. This makes non-determinism bugs manifest
        more frequently.
        """

        def __init__(self):
            self.W1 = [sharedX(rng.randn(num_features, chunk_width))
                       for i in xrange(num_chunks)]
            disturb_mem.disturb_mem()
            self.W2 = [sharedX(rng.randn(chunk_width))
                       for i in xrange(num_chunks)]
            self._params = safe_union(self.W1, self.W2)
            self.input_space = VectorSpace(num_features)
            self.output_space = VectorSpace(1)

    disturb_mem.disturb_mem()
    model = ManyParamsModel()
    disturb_mem.disturb_mem()

    class LotsOfSummingCost(Cost):
        """
        Make a cost whose gradient on the parameters involves summing many
        terms together, so that T.grad is more likely to sum things in a
        random order.
        """

        supervised = True

        def expr(self, model, data, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            X, Y = data
            disturb_mem.disturb_mem()

            def mlp_pred(non_linearity):
                Z = [T.dot(X, W) for W in model.W1]
                H = map(non_linearity, Z)
                Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
                pred = sum(Z)
                return pred

            nonlinearity_predictions = map(mlp_pred,
                                           [T.nnet.sigmoid, T.nnet.softplus,
                                            T.sqr, T.sin])
            pred = sum(nonlinearity_predictions)
            disturb_mem.disturb_mem()

            return abs(pred - Y[:, 0]).sum()

        def get_data_specs(self, model):
            data = CompositeSpace((model.get_input_space(),
                                   model.get_output_space()))
            source = (model.get_input_source(), model.get_target_source())
            return (data, source)

    cost = LotsOfSummingCost()

    disturb_mem.disturb_mem()

    algorithm = SGD(cost=cost,
                    batch_size=batch_size,
                    init_momentum=.5,
                    learning_rate=1e-3,
                    monitoring_dataset={'train': train, 'valid': valid},
                    update_callbacks=[ExponentialDecay(decay_factor=2.,
                                                       min_lr=.0001)],
                    termination_criterion=EpochCounter(max_epochs=5))

    disturb_mem.disturb_mem()

    train_object = Train(
        dataset=train,
        model=model,
        algorithm=algorithm,
        extensions=[
            PolyakAveraging(start=0),
            MomentumAdjustor(final_momentum=.9, start=1, saturate=5),
        ],
        save_freq=0)

    disturb_mem.disturb_mem()

    train_object.main_loop()
def run_bgd(mode):
    # Must be seeded the same both times run_bgd is called
    disturb_mem.disturb_mem()
    rng = np.random.RandomState([2012, 11, 27, 8])

    batch_size = 5
    train_batches = 3
    valid_batches = 4
    num_features = 2

    # Synthesize dataset with a linear decision boundary
    w = rng.randn(num_features)

    def make_dataset(num_batches):
        disturb_mem.disturb_mem()
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = np.zeros((m, 1))
        y[:, 0] = np.dot(X, w) > 0.

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        X = rval.get_batch_design(batch_size)
        assert X.shape == (batch_size, num_features)

        return rval

    train = make_dataset(train_batches)
    valid = make_dataset(valid_batches)

    num_chunks = 10
    chunk_width = 2

    class ManyParamsModel(Model):
        """
        Make a model with lots of parameters, so that there are many
        opportunities for their updates to get accidentally re-ordered
        non-deterministically. This makes non-determinism bugs manifest
        more frequently.
        """

        def __init__(self):
            self.W1 = [sharedX(rng.randn(num_features, chunk_width))
                       for i in xrange(num_chunks)]
            disturb_mem.disturb_mem()
            self.W2 = [sharedX(rng.randn(chunk_width))
                       for i in xrange(num_chunks)]
            self._params = safe_union(self.W1, self.W2)
            self.input_space = VectorSpace(num_features)
            self.output_space = VectorSpace(1)

    disturb_mem.disturb_mem()
    model = ManyParamsModel()
    disturb_mem.disturb_mem()

    class LotsOfSummingCost(Cost):
        """
        Make a cost whose gradient on the parameters involves summing many
        terms together, so that T.grad is more likely to sum things in a
        random order.
        """

        supervised = True

        def __call__(self, model, X, Y=None, **kwargs):
            disturb_mem.disturb_mem()

            def mlp_pred(non_linearity):
                Z = [T.dot(X, W) for W in model.W1]
                H = map(non_linearity, Z)
                Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
                pred = sum(Z)
                return pred

            nonlinearity_predictions = map(mlp_pred,
                                           [T.nnet.sigmoid, T.nnet.softplus,
                                            T.sqr, T.sin])
            pred = sum(nonlinearity_predictions)
            disturb_mem.disturb_mem()

            return abs(pred - Y[:, 0]).sum()

    cost = LotsOfSummingCost()

    disturb_mem.disturb_mem()

    algorithm = BGD(cost=cost,
                    batch_size=batch_size,
                    updates_per_batch=5,
                    scale_step=.5,
                    conjugate=1,
                    reset_conjugate=0,
                    monitoring_dataset={'train': train, 'valid': valid},
                    termination_criterion=EpochCounter(max_epochs=5))

    disturb_mem.disturb_mem()

    train_object = Train(
        dataset=train,
        model=model,
        algorithm=algorithm,
        save_freq=0)

    disturb_mem.disturb_mem()

    train_object.main_loop()
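# Usage sketch (assumed, not part of the original code): the determinism
# check typically calls run_sgd (or run_bgd) twice with the same seed, once
# recording under a RecordMode and once replaying the captured log, so any
# nondeterministic re-ordering of updates shows up as a record/replay
# mismatch.
output = StringIO()
record = Record(file_object=output, replay=False)
run_sgd(RecordMode(record=record))

playback = Record(file_object=StringIO(output.getvalue()), replay=True)
run_sgd(RecordMode(record=playback))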