def Dropout(p_drop, inputs):
    """
    Drop each input randomly with probability `p_drop`, and scale the
    remaining ones by 1 / (1 - p_drop) so that the expected value of each
    activation is unchanged.
    This op doesn't yet support a test-time mode (where all inputs are kept).
    """
    srng = RandomStreams(seed=234)
    scaled_inputs = inputs / swft.floatX(1 - p_drop)
    return scaled_inputs * srng.binomial(
        inputs.shape,
        p=swft.floatX(1 - p_drop),
        dtype=theano.config.floatX
    )
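# A minimal usage sketch: drop half of a layer's activations during
# training. `hidden` is a hypothetical symbolic variable standing in for
# an upstream layer's output; note there is no test-time mode yet, so the
# resulting graph is train-only.
dropped_hidden = Dropout(0.5, hidden)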
def generator(n_samples):
    noise = theano_srng.uniform(
        size=(n_samples, 100),
        low=-swft.floatX(numpy.sqrt(3)),
        high=swft.floatX(numpy.sqrt(3))
    )
    output = ReLULayer('Generator.1', 100, 1200, noise)
    output = ReLULayer('Generator.2', 1200, 1200, output)
    output = ReLULayer('Generator.3', 1200, 1200, output)
    output = ReLULayer('Generator.4', 1200, 1200, output)
    return T.nnet.sigmoid(
        swft.ops.Linear(
            'Generator.5', 1200, 784, output,
            initialization=('uniform', 0.05)
        )
    )
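# Compiling a sampler from the generator; this mirrors the `_sample_fn`
# defined further down and produces 100 fake MNIST images per call:
sample_fn = theano.function([], generator(100))
samples = sample_fn()  # shape (100, 784), values in (0, 1)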
def step(current_processed_input, last_hidden):
    # GRU step: the input has already been projected by the input-to-hidden
    # weights, so its first 2*hidden_dim columns feed the update and reset
    # gates and the remaining hidden_dim columns feed the candidate state.
    gates = T.nnet.sigmoid(
        swft.ops.Linear(
            name + '.Recurrent_Gates',
            hidden_dim,
            2 * hidden_dim,
            last_hidden,
            biases=False
        ) + current_processed_input[:, :2 * hidden_dim]
    )

    update = gates[:, :hidden_dim]
    reset = gates[:, hidden_dim:]

    scaled_hidden = reset * last_hidden

    candidate = T.tanh(
        swft.ops.Linear(
            name + '.Recurrent_Candidate',
            hidden_dim,
            hidden_dim,
            scaled_hidden,
            biases=False,
            initialization='orthogonal'
        ) + current_processed_input[:, 2 * hidden_dim:]
    )

    one = swft.floatX(1.0)
    # Interpolate between the candidate and the previous hidden state.
    return (update * candidate) + ((one - update) * last_hidden)
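# A hedged sketch of driving step() with theano.scan. It assumes
# `processed_inputs` is a (time, batch, 3 * hidden_dim) tensor of
# already-projected inputs and `h0` is the initial hidden state; both
# names are assumptions, not part of the snippet above.
hiddens, _ = theano.scan(
    step,
    sequences=[processed_inputs],
    outputs_info=[h0]
)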
def BatchNormalize(name, input_dim, inputs, stepwise=False):
    """
    Batch normalization. By default, normalizes across all but the last axis.
    Set `stepwise` to True if you're batch-norming an RNN and want to
    normalize each timestep separately (e.g. for a language model, where you
    can't let information from step `t+1` leak into step `t`).
    """
    if stepwise:
        means = inputs.mean(axis=1, keepdims=True)
        variances = inputs.var(axis=1, keepdims=True)
    else:
        means = inputs.reshape((-1, input_dim)).mean(axis=0)
        variances = inputs.reshape((-1, input_dim)).var(axis=0)

    beta = swft.param(
        name + '.beta',
        numpy.zeros(input_dim, dtype='float32')
    )
    gamma = swft.param(
        name + '.gamma',
        numpy.ones(input_dim, dtype='float32')
    )

    stdevs = T.sqrt(variances + swft.floatX(1e-4))
    return (inputs - means) * (gamma / stdevs) + beta
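# A hedged usage sketch: normalize a linear layer's pre-activations before
# the nonlinearity (the layer name and sizes here are hypothetical; the
# Linear biases are redundant under batch norm's beta, hence biases=False;
# rectify is defined below):
pre_activation = swft.ops.Linear('Example.Linear', 784, 512, images, biases=False)
output = rectify(BatchNormalize('Example.BN', 512, pre_activation))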
def evaluate(fakes):
    # Train a fresh discriminator ('Evaluator') to tell `fakes` from real
    # MNIST digits, and return its dev-set cost: the higher the cost, the
    # harder the fakes are to distinguish from real data.
    real_images = T.matrix()
    fake_images = T.matrix()

    cost = T.nnet.binary_crossentropy(_evaluator(real_images), swft.floatX(1)).mean()
    cost += T.nnet.binary_crossentropy(_evaluator(fake_images), swft.floatX(0)).mean()

    real_accuracy = T.ge(_evaluator(real_images), swft.floatX(0.5)).mean()
    fake_accuracy = T.lt(_evaluator(fake_images), swft.floatX(0.5)).mean()
    accuracy = (real_accuracy + fake_accuracy) / swft.floatX(2)

    real_train, real_dev, real_test = swft.mnist.load(BATCH_SIZE)

    assert len(fakes) == 60000
    fakes_train = fakes[:50000]
    fakes_dev = fakes[50000:]

    def train_epoch():
        numpy.random.shuffle(fakes_train)
        batched = fakes_train.reshape(-1, BATCH_SIZE, 784)
        for i, (real_images, _) in enumerate(real_train()):
            yield [real_images, batched[i]]

    def dev_epoch():
        yield [real_dev().next()[0], fakes_dev]

    swft.train(
        [real_images, fake_images],
        [cost],
        train_epoch,
        dev_data=dev_epoch,
        epochs=EPOCHS,
        print_every=1000
    )

    fn = theano.function([real_images, fake_images], cost)
    result = fn(real_dev().next()[0], fakes_dev)
    swft.delete_params('Evaluator')
    return result
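# A hedged usage sketch: score 60,000 generated samples, passed as a
# float32 array of shape (60000, 784). `generated_samples` is a
# hypothetical name.
score = evaluate(generated_samples)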
def rectify(x):
    """ReLU nonlinearity: max(0, x)"""
    return (x + abs(x)) / swft.floatX(2.0)
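# Sanity check of the identity (x + |x|) / 2 == max(0, x), evaluated with
# numpy rather than Theano for illustration:
_x = numpy.array([-2.0, -0.5, 0.0, 1.5], dtype='float32')
assert numpy.allclose((_x + abs(_x)) / 2.0, numpy.maximum(0, _x))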
    output = swft.ops.Dropout(0.5, output)
    # We apply the sigmoid in a later step
    return swft.ops.Linear(
        'Discriminator.Output', 240, 1, output,
        initialization=('uniform', 0.005)
    ).flatten()

symbolic_inputs = swft.mnist.symbolic_inputs()
images, targets = symbolic_inputs

generator_output = generator(BATCH_SIZE)

disc_out = discriminator(T.concatenate([generator_output, images], axis=0))
disc_gen_out = T.nnet.sigmoid(disc_out[:BATCH_SIZE])
disc_inputs = T.nnet.sigmoid(disc_out[BATCH_SIZE:])

# Gen objective: push D(G) to one
gen_cost = T.nnet.binary_crossentropy(disc_gen_out, swft.floatX(1)).mean()
gen_cost.name = 'gen_cost'

# Discrim objective: push D(G) to zero, and push D(real) to one
discrim_cost = T.nnet.binary_crossentropy(disc_gen_out, swft.floatX(0)).mean()
discrim_cost += T.nnet.binary_crossentropy(disc_inputs, swft.floatX(1)).mean()
discrim_cost /= swft.floatX(2.0)
discrim_cost.name = 'discrim_cost'

train_data, dev_data, test_data = swft.mnist.load(BATCH_SIZE)

gen_params = swft.search(
    gen_cost,
    lambda x: hasattr(x, 'param') and 'Generator' in x.name
)
discrim_params = swft.search(
    discrim_cost,
    lambda x: hasattr(x, 'param') and 'Discriminator' in x.name
)

_sample_fn = theano.function([], generator(100))
def generate_image(epoch):
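# A hedged sketch of how these pieces would feed swft.train: one parameter
# set and one optimizer per cost (train() zips costs with optimizers, so
# two costs need two optimizers), with generate_image as the per-epoch
# callback. The epoch count is a made-up value; argument order follows the
# train() definition further down.
swft.train(
    symbolic_inputs,
    [gen_cost, discrim_cost],
    train_data,
    dev_data=dev_data,
    param_sets=[gen_params, discrim_params],
    optimizers=[lasagne.updates.adam] * 2,
    epochs=20,
    callback=generate_image
)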
    output = Layer('Discriminator.Layer2', HIDDEN_DIM, HIDDEN_DIM, True, output)
    return T.nnet.sigmoid(
        swft.ops.Linear('Discriminator.Layer3', HIDDEN_DIM, 1, output).flatten()
    )

def noise(n_samples):
    output = theano_srng.normal(size=(n_samples, LATENT_DIM))
    return swft.floatX(LATENT_STDEV) * output

images, targets = swft.mnist.symbolic_inputs()

latents = encoder(images)
reconstructions = decoder(latents)

# Encoder objective: push D(latents) to one...
reg_cost = T.nnet.binary_crossentropy(discriminator(latents), swft.floatX(1)).mean()
reg_cost.name = 'reg_cost'

# ... and minimize reconstruction error
reconst_cost = T.sqr(reconstructions - images).mean()
reconst_cost.name = 'reconst_cost'

# This seems to be an important hyperparam; maybe try playing with it more.
full_enc_cost = (swft.floatX(100) * reconst_cost) + reg_cost

# Decoder objective: minimize reconstruction loss
dec_cost = reconst_cost

# Discrim objective: push D(latents) to zero, D(noise) to one
discrim_cost = T.nnet.binary_crossentropy(discriminator(latents), swft.floatX(0)).mean()
discrim_cost += T.nnet.binary_crossentropy(discriminator(noise(BATCH_SIZE)), swft.floatX(1)).mean()
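# A hedged sketch of training the three objectives jointly with swft.train,
# one parameter set and one optimizer per cost. It assumes the encoder and
# decoder register their params under names containing 'Encoder' and
# 'Decoder' (following the Discriminator naming convention above), and
# that `train_data` comes from a loader such as swft.mnist.load.
enc_params = swft.search(full_enc_cost, lambda x: hasattr(x, 'param') and 'Encoder' in x.name)
dec_params = swft.search(dec_cost, lambda x: hasattr(x, 'param') and 'Decoder' in x.name)
disc_params = swft.search(discrim_cost, lambda x: hasattr(x, 'param') and 'Discriminator' in x.name)

swft.train(
    [images, targets],
    [full_enc_cost, dec_cost, discrim_cost],
    train_data,
    param_sets=[enc_params, dec_params, disc_params],
    optimizers=[lasagne.updates.adam] * 3
)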
def train(
    symbolic_inputs,
    costs,
    train_data,
    dev_data=None,
    test_data=None,
    param_sets=None,
    optimizers=[lasagne.updates.adam],
    print_vars=None,
    epochs=10,
    print_every=10,
    callback=None
):
    """
    Compile and run a training loop. Each cost in `costs` is minimized with
    respect to the corresponding entry in `param_sets`, using the
    corresponding optimizer. `train_data` (and optionally `dev_data` and
    `test_data`) are zero-argument callables returning an iterator of
    minibatches, one list of input values per batch. `callback`, if given,
    is invoked with the epoch number after each epoch.
    """
    if param_sets is None:
        param_sets = [
            swft.search(costs[0], lambda x: hasattr(x, 'param'))
        ]

    assert len(costs) == len(param_sets), "train() needs 1 param set per cost!"

    _print_paramsets_info(costs, param_sets)

    print "Building updates..."

    if print_vars is None:
        print_vars = [c for c in costs]
    for cost in costs:
        print_vars += swft.search(cost, lambda x: hasattr(x, '_print'))
    # Remove duplicate values in print_vars
    print_vars = list(set(print_vars))

    all_updates = []
    for cost, params, optimizer in zip(costs, param_sets, optimizers):
        grads = T.grad(cost, wrt=params)
        # Clip gradients elementwise
        grads = [
            T.clip(g, swft.floatX(-1.0), swft.floatX(1.0))
            for g in grads
        ]
        cost_updates = optimizer(grads, params)
        for k, v in cost_updates.items():
            all_updates.append((k, v))

    print "Compiling train function..."
    train_ = theano.function(
        symbolic_inputs,
        print_vars,
        updates=all_updates,
        on_unused_input='warn'
    )

    print "Compiling evaluate function..."
    evaluate = theano.function(
        symbolic_inputs,
        print_vars,
        on_unused_input='warn'
    )

    print "Training!"

    splits = [
        ('train', train_, train_data)
    ]
    if dev_data is not None:
        splits.append(('dev', evaluate, dev_data))
    if test_data is not None:
        splits.append(('test', evaluate, test_data))

    for epoch in xrange(epochs):
        for title, fn, data in splits:
            epoch_totals = []
            since_last_print = []
            n_inputs = 0
            last_print_time = time.time()
            for iteration, inputs in enumerate(data(), start=1):
                n_inputs += 1
                outputs_ = fn(*inputs)
                if iteration == 1:
                    epoch_totals = [o.copy() for o in outputs_]
                    since_last_print = [o.copy() for o in outputs_]
                else:
                    for i, o in enumerate(outputs_):
                        epoch_totals[i] += o
                        since_last_print[i] += o

                if iteration % print_every == 0:
                    new_time = time.time()
                    values_to_print = [
                        ('epoch', epoch),
                        ('input', iteration),
                        ('time_per_input', (new_time - last_print_time) / print_every)
                    ]
                    for symbolic, totalval in zip(print_vars, since_last_print):
                        values_to_print.append(
                            (str(symbolic), totalval / print_every)
                        )
                    print "{0}\t".format(title) + "\t".join([
                        "{0}:{1}".format(name, val)
                        for name, val in values_to_print
                    ])
                    last_print_time = new_time
                    for i, t in enumerate(since_last_print):
                        since_last_print[i].fill(0)

            values_to_print = [
                ('epoch', epoch),
                ('n_inputs', n_inputs)
            ]
            for symbolic_var, total_val in zip(print_vars, epoch_totals):
                values_to_print.append(
                    (str(symbolic_var), total_val / n_inputs)
                )
            print "{0} summary\t".format(title) + "\t".join(
                ["{0}:{1}".format(name, val) for name, val in values_to_print]
            )

        if callback:
            callback(epoch)
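# train_data (and dev_data/test_data) must be zero-argument callables that
# yield one list of inputs per minibatch, matching symbolic_inputs. A
# hedged sketch over numpy arrays (X, y, and BATCH_SIZE are assumptions):
def train_data():
    for i in xrange(0, len(X) - BATCH_SIZE + 1, BATCH_SIZE):
        yield [X[i:i + BATCH_SIZE], y[i:i + BATCH_SIZE]]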
    last_hidden = T.concatenate([gru1[:, -1], gru2[:, -1], gru3[:, -1]], axis=1)
    return (output, last_hidden)

sequences = T.imatrix('sequences')
transcripts = T.imatrix('transcripts')
h0 = T.matrix('h0')

frame_level_outputs, new_h0 = predict(sequences, h0)

cost = T.nnet.categorical_crossentropy(
    T.nnet.softmax(frame_level_outputs[:, :-1].reshape((-1, Q_LEVELS))),
    sequences[:, 1:].flatten()
).mean()

# Convert the cost from nats to bits: log2(e) = 1.44269504089
cost = cost * swft.floatX(1.44269504089)
cost.name = 'cost'

params = swft.search(cost, lambda x: hasattr(x, 'param'))
swft._train._print_paramsets_info([cost], [params])

grads = T.grad(cost, wrt=params, disconnected_inputs='warn')
grads = [
    T.clip(g, swft.floatX(-GRAD_CLIP), swft.floatX(GRAD_CLIP))
    for g in grads
]

updates = lasagne.updates.adam(grads, params)

train_fn = theano.function(
    [sequences, transcripts, h0],
    [cost, new_h0],
    updates=updates,
    on_unused_input='warn'
)
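# A hedged usage sketch: one training step with a zero initial hidden
# state, carrying the returned state into the next call. BATCH_SIZE,
# H0_DIM, seq_batch, and transcript_batch are assumptions; since predict()
# concatenates the final states of three GRUs, H0_DIM would be three times
# the per-GRU hidden width.
h0_np = numpy.zeros((BATCH_SIZE, H0_DIM), dtype='float32')
cost_np, h0_np = train_fn(seq_batch, transcript_batch, h0_np)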