class AttentionWriter(Initializable):
    """Attention-based writer: turns a hidden state into a canvas update.

    Projects the hidden state ``h`` through two linear maps: ``z_trafo``
    produces the 5 raw attention parameters consumed by
    ``ZoomableAttentionWindow.nn2att``, and ``w_trafo`` produces the
    channels*N*N window content that is written onto the canvas.

    The parameter-decoding + write step was previously duplicated across
    ``apply``, ``apply_detailed`` and ``apply_circular``; it is now shared
    via the private ``_write`` helper (behavior unchanged).
    """

    def __init__(self, input_dim, output_dim, channels, width, height, N,
                 **kwargs):
        super(AttentionWriter, self).__init__(name="writer", **kwargs)

        self.channels = channels
        self.img_width = width
        self.img_height = height
        self.N = N
        self.input_dim = input_dim
        self.output_dim = output_dim

        # The produced update must cover the whole canvas.
        assert output_dim == channels * width * height

        self.zoomer = ZoomableAttentionWindow(channels, height, width, N)

        # h -> 5 raw attention parameters (center_y, center_x, delta, sigma, gamma).
        self.z_trafo = Linear(
            name=self.name + '_ztrafo',
            input_dim=input_dim,
            output_dim=5,
            weights_init=self.weights_init,
            biases_init=self.biases_init,
            use_bias=True)

        # h -> learned window content of size channels * N * N.
        self.w_trafo = Linear(
            name=self.name + '_wtrafo',
            input_dim=input_dim,
            output_dim=channels * N * N,
            weights_init=self.weights_init,
            biases_init=self.biases_init,
            use_bias=True)

        self.children = [self.z_trafo, self.w_trafo]

    def _write(self, content, h):
        """Write ``content`` through the attention window parameterized by ``h``.

        Returns (c_update, center_y, center_x, delta); shared by all
        ``apply*`` variants below.
        """
        l = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l)
        c_update = 1. / gamma * self.zoomer.write(
            content, center_y, center_x, delta, sigma)
        return c_update, center_y, center_x, delta

    @application(inputs=['h'], outputs=['c_update'])
    def apply(self, h):
        """Canvas update computed from hidden state ``h``."""
        w = self.w_trafo.apply(h)
        c_update, _, _, _ = self._write(w, h)
        return c_update

    @application(inputs=['h'],
                 outputs=['c_update', 'center_y', 'center_x', 'delta'])
    def apply_detailed(self, h):
        """Like ``apply`` but also returns the window placement parameters."""
        w = self.w_trafo.apply(h)
        return self._write(w, h)

    @application(inputs=['x', 'h'],
                 outputs=['c_update', 'center_y', 'center_x', 'delta'])
    def apply_circular(self, x, h):
        """Write the externally supplied ``x`` (instead of the learned
        window content from ``w_trafo``) through the attention window."""
        return self._write(x, h)
class AttentionWriter(Initializable):
    """Writer brick: maps a hidden state to an additive canvas update.

    Two linear transforms of the hidden state drive the zoomable attention
    window: one yields the five raw attention parameters, the other the
    window content that gets painted onto the canvas.
    """

    def __init__(self, input_dim, output_dim, channels, width, height, N,
                 **kwargs):
        super(AttentionWriter, self).__init__(name="writer", **kwargs)

        self.channels = channels
        self.img_width = width
        self.img_height = height
        self.N = N
        self.input_dim = input_dim
        self.output_dim = output_dim

        # Update must span the full canvas.
        assert output_dim == channels * width * height

        self.zoomer = ZoomableAttentionWindow(channels, height, width, N)

        self.z_trafo = Linear(
            name=self.name + '_ztrafo',
            input_dim=input_dim,
            output_dim=5,
            weights_init=self.weights_init,
            biases_init=self.biases_init,
            use_bias=True)
        self.w_trafo = Linear(
            name=self.name + '_wtrafo',
            input_dim=input_dim,
            output_dim=channels * N * N,
            weights_init=self.weights_init,
            biases_init=self.biases_init,
            use_bias=True)

        self.children = [self.z_trafo, self.w_trafo]

    @application(inputs=['h'], outputs=['c_update'])
    def apply(self, h):
        """Return the canvas update for hidden state ``h``."""
        window = self.w_trafo.apply(h)
        raw_params = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(raw_params)
        painted = self.zoomer.write(window, center_y, center_x, delta, sigma)
        return 1. / gamma * painted

    @application(inputs=['h'],
                 outputs=['c_update', 'center_y', 'center_x', 'delta'])
    def apply_detailed(self, h):
        """Return the canvas update plus the window placement parameters."""
        window = self.w_trafo.apply(h)
        raw_params = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(raw_params)
        painted = self.zoomer.write(window, center_y, center_x, delta, sigma)
        return 1. / gamma * painted, center_y, center_x, delta
class AttentionWriter(Initializable):
    """Writer brick that paints an N x N patch onto a width x height canvas.

    Single-channel variant: the window content has N*N elements and the
    output covers width*height pixels.
    """

    def __init__(self, input_dim, output_dim, width, height, N, **kwargs):
        super(AttentionWriter, self).__init__(name="writer", **kwargs)

        self.img_width = width
        self.img_height = height
        self.N = N
        self.input_dim = input_dim
        self.output_dim = output_dim

        # The canvas update must cover the whole image.
        assert output_dim == width * height

        self.zoomer = ZoomableAttentionWindow(height, width, N)

        # Two projections of the hidden state: the five raw attention
        # parameters, and the N*N window content.
        self.z_trafo = Linear(
            name=self.name + "_ztrafo",
            input_dim=input_dim,
            output_dim=5,
            weights_init=self.weights_init,
            biases_init=self.biases_init,
            use_bias=True)
        self.w_trafo = Linear(
            name=self.name + "_wtrafo",
            input_dim=input_dim,
            output_dim=N * N,
            weights_init=self.weights_init,
            biases_init=self.biases_init,
            use_bias=True)

        self.children = [self.z_trafo, self.w_trafo]

    @application(inputs=["h"], outputs=["c_update"])
    def apply(self, h):
        """Canvas update produced from hidden state ``h``."""
        content = self.w_trafo.apply(h)
        raw = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(raw)
        patch = self.zoomer.write(content, center_y, center_x, delta, sigma)
        return 1.0 / gamma * patch

    @application(inputs=["h"],
                 outputs=["c_update", "center_y", "center_x", "delta"])
    def apply_detailed(self, h):
        """Canvas update plus the attention placement parameters."""
        content = self.w_trafo.apply(h)
        raw = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(raw)
        patch = self.zoomer.write(content, center_y, center_x, delta, sigma)
        return 1.0 / gamma * patch, center_y, center_x, delta
class AttentionWriter(Initializable):
    """Writer with a normalized zoomable attention window.

    Unlike the variants that delegate to ``zoomer.nn2att``, this class
    decodes the five raw attention parameters inline inside ``apply``.
    """

    def __init__(self, input_dim, output_dim, width, height, N, **kwargs):
        super(AttentionWriter, self).__init__(name="writer", **kwargs)

        self.width = width
        self.height = height
        self.N = N
        self.input_dim = input_dim
        self.output_dim = output_dim

        # The canvas update must cover the whole image.
        assert output_dim == width * height

        self.zoomer = ZoomableAttentionWindow(height, width, N, normalize=True)

        self.z_trafo = Linear(
            name=self.name + '_ztrafo',
            input_dim=input_dim,
            output_dim=5,
            weights_init=self.weights_init,
            biases_init=self.biases_init,
            use_bias=True)
        self.w_trafo = Linear(
            name=self.name + '_wtrafo',
            input_dim=input_dim,
            output_dim=N * N,
            weights_init=self.weights_init,
            biases_init=self.biases_init,
            use_bias=True)

        self.children = [self.z_trafo, self.w_trafo]

    @application(inputs=['h'], outputs=['c_update'])
    def apply(self, h):
        """Decode attention parameters from ``h`` and write the window."""
        content = self.w_trafo.apply(h)
        raw = self.z_trafo.apply(h)

        # Centers are rescaled from [-1, 1] to [0, 1]; delta, sigma and
        # gamma are decoded from log-space outputs.
        center_y = (raw[:, 0] + 1.) / 2.
        center_x = (raw[:, 1] + 1.) / 2.
        delta = T.exp(raw[:, 2])
        sigma = T.exp(raw[:, 3] / 2.)
        gamma = T.exp(raw[:, 4]).dimshuffle(0, 'x')

        patch = self.zoomer.write(content, center_y, center_x, delta, sigma)
        return patch / gamma
def main(name, epochs, batch_size, learning_rate):
    """Train an attention read/write autoencoder on binarized MNIST.

    Reads a 12x12 window from the input image, maps it through an MLP to a
    14x14 window, writes that back onto a blank canvas, and minimizes the
    binary cross-entropy between input and reconstruction.

    NOTE(review): relies on module-level imports (Theano ``T``/``tensor``,
    Blocks bricks/extensions, Fuel datasets) not visible in this chunk.
    """
    if name is None:
        name = "att-rw"

    print("\nRunning experiment %s" % name)
    print(" learning rate: %5.3f" % learning_rate)
    print()

    #------------------------------------------------------------------------
    # Hyperparameters: 28x28 MNIST images, 12x12 read and 14x14 write windows.
    img_height, img_width = 28, 28
    read_N = 12
    write_N = 14

    # Initialization scheme shared by all bricks below.
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    x_dim = img_height * img_width

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # Parameterize the attention reader and writer
    # (each MLP emits the 5 raw attention parameters).
    mlpr = MLP(activations=[Tanh(), Identity()], dims=[x_dim, 50, 5],
               name="RMLP", **inits)
    mlpw = MLP(activations=[Tanh(), Identity()], dims=[x_dim, 50, 5],
               name="WMLP", **inits)

    # MLP between the reader and writer
    mlp = MLP(activations=[Tanh(), Identity()],
              dims=[read_N**2, 300, write_N**2], name="MLP", **inits)

    for brick in [mlpr, mlpw, mlp]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    # Build the symbolic reconstruction graph.
    x = tensor.matrix('features')

    hr = mlpr.apply(x)
    hw = mlpw.apply(x)

    # Read a window from the input at the location chosen by the reader MLP.
    center_y, center_x, delta, sigma, gamma = reader.nn2att(hr)
    r = reader.read(x, center_y, center_x, delta, sigma)

    h = mlp.apply(r)

    # Write the transformed window back at the location chosen by the writer MLP.
    center_y, center_x, delta, sigma, gamma = writer.nn2att(hw)
    c = writer.write(h, center_y, center_x, delta, sigma) / gamma

    x_recons = T.nnet.sigmoid(c)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # RemoveNotFinite guards against NaN/Inf gradients before Adam + clipping.
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            Adam(learning_rate),
            StepClipping(3.),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]:
    #    v_mean = v.mean()
    #    v_mean.name = v.name
    #    monitors += [v_mean]
    #    monitors += [aggregation.mean(v)]

    # Training additionally tracks gradient and step norms.
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["cost"],
    ]

    #------------------------------------------------------------
    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(DataStream(
            mnist_train,
            iteration_scheme=SequentialScheme(
                mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(DataStream(
                    mnist_test,
                    iteration_scheme=SequentialScheme(
                        mnist_test.num_examples, batch_size))),
                prefix="test"),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_every_epoch=True),
            SerializeMainLoop(name+".pkl"),
            #Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])
    main_loop.run()
def main(name, epochs, batch_size, learning_rate):
    """Run the attention read/write autoencoder experiment on MNIST.

    A 12x12 patch is read from each image, transformed by an MLP into a
    14x14 patch, written onto a canvas, and the reconstruction is scored
    with binary cross-entropy.
    """
    if name is None:
        name = "att-rw"

    print("\nRunning experiment %s" % name)
    print(" learning rate: %5.3f" % learning_rate)
    print()

    # ----------------------------------------------------------------------
    # Hyperparameters.
    img_height, img_width = 28, 28
    read_N = 12
    write_N = 14

    # Initialization shared by every brick.
    inits = {
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }

    x_dim = img_height * img_width

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # MLPs producing the 5 raw attention parameters for reading and writing.
    read_mlp = MLP(activations=[Tanh(), Identity()],
                   dims=[x_dim, 50, 5], name="RMLP", **inits)
    write_mlp = MLP(activations=[Tanh(), Identity()],
                    dims=[x_dim, 50, 5], name="WMLP", **inits)

    # MLP mapping the read window to the write window.
    hidden_mlp = MLP(activations=[Tanh(), Identity()],
                     dims=[read_N**2, 300, write_N**2], name="MLP", **inits)

    for brick in [read_mlp, write_mlp, hidden_mlp]:
        brick.allocate()
        brick.initialize()

    # ----------------------------------------------------------------------
    # Symbolic reconstruction graph.
    x = tensor.matrix('features')

    read_params = read_mlp.apply(x)
    write_params = write_mlp.apply(x)

    # Read a window from the input image.
    center_y, center_x, delta, sigma, gamma = reader.nn2att(read_params)
    window = reader.read(x, center_y, center_x, delta, sigma)

    hidden = hidden_mlp.apply(window)

    # Write the transformed window back onto the canvas.
    center_y, center_x, delta, sigma, gamma = writer.nn2att(write_params)
    canvas = writer.write(hidden, center_y, center_x, delta, sigma) / gamma

    x_recons = T.nnet.sigmoid(canvas)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    # ----------------------------------------------------------------------
    graph = ComputationGraph([cost])
    parameters = VariableFilter(roles=[PARAMETER])(graph.variables)

    # Guard against non-finite gradients, then Adam, then step clipping.
    algorithm = GradientDescent(
        cost=cost,
        params=parameters,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            Adam(learning_rate),
            StepClipping(3.),
        ]))

    # ----------------------------------------------------------------------
    # Monitoring channels: cost everywhere; gradient/step norms on train only.
    monitors = [cost]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [["cost"]]

    # ----------------------------------------------------------------------
    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    def _minibatch_stream(dataset):
        # Sequential minibatches of the whole dataset, cast to floatX.
        return ForceFloatX(
            DataStream(dataset,
                       iteration_scheme=SequentialScheme(
                           dataset.num_examples, batch_size)))

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=_minibatch_stream(mnist_train),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(monitors,
                                 _minibatch_stream(mnist_test),
                                 prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()