def create_model(self):
    input_dim = self.input_dim
    x = self.x
    x_to_h = Linear(input_dim, input_dim * 4, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))
    lstm = LSTM(input_dim, name='lstm', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))
    h_to_o = Linear(input_dim, 1, name='h_to_o', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))
    x_transform = x_to_h.apply(x)
    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o
    h, c = lstm.apply(x_transform)
    # only values of hidden units of the last timeframe are used for
    # the classification
    probs = h_to_o.apply(h[-1])
    return probs
def rnn_layer(in_size, dim, x, h, n, first_layer=False):
    if connect_h_to_h == 'all-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size, output_dim=dim, name='linear' + str(n) + '-')
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + [hidden for hidden in h], axis=2)
            linear = Linear(input_dim=in_size + dim * n, output_dim=dim, name='linear' + str(n) + '-')
        else:
            rnn_input = T.concatenate([hidden for hidden in h], axis=2)
            linear = Linear(input_dim=dim * n, output_dim=dim, name='linear' + str(n) + '-')
    elif connect_h_to_h == 'two-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size, output_dim=dim, name='linear' + str(n) + '-')
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=in_size + dim * 2 if n > 1 else in_size + dim, output_dim=dim, name='linear' + str(n) + '-')
        else:
            rnn_input = T.concatenate(h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=dim * 2 if n > 1 else dim, output_dim=dim, name='linear' + str(n) + '-')
    elif connect_h_to_h == 'one-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size, output_dim=dim, name='linear' + str(n) + '-')
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + [h[n - 1]], axis=2)
            linear = Linear(input_dim=in_size + dim, output_dim=dim, name='linear' + str(n) + '-')
        else:
            # hidden state of the previous layer
            rnn_input = h[n - 1]
            linear = Linear(input_dim=dim, output_dim=dim, name='linear' + str(n) + '-')
    rnn = SimpleRecurrent(dim=dim, activation=Tanh(), name=layer_models[n] + str(n) + '-')
    initialize([linear, rnn])
    if layer_models[n] == 'rnn':
        return rnn.apply(linear.apply(rnn_input))
    elif layer_models[n] == 'mt_rnn':
        return rnn.apply(linear.apply(rnn_input), time_scale=layer_resolutions[n], time_offset=layer_execution_time_offset[n])
class AttentionWriter(Initializable):
    def __init__(self, input_dim, output_dim, channels, width, height, N, **kwargs):
        super(AttentionWriter, self).__init__(name="writer", **kwargs)

        self.channels = channels
        self.img_width = width
        self.img_height = height
        self.N = N
        self.input_dim = input_dim
        self.output_dim = output_dim

        assert output_dim == channels * width * height

        self.zoomer = ZoomableAttentionWindow(channels, height, width, N)
        self.z_trafo = Linear(name=self.name + '_ztrafo', input_dim=input_dim, output_dim=5,
                              weights_init=self.weights_init, biases_init=self.biases_init, use_bias=True)
        self.w_trafo = Linear(name=self.name + '_wtrafo', input_dim=input_dim, output_dim=channels * N * N,
                              weights_init=self.weights_init, biases_init=self.biases_init, use_bias=True)

        self.children = [self.z_trafo, self.w_trafo]

    @application(inputs=['h'], outputs=['c_update'])
    def apply(self, h):
        w = self.w_trafo.apply(h)
        l = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l)
        c_update = 1. / gamma * self.zoomer.write(w, center_y, center_x, delta, sigma)
        return c_update

    @application(inputs=['h'], outputs=['c_update', 'center_y', 'center_x', 'delta'])
    def apply_detailed(self, h):
        w = self.w_trafo.apply(h)
        l = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l)
        c_update = 1. / gamma * self.zoomer.write(w, center_y, center_x, delta, sigma)
        return c_update, center_y, center_x, delta

    @application(inputs=['x', 'h'], outputs=['c_update', 'center_y', 'center_x', 'delta'])
    def apply_circular(self, x, h):
        # w = self.w_trafo.apply(h)
        l = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l)
        c_update = 1. / gamma * self.zoomer.write(x, center_y, center_x, delta, sigma)
        return c_update, center_y, center_x, delta
class embeddingLayer:
    def __init__(self, word_dim, visual_dim, joint_dim):
        self.word_embed = Linear(word_dim, joint_dim, name='word_to_joint', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
        self.visual_embed = Linear(visual_dim, joint_dim, name='visual_to_joint', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
        self.word_embed.initialize()
        self.visual_embed.initialize()

    # words: batch_size x q x word_dim
    # video: batch_size x video_length x visual_dim
    def apply(self, words, video, u1, u2):
        w = self.word_embed.apply(words)
        v = self.visual_embed.apply(video)
        w = T.tanh(w)
        v = T.tanh(v)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, v, u

    def apply_sentence(self, words, u1, u2):
        w = self.word_embed.apply(words)
        w = T.tanh(w)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, u
def apply(self, input_, target):
    x_to_h = Linear(name='x_to_h', input_dim=self.dims[0], output_dim=self.dims[1] * 4)
    pre_rnn = x_to_h.apply(input_)
    pre_rnn.name = 'pre_rnn'
    rnn = LSTM(activation=Tanh(), dim=self.dims[1], name=self.name)
    h, _ = rnn.apply(pre_rnn)
    h.name = 'h'
    h_to_y = Linear(name='h_to_y', input_dim=self.dims[1], output_dim=self.dims[2])
    y_hat = h_to_y.apply(h)
    y_hat.name = 'y_hat'

    cost = SquaredError().apply(target, y_hat)
    cost.name = 'MSE'

    self.outputs = {}
    self.outputs['y_hat'] = y_hat
    self.outputs['cost'] = cost
    self.outputs['pre_rnn'] = pre_rnn
    self.outputs['h'] = h

    # Initialization
    for brick in (rnn, x_to_h, h_to_y):
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0)
        brick.initialize()
def __init__(self, input_size, hidden_size, output_size):
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    x = tensor.tensor3('x', dtype=floatX)
    y = tensor.tensor3('y', dtype=floatX)
    x_to_lstm = Linear(name="x_to_lstm", input_dim=input_size, output_dim=4 * hidden_size, weights_init=IsotropicGaussian(), biases_init=Constant(0))
    lstm = LSTM(dim=hidden_size, name="lstm", weights_init=IsotropicGaussian(), biases_init=Constant(0))
    lstm_to_output = Linear(name="lstm_to_output", input_dim=hidden_size, output_dim=output_size, weights_init=IsotropicGaussian(), biases_init=Constant(0))
    x_transform = x_to_lstm.apply(x)
    h, c = lstm.apply(x_transform)
    y_hat = lstm_to_output.apply(h)
    y_hat = Logistic(name="y_hat").apply(y_hat)
    self.cost = BinaryCrossEntropy(name="cost").apply(y, y_hat)
    x_to_lstm.initialize()
    lstm.initialize()
    lstm_to_output.initialize()
    self.computation_graph = ComputationGraph(self.cost)
def build_model(args):
    x = tensor.tensor3('features', dtype=floatX)
    y = tensor.tensor3('targets', dtype=floatX)

    linear = Linear(input_dim=1, output_dim=4 * args.units)
    rnn = LSTM(dim=args.units, activation=Tanh())
    linear2 = Linear(input_dim=args.units, output_dim=1)

    prediction = Tanh().apply(linear2.apply(rnn.apply(linear.apply(x))))
    prediction = prediction[:-1, :, :]

    # SquaredError does not work on 3D tensor
    y = y.reshape((y.shape[0] * y.shape[1], y.shape[2]))
    prediction = prediction.reshape((prediction.shape[0] * prediction.shape[1], prediction.shape[2]))
    cost = SquaredError().apply(y, prediction)

    # Initialization
    linear.weights_init = IsotropicGaussian(0.1)
    linear2.weights_init = IsotropicGaussian(0.1)
    linear.biases_init = Constant(0)
    linear2.biases_init = Constant(0)
    rnn.weights_init = Orthogonal()

    return cost
def lstm_layer(in_size, dim, x, h, n, first_layer=False):
    if connect_h_to_h == 'all-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size, output_dim=dim * 4, name='linear' + str(n) + '-')
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + [hidden for hidden in h], axis=2)
            linear = Linear(input_dim=in_size + dim * (n), output_dim=dim * 4, name='linear' + str(n) + '-')
        else:
            lstm_input = T.concatenate([hidden for hidden in h], axis=2)
            linear = Linear(input_dim=dim * (n + 1), output_dim=dim * 4, name='linear' + str(n) + '-')
    elif connect_h_to_h == 'two-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size, output_dim=dim * 4, name='linear' + str(n) + '-')
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=in_size + dim * 2 if n > 1 else in_size + dim, output_dim=dim * 4, name='linear' + str(n) + '-')
        else:
            lstm_input = T.concatenate(h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=dim * 2 if n > 1 else dim, output_dim=dim * 4, name='linear' + str(n) + '-')
    elif connect_h_to_h == 'one-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size, output_dim=dim * 4, name='linear' + str(n) + '-')
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + [h[n - 1]], axis=2)
            linear = Linear(input_dim=in_size + dim, output_dim=dim * 4, name='linear' + str(n) + '-')
        else:
            lstm_input = h[n - 1]
            # linear = LN_LSTM(input_dim=dim, output_dim=dim * 4, name='linear' + str(n) + '-')
            linear = Linear(input_dim=dim, output_dim=dim * 4, name='linear' + str(n) + '-')
    lstm = LN_LSTM(dim=dim, name=layer_models[network_mode][n] + str(n) + '-')
    initialize([linear, lstm])
    if layer_models[network_mode][n] == 'lstm':
        return lstm.apply(linear.apply(lstm_input))
        # return lstm.apply(linear.apply(lstm_input), mask=x_mask)
    elif layer_models[network_mode][n] == 'mt_lstm':
        return lstm.apply(linear.apply(lstm_input), time_scale=layer_resolutions[n], time_offset=layer_execution_time_offset[n])
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(generate_data(max_seq_length, batch_size, num_batches))
    dataset_test = IterableDataset(generate_data(max_seq_length, batch_size, 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost], data_stream=stream_test, prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train", after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[test_monitor, train_monitor, FinishAfter(after_n_epochs=num_epochs), Printing(), ProgressBar()])
    main_loop.run()

    print 'Learned weights:'
    for layer in (x_to_h, lstm, h_to_o):
        print "Layer '%s':" % layer.name
        for param in layer.parameters:
            print param.name, ': ', param.get_value()
        print
def lllistool(i, inp, func):
    if func == LSTM:
        NUMS[i + 1] *= 4
    sdim = DIMS[i]
    if func == SimpleRecurrent or func == LSTM:
        sdim = DIMS[i] + DIMS[i + 1]
    l = Linear(input_dim=DIMS[i], output_dim=DIMS[i + 1] * NUMS[i + 1],
               weights_init=IsotropicGaussian(std=sdim ** (-0.5)),
               biases_init=IsotropicGaussian(std=sdim ** (-0.5)),
               name='Lin{}'.format(i))
    l.initialize()
    if func == SimpleRecurrent:
        gong = func(dim=DIMS[i + 1], activation=Rectifier(), weights_init=IsotropicGaussian(std=sdim ** (-0.5)))
        gong.initialize()
        ret = gong.apply(l.apply(inp))
    elif func == LSTM:
        gong = func(dim=DIMS[i + 1], activation=Tanh(), weights_init=IsotropicGaussian(std=sdim ** (-0.5)))
        gong.initialize()
        print(inp)
        ret, _ = gong.apply(
            l.apply(inp),
            T.zeros((inp.shape[1], DIMS[i + 1])),
            T.zeros((inp.shape[1], DIMS[i + 1])),
        )
    elif func == SequenceGenerator:
        gong = func(
            readout=None,
            transition=SimpleRecurrent(dim=100, activation=Rectifier(), weights_init=IsotropicGaussian(std=0.1)))
        ret = None
    elif func == None:
        ret = l.apply(inp)
    else:
        gong = func()
        ret = gong.apply(l.apply(inp))
    return ret
def construct_model(activation_function, r_dim, hidden_dim, out_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    nx = x.shape[0]
    nj = x.shape[1]  # also is r.shape[0]
    nr = r.shape[1]

    # r is nj x nr
    # x is nx x nj
    # y is nx

    # Get a representation of r of size r_dim
    r = DAE(r)

    # r is now nj x r_dim
    # r_rep is nx x nj x r_dim
    r_rep = r[None, :, :].repeat(axis=0, repeats=nx)

    # x3 is nx x nj x 1
    x3 = x[:, :, None]

    # concat is nx x nj x (r_dim + 1)
    concat = tensor.concatenate([r_rep, x3], axis=2)

    # Change concat from Batch x Time x Features to T X B x F
    rnn_input = concat.dimshuffle(1, 0, 2)

    linear = Linear(input_dim=r_dim + 1, output_dim=4 * hidden_dim, name="input_linear")
    lstm = LSTM(dim=hidden_dim, activation=activation_function, name="hidden_recurrent")
    top_linear = Linear(input_dim=hidden_dim, output_dim=out_dim, name="out_linear")

    pre_rnn = linear.apply(rnn_input)
    states = lstm.apply(pre_rnn)[0]
    activations = top_linear.apply(states)
    activations = tensor.mean(activations, axis=0)

    cost = Softmax().categorical_cross_entropy(y, activations)

    pred = activations.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in (linear, lstm, top_linear):
        brick.weights_init = IsotropicGaussian(0.1)
        brick.biases_init = Constant(0.)
        brick.initialize()

    return cost, error_rate
class MyRecurrent(Brick):
    def __init__(self, recurrent, dims, activations=[Identity(), Identity()], **kwargs):
        super(MyRecurrent, self).__init__(**kwargs)
        self.dims = dims
        self.recurrent = recurrent
        self.activations = activations
        if isinstance(self.recurrent, (SimpleRecurrent, SimpleRecurrentBatchNorm)):
            output_dim = dims[1]
        elif isinstance(self.recurrent, (LSTM, LSTMBatchNorm)):
            output_dim = 4 * dims[1]
        else:
            raise NotImplementedError
        self.input_trans = Linear(name='input_trans', input_dim=dims[0], output_dim=output_dim,
                                  weights_init=NormalizedInitialization(), biases_init=Constant(0))
        self.output_trans = Linear(name='output_trans', input_dim=dims[1], output_dim=dims[2],
                                   weights_init=NormalizedInitialization(), biases_init=Constant(0))
        self.children = ([self.input_trans, self.recurrent, self.output_trans] + self.activations)

    def _initialize(self):
        self.input_trans.initialize()
        self.output_trans.initialize()
        #self.recurrent.initialize()

    @application
    def apply(self, input_, input_mask=None, *args, **kwargs):
        input_recurrent = self.input_trans.apply(input_)
        try:
            input_recurrent = self.activations[0].apply(input_recurrent, input_mask=input_mask)
        except TypeError:
            input_recurrent = self.activations[0].apply(input_recurrent)
        output_recurrent = self.recurrent.apply(inputs=input_recurrent, mask=input_mask)
        if isinstance(self.recurrent, (LSTM, LSTMBatchNorm)):
            output_recurrent = output_recurrent[0]
        output = self.output_trans.apply(output_recurrent)
        try:
            output = self.activations[1].apply(output, input_mask=input_mask)
        except TypeError:
            output = self.activations[1].apply(output)
        return output
def MDN_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    if connect_h_to_o:
        hiddens = T.concatenate([hidden for hidden in h], axis=2)
        hidden_out_size = hidden_size * len(h)
    else:
        hiddens = h[-1]
        hidden_out_size = hidden_size

    mu_linear = Linear(name='mu_linear' + str(pred), input_dim=hidden_out_size, output_dim=out_size * components_size[network_mode])
    sigma_linear = Linear(name='sigma_linear' + str(pred), input_dim=hidden_out_size, output_dim=components_size[network_mode])
    mixing_linear = Linear(name='mixing_linear' + str(pred), input_dim=hidden_out_size, output_dim=components_size[network_mode])
    initialize([mu_linear, sigma_linear, mixing_linear])

    mu = mu_linear.apply(hiddens)
    mu = mu.reshape((mu.shape[0], mu.shape[1], out_size, components_size[network_mode]))
    sigma_orig = sigma_linear.apply(hiddens)
    sigma = T.nnet.softplus(sigma_orig)
    mixing_orig = mixing_linear.apply(hiddens)
    e_x = T.exp(mixing_orig - mixing_orig.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)

    exponent = -0.5 * T.inv(sigma) * T.sum((y.dimshuffle(0, 1, 2, 'x') - mu) ** 2, axis=2)
    normalizer = (2 * np.pi * sigma)
    exponent = exponent + T.log(mixing) - (out_size * .5) * T.log(normalizer)

    # LogSumExp(x)
    max_exponent = T.max(exponent, axis=2, keepdims=True)
    mod_exponent = exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=2, keepdims=True)
    log_gauss = T.log(gauss_mix) + max_exponent
    cost = -T.mean(log_gauss)

    srng = RandomStreams(seed=seed)
    mixing = mixing_orig * (1 + sampling_bias)
    sigma = T.nnet.softplus(sigma_orig - sampling_bias)
    e_x = T.exp(mixing - mixing.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)
    component = srng.multinomial(pvals=mixing)
    component_mean = T.sum(mu * component.dimshuffle(0, 1, 'x', 2), axis=3)
    component_std = T.sum(sigma * component, axis=2, keepdims=True)
    linear_output = srng.normal(avg=component_mean, std=component_std)
    linear_output.name = 'linear_output'
    return linear_output, cost
class AttentionWriter(Initializable):
    def __init__(self, input_dim, output_dim, width, height, N, **kwargs):
        super(AttentionWriter, self).__init__(name="writer", **kwargs)

        self.img_width = width
        self.img_height = height
        self.N = N
        self.input_dim = input_dim
        self.output_dim = output_dim

        assert output_dim == width * height

        self.zoomer = ZoomableAttentionWindow(height, width, N)
        self.z_trafo = Linear(name=self.name + "_ztrafo", input_dim=input_dim, output_dim=5,
                              weights_init=self.weights_init, biases_init=self.biases_init, use_bias=True)
        self.w_trafo = Linear(name=self.name + "_wtrafo", input_dim=input_dim, output_dim=N * N,
                              weights_init=self.weights_init, biases_init=self.biases_init, use_bias=True)

        self.children = [self.z_trafo, self.w_trafo]

    @application(inputs=["h"], outputs=["c_update"])
    def apply(self, h):
        w = self.w_trafo.apply(h)
        l = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l)
        c_update = 1.0 / gamma * self.zoomer.write(w, center_y, center_x, delta, sigma)
        return c_update

    @application(inputs=["h"], outputs=["c_update", "center_y", "center_x", "delta"])
    def apply_detailed(self, h):
        w = self.w_trafo.apply(h)
        l = self.z_trafo.apply(h)
        center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l)
        c_update = 1.0 / gamma * self.zoomer.write(w, center_y, center_x, delta, sigma)
        return c_update, center_y, center_x, delta
def test_linear_nan_allocation():
    x = tensor.matrix()
    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2), biases_init=Constant(1))
    linear.apply(x)
    w1 = numpy.nan * numpy.zeros((16, 8))
    w2 = linear.params[0].get_value()
    b1 = numpy.nan * numpy.zeros(8)
    b2 = linear.params[1].get_value()
    numpy.testing.assert_equal(w1, w2)
    numpy.testing.assert_equal(b1, b2)
def test_linear_nan_allocation():
    x = tensor.matrix()
    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2), biases_init=Constant(1))
    linear.apply(x)
    w1 = numpy.nan * numpy.zeros((16, 8))
    w2 = linear.parameters[0].get_value()
    b1 = numpy.nan * numpy.zeros(8)
    b2 = linear.parameters[1].get_value()
    numpy.testing.assert_equal(w1, w2)
    numpy.testing.assert_equal(b1, b2)
class Highway(Initializable, Feedforward):
    """Implements highway networks outlined in [1].

    y = H(x, WH) * T(x, WT) + x * (1 - T(x, WT))

    Highway networks have the same input dimension and output dimension.

    Parameters
    ----------
    input_dim: int
        number of input/output dimensions for the network
    output_activation: Activation
        activation function applied to x and the hidden weights
    transform_activation: Activation
        activation function applied to x and the transform weights

    [1] http://arxiv.org/pdf/1505.00387v1.pdf
    """
    @lazy(allocation=['input_dim'])
    def __init__(self, input_dim, output_activation=None, transform_activation=None, **kwargs):
        super(Highway, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = input_dim

        if output_activation == None:
            output_activation = Rectifier()
        if transform_activation == None:
            transform_activation = Logistic()

        self._linear_h = Linear(name="linear_h", input_dim=input_dim, output_dim=input_dim)
        self._linear_t = Linear(name="linear_t", input_dim=input_dim, output_dim=input_dim)
        self._output_activation = output_activation
        self._transform_activation = transform_activation
        self.children = [self._linear_h, self._linear_t, self._output_activation, self._transform_activation]

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        h = self._output_activation.apply(self._linear_h.apply(input_))
        t = self._transform_activation.apply(self._linear_t.apply(input_))
        return h * t + input_ * (1 - t)
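# Hedged usage sketch for the Highway brick above: assumes the brick's own
# imports (Linear, Rectifier, Logistic, lazy, application, Initializable,
# Feedforward) are already in scope; the dimensions and Gaussian/Constant
# initializations below are illustrative only and not from the original code.
import numpy
import theano
from theano import tensor
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('x')
highway = Highway(input_dim=8, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
highway.initialize()
y = highway.apply(x)
f = theano.function([x], y)
# a highway layer preserves dimensionality: (batch, 8) -> (batch, 8)
print(f(numpy.ones((4, 8), dtype=theano.config.floatX)).shape)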
class questionEncoder:
    def __init__(self, word_dim, hidden_dim):
        self.forward_lstm = LSTM(hidden_dim, name='question_forward_lstm', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
        self.backward_lstm = LSTM(hidden_dim, name='question_backward_lstm', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
        self.x_to_h_forward = Linear(word_dim, hidden_dim * 4, name='word_x_to_h_forward', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
        self.x_to_h_backward = Linear(word_dim, hidden_dim * 4, name='word_x_to_h_backward', weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
        self.forward_lstm.initialize()
        self.backward_lstm.initialize()
        self.x_to_h_forward.initialize()
        self.x_to_h_backward.initialize()

    # variable question length
    # words: batch_size x q x word_dim
    # words_reverse: the reversed sentence of words, padded with 0 to max length q
    # mask: batch_size
    def apply(self, words, words_reverse, mask_, batch_size):
        mask = mask_.flatten()
        # batch_size x q x hidden_dim
        Wx = self.x_to_h_forward.apply(words)
        Wx_r = self.x_to_h_backward.apply(words_reverse)
        # q x batch_size x hidden_dim
        Wx = Wx.swapaxes(0, 1)
        Wx_r = Wx_r.swapaxes(0, 1)
        # q x batch_size x hidden_dim
        hf, cf = self.forward_lstm.apply(Wx)
        hb, cb = self.backward_lstm.apply(Wx_r)
        for i in range(batch_size):
            # set_subtensor is not in-place; keep the returned variable so
            # the reversal of the backward states takes effect
            hb = T.set_subtensor(hb[0:mask[i] + 1, i, :], hb[0:mask[i] + 1, i, :][::-1])
        # q x batch_size x (2 x hidden_dim)
        h = T.concatenate([hf, hb], axis=2)
        # batch_size x hidden_dim
        y_q = hf[mask, range(batch_size), :]
        y_1 = hb[0, range(batch_size), :]
        return h.swapaxes(0, 1), y_q, y_1
def prior_network(x, n_input, hu_encoder, n_latent):
    logger.info('In prior_network: n_input: %d, hu_encoder: %d', n_input, hu_encoder)
    mlp1 = MLP(activations=[Rectifier()], dims=[n_input, hu_encoder], name='prior_in_to_hidEncoder')
    initialize([mlp1])
    h_encoder = mlp1.apply(x)
    h_encoder = debug_print(h_encoder, 'h_encoder', False)
    lin1 = Linear(name='prior_hiddEncoder_to_latent_mu', input_dim=hu_encoder, output_dim=n_latent)
    lin2 = Linear(name='prior_hiddEncoder_to_latent_sigma', input_dim=hu_encoder, output_dim=n_latent)
    initialize([lin1])
    initialize([lin2], rndstd=0.001)
    mu = lin1.apply(h_encoder)
    log_sigma = lin2.apply(h_encoder)
    return mu, log_sigma
def example2():
    """GRU"""
    x = tensor.tensor3('x')
    dim = 3

    fork = Fork(input_dim=dim, output_dims=[dim, dim * 2], name='fork', output_names=["linear", "gates"],
                weights_init=initialization.Identity(), biases_init=Constant(0))
    gru = GatedRecurrent(dim=dim, weights_init=initialization.Identity(), biases_init=Constant(0))

    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=dim, output_dim=dim, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0))
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)

    f = theano.function([x], h_doubler)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
class BernoulliLayer(Initializable, ProbabilisticLayer):
    @lazy
    def __init__(self, dim_X, dim_Y, **kwargs):
        super(BernoulliLayer, self).__init__(**kwargs)

        self.dim_X = dim_X
        self.dim_Y = dim_Y

        self.linear_transform = Linear(name=self.name + '_linear', input_dim=dim_Y, output_dim=dim_X,
                                       weights_init=self.weights_init, biases_init=self.biases_init,
                                       use_bias=self.use_bias)

        self.children = [self.linear_transform]

    @application(inputs=['Y'], outputs=['X_expected'])
    def sample_expected(self, Y):
        return tensor.nnet.sigmoid(self.linear_transform.apply(Y))

    @application(inputs=['Y'], outputs=['X', 'log_prob'])
    def sample(self, Y):
        prob_X = self.sample_expected(Y)
        U = self.theano_rng.uniform(size=prob_X.shape, nstreams=N_STREAMS)
        X = tensor.cast(U <= prob_X, floatX)
        return X, self.log_prob(X, Y)

    @application(inputs=['X', 'Y'], outputs=['log_prob'])
    def log_prob(self, X, Y):
        prob_X = self.sample_expected(Y)
        log_prob = X * tensor.log(prob_X) + (1. - X) * tensor.log(1 - prob_X)
        return log_prob.sum(axis=1)
def nn_fprop(x, y, recurrent_in_size, out_size, hidden_size, num_recurrent_layers, train_flag):
    if task_ID_type == 'feedforward':
        x, recurrent_in_size = task_ID_layers(x, recurrent_in_size)
    recurrent_input = x
    cells = []
    h = []
    if dropout > 0:
        recurrent_input = Dropout(name='dropout_recurrent_in', train_flag=train_flag).apply(recurrent_input)
    if linear_before_recurrent_size > 0:
        linear = Linear(input_dim=2, output_dim=linear_before_recurrent_size, name='linear_befor_recurrent')
        initialize([linear])
        recurrent_input = linear.apply(recurrent_input[:, :, -2:])
        recurrent_in_size = linear_before_recurrent_size
    if single_dim_out:
        recurrent_input = T.extra_ops.repeat(recurrent_input, out_size, axis=0)
    p_components_size = components_size
    for i in range(num_recurrent_layers):
        model = layer_models[i]
        h, cells = add_layer(model, i, recurrent_in_size, hidden_size, recurrent_input, h, cells, train_flag,
                             first_layer=True if i == 0 else False)
    return output_layer(recurrent_input, h, y, recurrent_in_size, out_size, hidden_size, p_components_size) + (cells, )
def example():
    """Simple recurrent example. Taken from:
    https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb
    """
    x = tensor.tensor3('x')

    rnn = SimpleRecurrent(dim=3, activation=Identity(), weights_init=initialization.Identity())
    rnn.initialize()

    h = rnn.apply(x)
    f = theano.function([x], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=3, output_dim=3, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0))
    doubler.initialize()

    h_doubler = rnn.apply(doubler.apply(x))
    f = theano.function([x], h_doubler)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    # Initial state
    h0 = tensor.matrix('h0')
    h = rnn.apply(inputs=x, states=h0)
    f = theano.function([x, h0], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX), np.ones((1, 3), dtype=theano.config.floatX)))
class LinearActivation(Initializable, Feedforward):
    """Base class that adds documentation and has all the logic."""
    @lazy(allocation=['input_dim', 'output_dim'])
    def __init__(self, input_dim, output_dim, activation, **kwargs):
        super(LinearActivation, self).__init__(**kwargs)
        self.linear = Linear()
        self.activation = activation
        self.children = [self.linear, self.activation]

        self.input_dim = input_dim
        self.output_dim = output_dim

    @property
    def input_dim(self):
        return self.linear.input_dim

    @input_dim.setter
    def input_dim(self, value):
        self.linear.input_dim = value

    @property
    def output_dim(self):
        return self.linear.output_dim

    @output_dim.setter
    def output_dim(self, value):
        self.linear.output_dim = value

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        pre_activation = self.linear.apply(input_)
        output = self.activation.apply(pre_activation)
        return output
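# Hedged example for the LinearActivation base brick above: a concrete
# subclass pairing Linear with Tanh, plus minimal usage. The subclass name,
# dimensions, and initializations are illustrative assumptions and not part
# of the original code; the brick's own imports are assumed to be in scope.
import theano
from theano import tensor
from blocks.bricks import Tanh
from blocks.initialization import IsotropicGaussian, Constant


class LinearTanh(LinearActivation):
    def __init__(self, **kwargs):
        super(LinearTanh, self).__init__(activation=Tanh(), **kwargs)


x = tensor.matrix('x')
brick = LinearTanh(input_dim=4, output_dim=2, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
brick.initialize()
f = theano.function([x], brick.apply(x))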
class Encoder(Initializable):
    def __init__(self, dimension, input_size, rnn_type=None, embed_input=False, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        if rnn_type is None:
            rnn_type = SimpleRecurrent
        if embed_input:
            self.embedder = LookupTable(input_size, dimension)
        else:
            self.embedder = Linear(input_size, dimension)
        encoder = Bidirectional(rnn_type(dim=dimension, activation=Tanh()))
        fork = Fork([name for name in encoder.prototype.apply.sequences if name != 'mask'])
        fork.input_dim = dimension
        fork.output_dims = [dimension for _ in fork.input_names]
        self.fork = fork
        self.encoder = encoder
        self.children = [fork, encoder, self.embedder]

    @application
    def apply(self, input_, input_mask):
        input_ = self.embedder.apply(input_)
        return self.encoder.apply(**dict_union(self.fork.apply(input_, as_dict=True), mask=input_mask))
def rnn_layer(in_dim, h, h_dim, n):
    linear = Linear(input_dim=in_dim, output_dim=h_dim, name='linear' + str(n) + h.name)
    rnn = SimpleRecurrent(dim=h_dim, name='rnn' + str(n))
    initialize([linear, rnn])
    return rnn.apply(linear.apply(h))
def bilstm_layer(in_dim, inp, h_dim, n, pref=""):
    linear = Linear(input_dim=in_dim, output_dim=h_dim * 4, name='linear' + str(n) + pref)
    lstm = LSTM(dim=h_dim, name='lstm' + str(n) + pref)
    bilstm = Bidirectional(prototype=lstm)
    bilstm.name = 'bilstm' + str(n) + pref
    initialize([linear, bilstm])
    return bilstm.apply(linear.apply(inp))[0]
class Representer(Initializable):
    def __init__(self, representation_mlp, **kwargs):
        super(Representer, self).__init__(name="representer", **kwargs)
        self.representation_mlp = representation_mlp
        self.r_trafo = Linear(name=representation_mlp.name + '_trafo',
                              input_dim=representation_mlp.output_dim, output_dim=representation_mlp.output_dim,
                              weights_init=self.weights_init, biases_init=self.biases_init, use_bias=True)
        self.children = [self.representation_mlp, self.r_trafo]

    def get_dim(self, name):
        if name == 'input':
            return self.representation_mlp.input_dim
        elif name == 'output':
            return self.representation_mlp.output_dim
        else:
            raise ValueError

    @application(inputs=['r'], outputs=['l_repr'])
    def apply(self, r):
        i_repr = self.representation_mlp.apply(r)
        l_repr = self.r_trafo.apply(i_repr)
        return l_repr
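# Hedged usage sketch for the Representer brick above: wires it to a small
# Blocks MLP, mirroring how MLPs are built elsewhere in this collection. The
# dimensions, the Tanh activation, and the initializations are illustrative
# assumptions; the brick's own imports are assumed to be in scope.
import theano
from theano import tensor
from blocks.bricks import MLP, Tanh
from blocks.initialization import IsotropicGaussian, Constant

r = tensor.matrix('r')
mlp = MLP(activations=[Tanh()], dims=[12, 6], name='representation_mlp')
representer = Representer(mlp, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
representer.initialize()
l_repr = representer.apply(r)  # (batch, 6) representation
f = theano.function([r], l_repr)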
class Locater(Initializable):
    def __init__(self, location_mlp, **kwargs):
        super(Locater, self).__init__(name="locater", **kwargs)
        self.location_mlp = location_mlp
        self.l_trafo = Linear(name=location_mlp.name + '_trafo',
                              input_dim=location_mlp.output_dim, output_dim=location_mlp.output_dim,
                              weights_init=self.weights_init, biases_init=self.biases_init, use_bias=True)
        self.children = [self.location_mlp, self.l_trafo]

    def get_dim(self, name):
        if name == 'input':
            return self.location_mlp.input_dim
        elif name == 'output':
            return self.location_mlp.output_dim
        else:
            raise ValueError

    @application(inputs=['l'], outputs=['l_loc'])
    def apply(self, l):
        i_loc = self.location_mlp.apply(l)
        l_loc = self.l_trafo.apply(i_loc)
        return l_loc
class Locator(Initializable):
    def __init__(self, input_dim, n_spatial_dims, area_transform,
                 weights_init, biases_init, location_std, scale_std, **kwargs):
        super(Locator, self).__init__(**kwargs)
        self.n_spatial_dims = n_spatial_dims
        self.area_transform = area_transform

        self.locationscale = Linear(
            input_dim=area_transform.brick.output_dim,
            output_dim=2 * n_spatial_dims,
            # these are huge reductions in dimensionality, so use
            # normalized initialization to avoid huge values.
            weights_init=NormalizedInitialization(IsotropicGaussian(std=1e-3)),
            biases_init=Constant(0),
            name="locationscale")

        self.T_rng = theano.sandbox.rng_mrg.MRG_RandomStreams(12345)
        self.location_std = location_std
        self.scale_std = scale_std

        self.children = [self.area_transform.brick, self.locationscale]

    @application(inputs=['h'], outputs=['location', 'scale'])
    def apply(self, h):
        area = self.area_transform(h)
        locationscale = self.locationscale.apply(area)
        location, scale = (locationscale[:, :self.n_spatial_dims], locationscale[:, self.n_spatial_dims:])
        location += self.T_rng.normal(location.shape, std=self.location_std)
        scale += self.T_rng.normal(scale.shape, std=self.scale_std)
        return location, scale
class ShallowEnergyComputer(Initializable, Feedforward):
    """A simple energy computer: first tanh, then weighted sum."""
    @lazy()
    def __init__(self, **kwargs):
        super(ShallowEnergyComputer, self).__init__(**kwargs)
        self.tanh = Tanh()
        self.linear = Linear(use_bias=False)
        self.children = [self.tanh, self.linear]

    @application
    def apply(self, *args):
        output = args
        output = self.tanh.apply(*pack(output))
        output = self.linear.apply(*pack(output))
        return output

    @property
    def input_dim(self):
        return self.children[1].input_dim

    @input_dim.setter
    def input_dim(self, value):
        self.children[1].input_dim = value

    @property
    def output_dim(self):
        return self.children[1].output_dim

    @output_dim.setter
    def output_dim(self, value):
        self.children[1].output_dim = value
def softmax_layer(h, y, x_mask, y_mask, lens, vocab_size, hidden_size, boosting):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size, output_dim=vocab_size)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    softmax = NDimensionalSoftmax()
    # y_hat = softmax.apply(linear_output, extra_ndim=1)
    # y_hat.name = 'y_hat'
    cost_a = softmax.categorical_cross_entropy(y, linear_output, extra_ndim=1)  # produces correct average
    cost_a = cost_a * y_mask
    if boosting:
        # boosting step, must divide by length here
        lensMat = T.tile(lens, (y.shape[0], 1))
        cost_a = cost_a / lensMat
    # only count cost of correctly masked entries
    cost = cost_a.sum() / y_mask.sum()
    cost.name = 'cost'
    return (linear_output, cost)
def lstm_layer(in_dim, h, h_dim, n, pref=""):
    linear = Linear(input_dim=in_dim, output_dim=h_dim * 4, name='linear' + str(n) + pref)
    lstm = LSTM(dim=h_dim, name='lstm' + str(n) + pref)
    initialize([linear, lstm])
    return lstm.apply(linear.apply(h))[0]
def softmax_layer(h, y, hidden_size, num_targets, cost_fn='cross'):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size, output_dim=num_targets)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    y_pred = T.argmax(linear_output, axis=1)
    label_of_predicted = debug_print(y[T.arange(y.shape[0]), y_pred], 'label_of_predicted', False)
    pat1 = T.mean(label_of_predicted)
    updates = None
    if 'ranking' in cost_fn:
        cost, updates = ranking_loss(linear_output, y)
        print 'using ranking loss function!'
    else:
        y_hat = Logistic().apply(linear_output)
        y_hat.name = 'y_hat'
        cost = cross_entropy_loss(y_hat, y)
    cost.name = 'cost'
    pat1.name = 'precision@1'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(linear_output, 0.5))
    misclassify_rate.name = 'error_rate'
    return cost, pat1, updates, misclassify_rate
def generation(z_list, n_latent, hu_decoder, n_out, y):
    logger.info('in generation: n_latent: %d, hu_decoder: %d', n_latent, hu_decoder)
    if hu_decoder == 0:
        return generation_simple(z_list, n_latent, n_out, y)
    mlp1 = MLP(activations=[Rectifier()], dims=[n_latent, hu_decoder], name='latent_to_hidDecoder')
    initialize([mlp1])
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=hu_decoder, output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for i, z in enumerate(z_list):
        y_hat = mysigmoid.apply(hid_to_out.apply(mlp1.apply(z)))  # reconstructed x
        agg_logpy_xz += cross_entropy_loss(y_hat, y)
        agg_y_hat += y_hat
    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
def bilstm_layer(in_dim, inp, h_dim, n):
    linear = Linear(input_dim=in_dim, output_dim=h_dim * 4, name='linear' + str(n) + inp.name)
    lstm = LSTM(dim=h_dim, name='lstm' + str(n) + inp.name)
    bilstm = Bidirectional(prototype=lstm)
    bilstm.name = 'bilstm' + str(n) + inp.name
    initialize([linear, bilstm])
    return bilstm.apply(linear.apply(inp))[0]
class Embedder(Initializable):
    """Linear Embedding Brick.

    Parameters
    ----------
    dim_in: :class:`int`
        Dimensionality of the input
    dim_out: :class:`int`
        Dimensionality of the output
    output_type: :class:`str`
        'fc' for fully connected, 'conv' for convolutional
    """
    def __init__(self, dim_in, dim_out, output_type='fc', **kwargs):
        self.dim_in = dim_in
        self.dim_out = dim_out
        self.output_type = output_type
        self.linear = Linear(dim_in, dim_out, name='embed_layer')
        children = [self.linear]
        kwargs.setdefault('children', []).extend(children)
        super(Embedder, self).__init__(**kwargs)

    @application(inputs=['y'], outputs=['outputs'])
    def apply(self, y):
        embedding = self.linear.apply(y)
        if self.output_type == 'fc':
            return embedding
        if self.output_type == 'conv':
            return embedding.reshape((-1, embedding.shape[-1], 1, 1))

    def get_dim(self, name):
        if self.output_type == 'fc':
            return self.linear.get_dim(name)
        if self.output_type == 'conv':
            return (self.linear.get_dim(name), 1, 1)
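# Hedged usage sketch for the Embedder brick above: embeds a batch of one-hot
# label vectors as flat ('fc') features. The dimensions and initializations
# are illustrative assumptions, not from the original source; the brick's own
# imports are assumed to be in scope.
import numpy
import theano
from theano import tensor
from blocks.initialization import IsotropicGaussian, Constant

y = tensor.matrix('y')
embedder = Embedder(10, 64, output_type='fc', weights_init=IsotropicGaussian(0.02), biases_init=Constant(0))
embedder.initialize()
embedding = embedder.apply(y)
f = theano.function([y], embedding)
# one-hot labels (10, 10) -> embeddings (10, 64); 'conv' would reshape to (10, 64, 1, 1)
print(f(numpy.eye(10, dtype=theano.config.floatX)).shape)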
class Qsampler(Qlinear, Random):
    """Brick to handle the intermediate layer of a variational autoencoder.

    The intermediate layer predicts the mean and std of each of its
    dimensions and then samples from a normal distribution.
    """
    # Special brick to handle Variational Autoencoder statistical sampling
    def __init__(self, input_dim, output_dim, **kwargs):
        super(Qsampler, self).__init__(input_dim, output_dim, **kwargs)

        self.prior_mean = 0.
        self.prior_log_sigma = 0.

        self.log_sigma_transform = Linear(name=self.name + '_log_sigma',
                                          input_dim=input_dim, output_dim=output_dim,
                                          weights_init=self.weights_init, biases_init=self.biases_init, use_bias=True)
        self.children.append(self.log_sigma_transform)

    @application(inputs=['x'], outputs=['z', 'kl_term'])
    def sample(self, x):
        """Return samples and the corresponding KL term.

        Parameters
        ----------
        x :

        Returns
        -------
        z : tensor.matrix
            Samples drawn from Q(z|x)
        kl : tensor.vector
            KL(Q(z|x) || P_z)
        """
        mean = self.mean_transform.apply(x)
        log_sigma = self.log_sigma_transform.apply(x)

        batch_size = x.shape[0]
        dim_z = self.get_dim('output')

        # Sample from a zero-mean, unit-std Gaussian
        u = self.theano_rng.normal(size=(batch_size, dim_z), avg=0., std=1.)
        z = mean + tensor.exp(log_sigma) * u

        # Calculate KL
        kl = (
            self.prior_log_sigma - log_sigma
            + 0.5 * (tensor.exp(2 * log_sigma) + (mean - self.prior_mean) ** 2) / tensor.exp(2 * self.prior_log_sigma)
            - 0.5
        ).sum(axis=-1)

        return z, kl
def test_variable_filter_applications_error():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name="linear1")
    x = tensor.vector()
    h1 = brick1.apply(x)
    cg = ComputationGraph(h1)
    VariableFilter(applications=brick1.apply)(cg.variables)
def decoder_network(latent_sample, latent_dim=J):
    # bernoulli case
    hidden2 = get_typical_layer(latent_sample, latent_dim, 500, Logistic())
    hidden2_to_output = Linear(name="last", input_dim=500, output_dim=784)
    hidden2_to_output.weights_init = IsotropicGaussian(0.01)
    hidden2_to_output.biases_init = Constant(0)
    hidden2_to_output.initialize()
    return Logistic().apply(hidden2_to_output.apply(hidden2))
class Encoder(Initializable):
    def __init__(self, image_feature_dim, embedding_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)

        self.image_embedding = Linear(
            input_dim=image_feature_dim
            , output_dim=embedding_dim
            # , weights_init=IsotropicGaussian(0.02)
            # , biases_init=Constant(0.)
            , name="image_embedding"
            )

        self.to_inputs = Linear(
            input_dim=embedding_dim
            , output_dim=embedding_dim * 4  # gate_inputs = vstack(input, forget, cell, hidden)
            # , weights_init=IsotropicGaussian(0.02)
            # , biases_init=Constant(0.)
            , name="to_inputs"
            )

        # Don't think this dim has to also be dimension, more arbitrary
        self.transition = LSTM(dim=embedding_dim, name="transition")

        self.children = [self.image_embedding, self.to_inputs, self.transition]

    @application(inputs=['image_vects', 'word_vects'], outputs=['image_embedding', 'sentence_embedding'])
    def apply(self, image_vects, word_vects):
        image_embedding = self.image_embedding.apply(image_vects)

        # inputs = word_vects
        inputs = self.to_inputs.apply(word_vects)
        inputs = inputs.dimshuffle(1, 0, 2)
        hidden, cells = self.transition.apply(inputs=inputs, mask=None)

        # the last hidden state represents the accumulation of all the words (i.e. the sentence)
        # grab all batches, grab the last value representing accumulation of the sequence, grab all features
        sentence_embedding = hidden[-1]
        # sentence_embedding = inputs.mean(axis=0)

        return image_embedding, sentence_embedding
class Encoder(Initializable):
    def __init__(self, image_feature_dim, embedding_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)

        self.image_embedding = Linear(input_dim=image_feature_dim, output_dim=embedding_dim, name="image_embedding")
        # times 4 cuz vstack(input, forget, cell, hidden)
        self.to_inputs = Linear(input_dim=embedding_dim, output_dim=embedding_dim * 4, name="to_inputs")
        self.transition = LSTM(dim=embedding_dim, name="transition")

        self.children = [self.image_embedding, self.to_inputs, self.transition]

    @application(inputs=['image_vects', 'word_vects'], outputs=['image_embedding', 'sentence_embedding'])
    def apply(self, image_vects, word_vects):
        image_embedding = self.image_embedding.apply(image_vects)

        inputs = self.to_inputs.apply(word_vects)
        # shuffle dimensions to correspond to (sequence, batch, features)
        inputs = inputs.dimshuffle(1, 0, 2)
        hidden, cells = self.transition.apply(inputs=inputs, mask=None)

        # last hidden state represents the accumulation of word embeddings
        # (i.e. the sentence embedding)
        sentence_embedding = hidden[-1]

        return image_embedding, sentence_embedding
def test_linear():
    x = tensor.matrix()

    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2), biases_init=Constant(1))
    y = linear.apply(x)
    linear.initialize()
    x_val = numpy.ones((4, 16), dtype=theano.config.floatX)
    assert_allclose(y.eval({x: x_val}), x_val.dot(2 * numpy.ones((16, 8))) + numpy.ones((4, 8)))

    linear = Linear(input_dim=16, output_dim=8, weights_init=Constant(2), use_bias=False)
    y = linear.apply(x)
    linear.initialize()
    x_val = numpy.ones((4, 16), dtype=theano.config.floatX)
    assert_allclose(y.eval({x: x_val}), x_val.dot(2 * numpy.ones((16, 8))))
def create_rnn(hidden_dim, vocab_dim, mode="rnn"):
    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    W = LookupTable(
        name="W1",
        #dim = hidden_dim*4,
        dim=hidden_dim,
        length=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(
            hidden_dim,
            name='H',
            weights_init=initialization.IsotropicGaussian(0.01),
            biases_init=initialization.Constant(0.0))
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name="H",
            dim=hidden_dim,
            activation=Tanh(),
            weights_init=initialization.IsotropicGaussian(0.01))
    S = Linear(
        name="W2",
        input_dim=hidden_dim,
        output_dim=vocab_dim,
        weights_init=initialization.IsotropicGaussian(0.01),
        biases_init=initialization.Constant(0))
    A = NDimensionalSoftmax(name="softmax")

    initLayers([W, H, S])

    activations = W.apply(x)
    hiddens = H.apply(activations)  #[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

    layers = (x, W, H, S, A, y)

    return cg, layers, y_hat, cost
def test_variable_filter_roles_error():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name="linear1")
    x = tensor.vector()
    h1 = brick1.apply(x)
    cg = ComputationGraph(h1)
    # testing role error
    VariableFilter(roles=PARAMETER)(cg.variables)
def MSEloss_layer(h, y, frame_length, hidden_size):
    hidden_to_output = Linear(name="hidden_to_output", input_dim=hidden_size, output_dim=frame_length)
    initialize([hidden_to_output])
    y_hat = hidden_to_output.apply(h)
    y_hat.name = "y_hat"
    cost = squared_error(y_hat, y).mean()
    cost.name = "cost"
    # import ipdb; ipdb.set_trace()
    return y_hat, cost
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Sigmoid(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.params[0]]
    bias = [brick1.b, brick2.params[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)
def add_lstm(input_dim, input_var):
    linear = Linear(input_dim=input_dim, output_dim=input_dim * 4, name="linear_layer")
    lstm = LSTM(dim=input_dim, name="lstm_layer")

    testing_init(linear)
    #linear.initialize()
    default_init(lstm)

    h = linear.apply(input_var)
    return lstm.apply(h)