def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim):
    self.W = Linear(input_dim=feature_dim,
                    output_dim=memory_dim * 4,
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0),
                    use_bias=False,
                    name='seqDecoder_W')
    # Despite the GRU_* names, both recurrent bricks are LSTMs; W provides
    # the 4-way gate pre-activations (memory_dim * 4) expected by an LSTM
    # of dimension memory_dim.
    self.GRU_A = LSTM(feature_dim,
                      name='seqDecoder_A',
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0))
    self.GRU_B = LSTM(memory_dim,
                      name='seqDecoder_B',
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0))
    self.W.initialize()
    self.GRU_A.initialize()
    self.GRU_B.initialize()
    self.fc1 = Linear(input_dim=memory_dim,
                      output_dim=fc1_dim,
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0),
                      name='fc1')
    self.fc2 = Linear(input_dim=fc1_dim,
                      output_dim=fc2_dim,
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0),
                      name='fc2')
    self.fc1.initialize()
    self.fc2.initialize()
def __init__(self, embedding_dim, state_dim, **kwargs):
    super(BidirectionalEncoder, self).__init__(**kwargs)
    # Dimension of the word embeddings taken as input
    self.embedding_dim = embedding_dim
    # Hidden state dimension
    self.state_dim = state_dim
    # The bidirectional GRU
    self.bidir = BidirectionalFromDict(
        GatedRecurrent(activation=Tanh(), dim=state_dim))
    # Forks to administer the inputs of the GRU gates
    self.fwd_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='fwd_fork')
    self.back_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='back_fork')
    self.children = [self.bidir, self.fwd_fork, self.back_fork]
def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
    super(BidirectionalEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    self.lookup = LookupTable(name='embeddings')
    self.bidir = NewBidirectional(
        GatedRecurrent(activation=Tanh(), dim=state_dim))
    self.fwd_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='fwd_fork')
    self.back_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='back_fork')
    self.children = [self.lookup, self.bidir,
                     self.fwd_fork, self.back_fork]
def __init__(self, vocab_size, embedding_dim, state_dim,
             representation_dim, theano_seed=None, **kwargs):
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state.
    self.transition = GRUInitialState(attended_dim=state_dim,
                                      dim=state_dim,
                                      activation=Tanh(),
                                      name='decoder')

    # Initialize the attention mechanism.
    self.attention = SequenceContentAttention2(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")

    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=NewSoftmaxEmitter(initial_output=-1,
                                  theano_seed=theano_seed),
        feedback_brick=NewLookupFeedback(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence(
            [Bias(dim=state_dim, name='maxout_bias').apply,
             Maxout(num_pieces=2, name='maxout').apply,
             # Maxout with two pieces halves the dimension, hence the
             # integer division.
             Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                    use_bias=False, name='softmax0').apply,
             Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim)

    # Build sequence generator accordingly.
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()),
        cost_type='categorical_cross_entropy')

    self.children = [self.sequence_generator]
def __init__(self, input_dim, output_dim, channels, width, height, N,
             **kwargs):
    super(AttentionWriter, self).__init__(name="writer", **kwargs)
    self.channels = channels
    self.img_width = width
    self.img_height = height
    self.N = N
    self.input_dim = input_dim
    self.output_dim = output_dim

    assert output_dim == channels * width * height

    self.zoomer = ZoomableAttentionWindow(channels, height, width, N)
    self.z_trafo = Linear(name=self.name + '_ztrafo',
                          input_dim=input_dim,
                          output_dim=5,
                          weights_init=self.weights_init,
                          biases_init=self.biases_init,
                          use_bias=True)
    self.w_trafo = Linear(name=self.name + '_wtrafo',
                          input_dim=input_dim,
                          output_dim=channels * N * N,
                          weights_init=self.weights_init,
                          biases_init=self.biases_init,
                          use_bias=True)
    self.children = [self.z_trafo, self.w_trafo]
def __init__(self, input_dim, output_activation=None,
             transform_activation=None, **kwargs):
    super(Highway, self).__init__(**kwargs)
    self.input_dim = input_dim
    self.output_dim = input_dim

    if output_activation is None:
        output_activation = Rectifier()
    if transform_activation is None:
        transform_activation = Logistic()

    self._linear_h = Linear(name="linear_h",
                            input_dim=input_dim,
                            output_dim=input_dim)
    self._linear_t = Linear(name="linear_t",
                            input_dim=input_dim,
                            output_dim=input_dim)
    self._output_activation = output_activation
    self._transform_activation = transform_activation
    self.children = [self._linear_h, self._linear_t,
                     self._output_activation,
                     self._transform_activation]
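# Sketch only: the two Linear bricks above are typically combined as the
# standard highway mix y = H(x) * T(x) + x * (1 - T(x)). The subclass name
# and the apply method below are illustrative assumptions, not the original
# implementation.
from blocks.bricks.base import application

class HighwayWithApply(Highway):
    """Hypothetical subclass showing how the bricks above could be wired."""

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        h = self._output_activation.apply(self._linear_h.apply(input_))
        t = self._transform_activation.apply(self._linear_t.apply(input_))
        # Carry the input through wherever the transform gate is closed.
        return h * t + input_ * (1 - t)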
def __init__(self, n_att_weights, match_dim, state_transformer=None,
             attended_transformer=None, energy_computer=None, **kwargs):
    super(SequenceContentAttention, self).__init__(**kwargs)
    self.n_att_weights = n_att_weights
    if not state_transformer:
        state_transformer = Linear(use_bias=False)
    self.match_dim = match_dim
    self.state_transformer = state_transformer

    self.state_transformers = Parallel(input_names=self.state_names,
                                       prototype=state_transformer,
                                       name="state_trans")
    if not attended_transformer:
        attended_transformer = Linear(name="preprocess")
    if not energy_computer:
        energy_computer = MultiShallowEnergyComputer(n_att_weights,
                                                     name="energy_comp")
    self.attended_transformer = attended_transformer
    self.energy_computer = energy_computer

    self.children = [self.state_transformers, attended_transformer,
                     energy_computer]
def test_defaults_sequence2():
    seq = DefaultsSequence(input_dim=(3, 4, 4), lists=[
        Convolutional(num_filters=10, stride=(2, 2), filter_size=(3, 3)),
        BatchNormalization(),
        Rectifier(),
        Flattener(),
        Linear(output_dim=10),
        BatchNormalization(),
        Rectifier(),
        Linear(output_dim=12),
        BatchNormalization(),
        Rectifier()
    ])
    seq.weights_init = Constant(1.0)
    seq.biases_init = Constant(0.0)
    seq.push_allocation_config()
    seq.push_initialization_config()
    seq.initialize()

    x = T.tensor4('input')
    y = seq.apply(x)
    func_ = theano.function([x], [y])

    x_val = np.ones((1, 3, 4, 4), dtype=theano.config.floatX)
    res = func_(x_val)[0]
    assert_allclose(res.shape, (1, 12))
def create_model(self):
    input_dim = self.input_dim
    x = self.x
    x_to_h = Linear(input_dim, input_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(input_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(input_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)

    self.x_to_h = x_to_h
    self.lstm = lstm
    self.h_to_o = h_to_o

    h, c = lstm.apply(x_transform)
    # Only the hidden state of the last time step is used for
    # the classification.
    probs = h_to_o.apply(h[-1])
    return probs
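# Illustrative follow-up for create_model above: turn the returned
# pre-activation into a binary cross-entropy cost. `model` stands for an
# instance of the surrounding class; the Logistic / BinaryCrossEntropy
# choice and the variable names are assumptions, not the original
# training code.
from theano import tensor
from blocks.bricks import Logistic
from blocks.bricks.cost import BinaryCrossEntropy

probs = model.create_model()
# The bricks above set *_init but are never initialized; do it before use.
for brick in (model.x_to_h, model.lstm, model.h_to_o):
    brick.initialize()
y = tensor.matrix('targets')
cost = BinaryCrossEntropy().apply(y, Logistic().apply(probs))
cost.name = 'cost'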
def __init__(self, dim, mini_dim, summary_dim, **kwargs):
    super(LSTMwMini, self).__init__(**kwargs)
    self.dim = dim
    self.mini_dim = mini_dim
    self.summary_dim = summary_dim

    self.recurrent_layer = LSTM(dim=self.summary_dim,
                                activation=Rectifier(),
                                name='recurrent_layer',
                                weights_init=IsotropicGaussian(),
                                biases_init=Constant(0.0))
    self.mini_recurrent_layer = LSTM(dim=self.mini_dim,
                                     activation=Rectifier(),
                                     name='mini_recurrent_layer',
                                     weights_init=IsotropicGaussian(),
                                     biases_init=Constant(0.0))
    self.mini_to_main = Linear(self.dim + self.mini_dim,
                               self.summary_dim,
                               name='mini_to_main',
                               weights_init=IsotropicGaussian(),
                               biases_init=Constant(0.0))
    self.mini_to_main2 = Linear(self.summary_dim,
                                self.summary_dim * 4,
                                name='mini_to_main2',
                                weights_init=IsotropicGaussian(),
                                biases_init=Constant(0.0))
    self.children = [self.recurrent_layer, self.mini_recurrent_layer,
                     self.mini_to_main, self.mini_to_main2]
def __init__(self, networks, dims, **kwargs):
    super(DropMultiLayerEncoder, self).__init__(**kwargs)
    self.dims = dims
    self.networks = networks
    self.use_bias = True
    self.hid_linear_trans_forw = [
        Fork([name for name in networks[i].prototype.apply.sequences
              if name not in ['mask', 'drops_states', 'drops_cells',
                              'drops_igates']],
             name='fork_forw_{}'.format(i),
             prototype=Linear(), **kwargs)
        for i in range(len(networks))]
    self.hid_linear_trans_back = [
        Fork([name for name in networks[i].prototype.apply.sequences
              if name not in ['mask', 'drops_states', 'drops_cells',
                              'drops_igates']],
             name='fork_back_{}'.format(i),
             prototype=Linear(), **kwargs)
        for i in range(len(networks))]
    self.out_linear_trans = Linear(name='out_linear', **kwargs)
    self.children = (networks + self.hid_linear_trans_forw +
                     self.hid_linear_trans_back + [self.out_linear_trans])
    self.num_layers = len(networks)
def make_bidir_lstm_stack(seq, seq_dim, mask, sizes, skip=True, name=''):
    bricks = []
    curr_dim = [seq_dim]
    curr_hidden = [seq]
    hidden_list = []
    for k, dim in enumerate(sizes):
        fwd_lstm_ins = [Linear(input_dim=d, output_dim=4 * dim,
                               name='%s_fwd_lstm_in_%d_%d' % (name, k, l))
                        for l, d in enumerate(curr_dim)]
        fwd_lstm = LSTM(dim=dim, activation=Tanh(),
                        name='%s_fwd_lstm_%d' % (name, k))

        bwd_lstm_ins = [Linear(input_dim=d, output_dim=4 * dim,
                               name='%s_bwd_lstm_in_%d_%d' % (name, k, l))
                        for l, d in enumerate(curr_dim)]
        bwd_lstm = LSTM(dim=dim, activation=Tanh(),
                        name='%s_bwd_lstm_%d' % (name, k))

        bricks = bricks + [fwd_lstm, bwd_lstm] + fwd_lstm_ins + bwd_lstm_ins

        fwd_tmp = sum(x.apply(v) for x, v in zip(fwd_lstm_ins, curr_hidden))
        bwd_tmp = sum(x.apply(v) for x, v in zip(bwd_lstm_ins, curr_hidden))
        fwd_hidden, _ = fwd_lstm.apply(fwd_tmp, mask=mask)
        bwd_hidden, _ = bwd_lstm.apply(bwd_tmp[::-1], mask=mask[::-1])
        hidden_list = hidden_list + [fwd_hidden, bwd_hidden]
        if skip:
            curr_hidden = [seq, fwd_hidden, bwd_hidden[::-1]]
            curr_dim = [seq_dim, dim, dim]
        else:
            curr_hidden = [fwd_hidden, bwd_hidden[::-1]]
            curr_dim = [dim, dim]
    return bricks, hidden_list
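# Usage sketch for make_bidir_lstm_stack, assuming Blocks/Theano
# conventions (time-major tensor3 input and a (time, batch) mask); the
# feature dimension, layer sizes and initialization scheme below are
# illustrative, not from the original.
from theano import tensor
from blocks.initialization import IsotropicGaussian, Constant

seq = tensor.tensor3('seq')    # (time, batch, features)
mask = tensor.matrix('mask')   # (time, batch)
bricks, hiddens = make_bidir_lstm_stack(seq, 100, mask,
                                        sizes=[128, 256], name='enc')
for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()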
def test_sequence_variable_inputs():
    x, y = tensor.matrix(), tensor.matrix()

    parallel_1 = Parallel(input_names=['input_1', 'input_2'],
                          input_dims=[4, 5], output_dims=[3, 2],
                          prototype=Linear(),
                          weights_init=Constant(2),
                          biases_init=Constant(1))
    parallel_2 = Parallel(input_names=['input_1', 'input_2'],
                          input_dims=[3, 2], output_dims=[5, 4],
                          prototype=Linear(),
                          weights_init=Constant(2),
                          biases_init=Constant(1))
    sequence = Sequence([parallel_1.apply, parallel_2.apply])
    sequence.initialize()
    new_x, new_y = sequence.apply(x, y)
    x_val = numpy.ones((4, 4), dtype=theano.config.floatX)
    y_val = numpy.ones((4, 5), dtype=theano.config.floatX)
    assert_allclose(
        new_x.eval({x: x_val}),
        (x_val.dot(2 * numpy.ones((4, 3))) + numpy.ones((4, 3)))
        .dot(2 * numpy.ones((3, 5))) + numpy.ones((4, 5)))
    assert_allclose(
        new_y.eval({y: y_val}),
        (y_val.dot(2 * numpy.ones((5, 2))) + numpy.ones((4, 2)))
        .dot(2 * numpy.ones((2, 4))) + numpy.ones((4, 4)))
def __init__(self, vocab_size, embedding_dim, igru_state_dim, emitter=None,
             feedback_brick=None, merge=None, merge_prototype=None,
             post_merge=None, merged_dim=None, igru=None, **kwargs):
    self.igru = igru
    self.lookup = LookupTable(name='embeddings')
    self.vocab_size = vocab_size
    self.igru_state_dim = igru_state_dim
    self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                 output_dim=vocab_size)
    self.embedding_dim = embedding_dim
    self.gru_fork = Fork(
        [name for name in self.igru.apply.sequences
         if name != 'mask' and name != 'input_states'],
        prototype=Linear(), name='gru_fork')

    kwargs['children'] = [self.igru, self.lookup,
                          self.gru_to_softmax, self.gru_fork]
    super(Interpolator, self).__init__(emitter=emitter,
                                       feedback_brick=feedback_brick,
                                       merge=merge,
                                       merge_prototype=merge_prototype,
                                       post_merge=post_merge,
                                       merged_dim=merged_dim,
                                       **kwargs)
def __init__(self, match_dim, state_transformer=None,
             attended_transformer=None, energy_computer=None, **kwargs):
    if not state_transformer:
        state_transformer = Linear(use_bias=False)
    self.match_dim = match_dim
    self.state_transformer = state_transformer

    self.state_transformers = Parallel(input_names=kwargs['state_names'],
                                       prototype=state_transformer,
                                       name="state_trans")
    if not attended_transformer:
        attended_transformer = Linear(name="preprocess")
    if not energy_computer:
        energy_computer = ShallowEnergyComputer(name="energy_comp")
    self.attended_transformer = attended_transformer
    self.energy_computer = energy_computer

    children = [self.state_transformers, attended_transformer,
                energy_computer]
    kwargs.setdefault('children', []).extend(children)
    super(SequenceContentAttention, self).__init__(**kwargs)
def test_rng():
    linear = Linear()
    assert isinstance(linear.rng, numpy.random.RandomState)
    linear = Linear(seed=1)
    assert linear.rng.rand() == numpy.random.RandomState(1).rand()
    linear = Linear()
    linear2 = Linear()
    assert linear.seed != linear2.seed
def construct_model(activation_function, r_dim, hidden_dim, out_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    nx = x.shape[0]
    nj = x.shape[1]  # also is r.shape[0]
    nr = r.shape[1]

    # r is nj x nr
    # x is nx x nj
    # y is nx

    # Get a representation of r of size r_dim
    r = DAE(r)
    # r is now nj x r_dim

    # r_rep is nx x nj x r_dim
    r_rep = r[None, :, :].repeat(axis=0, repeats=nx)
    # x3 is nx x nj x 1
    x3 = x[:, :, None]

    # concat is nx x nj x (r_dim + 1)
    concat = tensor.concatenate([r_rep, x3], axis=2)

    # Change concat from Batch x Time x Features to Time x Batch x Features
    rnn_input = concat.dimshuffle(1, 0, 2)

    linear = Linear(input_dim=r_dim + 1, output_dim=4 * hidden_dim,
                    name="input_linear")
    lstm = LSTM(dim=hidden_dim, activation=activation_function,
                name="hidden_recurrent")
    top_linear = Linear(input_dim=hidden_dim, output_dim=out_dim,
                        name="out_linear")

    pre_rnn = linear.apply(rnn_input)
    states = lstm.apply(pre_rnn)[0]
    activations = top_linear.apply(states)
    activations = tensor.mean(activations, axis=0)

    cost = Softmax().categorical_cross_entropy(y, activations)

    pred = activations.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in (linear, lstm, top_linear):
        brick.weights_init = IsotropicGaussian(0.1)
        brick.biases_init = Constant(0.)
        brick.initialize()

    return cost, error_rate
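# Training-setup sketch for the graph returned by construct_model above,
# assuming Blocks' ComputationGraph and GradientDescent; the Tanh
# activation, dimensions and step rule are illustrative assumptions.
from blocks.algorithms import GradientDescent, Scale
from blocks.bricks import Tanh
from blocks.graph import ComputationGraph

cost, error_rate = construct_model(Tanh(), r_dim=32,
                                   hidden_dim=64, out_dim=10)
cg = ComputationGraph(cost)
# Older Blocks releases spell the keyword `params` instead of `parameters`.
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))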
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim,
             topical_dim, theano_seed=None, **kwargs):
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed
    # self.topical_dim = topical_dim

    # Initialize gru with special initial state
    self.transition = GRUInitialState(attended_dim=state_dim,
                                      dim=state_dim,
                                      activation=Tanh(),
                                      name='decoder')

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")
    # Not sure whether this match dim is correct.
    self.topical_attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=topical_dim,
        match_dim=state_dim,
        name="topical_attention")

    # Initialize the readout; note that SoftmaxEmitter emits -1 for
    # initial outputs, which is used by LookupFeedbackWMT15
    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],  # check!
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence(
            [Bias(dim=state_dim, name='maxout_bias').apply,
             Maxout(num_pieces=2, name='maxout').apply,
             # Maxout with two pieces halves the dimension, hence the
             # integer division.
             Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                    use_bias=False, name='softmax0').apply,
             Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim)

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        topical_attention=self.topical_attention,
        topical_name='topical_embeddingq',
        content_name='content_embedding',
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()))

    self.children = [self.sequence_generator]
def __init__(self, vocab_size, embedding_dim, igru_state_dim, igru_depth,
             trg_dgru_depth, emitter, feedback_brick, merge=None,
             merge_prototype=None, post_merge=None, **kwargs):
    merged_dim = igru_state_dim
    if not merge:
        merge = Merge(input_names=kwargs['source_names'],
                      prototype=merge_prototype)
    if not post_merge:
        post_merge = Bias(dim=merged_dim)
    # For compatibility: a single-layer IGRU does not need a RecurrentStack.
    if igru_depth == 1:
        self.igru = IGRU(dim=igru_state_dim)
    else:
        self.igru = RecurrentStack(
            [IGRU(dim=igru_state_dim, name='igru')] +
            [UpperIGRU(dim=igru_state_dim, activation=Tanh(),
                       name='upper_igru' + str(i))
             for i in range(1, igru_depth)],
            skip_connections=True)
    self.embedding_dim = embedding_dim
    self.emitter = emitter
    self.feedback_brick = feedback_brick
    self.merge = merge
    self.post_merge = post_merge
    self.merged_dim = merged_dim
    self.igru_depth = igru_depth
    self.trg_dgru_depth = trg_dgru_depth
    self.lookup = LookupTable(name='embeddings')
    self.vocab_size = vocab_size
    self.igru_state_dim = igru_state_dim
    self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                 output_dim=vocab_size)
    self.gru_fork = Fork(
        [name for name in self.igru.apply.sequences
         if name != 'mask' and name != 'input_states'],
        prototype=Linear(), name='gru_fork')

    children = [self.emitter, self.feedback_brick, self.merge,
                self.post_merge, self.igru, self.lookup,
                self.gru_to_softmax, self.gru_fork]
    kwargs.setdefault('children', []).extend(children)
    super(Interpolator, self).__init__(**kwargs)
def __init__(self, k=20, rec_h_dim=400, att_size=10, num_letters=68,
             sampling_bias=0., attention_type="graves", epsilon=1e-6,
             attention_alignment=1., **kwargs):
    super(Scribe, self).__init__(**kwargs)

    # For now only softmax and graves are supported.
    assert attention_type in ["graves", "softmax"]

    readouts_dim = 1 + 6 * k

    self.k = k
    self.rec_h_dim = rec_h_dim
    self.att_size = att_size
    self.num_letters = num_letters
    self.sampling_bias = sampling_bias
    self.attention_type = attention_type
    self.epsilon = epsilon
    self.attention_alignment = attention_alignment

    self.cell1 = GatedRecurrent(dim=rec_h_dim, name='cell1')

    self.inp_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                          input_dim=3,
                          output_dims=[rec_h_dim, 2 * rec_h_dim],
                          name='inp_to_h1')
    self.h1_to_readout = Linear(input_dim=rec_h_dim,
                                output_dim=readouts_dim,
                                name='h1_to_readout')
    self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                          input_dim=rec_h_dim,
                          output_dims=[att_size] * 3,
                          name='h1_to_att')
    self.att_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                          input_dim=num_letters,
                          output_dims=[rec_h_dim, 2 * rec_h_dim],
                          name='att_to_h1')
    self.att_to_readout = Linear(input_dim=num_letters,
                                 output_dim=readouts_dim,
                                 name='att_to_readout')
    self.emitter = BivariateGMMEmitter(k=k, sampling_bias=sampling_bias)

    self.children = [self.cell1, self.inp_to_h1, self.h1_to_readout,
                     self.h1_to_att, self.att_to_h1, self.att_to_readout,
                     self.emitter]
def lstm_layer(in_size, dim, x, h, n, first_layer=False):
    if connect_h_to_h == 'all-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size, output_dim=dim * 4,
                            name='linear' + str(n) + '-')
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + [hidden for hidden in h],
                                       axis=2)
            linear = Linear(input_dim=in_size + dim * n, output_dim=dim * 4,
                            name='linear' + str(n) + '-')
        else:
            lstm_input = T.concatenate([hidden for hidden in h], axis=2)
            linear = Linear(input_dim=dim * (n + 1), output_dim=dim * 4,
                            name='linear' + str(n) + '-')
    elif connect_h_to_h == 'two-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size, output_dim=dim * 4,
                            name='linear' + str(n) + '-')
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + h[max(0, n - 2):n], axis=2)
            linear = Linear(
                input_dim=in_size + dim * 2 if n > 1 else in_size + dim,
                output_dim=dim * 4,
                name='linear' + str(n) + '-')
        else:
            lstm_input = T.concatenate(h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=dim * 2 if n > 1 else dim,
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-')
    elif connect_h_to_h == 'one-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size, output_dim=dim * 4,
                            name='linear' + str(n) + '-')
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + [h[n - 1]], axis=2)
            linear = Linear(input_dim=in_size + dim, output_dim=dim * 4,
                            name='linear' + str(n) + '-')
        else:
            lstm_input = h[n - 1]
            # linear = LN_LSTM(input_dim=dim, output_dim=dim * 4,
            #                  name='linear' + str(n) + '-')
            linear = Linear(input_dim=dim, output_dim=dim * 4,
                            name='linear' + str(n) + '-')
    lstm = LN_LSTM(dim=dim,
                   name=layer_models[network_mode][n] + str(n) + '-')
    initialize([linear, lstm])
    if layer_models[network_mode][n] == 'lstm':
        return lstm.apply(linear.apply(lstm_input))
        # return lstm.apply(linear.apply(lstm_input), mask=x_mask)
    elif layer_models[network_mode][n] == 'mt_lstm':
        return lstm.apply(linear.apply(lstm_input),
                          time_scale=layer_resolutions[n],
                          time_offset=layer_execution_time_offset[n])
def rnn_layer(in_size, dim, x, h, n, first_layer=False):
    if connect_h_to_h == 'all-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size, output_dim=dim,
                            name='linear' + str(n) + '-')
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + [hidden for hidden in h], axis=2)
            linear = Linear(input_dim=in_size + dim * n, output_dim=dim,
                            name='linear' + str(n) + '-')
        else:
            rnn_input = T.concatenate([hidden for hidden in h], axis=2)
            linear = Linear(input_dim=dim * n, output_dim=dim,
                            name='linear' + str(n) + '-')
    elif connect_h_to_h == 'two-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size, output_dim=dim,
                            name='linear' + str(n) + '-')
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + h[max(0, n - 2):n], axis=2)
            linear = Linear(
                input_dim=in_size + dim * 2 if n > 1 else in_size + dim,
                output_dim=dim,
                name='linear' + str(n) + '-')
        else:
            rnn_input = T.concatenate(h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=dim * 2 if n > 1 else dim,
                            output_dim=dim,
                            name='linear' + str(n) + '-')
    elif connect_h_to_h == 'one-previous':
        if first_layer:
            rnn_input = x
            linear = Linear(input_dim=in_size, output_dim=dim,
                            name='linear' + str(n) + '-')
        elif connect_x_to_h:
            rnn_input = T.concatenate([x] + [h[n - 1]], axis=2)
            linear = Linear(input_dim=in_size + dim, output_dim=dim,
                            name='linear' + str(n) + '-')
        else:
            # Use the previous layer's hidden state; h[n] would be this
            # layer's own slot (off by one, cf. lstm_layer above).
            rnn_input = h[n - 1]
            linear = Linear(input_dim=dim, output_dim=dim,
                            name='linear' + str(n) + '-')
    rnn = SimpleRecurrent(dim=dim, activation=Tanh(),
                          name=layer_models[n] + str(n) + '-')
    initialize([linear, rnn])
    if layer_models[n] == 'rnn':
        return rnn.apply(linear.apply(rnn_input))
    elif layer_models[n] == 'mt_rnn':
        return rnn.apply(linear.apply(rnn_input),
                         time_scale=layer_resolutions[n],
                         time_offset=layer_execution_time_offset[n])
def _build_bricks(self, *args, **kwargs):
    # Build lookup tables
    self.word_embed = self._embed(len(self.dataset.word2index),
                                  self.config.word_embed_dim,
                                  name='word_embed')
    self.hashtag_embed = self._embed(len(self.dataset.hashtag2index),
                                     self.config.lstm_dim,
                                     name='hashtag_embed')
    # Build text encoder
    self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim,
                            output_dim=4 * self.config.lstm_dim,
                            name='mlstm_in')
    self.mlstm_ins.weights_init = IsotropicGaussian(
        std=numpy.sqrt(2) /
        numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
    self.mlstm_ins.biases_init = Constant(0)
    self.mlstm_ins.initialize()
    self.mlstm = MLSTM(self.config.lstm_time, self.config.lstm_dim,
                       shared=False)
    self.mlstm.weights_init = IsotropicGaussian(
        std=numpy.sqrt(2) /
        numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
    self.mlstm.biases_init = Constant(0)
    self.mlstm.initialize()
    self.hashtag2word = MLP(
        activations=[Tanh('hashtag2word_tanh')],
        dims=[self.config.lstm_dim, self.config.word_embed_dim],
        name='hashtag2word_mlp')
    self.hashtag2word.weights_init = IsotropicGaussian(
        std=1 / numpy.sqrt(self.config.word_embed_dim))
    self.hashtag2word.biases_init = Constant(0)
    self.hashtag2word.initialize()
    self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
    self.hashtag2word_bias.biases_init = Constant(0)
    self.hashtag2word_bias.initialize()
    # Build character embedding
    self.char_embed = self._embed(len(self.dataset.char2index),
                                  self.config.char_embed_dim,
                                  name='char_embed')
    # Build sparse word encoder
    self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                          output_dim=self.config.word_embed_dim,
                          name='rnn_in')
    self.rnn_ins.weights_init = IsotropicGaussian(
        std=numpy.sqrt(2) /
        numpy.sqrt(self.config.char_embed_dim + self.config.word_embed_dim))
    self.rnn_ins.biases_init = Constant(0)
    self.rnn_ins.initialize()
    self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                               activation=Tanh())
    self.rnn.weights_init = IsotropicGaussian(
        std=1 / numpy.sqrt(self.config.word_embed_dim))
    self.rnn.initialize()
def MDN_output_layer(x, h, y, in_size, out_size, hidden_size, pred):
    if connect_h_to_o:
        hiddens = T.concatenate([hidden for hidden in h], axis=2)
        hidden_out_size = hidden_size * len(h)
    else:
        hiddens = h[-1]
        hidden_out_size = hidden_size

    mu_linear = Linear(name='mu_linear' + str(pred),
                       input_dim=hidden_out_size,
                       output_dim=out_size * components_size[network_mode])
    sigma_linear = Linear(name='sigma_linear' + str(pred),
                          input_dim=hidden_out_size,
                          output_dim=components_size[network_mode])
    mixing_linear = Linear(name='mixing_linear' + str(pred),
                           input_dim=hidden_out_size,
                           output_dim=components_size[network_mode])
    initialize([mu_linear, sigma_linear, mixing_linear])

    mu = mu_linear.apply(hiddens)
    mu = mu.reshape((mu.shape[0], mu.shape[1], out_size,
                     components_size[network_mode]))
    sigma_orig = sigma_linear.apply(hiddens)
    sigma = T.nnet.softplus(sigma_orig)
    mixing_orig = mixing_linear.apply(hiddens)
    e_x = T.exp(mixing_orig - mixing_orig.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)

    # Mixture-density negative log-likelihood.
    exponent = -0.5 * T.inv(sigma) * T.sum(
        (y.dimshuffle(0, 1, 2, 'x') - mu) ** 2, axis=2)
    normalizer = (2 * np.pi * sigma)
    exponent = exponent + T.log(mixing) - (out_size * .5) * T.log(normalizer)
    # LogSumExp(x)
    max_exponent = T.max(exponent, axis=2, keepdims=True)
    mod_exponent = exponent - max_exponent
    gauss_mix = T.sum(T.exp(mod_exponent), axis=2, keepdims=True)
    log_gauss = T.log(gauss_mix) + max_exponent
    cost = -T.mean(log_gauss)

    # Sampling, with an optional bias that sharpens the mixing weights
    # and shrinks the component variances.
    srng = RandomStreams(seed=seed)
    mixing = mixing_orig * (1 + sampling_bias)
    sigma = T.nnet.softplus(sigma_orig - sampling_bias)
    e_x = T.exp(mixing - mixing.max(axis=2, keepdims=True))
    mixing = e_x / e_x.sum(axis=2, keepdims=True)
    component = srng.multinomial(pvals=mixing)
    component_mean = T.sum(mu * component.dimshuffle(0, 1, 'x', 2), axis=3)
    component_std = T.sum(sigma * component, axis=2, keepdims=True)
    linear_output = srng.normal(avg=component_mean, std=component_std)
    linear_output.name = 'linear_output'
    return linear_output, cost
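# For reference, the cost above is the mixture-density negative
# log-likelihood computed with a LogSumExp trick. A small numpy sketch of
# the same per-step computation (shapes and names are illustrative, not
# from the original).
import numpy as np

def mdn_nll(y, mu, sigma, mixing):
    """y: (out,), mu: (out, k), sigma: (k,), mixing: (k,) summing to 1."""
    # Per-component log density, matching the Theano graph above.
    exponent = (-0.5 / sigma * np.sum((y[:, None] - mu) ** 2, axis=0)
                + np.log(mixing)
                - (y.shape[0] * 0.5) * np.log(2 * np.pi * sigma))
    # LogSumExp for numerical stability.
    m = exponent.max()
    return -(np.log(np.exp(exponent - m).sum()) + m)

# Example: two components in one output dimension.
print(mdn_nll(np.array([0.5]),
              np.array([[0.0, 1.0]]),
              np.array([0.25, 0.25]),
              np.array([0.5, 0.5])))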
def __init__(self, emb_dim, dim, dropout=0.0, def_word_gating="none",
             dropout_type="per_unit", compose_type="sum",
             word_dropout_weighting="no_weighting",
             shortcut_unk_and_excluded=False, num_input_words=-1,
             exclude_top_k=-1, vocab=None, **kwargs):
    self._dropout = dropout
    self._num_input_words = num_input_words
    self._exclude_top_K = exclude_top_k
    self._dropout_type = dropout_type
    self._compose_type = compose_type
    self._vocab = vocab
    self._shortcut_unk_and_excluded = shortcut_unk_and_excluded
    self._word_dropout_weighting = word_dropout_weighting
    self._def_word_gating = def_word_gating

    if def_word_gating not in {"none", "self_attention"}:
        raise NotImplementedError()
    if word_dropout_weighting not in {"no_weighting"}:
        raise NotImplementedError("Not implemented " + word_dropout_weighting)
    if dropout_type not in {"per_unit", "per_example", "per_word"}:
        raise NotImplementedError()

    children = []

    if self._def_word_gating == "self_attention":
        self._gate_mlp = Linear(dim, dim)
        self._gate_act = Logistic()
        children.extend([self._gate_mlp, self._gate_act])

    if compose_type == 'fully_connected_linear':
        self._def_state_compose = MLP(activations=[None],
                                      dims=[emb_dim + dim, emb_dim])
        children.append(self._def_state_compose)
    if compose_type in ("gated_sum", "gated_transform_and_sum"):
        if dropout_type in ("per_word", "per_example"):
            raise RuntimeError(
                "This combination of dropout_type and compose_type "
                "does not make much sense")
        self._compose_gate_mlp = Linear(dim + emb_dim, emb_dim,
                                        name='gate_linear')
        self._compose_gate_act = Logistic()
        children.extend([self._compose_gate_mlp, self._compose_gate_act])
    if compose_type == 'sum':
        if emb_dim != dim:
            raise ValueError("Embedding has a different dim! "
                             "Cannot use compose_type='sum'")
    if compose_type in ('transform_and_sum', 'gated_transform_and_sum'):
        self._def_state_transform = Linear(dim, emb_dim,
                                           name='state_transform')
        children.append(self._def_state_transform)

    super(MeanPoolCombiner, self).__init__(children=children, **kwargs)
def __init__(self, **kwargs):
    super(TreeAttention, self).__init__(**kwargs)

    state_transformer = Linear()
    self.state_transformers = Parallel(input_names=self.state_names,
                                       prototype=state_transformer,
                                       name="state_trans")
    self.parent1_transformer = Linear(name="parent1_trans")
    self.parent2_transformer = Linear(name="parent2_trans")

    self.children = [self.state_transformers,
                     self.parent1_transformer,
                     self.parent2_transformer]
def __init__(self, match_dim, conv_n, conv_num_filters=1,
             state_transformer=None, attended_transformer=None,
             energy_computer=None, prior=None, energy_normalizer=None,
             **kwargs):
    super(SequenceContentAndConvAttention, self).__init__(**kwargs)
    if not state_transformer:
        state_transformer = Linear(use_bias=False)

    self.match_dim = match_dim
    self.state_transformer = state_transformer
    self.state_transformers = Parallel(input_names=self.state_names,
                                       prototype=state_transformer,
                                       name="state_trans")

    if not attended_transformer:
        # Only this contributor to the match vector
        # is allowed to have biases
        attended_transformer = Linear(name="preprocess")

    if not energy_normalizer:
        energy_normalizer = 'softmax'
    self.energy_normalizer = energy_normalizer

    if not energy_computer:
        energy_computer = ShallowEnergyComputer(
            name="energy_comp",
            use_bias=self.energy_normalizer != 'softmax')
    self.filter_handler = Linear(name="handler", use_bias=False)
    self.attended_transformer = attended_transformer
    self.energy_computer = energy_computer

    if not prior:
        prior = dict(type='expanding', initial_begin=0, initial_end=10000,
                     min_speed=0, max_speed=0)
    self.prior = prior

    self.conv_n = conv_n
    self.conv_num_filters = conv_num_filters
    self.conv = Conv1D(conv_num_filters, 2 * conv_n + 1)

    self.children = [self.state_transformers, self.attended_transformer,
                     self.energy_computer, self.filter_handler, self.conv]
def linear_layer(in_size, dim, x, h, n, first_layer=False):
    if first_layer:
        layer_input = x
        linear = Linear(input_dim=in_size, output_dim=dim,
                        name='feedforward' + str(n))
    elif connect_x_to_h:
        layer_input = T.concatenate([x] + [h[n - 1]], axis=1)
        linear = Linear(input_dim=in_size + dim, output_dim=dim,
                        name='feedforward' + str(n))
    else:
        layer_input = h[n - 1]
        linear = Linear(input_dim=dim, output_dim=dim,
                        name='feedforward' + str(n))
    initialize([linear])
    return linear.apply(layer_input)
def __init__(self, word_dim, visual_dim, joint_dim):
    self.word_embed = Linear(word_dim, joint_dim,
                             name='word_to_joint',
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0))
    self.visual_embed = Linear(visual_dim, joint_dim,
                               name='visual_to_joint',
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0))
    self.word_embed.initialize()
    self.visual_embed.initialize()
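# Usage sketch for the joint embedding above; `JointEmbedder` stands in
# for whatever class defines this __init__, and the dimensions and
# variable names are illustrative assumptions.
from theano import tensor

words = tensor.matrix('words')     # (batch, word_dim)
visual = tensor.matrix('visual')   # (batch, visual_dim)
embedder = JointEmbedder(word_dim=300, visual_dim=4096, joint_dim=512)
joint = (embedder.word_embed.apply(words) +
         embedder.visual_embed.apply(visual))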
def __init__(self, dimension, input_size, embed_input=False, **kwargs):
    super(LSTMEncoder, self).__init__(**kwargs)
    if embed_input:
        self.embedder = LookupTable(input_size, dimension)
    else:
        self.embedder = Linear(input_size, dimension)
    self.fork = Fork(['inputs'], dimension, output_dims=[dimension],
                     prototype=Linear(dimension, 4 * dimension))
    encoder = Bidirectional(LSTM(dim=dimension, activation=Tanh()))
    self.encoder = encoder
    self.children = [encoder, self.embedder, self.fork]