def ewma(series, axis=None, span=VOL_WINDOW_SPAN, adjust=True, initial=None):
    """ Exponentially-weighted moving average """
    if axis is None:
        if series.ndim == 1:
            axis = 0
        else:
            raise ValueError("Please specify which axis to compute ewma over (usually time axis)")
    assert span >= 1
    alpha = 2. / (span + 1)
    series = T.swapaxes(series, axis, 0)
    if adjust:
        assert initial is None
        initial = T.zeros_like(series[0])
    else:
        if initial is None:
            initial = series[0]
        initial /= alpha

    def ewma_numerator_step(a_i, prev_ewma):
        return a_i + (1. - alpha) * prev_ewma

    ewma_numerators, _ = theano.scan(ewma_numerator_step, series,
                                     outputs_info=initial, strict=True)
    if adjust:
        ewma_denominators = T.cumsum((1 - alpha) ** T.arange(ewma_numerators.shape[0]))
        series_ewma = ewma_numerators / ewma_denominators.reshape(
            (-1,) + (1,) * (ewma_numerators.ndim - 1))
    else:
        series_ewma = ewma_numerators * alpha
    series_ewma = T.swapaxes(series_ewma, 0, axis)
    return series_ewma
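# Minimal usage sketch for ewma (not part of the original code). Assumes theano is
# importable, theano.tensor is imported as T as in this module, and span is passed
# explicitly so the module constant VOL_WINDOW_SPAN is not needed.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
smooth_fn = theano.function([x], ewma(x, span=10))

values = np.random.randn(100).astype(theano.config.floatX)
smoothed = smooth_fn(values)   # same length as the input; adjusted EWMA over axis 0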
def columnwise_softmax(o):
    # Compute softmax in a numerically stable, column-wise way: move the distribution
    # axis to the end, collapse sequences/minibatches along the first axis, calculate
    # softmax for each of the sequence_length*num_minibatches rows, then re-roll and
    # swap axes back.
    swapped_o = T.swapaxes(o, 1, 2)
    swapped_flat_o = swapped_o.reshape((-1, swapped_o.shape[-1]))
    clipped_swapped_flat_o1 = T.clip(swapped_flat_o, -5., 5.)  # don't exponentiate numbers that are too big/small
    clipped_swapped_flat_o2 = T.exp(clipped_swapped_flat_o1 / softmax_temperature)
    clipped_swapped_flat_o3 = clipped_swapped_flat_o2 / clipped_swapped_flat_o2.sum(axis=1, keepdims=True)
    softmaxed_swapped_o = clipped_swapped_flat_o3.reshape(swapped_o.shape)
    softmaxed_o = T.swapaxes(softmaxed_swapped_o, 1, 2)
    return softmaxed_o
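# Usage sketch for columnwise_softmax (illustrative only). The function reads a
# module-level softmax_temperature; the value below is an assumption made just to
# keep the example self-contained, and the example must live in the same module
# as the function for the global lookup to resolve.
import numpy as np
import theano
import theano.tensor as T

softmax_temperature = 1.0

o = T.tensor3('o')   # assumed layout: (batch, distribution, sequence)
softmax_fn = theano.function([o], columnwise_softmax(o))

logits = np.random.randn(4, 5, 7).astype(theano.config.floatX)
probs = softmax_fn(logits)
assert np.allclose(probs.sum(axis=1), 1.0)   # each distribution column sums to one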
def forward(self, X, previous_state, previous_hidden):
    output, state = self.input_layer.step(X, previous_state[:, 0, :], previous_hidden[:, 0, :],
                                          self.dropout_probability)
    hiddens, states = [output], [state]
    for i, layer in enumerate(self.layers):
        output, state = layer.step(output, previous_state[:, i + 1, :], previous_hidden[:, i + 1, :],
                                   self.dropout_probability)
        hiddens.append(output)
        states.append(state)
    return T.swapaxes(T.stack(*hiddens), 0, 1), T.swapaxes(T.stack(*states), 0, 1)
def set_sampling_function(decoder_feature_function,
                          decoder_red_function,
                          decoder_green_function,
                          decoder_blue_function):
    hidden_data = T.matrix(name='hidden_data', dtype=theano.config.floatX)

    # decoder
    decoder_outputs = decoder_feature_function(hidden_data)
    decoder_feature = decoder_outputs[1]
    decoder_red = decoder_red_function(decoder_feature)
    decoder_green = decoder_green_function(decoder_feature)
    decoder_blue = decoder_blue_function(decoder_feature)

    num_samples = decoder_red.shape[0]
    num_rows = decoder_red.shape[2]
    num_cols = decoder_red.shape[3]
    num_pixels = num_rows*num_cols

    # shape = (num_samples, num_intensity, num_pixels)
    decoder_red = T.flatten(decoder_red, 3)
    decoder_green = T.flatten(decoder_green, 3)
    decoder_blue = T.flatten(decoder_blue, 3)

    # shape = (num_samples, num_pixels, num_intensity)
    decoder_red = T.swapaxes(decoder_red, axis1=1, axis2=2)
    decoder_green = T.swapaxes(decoder_green, axis1=1, axis2=2)
    decoder_blue = T.swapaxes(decoder_blue, axis1=1, axis2=2)

    # shape = (num_samples*num_pixels, num_intensity)
    decoder_red = decoder_red.reshape((num_samples*num_pixels, -1))
    decoder_green = decoder_green.reshape((num_samples*num_pixels, -1))
    decoder_blue = decoder_blue.reshape((num_samples*num_pixels, -1))

    # softmax
    decoder_red = T.argmax(T.nnet.softmax(decoder_red), axis=1)
    decoder_green = T.argmax(T.nnet.softmax(decoder_green), axis=1)
    decoder_blue = T.argmax(T.nnet.softmax(decoder_blue), axis=1)

    decoder_red = decoder_red.reshape((num_samples, 1, num_rows, num_cols))
    decoder_green = decoder_green.reshape((num_samples, 1, num_rows, num_cols))
    decoder_blue = decoder_blue.reshape((num_samples, 1, num_rows, num_cols))

    decoder_image = T.concatenate([decoder_red, decoder_green, decoder_blue], axis=1)

    function_inputs = [hidden_data, ]
    function_outputs = [decoder_image, ]
    function = theano.function(inputs=function_inputs,
                               outputs=function_outputs,
                               on_unused_input='ignore')
    return function
def make_masks(x):
    nonsymbolic_masks = []
    for layer in self.layers:
        rng = T.shared_randomstreams.RandomStreams(np.random.randint(999999))
        mask = rng.binomial(p=self.dropout,
                            size=(layer.hidden_dim, layer.minibatch_dim),
                            dtype=theano.config.floatX)
        nonsymbolic_masks.append(mask)

    # T.stack gives (layer_num, hidden_dim, minibatch_dim) and we want
    # (hidden_dim, minibatch_dim, layer_num) for point-wise multiplication
    masks = T.stack(nonsymbolic_masks)
    masks = T.swapaxes(masks, 0, 2)
    masks = T.swapaxes(masks, 0, 1)
    return masks
def apply(self, input_):
    output = tensor.swapaxes(input_, self.axis1, self.axis2)
    if self.debug:
        import theano
        output = theano.printing.Print('output:', attrs=('shape', ))(output)
    return output
def step(self, X, previous_hidden, previous_state):
    out, state = self.forward_with_weights(X, previous_hidden[:, 0, :], previous_state[:, 0, :],
                                           self.Wi, self.Ui[0], self.bi[0],
                                           self.Wf, self.Uf[0], self.bf[0],
                                           self.Wc, self.Uc[0], self.bc[0],
                                           self.Wo, self.Vo[0], self.Uo[0], self.bo[0])
    outs = [out]
    states = [state]
    for l in xrange(1, self.num_layers):
        out, state = self.forward_with_weights(out, previous_hidden[:, l, :], previous_state[:, l, :],
                                               self.Whi[l - 1], self.Ui[l], self.bi[l],
                                               self.Whf[l - 1], self.Uf[l], self.bf[l],
                                               self.Whc[l - 1], self.Uc[l], self.bc[l],
                                               self.Who[l - 1], self.Vo[l], self.Uo[l], self.bo[l])
        states.append(state)
        outs.append(out)
    return T.swapaxes(T.stack(*outs), 0, 1), T.swapaxes(T.stack(*states), 0, 1)
def compute_output(self, network, in_vw):
    axis1, axis2 = network.find_hyperparameter(["axes"])
    out_shape = list(in_vw.shape)
    out_shape[axis1], out_shape[axis2] = out_shape[axis2], out_shape[axis1]
    network.create_vw(
        "default",
        variable=T.swapaxes(in_vw.variable, axis1, axis2),
        shape=out_shape,
        tags={"output"},
    )
def fwd(self, x, disk=None, layer_begin=None, layer_end=None):
    """
    x : signal
    """
    # def cal_patch(theta_i, rho_i, x_dense, y_dense):
    #     x_coord = Tsp.csr_from_dense(x_dense)
    #     y_coord = Tsp.csr_from_dense(y_dense)
    #     x0 = rho_i*T.cos(theta_i)*Tsp.basic.sp_ones_like(x_coord)
    #     y0 = rho_i*T.sin(theta_i)*Tsp.basic.sp_ones_like(y_coord)
    #     patch_i = Tsp.structured_exp((-1.0/self.sigma)*(Tsp.sqr(x_coord-x0)+Tsp.sqr(y_coord-y0)))
    #     patch_i = Tsp.basic.row_scale(patch_i, 1.0/Tsp.basic.sp_sum(patch_i, axis=1))
    #     return patch_i.toarray()
    #
    # scan_results, scan_updates = theano.scan(fn=cal_patch, outputs_info=None,
    #                                          sequences=[self.theta, self.rho],
    #                                          non_sequences=[x_local.toarray(), y_local.toarray()])
    # disk = Tsp.csr_from_dense(T.swapaxes(scan_results, 0, 1).reshape(
    #     [self.ntheta * self.nrho * x.shape[0], x.shape[0]]))
    # patch = Tsp.basic.structured_dot(disk, x)
    # patch = T.reshape(patch, (x.shape[0], self.ntheta*self.nrho, self.nin, 1))
    # patch = T.swapaxes(patch, 1, 2)

    # def cal_patch(theta_i, rho_i, x_coord, y_coord):
    #     x0 = rho_i*T.cos(theta_i)*Tsp.basic.sp_ones_like(x_coord)
    #     y0 = rho_i*T.sin(theta_i)*Tsp.basic.sp_ones_like(y_coord)
    #     patch_i = Tsp.structured_exp((-1.0/self.sigma)*(Tsp.sqr(x_coord-x0)+Tsp.sqr(y_coord-y0)))
    #     patch_i = Tsp.basic.row_scale(patch_i, 1.0/(1e-30+Tsp.basic.sp_sum(patch_i, axis=1)))
    #     return patch_i
    # disk = []
    # for i in xrange(self.ntheta * self.nrho):
    #     disk.append(cal_patch(self.theta[i], self.rho[i], x_local, y_local))
    # disk = Tsp.basic.vstack(disk, format='csc')

    layer_disk = disk[layer_begin[self.layer_id]:layer_end[self.layer_id], :]
    patch = Tsp.basic.structured_dot(layer_disk, x)
    patch = T.reshape(patch, [self.ntheta*self.nrho, x.shape[0], x.shape[1]])
    patch = T.reshape(T.swapaxes(T.swapaxes(patch, 0, 1), 1, 2),
                      [x.shape[0], self.nin, self.ntheta*self.nrho, 1])
    return self.activation(theano.tensor.nnet.conv.conv2d(patch, self.a).flatten(2))
def build_transition_cost(logits, targets, num_transitions):
    """
    Build a parse action prediction cost function.
    """
    # swap seq_length dimension to front so that we can scan per timestep
    logits = T.swapaxes(logits, 0, 1)
    targets = targets.T

    def cost_t(logits, tgt, num_transitions):
        # TODO(jongauthier): Taper down xent cost as we proceed through
        # sequence?
        predicted_dist = T.nnet.softmax(logits)
        cost = T.nnet.categorical_crossentropy(predicted_dist, tgt)

        pred = T.argmax(logits, axis=1)
        error = T.neq(pred, tgt)
        return cost, error

    results, _ = theano.scan(cost_t, [logits, targets], non_sequences=[num_transitions])
    costs, errors = results

    # Create a mask that selects only transitions that involve real data.
    unrolling_length = T.shape(costs)[0]
    padding = unrolling_length - num_transitions
    padding = T.reshape(padding, (1, -1))
    rng = T.arange(unrolling_length) + 1
    rng = T.reshape(rng, (-1, 1))
    mask = T.gt(rng, padding)

    # Compute acc using the mask
    acc = 1.0 - (T.sum(errors * mask, dtype=theano.config.floatX)
                 / T.sum(num_transitions, dtype=theano.config.floatX))

    # Compute cost directly, since we *do* want a cost incentive to get the padding
    # transitions right.
    cost = T.mean(costs)

    return cost, acc
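# Illustration of the padding mask built above (my own example with concrete numbers,
# not part of the original code): with an unrolling length of 5 and two sequences of
# 3 and 5 real transitions, the mask drops the leading padded steps of each column.
import numpy as np

unrolling_length = 5
num_transitions = np.array([3, 5])                               # real transitions per sequence

padding = (unrolling_length - num_transitions).reshape(1, -1)    # [[2, 0]]
rng = (np.arange(unrolling_length) + 1).reshape(-1, 1)           # column vector 1..5
mask = rng > padding
# mask[:, 0] == [False, False, True, True, True]  -> first 2 padded steps are ignored
# mask[:, 1] == [True, True, True, True, True]    -> no padding for this sequence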
def apply_except_axis(x, axis, func):
    """
    Apply a contraction function on all but one axis.

    Parameters
    ----------
    x: T.Tensor
        Input tensor.
    axis: int
        Axis to exclude on application.
    func: function
        A function with signature ``func(x, axis=)`` e.g. T.mean, T.std ...

    Returns
    -------
    T.Tensor
        Contraction of ``x`` along all axes except ``axis``; the result is a
        vector of length ``x.shape[axis]``.
    """
    x = T.swapaxes(x, 0, axis)  # put axis on front
    x = T.flatten(x, 2)         # flatten remainder
    y = func(x, axis=1)
    return y
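# Usage sketch for apply_except_axis (illustrative only): contract a 3-D tensor over
# every axis except axis 1 and compare against the equivalent NumPy reduction.
# Assumes theano is imported as theano and theano.tensor as T, as in this module.
import numpy as np
import theano
import theano.tensor as T

x = T.tensor3('x')
reduce_fn = theano.function([x], apply_except_axis(x, 1, T.mean))

data = np.random.randn(2, 3, 4).astype(theano.config.floatX)
out = reduce_fn(data)                                        # shape (3,)
assert np.allclose(out, data.mean(axis=(0, 2)), atol=1e-5)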
def test_th_matmul():
    vlist = []
    flist = []
    ndlist = []

    for i in range(2, 30):
        dims = int(np.random.random() * 4 + 2)

        # Create a tuple of tensors with potentially different broadcastability.
        vs = tuple(
            tt.TensorVariable(
                tt.TensorType(
                    'float64',
                    tuple((p < .3) for p in np.random.ranf(dims - 2))  # Make full matrices
                    + (False, False)))
            for _ in range(2))
        vs = tuple(
            tt.swapaxes(v, -2, -1) if j % 2 == 0 else v
            for j, v in enumerate(vs))

        f = th.function([*vs], [matmul(*vs)])

        # Create the default shape for the test ndarrays
        defshape = tuple(int(np.random.random() * 5 + 1) for _ in range(dims))

        # Create a test array matching the broadcastability of each v, for each v.
        nds = tuple(
            np.random.ranf(
                tuple(s if not v.broadcastable[j] else 1
                      for j, s in enumerate(defshape)))
            for v in vs)
        nds = tuple(
            np.swapaxes(nd, -2, -1) if j % 2 == 0 else nd
            for j, nd in enumerate(nds))

        ndlist.append(nds)
        vlist.append(vs)
        flist.append(f)

    for i in range(len(ndlist)):
        assert np.allclose(flist[i](*ndlist[i]), np.matmul(*ndlist[i]))
def train_conv_net(datasets,
                   word_vecs,
                   windows=[3, 2],
                   pool_sizes=[2],
                   dim=300,
                   feature_maps=[100, 100],
                   dropout_rate=[0.5],
                   hidden_units=[],
                   shuffle_batch=True,
                   n_epochs=25,
                   batch_size=50,
                   lr_decay=0.95,
                   conv_non_linear="relu",
                   activations=['relu'],
                   sqr_norm_lim=9,
                   ):
    assert (len(windows) == len(feature_maps))
    assert (len(windows) == len(pool_sizes) + 1)
    print('\nbuilding model...')

    index = T.lscalar()
    x1 = T.matrix('x1')
    x2 = T.matrix('x2')
    y = T.ivector('y')
    Words = theano.shared(value=word_vecs, name="Words")
    rng = np.random.RandomState(9999)

    ### define model architecture ###
    img_h = len(datasets[0][0][0])
    filter_shapes = [(feature_maps[0], 1, windows[0], dim)]
    for i in range(1, len(windows)):
        filter_shapes.append((feature_maps[i], 1, windows[i], feature_maps[i - 1]))

    next_layer_input_1 = Words[T.cast(x1.flatten(), dtype="int32")].reshape(
        (x1.shape[0], 1, x1.shape[1], dim))
    next_layer_input_2 = Words[T.cast(x2.flatten(), dtype="int32")].reshape(
        (x1.shape[0], 1, x1.shape[1], dim))

    conv_layers_1 = []
    conv_layers_2 = []
    for i in xrange(len(windows) - 1):
        filter_shape = filter_shapes[i]
        pool_size = (pool_sizes[i], 1)
        conv_layer_1 = LeNetConvPoolLayer(rng,
                                          input=next_layer_input_1,
                                          image_shape=(batch_size, 1, img_h, filter_shape[3]),
                                          filter_shape=filter_shape,
                                          poolsize=pool_size,
                                          non_linear=conv_non_linear)
        conv_layer_2 = LeNetConvPoolLayer(rng,
                                          input=next_layer_input_2,
                                          image_shape=(batch_size, 1, img_h, filter_shape[3]),
                                          filter_shape=filter_shape,
                                          poolsize=pool_size,
                                          non_linear=conv_non_linear)
        img_h -= windows[i] - 1
        img_h /= pool_sizes[i]
        next_layer_input_1 = T.swapaxes(conv_layer_1.output, 1, 3)
        next_layer_input_2 = T.swapaxes(conv_layer_2.output, 1, 3)
        conv_layers_1.append(conv_layer_1)
        conv_layers_2.append(conv_layer_2)

    ### the last convPoolLayer needs different configurations ###
    filter_shape = filter_shapes[-1]
    pool_size = (img_h - windows[-1] + 1, 1)
    conv_layer_1 = LeNetConvPoolLayer(rng,
                                      input=next_layer_input_1,
                                      image_shape=(batch_size, 1, img_h, filter_shape[3]),
                                      filter_shape=filter_shape,
                                      poolsize=pool_size)
    conv_layer_2 = LeNetConvPoolLayer(rng,
                                      input=next_layer_input_2,
                                      image_shape=(batch_size, 1, img_h, filter_shape[3]),
                                      filter_shape=filter_shape,
                                      poolsize=pool_size)
    output_1 = conv_layer_1.output.flatten(2)
    output_2 = conv_layer_2.output.flatten(2)
    conv_layers_1.append(conv_layer_1)
    conv_layers_2.append(conv_layer_2)
    next_layer_input = T.concatenate([output_1, output_2], 1)

    ### MLP with dropout ###
    layer_sizes = [feature_maps[-1] * 2]
    for i in hidden_units:
        layer_sizes.append(hidden_units[i])
    layer_sizes.append(2)
    classifier = MLPDropout(rng,
                            input=next_layer_input,
                            layer_sizes=layer_sizes,
                            activations='relu',
                            dropout_rates=dropout_rate)

    ### update the params with adadelta ###
    params = classifier.params
    for conv_layer in conv_layers_1:
        params += conv_layer.params
    for conv_layer in conv_layers_2:
        params += conv_layer.params
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim)

    ### create minibatches for the training set ###
    np.random.seed(9999)
    if datasets[0].shape[0] % batch_size > 0:
        data_zipped = zip(datasets[0], datasets[2])
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(data_zipped)
        extra_data = train_set[:extra_data_num]
        train_set = np.append(train_set, extra_data, axis=0)
    else:
        train_set = datasets[0]
        train_set = np.random.permutation(train_set)
    n_batches = train_set.shape[0] / batch_size
    train_labels = train_set[:, 1]
    train_set = np.array(train_set[:, 0])
    train_set_x1 = [x[0] for x in train_set]
    train_set_x2 = [x[1] for x in train_set]
    test_set_x1 = [x[0] for x in datasets[1]]
    test_set_x2 = [x[1] for x in datasets[1]]
    test_labels = np.asarray(datasets[3], "int32")
    train_set_x1, train_set_x2, train_labels = shared_dataset(
        (train_set_x1, train_set_x2, train_labels))

    ### theano functions for training and testing ###
    train_model = theano.function(
        [index],
        classifier.errors(y),
        updates=grad_updates,
        givens={
            x1: train_set_x1[index * batch_size:(index + 1) * batch_size],
            x2: train_set_x2[index * batch_size:(index + 1) * batch_size],
            y: train_labels[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)

    test_layer_input_1 = Words[T.cast(x1.flatten(), dtype="int32")].reshape(
        (x1.shape[0], 1, x1.shape[1], dim))
    test_layer_input_2 = Words[T.cast(x2.flatten(), dtype="int32")].reshape(
        (x2.shape[0], 1, x2.shape[1], dim))
    for i in xrange(len(conv_layers_1) - 1):
        output_1 = conv_layers_1[i].predict(test_layer_input_1, len(test_labels))
        output_2 = conv_layers_1[i].predict(test_layer_input_2, len(test_labels))
        test_layer_input_1 = T.swapaxes(output_1, 1, 3)
        test_layer_input_2 = T.swapaxes(output_2, 1, 3)
    output_1 = conv_layers_1[-1].predict(test_layer_input_1, len(test_labels))
    output_2 = conv_layers_1[-1].predict(test_layer_input_2, len(test_labels))
    next_layer_input = T.concatenate([output_1.flatten(2), output_2.flatten(2)], 1)
    test_y_pred = classifier.predict(next_layer_input)
    test_error = T.mean(T.neq(test_y_pred, y))
    test_model = theano.function([x1, x2, y], test_error, allow_input_downcast=True)

    ### training ###
    print 'training...'
    epoch = 0
    test_accs = []
    train_losses = []
    while (epoch < n_epochs):
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(range(n_batches)):
                train_losses.append(train_model(minibatch_index))
        else:
            for minibatch_index in xrange(n_batches):
                train_losses.append(train_model(minibatch_index))
        test_error = test_model(test_set_x1, test_set_x2, test_labels)
        train_perf = 1 - np.mean(train_losses)
        test_perf = 1 - test_error
        test_accs.append(test_perf)
        print('epoch: %i, training time: %.2f secs, train perf: %.2f %% , test perf: %.2f %%' %
              (epoch, time.time() - start_time, train_perf * 100., test_perf * 100.))
    return max(test_accs)
def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    # num_per_class = 100
    # print("Using %d per class" % num_per_class)
    print("Using all the training data")

    ## Load Data ##
    X_train, y_train, X_test, y_test = load_data("/X_train.npy", "/Y_train.npy",
                                                 "/X_test.npy", "/Y_test.npy")
    X_train = extend_images(X_train, 227)
    X_test = extend_images(X_test, 227)
    y_train = y_train
    y_test = y_test

    ## Define Batch Size ##
    batch_size = 50

    ## Define nRotation for exhaustive search ##
    nRotation = 16

    # The dimension would be (nRotation * n, w, h)
    input_var = T.tensor4('inputs')
    vanilla_target_var = T.ivector('vanilla_targets')

    # Create neural network model (depending on first command line parameter)
    network, weight_decay = build_cnn(input_var, batch_size)

    # saved_weights = np.load("../data/mnist_Chi_dec_100.npy")
    saved_weights = np.load("../data/curet_test_hinge_epoch_400_2pool.npy")
    lasagne.layers.set_all_param_values(network, saved_weights)

    predictions = lasagne.layers.get_output(network)
    one_hot_targets = T.extra_ops.to_one_hot(vanilla_target_var, 61)
    rests = T.reshape(predictions, (nRotation, -1, 61))
    final_rests = T.max(rests, 0)
    rests = T.swapaxes(rests, 0, 2)
    rests = T.swapaxes(rests, 0, 1)
    rests = rests[one_hot_targets.nonzero()]
    rests = T.max(rests, axis=1)
    final_rests = T.set_subtensor(final_rests[one_hot_targets.nonzero()], rests)

    # loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    # loss = one_vs_all_hinge_loss(final_rests, vanilla_target_var)
    loss = lasagne.objectives.multiclass_hinge_loss(final_rests, vanilla_target_var, 5)
    loss = loss.mean() + weight_decay
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    # updates = lasagne.updates.nesterov_momentum(
    #     loss, params, learning_rate=0.01, momentum=0.9)
    updates = lasagne.updates.adagrad(loss, params, learning_rate=0.001)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.reshape(test_prediction, (nRotation, -1, 61))
    test_prediction_res = test_prediction.max(axis=0)
    final_test_prediction_res = test_prediction[0]
    test_prediction_process = T.swapaxes(test_prediction, 0, 2)
    test_prediction_process = T.swapaxes(test_prediction_process, 0, 1)
    test_prediction_process = test_prediction_process[one_hot_targets.nonzero()]
    test_prediction_process = T.max(test_prediction_process, axis=1)
    final_test_prediction_res = T.set_subtensor(
        final_test_prediction_res[one_hot_targets.nonzero()], test_prediction_process)

    test_loss = lasagne.objectives.multiclass_hinge_loss(final_test_prediction_res,
                                                         vanilla_target_var)
    test_loss = test_loss.mean() + weight_decay
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction_res, axis=1), vanilla_target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, vanilla_target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, vanilla_target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
            inputs, targets = batch
            inputs = inputs.reshape(batch_size, 227, 227)
            inputs = rotateImage_batch(inputs, nRotation).reshape(
                batch_size * nRotation, 1, 227, 227)
            duplicated_targets = np.array([targets for i in range(nRotation)]).reshape(
                batch_size * nRotation,)
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))

        if epoch % 10 == 0:
            # After training, we compute and print the test error:
            test_err = 0
            test_acc = 0
            test_batches = 0
            for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
                inputs, targets = batch
                inputs = inputs.reshape(batch_size, 227, 227)
                inputs = rotateImage_batch(inputs, nRotation).reshape(
                    batch_size * nRotation, 1, 227, 227)
                err, acc = val_fn(inputs, targets)
                test_err += err
                test_acc += acc
                test_batches += 1
            print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
            print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=False):
        inputs, targets = batch
        inputs = inputs.reshape(batch_size, 227, 227)
        inputs = rotateImage_batch(inputs, nRotation).reshape(
            batch_size * nRotation, 1, 227, 227)
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    weightsOfParams = lasagne.layers.get_all_param_values(network)
    np.save("../data/curet_justRotation_try5.npy", weightsOfParams)
def swapaxes(x, axis1, axis2):
    return T.swapaxes(x, axis1, axis2)
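# Quick sanity check for this thin wrapper (illustrative only; assumes theano.tensor
# is imported as T, as elsewhere in this file). On a matrix, swapping axes 0 and 1
# is simply a transpose.
import numpy as np
import theano
import theano.tensor as T

m = T.matrix('m')
swap_fn = theano.function([m], swapaxes(m, 0, 1))

a = np.arange(6).reshape(2, 3).astype(theano.config.floatX)
assert np.array_equal(swap_fn(a), a.T)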
def _create_time_step(self, mask, x_preact, C_in, h_in, h_weights, mem_weights, mem_bias,
                      v_weights, v_bias, q_weights):
    """The LSTM step function for theano.scan(). Creates the structure of
    one time step.

    The inputs do not contain the time step dimension. ``mask`` is a vector
    containing a boolean mask for each sequence. ``x_preact`` is a matrix
    containing the preactivations for each sequence. ``C_in`` and ``h_in``,
    as well as the outputs, are matrices containing the state vectors for
    each sequence.

    The required affine transformations have already been applied to the
    input prior to creating the loop. The transformed inputs and the mask
    that will be passed to the step function are vectors when processing a
    mini-batch - each value corresponds to the same time step in a
    different sequence.

    :type mask: Variable
    :param mask: a symbolic vector that masks out sequences that are past
                 the last word

    :type x_preact: Variable
    :param x_preact: concatenation of the input x_(t) pre-activations
                     computed using the gate and candidate state weights and
                     biases; shape is (the number of sequences, state size * 4)

    :type C_in: Variable
    :param C_in: C_(t-1...t-n), memory (cell output) of the previous time
                 steps; shape is (the number of sequences, state size * memory size)

    :type h_in: Variable
    :param h_in: h_(t-1), hidden state output of the previous time step;
                 shape is (the number of sequences, state size)

    :type h_weights: Variable
    :param h_weights: concatenation of the gate and candidate state weights
                      to be applied to h_(t-1); shape is
                      (state size, state size * 4)

    :rtype: a tuple of two Variables
    :returns: C_(t) and h_(t), the cell state and hidden state outputs
    """
    # pre-activation of the gates and candidate state
    preact = tensor.dot(h_in, h_weights)
    preact += x_preact
    num_sequences = x_preact.shape[0]

    # input, forget, and output gates
    i = tensor.nnet.sigmoid(get_submatrix(preact, 0, self.output_size))
    f = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
    o = tensor.nnet.sigmoid(get_submatrix(preact, 2, self.output_size))

    # hidden state output candidate
    h_candidate = tensor.tanh(get_submatrix(preact, 3, self.output_size))

    # calculate the attention weights by transforming the memory;
    # first reshape C_in
    mem = C_in.reshape([num_sequences, self.memory_size, self.output_size])
    hidden = tensor.dot(mem[:, :-1, :], mem_weights) + mem_bias
    hidden_q = (tensor.dot(h_in, q_weights)).reshape([num_sequences, 1, self.output_size])

    # use V to calculate the attention scores for all previous input vectors
    raw_attention = tensor.dot(tensor.tanh(hidden + hidden_q), v_weights) + v_bias
    # logging.debug("time: %s, seq: %s", t, num_sequences)
    raw_attention = tensor.swapaxes(raw_attention, 0, 1)
    raw_attention = raw_attention.reshape([num_sequences, self.memory_size - 1])

    # with softmax we get the attention scores for each time t
    # shape is (num_sequences, t)
    attentions = tensor.nnet.softmax(raw_attention)

    # apply attention to the memory
    long_memory = tensor.batched_dot(
        attentions.reshape([attentions.shape[0], 1, attentions.shape[1]]),
        mem[:, :-1, :])  # TODO test
    long_memory = long_memory.reshape([long_memory.shape[0], long_memory.shape[2]])

    h_out = o * self._activation(f * long_memory + i * h_candidate)

    # concatenate the new vector
    logging.debug("C ndim: %s, h_out ndim: %s", C_in.ndim, h_out.ndim)
    mem = tensor.concatenate([C_in[:, self.output_size:], h_out], axis=1)  # TODO check dimensions!

    # Apply the mask. None creates a new axis with size 1, causing the mask
    # to be broadcast to all the outputs.
    # C_out = tensor.switch(mask[:, None], C_out, C_in)
    h_out = tensor.switch(mask[:, None], h_out, h_in)

    return mem, h_out
def set_updater_function(encoder_feature_function,
                         encoder_mean_function,
                         encoder_var_function,
                         decoder_feature_function,
                         decoder_red_function,
                         decoder_green_function,
                         decoder_blue_function,
                         encoder_params,
                         decoder_params,
                         optimizer):
    # positive visible data
    positive_visible_data = T.tensor4(name='positive_visible_data', dtype=theano.config.floatX)
    # positive hidden data
    positive_hidden_data = T.matrix(name='positive_hidden_data', dtype=theano.config.floatX)
    # negative hidden data
    negative_hidden_data = T.matrix(name='negative_hidden_data', dtype=theano.config.floatX)
    # moment weight
    moment_cost_weight = T.scalar(name='moment_cost_weight', dtype=theano.config.floatX)

    # num of samples
    num_samples = positive_visible_data.shape[0]
    num_rows = positive_visible_data.shape[2]
    num_cols = positive_visible_data.shape[3]
    num_pixels = num_rows*num_cols

    ##################
    # positive phase #
    ##################
    # positive encoder
    positive_encoder_outputs = encoder_feature_function(positive_visible_data/127.5 - 1.)
    positive_encoder_feature = positive_encoder_outputs[1]
    positive_encoder_mean = encoder_mean_function(positive_encoder_feature)
    positive_encoder_log_var = encoder_var_function(positive_encoder_feature)
    positive_encoder_std = T.sqrt(T.exp(positive_encoder_log_var))
    positive_encoder_sample = positive_encoder_mean + positive_encoder_std*positive_hidden_data

    # positive decoder
    positive_decoder_outputs = decoder_feature_function(positive_encoder_sample)
    positive_decoder_hiddens = positive_decoder_outputs[0]
    positive_decoder_feature = positive_decoder_outputs[1]

    # shape = (num_samples, num_intensity, num_rows, num_cols)
    positive_decoder_red = decoder_red_function(positive_decoder_feature)
    positive_decoder_green = decoder_green_function(positive_decoder_feature)
    positive_decoder_blue = decoder_blue_function(positive_decoder_feature)

    # shape = (num_samples, num_intensity, num_pixels)
    positive_decoder_red = T.flatten(positive_decoder_red, 3)
    positive_decoder_green = T.flatten(positive_decoder_green, 3)
    positive_decoder_blue = T.flatten(positive_decoder_blue, 3)

    # shape = (num_samples, num_pixels, num_intensity)
    positive_decoder_red = T.swapaxes(positive_decoder_red, axis1=1, axis2=2)
    positive_decoder_green = T.swapaxes(positive_decoder_green, axis1=1, axis2=2)
    positive_decoder_blue = T.swapaxes(positive_decoder_blue, axis1=1, axis2=2)

    # shape = (num_samples*num_pixels, num_intensity)
    positive_decoder_red = positive_decoder_red.reshape((num_samples*num_pixels, -1))
    positive_decoder_green = positive_decoder_green.reshape((num_samples*num_pixels, -1))
    positive_decoder_blue = positive_decoder_blue.reshape((num_samples*num_pixels, -1))

    # softmax
    positive_decoder_red = T.nnet.softmax(positive_decoder_red)
    positive_decoder_green = T.nnet.softmax(positive_decoder_green)
    positive_decoder_blue = T.nnet.softmax(positive_decoder_blue)

    # positive target
    positive_target_red = T.flatten(T.cast(positive_visible_data[:, 0, :, :], 'int64'), 1)
    positive_target_green = T.flatten(T.cast(positive_visible_data[:, 1, :, :], 'int64'), 1)
    positive_target_blue = T.flatten(T.cast(positive_visible_data[:, 2, :, :], 'int64'), 1)

    # positive lower bound cost
    positive_recon_red_cost = T.nnet.categorical_crossentropy(
        positive_decoder_red, positive_target_red).reshape((num_samples, -1)).sum(axis=1)
    positive_recon_green_cost = T.nnet.categorical_crossentropy(
        positive_decoder_green, positive_target_green).reshape((num_samples, -1)).sum(axis=1)
    positive_recon_blue_cost = T.nnet.categorical_crossentropy(
        positive_decoder_blue, positive_target_blue).reshape((num_samples, -1)).sum(axis=1)
    positive_recon_cost = positive_recon_red_cost + positive_recon_green_cost + positive_recon_blue_cost
    positive_kl_cost = -0.5*T.sum((1.0 + positive_encoder_log_var
                                   - T.sqr(positive_encoder_mean)
                                   - T.exp(positive_encoder_log_var)), axis=1)
    positive_vae_cost = positive_recon_cost + positive_kl_cost

    ##################
    # negative phase #
    ##################
    # negative decoder
    negative_decoder_outputs = decoder_feature_function(negative_hidden_data)
    negative_decoder_hiddens = negative_decoder_outputs[0]
    negative_decoder_feature = negative_decoder_outputs[1]

    # moment matching
    moment_match_cost = 0
    for i in xrange(len(positive_decoder_hiddens)):
        pos_feat = positive_decoder_hiddens[i]
        neg_feat = negative_decoder_hiddens[i]
        moment_match_cost += T.mean(T.sqr(T.mean(pos_feat, axis=0) - T.mean(neg_feat, axis=0)))
        moment_match_cost += T.mean(T.sqr(T.mean(T.sqr(pos_feat), axis=0) - T.mean(T.sqr(neg_feat), axis=0)))
    moment_match_cost += T.mean(T.sqr(T.mean(T.flatten(positive_decoder_feature, 2), axis=0)
                                      - T.mean(T.flatten(negative_decoder_feature, 2), axis=0)))
    moment_match_cost += T.mean(T.sqr(T.mean(T.sqr(T.flatten(positive_decoder_feature, 2)), axis=0)
                                      - T.mean(T.sqr(T.flatten(negative_decoder_feature, 2)), axis=0)))

    model_updater_cost = T.mean(positive_vae_cost) + moment_cost_weight*T.mean(moment_match_cost)
    model_updater_dict = optimizer(encoder_params+decoder_params, model_updater_cost)

    model_updater_inputs = [positive_visible_data,
                            positive_hidden_data,
                            negative_hidden_data,
                            moment_cost_weight]
    model_updater_outputs = [positive_vae_cost,
                             moment_match_cost,
                             model_updater_cost]
    model_updater_function = theano.function(inputs=model_updater_inputs,
                                             outputs=model_updater_outputs,
                                             updates=model_updater_dict,
                                             on_unused_input='ignore')
    return model_updater_function
def main(model='mlp', num_epochs=2000):
    # Load the dataset
    print("Loading data...")
    # num_per_class = 100
    # print("Using %d per class" % num_per_class)
    print("Using all the training data")

    # X_train, y_train, X_test, y_test = load_data("/X_train.npy", "/Y_train.npy",
    #                                              "/X_test_rotated.npy", "/Y_test_rotated.npy")
    X_train, y_train, X_test, y_test = load_data("/mnistROT.npy", "/mnistROTLabel.npy",
                                                 "/mnistROTTEST.npy", "/mnistROTLABELTEST.npy",
                                                 "ROT_MNIST")

    # Only for subclass training
    # X_train_final = []
    # y_train_final = []
    # for i in range(10):
    #     X_train_class = X_train[y_train == i]
    #     permutated_index = np.random.permutation(X_train_class.shape[0])
    #     permutated_index = np.arange(X_train_class.shape[0])
    #     X_train_final.append(X_train_class[permutated_index[:100]])
    #     y_train_final += [i] * num_per_class
    # X_train = np.vstack(X_train_final)
    # y_train = np.array(y_train_final, dtype=np.int32)

    X_train = extend_image(X_train, 40)
    X_test = extend_image(X_test, 40)
    # X_train, y_train, X_test, y_test = load_data("/cluttered_train_x.npy", "/cluttered_train_y.npy",
    #                                              "/cluttered_test_x.npy", "/cluttered_test_y.npy",
    #                                              dataset="MNIST_CLUTTER")

    # Prepare Theano variables for inputs and targets
    nRotation = 8

    # The dimension would be (nRotation * n, w, h)
    input_var = T.tensor4('inputs')
    # The dimension would be (n, )
    vanilla_target_var = T.ivector('vanilla_targets')
    # The dimension would be (nRotation * n, )
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    network, weight_decay = build_cnn(input_var)

    # saved_weights = np.load("../data/mnist_Chi_dec_100.npy")
    saved_weights = np.load("../data/mnist_CNN_params_drop_out_Chi_2017_hinge.npy")
    lasagne.layers.set_all_param_values(network, saved_weights)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    # The dimension would be (nRotation * n, 10)
    predictions = lasagne.layers.get_output(network)
    # The dimension would be (nRotation * n, 10)
    one_hot_targets = T.extra_ops.to_one_hot(vanilla_target_var, 10)
    rests = T.reshape(predictions, (nRotation, -1, 10))
    # final_rests = rests[0]
    final_rests = T.max(rests, 0)
    rests = T.swapaxes(rests, 0, 2)
    rests = T.swapaxes(rests, 0, 1)
    rests = rests[one_hot_targets.nonzero()]
    rests = T.max(rests, axis=1)
    final_rests = T.set_subtensor(final_rests[one_hot_targets.nonzero()], rests)

    # loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    # loss = one_vs_all_hinge_loss(final_rests, vanilla_target_var)
    loss = lasagne.objectives.multiclass_hinge_loss(final_rests, vanilla_target_var)
    loss = loss.mean() + weight_decay
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    # updates = lasagne.updates.nesterov_momentum(
    #     loss, params, learning_rate=0.01, momentum=0.9)
    updates = lasagne.updates.adagrad(loss, params, learning_rate=0.01)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_prediction = T.reshape(test_prediction, (nRotation, -1, 10))
    test_prediction_res = test_prediction.max(axis=0)
    final_test_prediction_res = test_prediction[0]
    test_prediction_process = T.swapaxes(test_prediction, 0, 2)
    test_prediction_process = T.swapaxes(test_prediction_process, 0, 1)
    test_prediction_process = test_prediction_process[one_hot_targets.nonzero()]
    test_prediction_process = T.max(test_prediction_process, axis=1)
    final_test_prediction_res = T.set_subtensor(
        final_test_prediction_res[one_hot_targets.nonzero()], test_prediction_process)

    test_loss = lasagne.objectives.multiclass_hinge_loss(final_test_prediction_res,
                                                         vanilla_target_var)
    test_loss = test_loss.mean() + weight_decay
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction_res, axis=1), vanilla_target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, vanilla_target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, vanilla_target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 100, shuffle=True):
            inputs, targets = batch
            inputs = inputs.reshape(100, 40, 40)
            inputs = rotateImage_batch(inputs, nRotation).reshape(100 * nRotation, 1, 40, 40)
            duplicated_targets = np.array([targets for i in range(nRotation)]).reshape(100 * nRotation,)
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))

        if epoch % 50 == 0:
            # After training, we compute and print the test error:
            test_err = 0
            test_acc = 0
            test_batches = 0
            for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
                inputs, targets = batch
                inputs = inputs.reshape(500, 40, 40)
                inputs = rotateImage_batch(inputs, nRotation).reshape(500 * nRotation, 1, 40, 40)
                err, acc = val_fn(inputs, targets)
                test_err += err
                test_acc += acc
                test_batches += 1
            print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
            print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_train, y_train, 500, shuffle=False):
        inputs, targets = batch
        inputs = inputs.reshape(500, 40, 40)
        inputs = rotateImage_batch(inputs, nRotation).reshape(500 * nRotation, 1, 40, 40)
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # Optionally, you could now dump the network weights to a file like this:
    # np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    #
    # And load them again later on like this:
    # with np.load('model.npz') as f:
    #     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    # lasagne.layers.set_all_param_values(network, param_values)

    weightsOfParams = lasagne.layers.get_all_param_values(network)
    # np.save("../data/mnist_clutter_CNN_params_sigmoid.npy", weightsOfParams)
    # np.save("../data/mnist_CNN_params_sigmoid.npy", weightsOfParams)
    # np.save("../data/mnist_CNN_params.npy", weightsOfParams)
    # np.save("../data/mnist_CNN_params_drop_out_semi_Chi_Dec7.npy", weightsOfParams)
    np.save("../data/mnist_CNN_params_drop_out_Chi_2017_ROT_hinge_2000.npy", weightsOfParams)
def _create_time_step(self, t, mask, S, x, x_preact, h_weights, h_bias, v_weights, v_bias):
    """The Attention step function for theano.scan(). Creates the structure
    of one time step.

    The inputs do not contain the time step dimension. ``mask`` is a vector
    containing a boolean mask for each sequence. ``x_preact`` is a matrix
    containing the preactivations for each sequence.

    The required affine transformations have already been applied to the
    input prior to creating the loop. The transformed inputs and the mask
    that will be passed to the step function are vectors when processing a
    mini-batch - each value corresponds to the same time step in a
    different sequence.

    :type t: Variable
    :param t: the current time index

    :type mask: Variable
    :param mask: a symbolic vector that masks out sequences that are past
                 the last word

    :type x: Variable
    :param x: concatenation of the input vectors x_(t)

    :type x_preact: Variable
    :param x_preact: concatenation of the input x_(t) pre-activations
                     computed using the W1 weights and biases; shape is
                     (the number of sequences, num_hidden)

    :type S: Variable
    :param S: C_(t-1), layer output of the previous time step; shape is
              (the number of sequences, state size)

    :type h_weights: Variable
    :param h_weights: weights to be applied to S_(t-1); shape is
                      (output_size, hidden_size)

    :type h_bias: Variable
    :param h_bias: bias to be applied with h_weights

    :type v_weights: Variable
    :param v_weights: value weights to calculate the raw attention score;
                      shape is (hidden_size, 1)

    :type v_bias: Variable
    :param v_bias: bias to be applied with v_weights

    :rtype: matrix of output
    :returns: attended context vector for timestep t
    """
    if t == 0:
        return x[0, :, :]

    # transforming the previous output
    hidden = tensor.dot(S, h_weights) + h_bias

    # use V to calculate the attention scores for all previous input vectors
    raw_attention = tensor.dot(tensor.tanh(x_preact[:t+1, :, :] + hidden), v_weights) + v_bias

    # Apply softmax to calculate the current attention weights.
    # First reshape the 3D tensor into a 2D one.
    num_sequences = x.shape[1]
    # logging.debug("time: %s, seq: %s", t, num_sequences)
    raw_attention = tensor.swapaxes(raw_attention, 0, 1)
    raw_attention = raw_attention.reshape([num_sequences, raw_attention.shape[1]])

    # with softmax we get the attention scores for each time t
    # shape is (num_sequences, t)
    attentions = tensor.nnet.softmax(raw_attention)

    # Calculate the new output using the attention weights:
    # multiply the input vectors with the appropriate attention score.
    C_out = tensor.batched_dot(
        attentions.reshape([attentions.shape[0], 1, attentions.shape[1]]),
        x[:t+1, :, :].dimshuffle((1, 0, 2)))
    C_out = C_out.reshape([C_out.shape[0], C_out.shape[2]])

    # non_seq = [x[:t+1, :, :], attentions]
    # location = tensor.arange(num_sequences)
    # C_out, _ = theano.scan(fn=self._calc_sum,
    #                        outputs_info=None,
    #                        sequences=[location],
    #                        non_sequences=non_seq)

    # Apply the mask. None creates a new axis with size 1, causing the mask
    # to be broadcast to all the outputs.
    C_out = tensor.switch(mask[:, None], C_out, S)

    return C_out
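# Shape walkthrough for the attention computation above, as a plain NumPy sketch
# (illustrative only; it does not use the symbolic variables or weights of the class,
# and the score matrix is a random stand-in for v^T tanh(...)): softmax over the
# first t+1 time steps, then a weighted sum of the inputs per sequence.
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

num_steps, num_sequences, input_size = 4, 3, 5
x = np.random.randn(num_steps, num_sequences, input_size)    # inputs over time
scores = np.random.randn(num_sequences, num_steps)           # stand-in raw attention scores

attentions = softmax(scores)                                  # rows sum to 1
# weighted sum over the time axis gives one context vector per sequence
context = np.einsum('st,tsi->si', attentions, x)              # shape (num_sequences, input_size)
assert context.shape == (num_sequences, input_size)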