def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, ctx):
    """Build the symbolic graph for a single sampling step.

    The previous sample and the previous readout ``w_tm1`` drive ``cell1``.
    Its new state yields ``a_t``, ``b_t`` and ``k_t``, which ``calc_phi``
    turns into weights over ``ctx``. The weighted sum ``w_t`` then feeds
    ``cell2`` and ``cell3``. The summed output projections of the three
    cells go through three relu layers and a softmax, from which the next
    sample is drawn with ``sample_softmax``.

    Relies on names defined outside this function (projection layers,
    cells, weight matrices, ``calc_phi``, ``u``, ``u_max``, ``srng``, ...).

    Parameters
    ----------
    x_tm1
        Previous sample; projected into all three cells.
    h1_tm1, h2_tm1, h3_tm1
        Previous states of ``cell1``, ``cell2`` and ``cell3``.
    k_tm1
        Previous value of ``k``; the new value adds an exponentiated term.
    w_tm1
        Previous readout; projected into ``cell1`` only.
    ctx
        Context the weights are applied to. It is used as
        ``ctx.dimshuffle(1, 0, 2)``, so it must be 3-dimensional.

    Returns
    -------
    tuple
        ``(x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t)`` where ``x_t`` is
        the new sample, ``ss_t`` is ``calc_phi`` evaluated with ``u`` and
        ``sh_t`` is ``calc_phi`` evaluated with ``u_max``.
    """
    def show_shape(label, var):
        # Shape-printing debug op; its output is deliberately left unused.
        theano.printing.Print(label)(var.shape)

    # Previous sample -> input/gate contributions for every cell.
    x_in1, x_gate1 = inp_to_h1.proj(x_tm1)
    x_in2, x_gate2 = inp_to_h2.proj(x_tm1)
    x_in3, x_gate3 = inp_to_h3.proj(x_tm1)

    # cell1 additionally receives the previous readout.
    w_in1, w_gate1 = att_to_h1.proj(w_tm1)
    h1_t = cell1.step(x_in1 + w_in1, x_gate1 + w_gate1, h1_tm1)
    h1_in2, h1_gate2 = h1_to_h2.proj(h1_t)
    h1_in3, h1_gate3 = h1_to_h3.proj(h1_t)

    # exp() keeps a_t, b_t and the increment added to k_tm1 positive.
    a_t = tensor.exp(h1_t.dot(h1_to_att_a))
    b_t = tensor.exp(h1_t.dot(h1_to_att_b))
    k_t = k_tm1 + tensor.exp(h1_t.dot(h1_to_att_k))

    ss_t = calc_phi(k_t, a_t, b_t, u)
    # calculate and return stopping criteria
    sh_t = calc_phi(k_t, a_t, b_t, u_max)

    # Weight the context by ss_t and sum over axis 1 to get the new readout.
    weighted_ctx = ss_t.dimshuffle(0, 1, 'x') * ctx.dimshuffle(1, 0, 2)
    w_t = weighted_ctx.sum(axis=1)

    w_in2, w_gate2 = att_to_h2.proj(w_t)
    w_in3, w_gate3 = att_to_h3.proj(w_t)

    h2_t = cell2.step(x_in2 + h1_in2 + w_in2,
                      x_gate2 + h1_gate2 + w_gate2,
                      h2_tm1)
    h2_in3, h2_gate3 = h2_to_h3.proj(h2_t)

    h3_t = cell3.step(x_in3 + h1_in3 + h2_in3 + w_in3,
                      x_gate3 + h1_gate3 + h2_gate3 + w_gate3,
                      h3_tm1)

    out_t = (h1_t.dot(h1_to_outs)
             + h2_t.dot(h2_to_outs)
             + h3_t.dot(h3_to_outs))
    show_shape("out_t.shape", out_t)

    # Three relu layers applied in sequence.
    hidden = out_t
    for weight, bias in ((l1_proj, b_l1_proj),
                         (l2_proj, b_l2_proj),
                         (l3_proj, b_l3_proj)):
        hidden = relu(hidden.dot(weight) + bias)
    show_shape("l3_t.shape", hidden)

    logits = hidden.dot(softmax_proj) + b_softmax_proj
    show_shape("pred_t.shape", logits)
    logits = logits.reshape((-1, n_features, n_softmax))
    probs = softmax(logits * (1. + softmax_bias_sym))
    show_shape("pred_t.shape", probs)

    # Sample from the flattened (rows, classes) view, then restore the
    # first two axes of the probability tensor.
    probs_shape = probs.shape
    flat_probs = probs.reshape((-1, probs_shape[-1]))
    x_t = sample_softmax(flat_probs, srng).reshape(
        (probs_shape[0], probs_shape[1]))
    show_shape("samp_t.shape", x_t)
    show_shape("x_t.shape", x_t)
    return x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t
def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1):
    """Build the symbolic graph for a single frame-by-frame sampling step.

    The previous sample drives three stacked cells. Their summed output
    projections (plus ``b_to_outs``) are split into ``n_frame`` slices of
    width ``n_hid``. For each slice, a three-layer relu network followed by
    a softmax sees the slice concatenated with a flattened window of the
    one-hot history. A value is drawn with ``sample_softmax``, one-hot
    encoded and appended to the history, so later positions see it.

    Relies on names defined outside this function (projection layers,
    cells, weights, ``n_frame``, ``n_hid``, ``n_bins``, ``srng``, ...).

    Parameters
    ----------
    x_tm1
        Previous sample; projected into all three cells and one-hot encoded
        with ``theano_one_hot`` to seed the history.
    h1_tm1, h2_tm1, h3_tm1
        Previous states of ``cell1``, ``cell2`` and ``cell3``.

    Returns
    -------
    tuple
        ``(x_t, h1_t, h2_t, h3_t)``. ``x_t`` is the argmax over the last
        axis of the history entries from index ``n_frame`` onward along
        axis 1, cast to ``theano.config.floatX``.
    """
    def show_shape(label, var):
        # Shape-printing debug op; its output is deliberately left unused.
        theano.printing.Print(label)(var.shape)

    # Previous sample -> input/gate contributions for every cell.
    x_in1, x_gate1 = inp_to_h1.proj(x_tm1)
    x_in2, x_gate2 = inp_to_h2.proj(x_tm1)
    x_in3, x_gate3 = inp_to_h3.proj(x_tm1)

    h1_t = cell1.step(x_in1, x_gate1, h1_tm1)
    h1_in2, h1_gate2 = h1_to_h2.proj(h1_t)
    h1_in3, h1_gate3 = h1_to_h3.proj(h1_t)

    h2_t = cell2.step(x_in2 + h1_in2, x_gate2 + h1_gate2, h2_tm1)
    h2_in3, h2_gate3 = h2_to_h3.proj(h2_t)

    h3_t = cell3.step(x_in3 + h1_in3 + h2_in3,
                      x_gate3 + h1_gate3 + h2_gate3,
                      h3_tm1)

    rnn_out = (h1_t.dot(h1_to_outs)
               + h2_t.dot(h2_to_outs)
               + h3_t.dot(h3_to_outs)
               + b_to_outs)
    show_shape("x_tm1.shape", x_tm1)
    show_shape("out_t.shape", rnn_out)

    # The history starts as the one-hot previous sample and grows by one
    # entry along axis 1 for every value drawn below.
    history = theano_one_hot(x_tm1, n_classes=n_bins)
    show_shape("inpt_oh.shape", history)

    for i in range(n_frame):
        frame_hid = rnn_out[:, i * n_hid:(i + 1) * n_hid]
        show_shape("partial_out_t.shape", frame_hid)
        show_shape("prev_t.shape", history)

        # Drop the first i history entries and flatten the rest per row.
        lead_dim = history.shape[0]
        window = history[:, i:].reshape((lead_dim, -1))
        show_shape("prev_ti.shape", window)

        hidden = tensor.concatenate((frame_hid, window), axis=1)
        show_shape("features_t.shape", hidden)

        # Three relu layers applied in sequence.
        for weight, bias in ((mlp1_w, mlp1_b),
                             (mlp2_w, mlp2_b),
                             (mlp3_w, mlp3_b)):
            hidden = relu(hidden.dot(weight) + bias)

        probs = softmax(hidden.dot(pred_w) + pred_b)
        show_shape("pred_t.shape", probs)

        drawn = sample_softmax(probs, srng)
        show_shape("samp_t.shape", drawn)

        # One-hot encode the draw and give it a length-1 axis 1 so it can
        # be appended to the history.
        drawn_oh = theano_one_hot(drawn, n_classes=n_bins)
        drawn_oh = drawn_oh.dimshuffle(0, 'x', 1)
        show_shape("samp_t_oh.shape", drawn_oh)

        history = tensor.concatenate((history, drawn_oh), axis=1)
        show_shape("prev_t.shape", history)

    # Entries from index n_frame onward are the ones appended by the loop.
    sampled_bins = history[:, n_frame:].argmax(axis=-1)
    x_t = tensor.cast(sampled_bins, theano.config.floatX)
    return x_t, h1_t, h2_t, h3_t
# Prepare targets and context, run the input through two strided
# convolutions, project the result for the first two recurrent layers, and
# build the index grid `u`. In statement order:
#   - target / mask: X_sym and X_mask_sym with their first entry along
#     axis 0 dropped.
#   - context: c_sym multiplied by c_mask_sym, the mask being broadcast
#     over a new trailing axis.
#   - inpt: axes reordered to (1, broadcast, 0, 2), giving a 4-D tensor
#     for conv2d.
#   - conv1 / conv2: conv2d with subsample=(2, 1) and border_mode
#     (conv_size - 1, 0); each bias is broadcast over every axis except
#     axis 1. Only conv2 is passed through relu.
#     NOTE(review): conv1 has no nonlinearity; confirm this is intended.
#   - conv_out: index 0 of conv2's last axis, then axes reordered (2, 0, 1).
#   - conv_h1/convgate_h1 and conv_h2/convgate_h2: conv_to_h1 / conv_to_h2
#     projections of conv_out.
#   - u: arange over c_sym.shape[0], shaped (1, 1, N) and cast to floatX.
#   - calc_phi(k_t, a_t, b_t, u_c): its first two statements add a
#     broadcastable trailing axis to a_t and b_t; only those two statements
#     are documented here.
# The theano.printing.Print results are not assigned to anything.
target = X_sym[1:] mask = X_mask_sym[1:] context = c_sym * c_mask_sym.dimshuffle(0, 1, 'x') theano.printing.Print("inpt.shape")(inpt.shape) theano.printing.Print("target.shape")(target.shape) inpt = inpt.dimshuffle(1, 'x', 0, 2) border_mode = (conv_size1 - 1, 0) conv1 = conv2d(inpt, w_conv1, subsample=(2, 1), border_mode=border_mode) conv1 = conv1 + b_conv1.dimshuffle('x', 0, 'x', 'x') theano.printing.Print("conv1.shape")(conv1.shape) border_mode = (conv_size2 - 1, 0) conv2 = conv2d(conv1, w_conv2, subsample=(2, 1), border_mode=border_mode) conv2 = relu(conv2 + b_conv2.dimshuffle('x', 0, 'x', 'x')) theano.printing.Print("conv2.shape")(conv2.shape) # Last axis is 1 conv_out = conv2[:, :, :, 0].dimshuffle(2, 0, 1) theano.printing.Print("conv_out.shape")(conv_out.shape) conv_h1, convgate_h1 = conv_to_h1.proj(conv_out) conv_h2, convgate_h2 = conv_to_h2.proj(conv_out) u = tensor.arange(c_sym.shape[0]).dimshuffle('x', 'x', 0) u = tensor.cast(u, theano.config.floatX) def calc_phi(k_t, a_t, b_t, u_c): a_t = a_t.dimshuffle(0, 1, 'x') b_t = b_t.dimshuffle(0, 1, 'x')
# Per-position predictions: for each of the n_frame positions, a three-layer
# relu network plus softmax sees that position's n_hid-wide slice of `outs`
# together with an n_frame-wide window (starting at i) of inpt_oh and
# next_oh joined along axis 2.
pred_i = []
for i in range(n_frame):
    partial_outs = outs[:, :, i * n_hid:(i + 1) * n_hid]

    joint = tensor.concatenate([inpt_oh, next_oh], axis=2)
    sliced_context = joint[:, :, i:i + n_frame]
    theano.printing.Print("sliced_context.shape")(sliced_context.shape)

    # Flatten everything after the first two axes of the window.
    shp = sliced_context.shape
    sliced_context = sliced_context.reshape((shp[0], shp[1], -1))

    features = tensor.concatenate([partial_outs, sliced_context], axis=2)
    theano.printing.Print("partial_outs.shape")(partial_outs.shape)
    theano.printing.Print("joint.shape")(joint.shape)
    theano.printing.Print("sliced_context.shape")(sliced_context.shape)
    theano.printing.Print("features.shape")(features.shape)

    # Collapse the first two axes so the network runs on a 2-D matrix;
    # `shp` is kept to restore them afterwards.
    shp = features.shape
    mlp_inpt = features.reshape((-1, shp[-1]))
    mlp1 = relu(mlp_inpt.dot(mlp1_w) + mlp1_b)
    mlp2 = relu(mlp1.dot(mlp2_w) + mlp2_b)
    mlp3 = relu(mlp2.dot(mlp3_w) + mlp3_b)
    pred = softmax(mlp3.dot(pred_w) + pred_b)
    theano.printing.Print("pred.shape")(pred.shape)

    pred = pred.reshape((shp[0], shp[1], -1))
    theano.printing.Print("pred.shape")(pred.shape)

    # A trailing length-1 axis lets the per-position results be joined.
    pred_i.append(pred.dimshuffle(0, 1, 2, 'x'))

# Join along the new last axis, then swap the last two axes so the
# position axis comes before the class axis.
pred = tensor.concatenate(pred_i, axis=-1)
pred = pred.dimshuffle(0, 1, 3, 2)
theano.printing.Print("pred.shape")(pred.shape)
theano.printing.Print("target.shape")(target.shape)

target = theano_one_hot(target, n_classes=n_bins)
theano.printing.Print("target.shape")(target.shape)

cost = categorical_crossentropy(pred, target, eps=1e-9)
theano.printing.Print("cost.shape")(cost.shape)
# Tail of sample_step (its `def` line is not part of this span): prints the
# shape of x_t and returns (x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t).
#
# The statements after the return, in order:
#   - Call sample_step once on the init_* values to obtain the one-step
#     sampling outputs (sampled, h1_s, h2_s, h3_s, k_s, w_s, stop_s, stop_h).
#     NOTE(review): this call passes c_sym, while the scan below passes
#     `context`; confirm the difference is intended.
#   - theano.scan runs `step` over the six input/gate sequences, carrying
#     (h1, h2, h3, kappa, w) from their init_* values, with `context` as the
#     only non-sequence. The returned `updates` is bound here.
#   - outs sums the h1/h2/h3 output projections; l1, l2, l3 are three relu
#     layers on top of it.
#   - l3 is flattened to 2-D for the softmax projection, and the result is
#     reshaped to (shp[0], shp[1], n_features, n_softmax), where shp is the
#     shape of l3 before flattening.
#   - preds is scaled by (1. + softmax_bias_sym) before softmax.
#   - target is re-encoded with theano_one_hot(target, n_softmax), and cost
#     is categorical_crossentropy(preds, target, eps=1E-9).
# The theano.printing.Print results are not assigned to anything.
theano.printing.Print("x_t.shape")(x_t.shape) return x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t (sampled, h1_s, h2_s, h3_s, k_s, w_s, stop_s, stop_h) = sample_step(init_x, init_h1, init_h2, init_h3, init_kappa, init_w, c_sym) theano.printing.Print("sampled.shape")(sampled.shape) (h1, h2, h3, kappa, w), updates = theano.scan( fn=step, sequences=[inp_h1, inpgate_h1, inp_h2, inpgate_h2, inp_h3, inpgate_h3], outputs_info=[init_h1, init_h2, init_h3, init_kappa, init_w], non_sequences=[context]) outs = h1.dot(h1_to_outs) + h2.dot(h2_to_outs) + h3.dot(h3_to_outs) l1 = relu(outs.dot(l1_proj) + b_l1_proj) l2 = relu(l1.dot(l2_proj) + b_l2_proj) l3 = relu(l2.dot(l3_proj) + b_l3_proj) shp = l3.shape l3 = l3.reshape((-1, shp[-1])) preds = l3.dot(softmax_proj) + b_softmax_proj preds = preds.reshape((shp[0], shp[1], n_features, n_softmax)) preds = softmax(preds * (1. + softmax_bias_sym)) theano.printing.Print("preds.shape")(preds.shape) theano.printing.Print("target.shape")(target.shape) target = theano_one_hot(target, n_softmax) theano.printing.Print("target.shape")(target.shape) cost = categorical_crossentropy(preds, target, eps=1E-9) theano.printing.Print("cost.shape")(cost.shape)