def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, ctx):
    """Advance the three-cell attention model by one sampling step.

    The previous sample ``x_tm1`` is projected into all three cells.  The
    first cell also receives a projection of the previous window ``w_tm1``;
    its new state produces the ``a``/``b``/``k`` terms handed to
    ``calc_phi``.  The resulting weights are multiplied against ``ctx`` and
    summed over axis 1 to form the new window ``w_t``, which feeds cells two
    and three.  The summed cell outputs go through three relu layers and a
    softmax projection, and the next input is drawn with ``sample_softmax``.

    Returns the tuple ``(x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t)``.

    NOTE(review): the ``theano.printing.Print`` results are discarded;
    presumably they only emit shapes when Theano evaluates test values
    while the graph is being built -- confirm before removing them.
    """
    # Projections of the previous sample into each cell.
    in_h1, gate_h1 = inp_to_h1.proj(x_tm1)
    in_h2, gate_h2 = inp_to_h2.proj(x_tm1)
    in_h3, gate_h3 = inp_to_h3.proj(x_tm1)

    # First cell: previous sample plus previous window.
    win_in_h1, win_gate_h1 = att_to_h1.proj(w_tm1)
    h1_t = cell1.step(in_h1 + win_in_h1, gate_h1 + win_gate_h1, h1_tm1)
    h1_in_h2, h1_gate_h2 = h1_to_h2.proj(h1_t)
    h1_in_h3, h1_gate_h3 = h1_to_h3.proj(h1_t)

    # Attention terms; k accumulates on top of its previous value.
    a_t = tensor.exp(h1_t.dot(h1_to_att_a))
    b_t = tensor.exp(h1_t.dot(h1_to_att_b))
    k_t = k_tm1 + tensor.exp(h1_t.dot(h1_to_att_k))

    ss_t = calc_phi(k_t, a_t, b_t, u)
    # calculate and return stopping criteria
    sh_t = calc_phi(k_t, a_t, b_t, u_max)

    weighted_ctx = ss_t.dimshuffle(0, 1, 'x') * ctx.dimshuffle(1, 0, 2)
    w_t = weighted_ctx.sum(axis=1)

    win_in_h2, win_gate_h2 = att_to_h2.proj(w_t)
    win_in_h3, win_gate_h3 = att_to_h3.proj(w_t)

    # Second cell: previous sample + first cell + new window.
    h2_in = in_h2 + h1_in_h2 + win_in_h2
    h2_gate = gate_h2 + h1_gate_h2 + win_gate_h2
    h2_t = cell2.step(h2_in, h2_gate, h2_tm1)
    h2_in_h3, h2_gate_h3 = h2_to_h3.proj(h2_t)

    # Third cell: previous sample + both lower cells + new window.
    h3_in = in_h3 + h1_in_h3 + h2_in_h3 + win_in_h3
    h3_gate = gate_h3 + h1_gate_h3 + h2_gate_h3 + win_gate_h3
    h3_t = cell3.step(h3_in, h3_gate, h3_tm1)

    out_t = (h1_t.dot(h1_to_outs)
             + h2_t.dot(h2_to_outs)
             + h3_t.dot(h3_to_outs))
    theano.printing.Print("out_t.shape")(out_t.shape)

    l1_t = relu(out_t.dot(l1_proj) + b_l1_proj)
    l2_t = relu(l1_t.dot(l2_proj) + b_l2_proj)
    l3_t = relu(l2_t.dot(l3_proj) + b_l3_proj)
    theano.printing.Print("l3_t.shape")(l3_t.shape)

    pred_t = l3_t.dot(softmax_proj) + b_softmax_proj
    theano.printing.Print("pred_t.shape")(pred_t.shape)
    per_feature = pred_t.reshape((-1, n_features, n_softmax))
    pred_t = softmax(per_feature * (1. + softmax_bias_sym))
    theano.printing.Print("pred_t.shape")(pred_t.shape)

    # Flatten to two dimensions for sampling, then restore the leading
    # two dimensions of the softmax output.
    pred_shape = pred_t.shape
    flat_pred = pred_t.reshape((-1, pred_shape[-1]))
    samp_t = sample_softmax(flat_pred, srng)
    samp_t = samp_t.reshape((pred_shape[0], pred_shape[1]))
    x_t = samp_t
    theano.printing.Print("samp_t.shape")(samp_t.shape)
    theano.printing.Print("x_t.shape")(x_t.shape)
    return x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t
def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1):
    """Sample one frame of ``n_frame`` values from the three-cell model.

    The previous frame ``x_tm1`` drives the three stacked cells; their
    summed, biased output ``out_t`` is consumed in ``n_hid``-wide column
    slices, one per position.  For position ``i`` the slice is concatenated
    with the flattened one-hot history from index ``i`` onwards, pushed
    through a three-layer relu MLP and a softmax, and a value is drawn with
    ``sample_softmax``.  Each draw is appended (one-hot) to the history, so
    later positions see earlier draws.

    Returns ``(x_t, h1_t, h2_t, h3_t)`` where ``x_t`` is the argmax of the
    history entries past index ``n_frame``, cast to ``floatX``.

    NOTE(review): the ``theano.printing.Print`` results are discarded;
    presumably they only emit shapes when Theano evaluates test values
    while the graph is being built -- confirm before removing them.
    """
    in_h1, gate_h1 = inp_to_h1.proj(x_tm1)
    in_h2, gate_h2 = inp_to_h2.proj(x_tm1)
    in_h3, gate_h3 = inp_to_h3.proj(x_tm1)

    h1_t = cell1.step(in_h1, gate_h1, h1_tm1)
    h1_in_h2, h1_gate_h2 = h1_to_h2.proj(h1_t)
    h1_in_h3, h1_gate_h3 = h1_to_h3.proj(h1_t)

    h2_t = cell2.step(in_h2 + h1_in_h2, gate_h2 + h1_gate_h2, h2_tm1)
    h2_in_h3, h2_gate_h3 = h2_to_h3.proj(h2_t)

    h3_in = in_h3 + h1_in_h3 + h2_in_h3
    h3_gate = gate_h3 + h1_gate_h3 + h2_gate_h3
    h3_t = cell3.step(h3_in, h3_gate, h3_tm1)

    out_t = (h1_t.dot(h1_to_outs)
             + h2_t.dot(h2_to_outs)
             + h3_t.dot(h3_to_outs)
             + b_to_outs)
    theano.printing.Print("x_tm1.shape")(x_tm1.shape)
    theano.printing.Print("out_t.shape")(out_t.shape)

    inpt_oh = theano_one_hot(x_tm1, n_classes=n_bins)
    theano.printing.Print("inpt_oh.shape")(inpt_oh.shape)

    # History of one-hot values; grows by one entry per sampled position.
    prev_t = inpt_oh
    for i in range(n_frame):
        lo = i * n_hid
        hi = (i + 1) * n_hid
        partial_out_t = out_t[:, lo:hi]
        theano.printing.Print("partial_out_t.shape")(partial_out_t.shape)
        theano.printing.Print("prev_t.shape")(prev_t.shape)

        # Drop the first i history entries and flatten the rest per row.
        n_rows = prev_t.shape[0]
        prev_ti = prev_t[:, i:].reshape((n_rows, -1))
        theano.printing.Print("prev_ti.shape")(prev_ti.shape)

        features_t = tensor.concatenate((partial_out_t, prev_ti), axis=1)
        theano.printing.Print("features_t.shape")(features_t.shape)

        hid1 = relu(features_t.dot(mlp1_w) + mlp1_b)
        hid2 = relu(hid1.dot(mlp2_w) + mlp2_b)
        hid3 = relu(hid2.dot(mlp3_w) + mlp3_b)
        pred_t = softmax(hid3.dot(pred_w) + pred_b)
        theano.printing.Print("pred_t.shape")(pred_t.shape)

        samp_t = sample_softmax(pred_t, srng)
        theano.printing.Print("samp_t.shape")(samp_t.shape)
        samp_t_oh = theano_one_hot(samp_t, n_classes=n_bins)
        samp_t_oh = samp_t_oh.dimshuffle(0, 'x', 1)
        theano.printing.Print("samp_t_oh.shape")(samp_t_oh.shape)

        prev_t = tensor.concatenate((prev_t, samp_t_oh), axis=1)
        theano.printing.Print("prev_t.shape")(prev_t.shape)

    # Keep only the entries appended after the first n_frame, as indices.
    sampled = prev_t[:, n_frame:].argmax(axis=-1)
    x_t = tensor.cast(sampled, theano.config.floatX)
    return x_t, h1_t, h2_t, h3_t
def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, ctx):
    """Advance the three-cell attention model by one sampling step.

    The previous sample ``x_tm1`` is projected into all three cells.  The
    first cell also receives a projection of the previous window ``w_tm1``;
    its new state produces the ``a``/``b``/``k`` terms handed to
    ``calc_phi``.  The resulting weights are multiplied against ``ctx`` and
    summed over axis 1 to form the new window ``w_t``, which feeds cells two
    and three.  The summed cell outputs are passed through ``softmax`` and
    the next input is drawn from them with ``sample_softmax``.

    Returns the tuple ``(x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t)``.
    """
    # Projections of the previous sample into each cell.
    in_h1, gate_h1 = inp_to_h1.proj(x_tm1)
    in_h2, gate_h2 = inp_to_h2.proj(x_tm1)
    in_h3, gate_h3 = inp_to_h3.proj(x_tm1)

    # First cell: previous sample plus previous window.
    win_in_h1, win_gate_h1 = att_to_h1.proj(w_tm1)
    h1_t = cell1.step(in_h1 + win_in_h1, gate_h1 + win_gate_h1, h1_tm1)
    h1_in_h2, h1_gate_h2 = h1_to_h2.proj(h1_t)
    h1_in_h3, h1_gate_h3 = h1_to_h3.proj(h1_t)

    # Attention terms; k accumulates on top of its previous value.
    a_t = tensor.exp(h1_t.dot(h1_to_att_a))
    b_t = tensor.exp(h1_t.dot(h1_to_att_b))
    k_t = k_tm1 + tensor.exp(h1_t.dot(h1_to_att_k))

    ss_t = calc_phi(k_t, a_t, b_t, u)
    # calculate and return stopping criteria
    sh_t = calc_phi(k_t, a_t, b_t, u_max)

    weighted_ctx = ss_t.dimshuffle(0, 1, 'x') * ctx.dimshuffle(1, 0, 2)
    w_t = weighted_ctx.sum(axis=1)

    win_in_h2, win_gate_h2 = att_to_h2.proj(w_t)
    win_in_h3, win_gate_h3 = att_to_h3.proj(w_t)

    # Second cell: previous sample + first cell + new window.
    h2_in = in_h2 + h1_in_h2 + win_in_h2
    h2_gate = gate_h2 + h1_gate_h2 + win_gate_h2
    h2_t = cell2.step(h2_in, h2_gate, h2_tm1)
    h2_in_h3, h2_gate_h3 = h2_to_h3.proj(h2_t)

    # Third cell: previous sample + both lower cells + new window.
    h3_in = in_h3 + h1_in_h3 + h2_in_h3 + win_in_h3
    h3_gate = gate_h3 + h1_gate_h3 + h2_gate_h3 + win_gate_h3
    h3_t = cell3.step(h3_in, h3_gate, h3_tm1)

    combined = (h1_t.dot(h1_to_outs)
                + h2_t.dot(h2_to_outs)
                + h3_t.dot(h3_to_outs))
    probs = softmax(combined)
    x_t = sample_softmax(probs, srng)
    return x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t
# Two stacked convolutions over `sliced`, both using the same border mode.
# NOTE(review): every theano.printing.Print result below is discarded;
# presumably they only emit shapes when Theano evaluates test values while
# the graph is being built -- confirm before removing them.
theano.printing.Print("sliced.shape")(sliced.shape)
border_mode = "half"
blur_conv = conv2d(sliced, w_blurconv, border_mode=border_mode)
theano.printing.Print("w_blurconv.shape")(w_blurconv.shape)
theano.printing.Print("blur_conv.shape")(blur_conv.shape)
final_conv = conv2d(blur_conv, w_finalconv, border_mode=border_mode)
theano.printing.Print("w_finalconv.shape")(w_finalconv.shape)
theano.printing.Print("final_conv.shape")(final_conv.shape)
# The bare string literal below is never executed; it holds a disabled
# softmax/cost/gradient pipeline built on `final_conv`.
# NOTE(review): inside it, theano_one_hot is called with `r=n_bins` --
# confirm that keyword against the helper's signature before re-enabling.
""" outs_deconv = final_conv[:, :, :, :input_dim] outs_deconv = outs_deconv.dimshuffle(2, 0, 3, 1) outs_deconv = outs_deconv[:target.shape[0]] theano.printing.Print("outs_deconv.shape")(outs_deconv.shape) preds = softmax(outs_deconv + b_softmax) theano.printing.Print("preds.shape")(preds.shape) theano.printing.Print("target.shape")(target.shape) target = theano_one_hot(target, r=n_bins) theano.printing.Print("target.shape")(target.shape) cost = categorical_crossentropy(preds, target) theano.printing.Print("cost.shape")(cost.shape) theano.printing.Print("mask.shape")(mask.shape) cost = cost * mask.dimshuffle(0, 1, 'x') cost = cost.sum() / (target.shape[0] * target.shape[1]) grads = tensor.grad(cost, params) init_x = as_shared(np_zeros((minibatch_size, n_out))) srng = RandomStreams(1999) """
# Run `out_step` over the two input sequences from an all-zero initial
# state, project the scan output to predictions, and build a masked
# cross-entropy cost.
# NOTE(review): every theano.printing.Print result below is discarded;
# presumably they only emit shapes when Theano evaluates test values while
# the graph is being built -- confirm before removing them.
init_hidden = tensor.zeros((shuff_inpt.shape[1], n_v_proj),
                           dtype=theano.config.floatX)
theano.printing.Print("init_hidden.shape")(init_hidden.shape)
v_h1, updates = theano.scan(
    fn=out_step,
    sequences=[vinp_h1, vgate_h1],
    outputs_info=[init_hidden])
pre_pred = v_h1.dot(pred_proj) + pred_b
# Swap the first two axes, then split the new leading axis into
# (minibatch_size, shp[0] // minibatch_size).
pre_pred = pre_pred.dimshuffle(1, 0, 2)
shp = pre_pred.shape
# Have to undo the minibatch_size * time features the same way they came in
pre_pred = pre_pred.reshape((minibatch_size, shp[0] // minibatch_size,
                             shp[1], shp[2]))
pre_pred = pre_pred.dimshuffle(1, 0, 2, 3)
theano.printing.Print("pre_pred.shape")(pre_pred.shape)
pred = softmax(pre_pred)
theano.printing.Print("target.shape")(target.shape)
# `target` is rebound to its one-hot form from here on.
target = theano_one_hot(target, n_classes=n_bins)
theano.printing.Print("target.shape")(target.shape)
theano.printing.Print("pred.shape")(pred.shape)
cost = categorical_crossentropy(pred, target)
# Mask is broadcast over the trailing cost axis.
cost = cost * mask.dimshuffle(0, 1, 'x')
# sum over sequence length and features, mean over minibatch
cost = cost.dimshuffle(0, 2, 1)
theano.printing.Print("cost.shape")(cost.shape)
cost = cost.reshape((-1, cost.shape[2]))
theano.printing.Print("cost.shape")(cost.shape)
cost = cost.sum(axis=0).mean()
# The next line opens a triple-quoted block whose closing quotes are not
# visible in this excerpt.
""" # optimize sum of probabilities rather than product?
# Two stacked convolutions over `sliced`, both using the same border mode.
# NOTE(review): every theano.printing.Print result below is discarded;
# presumably they only emit shapes when Theano evaluates test values while
# the graph is being built -- confirm before removing them.
theano.printing.Print("sliced.shape")(sliced.shape)
border_mode = "half"
blur_conv = conv2d(sliced, w_blurconv, border_mode=border_mode)
theano.printing.Print("w_blurconv.shape")(w_blurconv.shape)
theano.printing.Print("blur_conv.shape")(blur_conv.shape)
final_conv = conv2d(blur_conv, w_finalconv, border_mode=border_mode)
theano.printing.Print("w_finalconv.shape")(w_finalconv.shape)
theano.printing.Print("final_conv.shape")(final_conv.shape)
# The bare string literal below is never executed; it holds a disabled
# softmax/cost/gradient pipeline built on `final_conv`.
# NOTE(review): inside it, theano_one_hot is called with `r=n_bins` --
# confirm that keyword against the helper's signature before re-enabling.
""" outs_deconv = final_conv[:, :, :, :input_dim] outs_deconv = outs_deconv.dimshuffle(2, 0, 3, 1) outs_deconv = outs_deconv[:target.shape[0]] theano.printing.Print("outs_deconv.shape")(outs_deconv.shape) preds = softmax(outs_deconv + b_softmax) theano.printing.Print("preds.shape")(preds.shape) theano.printing.Print("target.shape")(target.shape) target = theano_one_hot(target, r=n_bins) theano.printing.Print("target.shape")(target.shape) cost = categorical_crossentropy(preds, target) theano.printing.Print("cost.shape")(cost.shape) theano.printing.Print("mask.shape")(mask.shape) cost = cost * mask.dimshuffle(0, 1, 'x') cost = cost.sum() / (target.shape[0] * target.shape[1]) grads = tensor.grad(cost, params) init_x = as_shared(np_zeros((minibatch_size, n_out))) srng = RandomStreams(1999) """
# Used to calculate stopping heuristic from section 5.3
def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, ctx):
    """Advance the two-softmax attention model by one sampling step.

    Columns 0 and 1 of ``x_tm1`` are one-hot encoded (``n_softmax1`` and
    ``n_softmax2`` classes), concatenated and reduced with ``inp_proj`` /
    ``inp_b`` before being projected into the three cells.  The first cell
    also receives a projection of the previous window ``w_tm1``; its new
    state produces the ``a``/``b``/``k`` terms handed to ``calc_phi``.  The
    resulting weights are multiplied against ``ctx`` and summed over axis 1
    to form the new window ``w_t``, which feeds cells two and three.  Two
    separate softmax heads read the summed cell outputs; one value is drawn
    from each and the pair is concatenated along axis 1 as the next input.

    Returns the tuple ``(x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t)``.

    NOTE(review): the ``theano.printing.Print`` results are discarded;
    presumably they only emit shapes when Theano evaluates test values
    while the graph is being built -- confirm before removing them.
    """
    theano.printing.Print("x_tm1.shape")(x_tm1.shape)
    pt1 = theano_one_hot(x_tm1[:, 0], n_classes=n_softmax1)
    theano.printing.Print("pt1.shape")(pt1.shape)
    pt2 = theano_one_hot(x_tm1[:, 1], n_classes=n_softmax2)
    theano.printing.Print("pt2.shape")(pt2.shape)

    # From here on x_tm1 is the concatenated one-hot encoding.
    x_tm1 = tensor.concatenate((pt1, pt2), axis=-1)
    theano.printing.Print("x_tm1.shape")(x_tm1.shape)
    x_tm1_reduced = x_tm1.dot(inp_proj) + inp_b
    theano.printing.Print("x_tm1_reduced.shape")(x_tm1_reduced.shape)

    in_h1, gate_h1 = inp_to_h1.proj(x_tm1_reduced)
    in_h2, gate_h2 = inp_to_h2.proj(x_tm1_reduced)
    in_h3, gate_h3 = inp_to_h3.proj(x_tm1_reduced)

    # First cell: reduced input plus previous window.
    win_in_h1, win_gate_h1 = att_to_h1.proj(w_tm1)
    h1_t = cell1.step(in_h1 + win_in_h1, gate_h1 + win_gate_h1, h1_tm1)
    h1_in_h2, h1_gate_h2 = h1_to_h2.proj(h1_t)
    h1_in_h3, h1_gate_h3 = h1_to_h3.proj(h1_t)

    # Attention terms; k accumulates on top of its previous value.
    a_t = tensor.exp(h1_t.dot(h1_to_att_a))
    b_t = tensor.exp(h1_t.dot(h1_to_att_b))
    k_t = k_tm1 + tensor.exp(h1_t.dot(h1_to_att_k))

    ss_t = calc_phi(k_t, a_t, b_t, u)
    # calculate and return stopping criteria
    sh_t = calc_phi(k_t, a_t, b_t, u_max)

    weighted_ctx = ss_t.dimshuffle(0, 1, 'x') * ctx.dimshuffle(1, 0, 2)
    w_t = weighted_ctx.sum(axis=1)

    win_in_h2, win_gate_h2 = att_to_h2.proj(w_t)
    win_in_h3, win_gate_h3 = att_to_h3.proj(w_t)

    # Second cell: reduced input + first cell + new window.
    h2_in = in_h2 + h1_in_h2 + win_in_h2
    h2_gate = gate_h2 + h1_gate_h2 + win_gate_h2
    h2_t = cell2.step(h2_in, h2_gate, h2_tm1)
    h2_in_h3, h2_gate_h3 = h2_to_h3.proj(h2_t)

    # Third cell: reduced input + both lower cells + new window.
    h3_in = in_h3 + h1_in_h3 + h2_in_h3 + win_in_h3
    h3_gate = gate_h3 + h1_gate_h3 + h2_gate_h3 + win_gate_h3
    h3_t = cell3.step(h3_in, h3_gate, h3_tm1)

    out_t = (h1_t.dot(h1_to_outs)
             + h2_t.dot(h2_to_outs)
             + h3_t.dot(h3_to_outs))
    # Disabled intermediate layers, kept for reference:
    #l1_t = relu(out_t.dot(l1_proj) + l1_b)
    #l2_t = relu(l1_t.dot(l2_proj) + l2_b)
    pred1_t = softmax(out_t.dot(softmax1_proj) + softmax1_b)
    pred2_t = softmax(out_t.dot(softmax2_proj) + softmax2_b)

    # Draw head 1 before head 2 so the random stream is consumed in the
    # same order as before.
    s1_t = sample_softmax(pred1_t, srng)
    theano.printing.Print("s1_t.shape")(s1_t.shape)
    s2_t = sample_softmax(pred2_t, srng)
    theano.printing.Print("s2_t.shape")(s2_t.shape)

    # Add a trailing axis to each draw so they can be joined column-wise.
    s1_t = s1_t.dimshuffle(0, 'x')
    theano.printing.Print("s1_t.shape")(s1_t.shape)
    s2_t = s2_t.dimshuffle(0, 'x')
    theano.printing.Print("s2_t.shape")(s2_t.shape)

    x_t = tensor.concatenate((s1_t, s2_t), axis=1)
    theano.printing.Print("x_t.shape")(x_t.shape)
    return x_t, h1_t, h2_t, h3_t, k_t, w_t, ss_t, sh_t
# Run `step` over the six per-cell input/gate sequences, carrying the three
# cell states plus kappa and w, with `context` passed unchanged each step.
# NOTE(review): every theano.printing.Print result below is discarded;
# presumably they only emit shapes when Theano evaluates test values while
# the graph is being built -- confirm before removing them.
(h1, h2, h3, kappa, w), updates = theano.scan(
    fn=step,
    sequences=[inp_h1, inpgate_h1, inp_h2, inpgate_h2, inp_h3, inpgate_h3],
    outputs_info=[init_h1, init_h2, init_h3, init_kappa, init_w],
    non_sequences=[context])
# Combine the three cell outputs into one representation.
outs = h1.dot(h1_to_outs) + h2.dot(h2_to_outs) + h3.dot(h3_to_outs)
theano.printing.Print("outs.shape")(outs.shape)
outs_shape = outs.shape
# Disabled intermediate relu layers, kept for reference:
#l1 = relu(outs.dot(l1_proj) + l1_b)
#l2 = relu(l1.dot(l2_proj) + l2_b)
#theano.printing.Print("l1.shape")(l1.shape)
#theano.printing.Print("l2.shape")(l2.shape)
# Two independent softmax heads read the same `outs`.
pred1 = softmax(outs.dot(softmax1_proj) + softmax1_b)
pred2 = softmax(outs.dot(softmax2_proj) + softmax2_b)
theano.printing.Print("pred1.shape")(pred1.shape)
theano.printing.Print("pred2.shape")(pred2.shape)
# Make one hot targets
theano.printing.Print("target.shape")(target.shape)
# Index 0 / 1 on the last axis of `target` select the label for head 1 / 2.
target1 = target[:, :, 0]
target2 = target[:, :, 1]
theano.printing.Print("target1.shape")(target1.shape)
theano.printing.Print("target2.shape")(target2.shape)
# Flatten, one-hot encode, then restore the two leading axes with a new
# trailing axis of size n_softmax1.
shp = target1.shape
target1 = target1.ravel()
target1 = theano_one_hot(target1, n_classes=n_softmax1)
target1 = target1.reshape((shp[0], shp[1], n_softmax1))
joint = tensor.concatenate((inpt_oh, next_oh), axis=2) sliced_context = joint[:, :, i:i + n_frame] theano.printing.Print("sliced_context.shape")(sliced_context.shape) shp = sliced_context.shape sliced_context = sliced_context.reshape((shp[0], shp[1], -1)) features = tensor.concatenate((partial_outs, sliced_context), axis=2) theano.printing.Print("partial_outs.shape")(partial_outs.shape) theano.printing.Print("joint.shape")(joint.shape) theano.printing.Print("sliced_context.shape")(sliced_context.shape) theano.printing.Print("features.shape")(features.shape) shp = features.shape mlp_inpt = features.reshape((-1, shp[-1])) mlp1 = relu(mlp_inpt.dot(mlp1_w) + mlp1_b) mlp2 = relu(mlp1.dot(mlp2_w) + mlp2_b) mlp3 = relu(mlp2.dot(mlp3_w) + mlp3_b) pred = softmax(mlp3.dot(pred_w) + pred_b) theano.printing.Print("pred.shape")(pred.shape) pred = pred.reshape((shp[0], shp[1], -1)) theano.printing.Print("pred.shape")(pred.shape) pred_i.append(pred.dimshuffle(0, 1, 2, 'x')) pred = tensor.concatenate(pred_i, axis=-1).dimshuffle(0, 1, 3, 2) theano.printing.Print("pred.shape")(pred.shape) theano.printing.Print("target.shape")(target.shape) target = theano_one_hot(target, n_classes=n_bins) theano.printing.Print("target.shape")(target.shape) # dimshuffle so batch is on last axis cost = categorical_crossentropy(pred, target, eps=1E-9) theano.printing.Print("cost.shape")(cost.shape) theano.printing.Print("mask.shape")(mask.shape) cost = cost * mask.dimshuffle(0, 1, 'x')
# Run `out_step` over (shuff_inpt, vinp), reshape its first output into
# per-minibatch predictions, and (after the raise below) build a masked
# cross-entropy cost.
# NOTE(review): every theano.printing.Print result below is discarded;
# presumably they only emit shapes when Theano evaluates test values while
# the graph is being built -- confirm before removing them.
theano.printing.Print("init_pred.shape")(init_pred.shape)
theano.printing.Print("init_hidden.shape")(init_hidden.shape)
r, updates = theano.scan(fn=out_step,
                         sequences=[shuff_inpt, vinp],
                         outputs_info=[init_pred, init_hidden])
(pre_pred, v_h1) = r
theano.printing.Print("pre_pred.shape")(pre_pred.shape)
# Swap the first two axes and append a new trailing axis.
pre_pred = pre_pred.dimshuffle(1, 0, 'x')
shp = pre_pred.shape
theano.printing.Print("pre_pred.shape")(pre_pred.shape)
# Split the leading axis into (minibatch_size, shp[0] // minibatch_size).
pre_pred = pre_pred.reshape(
    (minibatch_size, shp[0] // minibatch_size, shp[1], shp[2]))
theano.printing.Print("pre_pred.shape")(pre_pred.shape)
pre_pred = pre_pred.dimshuffle(1, 0, 2, 3)
theano.printing.Print("pre_pred.shape")(pre_pred.shape)
pred = softmax(pre_pred)
theano.printing.Print("pred.shape")(pred.shape)
theano.printing.Print("target.shape")(target.shape)
# `target` is rebound to its one-hot form from here on.
target = theano_one_hot(target, n_classes=n_bins)
theano.printing.Print("target.shape")(target.shape)
# NOTE(review): this unconditional raise halts execution here, so none of
# the cost / penalty code below is ever reached.  It looks like a leftover
# debugging stop -- confirm and remove before relying on this path.
raise ValueError()
cost = categorical_crossentropy(pred, target)
# Mask is broadcast over the trailing cost axis.
cost = cost * mask.dimshuffle(0, 1, 'x')
# sum over sequence length and features, mean over minibatch
cost = cost.dimshuffle(0, 2, 1)
cost = cost.reshape((-1, cost.shape[2]))
cost = cost.sum(axis=0).mean()
l2_penalty = 0
# Iterates over params that are not in `biases`; the loop body is not
# visible in this excerpt.  Going through set() means iteration order is
# not the order of `params`.
for p in list(set(params) - set(biases)):
h2_t = GRU(h1_h2_t, h1gate_h2_t, h2_tm1, n_hid, n_hid, random_state) h2_h3_t, h2gate_h3_t = GRUFork([h2_t], [n_hid], n_hid, random_state) h3_t = GRU(h2_h3_t, h2gate_h3_t, h3_tm1, n_hid, n_hid, random_state) return h1_t, h2_t, h3_t (h1, h2, h3), updates = theano.scan( fn=step, sequences=[in_h1, ingate_h1], outputs_info=[init_h1, init_h2, init_h3]) out = Linear([h3], [n_hid], n_bins, random_state) pred = softmax(out) shp = target.shape target = target.reshape((shp[0], shp[1])) target = theano_one_hot(target, n_classes=n_bins) # dimshuffle so batch is on last axis cost = categorical_crossentropy(pred, target) cost = cost * mask.dimshuffle(0, 1) # sum over sequence length and features, mean over minibatch cost = cost.dimshuffle(1, 0) cost = cost.mean() # convert to bits vs nats cost = cost * tensor.cast(1.44269504089, theano.config.floatX) params = param_search(cost, lambda x: hasattr(x, "param")) print_param_info(params)
h1_t = GRU(in_h1_t, ingate_h1_t, h1_tm1, n_hid, n_hid, random_state) h1_h2_t, h1gate_h2_t = GRUFork([h1_t], [n_hid], n_hid, random_state) h2_t = GRU(h1_h2_t, h1gate_h2_t, h2_tm1, n_hid, n_hid, random_state) h2_h3_t, h2gate_h3_t = GRUFork([h2_t], [n_hid], n_hid, random_state) h3_t = GRU(h2_h3_t, h2gate_h3_t, h3_tm1, n_hid, n_hid, random_state) return h1_t, h2_t, h3_t (h1, h2, h3), updates = theano.scan(fn=step, sequences=[in_h1, ingate_h1], outputs_info=[init_h1, init_h2, init_h3]) out = Linear([h3], [n_hid], n_bins, random_state) pred = softmax(out) shp = target.shape target = target.reshape((shp[0], shp[1])) target = theano_one_hot(target, n_classes=n_bins) # dimshuffle so batch is on last axis cost = categorical_crossentropy(pred, target) cost = cost * mask.dimshuffle(0, 1) # sum over sequence length and features, mean over minibatch cost = cost.dimshuffle(1, 0) cost = cost.mean() # convert to bits vs nats cost = cost * tensor.cast(1.44269504089, theano.config.floatX) params = param_search(cost, lambda x: hasattr(x, "param")) print_param_info(params)
# Run `step` over the six per-cell input/gate sequences, carrying the three
# cell states plus kappa and w, with `context` passed unchanged each step.
# NOTE(review): every theano.printing.Print result below is discarded;
# presumably they only emit shapes when Theano evaluates test values while
# the graph is being built -- confirm before removing them.
(h1, h2, h3, kappa, w), updates = theano.scan(
    fn=step,
    sequences=[inp_h1, inpgate_h1, inp_h2, inpgate_h2, inp_h3, inpgate_h3],
    outputs_info=[init_h1, init_h2, init_h3, init_kappa, init_w],
    non_sequences=[context])
# Combine the three cell outputs, then apply three relu layers.
outs = h1.dot(h1_to_outs) + h2.dot(h2_to_outs) + h3.dot(h3_to_outs)
l1 = relu(outs.dot(l1_proj) + b_l1_proj)
l2 = relu(l1.dot(l2_proj) + b_l2_proj)
l3 = relu(l2.dot(l3_proj) + b_l3_proj)
# Flatten to two dimensions for the softmax projection, then restore the
# two leading axes and split the rest into (n_features, n_softmax).
shp = l3.shape
l3 = l3.reshape((-1, shp[-1]))
preds = l3.dot(softmax_proj) + b_softmax_proj
preds = preds.reshape((shp[0], shp[1], n_features, n_softmax))
# Pre-softmax values are scaled by (1 + softmax_bias_sym).
preds = softmax(preds * (1. + softmax_bias_sym))
theano.printing.Print("preds.shape")(preds.shape)
theano.printing.Print("target.shape")(target.shape)
# `target` is rebound to its one-hot form from here on.
target = theano_one_hot(target, n_softmax)
theano.printing.Print("target.shape")(target.shape)
cost = categorical_crossentropy(preds, target, eps=1E-9)
theano.printing.Print("cost.shape")(cost.shape)
# Mask is broadcast over the trailing cost axis.
cost = cost * mask.dimshuffle(0, 1, 'x')
# Total masked cost divided by cut_len.
cost = cost.sum() / cut_len
grads = tensor.grad(cost, params)
# 10. is the threshold handed to gradient_clipping.
grads = gradient_clipping(grads, 10.)
learning_rate = 1E-4
opt = adam(params, learning_rate)
init_h1, init_h2, init_h3, init_kappa, init_w, context) r = step(inp_h1[1], inpgate_h1[1], inp_h2[1], inpgate_h2[1], inp_h3[1], inpgate_h3[1], r[0], r[1], r[2], r[3], r[4], context) """ (h1, h2, h3, kappa, w), updates = theano.scan( fn=step, sequences=[inp_h1, inpgate_h1, inp_h2, inpgate_h2, inp_h3, inpgate_h3], outputs_info=[init_h1, init_h2, init_h3, init_kappa, init_w], non_sequences=[context]) outs = h1.dot(h1_to_outs) + h2.dot(h2_to_outs) + h3.dot(h3_to_outs) outs = softmax(outs) cost = categorical_crossentropy(outs, target) cost = cost * mask cost = cost.sum() / cut_len grads = tensor.grad(cost, params) grads = gradient_clipping(grads, 10.) learning_rate = 1E-4 opt = adam(params, learning_rate) updates = opt.updates(params, grads) train_function = theano.function([X_sym, X_mask_sym, c_sym, c_mask_sym, init_h1, init_h2, init_h3, init_kappa, init_w],