# Imports assumed by these excerpts (the originals are part of a larger file);
# the infer_shape import location is an assumption:
import cgt
from cgt import nn
from cgt.core import infer_shape
import numpy as np
import numpy.random as nr

def ntm_address(opt, wprev_bhn, M_bnm, k_bhm, beta_bh, g_bh, s_bh3, gamma_bh):
    # Content addressing

    # Cosine similarity
    # take inner product along memory axis k * M
    numer_bhn = cgt.einsum("bhm,bnm->bhn", k_bhm, M_bnm)
    # compute denominator |k| * |M|
    denom_bhn = cgt.broadcast("*",
        cgt.norm(k_bhm, axis=2, keepdims=True), # -> shape bh1
        cgt.norm(M_bnm, axis=2, keepdims=True).transpose([0,2,1]), # -> bn1 -> b1n
        "xx1,x1x"
    )
    csim_bhn = numer_bhn / denom_bhn
    assert infer_shape(csim_bhn) == (opt.b, 2*opt.h, opt.n)

    # scale by beta
    tmp_bhn = cgt.broadcast("*", beta_bh[:,:,None], csim_bhn, "xx1,xxx")
    wc_bhn = sum_normalize2(cgt.exp(tmp_bhn))

    # Interpolation
    g_bh1 = g_bh[:,:,None]
    wg_bhn = cgt.broadcast("*", wprev_bhn, (1 - g_bh1), "xxx,xx1") \
           + cgt.broadcast("*", wc_bhn, g_bh1, "xxx,xx1")

    # Shift
    wtil_bhn = circ_conv_1d(wg_bhn, s_bh3, axis=2)

    # Sharpening
    wfin_bhn = sum_normalize2(
        cgt.broadcast("**", wtil_bhn, gamma_bh.reshape([opt.b,2*opt.h,1]), "xxx,xx1"))

    b,h,n = opt.b, 2*opt.h, opt.n
    assert infer_shape(wtil_bhn) == (b,h,n)
    assert infer_shape(gamma_bh) == (b,h)
    assert infer_shape(gamma_bh[:,:,None]) == (b,h,1)
    return wfin_bhn
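# ntm_address relies on two helpers not shown in this excerpt:
# sum_normalize2 and circ_conv_1d. The sketches below are assumed
# implementations written against the same cgt.broadcast API, not the
# original definitions.

def sum_normalize2(x_bhn):
    # Normalize along the last axis so each (b, h) slice sums to 1.
    return cgt.broadcast("/", x_bhn, cgt.sum(x_bhn, axis=2, keepdims=True), "xxx,xx1")

def circ_conv_1d(w_bhn, s_bh3, axis=2):
    # Circular convolution with a width-3 shift kernel (offsets -1, 0, +1),
    # as in the NTM paper; the offset-to-column mapping here is a guess.
    # Only axis=2 is supported in this sketch.
    assert axis == 2
    rolled = [
        cgt.concatenate([w_bhn[:,:,1:], w_bhn[:,:,:1]], axis=2),   # shift -1
        w_bhn,                                                     # shift 0
        cgt.concatenate([w_bhn[:,:,-1:], w_bhn[:,:,:-1]], axis=2), # shift +1
    ]
    out = cgt.broadcast("*", rolled[0], s_bh3[:,:,0:1], "xxx,xx1")
    for i in (1, 2):
        out = out + cgt.broadcast("*", rolled[i], s_bh3[:,:,i:i+1], "xxx,xx1")
    return out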
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape=(b,h,m))

    # External inputs
    X_bk = cgt.matrix("x", fixed_shape=(b,k))

    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])

    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i"%i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i"%i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+", last_out.dot(W), bias, "xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2: last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:,idx:idx+H*m];    idx += H*m;    k_bHm = k_bHm.reshape([b,H,m])
    beta_bH = last_out[:,idx:idx+H];    idx += H
    g_bH = last_out[:,idx:idx+H];       idx += H
    s_bH3 = last_out[:,idx:idx+3*H];    idx += 3*H;    s_bH3 = s_bH3.reshape([b,H,3])
    gamma_bH = last_out[:,idx:idx+H];   idx += H
    e_bhm = last_out[:,idx:idx+h*m];    idx += h*m;    e_bhm = e_bhm.reshape([b,h,m])
    a_bhm = last_out[:,idx:idx+h*m];    idx += h*m;    a_bhm = a_bhm.reshape([b,h,m])
    y_bp = last_out[:,idx:idx+p];       idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b,H,m)
    assert infer_shape(beta_bH) == (b,H)
    assert infer_shape(g_bH) == (b,H)
    assert infer_shape(s_bH3) == (b,H,3)
    assert infer_shape(gamma_bH) == (b,H)
    assert infer_shape(e_bhm) == (b,h,m)
    assert infer_shape(a_bhm) == (b,h,m)
    assert infer_shape(y_bp) == (b,p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
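# The controller's erase (e_bhm) and add (a_bhm) vectors are consumed by the
# memory update, which this excerpt omits. Below is a minimal sketch of the
# NTM write step, under the assumption that the h write-head weights w_bhn
# have already been sliced out of the 2h addressing weights; ntm_write is a
# hypothetical helper, not original code.

def ntm_write(M_bnm, w_bhn, e_bhm, a_bhm, h):
    # Per head: erase, M <- M * (1 - w^T e), then add, M <- M + w^T a.
    for i in xrange(h):
        w_bn1 = w_bhn[:,i,:][:,:,None]  # (b, n, 1)
        e_b1m = e_bhm[:,i,:][:,None,:]  # (b, 1, m)
        a_b1m = a_bhm[:,i,:][:,None,:]  # (b, 1, m)
        erase_bnm = cgt.broadcast("*", w_bn1, e_b1m, "xx1,x1x")
        add_bnm   = cgt.broadcast("*", w_bn1, a_b1m, "xx1,x1x")
        M_bnm = M_bnm * (1 - erase_bnm) + add_bnm
    return M_bnm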
output = None
inputs = [name2node[name] for name in layer.bottom]
if layer.type == "Data":
    tp = layer.transform_param
    crop_size = tp.crop_size
    chans = len(tp.mean_value)
    dp = layer.data_param
    batch_size = dp.batch_size
    output = [cgt.tensor(dtype=cgt.floatX, ndim=4, name=layer.name,
                  fixed_shape=(batch_size,chans,crop_size,crop_size)),
              cgt.tensor(dtype='i8', ndim=2, name=layer.name,
                  fixed_shape=(batch_size,1))]
elif layer.type == "Convolution":
    X = inputs[0]
    param = layer.convolution_param
    kh,kw = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
        else (param.kernel_h, param.kernel_w)
    nchanin = infer_shape(X)[1] # input channels: axis 1 of the NCHW input
    Wshape = (param.num_output, nchanin, kh, kw)
    Wname = layer.param[0].name or layer.name+":W"
    Wval = np.empty(Wshape, dtype=cgt.floatX)
    W = name2node[Wname] = cgt.shared(Wval, name=Wname, fixed_shape_mask="all")
    bshape = (1, param.num_output, 1, 1)
    bname = layer.param[1].name or layer.name+":b"
    bval = np.empty(bshape, dtype=cgt.floatX)
    b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
    sh,sw = (param.stride, param.stride) if param.HasField("stride")\
        else (param.stride_h, param.stride_w)
    output = [cgt.broadcast("+", nn.conv2d(X, W, subsample=(sh,sw)), b, "xxxx,1x11")]
elif layer.type == "Pooling":
    param = layer.pooling_param
    X = inputs[0]
    pool_type = {param.MAX: "max", param.AVE: "mean"}[param.pool]
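# A hypothetical driver for the per-layer dispatch above, assuming it is
# wrapped as convert_layer(layer, name2node) -> output (a made-up name), and
# that Caffe's compiled protobuf bindings are importable as caffe_pb2; the
# prototxt path is a placeholder. This is a sketch, not the original script.
from google.protobuf import text_format
import caffe_pb2  # compiled from caffe.proto; import path is an assumption

def build_net(prototxt_path):
    net = caffe_pb2.NetParameter()
    with open(prototxt_path) as f:
        text_format.Merge(f.read(), net)
    name2node = {}
    for layer in net.layer:
        output = convert_layer(layer, name2node)  # the dispatch shown above
        # register each produced node under the layer's top blob names
        for topname, topnode in zip(layer.top, output):
            name2node[topname] = topnode
    return name2node

# name2node = build_net("net.prototxt")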