def ntm_address(opt, wprev_bhn, M_bnm, k_bhm, beta_bh, g_bh, s_bh3, gamma_bh):
    # Content addressing

    # Cosine similarity
    # take inner product along memory axis k * M
    numer_bhn = cgt.einsum("bhm,bnm->bhn", k_bhm, M_bnm)
    # compute denominator |k| * |m|
    denom_bhn = cgt.broadcast("*",
        cgt.norm(k_bhm, axis=2, keepdims=True),                       # -> shape bh1
        cgt.norm(M_bnm, axis=2, keepdims=True).transpose([0, 2, 1]),  # -> bn1 -> b1n
        "xx1,x1x"
    )
    csim_bhn = numer_bhn / denom_bhn
    assert infer_shape(csim_bhn) == (opt.b, 2*opt.h, opt.n)
    # scale by beta
    tmp_bhn = cgt.broadcast("*", beta_bh[:, :, None], csim_bhn, "xx1,xxx")
    wc_bhn = sum_normalize2(cgt.exp(tmp_bhn))

    # Interpolation
    g_bh1 = g_bh[:, :, None]
    wg_bhn = cgt.broadcast("*", wprev_bhn, (1 - g_bh1), "xxx,xx1") \
           + cgt.broadcast("*", wc_bhn, g_bh1, "xxx,xx1")

    # Shift
    wtil_bhn = circ_conv_1d(wg_bhn, s_bh3, axis=2)

    # Sharpening
    wfin_bhn = sum_normalize2(cgt.broadcast("**", wtil_bhn, gamma_bh.reshape([opt.b, 2*opt.h, 1]), "xxx,xx1"))

    b, h, n = opt.b, 2*opt.h, opt.n
    assert infer_shape(wtil_bhn) == (b, h, n)
    assert infer_shape(gamma_bh) == (b, h)
    assert infer_shape(gamma_bh[:, :, None]) == (b, h, 1)

    return wfin_bhn
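
# A minimal NumPy sketch of the same addressing pipeline for a single head, to
# spell out what the broadcast patterns above compute. The names and shapes here
# (w_prev, M, k, ...) are illustrative assumptions, not part of the CGT code.
import numpy as np

def ntm_address_np(w_prev, M, k, beta, g, s, gamma):
    # w_prev: (n,) previous weighting, M: (n, m) memory, k: (m,) key,
    # beta, g, gamma: scalars, s: (3,) shift distribution
    csim = M.dot(k) / (np.linalg.norm(M, axis=1) * np.linalg.norm(k))  # cosine similarity
    wc = np.exp(beta * csim); wc /= wc.sum()                           # content weighting
    wg = g * wc + (1 - g) * w_prev                                     # interpolation
    wtil = s[0] * np.roll(wg, 1) + s[1] * wg + s[2] * np.roll(wg, -1)  # circular shift, filter size 3
    wfin = wtil ** gamma                                               # sharpening
    return wfin / wfin.sum()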
def __call__(self, Y, U):
    if Y.ndim > (self.axis + 1):
        Y = Y.reshape(Y.shape[:self.axis] + [cgt.mul_multi(Y.shape[self.axis:])])
    outer_YU = cgt.broadcast('*',
                             Y.dimshuffle(range(Y.ndim) + ['x']),
                             U.dimshuffle([0] + ['x'] * self.axis + [1]),
                             ''.join(['x'] * Y.ndim + ['1', ',', 'x'] + ['1'] * self.axis + ['x']))
    bilinear = cgt.dot(outer_YU.reshape((outer_YU.shape[0], cgt.mul_multi(outer_YU.shape[1:]))),
                       self.M.reshape((self.y_dim, self.y_dim * self.u_dim)).T)
    if self.axis > 1:
        bilinear = bilinear.reshape((-1,) + self.y_shape[:self.axis - 1] + (self.y_dim,))
    linear = cgt.dot(U, self.N.T)
    if self.axis > 1:
        linear = linear.dimshuffle([0] + ['x'] * (self.axis - 1) + [1])
    activation = bilinear + linear
    if self.b is not None:
        activation += cgt.broadcast('+', activation,
                                    self.b.dimshuffle(['x'] * self.axis + [0]),
                                    ''.join(['x'] * activation.ndim + [','] + ['1'] * (activation.ndim - 1) + ['x']))
    activation = activation.reshape((-1,) + self.y_shape)
    return activation
def get_context(self, prev_state_bf):
    state_step_bf = self.states_mlp_bf(prev_state_bf)
    state_step_b1f = cgt.dimshuffle(state_step_bf, [0, 'x', 1])

    # Compute the inner product <phi(s_i), psi(h_u)> where phi and psi are MLPs.
    # The line below computes the pointwise product of phi(s_i) and psi(h_u) and then sums to get the inner product.
    # scalar_energies_vec_bt = cgt.sqrt(cgt.sum(cgt.broadcast('*', state_step_b1f, self.features_post_mlp_btf, 'x1x,xxx'), axis=2))

    # Compute tau = tanh(h_u*W + s_i*V), broadcasting to do all h_u multiplications at once.
    scalar_energies_vec_btf = cgt.tanh(cgt.broadcast('+', self.features_post_mlp_btf, state_step_b1f, 'xxx,x1x'))

    # The next two lines compute w^T*tau with a pointwise product followed by a sum.
    scalar_energies_vec_btf = cgt.broadcast('*', self.mixing_vec_w, scalar_energies_vec_btf, '11x,xxx')
    scalar_energies_vec_bt = cgt.sum(scalar_energies_vec_btf, axis=2)

    # Softmax weights the blended features over the time dimension.
    softmax_weights_bt = nn.softmax(scalar_energies_vec_bt, axis=1)

    # This weight multiplies all features.
    extended_softmax_bt1 = cgt.dimshuffle(softmax_weights_bt, [0, 1, 'x'])
    # Weight the features by their temporally dependent softmax weights.
    pre_blended = cgt.broadcast('*', extended_softmax_bt1, self.features_post_mlp_btf, 'xx1,xxx')
    # Integrate out time.
    blended_features_bf = cgt.sum(pre_blended, axis=1)

    return blended_features_bf
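
# NumPy sketch of the additive attention computed above for a single example:
# energies e_t = w^T tanh(h_t*W + s*V) over the precomputed features, a softmax
# over time, then a weighted sum of the features. Names and shapes here are
# illustrative assumptions.
import numpy as np

def attention_context_np(features_tf, state_f, w_f):
    # features_tf: (T, f) projected h_u features, state_f: (f,) projected state,
    # w_f: (f,) mixing vector
    energies_t = np.tanh(features_tf + state_f).dot(w_f)
    weights_t = np.exp(energies_t - energies_t.max())
    weights_t /= weights_t.sum()
    return weights_t.dot(features_tf)   # (f,) blended context vector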
def make_deep_rrnn(size_input, size_mem, n_layers, size_output, size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer + 1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]

        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True

        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")

        r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
        r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
        r_norm = cgt.norm(r_non, axis=2, keepdims=True)
        r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")

        prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
        inters_in = [prev_h_3]

        colon = slice(None, None, None)

        for i in xrange(2 * k_in):
            inter_in = inters_in[-1]
            r_cur = cgt.subtensor(r, [colon, i, colon])
            r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
            r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
            ref_cur = cgt.batched_matmul(r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
            inter_out = inter_in - 2 * ref_cur
            inters_in.append(inter_out)

        h_in_rot = cgt.reshape(inters_in[-1], (size_batch, size_mem))
        inters_h = [h_in_rot]

        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)

        next_h = inters_h[-1]
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    #print 'len outputs:', len(outputs)
    #print 'len inputs:', len(inputs)

    return nn.Module(inputs, outputs)
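
# The inner loops above apply Householder reflections: for a unit vector r,
# x - 2*r*(r^T x) reflects x and preserves its norm, which is why the r rows are
# normalized before use. A quick NumPy check of that property (illustrative only):
import numpy as np

r = np.random.randn(4); r /= np.linalg.norm(r)
x = np.random.randn(4)
x_ref = x - 2 * r * r.dot(x)
assert np.isclose(np.linalg.norm(x_ref), np.linalg.norm(x))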
def circ_conv_1d(wg_bhn, s_bh3, axis=2):
    "VERY inefficient way to implement circular convolution for the special case of filter size 3"
    assert axis == 2
    n = cgt.size(wg_bhn, 2)
    wback = cgt.concatenate([wg_bhn[:, :, n-1:n], wg_bhn[:, :, :n-1]], axis=2)
    w = wg_bhn
    wfwd = cgt.concatenate([wg_bhn[:, :, 1:n], wg_bhn[:, :, 0:1]], axis=2)
    return cgt.broadcast("*", s_bh3[:, :, 0:1], wback, "xx1,xxx") \
         + cgt.broadcast("*", s_bh3[:, :, 1:2], w,     "xx1,xxx") \
         + cgt.broadcast("*", s_bh3[:, :, 2:3], wfwd,  "xx1,xxx")
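
# NumPy equivalent of the filter-size-3 circular convolution above, acting on a
# single (b, h) slice; a sanity sketch, not part of the original module.
import numpy as np

def circ_conv_1d_np(w, s):
    # w: (n,) weighting, s: (3,) shift weights for [back, stay, forward]
    return s[0] * np.roll(w, 1) + s[1] * w + s[2] * np.roll(w, -1)

# e.g. putting all mass on s[0] rotates the weighting by one slot:
assert (circ_conv_1d_np(np.array([0., 1., 0., 0.]), np.array([1., 0., 0.]))
        == np.array([0., 0., 1., 0.])).all()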
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape=(b, h, m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape=(b, k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i" % i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i" % i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+", last_out.dot(W), bias, "xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2:
            last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:, idx:idx+H*m];    idx += H*m;    k_bHm = k_bHm.reshape([b, H, m])
    beta_bH = last_out[:, idx:idx+H];    idx += H
    g_bH = last_out[:, idx:idx+H];       idx += H
    s_bH3 = last_out[:, idx:idx+3*H];    idx += 3*H;    s_bH3 = s_bH3.reshape([b, H, 3])
    gamma_bH = last_out[:, idx:idx+H];   idx += H
    e_bhm = last_out[:, idx:idx+h*m];    idx += h*m;    e_bhm = e_bhm.reshape([b, h, m])
    a_bhm = last_out[:, idx:idx+h*m];    idx += h*m;    a_bhm = a_bhm.reshape([b, h, m])
    y_bp = last_out[:, idx:idx+p];       idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b, H, m)
    assert infer_shape(beta_bH) == (b, H)
    assert infer_shape(g_bH) == (b, H)
    assert infer_shape(s_bH3) == (b, H, 3)
    assert infer_shape(gamma_bH) == (b, H)
    assert infer_shape(e_bhm) == (b, h, m)
    assert infer_shape(a_bhm) == (b, h, m)
    assert infer_shape(y_bp) == (b, p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
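
# Illustrative arithmetic for how the controller output is carved up by the
# slicing above, using assumed sizes h=4, m=10, p=6 (so H = 2*h = 8); the widths
# are listed in slicing order [key, beta, g, shift, gamma, erase, add, output].
h, m, p = 4, 10, 6
H = 2 * h
widths = [H * m, H, H, 3 * H, H, h * m, h * m, p]
assert sum(widths) == H * m + H + H + H * 3 + H + h * m + h * m + p  # == out_size (214 here)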
def __call__(self, x):
    """
    x is the input
    Returns the output to feed as the input into the next layer.
    """
    return cgt.broadcast("+", x.dot(self.W), self.b, "xx,1x")
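
# What the "xx,1x" pattern above encodes (as I read cgt's broadcast strings): the
# left operand is a full (batch, out) matrix, while the right operand's first
# axis is a singleton that gets broadcast across the batch. NumPy analogue with
# assumed shapes:
import numpy as np
x = np.random.randn(5, 3)   # batch of 5 inputs
W = np.random.randn(3, 4)
b = np.random.randn(1, 4)   # one bias row, broadcast over the batch
out = x.dot(W) + b          # the same computation the broadcast call describes
assert out.shape == (5, 4)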
def get_context_backup(self, prev_state_bf):
    state_step_bf = cgt.sigmoid(self.states_mlp_bf(prev_state_bf))
    product_list = []
    for time_step in range(0, 3):
        inner_product = cgt.sum(state_step_bf * self.features_post_mlp_btf[:, time_step, :], axis=1)
        product_list.append(inner_product)
    st = cgt.stack(product_list)
    st = cgt.dimshuffle(st, [1, 0])
    softmax_weights = softmax(st)

    sum = None
    for time_step in range(0, 3):
        softmax_t_step = cgt.dimshuffle(softmax_weights[:, time_step], [0, 'x'])
        if sum is None:
            sum = cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')
        else:
            sum += cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')
    return sum
def make_deep_rrnn_rot_relu(size_input, size_mem, n_layers, size_output, size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer + 1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]

        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True

        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")

        add_in_lin = nn.Affine(size_x, size_mem)(x)
        add_in_relu = nn.rectify(add_in_lin)

        prev_h_scaled = nn.scale_mag(prev_h)

        h_in_added = prev_h_scaled + add_in_relu
        inters_h = [h_in_added]

        colon = slice(None, None, None)

        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = xform_h[i, :]
            #r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)

        next_h = inters_h[-1]
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    #print 'len outputs:', len(outputs)
    #print 'len inputs:', len(inputs)

    return nn.Module(inputs, outputs)
def ntm_write(M_bnm, w_bhn, e_bhm, a_bhm):
    if False:
        # Here's the version that's faithful to the paper
        # weighted erases:            bhn1                 bh1m
        # ideally we wouldn't create this big 4-tensor but this operation
        # requires a more general kind of contraction than is provided by einsum
        we_bhmn = cgt.broadcast("*", w_bhn[:, :, :, None], e_bhm[:, :, None, :], "xxx1,xx1x")
        # take product of erasing factors
        mult_bmn = (1 - we_bhmn).prod(axis=1)
        M_bnm = M_bnm * mult_bmn  # Equation 3 http://arxiv.org/pdf/1410.5401v2.pdf
    else:
        # This version just does a regular contraction
        erase_bnm = cgt.einsum("bhn,bhm->bnm", w_bhn, e_bhm)
        M_bnm = M_bnm * (1 - erase_bnm)

    # Now do the same thing with adds,
    # but this is just a regular contraction since we are adding rather than taking a product
    add_bnm = cgt.einsum("bhn,bhm->bnm", w_bhn, a_bhm)
    M_bnm = M_bnm + add_bnm

    return M_bnm
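
# NumPy sketch mirroring the contraction branch above (the `else` case), with
# assumed shapes b=batch, h=heads, n=memory slots, m=memory width.
import numpy as np

def ntm_write_np(M, w, e, a):
    # M: (b, n, m) memory, w: (b, h, n) write weights,
    # e: (b, h, m) erase vectors, a: (b, h, m) add vectors
    erase = np.einsum("bhn,bhm->bnm", w, e)
    add = np.einsum("bhn,bhm->bnm", w, a)
    return M * (1 - erase) + add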
def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
    # Compute W_{hr} h_{t-1}, W_{hu} h_{t-1}, and W_{hc} h_{t-1}
    hid_input = cgt.dot(hid_previous, W_hid_stacked)

    # Compute W_{xr} x_t + b_r, W_{xu} x_t + b_u, and W_{xc} x_t + b_c
    input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

    # Reset and update gates
    resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
    updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
    resetgate = self.nonlinearity_resetgate(resetgate)
    updategate = self.nonlinearity_updategate(updategate)

    # Compute W_{xc} x_t + r_t \odot (W_{hc} h_{t-1})
    hidden_update_in = slice_w(input_n, 2)
    hidden_update_hid = slice_w(hid_input, 2)
    hidden_update = hidden_update_in + resetgate*hidden_update_hid

    # Compute (1 - u_t) h_{t-1} + u_t c_t
    hid = (1 - updategate)*hid_previous + updategate*hidden_update
    return self.nonlinearity_hid(hid)  # adding this nonlinearity seems to help stability
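
# NumPy reference of the update computed by the step above, with sigmoid gates
# assumed and the stacked weights split into separate matrices for readability
# (these names are illustrative, not the module's own). Note that the step
# applies its final nonlinearity to the mixed state; the textbook GRU instead
# applies tanh to the candidate only.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_step_np(x, h_prev, Wxr, Wxu, Wxc, Whr, Whu, Whc, br, bu, bc):
    r = sigmoid(x.dot(Wxr) + br + h_prev.dot(Whr))   # reset gate
    u = sigmoid(x.dot(Wxu) + bu + h_prev.dot(Whu))   # update gate
    c = x.dot(Wxc) + bc + r * h_prev.dot(Whc)        # candidate (pre-nonlinearity)
    return np.tanh((1 - u) * h_prev + u * c)         # final tanh, as in the step above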
def __init__(self, input, n_in, n_out, W=None, b=None, activation=cgt.tanh, prefix=""):
    self.n_in = n_in
    self.n_out = n_out

    if W is None:
        # XXX replace with nn init
        W_values = np.asarray(
            rng.uniform(
                low=-np.sqrt(6. / (n_in + n_out)),
                high=np.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=cgt.floatX
        )
        if activation == cgt.sigmoid:
            W_values *= 4
        W = cgt.shared(W_values, name=prefix+"_W")

    if b is None:
        b_values = np.zeros((n_out,), dtype=cgt.floatX)
        b = cgt.shared(b_values, name=prefix+"_b")

    self.W = W
    self.b = b

    # XXX broadcast api may change
    lin_output = cgt.broadcast("+", cgt.dot(input, self.W),
                               cgt.dimshuffle(self.b, ["x", 0]), "xx,1x")

    self.output = (
        lin_output if activation is None
        else activation(lin_output)
    )
    # parameters of the model
    self.params = [self.W, self.b]
def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
    input_n = cgt.broadcast("+", cgt.dot(input_n, W_in_stacked), b_stacked, "xx,1x")

    # Calculate gates pre-activations and slice
    gates = input_n + cgt.dot(hid_previous, W_hid_stacked)

    # Extract the pre-activation gate values
    ingate = slice_w(gates, 0)
    forgetgate = slice_w(gates, 1)
    cell_input = slice_w(gates, 2)
    outgate = slice_w(gates, 3)

    # Apply nonlinearities
    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)
    outgate = self.nonlinearity_outgate(outgate)

    # Compute new cell value
    cell = forgetgate*cell_previous + ingate*cell_input

    # Compute new hidden unit activation
    hid = outgate*self.nonlinearity(cell)
    return [cell, hid]
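
# NumPy reference of the LSTM update above with concrete nonlinearities (sigmoid
# gates, tanh cell); the stacked-weight layout and names are assumptions made for
# readability, with slices ordered [in, forget, cell, out] as in slice_w.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step_np(x, c_prev, h_prev, Wx, Wh, b):
    # Wx: (d_in, 4*d), Wh: (d, 4*d), b: (4*d,)
    gates = x.dot(Wx) + h_prev.dot(Wh) + b
    d = c_prev.shape[-1]
    i, f, g, o = (gates[..., j*d:(j+1)*d] for j in range(4))
    c = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(g)
    h = sigmoid(o) * np.tanh(c)
    return c, h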
def __call__(self, x):
    tmp = conv2d(x, self.weight, self.kernelshape, self.pad, self.stride)
    return cgt.broadcast("+", tmp, self.bias, "xxxx,1x11")
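
# The "xxxx,1x11" pattern above: only the channel axis of the bias is full; its
# batch and spatial axes are singletons, so the bias is broadcast across batch,
# height, and width. NumPy analogue with assumed shapes:
import numpy as np
feat = np.random.randn(8, 16, 32, 32)   # (batch, channels, height, width)
bias = np.random.randn(1, 16, 1, 1)
out = feat + bias
assert out.shape == feat.shape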
def __call__(self, x):
    return cgt.broadcast("+", x.dot(self.weight), self.bias, "xx,1x")
def softmax(x, axis=1):
    # x = cgt.broadcast("-", x, x.max(axis=1, keepdims=True), "xx,x1")
    out = cgt.exp(x)
    out = cgt.broadcast("/", out, out.sum(axis=axis, keepdims=True), "xx,x1")
    return out
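
# The commented-out max subtraction above is the usual numerical-stability trick:
# exp overflows for large logits unless the row max is subtracted first (the
# output is unchanged). A NumPy sketch of the stabilized version:
import numpy as np

def softmax_np(x, axis=1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

print softmax_np(np.array([[1000.0, 1001.0]]))   # ~[[0.269, 0.731]], no overflow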
import cgt  # not in the original excerpt, but needed for cgt.matrix / cgt.broadcast below
from cgt import nn, utils
import numpy as np, numpy.random as nr
from numpy.linalg import norm
from param_collection import ParamCollection

k_in = 1
size_x = 3
size_mem = 4
size_batch = 4

x = cgt.matrix(fixed_shape=(size_batch, size_x))
prev_h = cgt.matrix(fixed_shape=(size_batch, size_mem))

r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
r_norm = cgt.norm(r_non, axis=2, keepdims=True)
r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")

prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
inters = [prev_h_3]

for i in xrange(k_in * 2):
    inter_in = inters[-1]
    r_cur = r[:, i, :]
    r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
    r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
    ref_cur = cgt.batched_matmul(r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
    inter_out = inter_in - ref_cur
    inters.append(inter_out)

h = inters[-1]
r_nn = nn.Module([x], [h])
def broadcast(x, a, b, bcpat):
    return cgt.broadcast(x, a, b, bcpat)
# Excerpt from a Caffe-to-CGT layer converter: the tail of the "Convolution"
# branch, followed by the "Pooling" and "InnerProduct" branches.
    X = inputs[0]
    param = layer.convolution_param
    kh, kw = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
        else (param.kernel_h, param.kernel_w)
    nchanin = infer_shape(X)[0]
    Wshape = (param.num_output, nchanin, kh, kw)
    Wname = layer.param[0].name or layer.name + ":W"
    Wval = np.empty(Wshape, dtype=cgt.floatX)
    W = name2node[Wname] = cgt.shared(Wval, name=Wname, fixed_shape_mask="all")
    bshape = (1, param.num_output, 1, 1)
    bname = layer.param[1].name or layer.name + ":b"
    bval = np.empty(bshape, dtype=cgt.floatX)
    b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
    sh, sw = (param.stride, param.stride) if param.HasField("stride")\
        else (param.stride_h, param.stride_w)
    output = [cgt.broadcast("+", nn.conv2d(X, W, subsample=(sh, sw)), b, "xxxx,1x11")]
elif layer.type == "Pooling":
    param = layer.pooling_param
    X = inputs[0]
    pool_type = {param.MAX: "max", param.AVE: "mean"}[param.pool]
    height_in, width_in = infer_shape(X)[2:4]
    kernel = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
        else (param.kernel_h, param.kernel_w)
    stride = (param.stride, param.stride) if param.HasField("stride")\
        else (param.stride_h, param.stride_w)
    pad = (param.pad, param.pad) if param.HasField("pad")\
        else (param.pad_h, param.pad_w)
    output = [nn.pool(pool_type, X, stride, kernel, pad)]
elif layer.type == "InnerProduct":
    X = inputs[0]
    if X.ndim == 4:
def normalize(var):
    return cgt.broadcast("/", var, cgt.sum(var, axis=2, keepdims=True), "xxx,xx1")
def sum_normalize2(x):
    return cgt.broadcast("/", x, x.sum(axis=2, keepdims=True), "xxx,xx1")
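
# NumPy picture of sum_normalize2: divide each (b, h) row by its sum along the
# last axis so it becomes a distribution (assuming non-negative entries).
import numpy as np
x = np.random.rand(2, 3, 5)
x_norm = x / x.sum(axis=2, keepdims=True)
assert np.allclose(x_norm.sum(axis=2), 1.0)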
def broadcast(opname, x, y, bcpat):
    return cgt.broadcast(opname, x, y, bcpat) if isinstance(x, core.Node) else eval("x %s y" % opname)
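
# Usage sketch for the wrapper above: symbolic cgt nodes go through cgt.broadcast,
# while plain arrays fall through to the eval branch and ordinary operators. This
# assumes `core` is cgt.core, as imported in the original module.
import numpy as np
from cgt import core
a = np.ones((2, 3))
b = np.arange(3).reshape(1, 3)
print broadcast("*", a, b, "xx,1x")   # not a core.Node, so plain numpy broadcasting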