def divergence(self, x, y):
    """Discrete divergence of the dual field (x, y).

    Each component is padded with one leading zero along its difference
    axis (so the backward difference at the border is taken against
    zero), then passed through the fixed difference convolutions.

    Args:
        x: first dual component; NCHW layout assumed -- TODO confirm.
        y: second dual component, same shape as ``x``.

    Returns:
        Tensor: ``grad_x + grad_y``, the divergence map.
    """
    # NOTE: the original sliced x[:, :, :, :] / y[:, :, :, :] before
    # padding; those slices are no-ops and have been dropped.
    tx = layers.pad(x, (0, 0, 0, 0, 0, 0, 1, 0))  # one zero column on the left (dim 3)
    ty = layers.pad(y, (0, 0, 0, 0, 1, 0, 0, 0))  # one zero row on the top (dim 2)
    grad_x = self.conv4px(tx)
    grad_y = self.conv4py(ty)
    return grad_x + grad_y
def forward_grad(self, x):
    """Forward-difference gradients of x with zero flux at the far border.

    ``grad_x`` differentiates along the width axis (dim 3, padded with one
    trailing column) and ``grad_y`` along the height axis (dim 2, padded
    with one trailing row). The final column / row of each is zeroed so
    the forward difference does not reach past the image border,
    matching the TV-L1 reference implementation.

    Args:
        x: input tensor; NCHW layout assumed -- TODO confirm.

    Returns:
        tuple: ``(grad_x, grad_y)``.
    """
    # Width-direction forward difference: pad one trailing column (dim 3).
    grad_x = self.conv4u(layers.pad(x, (0, 0, 0, 0, 0, 0, 0, 1)))
    # BUGFIX: the last *column* (axis=3) must be zeroed for grad_x. The
    # original unstacked axis=2 here, zeroing the last row instead —
    # inconsistent with the dim-3 padding above and with the sibling
    # image-gradient code (grad2_x uses axis=3).
    cols = layers.unstack(grad_x, axis=3)
    cols[-1] = cols[-1] - cols[-1]  # zero the border without detaching the graph
    grad_x = layers.stack(cols, axis=3)

    # Height-direction forward difference: pad one trailing row (dim 2).
    grad_y = self.conv4v(layers.pad(x, (0, 0, 0, 0, 0, 1, 0, 0)))
    rows = layers.unstack(grad_y, axis=2)
    rows[-1] = rows[-1] - rows[-1]  # zero the last row
    grad_y = layers.stack(rows, axis=2)
    return grad_x, grad_y
def epoch_predict(env, args, model, loader):
    """Run the parser over ``loader`` and collect per-sentence predictions.

    Returns:
        tuple: (arcs, rels, probs) — per-sentence lists of head indices,
        relation labels (mapped through ``env.REL.vocab``), and, when
        ``args.prob`` is set, rounded arc probabilities.
    """
    model.eval()
    arcs, rels, probs = [], [], []
    for words, feats in loader():
        # Exclude the first token of every sentence: shift the ids left by
        # one and re-pad the front with pad_index before building the mask.
        shifted = layers.pad(words[:, 1:],
                             paddings=[0, 0, 1, 0],
                             pad_value=args.pad_index)
        mask = shifted != args.pad_index
        # Per-sentence valid lengths, converted once and reused for every split.
        seq_lens = nn.reduce_sum(mask, -1).numpy().tolist()
        s_arc, s_rel = model(words, feats)
        arc_preds, rel_preds = decode(args, s_arc, s_rel, mask)
        arcs.extend(layers.split(nn.masked_select(arc_preds, mask), seq_lens))
        rels.extend(layers.split(nn.masked_select(rel_preds, mask), seq_lens))
        if args.prob:
            # Probability of the predicted head for each token.
            arc_probs = nn.index_sample(layers.softmax(s_arc, -1),
                                        layers.unsqueeze(arc_preds, -1))
            flat_probs = nn.masked_select(
                layers.squeeze(arc_probs, axes=[-1]), mask)
            probs.extend(layers.split(flat_probs, seq_lens))
    arcs = [seq.numpy().tolist() for seq in arcs]
    rels = [env.REL.vocab[seq.numpy().tolist()] for seq in rels]
    probs = [[round(p, 3) for p in seq.numpy().tolist()] for seq in probs]
    return arcs, rels, probs
def epoch_evaluate(args, model, loader, puncts):
    """Evaluate in one epoch.

    Runs the parser over ``loader`` in eval mode, accumulating the mean
    loss and an attachment ``Metric``. ``puncts`` is a tensor of
    punctuation word ids used to exclude punctuation tokens from scoring
    when ``args.punct`` is false.

    Returns:
        tuple: (mean loss over batches, Metric with accumulated scores).
    """
    model.eval()
    total_loss, metric = 0, Metric()
    for words, feats, arcs, rels in loader():
        # ignore the first token of each sentence: shift the word ids left
        # by one and re-pad the front with pad_index so position 0 is masked.
        tmp_words = layers.pad(words[:, 1:],
                               paddings=[0, 0, 1, 0],
                               pad_value=args.pad_index)
        mask = tmp_words != args.pad_index
        s_arc, s_rel = model(words, feats)
        loss = loss_function(s_arc, s_rel, arcs, rels, mask)
        arc_preds, rel_preds = decode(args, s_arc, s_rel, mask)
        # ignore all punctuation if not specified: broadcast-compare every
        # word id against every punctuation id; a token survives only if it
        # differs from ALL of them (reduce_all over the last dim).
        if not args.punct:
            punct_mask = layers.reduce_all(
                layers.expand(layers.unsqueeze(words, -1),
                              (1, 1, puncts.shape[0])) != layers.expand(
                                  layers.reshape(puncts, (1, 1, -1)),
                                  (*words.shape, 1)),
                dim=-1)
            mask = layers.logical_and(mask, punct_mask)
        metric(arc_preds, rel_preds, arcs, rels, mask)
        total_loss += loss.numpy().item()
    total_loss /= len(loader)
    return total_loss, metric
def epoch_train(args, model, optimizer, loader, epoch):
    """Run one training epoch and return the mean per-batch loss."""
    model.train()
    total_loss = 0
    for batch, (words, feats, arcs, rels) in enumerate(loader(), start=1):
        model.clear_gradients()
        # Exclude the first token of every sentence: shift the ids left by
        # one and re-pad the front with pad_index before building the mask.
        shifted_words = layers.pad(words[:, 1:],
                                   paddings=[0, 0, 1, 0],
                                   pad_value=args.pad_index)
        mask = shifted_words != args.pad_index
        s_arc, s_rel = model(words, feats)
        loss = loss_function(s_arc, s_rel, arcs, rels, mask)
        if not args.use_data_parallel:
            loss.backward()
        else:
            # Scale the loss, backprop, then all-reduce grads across ranks.
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        optimizer.minimize(loss)
        total_loss += loss.numpy().item()
        logging.info(
            f"epoch: {epoch}, batch: {batch}/{math.ceil(len(loader) / args.nranks)}, batch_size: {len(words)}, loss: {loss.numpy().item():.4f}"
        )
    total_loss /= len(loader)
    return total_loss
def pad(self, input_ele):
    """Right-pad each tensor along dim 0 to the batch max length and stack.

    Args:
        input_ele: list of tensors whose dim-0 lengths may differ; the
            remaining dimensions must match -- TODO confirm with callers.

    Returns:
        Tensor: stacked batch of shape ``[len(input_ele), max_len, ...]``.
    """
    # Idiomatic direct iteration instead of range(len(...)) indexing.
    max_len = max(t.shape[0] for t in input_ele)
    padded = [
        layers.pad(t, [0, max_len - t.shape[0], 0, 0], pad_value=0.0)
        for t in input_ele
    ]
    return layers.stack(padded)
def prepare_encoder_decoder(src_word,
                            src_pos,
                            src_vocab_size,
                            src_emb_dim,
                            src_max_len,
                            dropout_rate=0.,
                            word_emb_param_name=None,
                            training=True,
                            pos_enc_param_name=None,
                            is_src=True,
                            params_type="normal"):
    """Add word embeddings and position encodings.

    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].

    This module is used at the bottom of the encoder stacks.

    Args:
        src_word: token id tensor.
        src_pos: position id tensor.
        src_vocab_size / src_emb_dim / src_max_len: embedding table sizes.
        dropout_rate: applied to the summed embedding when non-zero.
        word_emb_param_name / pos_enc_param_name: suffixes appended to the
            params_type-dependent prefix to name the parameters.
        training: decoder-side training flag (see pad below).
        is_src: True for encoder input, False for decoder input.
        params_type: one of "normal", "fixed", "new"; selects the
            parameter-name prefix so different model variants do not share
            weights.
    """
    assert params_type == "fixed" or params_type == "normal" or params_type == "new"
    # Parameter-name prefix per variant; these exact strings must match the
    # names used when the checkpoints were saved.
    pre_name = "densedense"
    if params_type == "fixed":
        pre_name = "fixed_densefixed_dense"
    elif params_type == "new":
        pre_name = "new_densenew_dense"
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=DenseModelHyperParams.bos_idx,  # set embedding of bos to 0
        param_attr=fluid.ParamAttr(name=pre_name + word_emb_param_name,
                                   initializer=fluid.initializer.Normal(
                                       0., src_emb_dim**-0.5)))  #, is_sparse=True)
    if not is_src and training:
        # NOTE(review): pads one extra timestep at the front of the target
        # embeddings during training — presumably the matching right-shift /
        # trim happens in the caller; confirm before changing.
        src_word_emb = layers.pad(src_word_emb, [0, 0, 1, 0, 0, 0])
    # Standard transformer embedding scaling by sqrt(d_model).
    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
    src_pos_enc = layers.embedding(src_pos,
                                   size=[src_max_len, src_emb_dim],
                                   param_attr=fluid.ParamAttr(
                                       trainable=False,
                                       name=pre_name + pos_enc_param_name))
    # Position encodings are fixed; never backprop into them.
    src_pos_enc.stop_gradient = True
    enc_input = src_word_emb + src_pos_enc
    return layers.dropout(enc_input,
                          dropout_prob=dropout_rate,
                          seed=DenseModelHyperParams.dropout_seed,
                          is_test=False,
                          dropout_implementation='upscale_in_train'
                          ) if dropout_rate else enc_input
def forward(self, x):
    """TV-L1 optical-flow layer over consecutive feature frames.

    Reduces channels with ``self.bottleneck``, normalizes, pairs each
    frame with its successor, then runs ``self.n_iter`` iterations of the
    TV-L1 primal-dual scheme (flow field u1/u2, dual variables p11..p22)
    and returns the two flow channels re-expanded by ``self.unbottleneck``.

    Assumes x is (batch*time, C, H, W) with batch size ``self.batch_size``
    -- TODO confirm against the caller.
    """
    x = self.bottleneck(x)
    inp = self.norm_img(x)
    bt, c, w, h = inp.shape
    # Recover the time axis so frame t can be paired with frame t+1.
    inp = layers.reshape(inp, shape=[self.batch_size, -1, c, w, h])
    x = inp[:, :-1]  # frames 0..T-2
    y = inp[:, 1:]   # frames 1..T-1
    x = layers.reshape(layers.transpose(x, perm=[0, 2, 1, 3, 4]),
                       shape=[-1, c, h, w])
    y = layers.reshape(layers.transpose(y, perm=[0, 2, 1, 3, 4]),
                       shape=[-1, c, h, w])
    # Flow components, initialised to zero.
    u1 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    u2 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    l_t = self.lamda * self.theta
    taut = self.tau / (self.theta + 1e-12)
    # Central-difference image gradients of y; borders replaced by one-sided
    # differences of x (as in the TV-L1 reference implementation).
    grad2_x = self.conv4Ix(layers.pad(y, (0, 0, 0, 0, 0, 0, 1, 1)))
    tmp = layers.unstack(grad2_x, axis=3)
    tmp[-1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])
    tmp[0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
    grad2_x = layers.stack(tmp, axis=3)
    grad2_y = self.conv4Iy(layers.pad(y, (0, 0, 0, 0, 1, 1, 0, 0)))
    tmp = layers.unstack(grad2_y, axis=2)
    tmp[-1] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])
    tmp[0] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
    grad2_y = layers.stack(tmp, axis=2)
    # Dual variables, initialised to zero.
    p11 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    p12 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    p21 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    p22 = fluid.dygraph.to_variable(np.zeros(x.shape)).astype('float32')
    gsqx = grad2_x**2
    gsqy = grad2_y**2
    grad = gsqx + gsqy + 1e-12  # |∇I|² with epsilon to avoid divide-by-zero
    # Constant part of the brightness-constancy residual rho.
    rho_c = y - grad2_x * u1 - grad2_y * u2 - x
    for i in range(self.n_iter):
        rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12
        # Thresholding step: three mutually exclusive cases selected by
        # float masks (detached so the case selection carries no gradient).
        mask1 = (rho < -l_t * grad).detach().astype('float32')
        mask1.stop_gradient = True
        tmp1 = l_t * grad2_x
        tmp2 = l_t * grad2_y
        v1 = tmp1 * mask1
        v2 = tmp2 * mask1
        mask2 = (rho > l_t * grad).detach().astype('float32')
        mask2.stop_gradient = True
        v1 = -tmp1 * mask2 + v1
        v2 = -tmp2 * mask2 + v2
        # Remaining case: |rho| <= l_t * grad (complement of mask1 OR mask2).
        mask3 = fluid.layers.ones(
            x.shape,
            dtype='float32') - (mask1 + mask2 - mask1 * mask2)
        mask3.stop_gradient = True
        tmp1 = (-rho / grad) * grad2_x
        tmp2 = (-rho / grad) * grad2_y
        v1 = tmp1 * mask3 + v1
        v2 = tmp2 * mask3 + v2
        # Free intermediates early to limit peak memory in the loop.
        del rho
        del mask1
        del mask2
        del mask3
        v1 += u1
        v2 += u2
        # Primal update: v plus the divergence of the dual field.
        u1 = v1 + self.theta * self.divergence(p11, p12)
        u2 = v2 + self.theta * self.divergence(p21, p22)
        del v1
        del v2
        u1 = u1  # no-op; left from a removed detach/assignment
        u2 = u2  # no-op; left from a removed detach/assignment
        u1x, u1y = self.forward_grad(u1)
        u2x, u2y = self.forward_grad(u2)
        # Dual update with reprojection onto the unit ball.
        p11 = (p11 + taut * u1x) / (
            1. + taut * layers.sqrt(u1x**2 + u1y**2 + 1e-12))
        p12 = (p12 + taut * u1y) / (
            1. + taut * layers.sqrt(u1x**2 + u1y**2 + 1e-12))
        p21 = (p21 + taut * u2x) / (
            1. + taut * layers.sqrt(u2x**2 + u2y**2 + 1e-12))
        p22 = (p22 + taut * u2y) / (
            1. + taut * layers.sqrt(u2x**2 + u2y**2 + 1e-12))
        del u1x
        del u1y
        del u2x
        del u2y
    flow = layers.concat([u1, u2], axis=1)
    # flow = layers.transpose(layers.reshape(flow,shape=[b,t,c*2,h,w]),perm=[0,2,1,3,4])
    flow = self.unbottleneck(flow)
    flow = self.bn(flow) if self.bn else flow
    return flow
def forward(self, x, cls=None):
    """Flow-of-flow action-recognition forward pass.

    Runs a 2D CNN backbone per frame, computes representation flow twice
    (flow, then flow-of-flow) with a residual connection, and mean-pools
    per-frame class scores over time. Returns logits, or (logits, acc)
    when ``cls`` labels are given.
    """
    # x is BxTxCxHxW; note this differs from the 2p1d network's input format.
    # spatio-temporal video data
    b, t, c, h, w = x.shape
    # need to view it as B*TxCxHxW for the 2D CNN
    # important to keep batch and time axes next to each other,
    # so a simple view without transposing is possible
    # NOTE(review): original author flagged this as questionable because the
    # dataloader mixes classes within a batch while representation flow is
    # computed along each clip's time axis — confirm batching assumptions.
    x = reshape(x, shape=[b * t, c, h, w])
    x = self.conv1(x)
    x = self.maxpool(x)
    x = self.layer1(x)
    x = self.layer2(x)
    # Insert the FCF (flow) block here.
    # res = x  # F.avg_pool2d(x, (3, 1), 1, 0)  x[:,:,1:-1].contiguous(); F is torch.nn.functional
    res = x  # saved for the residual connection after the flow branch
    x = self.flow_cmp(x)
    x = self.flow_layer.norm_img(x)
    # compute flow for 0,1,...,T-1
    # and 1,2,...,T
    b_t, c, h, w = x.shape
    x = reshape(x, shape=[b, -1, c, h, w])  # split B*T back so T can be sliced
    # Choose ONE of the next two lines depending on whether x = x + res is used:
    # pad one frame along T, or drop a frame (t -= 1).
    x = pad(x, paddings=[0, 0, 0, 1, 0, 0, 0, 0, 0, 0])
    # t -= 1  # representation flow loses one frame
    u, v = self.flow_layer(reshape(x[:, :-1], shape=[-1, c, h, w]),
                           reshape(x[:, 1:], shape=[-1, c, h, w]))
    x = concat([u, v], axis=1)
    x = self.flow_conv(x)
    # Flow-of-flow
    x = self.flow_cmp2(x)
    x = self.flow_layer.norm_img(x)
    # compute flow for 0,1,...,T-1
    # and 1,2,...,T
    b_t, c, h, w = x.shape
    x = reshape(x, shape=[b, -1, c, h, w])
    # Same pad-vs-drop choice as above.
    x = pad(x, paddings=[0, 0, 0, 1, 0, 0, 0, 0, 0, 0])
    # t -= 1  # representation flow loses one frame
    u, v = self.flow_layer2(reshape(x[:, :-1], shape=[-1, c, h, w]),
                            reshape(x[:, 1:], shape=[-1, c, h, w]))
    x = concat([u, v], axis=1)
    x = self.flow_conv2(x)
    x = self.bnf(x)
    x = x + res  # residual connection around the two flow blocks
    x = leaky_relu(x)
    # x = self.layer3(x)
    x = self.layer4(x)
    # print(x.size())
    x = self.avgpool(x)
    x = reshape(x, shape=[x.shape[0], -1])
    x = self.dropout(x)
    # currently making dense, per-frame predictions
    x = self.fc(x)
    # so view as BxTxClass
    x = reshape(x, shape=[b, t, -1])
    # mean-pool over time (collapse the temporal dimension)
    x = reduce_mean(x, dim=1)
    # return BxClass prediction
    if cls is not None:
        acc = float(accuracy(input=x, label=cls))
        return x, acc
    else:
        return x
def get_grad_w(self, w, b, grad):
    """Compute dL/dWeight for this conv layer via an equivalent 1x1 conv.

    Gathers every receptive-field patch of the padded input into a
    [N, in_C, out_H*out_W, kH, kW] tensor, then expresses the
    sum-over-(N, positions) reduction as a 1x1 convolution.

    Args:
        w: conv weight, shape [out_C, in_C, kH, kW].
        b: conv bias — unused here; kept for a uniform gradient API.
        grad: upstream gradient dL/dy; expected shape
            [N, out_C, out_H, out_W] -- TODO confirm.

    Returns:
        Tensor: dL/dWeight with shape [out_C, in_C, kH, kW].
    """
    conv_in = self.x    # cached forward input
    conv_out = self.y   # cached forward output
    N, C, H, W = conv_in.shape
    N, out_C, out_H, out_W = conv_out.shape
    # w [out_C, in_C, kH, kW]
    out_C, in_C, kH, kW = w.shape
    stride = self.stride
    padding = self.padding
    pad_H = H + padding * 2
    pad_W = W + padding * 2
    # Partial derivative of the loss with respect to w.
    conv_in = paddle.to_tensor(conv_in)
    pad_x = L.pad(
        conv_in,
        paddings=[0, 0, 0, 0, padding, padding, padding, padding],
        pad_value=0.0)  # [N, in_C, pad_H, pad_W]
    pad_x = L.transpose(pad_x, [2, 3, 0, 1])  # [pad_H, pad_W, N, in_C]
    if self.special_inds_dw is None:
        # Runs only once: lazy initialisation of the gather indices.
        self.special_inds_dw = []
        # The kernel slides only along the H and W directions.
        for i in range(out_H):  # i is the row (y) coordinate
            for j in range(out_W):  # j is the column (x) coordinate
                ori_x = j * stride  # kernel x origin in pad_x; arithmetic sequence, step = stride
                ori_y = i * stride  # kernel y origin in pad_x; arithmetic sequence, step = stride
                for i2 in range(kH):  # i2 is the row offset inside the kernel
                    for j2 in range(kW):  # j2 is the column offset inside the kernel
                        point_x = ori_x + j2
                        point_y = ori_y + i2
                        self.special_inds_dw.append([point_y, point_x])
    # self.special_inds_dw.shape == [out_H*out_W*kH*kW, 2]
    special_inds_dw = paddle.to_tensor(self.special_inds_dw)
    special_inds_dw = L.cast(special_inds_dw, 'int32')
    special_inds_dw.stop_gradient = True
    x_in = L.gather_nd(pad_x, special_inds_dw)  # [out_H*out_W*kH*kW, N, in_C]
    x_in = L.reshape(x_in, (out_H, out_W, kH, kW, N, in_C))
    x_in = L.transpose(
        x_in, [4, 5, 0, 1, 2, 3])  # [N, in_C, out_H, out_W, kH, kW]
    x_in = L.reshape(
        x_in,
        (N, in_C, out_H * out_W, kH, kW))  # [N, in_C, out_H*out_W, kH, kW]
    x_in = L.unsqueeze(x_in, 1)  # [N, 1, in_C, out_H*out_W, kH, kW]
    grad_r = L.reshape(grad, (N, out_C, 1, out_H * out_W, 1,
                              1))  # [N, out_C, 1, out_H*out_W, 1, 1]
    # Direct elementwise alternative (kept for reference):
    # dw = x_in * grad_r  # [N, out_C, in_C, out_H*out_W, kH, kW]
    # dL_dWeight = L.reduce_sum(dw, dim=[0, 3])  # [out_C, in_C, kH, kW]
    # Per trick "13" in https://github.com/miemie2013/Pure_Python_Deep_Learning
    # (1x1conv.py), the reduction can be converted into a 1x1 convolution:
    # treat x_in as the conv input image — batch size in_C, channels N*out_H*out_W;
    # treat grad_r as the kernels — out_C kernels, each with in_C = N*out_H*out_W.
    x_in = L.transpose(
        x_in, [2, 1, 0, 3, 4, 5])  # [in_C, 1, N, out_H*out_W, kH, kW]
    x_in = L.reshape(
        x_in,
        (in_C, N * out_H * out_W, kH, kW))  # [in_C, N*out_H*out_W, kH, kW]
    grad_r = L.transpose(
        grad_r, [1, 2, 0, 3, 4, 5])  # [out_C, 1, N, out_H*out_W, 1, 1]
    grad_r = L.reshape(
        grad_r,
        (out_C, N * out_H * out_W, 1, 1))  # [out_C, N*out_H*out_W, 1, 1]
    dw = F.conv2d(x_in, grad_r, None)  # [in_C, out_C, kH, kW]
    dL_dWeight = L.transpose(dw, [1, 0, 2, 3])  # [out_C, in_C, kH, kW]
    return dL_dWeight
def build(self,
          boxNum=64,
          learning_rate=0.001,
          beta1=0.9,
          beta2=0.999,
          epsilon=1e-08,
          regularization=None,
          lazy_mode=False):
    """Build the tiny-YOLOv3-style detection graph and its optimizer.

    Declares the static-graph inputs, a backbone of alternating DBL conv
    blocks and max-pools, two YOLO heads (13x13 with anchors 3-5 and
    26x26 with anchors 0-2), the combined loss, an Adam optimizer, and
    the executor. Stores the head outputs, loss, and executor on self.

    Args:
        boxNum: maximum number of ground-truth boxes per image.
        learning_rate, beta1, beta2, epsilon, regularization, lazy_mode:
            passed straight to ``fluid.optimizer.AdamOptimizer``.
    """
    dataInput = pfl.data(name='data_input',
                         shape=[3, 416, 416],
                         dtype='float32')
    gtbox = pfl.data(name='data_gtbox', shape=[boxNum, 4], dtype='float32')
    gtlabel = pfl.data(name='data_gtlabel', shape=[boxNum], dtype='int32')
    # Six anchor (w, h) pairs shared by both detection heads.
    anchors = [10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319]
    layer0_output = _DBL(input=dataInput,
                         num_filters=16,
                         filter_size=3,
                         name='layer0')
    layer1_output = pfl.pool2d(input=layer0_output,
                               pool_size=2,
                               pool_type='max',
                               pool_stride=2,
                               name='layer1_max')
    layer2_output = _DBL(input=layer1_output,
                         num_filters=32,
                         filter_size=3,
                         name='layer2')
    layer3_output = pfl.pool2d(input=layer2_output,
                               pool_size=2,
                               pool_type='max',
                               pool_stride=2,
                               name='layer3_max')
    layer4_output = _DBL(input=layer3_output,
                         num_filters=64,
                         filter_size=3,
                         name='layer4')
    layer5_output = pfl.pool2d(input=layer4_output,
                               pool_size=2,
                               pool_type='max',
                               pool_stride=2,
                               name='layer5_max')
    layer6_output = _DBL(input=layer5_output,
                         num_filters=128,
                         filter_size=3,
                         name='layer6')
    layer7_output = pfl.pool2d(input=layer6_output,
                               pool_size=2,
                               pool_type='max',
                               pool_stride=2,
                               name='layer7_max')
    layer8_output = _DBL(input=layer7_output,
                         num_filters=256,
                         filter_size=3,
                         name='layer8')
    layer9_output = pfl.pool2d(input=layer8_output,
                               pool_size=2,
                               pool_type='max',
                               pool_stride=2,
                               name='layer9_max')
    layer10_output = _DBL(input=layer9_output,
                          num_filters=512,
                          filter_size=3,
                          name='layer10')
    # Stride-1 max-pool with asymmetric bottom/right padding keeps the
    # spatial resolution at 13x13.
    layer11_output = pfl.pool2d(input=pfl.pad(
        layer10_output, paddings=[0, 0, 0, 0, 0, 1, 0, 1]),
                                pool_size=2,
                                pool_type='max',
                                pool_stride=1,
                                name='layer11_max')
    layer12_output = _DBL(input=layer11_output,
                          num_filters=1024,
                          filter_size=3,
                          name='layer12')
    layer13_output = _DBL(input=layer12_output,
                          num_filters=256,
                          filter_size=1,
                          padding=0,
                          name='layer13')
    layer14_output = _DBL(input=layer13_output,
                          num_filters=512,
                          filter_size=3,
                          name='layer14')
    layer15_output = pfl.conv2d(input=layer14_output,
                                num_filters=18,
                                filter_size=1,
                                name='layer15_conv')
    # layer16_yolo -> -1 x 18 x 13 x 13
    yolo1_loss = pfl.yolov3_loss(name='yolo1_loss',
                                 x=layer15_output,
                                 gtbox=gtbox,
                                 gtlabel=gtlabel,
                                 anchors=anchors,
                                 anchor_mask=[3, 4, 5],
                                 class_num=1,
                                 ignore_thresh=0.5,
                                 downsample_ratio=32)
    # layer17_route_13
    layer18_output = _DBL(input=layer13_output,
                          num_filters=128,
                          filter_size=1,
                          padding=0,
                          name='layer18')
    # 2x nearest-neighbour upsample via expand.
    layer19_output = pfl.expand(layer18_output,
                                expand_times=[1, 1, 2, 2],
                                name='layer19_upsample')
    # layer20_route_19_8: concat the upsampled features with the layer8 skip.
    layer20_output = pfl.concat([layer19_output, layer8_output],
                                axis=1,
                                name='layer20_concat')
    layer21_output = _DBL(layer20_output,
                          num_filters=256,
                          filter_size=3,
                          name='layer21')
    layer22_output = pfl.conv2d(input=layer21_output,
                                num_filters=18,
                                filter_size=1,
                                name='layer22_conv')
    # layer23_yolo -> -1 x 18 x 26 x 26
    yolo2_loss = pfl.yolov3_loss(name='yolo2_loss',
                                 x=layer22_output,
                                 gtbox=gtbox,
                                 gtlabel=gtlabel,
                                 anchors=anchors,
                                 anchor_mask=[0, 1, 2],
                                 class_num=1,
                                 ignore_thresh=0.5,
                                 downsample_ratio=16)
    loss = pfl.reduce_mean(pfl.elementwise_add(yolo1_loss, yolo2_loss),
                           name="loss_output")
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=learning_rate,
        beta1=beta1,
        beta2=beta2,
        epsilon=epsilon,
        regularization=regularization,
        lazy_mode=lazy_mode)
    optimizer.minimize(loss)
    self._netOutput1, self._netOutput2 = layer15_output, layer22_output
    self._loss = loss
    self._trainExe = fluid.Executor(
        fluid.CUDAPlace(0)) if self._USE_CUDA else fluid.Executor(
            fluid.CPUPlace())