def test_uniform_initializer(self, dtype="float32"):
    """Check that a Uniform initializer can be applied directly in dygraph mode.

    A zero tensor is created, initialized by calling a
    ``paddle.nn.initializer.Uniform`` instance on it, and the histogram
    returned by ``output_hist`` is compared against its reference
    probabilities.

    Args:
        dtype: Kept for signature compatibility; the test body does not
            read it.
    """
    paddle.disable_static()
    try:
        tensor = paddle.zeros([1024, 1024, 16])
        tensor.stop_gradient = False
        self.assertTrue(
            np.allclose(np.zeros((1024, 1024, 16)), tensor.numpy()))

        uniform_ = paddle.nn.initializer.Uniform()
        uniform_(tensor)

        # Applying the initializer must not change stop_gradient.
        self.assertEqual(tensor.stop_gradient, False)

        hist, prob = output_hist(tensor.numpy())
        self.assertTrue(
            np.allclose(hist, prob, rtol=0, atol=1e-3),
            "hist: " + str(hist))
    finally:
        # Re-enable static mode even when an assertion above fails, so the
        # mode switch does not leak into whatever runs after this test.
        paddle.enable_static()
def inv_transform(self, prob_map):
    """Map a probability map predicted on the object ROI back to full size.

    When no ROI is stored the map is returned unchanged. Otherwise the map
    is bilinearly resized to the ROI extent and, if a previous map is
    cached, pasted into a zero tensor with the cached map's shape. In every
    case the returned map is cached (as a NumPy array) in
    ``self._prev_probs``.

    Args:
        prob_map: Probability tensor; its first dimension must be 1 when an
            ROI is set.

    Returns:
        The restored probability tensor.
    """
    roi = self._object_roi
    if roi is None:
        self._prev_probs = prob_map.numpy()
        return prob_map

    assert prob_map.shape[0] == 1

    row_lo, row_hi, col_lo, col_hi = roi
    roi_size = (row_hi - row_lo + 1, col_hi - col_lo + 1)
    resized = paddle.nn.functional.interpolate(
        prob_map, size=roi_size, mode='bilinear', align_corners=True)

    if self._prev_probs is None:
        restored = resized
    else:
        # Everything outside the ROI stays zero.
        restored = paddle.zeros(
            shape=self._prev_probs.shape, dtype=resized.dtype)
        restored[:, :, row_lo:row_hi + 1, col_lo:col_hi + 1] = resized

    self._prev_probs = restored.numpy()
    return restored
def sample_bbox(matches,
                match_labels,
                gt_classes,
                batch_size_per_im,
                fg_fraction,
                num_classes,
                use_random=True,
                is_cascade=False):
    """Assign a class label to every candidate box and sample a subset.

    Args:
        matches: Index of the ground-truth entry matched to each candidate.
        match_labels: Per-candidate match flag; ``0`` is relabelled to
            ``num_classes`` and ``-1`` is relabelled to ``-1``.
        gt_classes: Ground-truth class ids; may be empty.
        batch_size_per_im: Number of boxes to sample (converted with
            ``int``).
        fg_fraction: Forwarded to ``subsample_labels``.
        num_classes: Label value used for the background class.
        use_random: Forwarded to ``subsample_labels``.
        is_cascade: When true, no sampling is done and all indices are
            returned.

    Returns:
        Tuple ``(indices, classes)``. In cascade mode ``classes`` holds the
        labels of all candidates; otherwise it holds the labels gathered at
        the sampled ``indices``.
    """
    if gt_classes.shape[0] == 0:
        # No ground truth at all: every candidate becomes background.
        labels = paddle.ones(matches.shape, dtype='int32') * num_classes
    else:
        labels = paddle.gather(gt_classes, matches)
        background = paddle.ones_like(labels) * num_classes
        labels = paddle.where(match_labels == 0, background, labels)
        ignored = paddle.ones_like(labels) * -1
        labels = paddle.where(match_labels == -1, ignored, labels)

    if is_cascade:
        return paddle.arange(matches.shape[0]), labels

    fg_inds, bg_inds = subsample_labels(labels, int(batch_size_per_im),
                                        fg_fraction, num_classes, use_random)

    nothing_sampled = fg_inds.shape[0] == 0 and bg_inds.shape[0] == 0
    if nothing_sampled:
        # Fake single-index output for the case where no box is foreground
        # or background.
        picked = paddle.zeros([1], dtype='int32')
    else:
        picked = paddle.concat([fg_inds, bg_inds])

    return picked, paddle.gather(labels, picked)
def __call__(self, bbox_head_out, rois, im_shape, scale_factor):
    """Decode box predictions and clip them to the original image extent.

    Args:
        bbox_head_out: Pair ``(bbox_pred, cls_prob)``.
        rois: Pair ``(roi, rois_num)``; ``rois_num[i]`` is the number of
            RoIs of image ``i``.
        im_shape: Per-image shape tensor, divided by ``scale_factor`` to get
            the original shape.
        scale_factor: Per-image scale tensor; the first entry of each row
            is used to rescale that image's RoIs.

    Returns:
        Tuple ``((bbox, rois_num), cls_prob)``.
    """
    bbox_pred, cls_prob = bbox_head_out
    roi, rois_num = rois
    origin_shape = im_shape / scale_factor

    # Repeat each image's scale and original shape once per RoI.
    per_image_scales = []
    per_image_shapes = []
    for img_idx in range(self.batch_size):
        img_scale = scale_factor[img_idx, :][0]
        num_rois = rois_num[img_idx]
        per_image_scales.append(paddle.expand(img_scale, [num_rois, 1]))
        per_image_shapes.append(
            paddle.expand(origin_shape[img_idx, :], [num_rois, 2]))
    scale = paddle.concat(per_image_scales)
    origin_shape = paddle.concat(per_image_shapes)

    decoded = ops.box_coder(
        prior_box=roi / scale,
        prior_box_var=self.prior_box_var,
        target_box=bbox_pred,
        code_type=self.code_type,
        box_normalized=self.box_normalized,
        axis=self.axis)

    # TODO: Update box_clip
    origin_h = paddle.unsqueeze(origin_shape[:, 0] - 1, axis=1)
    origin_w = paddle.unsqueeze(origin_shape[:, 1] - 1, axis=1)
    zeros = paddle.zeros(origin_h.shape, 'float32')

    # Coordinates are ordered x1, y1, x2, y2: x is bounded by the width,
    # y by the height, and everything is floored at zero.
    upper_bounds = (origin_w, origin_h, origin_w, origin_h)
    clipped = [
        paddle.maximum(paddle.minimum(decoded[:, :, coord], bound), zeros)
        for coord, bound in enumerate(upper_bounds)
    ]
    bbox = paddle.stack(clipped, axis=-1)

    return (bbox, rois_num), cls_prob
def forward(self, input_ids, token_type_ids=None, position_ids=None):
    """Build the summed word, position and token-type embeddings.

    Args:
        input_ids: Token id tensor; its second dimension is read as the
            sequence length.
        token_type_ids: Optional token-type ids; defaults to int64 zeros
            shaped like ``input_ids``.
        position_ids: Optional position ids. When omitted, the first
            ``self.cls_num`` positions get id 0 and the remaining positions
            are numbered from 1 upwards.

    Returns:
        The embeddings after layer normalisation and dropout.
    """
    if position_ids is None:
        # The original also computed a cumulative sum over a ones tensor
        # here, but the result was never read, so it is not built any more.
        content_len = paddle.shape(input_ids)[1] - self.cls_num
        position_ids = paddle.concat([
            paddle.zeros(shape=[self.cls_num], dtype="int64"),
            paddle.linspace(1, content_len, content_len, dtype="int64")
        ])
        position_ids.stop_gradient = True

    if token_type_ids is None:
        token_type_ids = paddle.zeros_like(input_ids, dtype="int64")

    input_embeddings = self.word_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)
    token_type_embeddings = self.token_type_embeddings(token_type_ids)

    embeddings = (
        input_embeddings + token_type_embeddings + position_embeddings)
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings
def forward(self, x):
    """Run the bn/conv stack and add a zero-padded shortcut.

    The shortcut is ``self.downsample(x)`` when a downsample layer is set,
    otherwise ``x`` itself. When the residual branch has more channels than
    the shortcut, the shortcut is extended with zero channels before the
    addition.

    Args:
        x: Input feature map, indexed as (batch, channel, height, width) by
            the shape reads below.

    Returns:
        The residual branch output with the shortcut added.
    """
    out = self.bn1(x)
    out = self.conv1(out)
    out = self.bn2(out)
    out = self.relu(out)
    out = self.conv2(out)
    out = self.bn3(out)
    out = self.relu(out)
    out = self.conv3(out)
    out = self.bn4(out)

    if self.downsample is not None:
        shortcut = self.downsample(x)
        featuremap_size = shortcut.shape[2:4]
    else:
        shortcut = x
        featuremap_size = out.shape[2:4]

    batch_size = out.shape[0]
    residual_channel = out.shape[1]
    shortcut_channel = shortcut.shape[1]

    if residual_channel != shortcut_channel:
        # Create the padding in the shortcut's dtype: paddle.concat needs
        # matching dtypes, and the default dtype of paddle.zeros would not
        # match non-float32 activations.
        padding = paddle.zeros(
            [
                batch_size, residual_channel - shortcut_channel,
                featuremap_size[0], featuremap_size[1]
            ],
            dtype=shortcut.dtype)
        out += paddle.concat([shortcut, padding], 1)
    else:
        out += shortcut
    return out
def forward(self, voxel_features, coords, batch_size):
    """Scatter voxel features onto one dense canvas per batch sample.

    Args:
        voxel_features: Feature rows, one per voxel.
        coords: Voxel coordinates; column 0 is the batch index, and columns
            2 and 3 are combined as ``coords[:, 2] * nx + coords[:, 3]`` to
            form the flat canvas index.
        batch_size: Number of samples in the batch.

    Returns:
        Tensor reshaped to ``(batch_size, nchannels, ny, nx)``.
    """
    canvases = []
    flat_shape = (self.nchannels, self.nx * self.ny)
    for sample_idx in range(batch_size):
        canvas = paddle.zeros(flat_shape, dtype=voxel_features.dtype)
        in_sample = coords[:, 0] == sample_idx

        # A sample without any voxel keeps its all-zero canvas.
        if bool(in_sample.any().numpy()[0]):
            sample_coords = mask_select(coords, in_sample)
            flat_index = sample_coords[:, 2] * self.nx + sample_coords[:, 3]
            flat_index = flat_index.astype("int64")
            sample_voxels = mask_select(voxel_features, in_sample).t()
            canvas = select_change(canvas, sample_voxels, flat_index)

        canvases.append(canvas)

    stacked = paddle.stack(canvases, 0)
    return stacked.reshape((batch_size, self.nchannels, self.ny, self.nx))
def make_grid(tensor, nrow=8, normalize=False, range=None, scale_each=False):
    """Make a grid of images.

    Args:
        tensor (Tensor or list): 4D mini-batch Tensor of shape (B x C x H x W)
            or a list of images all of the same size.
        nrow (int, optional): Number of images displayed in each row of the
            grid. The final grid size is ``(B / nrow, nrow)``.
            Default: ``8``.
        normalize (bool, optional): If True, shift the image to the range
            (0, 1), by the min and max values specified by :attr:`range`.
            Default: ``False``.
        range (tuple, optional): tuple (min, max) where min and max are
            numbers, then these numbers are used to normalize the image. By
            default, min and max are computed from the tensor.
        scale_each (bool, optional): If ``True``, scale each image in the
            batch of images separately rather than the (min, max) over all
            images. Default: ``False``.

    Returns:
        Tensor: the single image (batch axis squeezed) when the batch holds
        one image, otherwise a ``(C, H * rows, W * cols)`` canvas.

    Raises:
        TypeError: If ``tensor`` is neither a Tensor nor a list of Tensors.
    """
    # Each list element must itself be a Tensor. The original check called
    # isinstance(tensor, t), which raised for every list input.
    is_tensor_list = isinstance(tensor, list) and all(
        isinstance(t, paddle.Tensor) for t in tensor)
    if not (isinstance(tensor, paddle.Tensor) or is_tensor_list):
        raise TypeError('tensor or list of tensors expected, got {}'.format(
            type(tensor)))

    # if list of tensors, convert to a 4D mini-batch Tensor
    if isinstance(tensor, list):
        tensor = paddle.stack(tensor, 0)

    if tensor.dim() == 2:  # single image H x W
        tensor = tensor.unsqueeze(0)
    if tensor.dim() == 3:  # single image
        if tensor.shape[0] == 1:  # if single-channel, convert to 3-channel
            tensor = paddle.concat([tensor, tensor, tensor], 0)
        tensor = tensor.unsqueeze(0)

    if tensor.dim() == 4 and tensor.shape[1] == 1:  # single-channel images
        tensor = paddle.concat([tensor, tensor, tensor], 1)

    if normalize is True:
        if range is not None:
            assert isinstance(range, tuple), \
                "range has to be a tuple (min, max) if specified. min and max are numbers"

        def _normalize(img, low, high):
            # Returns a new tensor; the caller's data is never modified.
            img = img.clip(min=low, max=high)
            return (img - low) / (high - low + 1e-5)

        def _normalize_range(img):
            if range is not None:
                return _normalize(img, range[0], range[1])
            return _normalize(img, float(img.min()), float(img.max()))

        if scale_each is True:
            # Rebuild the batch from the normalised images rather than
            # writing through per-image slices, which only works when a
            # slice aliases the batch tensor.
            tensor = paddle.stack([_normalize_range(img) for img in tensor],
                                  0)
        else:
            tensor = _normalize_range(tensor)

    if tensor.shape[0] == 1:
        return tensor.squeeze(0)

    # make the mini-batch of images into a grid
    nmaps = tensor.shape[0]
    xmaps = min(nrow, nmaps)
    ymaps = int(math.ceil(float(nmaps) / xmaps))
    height, width = int(tensor.shape[2]), int(tensor.shape[3])
    num_channels = tensor.shape[1]
    canvas = paddle.zeros(
        (num_channels, height * ymaps, width * xmaps), dtype=tensor.dtype)

    # The builtin `range` is shadowed by the parameter of the same name, so
    # the loops use `irange`, which is defined outside this function.
    k = 0
    for y in irange(ymaps):
        for x in irange(xmaps):
            if k >= nmaps:
                break
            canvas[:, y * height:(y + 1) * height, x * width:(x + 1) *
                   width] = tensor[k]
            k = k + 1
    return canvas
def __init__(self,
             img_size=224,
             patch_size=4,
             in_chans=3,
             num_classes=1000,
             embed_dim=96,
             depths=[2, 2, 6, 2],
             num_heads=[3, 6, 12, 24],
             window_size=4,
             mlp_ratio=4.,
             qkv_bias=True,
             qk_scale=None,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.1,
             norm_layer=nn.LayerNorm,
             ape=False,
             patch_norm=True,
             use_checkpoint=False,
             **kwargs):
    """Build the patch embedding, the stage layers and the classifier head.

    Args:
        img_size: Forwarded to ``PatchEmbed``.
        patch_size: Forwarded to ``PatchEmbed``.
        in_chans: Forwarded to ``PatchEmbed``.
        num_classes: Output size of the head; a value ``<= 0`` replaces the
            head with ``Identity``.
        embed_dim: Channel width of the first stage; doubled at each
            following stage.
        depths: Number of blocks per stage. The list default is only read,
            never mutated.
        num_heads: Attention heads per stage. Only read, never mutated.
        window_size: Forwarded to every ``BasicLayer``.
        mlp_ratio: Forwarded to every ``BasicLayer``.
        qkv_bias: Forwarded to every ``BasicLayer``.
        qk_scale: Forwarded to every ``BasicLayer``.
        drop_rate: Dropout probability after the embedding and inside the
            layers.
        attn_drop_rate: Forwarded to every ``BasicLayer``.
        drop_path_rate: Upper end of the linearly increasing drop-path
            schedule.
        norm_layer: Normalisation layer class.
        ape: Whether to create an absolute position embedding parameter.
        patch_norm: Whether ``PatchEmbed`` receives ``norm_layer``.
        use_checkpoint: Forwarded to every ``BasicLayer``.
        **kwargs: Accepted and ignored.
    """
    super().__init__()

    self.num_classes = num_classes
    self.num_layers = len(depths)
    self.embed_dim = embed_dim
    self.ape = ape
    self.patch_norm = patch_norm
    self.num_features = int(embed_dim * 2**(self.num_layers - 1))
    self.mlp_ratio = mlp_ratio

    # split image into non-overlapping patches
    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
        norm_layer=norm_layer if self.patch_norm else None)
    num_patches = self.patch_embed.num_patches
    patches_resolution = self.patch_embed.patches_resolution
    self.patches_resolution = patches_resolution

    # absolute position embedding
    if self.ape:
        attr = ParamAttr(initializer=nn.initializer.Constant(0))
        self.absolute_pos_embed = self.create_parameter(
            shape=(1, num_patches, embed_dim), attr=attr)
        paddle.assign(
            trunc_norm_(self.absolute_pos_embed.shape, std=0.02),
            self.absolute_pos_embed)

    self.pos_drop = nn.Dropout(p=drop_rate)

    # stochastic depth decay rule: one rate per block, rising linearly
    dpr = list(np.linspace(0, drop_path_rate, sum(depths)))

    # build layers
    self.layers = nn.LayerList()
    for i_layer in range(self.num_layers):
        layer = BasicLayer(
            dim=int(embed_dim * 2**i_layer),
            input_resolution=(patches_resolution[0] // (2**i_layer),
                              patches_resolution[1] // (2**i_layer)),
            depth=depths[i_layer],
            num_heads=num_heads[i_layer],
            window_size=window_size,
            mlp_ratio=self.mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
            norm_layer=norm_layer,
            # every stage except the last one ends with a patch merging
            downsample=PatchMerging
            if (i_layer < self.num_layers - 1) else None,
            use_checkpoint=use_checkpoint)
        self.layers.append(layer)

    self.norm = norm_layer(self.num_features)
    self.avgpool = nn.AdaptiveAvgPool1D(1)
    self.head = nn.Linear(
        self.num_features,
        num_classes) if num_classes > 0 else Identity()

    # Re-initialise LayerNorm to (weight=1, bias=0) and Linear to
    # (truncated-normal weight, zero bias).
    for m in self.sublayers():
        if isinstance(m, nn.LayerNorm):
            paddle.assign(paddle.zeros(m.bias.shape), m.bias)
            paddle.assign(paddle.ones(m.weight.shape), m.weight)
        if isinstance(m, nn.Linear):
            try:
                paddle.assign(
                    trunc_norm_(m.weight.shape, std=0.02), m.weight)
            except Exception:
                # Best effort: report the shape that could not be
                # initialised and continue. A bare ``except:`` would also
                # swallow KeyboardInterrupt and SystemExit.
                print(m.weight.shape)
            if m.bias is not None:
                paddle.assign(paddle.zeros(m.bias.shape), m.bias)
def __call__(self,
             seg_preds,
             seg_masks,
             cate_labels,
             cate_scores,
             sum_masks=None):
    """Apply matrix NMS: decay scores by same-class mask overlap.

    Args:
        seg_preds: Per-candidate predictions, gathered with the same
            indices as the scores and returned.
        seg_masks: Per-candidate masks; flattened from axis 1 onwards and
            used to compute pairwise intersections.
        cate_labels: Per-candidate class labels.
        cate_scores: Per-candidate scores.
        sum_masks: Optional per-candidate mask sums. When omitted they are
            computed from ``seg_masks``.

    Returns:
        Tuple ``(seg_preds, cate_scores, cate_labels)`` after score decay,
        thresholding and the final top-k sort.

    Raises:
        NotImplementedError: If ``self.kernel`` is neither ``'gaussian'``
            nor ``'linear'``.
    """
    # sort and keep top nms_pre
    sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n)
    seg_masks = paddle.gather(seg_masks, index=sort_inds)
    seg_preds = paddle.gather(seg_preds, index=sort_inds)
    if sum_masks is not None:
        sum_masks = paddle.gather(sum_masks, index=sort_inds)
    cate_scores = paddle.gather(cate_scores, index=sort_inds)
    cate_labels = paddle.gather(cate_labels, index=sort_inds)

    seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)
    if sum_masks is None:
        # The signature allows sum_masks to be omitted, so derive the
        # per-mask sums from the already sorted, flattened masks.
        sum_masks = paddle.sum(seg_masks, axis=1)

    # inter.
    inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0]))
    n_samples = paddle.shape(cate_labels)
    # union.
    sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])
    # iou.
    iou_matrix = (inter_matrix / (
        sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix))
    iou_matrix = paddle.triu(iou_matrix, diagonal=1)
    # label_specific matrix.
    cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples])
    label_matrix = paddle.cast(
        (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])),
        'float32')
    label_matrix = paddle.triu(label_matrix, diagonal=1)

    # IoU compensation
    compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
    compensate_iou = paddle.expand(
        compensate_iou, shape=[n_samples, n_samples])
    compensate_iou = paddle.transpose(compensate_iou, [1, 0])

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # matrix nms
    if self.kernel == 'gaussian':
        decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))
        compensate_matrix = paddle.exp(-1 * self.sigma *
                                       (compensate_iou**2))
        decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
                                       axis=0)
    elif self.kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = paddle.min(decay_matrix, axis=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores = cate_scores * decay_coefficient
    y = paddle.zeros(shape=paddle.shape(cate_scores), dtype='float32')
    keep = paddle.where(cate_scores >= self.update_threshold, cate_scores,
                        y)
    keep = paddle.nonzero(keep)
    keep = paddle.squeeze(keep, axis=[1])
    # Always append the last index as a fake entry so that `keep` is never
    # empty.
    keep = paddle.concat(
        [keep, paddle.cast(paddle.shape(cate_scores)[0] - 1, 'int64')])

    seg_preds = paddle.gather(seg_preds, index=keep)
    cate_scores = paddle.gather(cate_scores, index=keep)
    cate_labels = paddle.gather(cate_labels, index=keep)

    # sort and keep top_k
    sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)
    seg_preds = paddle.gather(seg_preds, index=sort_inds)
    cate_scores = paddle.gather(cate_scores, index=sort_inds)
    cate_labels = paddle.gather(cate_labels, index=sort_inds)
    return seg_preds, cate_scores, cate_labels
googLeNet_part3 = self.googLeNet_part3(googLeNet_part2) googLeNet_part3 = paddle.nn.functional.dropout(googLeNet_part3, p=0.6) out_final_2d = paddle.reshape(googLeNet_part3, [-1, googLeNet_part3.shape[1]]) out_final_2d = paddle.reshape(out_final_2d, [-1, self.seg_num, out_final_2d.shape[1]]) out_final_2d = paddle.mean(out_final_2d, axis=1) out_final = paddle.concat(x=[out_final_2d,out_final_3d], axis=1) out_final = self.out(out_final) out_final = paddle.nn.functional.softmax(out_final) if label is not None: acc = paddle.metric.accuracy(input=out_final, label=label) return out_final, acc else: return out_final if __name__ == '__main__': network = GoogLeNet() img = paddle.zeros([1, 12, 3, 224, 224]) outs = network(img) print(outs.shape)
def func_test_memory_reserved(self, device=None):
    """Compare ``memory_reserved(device)`` with the size of one allocation.

    The check only runs on builds compiled with CUDA.

    Args:
        device: Forwarded to ``memory_reserved``.
    """
    if not core.is_compiled_with_cuda():
        return

    # The tensor is bound to a local so it is still referenced when the
    # reserved size is read.
    held = paddle.zeros(shape=[256])

    # 256 float32 values at 4 bytes each.
    expected_bytes = 4 * 256
    reserved_bytes = memory_reserved(device)
    self.assertEqual(reserved_bytes, expected_bytes)
def get_tensor(self, device="cpu"):
    """Record the requested device name and return a 5x5 float32 zero tensor.

    Args:
        device: Device name; stored lower-cased in ``self.device``.

    Returns:
        A ``[5, 5]`` float32 tensor of zeros.
    """
    self.device = device.lower()
    # NOTE(review): the device name is only stored; it is not used to place
    # the tensor. Confirm whether explicit placement was intended.
    return paddle.zeros([5, 5], dtype="float32")
def forward(self, mol_batch, x_tree_vecs):
    """Tree decoding in training.

    Args:
        mol_batch(list): mol objects in a batch.
        x_tree_vecs(tensor): tree latent representation.

    Returns:
        dict: with the keys
            ``pred_loss``: label prediction loss divided by the batch size.
            ``stop_loss``: topological prediction loss divided by the batch
            size.
            ``pred_acc``: label prediction accuracy as a Python float.
            ``stop_acc``: topological prediction accuracy as a Python float.
    """
    pred_hiddens, pred_contexts, pred_targets = [], [], []
    stop_hiddens, stop_contexts, stop_targets = [], [], []
    traces = []
    # Collect one traversal trace per tree (filled in by `dfs`) and reset
    # the neighbor lists, which are rebuilt while the traces are replayed.
    for mol_tree in mol_batch:
        s = []
        dfs(s, mol_tree.nodes[0], -1)
        traces.append(s)
        for node in mol_tree.nodes:
            node.neighbors = []

    # The first node of every tree is predicted from an all-zero hidden
    # state.
    batch_size = len(mol_batch)
    pred_hiddens.append(paddle.zeros([len(mol_batch), self.hidden_size]))
    pred_targets.extend([mol_tree.nodes[0].wid for mol_tree in mol_batch])
    pred_contexts.append(paddle.to_tensor(list(range(batch_size))))

    max_iter = max([len(tr) for tr in traces])
    # Zero vector used to pad every neighbor list to MAX_NB entries.
    padding = paddle.zeros([self.hidden_size])
    padding.stop_gradient = False
    # h maps a directed edge (source idx, target idx) to its message.
    h = {}

    for t in range(max_iter):
        # Gather step t of every trace that is still running.
        prop_list = []
        batch_list = []
        for i, plist in enumerate(traces):
            if t < len(plist):
                prop_list.append(plist[t])
                batch_list.append(i)

        cur_x = []
        cur_h_nei, cur_o_nei = [], []

        for node_x, real_y, _ in prop_list:
            # Messages into node_x from all neighbors except the edge's
            # target (input to the GRU).
            cur_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors if node_y.idx != real_y.idx]
            pad_len = MAX_NB - len(cur_nei)
            cur_h_nei.extend(cur_nei)
            cur_h_nei.extend([padding] * pad_len)

            # Messages into node_x from all neighbors (input to the stop
            # prediction).
            cur_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors]
            pad_len = MAX_NB - len(cur_nei)
            cur_o_nei.extend(cur_nei)
            cur_o_nei.extend([padding] * pad_len)

            cur_x.append(node_x.wid)

        cur_x = paddle.to_tensor(cur_x)
        cur_x = self.embedding(cur_x)

        cur_h_nei = paddle.reshape(paddle.stack(cur_h_nei, axis=0), shape=[-1, MAX_NB, self.hidden_size])
        new_h = GRU(cur_x, cur_h_nei, self.W_z, self.W_r, self.U_r, self.W_h)

        cur_o_nei = paddle.reshape(paddle.stack(cur_o_nei, axis=0), shape=[-1, MAX_NB, self.hidden_size])
        cur_o = paddle.sum(cur_o_nei, axis=1)

        pred_target, pred_list = [], []
        stop_target = []
        for i, m in enumerate(prop_list):
            node_x, node_y, direction = m
            x, y = node_x.idx, node_y.idx
            h[(x, y)] = new_h[i]
            node_y.neighbors.append(node_x)
            # Only steps with direction == 1 contribute a label
            # prediction target.
            if direction == 1:
                pred_target.append(node_y.wid)
                pred_list.append(i)
            stop_target.append(direction)

        cur_batch = paddle.to_tensor((batch_list))
        stop_hidden = paddle.concat([cur_x, cur_o], axis=1)
        stop_hiddens.append(stop_hidden)
        stop_contexts.append(cur_batch)
        stop_targets.extend(stop_target)

        if len(pred_list) > 0:
            batch_list = [batch_list[i] for i in pred_list]
            cur_batch = paddle.to_tensor(batch_list)
            pred_contexts.append(cur_batch)

            cur_pred = paddle.to_tensor(pred_list)
            pred_hiddens.append(paddle.index_select(axis=0, index=cur_pred, x=new_h))
            pred_targets.extend(pred_target)

    # Final stop prediction at the first node of every tree; its target is
    # 0.
    cur_x, cur_o_nei = [], []
    for mol_tree in mol_batch:
        node_x = mol_tree.nodes[0]
        cur_x.append(node_x.wid)
        cur_nei = [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors]
        pad_len = MAX_NB - len(cur_nei)
        cur_o_nei.extend(cur_nei)
        cur_o_nei.extend([padding] * pad_len)

    cur_x = paddle.to_tensor(cur_x)
    cur_x = self.embedding(cur_x)
    cur_o_nei = paddle.reshape(paddle.stack(cur_o_nei, axis=0), shape=[-1, MAX_NB, self.hidden_size])
    cur_o = paddle.sum(cur_o_nei, axis=1)

    stop_hidden = paddle.concat([cur_x, cur_o], axis=1)
    stop_hiddens.append(stop_hidden)
    stop_contexts.append(paddle.to_tensor(list(range(batch_size))))
    stop_targets.extend([0] * len(mol_batch))

    # Label prediction loss and accuracy.
    pred_contexts = paddle.concat(pred_contexts, axis=0)
    pred_hiddens = paddle.concat(pred_hiddens, axis=0)
    pred_scores = self.aggregate(pred_hiddens, pred_contexts, x_tree_vecs, 'word')
    pred_targets = paddle.to_tensor(pred_targets)

    pred_loss = self.pred_loss(pred_scores, pred_targets) / len(mol_batch)
    preds = paddle.argmax(pred_scores, axis=1)
    pred_acc = paddle.equal(preds, pred_targets).astype('float32')
    pred_acc = paddle.sum(pred_acc) / pred_targets.size

    # Stop (topology) prediction loss and accuracy.
    stop_contexts = paddle.concat(stop_contexts, axis=0)
    stop_hiddens = paddle.concat(stop_hiddens, axis=0)
    stop_hiddens = F.relu(self.U_i(stop_hiddens))
    stop_scores = self.aggregate(stop_hiddens, stop_contexts, x_tree_vecs, 'stop')
    stop_scores = stop_scores.squeeze(-1)
    stop_targets = paddle.to_tensor(stop_targets).astype('float32')

    stop_loss = self.stop_loss(stop_scores, stop_targets) / len(mol_batch)
    # NOTE(review): a score counts as "stop" only when it is >= 1.0. If
    # stop_scores are raw logits, a threshold of 0 would be the usual
    # choice - confirm against self.stop_loss.
    stops = paddle.greater_equal(stop_scores, paddle.ones(shape=[1])).astype('float32')
    stop_acc = paddle.equal(stops, stop_targets).astype('float32')
    stop_acc = paddle.sum(stop_acc) / stop_targets.size

    return {'pred_loss': pred_loss, 'stop_loss': stop_loss, 'pred_acc': float(pred_acc.numpy()), 'stop_acc': float(stop_acc.numpy())}
def do_train(args):
    """Train a width-elastic supernet on a GLUE task with distillation.

    The function loads the task data, converts the pretrained model into a
    supernet, pairs it with a second pretrained copy used as teacher,
    reorders heads and neurons by importance, and then trains over every
    width in ``args.width_mult_list``. Every ``args.save_steps`` steps the
    teacher and each width are evaluated and the model and tokenizer are
    saved under ``args.output_dir``.

    Args:
        args: Parsed command-line namespace. The fields read here include
            ``device``, ``task_name``, ``model_type``,
            ``model_name_or_path``, ``max_seq_length``, ``batch_size``,
            ``width_mult_list``, ``max_steps``, ``num_train_epochs``,
            ``learning_rate``, ``warmup_steps``, ``adam_epsilon``,
            ``weight_decay``, ``lambda_logit``, ``logging_steps``,
            ``save_steps`` and ``output_dir``. ``task_name`` and
            ``model_type`` are lower-cased in place.
    """
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_ds = load_dataset('glue', args.task_name, splits="train")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example, tokenizer=tokenizer, label_list=train_ds.label_list, max_seq_length=args.max_seq_length)
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=args.batch_size, shuffle=True)
    # The collate callable is bound as a default argument so it is built
    # once, not on every batch.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True)
    # "mnli" has two dev splits (matched / mismatched); every other task
    # has one.
    if args.task_name == "mnli":
        dev_ds_matched, dev_ds_mismatched = load_dataset('glue', args.task_name, splits=["dev_matched", "dev_mismatched"])
        dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True)
        dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True)
        dev_batch_sampler_matched = paddle.io.BatchSampler(dev_ds_matched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_matched = DataLoader(dataset=dev_ds_matched, batch_sampler=dev_batch_sampler_matched, collate_fn=batchify_fn, num_workers=0, return_list=True)
        dev_batch_sampler_mismatched = paddle.io.BatchSampler(dev_ds_mismatched, batch_size=args.batch_size, shuffle=False)
        dev_data_loader_mismatched = DataLoader(dataset=dev_ds_mismatched, batch_sampler=dev_batch_sampler_mismatched, collate_fn=batchify_fn, num_workers=0, return_list=True)
    else:
        dev_ds = load_dataset('glue', args.task_name, splits='dev')
        dev_ds = dev_ds.map(trans_func, lazy=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False)
        dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True)

    # A dataset without a label list gets a single output (and MSELoss
    # below).
    num_labels = 1 if train_ds.label_list == None else len(train_ds.label_list)

    model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_labels)

    # Step1: Initialize a dictionary to save the weights from the origin BERT model.
    origin_weights = model.state_dict()

    # Step2: Convert origin model to supernet.
    sp_config = supernet(expand_ratio=args.width_mult_list)
    model = Convert(sp_config).convert(model)
    # Use weights saved in the dictionary to initialize supernet.
    utils.set_state_dict(model, origin_weights)
    del origin_weights

    # Step3: Define teacher model.
    teacher_model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_labels)

    # Step4: Config about distillation.
    mapping_layers = ['bert.embeddings']
    for idx in range(model.bert.config['num_hidden_layers']):
        mapping_layers.append('bert.encoder.layers.{}'.format(idx))

    default_distill_config = {
        'lambda_distill': 0.1,
        'teacher_model': teacher_model,
        'mapping_layers': mapping_layers,
    }
    distill_config = DistillConfig(**default_distill_config)

    # Step5: Config in supernet training.
    ofa_model = OFA(model, distill_config=distill_config, elastic_order=['width'])

    criterion = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    # For mnli the importance computation receives both dev loaders as a
    # tuple.
    if args.task_name == "mnli":
        dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched)

    # Step6: Calculate the importance of neurons and head,
    # and then reorder them according to the importance.
    head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
        args.task_name,
        ofa_model.model,
        dev_data_loader,
        loss_fct=criterion,
        num_layers=model.bert.config['num_hidden_layers'],
        num_heads=model.bert.config['num_attention_heads'])
    reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)

    if paddle.distributed.get_world_size() > 1:
        ofa_model.model = paddle.DataParallel(ofa_model.model)

    # A positive max_steps overrides the epoch count.
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=ofa_model.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        # Step7: Set current epoch and task.
        ofa_model.set_epoch(epoch)
        ofa_model.set_task('width')

        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch

            # Gradients of all widths are accumulated before one optimizer
            # step.
            for width_mult in args.width_mult_list:
                # Step8: Broadcast supernet config from width_mult,
                # and use this config in supernet training.
                net_config = utils.dynabert_config(ofa_model, width_mult)
                ofa_model.set_net_config(net_config)
                logits, teacher_logits = ofa_model(input_ids, segment_ids, attention_mask=[None, None])
                rep_loss = ofa_model.calc_distill_loss()
                # For the 'sts-b' task the logit loss is a constant zero.
                if args.task_name == 'sts-b':
                    logit_loss = paddle.zeros(shape=[1], dtype='float32')
                else:
                    logit_loss = soft_cross_entropy(logits, teacher_logits.detach())
                loss = rep_loss + args.lambda_logit * logit_loss
                loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                # Evaluate the teacher first, then every supernet width.
                tic_eval = time.time()
                if args.task_name == "mnli":
                    evaluate(teacher_model, criterion, metric, dev_data_loader_matched, width_mult=100)
                    evaluate(teacher_model, criterion, metric, dev_data_loader_mismatched, width_mult=100)
                else:
                    evaluate(teacher_model, criterion, metric, dev_data_loader, width_mult=100)
                print("eval done total : %s s" % (time.time() - tic_eval))
                for idx, width_mult in enumerate(args.width_mult_list):
                    net_config = utils.dynabert_config(ofa_model, width_mult)
                    ofa_model.set_net_config(net_config)
                    tic_eval = time.time()
                    if args.task_name == "mnli":
                        acc = evaluate(ofa_model, criterion, metric, dev_data_loader_matched, width_mult)
                        evaluate(ofa_model, criterion, metric, dev_data_loader_mismatched, width_mult)
                        print("eval done total : %s s" % (time.time() - tic_eval))
                    else:
                        acc = evaluate(ofa_model, criterion, metric, dev_data_loader, width_mult)
                        print("eval done total : %s s" % (time.time() - tic_eval))

                    # Only rank 0 writes the checkpoint.
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir, "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                return
def forward(self, inputs, lengths):
    """
    Decode the highest scoring sequence of tags.

    Args:
        inputs: The unary emission tensor with shape
            `[batch_size, sequence_length, num_tags]`.
        lengths: The input length tensor with shape `[batch_size]`, storing
            the real length of each sequence.

    Returns:
        scores: The scores tensor containing the score for the Viterbi
            sequence, with shape `[batch_size]`.
        paths: The paths tensor containing the highest scoring tag indices.
            Each row is padded with 0 up to the longest decoded path in the
            batch.
    """
    batch_size, _, n_labels = inputs.shape
    inputs_t = inputs.transpose([1, 0, 2])
    trans_exp = self.transitions.unsqueeze(0).expand(
        [batch_size, n_labels, n_labels])

    all_alpha = []
    historys = []

    if self.with_start_stop_tag:
        alpha = self._initialize_alpha(batch_size)
    else:
        alpha = paddle.zeros((batch_size, self.num_tags), dtype='float32')

    for i, logit in enumerate(inputs_t):
        # if not with_start_stop_tag, the first label has no antecedent
        # tag.
        if i == 0 and not self.with_start_stop_tag:
            alpha = logit
            all_alpha.append(alpha)
            continue
        alpha_exp = alpha.unsqueeze(2)
        # alpha_trn_sum: batch_size, n_labels, n_labels
        alpha_trn_sum = alpha_exp + trans_exp
        # alpha_max: batch_size, n_labels
        # The emission scores are left out here because the max does not
        # depend on them (they are added below).
        alpha_max = alpha_trn_sum.max(1)
        # Back-pointers are recorded from i=1 onwards only, so historys[k]
        # belongs to step k + 1.
        if i >= 1:
            alpha_argmax = alpha_trn_sum.argmax(1)
            historys.append(alpha_argmax)
        # Now add the emission scores
        alpha = alpha_max + logit
        all_alpha.append(alpha)

    # Get the valid alpha, i.e. the one at each sequence's last real step.
    all_alpha = paddle.stack(all_alpha).transpose([1, 0, 2])
    batch_index = self._get_batch_index(batch_size)
    last_index = lengths - 1
    idxs = paddle.stack([batch_index, last_index], axis=1)
    alpha = paddle.gather_nd(all_alpha, idxs)

    if self.with_start_stop_tag:
        # The last one step
        alpha += self.transitions[self.stop_idx].unsqueeze(0).expand_as(
            alpha)
    scores, last_ids = alpha.max(1), alpha.argmax(1).numpy().tolist()

    # Trace back the best path
    # historys: seq_len - 1, batch_size, n_labels
    # A one-step input records no back-pointers, and paddle.stack cannot
    # take an empty list, so fall back to an empty history.
    historys_np = paddle.stack(historys).numpy() if historys else []
    lengths_np = lengths.numpy()
    batch_path = []
    max_len = 0
    for batch_id in range(batch_size):
        best_last_tag = last_ids[batch_id]
        path = [best_last_tag]
        # A sequence of length L ends at step L - 1 and therefore needs the
        # back-pointers of steps 1 .. L - 1, i.e. historys[:L - 1]. Slicing
        # with L would walk one step past the end of shorter sequences.
        num_steps = max(int(lengths_np[batch_id]) - 1, 0)
        for hist in reversed(historys_np[:num_steps]):
            # hist: batch_size, n_labels
            best_last_tag = hist[batch_id][best_last_tag]
            path.append(best_last_tag)
        path.reverse()
        max_len = max(max_len, len(path))
        batch_path.append(path)
    # Pad to the longest path so the batch forms a rectangular tensor.
    batch_path = [
        path + [0] * (max_len - len(path)) for path in batch_path
    ]
    batch_path = paddle.to_tensor(batch_path)
    return scores, batch_path
def __call__(self, box_cls, box_pred, scale_factor_wh, img_whwh):
    """
    Select the top-scoring proposals per image and rescale their boxes.

    Arguments:
        box_cls (Tensor): tensor of shape (batch_size, num_proposals, K).
            The tensor predicts the classification probability for each
            proposal.
        box_pred (Tensor): tensors of shape (batch_size, num_proposals, 4).
            The tensor predicts 4-vector (x,y,w,h) box regression values
            for every proposal
        scale_factor_wh (Tensor): tensors of shape [batch_size, 2] the
            scalor of per img
        img_whwh (Tensor): tensors of shape [batch_size, 4]

    Returns:
        bbox_pred (Tensor): tensors of shape [num_boxes, 6] Each row has 6
            values: [label, confidence, xmin, ymin, xmax, ymax]
        bbox_num (Tensor): tensors of shape [batch_size] the number of RoIs
            in each image.
    """
    assert len(box_cls) == len(scale_factor_wh) == len(img_whwh)

    img_wh = img_whwh[:, :2]
    scores = F.sigmoid(box_cls)

    # One class id per (proposal, class) pair, flattened in the same order
    # as the per-image flattened scores below.
    class_ids = paddle.arange(0, self.num_classes).unsqueeze(0)
    labels = class_ids.tile([self.num_proposals, 1]).flatten(
        start_axis=0, stop_axis=1)

    classes_all = []
    scores_all = []
    boxes_all = []
    for image_scores, image_boxes in zip(scores, box_pred):
        top_scores, top_idx = image_scores.flatten(0, 1).topk(
            self.num_proposals, sorted=False)
        top_labels = paddle.gather(labels, top_idx, axis=0)
        # Repeat every box once per class so top_idx can index it.
        per_class_boxes = image_boxes.reshape([-1, 1, 4]).tile(
            [1, self.num_classes, 1]).reshape([-1, 4])
        top_boxes = paddle.gather(per_class_boxes, top_idx, axis=0)

        classes_all.append(top_labels)
        scores_all.append(top_scores)
        boxes_all.append(top_boxes)

    num_images = len(scale_factor_wh)
    bbox_num = paddle.zeros([num_images], dtype="int32")
    boxes_final = []

    for img_idx in range(num_images):
        classes = classes_all[img_idx]
        boxes = boxes_all[img_idx]
        scores = scores_all[img_idx]

        # Clip x (even columns) and y (odd columns) to the image extent,
        # then divide by the per-image scale factor.
        boxes[:, 0::2] = paddle.clip(
            boxes[:, 0::2], min=0,
            max=img_wh[img_idx][0]) / scale_factor_wh[img_idx][0]
        boxes[:, 1::2] = paddle.clip(
            boxes[:, 1::2], min=0,
            max=img_wh[img_idx][1]) / scale_factor_wh[img_idx][1]

        boxes_w = (boxes[:, 2] - boxes[:, 0]).numpy()
        boxes_h = (boxes[:, 3] - boxes[:, 1]).numpy()
        keep = (boxes_w > 1.) & (boxes_h > 1.)

        if not keep.any():
            # No box is wider and taller than 1: emit one all-zero row.
            bboxes = paddle.zeros([1, 6]).astype("float32")
        else:
            boxes = paddle.to_tensor(boxes.numpy()[keep]).astype("float32")
            classes = paddle.to_tensor(classes.numpy()[keep]).astype(
                "float32").unsqueeze(-1)
            scores = paddle.to_tensor(scores.numpy()[keep]).astype(
                "float32").unsqueeze(-1)
            bboxes = paddle.concat([classes, scores, boxes], axis=-1)

        boxes_final.append(bboxes)
        bbox_num[img_idx] = bboxes.shape[0]

    bbox_pred = paddle.concat(boxes_final)
    return bbox_pred, bbox_num
def __init__(self,
             dim,
             input_resolution,
             num_heads,
             window_size=7,
             shift_size=0,
             mlp_ratio=4.,
             qkv_bias=True,
             qk_scale=None,
             drop=0.,
             attn_drop=0.,
             drop_path=0.,
             act_layer=nn.GELU,
             norm_layer=nn.LayerNorm):
    """
    Build one windowed-attention block: norm -> window attention ->
    drop-path, then norm -> MLP, plus the attention mask that is needed
    when the windows are shifted.

    Args:
        dim: channel width, passed to both norm layers, the attention
            module and the MLP.
        input_resolution: pair of spatial sizes, unpacked as (H, W).
        num_heads: number of attention heads.
        window_size: window edge; reduced to min(input_resolution) when
            the input is not larger than one window.
        shift_size: window shift; forced to 0 in that same case.
        mlp_ratio: hidden width of the MLP is int(dim * mlp_ratio).
        qkv_bias, qk_scale, attn_drop: forwarded to WindowAttention.
        drop: dropout rate for the attention projection and the MLP.
        drop_path: DropPath is used when > 0, Identity otherwise.
        act_layer: activation class handed to the MLP.
        norm_layer: normalisation class, instantiated twice with `dim`.
    """
    super().__init__()
    self.dim = dim
    self.input_resolution = input_resolution
    self.num_heads = num_heads
    self.window_size = window_size
    self.shift_size = shift_size
    self.mlp_ratio = mlp_ratio

    smallest_side = min(self.input_resolution)
    if smallest_side <= self.window_size:
        # The window would cover the whole input: use one unshifted window.
        self.shift_size = 0
        self.window_size = min(self.input_resolution)
    assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

    self.norm1 = norm_layer(dim)
    self.attn = WindowAttention(
        dim,
        window_size=(self.window_size, self.window_size),
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop)
    if drop_path > 0.:
        self.drop_path = DropPath(drop_path)
    else:
        self.drop_path = Identity()
    self.norm2 = norm_layer(dim)
    self.mlp = Mlp(in_features=dim,
                   hidden_features=int(dim * mlp_ratio),
                   act_layer=act_layer,
                   drop=drop)

    attn_mask = None
    if self.shift_size > 0:
        # Label the nine regions produced by the shift with distinct ids.
        height, width = self.input_resolution
        img_mask = paddle.zeros((1, height, width, 1))  # 1 H W 1
        bands = (slice(0, -self.window_size),
                 slice(-self.window_size, -self.shift_size),
                 slice(-self.shift_size, None))
        region_id = 0
        for row_band in bands:
            for col_band in bands:
                img_mask[:, row_band, col_band, :] = region_id
                region_id += 1

        # nW, window_size, window_size, 1
        mask_windows = window_partition(img_mask, self.window_size)
        mask_windows = mask_windows.reshape(
            (-1, self.window_size * self.window_size))
        # Non-zero wherever two positions of a window carry different ids.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        _mask = (attn_mask != 0).astype('float32')
        # Rewrite in place: -100 for mismatching pairs, 0 elsewhere.
        attn_mask *= 0
        attn_mask += _mask * float(-100)

    self.register_buffer("attn_mask", attn_mask)
def test_assign_output(array):
    """Copy `array` into a pre-allocated 3x2 float32 tensor and return it."""
    destination = paddle.zeros(shape=[3, 2], dtype='float32')
    # Write through the `output` argument instead of using the return value;
    # the original note gives [[1, 1], [3, 4], [1, 3]] as the sample result.
    paddle.assign(array, destination)
    return destination
def __init__(
        self,
        img_size=224,
        tokens_type="performer",
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        token_dim=64,
        class_dim=1000, ):
    """
    Assemble the tokens-to-token front end, class token, positional
    embedding, transformer blocks, final norm and optional classifier.

    Args:
        img_size, tokens_type, in_chans, token_dim: forwarded to T2T_Layer.
        embed_dim: token width used by every block and by the head.
        depth: number of transformer blocks.
        num_heads, mlp_ratio, qkv_bias, qk_scale: forwarded to each Block.
        drop_rate: dropout after the positional embedding and inside blocks.
        attn_drop_rate: attention dropout inside blocks.
        drop_path_rate: upper end of the per-block drop-path schedule.
        norm_layer: normalisation class for the blocks and the final norm.
        class_dim: classifier width; no head is created when it is <= 0.
    """
    super().__init__()
    self.class_dim = class_dim
    # num_features for consistency with other models
    self.num_features = embed_dim
    self.embed_dim = embed_dim

    self.tokens_to_token = T2T_Layer(
        img_size=img_size,
        tokens_type=tokens_type,
        in_chans=in_chans,
        embed_dim=embed_dim,
        token_dim=token_dim, )
    num_patches = self.tokens_to_token.num_patches

    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
    # One extra position for the class token.
    sinusoid_table = get_sinusoid_encoding(
        n_position=num_patches + 1, d_hid=embed_dim)
    self.pos_embed = add_parameter(self, sinusoid_table)
    self.pos_drop = nn.Dropout(p=drop_rate)

    # Stochastic depth decay rule: rates rise linearly from 0 to the maximum.
    drop_path_schedule = np.linspace(0, drop_path_rate, depth)
    encoder_blocks = [
        Block(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=block_rate,
            norm_layer=norm_layer, ) for block_rate in drop_path_schedule
    ]
    self.blocks = nn.LayerList(encoder_blocks)
    self.norm = norm_layer(embed_dim)

    # Classifier head
    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
# NOTE(review): this fragment arrived with its indentation stripped, so the
# nesting below is a reconstruction — confirm against the original script.
# Record the latest evaluation accuracy.
log_writer.add_scalar('eval/acc', acc, step=step)
log.debug('acc %.5f' % acc)
# Checkpoint the model weights when a save directory was given.
if args.save_dir is not None:
    P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
# NOTE(review): identical to the save above; presumably one ran inside the
# evaluation step and the other after training finished — TODO confirm.
if args.save_dir is not None:
    P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
if args.inference_model_dir is not None:

    # Thin subclass whose forward returns only the logits, so the traced
    # program has a single output.
    class InferenceModel(ErnieModelForSequenceClassification):
        def forward(self, ids, sids):
            _, logits = super(InferenceModel, self).forward(ids, sids)
            return logits

    # Swap the class on the live instance so its weights are reused as-is.
    model.__class__ = InferenceModel
    log.debug('saving inference model')
    # Dummy int64 inputs of shape [2, 2], used only to drive the trace.
    src_placeholder = P.zeros([2, 2], dtype='int64')
    sent_placehodler = P.zeros([2, 2], dtype='int64')
    _, static = P.jit.TracedLayer.trace(
        model, inputs=[src_placeholder, sent_placehodler])
    static.save_inference_model(str(args.inference_model_dir))
    # Disabled alternative that exported through to_static / jit.save:
    #class InferenceModel(ErnieModelForSequenceClassification):
    #    @P.jit.to_static
    #    def forward(self, ids, sids):
    #        _, logits = super(InferenceModel, self).forward(ids, sids, labels=None)
    #        return logits
    #model.__class__ = InferenceModel
    #src_placeholder = P.zeros([2, 2], dtype='int64')
    #sent_placehodler = P.zeros([2, 2], dtype='int64')
    #P.jit.save(model, args.inference_model_dir, input_var=[src_placeholder, sent_placehodler])
    log.debug('done')
def __init__(self,
             input_nc=3,
             output_nc=3,
             ngf=128,
             n_blocks=6,
             norm_layer=nn.InstanceNorm2D,
             load_checkpoint=None):
    """
    Build the encoder / inspiration / decoder stack and load its weights.

    Args:
        input_nc: channel count of the input image.
        output_nc: channel count of the generated image.
        ngf: base width of the bottleneck blocks.
        n_blocks: number of bottleneck blocks placed after the
            inspiration layer.
        norm_layer: normalisation class used throughout the network.
        load_checkpoint: path of a custom checkpoint; when None the
            bundled 'style_paddle.pdparams' under `self.directory` is
            loaded instead.
    """
    super(MSGNet, self).__init__()
    self.gram = GramMatrix()

    block = Bottleneck
    upblock = UpBottleneck
    expansion = 4

    # Encoder: also kept on its own as `self.model1`.
    encoder_layers = [
        ConvLayer(
            input_nc, 64, kernel_size=7, stride=1),
        norm_layer(64),
        nn.ReLU(),
        block(64, 32, 2, 1, norm_layer),
        block(32 * expansion, ngf, 2, 1, norm_layer),
    ]
    self.model1 = nn.Sequential(*tuple(encoder_layers))

    # Full network: encoder -> inspiration -> bottlenecks -> decoder.
    layers = list(encoder_layers)
    self.ins = Inspiration(ngf * expansion)
    layers.append(self.ins)
    for _ in range(n_blocks):
        layers.append(block(ngf * expansion, ngf, 1, None, norm_layer))
    layers.extend([
        upblock(ngf * expansion, 32, 2, norm_layer),
        upblock(32 * expansion, 16, 2, norm_layer),
        norm_layer(16 * expansion),
        nn.ReLU(),
        ConvLayer(
            16 * expansion, output_nc, kernel_size=7, stride=1),
    ])
    self.model = nn.Sequential(*tuple(layers))

    if load_checkpoint is None:
        checkpoint = os.path.join(self.directory, 'style_paddle.pdparams')
        model_dict = paddle.load(checkpoint)
        # Iterate over a snapshot because entries are overwritten below.
        for key in model_dict.copy():
            if not key.endswith("scale"):
                continue
            # Reset every "scale" entry to ones and its sibling bias to zeros.
            bias_key = key.rsplit('.', 1)[0] + '.bias'
            model_dict[bias_key] = paddle.zeros(
                shape=model_dict[bias_key].shape, dtype='float32')
            model_dict[key] = paddle.ones(
                shape=model_dict[key].shape, dtype='float32')
        self.set_dict(model_dict)
        self.model_dict = model_dict
        print("load pretrained checkpoint success")
    else:
        self.model_dict = paddle.load(load_checkpoint)
        self.set_dict(self.model_dict)
        print("load custom checkpoint success")

    self._vgg = None
def decode(self, x_tree_vecs, prob_decode):
    """
    Decode a tree structure from the tree latent space.

    Args:
        x_tree_vecs (tensor): tree latent representation; the batch
            dimension must be 1.
        prob_decode (bool): when true, the stop decision is drawn from a
            Bernoulli distribution and the candidate words from a
            multinomial; otherwise both are chosen greedily.

    Returns:
        The root node and the list of all nodes, in creation order.
    """
    assert x_tree_vecs.shape[0] == 1

    init_hiddens = paddle.zeros([1, self.hidden_size])
    zero_pad = paddle.zeros([1, 1, self.hidden_size])
    contexts = paddle.zeros([1]).astype('int64')

    def stack_messages(messages):
        # Pack a list of messages as [1, n, hidden]; an empty list yields
        # the shared all-zero placeholder.
        if len(messages) > 0:
            return paddle.reshape(
                paddle.stack(
                    messages, axis=0),
                shape=[1, -1, self.hidden_size])
        return zero_pad

    # Pick the root word from the latent vector alone.
    root_score = self.aggregate(init_hiddens, contexts, x_tree_vecs, 'word')
    root_wid = int(paddle.argmax(root_score, axis=1).numpy())

    root = MolTreeNode(self.vocab.get_smiles(root_wid))
    root.wid = root_wid
    root.idx = 0

    stack = [(root, self.vocab.get_slots(root.wid))]
    all_nodes = [root]
    h = {}  # (source idx, target idx) -> message

    for _step in range(MAX_DECODE_LEN):
        node_x, fa_slot = stack[-1]
        cur_h_nei = stack_messages(
            [h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors])

        cur_x = self.embedding(paddle.to_tensor([node_x.wid]))

        # Decide whether to expand node_x or to go back to its parent.
        cur_h = paddle.sum(cur_h_nei, axis=1)
        stop_hiddens = paddle.concat([cur_x, cur_h], axis=1)
        stop_hiddens = F.relu(self.U_i(stop_hiddens))
        stop_score = self.aggregate(stop_hiddens, contexts, x_tree_vecs,
                                    'stop')
        if prob_decode:
            backtrack = (
                paddle.bernoulli(F.sigmoid(stop_score)).item() == 0)
        else:
            backtrack = (float(stop_score.numpy()) < 0)

        if not backtrack:
            new_h = GRU(cur_x, cur_h_nei, self.W_z, self.W_r, self.U_r,
                        self.W_h)
            pred_score = self.aggregate(new_h, contexts, x_tree_vecs,
                                        'word')
            if prob_decode:
                sort_wid = paddle.multinomial(
                    F.softmax(
                        pred_score, axis=1).squeeze(), 5)
            else:
                sort_wid = paddle.argsort(
                    pred_score, axis=1, descending=True)
                sort_wid = sort_wid.squeeze()

            # Take the first of the five best words that fits node_x.
            next_wid = None
            for candidate_wid in sort_wid[:5]:
                candidate_slots = self.vocab.get_slots(candidate_wid)
                probe = MolTreeNode(self.vocab.get_smiles(candidate_wid))
                if have_slots(fa_slot, candidate_slots) and can_assemble(
                        node_x, probe):
                    next_wid = candidate_wid
                    next_slots = candidate_slots
                    break

            if next_wid is None:
                # No candidate can be attached: fall through to backtrack.
                backtrack = True
            else:
                child = MolTreeNode(self.vocab.get_smiles(next_wid))
                child.wid = int(next_wid.numpy())
                child.idx = len(all_nodes)
                child.neighbors.append(node_x)
                h[(node_x.idx, child.idx)] = new_h[0]
                stack.append((child, next_slots))
                all_nodes.append(child)

        if backtrack:
            if len(stack) == 1:
                break  # back at the root: decoding is finished

            node_fa, _ = stack[-2]
            # Message to the parent uses every neighbour except the parent.
            cur_h_nei = stack_messages([
                h[(node_y.idx, node_x.idx)] for node_y in node_x.neighbors
                if node_y.idx != node_fa.idx
            ])
            new_h = GRU(cur_x, cur_h_nei, self.W_z, self.W_r, self.U_r,
                        self.W_h)
            h[(node_x.idx, node_fa.idx)] = new_h[0]
            node_fa.neighbors.append(node_x)
            stack.pop()

    return root, all_nodes
def forward(
        self,
        input_values,
        attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None, ):
    """
    Run feature extraction, feature projection and the encoder.

    Args:
        input_values: input passed to the feature extractor.
        attention_mask: optional mask over the raw input; when given it is
            rebuilt at the resolution of the extracted features.
        output_attentions, output_hidden_states, return_dict: accepted for
            interface compatibility but overridden below.

    Returns:
        A BaseModelOutput carrying the last hidden state together with the
        encoder's `hidden_states` and `attentions`.
    """
    # The three switches are pinned here; the config-driven defaults that
    # used to pick them are disabled.
    output_attentions = False
    output_hidden_states = False
    return_dict = True

    features = self.feature_extractor(input_values)
    features = features.transpose((0, 2, 1))

    if attention_mask is not None:
        # compute real output lengths according to convolution formula
        output_lengths = self._get_feat_extract_output_lengths(
            attention_mask.sum(-1))
        attention_mask = paddle.zeros(
            features.shape[:2], dtype=features.dtype)
        # Mark the last valid frame of every sample, then spread that mark
        # back to the start so that all frames before it are attended to.
        batch_index = paddle.arange(0, end=attention_mask.shape[0])
        attention_mask[(batch_index, output_lengths - 1)] = 1
        reversed_mask = attention_mask.flip([-1])
        attention_mask = reversed_mask.cumsum(-1).flip([-1]).bool()

    features = self.feature_projection(features)

    if self.config.apply_spec_augment and self.training:
        batch_size, sequence_length, hidden_size = features.shape
        assert False
        # apply SpecAugment along time axis
        if self.config.mask_time_prob > 0:
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                self.config.mask_time_prob,
                self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=2, )
            features[paddle.to_tensor(
                mask_time_indices)] = self.masked_spec_embed.to(
                    features.dtype)
        # apply SpecAugment along feature axis
        if self.config.mask_feature_prob > 0:
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                self.config.mask_feature_prob,
                self.config.mask_feature_length, )
            mask_feature_indices = paddle.to_tensor(mask_feature_indices)
            features[mask_feature_indices[:, None].expand(
                -1, sequence_length, -1)] = 0

    encoder_outputs = self.encoder(
        features,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict, )
    last_hidden = encoder_outputs[0]

    if return_dict:
        return BaseModelOutput(
            last_hidden_state=last_hidden,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions, )
    return (last_hidden, ) + encoder_outputs[1:]
def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        mems=None,
        perm_mask=None,
        target_mapping=None,
        input_mask=None,
        head_mask=None,
        inputs_embeds=None,
        use_mems_train=False,
        use_mems_eval=False,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=False, ):
    """
    Run the two-stream attention stack.

    Inputs arrive batch-first and are transposed to length-first for the
    layers; outputs are transposed back before returning.

    Args:
        input_ids: token ids, [bsz, len]. Mutually exclusive with
            `inputs_embeds`; exactly one of the two is required.
        token_type_ids: optional segment ids, [bsz, len].
        attention_mask: optional mask using 0 for padding. Mutually
            exclusive with `input_mask`.
        mems: optional per-layer cached states; `mems[0].shape[0]` gives
            the memory length.
        perm_mask: optional permutation mask, transposed with [1, 2, 0].
        target_mapping: optional target mapping, transposed with
            [1, 2, 0]; enables the query stream.
        input_mask: optional mask using 1 for padding.
        head_mask: optional 1-D or 2-D head mask.
        inputs_embeds: pre-computed embeddings, [bsz, len, hidden].
        use_mems_train / use_mems_eval: whether new memories are cached,
            selected by `self.training`.
        output_attentions: collect per-layer attentions.
        output_hidden_states: collect per-layer hidden states.
        return_dict: return a dict instead of a tuple.

    Returns:
        With `return_dict`, a dict with keys "last_hidden_state", "mems",
        "hidden_states" and "attentions"; otherwise a tuple of those
        values with the None entries dropped.

    Raises:
        ValueError: if both or neither of `input_ids` / `inputs_embeds`
            are given, or if `self.attn_type` is not "uni" or "bi".
    """
    if self.training:
        use_mems = use_mems_train
    else:
        use_mems = use_mems_eval

    # The original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
    # but we want a unified interface in the library with the batch size on the first dimension
    # so we move here the first dimension (batch) to the end
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_ids = paddle.transpose(input_ids, perm=[1, 0])
        qlen, bsz = input_ids.shape[0], input_ids.shape[1]
    elif inputs_embeds is not None:
        # Embeddings are 3-D, so the permutation must name all three axes;
        # a two-element perm is rejected by paddle.transpose.
        inputs_embeds = paddle.transpose(inputs_embeds, perm=[1, 0, 2])
        qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    token_type_ids = token_type_ids.transpose(
        [1, 0]) if token_type_ids is not None else None
    input_mask = input_mask.transpose(
        [1, 0]) if input_mask is not None else None
    attention_mask = attention_mask.transpose(
        [1, 0]) if attention_mask is not None else None
    perm_mask = perm_mask.transpose(
        [1, 2, 0]) if perm_mask is not None else None
    target_mapping = target_mapping.transpose(
        [1, 2, 0]) if target_mapping is not None else None

    mlen = mems[0].shape[
        0] if mems is not None and mems[0] is not None else 0
    klen = mlen + qlen

    # Attention mask
    # Causal attention mask
    if self.attn_type == "uni":
        attn_mask = self.create_mask(qlen, mlen)
        attn_mask = paddle.unsqueeze(attn_mask, axis=[2, 3])
    elif self.attn_type == "bi":
        attn_mask = None
    else:
        raise ValueError("Unsupported attention type: {}".format(
            self.attn_type))

    # Data mask: input mask & perm mask
    assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " "or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one."
    if input_mask is None and attention_mask is not None:
        input_mask = 1.0 - attention_mask
    if input_mask is not None and perm_mask is not None:
        data_mask = paddle.unsqueeze(input_mask, axis=0) + perm_mask
    elif input_mask is not None and perm_mask is None:
        data_mask = paddle.unsqueeze(input_mask, axis=0)
    elif input_mask is None and perm_mask is not None:
        data_mask = perm_mask
    else:
        data_mask = None

    if data_mask is not None:
        # All mems can be attended to
        if mlen > 0:
            mems_mask = paddle.cast(
                paddle.zeros([data_mask.shape[0], mlen, bsz]),
                dtype=dtype_float)
            data_mask = paddle.concat([mems_mask, data_mask], axis=1)
        if attn_mask is None:
            attn_mask = paddle.unsqueeze(data_mask, axis=-1)
        else:
            attn_mask += paddle.unsqueeze(data_mask, axis=-1)

    if attn_mask is not None:
        attn_mask = paddle.cast((attn_mask > 0), dtype=dtype_float)

    if attn_mask is not None:
        # Same mask, except that each position may attend to itself.
        non_tgt_mask = paddle.cast(-paddle.eye(qlen), dtype=dtype_float)
        if mlen > 0:
            non_tgt_mask = paddle.concat(
                [
                    paddle.cast(
                        paddle.zeros([qlen, mlen]), dtype=dtype_float),
                    non_tgt_mask
                ],
                axis=-1)
        non_tgt_mask = paddle.cast(
            ((attn_mask + paddle.unsqueeze(
                non_tgt_mask, axis=[2, 3])) > 0),
            dtype=dtype_float)
    else:
        non_tgt_mask = None

    # Word embeddings and prepare h & g hidden states
    if inputs_embeds is not None:
        word_emb_k = inputs_embeds
    else:
        word_emb_k = self.word_embedding(input_ids)

    output_h = self.dropout(word_emb_k)
    if target_mapping is not None:
        word_emb_q = self.mask_emb.expand(
            [target_mapping.shape[0], bsz, -1])
        output_g = self.dropout(word_emb_q)
    else:
        output_g = None

    # Segment embedding
    if token_type_ids is not None:
        # Convert `token_type_ids` to one-hot `seg_mat`
        if mlen > 0:
            mem_pad = paddle.zeros(shape=[mlen, bsz], dtype='int64')
            cat_ids = paddle.concat(x=[mem_pad, token_type_ids], axis=0)
        else:
            cat_ids = token_type_ids

        # `1` indicates not in the same segment [qlen x klen x bsz]
        seg_mat = paddle.cast(
            paddle.unsqueeze(
                token_type_ids, axis=1) != paddle.unsqueeze(
                    cat_ids, axis=0),
            dtype='int64')
        seg_mat = paddle.cast(
            F.one_hot(
                seg_mat, num_classes=2), dtype=dtype_float)
    else:
        seg_mat = None

    # Positional encoding
    pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
    pos_emb = self.dropout(pos_emb)

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # Attention_probs has shape bsz x n_heads x N x N
    # Input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
    # And head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
    if head_mask is not None:
        if head_mask.dim() == 1:
            head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
                0).unsqueeze(0)
            head_mask = head_mask.expand([self.n_layer, -1, -1, -1, -1])
        elif head_mask.dim() == 2:
            head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
    else:
        head_mask = [None] * self.n_layer

    new_mems = ()
    if mems is None:
        mems = [None] * len(self.layer)

    attentions = [] if output_attentions else None
    hidden_states = [] if output_hidden_states else None
    for i, layer_module in enumerate(self.layer):
        if use_mems:
            # Cache new mems
            new_mems = new_mems + (self.cache_mem(output_h, mems[i]), )
        if output_hidden_states:
            hidden_states.append((output_h, output_g)
                                 if output_g is not None else output_h)

        outputs = layer_module(
            output_h,
            output_g,
            attn_mask_h=non_tgt_mask,
            attn_mask_g=attn_mask,
            r=pos_emb,
            seg_mat=seg_mat,
            mems=mems[i],
            target_mapping=target_mapping,
            head_mask=head_mask[i],
            output_attentions=output_attentions, )
        output_h, output_g = outputs[:2]
        if output_attentions:
            attentions.append(outputs[2])

    # Add last hidden state
    if output_hidden_states:
        hidden_states.append((output_h, output_g)
                             if output_g is not None else output_h)

    output = self.dropout(output_g if output_g is not None else output_h)

    # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
    output = paddle.transpose(output, perm=[1, 0, 2])

    if not use_mems:
        new_mems = None

    if output_hidden_states:
        if output_g is not None:
            # Each entry is an (h, g) pair: flatten the pairs.
            hidden_states = tuple(
                paddle.transpose(
                    h, perm=[1, 0, 2]) for hs in hidden_states for h in hs)
        else:
            hidden_states = tuple(
                paddle.transpose(
                    hs, perm=[1, 0, 2]) for hs in hidden_states)

    if output_attentions:
        if target_mapping is not None:
            # When target_mapping is provided, there are 2-tuple of attentions
            attentions = tuple(
                tuple(
                    paddle.transpose(
                        att_stream, perm=[2, 3, 0, 1]) for att_stream in t)
                for t in attentions)
        else:
            attentions = tuple(
                paddle.transpose(
                    t, perm=[2, 3, 0, 1]) for t in attentions)

    if not return_dict:
        return tuple(
            v for v in [output, new_mems, hidden_states, attentions]
            if v is not None)

    return {
        "last_hidden_state": output,
        "mems": new_mems,
        "hidden_states": hidden_states,
        "attentions": attentions,
    }
def forward(self,
            input_ids=None,
            bbox=None,
            image=None,
            token_type_ids=None,
            position_ids=None,
            attention_mask=None,
            head_mask=None,
            output_hidden_states=None,
            output_attentions=None):
    """
    Encode text tokens together with a grid of visual tokens.

    Args:
        input_ids: token ids; their shape drives every default below.
        bbox: per-token boxes with a trailing dimension of 4; defaults to
            all-zero integer boxes.
        image: input passed to the image-embedding helper.
        token_type_ids: segment ids; default to zeros.
        position_ids: text position ids; default to the leading slice of
            `self.embeddings.position_ids`.
        attention_mask: mask over the text tokens; defaults to ones. The
            visual tokens are always fully visible.
        head_mask: optional 1-D or 2-D head mask.
        output_hidden_states, output_attentions: forwarded to the encoder.

    Returns:
        (sequence_output, pooled_output): the first encoder output and the
        pooler's result on it.
    """
    input_shape = input_ids.shape

    if bbox is None:
        # Must be resolved before `bbox.dtype` / `bbox.shape` are read below.
        # Integer dtype matches the token_type_ids default in this method;
        # presumably the boxes index embedding tables — TODO confirm.
        bbox = paddle.zeros(list(input_shape) + [4], dtype=paddle.int64)

    # The visual sequence has one token per cell of the pooled feature map.
    visual_shape = list(input_shape)
    visual_shape[1] = self.config["image_feature_pool_shape"][
        0] * self.config["image_feature_pool_shape"][1]
    final_shape = list(input_shape)
    final_shape[1] += visual_shape[1]

    # Cell boundaries of the grid, scaled to the 0..1000 range.
    visual_bbox_x = (paddle.arange(
        0,
        1000 * (self.config["image_feature_pool_shape"][1] + 1),
        1000,
        dtype=bbox.dtype, ) // self.config["image_feature_pool_shape"][1])
    visual_bbox_y = (paddle.arange(
        0,
        1000 * (self.config["image_feature_pool_shape"][0] + 1),
        1000,
        dtype=bbox.dtype, ) // self.config["image_feature_pool_shape"][0])

    expand_shape = self.config["image_feature_pool_shape"][0:2]

    visual_bbox = paddle.stack(
        [
            visual_bbox_x[:-1].expand(expand_shape),
            visual_bbox_y[:-1].expand(expand_shape[::-1]).transpose([1, 0]),
            visual_bbox_x[1:].expand(expand_shape),
            visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]),
        ],
        axis=-1, ).reshape([-1, bbox.shape[-1]])
    visual_bbox = visual_bbox.expand([final_shape[0], -1, -1])
    final_bbox = paddle.concat([bbox, visual_bbox], axis=1)

    if attention_mask is None:
        attention_mask = paddle.ones(input_shape)

    visual_attention_mask = paddle.ones(visual_shape)

    attention_mask = attention_mask.astype(visual_attention_mask.dtype)

    final_attention_mask = paddle.concat(
        [attention_mask, visual_attention_mask], axis=1)

    if token_type_ids is None:
        token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64)

    if position_ids is None:
        seq_length = input_shape[1]
        position_ids = self.embeddings.position_ids[:, :seq_length]
        position_ids = position_ids.expand_as(input_ids)

    visual_position_ids = paddle.arange(0, visual_shape[1]).expand(
        [input_shape[0], -1])
    final_position_ids = paddle.concat(
        [position_ids, visual_position_ids], axis=1)

    text_layout_emb = self._calc_text_embeddings(
        input_ids=input_ids,
        bbox=bbox,
        token_type_ids=token_type_ids,
        position_ids=position_ids, )

    visual_emb = self._calc_img_embeddings(
        image=image,
        bbox=visual_bbox,
        position_ids=visual_position_ids, )
    final_emb = paddle.concat([text_layout_emb, visual_emb], axis=1)

    # Turn the 0/1 mask into an additive bias: 0 to keep, -10000 to hide.
    extended_attention_mask = final_attention_mask.unsqueeze(1).unsqueeze(2)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

    if head_mask is not None:
        if head_mask.dim() == 1:
            head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
                -1).unsqueeze(-1)
            # The target shape is passed as one list.
            head_mask = head_mask.expand(
                [self.config["num_hidden_layers"], -1, -1, -1, -1])
        elif head_mask.dim() == 2:
            head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
        # parameters() may be a plain list, so wrap it in iter() first.
        head_mask = head_mask.astype(next(iter(self.parameters())).dtype)
    else:
        head_mask = [None] * self.config["num_hidden_layers"]

    encoder_outputs = self.encoder(
        final_emb,
        extended_attention_mask,
        bbox=final_bbox,
        position_ids=final_position_ids,
        head_mask=head_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states, )
    sequence_output = encoder_outputs[0]
    pooled_output = self.pooler(sequence_output)
    return sequence_output, pooled_output
def init_memory(batch_size, memory_length, d_model, n_layers):
    """
    Create the initial memory: one zero-filled float32 tensor of shape
    [batch_size, memory_length, d_model] per layer, returned as a list.
    """
    memories = []
    for _ in range(n_layers):
        layer_memory = paddle.zeros(
            [batch_size, memory_length, d_model], dtype="float32")
        memories.append(layer_memory)
    return memories
def compute_alpha(beta, t):
    """
    Return the cumulative product of (1 - beta) selected at steps `t`,
    reshaped to [-1, 1, 1, 1].

    A zero is prepended to `beta`, so position 0 of the cumulative product
    is 1 and step `t` is read from position `t + 1`.
    """
    padded_beta = paddle.concat([paddle.zeros([1]), beta], 0)
    cumulative = (1 - padded_beta).cumprod(0)
    selected = cumulative.index_select(t + 1, 0)
    return selected.reshape([-1, 1, 1, 1])
def __init__(self, num_classes=50, max_point=2048):
    """
    Declare the sub-networks of the segmentation model.

    Args:
        num_classes: output channels of the last segmentation convolution.
        max_point: kernel size of the two max-pooling layers; also stored
            as `self.max_point`.
    """
    super(PointNet_Seg, self).__init__()
    self.max_point = max_point

    def conv_stages(widths):
        # One kernel-1 Conv1D -> BatchNorm -> ReLU stage per consecutive
        # pair of channel widths, created in order.
        stages = []
        for in_ch, out_ch in zip(widths[:-1], widths[1:]):
            stages.append(nn.Conv1D(in_ch, out_ch, 1))
            stages.append(nn.BatchNorm(out_ch))
            stages.append(nn.ReLU())
        return stages

    # Input transform branch: features pooled over the points ...
    self.input_transform_net = nn.Sequential(
        *conv_stages([3, 64, 128, 1024]), nn.MaxPool1D(max_point))
    # ... then regressed to 9 values. The last layer starts with zero
    # weights and a flattened 3x3 identity as its bias.
    self.input_fc = nn.Sequential(
        nn.Linear(1024, 512),
        nn.ReLU(),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Linear(
            256,
            9,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Assign(
                    paddle.zeros((256, 9)))),
            bias_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Assign(
                    paddle.reshape(paddle.eye(3), [-1])))))

    self.mlp_1 = nn.Sequential(*conv_stages([3, 64, 64]))

    # Feature transform branch, regressed to 64 * 64 values.
    self.feature_transform_net = nn.Sequential(
        *conv_stages([64, 64, 128, 1024]), nn.MaxPool1D(max_point))
    self.feature_fc = nn.Sequential(
        nn.Linear(1024, 512),
        nn.ReLU(),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Linear(256, 64 * 64))

    self.mlp_2 = nn.Sequential(*conv_stages([64, 64, 128, 1024]))

    # Segmentation head: takes 1024 + 64 input channels.
    self.seg_net = nn.Sequential(
        *conv_stages([1024 + 64, 512, 256, 128, 128]),
        nn.Conv1D(128, num_classes, 1))
def _append_optimize_op(self, block, param_and_grad):
    """
    Append the LAMB update for one (parameter, gradient) pair.

    Args:
        block: the program block being built; must be a
            fluid.framework.Block.
        param_and_grad: pair whose first item is the parameter and whose
            second item is its gradient.

    Returns:
        None. The new moments and the new parameter value are written
        back with `paddle.assign`.
    """
    assert isinstance(block, fluid.framework.Block)
    block.program._use_lamb = True

    param, grad = param_and_grad[0], param_and_grad[1]

    m = self._get_accumulator(self._moment1_acc_str, param)
    v = self._get_accumulator(self._moment2_acc_str, param)
    beta_1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param)
    beta_2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, param)

    beta_1 = layers.fill_constant(
        dtype='float32', shape=[1], value=self._beta1, name='lamb_beta_1')
    beta_2 = layers.fill_constant(
        dtype='float32', shape=[1], value=self._beta2, name='lamb_beta_2')
    epsilon = layers.fill_constant(
        dtype='float32', shape=[1], value=self._epsilon, name='epsilon')

    one = paddle.ones(shape=[1]).astype('float32')
    zero = paddle.zeros(shape=[1]).astype('float32')

    # Exponential moving averages of the gradient and of its square.
    next_m = paddle.multiply(m, beta_1) + paddle.multiply(grad,
                                                          one - beta_1)
    next_v = paddle.multiply(v, beta_2) + paddle.multiply(
        paddle.pow(grad, 2), one - beta_2)

    # Bias correction.
    beta1_correction = one - beta_1_pow_acc
    beta2_correction = one - beta_2_pow_acc
    next_m_unbiased = next_m / beta1_correction
    next_v_unbiased = next_v / beta2_correction

    update = next_m_unbiased / (paddle.sqrt(next_v_unbiased) + epsilon)

    # Decide the decay for THIS parameter only. Writing 0.0 back to
    # self._lamb_weight_decay would switch weight decay off for every
    # parameter handled after the first excluded one.
    if self._exclude_from_weight_decay_fn is not None and self._exclude_from_weight_decay_fn(
            param):
        weight_decay = 0.0
    else:
        weight_decay = self._lamb_weight_decay
    update += weight_decay * param

    # Trust ratio ||w|| / ||update||, falling back to 1 when either is 0.
    w_norm = paddle.norm(param, p=2)
    g_norm = paddle.norm(update, p=2)

    learning_rate = self._create_param_lr(param_and_grad)

    ratio = paddle.where(
        paddle.greater_than(w_norm, zero),
        paddle.where(
            paddle.greater_than(g_norm, zero), (w_norm / g_norm), one),
        one)
    update_with_lr = ratio * learning_rate * update
    next_param = param - update_with_lr

    beta_1_pow_acc *= beta_1
    beta_2_pow_acc *= beta_2

    paddle.assign(next_m, m)
    paddle.assign(next_v, v)
    paddle.assign(next_param, param)

    return None