def sqrt_newton_schulz_autograd(self, A, numIters):
    """Approximate a batched matrix square root with Newton-Schulz iterations.

    Args:
        A: batch of square matrices; the code indexes it as
            (batch, dim, dim) and feeds it to ``bmm``.
        numIters: number of Newton-Schulz iterations to run.

    Returns:
        The iterate ``Y`` rescaled by the square root of each matrix's
        Frobenius norm, expanded to the shape of ``A``.
    """
    batch_size, dim = A.shape[0], A.shape[1]

    # Frobenius norm per matrix, shaped (batch, 1, 1) so it broadcasts.
    fro_norm = paddle.sqrt(paddle.sum(paddle.sum(A * A, axis=1), axis=1))
    fro_norm = fro_norm.reshape([batch_size, 1, 1])
    Y = paddle.divide(A, paddle.expand_as(fro_norm, A))

    def batched_identity():
        # One identity per batch entry, taking part in autograd.
        single = paddle.eye(dim, dim).reshape([1, dim, dim])
        stacked = paddle.concat([single] * batch_size, axis=0)
        stacked.stop_gradient = False
        return stacked

    identity = batched_identity()
    Z = batched_identity()

    for _ in range(numIters):
        T = 0.5 * (3.0 * identity - Z.bmm(Y))
        Y = Y.bmm(T)
        Z = T.bmm(Z)

    # Undo the initial normalisation: sqrt(A) = sqrt(||A||) * Y.
    root = Y * paddle.sqrt(fro_norm).reshape([batch_size, 1, 1])
    return paddle.expand_as(root, A)
def forward(self, embedding, targets):
    """Compute the pairwise CosFace loss over a batch of embeddings.

    Args:
        embedding: feature tensor, or a dict holding it under ``'features'``.
        targets: one label per row of ``embedding``.

    Returns:
        dict with the scalar loss under the key ``"PairwiseCosface"``.
    """
    if isinstance(embedding, dict):
        embedding = embedding['features']

    # Cosine similarity between every pair of L2-normalised embeddings.
    embedding = F.normalize(embedding, axis=1)
    dist_mat = paddle.matmul(embedding, embedding, transpose_y=True)
    N = dist_mat.shape[0]

    labels_by_row = targets.reshape([N, 1]).expand([N, N])
    labels_by_col = paddle.t(targets.reshape([N, 1]).expand([N, N]))
    is_pos = labels_by_row.equal(labels_by_col).astype('float')
    is_neg = labels_by_row.not_equal(labels_by_col).astype('float')

    # A sample is not a positive pair with itself.
    is_pos = is_pos - paddle.eye(N, N)

    # Large negative offset removes non-selected pairs from logsumexp.
    logit_p = -self.gamma * (dist_mat * is_pos) + (-99999999.) * (1 - is_pos)
    logit_n = self.gamma * (dist_mat * is_neg + self.margin) + (
        -99999999.) * (1 - is_neg)

    pos_term = paddle.logsumexp(logit_p, axis=1)
    neg_term = paddle.logsumexp(logit_n, axis=1)
    loss = F.softplus(pos_term + neg_term).mean()
    return {"PairwiseCosface": loss}
def forward(self, node_feat, edge_feat):
    """Update node features by aggregating neighbours through the edges.

    Args:
        node_feat: node features; the first two axes are read as
            (num_tasks, num_data).
        edge_feat: edge features, multiplied by a mask of shape
            (num_tasks, self.edge_dim, num_data, num_data).

    Returns:
        Node features after ``self.network``, with the trailing singleton
        axis squeezed away.
    """
    num_tasks, num_data = node_feat.shape[0], node_feat.shape[1]

    # 1 everywhere except on the diagonal, so self-edges are dropped.
    identity = paddle.expand(
        paddle.eye(num_data),
        [num_tasks, self.edge_dim, num_data, num_data])
    off_diag = 1.0 - identity

    # L1-normalise the masked edges along the last axis.
    edge_feat = F.normalize(edge_feat * off_diag, p=1, axis=-1)

    # Place the two edge halves side by side and aggregate with one bmm.
    edge_halves = paddle.split(edge_feat, 2, 1)
    stacked_edges = paddle.concat(edge_halves, self.edge_dim).squeeze(1)
    aggr_feat = paddle.bmm(stacked_edges, node_feat)

    # Concatenate the original features with both aggregated halves.
    aggr_halves = paddle.split(aggr_feat, 2, 1)
    merged = paddle.concat([node_feat, paddle.concat(aggr_halves, -1)], -1)
    merged = paddle.transpose(merged, (0, 2, 1))

    # Non-linear transform.
    transformed = self.network(merged.unsqueeze(-1))
    return paddle.transpose(transformed, (0, 2, 1, 3)).squeeze(-1)
def label2edge(self, label, mask_diag=True):
    """Turn per-sample labels into a pairwise edge tensor.

    Args:
        label: 2-D label tensor; axis 1 is read as the sample axis.
        mask_diag: when true, zero the diagonal (self-edges).

    Returns:
        Float edge tensor with a channel axis inserted at position 1.
        An entry is 1 where two labels match; this is inverted when
        ``self.edge_type == 'dist'``.
    """
    rows, num_samples = label.shape[0], label.shape[1]

    # label_i[b, i, j] = label[b, i]; label_j is the same with i and j swapped.
    tiled = paddle.expand(label, [num_samples, rows, num_samples])
    label_i = paddle.transpose(tiled, [1, 2, 0])
    label_j = label_i.transpose((0, 2, 1))

    same_label = paddle.equal(label_i, label_j)
    edge = paddle.cast(same_label, 'float32').unsqueeze(1)

    if self.edge_type == 'dist':
        edge = 1 - edge

    if self.edge_dim == 2:
        # Second channel is the complement of the first.
        edge = paddle.concat([edge, 1 - edge], 1)

    if mask_diag:
        size = edge.shape[2]
        identity = paddle.expand(
            paddle.eye(size), [edge.shape[0], self.edge_dim, size, size])
        edge = edge * (1.0 - identity)

    if self.edge_activation == 'softmax':
        # Normalise every row so it sums to one.
        row_sum = edge.sum(-1).unsqueeze(-1)
        edge = edge / row_sum

    return edge
def build_inv_delta_C_paddle(self, C):
    """Return inv_delta_C, which is needed to calculate T.

    Args:
        C: control points, reshaped below as (self.F, 2) coordinates.

    Returns:
        The (F + 3) x (F + 3) inverse of the assembled ``delta_C`` matrix,
        built in float64.
    """
    num_points = self.F
    float64 = 'float64'

    # Pairwise distances; the identity keeps log() finite on the diagonal.
    pairwise = C.reshape([1, num_points, 2]) - C.reshape([num_points, 1, 2])
    hat_C = paddle.norm(
        pairwise, axis=2) + paddle.eye(num_points, dtype=float64)
    hat_C = (hat_C**2) * paddle.log(hat_C)

    # F x (F + 3)
    top_rows = paddle.concat(
        [paddle.ones((num_points, 1), dtype=float64), C, hat_C], axis=1)
    # 2 x (F + 3)
    middle_rows = paddle.concat(
        [
            paddle.zeros((2, 3), dtype=float64),
            paddle.transpose(C, perm=[1, 0]),
        ],
        axis=1)
    # 1 x (F + 3)
    bottom_row = paddle.concat(
        [
            paddle.zeros((1, 3), dtype=float64),
            paddle.ones((1, num_points), dtype=float64),
        ],
        axis=1)

    delta_C = paddle.concat([top_rows, middle_rows, bottom_row], axis=0)
    return paddle.inverse(delta_C)
def forward(self, nodes, edges, nums):
    """Run one round of message passing over a batch of packed graphs.

    Args:
        nodes: node features of all samples concatenated along axis 0.
        edges: edge features, concatenated with the pairwise node features
            on the last axis.
        nums: number of nodes of every sample, in packing order.

    Returns:
        ``[nodes, cat_nodes]``. ``nodes`` is updated in place with the
        residual and ``cat_nodes`` holds the transformed pair features.
    """
    pair_feats = []
    offset = 0
    for num in nums:
        sample_nodes = nodes[offset:offset + num]
        # Every ordered pair (i, j) of nodes within the sample.
        as_rows = paddle.expand(sample_nodes.unsqueeze(1), [-1, num, -1])
        as_cols = paddle.expand(sample_nodes.unsqueeze(0), [num, -1, -1])
        pairs = paddle.concat([as_rows, as_cols], -1)
        pair_feats.append(pairs.reshape([num**2, -1]))
        offset += num

    cat_nodes = paddle.concat([paddle.concat(pair_feats), edges], -1)
    cat_nodes = self.relu(self.in_fc(cat_nodes))
    coefs = self.coef_fc(cat_nodes)

    residuals = []
    offset = 0
    for num in nums:
        pair_slice = slice(offset, offset + num**2)
        # The -1e9 on the diagonal suppresses self-pairs in the softmax.
        self_penalty = -paddle.eye(num).unsqueeze(-1) * 1e9
        residual = F.softmax(
            self_penalty + coefs[pair_slice].reshape([num, num, -1]), 1)
        weighted = residual * cat_nodes[pair_slice].reshape([num, num, -1])
        residuals.append(weighted.sum(1))
        offset += num**2

    nodes += self.relu(self.out_fc(paddle.concat(residuals)))
    return [nodes, cat_nodes]
def forward(self, features, im_info, boxes=None):
    """Run the RPN head and, in training mode, compute its losses.

    Args:
        features: iterable of feature maps, one per pyramid level.
        im_info: image metadata forwarded to the proposal/target helpers.
        boxes: ground-truth boxes; only read in training mode.

    Returns:
        ``rpn_rois`` in evaluation mode, or ``(rpn_rois, loss_dict)`` in
        training mode with keys ``'loss_rpn_cls'`` and ``'loss_rpn_loc'``.
    """
    # Per-level predictions.
    pred_cls_score_list = []
    pred_bbox_offsets_list = []
    for level_feat in features:
        hidden = F.relu(self.rpn_conv(level_feat))
        pred_cls_score_list.append(self.rpn_cls_score(hidden))
        pred_bbox_offsets_list.append(self.rpn_bbox_offsets(hidden))

    # Per-level anchors; the stride offset halves from one level to the next
    # (16, 8, 4, 2, 1 for five levels).
    base_stride = 4
    num_levels = len(features)
    all_anchors_list = [
        self.anchors_generator(level_feat, base_stride,
                               2**(num_levels - 1 - level))
        for level, level_feat in enumerate(features)
    ]

    # Sample proposals from the predictions.
    rpn_rois = find_top_rpn_proposals(self.training, pred_bbox_offsets_list,
                                      pred_cls_score_list, all_anchors_list,
                                      im_info)
    rpn_rois = rpn_rois.cast('float32')

    if not self.training:
        return rpn_rois

    rpn_labels, rpn_bbox_targets = fpn_anchor_target(boxes, im_info,
                                                     all_anchors_list)
    pred_cls_score, pred_bbox_offsets = fpn_rpn_reshape(
        pred_cls_score_list, pred_bbox_offsets_list)

    # Objectness loss over anchors with a non-negative label.
    # NOTE(review): this block mixes `torch.*` calls with paddle-style
    # tensor methods (`.cast`); confirm which framework is intended.
    valid_masks = rpn_labels >= 0
    valid_index = torch.nonzero(valid_masks)
    objectness_loss = F.binary_cross_entropy(
        F.softmax(torch.gather(pred_cls_score, valid_index)),
        torch.gather(torch.eye(2), torch.gather(rpn_labels, valid_index)))

    # Localisation loss over positive anchors only.
    # NOTE(review): the beta comes from `config.rcnn_smooth_l1_beta` even
    # though this is the RPN head; confirm an RPN-specific value is not meant.
    pos_masks = rpn_labels > 0
    pos_index = torch.nonzero(pos_masks)
    localization_loss = F.smooth_l1_loss(
        torch.gather(pred_bbox_offsets, pos_index),
        torch.gather(rpn_bbox_targets, pos_index),
        delta=config.rcnn_smooth_l1_beta)

    # Both losses are averaged over the number of valid anchors.
    normalizer = 1 / valid_masks.cast('float32').sum()
    loss_dict = {
        'loss_rpn_cls': objectness_loss.sum() * normalizer,
        'loss_rpn_loc': localization_loss.sum() * normalizer,
    }
    return rpn_rois, loss_dict
def forward(self):
    """Build the ``paddle.eye`` matrix described by ``self.config``.

    Reads ``"num_rows"``, ``"num_columns"`` and ``"dtype"`` from the config.
    """
    config = self.config
    return paddle.eye(
        config["num_rows"],
        num_columns=config["num_columns"],
        dtype=config["dtype"])
def __init__(self, rgb_range, rgb_mean, rgb_std, sign=-1):
    """Set up a frozen 1x1, 3-to-3 layer that shifts and scales RGB values.

    Args:
        rgb_range: scalar multiplied into the bias.
        rgb_mean: per-channel means (3 values).
        rgb_std: per-channel standard deviations (3 values).
        sign: sign applied to the bias; defaults to -1.
    """
    super(MeanShift, self).__init__(3, 3, kernel_size=1)
    std = paddle.to_tensor(rgb_std)
    mean = paddle.to_tensor(rgb_mean)

    # Start from an identity kernel, then divide each output channel by std.
    identity_kernel = paddle.eye(3).reshape([3, 3, 1, 1])
    self.weight.set_value(identity_kernel)
    scaled_kernel = self.weight / (std.reshape([3, 1, 1, 1]))
    self.weight.set_value(scaled_kernel)

    self.bias.set_value(sign * rgb_range * mean / std)

    # The layer is a fixed transform; exclude it from training.
    for frozen in (self.weight, self.bias):
        frozen.trainable = False
def __init__(self, num_classes=16, max_point=2048):
    """Build the layers of the PointNet classifier.

    Args:
        num_classes: width of the final linear layer.
        max_point: kernel size of the ``MaxPool1D`` layers that end the two
            transform nets.
    """
    super(PointNet_Clas, self).__init__()

    def conv_stack(*channels):
        # Conv1D(k=1) -> BatchNorm -> ReLU for each consecutive channel pair.
        layers = []
        for c_in, c_out in zip(channels[:-1], channels[1:]):
            layers += [nn.Conv1D(c_in, c_out, 1), nn.BatchNorm(c_out),
                       nn.ReLU()]
        return layers

    def fc_trunk():
        # Shared 1024 -> 512 -> 256 prefix of every fully connected head.
        return [nn.Linear(1024, 512), nn.ReLU(), nn.Linear(512, 256),
                nn.ReLU()]

    self.input_transform_net = nn.Sequential(
        *conv_stack(3, 64, 128, 1024), nn.MaxPool1D(max_point))
    # The last layer starts at zero weights and a flattened 3x3 identity bias.
    self.input_fc = nn.Sequential(
        *fc_trunk(),
        nn.Linear(
            256,
            9,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Assign(
                    paddle.zeros((256, 9)))),
            bias_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Assign(
                    paddle.reshape(paddle.eye(3), [-1])))))
    self.mlp_1 = nn.Sequential(*conv_stack(3, 64, 64))
    self.feature_transform_net = nn.Sequential(
        *conv_stack(64, 64, 128, 1024), nn.MaxPool1D(max_point))
    self.feature_fc = nn.Sequential(*fc_trunk(), nn.Linear(256, 64 * 64))
    self.mlp_2 = nn.Sequential(*conv_stack(64, 64, 128, 1024))
    self.fc = nn.Sequential(
        *fc_trunk(), nn.Dropout(p=0.7), nn.Linear(256, num_classes))
def resize_mat(self, x, t):
    """Expand every entry of each square matrix into a ``t x t`` scaled identity.

    For each (s, s) matrix ``m`` in ``x`` the result is the (s*t, s*t)
    matrix with ``out[i*t + a, j*t + b] = m[i, j]`` when ``a == b`` and 0
    otherwise, i.e. the Kronecker product of ``m`` with the identity.

    Args:
        x: tensor of shape (n, c, s, s).
        t: expansion factor; ``x`` is returned unchanged when ``t <= 1``.

    Returns:
        Tensor of shape (n, c, s * t, s * t).
    """
    n, c, s, s1 = x.shape
    assert s == s1
    if t <= 1:
        return x
    x = paddle.reshape(x, (n * c, -1, 1, 1))
    x = x * paddle.eye(t, t, dtype=x.dtype)
    x = paddle.reshape(x, (n * c, s, s, t, t))
    # paddle.split takes the NUMBER of sections (not the section size), so
    # split into `s` slices of width 1; passing 1 here is a no-op and leaves
    # the blocks interleaved incorrectly.
    x = paddle.concat(paddle.split(x, s, axis=1), axis=3)
    x = paddle.concat(paddle.split(x, s, axis=2), axis=4)
    x = paddle.reshape(x, (n, c, s * t, s * t))
    return x
def __init__(self, mean_rgb, sub):
    """Create a frozen 1x1 convolution that adds or subtracts the RGB mean.

    Args:
        mean_rgb: sequence whose first three entries are the R, G, B means.
        sub: when truthy the mean is subtracted, otherwise it is added.
    """
    super(MeanShift, self).__init__()
    sign = -1 if sub else 1
    shift = [mean_rgb[channel] * sign for channel in range(3)]

    # Identity kernel + signed mean as bias == per-channel shift.
    self.shifter = nn.Conv2D(3, 3, 1, 1, 0)
    identity_kernel = paddle.eye(3).reshape([3, 3, 1, 1])
    self.shifter.weight.set_value(identity_kernel)
    self.shifter.bias.set_value(np.array(shift).astype('float32'))

    # Freeze the mean shift layer
    for param in self.shifter.parameters():
        param.trainable = False
def test_out(self):
    """Check ``paddle.eye`` against ``np.eye`` on the CPU executor.

    Covers the default dtype, a rectangular float64 matrix and int64.
    """
    cases = (
        ({}, np.eye(10, dtype="float32")),
        ({"num_columns": 7, "dtype": "float64"},
         np.eye(10, 7, dtype="float64")),
        ({"dtype": "int64"}, np.eye(10, dtype="int64")),
    )
    for eye_kwargs, expected_result in cases:
        # Each case is built and run in its own fresh program.
        with fluid.program_guard(fluid.Program()):
            data = paddle.eye(10, **eye_kwargs)
            exe = fluid.Executor(fluid.CPUPlace())
            result, = exe.run(fetch_list=[data])
            self.assertEqual((result == expected_result).all(), True)
def sigmoid_focal_loss(self, x, label, fg_num, gamma=2.0, alpha=0.25):
    """Element-wise sigmoid focal loss.

    Args:
        x: logits; axis 1 is the class axis.
        label: class indices used to gather rows of a (C + 1) identity.
            Column 0 is dropped afterwards, so index 0 yields an all-zero
            (no positive class) row.
        fg_num: value the loss is divided by when it exceeds 0.5.
        gamma: focusing exponent.
        alpha: weight of the positive term; negatives get ``1 - alpha``.

    Returns:
        The unreduced loss, with the same shape as ``x``.
    """
    num_classes = x.shape[1]
    identity = paddle.eye(num_classes + 1, dtype='float32')
    one_hot = L.gather(identity, label)
    pos_mask = one_hot[:, 1:]  # mask of positive samples

    # Predicted class probabilities.
    p = L.sigmoid(x)
    pos_nll = 0 - L.log(p + 1e-9)
    neg_nll = 0 - L.log(1 - p + 1e-9)

    pos_loss = pos_mask * pos_nll * L.pow(1 - p, gamma) * alpha
    neg_loss = (1.0 - pos_mask) * neg_nll * L.pow(p, gamma) * (1 - alpha)
    focal_loss = pos_loss + neg_loss

    # With no ground truth (fg_num == 0) the loss is left undivided.
    if fg_num > 0.5:
        return focal_loss / fg_num
    return focal_loss
def perm_to_Pmat(perm, dim):
    """Convert batches of 1-based swap sequences into permutation matrices.

    For every row of ``perm`` an index list ``0..dim-1`` is built, entry
    ``idx`` is swapped with entry ``perm[idx] - 1`` in order, and the rows
    of an identity matrix are scattered according to the result.

    Args:
        perm: array whose last axis holds the 1-based swap targets; all
            leading axes are treated as batch axes.
        dim: size of the permutation matrices.

    Returns:
        ``np.array`` of the per-row matrices reshaped to
        ``perm.shape[:-1] + (dim, dim)``.
        NOTE(review): the stacked items are paddle tensors converted
        through ``np.array``; confirm callers expect a NumPy array.
    """
    pshape = perm.shape
    # np.product was removed in NumPy 2.0; np.prod is the supported spelling.
    bs = int(np.prod(perm.shape[:-1]).item())
    perm = perm.reshape((bs, pshape[-1]))
    oneslst = []
    for i in range(bs):
        idlst = np.arange(dim)
        perm_item = perm[i, :]
        for idx, p in enumerate(perm_item - 1):
            # Swap positions idx and p.
            idlst[idx], idlst[p] = idlst[p], idlst[idx]
        ones = paddle.eye(dim)
        nmat = paddle.scatter(ones, paddle.to_tensor(idlst), ones)
        oneslst.append(nmat)
    return np.array(oneslst).reshape(list(pshape[:-1]) + [dim, dim])
def eye_(tensor):
    r"""Fill the 2-dimensional input tensor with the identity matrix, in place.

    Preserves the identity of the inputs in `Linear` layers, where as
    many inputs are preserved as possible.

    Args:
        tensor: a 2-dimensional paddle tensor or parameter.

    Returns:
        The same ``tensor`` object, after being overwritten.

    Raises:
        ValueError: if ``tensor`` does not have exactly 2 dimensions.

    Examples:
        >>> w = paddle.empty([3, 5])
        >>> eye_(w)
    """
    if tensor.ndimension() != 2:
        raise ValueError("Only tensors with 2 dimensions are supported")

    with paddle.no_grad():
        # Build the identity in the tensor's own dtype; paddle.eye defaults
        # to float32, which does not match e.g. a float64 parameter.
        tensor.set_value(paddle.eye(*tensor.shape, dtype=tensor.dtype))
    return tensor
def partial_trace_discontiguous(rho, preserve_qubits=None):
    r"""Compute the partial trace of a quantum state over any subsystem.

    Args:
        rho (Tensor): density matrix of the input state, of size
            :math:`2^n \times 2^n`.
        preserve_qubits (list): indices of the qubits to keep. Defaults to
            None, which keeps every qubit and returns ``rho`` unchanged.
            The list is not modified.

    Returns:
        Tensor: reduced density matrix on the preserved qubits, taken in
        ascending index order.
    """
    if preserve_qubits is None:
        return rho

    n = int(log2(rho.size) // 2)
    # Sorted copy: do not reorder the caller's list as a side effect.
    preserved = sorted(preserve_qubits)
    num_preserve = len(preserved)
    discard = [idx for idx in range(n) if idx not in preserved]

    # View the identity as one axis per qubit plus the column axis, then
    # move the discarded qubits to the front. Row r = d * 2**num_preserve + k
    # then maps to discarded bits d and preserved bits k.
    identity = paddle.eye(2**n)
    identity = paddle.reshape(identity, shape=[2] * n + [2**n])
    identity = paddle.transpose(identity, perm=discard + preserved + [n])
    identity = paddle.reshape(identity, (2**n, 2**n))

    result = np.zeros((2**num_preserve, 2**num_preserve), dtype="complex64")
    result = paddle.to_tensor(result)

    # One projector per basis state of the DISCARDED subsystem, each made of
    # 2**num_preserve consecutive rows.
    block = 2**num_preserve
    for i in range(2**len(discard)):
        bra = identity[i * block:(i + 1) * block, :]
        result = result + matmul(
            matmul(bra, rho), transpose(
                bra, perm=[1, 0]))

    return result
def _contrastive(self, feats_, labels_):
    """Contrastive loss over sampled pixel features.

    Args:
        feats_ (Tensor): sampled pixel, shape = [total_classes, n_view, feat_dim],
            total_classes = batch_size * single image classes
        labels_ (Tensor): label, shape = [total_classes]

    Returns:
        Scalar loss averaged over all anchors.
    """
    n_view = feats_.shape[1]

    # same_class[i, j] is 1 when anchors i and j share a label.
    labels_ = labels_.reshape((-1, 1))
    same_class = paddle.equal(labels_, paddle.transpose(
        labels_, [1, 0])).astype('float32')

    # Stack all views along axis 0; every view is anchor and contrast.
    all_feats = paddle.concat(paddle.unbind(feats_, axis=1), axis=0)
    similarity = paddle.matmul(all_feats, paddle.transpose(
        all_feats, [1, 0])) / self.temperature

    # Subtract the row maximum before exponentiating.
    row_max = paddle.max(similarity, axis=1, keepdim=True)
    logits = similarity - row_max

    mask = paddle.tile(same_class, [n_view, n_view])
    neg_mask = 1 - mask
    # Remove each anchor's similarity with itself from the positives.
    not_self = 1 - paddle.eye(mask.shape[0]).astype('float32')
    mask = mask * not_self

    exp_logits = paddle.exp(logits)
    neg_logits = (exp_logits * neg_mask).sum(1, keepdim=True)
    log_prob = logits - paddle.log(exp_logits + neg_logits)

    mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1)
    scale = -(self.temperature / self.base_temperature)
    return (scale * mean_log_prob_pos).mean()
def forward(self, node_feat, edge_feat=None):
    """Predict an adjacency tensor from pairwise node-feature differences.

    Args:
        node_feat: node features, indexed as (bs, N, num_feat).
        edge_feat: not used in the computation; returned unchanged.

    Returns:
        ``(adj_val, edge_feat)``. ``adj_val`` has one or two channels on
        axis 1, depending on ``self.edge_dim``.
    """
    batch, num_nodes = node_feat.shape[0], node_feat.shape[1]
    use_softmax = self.activation == 'softmax'

    # |x_i - x_j| for every node pair, moved to (bs, num_feat, N, N).
    x_i = node_feat.unsqueeze(2)
    x_j = paddle.transpose(x_i, (0, 2, 1, 3))
    x_ij = paddle.transpose(paddle.abs(x_i - x_j), (0, 3, 2, 1))
    if self.adj_type == 'sim':
        x_ij = paddle.exp(-x_ij)

    sim_val = self.sim_network(x_ij)

    # 0 on the diagonal, 1 elsewhere.
    diag_mask = 1.0 - paddle.expand(
        paddle.eye(num_nodes), [batch, 1, num_nodes, num_nodes])

    if use_softmax:
        sim_val = self.softmax_with_mask(sim_val, diag_mask)
    elif self.activation == 'sigmoid':
        sim_val = F.sigmoid(sim_val) * diag_mask
    else:
        sim_val = sim_val * diag_mask

    adj_val = sim_val
    if self.edge_dim == 2:
        # The second channel is derived from the complement 1 - sim_val.
        if use_softmax:
            dsim_val = self.softmax_with_mask(1 - sim_val, diag_mask)
        else:
            dsim_val = (1 - sim_val) * diag_mask
        adj_val = paddle.concat([sim_val, dsim_val], 1)

    if self.top_k > 0:
        # Keep only the k largest entries of every row.
        n_q, n_edge, n1, n2 = adj_val.shape
        k = min(self.top_k, n1)
        flat_adj = adj_val.reshape((n_q * n_edge * n1, n2))
        _, indices = paddle.topk(flat_adj, k)
        keep = F.one_hot(indices, flat_adj.shape[1]).sum(1)
        keep = keep.reshape((n_q, n_edge, n1, n2))
        if use_softmax:
            adj_val = self.softmax_with_mask(adj_val, keep)
        else:
            adj_val = adj_val * keep

    return adj_val, edge_feat
def forward(self, all_emb, q_emb=None, return_adj=False, return_emb=False):
    """Run the edge/node layer stack and classify support and query nodes.

    Args:
        all_emb: node embeddings, indexed as (bs, N, num_feat); the last
            node on axis 1 is treated as the query.
        q_emb: accepted for interface compatibility; not used here.
        return_adj: when true, the initial distance/similarity adjacency is
            prepended to the returned edge list.
        return_emb: when true, the support and query features are also
            returned.

    Returns:
        ``(s_logits, q_logits, edge_feat_list)``, followed by
        ``s_feat, q_feat`` when ``return_emb`` is true.
    """
    node_feat = all_emb
    if self.pre_dropout > 0:
        node_feat = self.predrop1(node_feat)

    edge_feat_list = []
    if return_adj:
        batch, num_nodes = node_feat.shape[0], node_feat.shape[1]
        # paddle.transpose needs the full permutation; the torch-style
        # two-axis form, e.g. (1, 2), is rejected for a 4-D tensor.
        x_i = node_feat.unsqueeze(2)
        x_j = paddle.transpose(x_i, (0, 2, 1, 3))
        init_adj = paddle.abs(x_i - x_j)
        # size: bs x fs x N x N
        init_adj = paddle.transpose(init_adj, (0, 3, 2, 1))
        if self.adj_type == 'sim':
            init_adj = paddle.exp(-init_adj)
        # Zero the diagonal. The expand target must keep the (N, N) matrix
        # axes, so it is (bs, 1, N, N).
        diag_mask = 1.0 - paddle.expand(
            paddle.eye(num_nodes), [batch, 1, num_nodes, num_nodes])
        init_adj = init_adj * diag_mask
        edge_feat_list.append(init_adj)

    for i in range(self.num_layers):
        adj, _ = self.layer_edge[i](node_feat)
        node_feat_new = self.layer_node[i](node_feat, adj)
        if self.node_concat:
            node_feat = paddle.concat([node_feat, node_feat_new], 2)
        else:
            node_feat = node_feat_new
        edge_feat_list.append(adj)

    if self.pre_dropout > 0:
        node_feat = self.predrop2(node_feat)
    node_feat = self.fc1(node_feat)
    # Residual connection back to the input embeddings.
    node_feat = self.res_alpha * all_emb + node_feat

    # Every node but the last is support; the last one is the query.
    s_feat = node_feat[:, :-1, :]
    q_feat = node_feat[:, -1, :]
    s_logits = self.fc2(s_feat)
    q_logits = self.fc2(q_feat)

    if return_emb:
        return s_logits, q_logits, edge_feat_list, s_feat, q_feat
    else:
        return s_logits, q_logits, edge_feat_list
def test_num_rows_type_check():
    """Call ``paddle.eye`` with a negative row count and an int64 dtype."""
    invalid_num_rows = -1
    paddle.eye(invalid_num_rows, dtype="int64")
def to_one_hot(self, class_idx, num_classes=2):
    """Return the one-hot row(s) for ``class_idx``.

    The rows are picked from a ``num_classes`` x ``num_classes`` identity.
    """
    identity = paddle.eye(num_classes)
    return identity[class_idx]
def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        mems=None,
        perm_mask=None,
        target_mapping=None,
        input_mask=None,
        head_mask=None,
        inputs_embeds=None,
        use_mems_train=False,
        use_mems_eval=False,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=False, ):
    """Run the XLNet layer stack.

    All batch-first inputs are transposed to the length-first layout used
    internally and the main output is transposed back before returning.

    Args:
        input_ids: token ids, batch first. Mutually exclusive with
            ``inputs_embeds``.
        token_type_ids: segment ids, batch first.
        attention_mask: padding mask, 0 for padding. Mutually exclusive
            with ``input_mask``.
        mems: per-layer cached states, or None.
        perm_mask: permutation mask, batch first.
        target_mapping: target mapping, batch first; when given, the query
            stream ``output_g`` is computed.
        input_mask: padding mask, 1 for padding.
        head_mask: 1-D or 2-D head mask, or None.
        inputs_embeds: precomputed embeddings, batch first
            (bsz, len, hidden).
        use_mems_train: whether to cache new mems in training mode.
        use_mems_eval: whether to cache new mems in evaluation mode.
        output_attentions: collect per-layer attentions.
        output_hidden_states: collect per-layer hidden states.
        return_dict: return a dict instead of a tuple.

    Returns:
        With ``return_dict`` false, a tuple of the non-None values among
        ``(output, new_mems, hidden_states, attentions)``. Otherwise a dict
        with keys ``"last_hidden_state"``, ``"mems"``, ``"hidden_states"``
        and ``"attentions"``.

    Raises:
        ValueError: if both or neither of ``input_ids`` / ``inputs_embeds``
            are given, or ``self.attn_type`` is not ``"uni"`` or ``"bi"``.
    """
    if self.training:
        use_mems = use_mems_train
    else:
        use_mems = use_mems_eval

    # The original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
    # but we want a unified interface in the library with the batch size on the first dimension
    # so we move here the first dimension (batch) to the end
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_ids = paddle.transpose(input_ids, perm=[1, 0])
        qlen, bsz = input_ids.shape[0], input_ids.shape[1]
    elif inputs_embeds is not None:
        # Embeddings are 3-D (bsz, len, hidden): swap only the first two
        # axes. paddle.transpose needs the full permutation.
        inputs_embeds = paddle.transpose(inputs_embeds, perm=[1, 0, 2])
        qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    token_type_ids = token_type_ids.transpose(
        [1, 0]) if token_type_ids is not None else None
    input_mask = input_mask.transpose(
        [1, 0]) if input_mask is not None else None
    attention_mask = attention_mask.transpose(
        [1, 0]) if attention_mask is not None else None
    perm_mask = perm_mask.transpose(
        [1, 2, 0]) if perm_mask is not None else None
    target_mapping = target_mapping.transpose(
        [1, 2, 0]) if target_mapping is not None else None

    mlen = mems[0].shape[
        0] if mems is not None and mems[0] is not None else 0
    klen = mlen + qlen

    # Attention mask
    # Causal attention mask
    if self.attn_type == "uni":
        attn_mask = self.create_mask(qlen, mlen)
        attn_mask = paddle.unsqueeze(attn_mask, axis=[2, 3])
    elif self.attn_type == "bi":
        attn_mask = None
    else:
        raise ValueError("Unsupported attention type: {}".format(
            self.attn_type))

    # Data mask: input mask & perm mask
    assert input_mask is None or attention_mask is None, (
        "You can only use one of input_mask (uses 1 for padding) "
        "or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one."
    )
    if input_mask is None and attention_mask is not None:
        input_mask = 1.0 - attention_mask
    if input_mask is not None and perm_mask is not None:
        data_mask = paddle.unsqueeze(input_mask, axis=0) + perm_mask
    elif input_mask is not None and perm_mask is None:
        data_mask = paddle.unsqueeze(input_mask, axis=0)
    elif input_mask is None and perm_mask is not None:
        data_mask = perm_mask
    else:
        data_mask = None

    if data_mask is not None:
        # All mems can be attended to
        if mlen > 0:
            mems_mask = paddle.cast(
                paddle.zeros([data_mask.shape[0], mlen, bsz]),
                dtype=dtype_float)
            data_mask = paddle.concat([mems_mask, data_mask], axis=1)
        if attn_mask is None:
            attn_mask = paddle.unsqueeze(data_mask, axis=-1)
        else:
            attn_mask += paddle.unsqueeze(data_mask, axis=-1)

    if attn_mask is not None:
        attn_mask = paddle.cast((attn_mask > 0), dtype=dtype_float)

    if attn_mask is not None:
        # The content stream may attend to its own position: subtract the
        # identity from the mask before re-thresholding it.
        non_tgt_mask = paddle.cast(-paddle.eye(qlen), dtype=dtype_float)
        if mlen > 0:
            non_tgt_mask = paddle.concat(
                [
                    paddle.cast(
                        paddle.zeros([qlen, mlen]), dtype=dtype_float),
                    non_tgt_mask
                ],
                axis=-1)
        non_tgt_mask = paddle.cast(
            ((attn_mask + paddle.unsqueeze(
                non_tgt_mask, axis=[2, 3])) > 0),
            dtype=dtype_float)
    else:
        non_tgt_mask = None

    # Word embeddings and prepare h & g hidden states
    if inputs_embeds is not None:
        word_emb_k = inputs_embeds
    else:
        word_emb_k = self.word_embedding(input_ids)

    output_h = self.dropout(word_emb_k)
    if target_mapping is not None:
        word_emb_q = self.mask_emb.expand(
            [target_mapping.shape[0], bsz, -1])
        output_g = self.dropout(word_emb_q)
    else:
        output_g = None

    # Segment embedding
    if token_type_ids is not None:
        # Convert `token_type_ids` to one-hot `seg_mat`
        if mlen > 0:
            mem_pad = paddle.zeros(shape=[mlen, bsz], dtype='int64')
            cat_ids = paddle.concat(x=[mem_pad, token_type_ids], axis=0)
        else:
            cat_ids = token_type_ids

        # `1` indicates not in the same segment [qlen x klen x bsz]
        seg_mat = paddle.cast(
            paddle.unsqueeze(
                token_type_ids, axis=1) != paddle.unsqueeze(
                    cat_ids, axis=0),
            dtype='int64')
        seg_mat = paddle.cast(
            F.one_hot(
                seg_mat, num_classes=2), dtype=dtype_float)
    else:
        seg_mat = None

    # Positional encoding
    pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
    pos_emb = self.dropout(pos_emb)

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # Attention_probs has shape bsz x n_heads x N x N
    # Input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
    # And head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
    if head_mask is not None:
        if head_mask.dim() == 1:
            head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
                0).unsqueeze(0)
            head_mask = head_mask.expand([self.n_layer, -1, -1, -1, -1])
        elif head_mask.dim() == 2:
            head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
    else:
        head_mask = [None] * self.n_layer

    new_mems = ()
    if mems is None:
        mems = [None] * len(self.layer)

    attentions = [] if output_attentions else None
    hidden_states = [] if output_hidden_states else None
    for i, layer_module in enumerate(self.layer):
        if use_mems:
            # Cache new mems
            new_mems = new_mems + (self.cache_mem(output_h, mems[i]), )
        if output_hidden_states:
            hidden_states.append((
                output_h, output_g) if output_g is not None else output_h)

        outputs = layer_module(
            output_h,
            output_g,
            attn_mask_h=non_tgt_mask,
            attn_mask_g=attn_mask,
            r=pos_emb,
            seg_mat=seg_mat,
            mems=mems[i],
            target_mapping=target_mapping,
            head_mask=head_mask[i],
            output_attentions=output_attentions, )
        output_h, output_g = outputs[:2]

        if output_attentions:
            attentions.append(outputs[2])

    # Add last hidden state
    if output_hidden_states:
        hidden_states.append((
            output_h, output_g) if output_g is not None else output_h)

    output = self.dropout(output_g if output_g is not None else output_h)

    # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
    output = paddle.transpose(output, perm=[1, 0, 2])

    if not use_mems:
        new_mems = None

    if output_hidden_states:
        if output_g is not None:
            # Entries are (h, g) pairs here; flatten them while transposing.
            hidden_states = tuple(
                paddle.transpose(
                    h, perm=[1, 0, 2]) for hs in hidden_states for h in hs)
        else:
            hidden_states = tuple(
                paddle.transpose(
                    hs, perm=[1, 0, 2]) for hs in hidden_states)

    if output_attentions:
        if target_mapping is not None:
            # When target_mapping is provided, there are 2-tuple of attentions
            attentions = tuple(
                tuple(
                    paddle.transpose(
                        att_stream, perm=[2, 3, 0, 1]) for att_stream in t)
                for t in attentions)
        else:
            attentions = tuple(
                paddle.transpose(
                    t, perm=[2, 3, 0, 1]) for t in attentions)

    if not return_dict:
        return tuple(
            v for v in [output, new_mems, hidden_states, attentions]
            if v is not None)

    return {
        "last_hidden_state": output,
        "mems": new_mems,
        "hidden_states": hidden_states,
        "attentions": attentions,
    }
def minimize_bfgs(objective_func,
                  initial_position,
                  max_iters=50,
                  tolerance_grad=1e-7,
                  tolerance_change=1e-9,
                  initial_inverse_hessian_estimate=None,
                  line_search_fn='strong_wolfe',
                  max_line_search_iters=50,
                  initial_step_length=1.0,
                  dtype='float32',
                  name=None):
    r"""
    Minimizes a differentiable function ``objective_func`` using the BFGS method.

    BFGS is a quasi-Newton method for solving an unconstrained optimization
    problem over a differentiable function. It is closely related to the
    Newton method for minimization. Consider the iterate update formula
    used by this implementation:

    .. math::
        x_{k+1} = x_{k} - \alpha_k H_k \nabla{f_k}

    where :math:`\alpha_k` is the step length returned by the line search.
    If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
    If :math:`H_k` is a symmetric positive definite approximation of the inverse Hessian, then
    it's a quasi-Newton method. In practice, the approximated Hessians are obtained by only
    using the gradients, over either the whole or part of the search history; the former is
    BFGS, the latter is L-BFGS.

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method).

    Args:
        objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar.
        initial_position (Tensor): the starting point of the iterates, has the same shape as the input of ``objective_func`` .
        max_iters (int, optional): the maximum number of minimization iterations. Default value: 50.
        tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. The gradient norm is the inf norm. Default value: 1e-7.
        tolerance_change (float, optional): terminates if the inf norm of the search direction is smaller than this value. Default value: 1e-9.
        initial_inverse_hessian_estimate (Tensor, optional): the initial inverse Hessian approximation at ``initial_position``. It must be symmetric and positive definite. If not given, an identity matrix of order N is used, where N is the size of ``initial_position`` . Default value: None.
        line_search_fn (str, optional): which line search method to use. Only 'strong_wolfe' is supported; any other value raises ``NotImplementedError``. Default value: 'strong_wolfe'.
        max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50.
            NOTE(review): this value is not forwarded to ``strong_wolfe`` in this function; confirm whether it should be.
        initial_step_length (float, optional): step length used in the first iteration of the line search. Different values may lead to different optimal results. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0.
        dtype ('float32' | 'float64', optional): data type used in the algorithm; the data type of the input parameter must be consistent with it. Default value: 'float32'.
        name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None.

    Returns:
        output(tuple):

            - is_converge (bool): Indicates whether the minimum was found within tolerance.
            - num_func_calls (int): number of objective function calls.
            - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function with respect to the initial position.
            - objective_value (Tensor): objective function value at the `position`.
            - objective_gradient (Tensor): objective function gradient at the `position`.
            - inverse_hessian_estimate (Tensor): the estimate of the inverse Hessian at the `position`.

    Raises:
        ValueError: if ``dtype`` is neither 'float32' nor 'float64'.
        NotImplementedError: if ``line_search_fn`` is not 'strong_wolfe'.

    Examples:
        .. code-block:: python

            import paddle

            def func(x):
                return paddle.dot(x, x)

            x0 = paddle.to_tensor([1.3, 2.7])
            results = paddle.incubate.optimizer.functional.minimize_bfgs(func, x0)
            print("is_converge: ", results[0])
            print("the minimum of func is: ", results[2])
            # is_converge:  Tensor(shape=[1], dtype=bool, place=Place(gpu:0), stop_gradient=True,
            #        [True])
            # the minimum of func is:  Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
            #        [0., 0.])
    """

    if dtype not in ['float32', 'float64']:
        raise ValueError(
            "The dtype must be 'float32' or 'float64', but the specified is {}."
            .format(dtype))

    op_name = 'minimize_bfgs'
    check_input_type(initial_position, 'initial_position', op_name)

    # Identity of order N; also reused for the Vk terms of the BFGS update.
    I = paddle.eye(initial_position.shape[0], dtype=dtype)
    if initial_inverse_hessian_estimate is None:
        initial_inverse_hessian_estimate = I
    else:
        check_input_type(initial_inverse_hessian_estimate,
                         'initial_inverse_hessian_estimate', op_name)
        check_initial_inverse_hessian_estimate(
            initial_inverse_hessian_estimate)

    Hk = paddle.assign(initial_inverse_hessian_estimate)
    # use detach and assign to create a new tensor rather than =, or xk will share memory and grad with initial_position
    xk = paddle.assign(initial_position.detach())

    value, g1 = _value_and_gradient(objective_func, xk)
    # The evaluation above counts as the first objective call.
    num_func_calls = paddle.full(shape=[1], fill_value=1, dtype='int64')

    # when the dim of x is 1000, it needs more than 30 iters to get all elements to converge to the minimum.
    k = paddle.full(shape=[1], fill_value=0, dtype='int64')
    done = paddle.full(shape=[1], fill_value=False, dtype='bool')
    is_converge = paddle.full(shape=[1], fill_value=False, dtype='bool')

    def cond(k, done, is_converge, num_func_calls, xk, value, g1, Hk):
        # Keep iterating while under the iteration budget and not finished.
        return (k < max_iters) & ~done

    def body(k, done, is_converge, num_func_calls, xk, value, g1, Hk):
        #############    compute pk    #############
        # Search direction: pk = -Hk * g
        pk = -paddle.matmul(Hk, g1)

        #############    compute alpha by line search    #############
        if line_search_fn == 'strong_wolfe':
            alpha, value, g2, ls_func_calls = strong_wolfe(
                f=objective_func,
                xk=xk,
                pk=pk,
                initial_step_length=initial_step_length,
                dtype=dtype)
        else:
            raise NotImplementedError(
                "Currently only support line_search_fn = 'strong_wolfe', but the specified is '{}'"
                .format(line_search_fn))
        num_func_calls += ls_func_calls

        #############    update Hk    #############
        # sk is the step taken, yk the change of the gradient.
        sk = alpha * pk
        yk = g2 - g1

        xk = xk + sk
        g1 = g2

        sk = paddle.unsqueeze(sk, 0)
        yk = paddle.unsqueeze(yk, 0)

        rhok_inv = paddle.dot(yk, sk)
        # Avoid dividing by zero: fall back to rhok = 1000 when yk . sk == 0.
        rhok = paddle.static.nn.cond(
            rhok_inv == 0.,
            lambda: paddle.full(shape=[1], fill_value=1000.0, dtype=dtype),
            lambda: 1. / rhok_inv)

        # Hk <- Vk^T Hk Vk + rhok * sk sk^T
        Vk_transpose = I - rhok * sk * yk.t()
        Vk = I - rhok * yk * sk.t()
        Hk = paddle.matmul(paddle.matmul(Vk_transpose, Hk),
                           Vk) + rhok * sk * sk.t()

        k += 1

        #############    check convergence    #############
        gnorm = paddle.linalg.norm(g1, p=np.inf)
        pk_norm = paddle.linalg.norm(pk, p=np.inf)
        paddle.assign(
            done | (gnorm < tolerance_grad) | (pk_norm < tolerance_change),
            done)
        # is_converge is recorded before the alpha == 0 stop below, so a
        # stalled line search ends the loop without reporting convergence.
        paddle.assign(done, is_converge)
        # when alpha=0, there is no chance for xk to change.
        paddle.assign(done | (alpha == 0.), done)
        return [k, done, is_converge, num_func_calls, xk, value, g1, Hk]

    paddle.static.nn.while_loop(
        cond=cond,
        body=body,
        loop_vars=[k, done, is_converge, num_func_calls, xk, value, g1, Hk])
    return is_converge, num_func_calls, xk, value, g1, Hk
def lddt(predicted_points,
         true_points,
         true_points_mask,
         cutoff=15.,
         per_residue=False):
    """Measure (approximate) lDDT for a batch of coordinates.

    lDDT reference:
    Mariani, V., Biasini, M., Barbato, A. & Schwede, T. lDDT: A local
    superposition-free score for comparing protein structures and models
    using distance difference tests. Bioinformatics 29, 2722-2728 (2013).

    The score compares the pairwise distance matrix of the true points with
    that of the predicted points. Only pairs that are closer than `cutoff`
    *in the true structure* contribute. The physical-feasibility terms of the
    original paper (e.g. bond length violations) are not included, so the
    result is only an approximation of lDDT.

    Args:
        predicted_points: (batch, length, 3) array of predicted 3D points.
        true_points: (batch, length, 3) array of true 3D points.
        true_points_mask: (batch, length, 1) binary-valued float array. This
            mask should be 1 for points that exist in the true points.
        cutoff: Maximum distance for a pair of points to be included.
        per_residue: If true, return a score for each residue. Note that the
            overall lDDT is not exactly the mean of the per-residue lDDT's
            because some residues have more contacts than others.

    Returns:
        An (approximate, see above) lDDT score in the range 0-1.
    """
    assert len(predicted_points.shape) == 3
    assert predicted_points.shape[-1] == 3
    assert true_points_mask.shape[-1] == 1
    assert len(true_points_mask.shape) == 3

    # Pairwise distance matrices; the 1e-10 keeps sqrt away from exactly 0.
    true_deltas = true_points[:, :, None] - true_points[:, None, :]
    dmat_true = paddle.sqrt(1e-10 + paddle.sum(true_deltas**2, axis=-1))

    predicted_deltas = (predicted_points[:, :, None] -
                        predicted_points[:, None, :])
    dmat_predicted = paddle.sqrt(
        1e-10 + paddle.sum(predicted_deltas**2, axis=-1))

    cutoff = paddle.to_tensor(cutoff)

    # A pair is scored when it is within the cutoff in the true structure,
    # both of its points exist, and it is not a point paired with itself.
    within_cutoff = paddle.cast((dmat_true < cutoff), 'float32')
    mask_transposed = paddle.transpose(true_points_mask, [0, 2, 1])
    not_self = 1. - paddle.eye(dmat_true.shape[1])
    dists_to_score = (
        within_cutoff * true_points_mask * mask_transposed * not_self)

    # Shift unscored distances to be far away.
    dist_l1 = paddle.abs(dmat_true - dmat_predicted)

    # True lDDT uses a number of fixed bins.
    # We ignore the physical plausibility correction to lDDT, though.
    under_half = paddle.cast((dist_l1 < 0.5), 'float32')
    under_one = paddle.cast((dist_l1 < 1.0), 'float32')
    under_two = paddle.cast((dist_l1 < 2.0), 'float32')
    under_four = paddle.cast((dist_l1 < 4.0), 'float32')
    score = 0.25 * (under_half + under_one + under_two + under_four)

    # Normalize over the appropriate axes.
    if per_residue:
        reduce_axes = (-1, )
    else:
        reduce_axes = (-2, -1)
    norm = 1. / (1e-10 + paddle.sum(dists_to_score, axis=reduce_axes))
    weighted_total = paddle.sum(dists_to_score * score, axis=reduce_axes)
    score = norm * (1e-10 + weighted_total)

    return score
def rmi_lower_bound(self, labels_4D, probs_4D):
    """
    Calculate the lower bound of the region mutual information.

    Args:
        labels_4D : [N, C, H, W], dtype=float32
        probs_4D : [N, C, H, W], dtype=float32

    Returns:
        The sum of the per-class values when ``_IS_SUM`` is truthy,
        otherwise their mean.

    Raises:
        AssertionError: if ``labels_4D`` and ``probs_4D`` differ in shape.
        NotImplementedError: if ``self.rmi_pool_stride > 1`` and
            ``self.rmi_pool_way`` is not 0, 1 or 2.
    """
    # The message is built as a string so the AssertionError itself carries
    # the offending shapes (a print() call here would evaluate to None).
    assert labels_4D.shape == probs_4D.shape, (
        "labels_4D and probs_4D must have the same shape, got {} and {}"
        .format(labels_4D.shape, probs_4D.shape))

    p, s = self.rmi_pool_size, self.rmi_pool_stride
    if self.rmi_pool_stride > 1:
        if self.rmi_pool_way == 0:
            # Max pooling.
            labels_4D = F.max_pool2d(labels_4D,
                                     kernel_size=p,
                                     stride=s,
                                     padding=self.kernel_padding)
            probs_4D = F.max_pool2d(probs_4D,
                                    kernel_size=p,
                                    stride=s,
                                    padding=self.kernel_padding)
        elif self.rmi_pool_way == 1:
            # Average pooling.
            labels_4D = F.avg_pool2d(labels_4D,
                                     kernel_size=p,
                                     stride=s,
                                     padding=self.kernel_padding)
            probs_4D = F.avg_pool2d(probs_4D,
                                    kernel_size=p,
                                    stride=s,
                                    padding=self.kernel_padding)
        elif self.rmi_pool_way == 2:
            # Resize: nearest for labels, bilinear for probabilities.
            shape = labels_4D.shape
            new_h, new_w = shape[2] // s, shape[3] // s
            labels_4D = F.interpolate(labels_4D,
                                      size=(new_h, new_w),
                                      mode='nearest')
            probs_4D = F.interpolate(probs_4D,
                                     size=(new_h, new_w),
                                     mode='bilinear',
                                     align_corners=True)
        else:
            raise NotImplementedError("Pool way of RMI is not defined!")

    label_shape = labels_4D.shape
    n, c = label_shape[0], label_shape[1]

    la_vectors, pr_vectors = self.map_get_pairs(labels_4D,
                                                probs_4D,
                                                radius=self.rmi_radius,
                                                is_combine=0)

    # Work in float64 from here on; labels do not receive gradients.
    la_vectors = paddle.reshape(la_vectors, [n, c, self.half_d, -1])
    la_vectors = paddle.cast(la_vectors, dtype='float64')
    la_vectors.stop_gradient = True

    pr_vectors = paddle.reshape(pr_vectors, [n, c, self.half_d, -1])
    pr_vectors = paddle.cast(pr_vectors, dtype='float64')

    # [1, 1, half_d, half_d] identity scaled by _POS_ALPHA; it is added to
    # the matrices below before inversion / log-determinant.
    diag_matrix = paddle.unsqueeze(
        paddle.unsqueeze(paddle.eye(self.half_d), axis=0), axis=0)
    diag_regularizer = paddle.cast(diag_matrix, dtype='float64') * _POS_ALPHA

    # Centre along the last axis, then form the covariance-like products.
    la_vectors = la_vectors - paddle.mean(la_vectors, axis=3, keepdim=True)
    la_cov = paddle.matmul(la_vectors,
                           paddle.transpose(la_vectors, [0, 1, 3, 2]))

    pr_vectors = pr_vectors - paddle.mean(pr_vectors, axis=3, keepdim=True)
    pr_cov = paddle.matmul(pr_vectors,
                           paddle.transpose(pr_vectors, [0, 1, 3, 2]))

    pr_cov_inv = self.inverse(pr_cov + diag_regularizer)

    la_pr_cov = paddle.matmul(la_vectors,
                              paddle.transpose(pr_vectors, [0, 1, 3, 2]))

    # la_cov - la_pr_cov @ pr_cov^{-1} @ la_pr_cov^T
    appro_var = la_cov - paddle.matmul(
        paddle.matmul(la_pr_cov, pr_cov_inv),
        paddle.transpose(la_pr_cov, [0, 1, 3, 2]))

    rmi_now = 0.5 * self.log_det_by_cholesky(appro_var + diag_regularizer)

    # Average over everything but the class axis, back in float32.
    rmi_per_class = paddle.cast(
        paddle.mean(paddle.reshape(rmi_now, [-1, self.num_classes]), axis=0),
        dtype='float32')
    rmi_per_class = paddle.divide(rmi_per_class,
                                  paddle.to_tensor(float(self.half_d)))

    rmi_loss = paddle.sum(rmi_per_class) if _IS_SUM else paddle.mean(
        rmi_per_class)

    return rmi_loss
confidence = initial_const

# Value of k.
k = 40

# Pixel value range.
boxmin, boxmax = -3.0, 3.0

# Number of classes (1000 in the PyTorch implementation).
num_labels = 1000

# Attack target label; it must be one-hot encoded.
# target_label = 288
target_label = 344
tlab = paddle.eye(num_labels)[target_label]  # row `target_label` of identity
print("type of tlab: ", type(tlab))
print()

shape = (1, 3, 224, 224)

# Initial bounds for c.
lower_bound, upper_bound = 0, 1e10
c = initial_const

# The best l2, score, and image attack.
o_bestl2, o_bestscore = 1e10, -1
o_bestattack = [np.zeros(shape)]
def test_num_columns_type_check():
    """Call ``paddle.eye`` with a non-integer ``num_columns`` value (5.2).

    NOTE(review): presumably the caller expects this call to raise a type
    error; the assertion that checks it is not visible here.
    """
    fractional_columns = 5.2
    paddle.eye(10, num_columns=fractional_columns, dtype="int64")
def forward_single(self, emb, instance, kernel, training_mask, bboxes):
    """Compute the embedding loss for a single sample.

    The loss is the weighted sum of an aggregation term (pulls each
    instance's embeddings towards the mean embedding of its kernel region),
    a discrimination term (pushes the mean embeddings of different instances
    apart) and a small regularisation term on the mean embeddings.

    Args:
        emb: embedding map; reshaped to ``[self.feature_dim, -1]``.
        instance: instance label map; flattened to 1-D.
        kernel: kernel map; values ``> 0.5`` count as kernel pixels.
        training_mask: mask; values ``> 0.5`` are kept for training.
        bboxes: unused, kept for interface compatibility.

    Returns:
        ``0`` when at most one distinct label is present in the kernel
        region, otherwise the loss tensor.

    NOTE(review): the code assumes label 0 (presumably background) is the
    first entry of the sorted unique labels -- it skips ``lb == 0`` and
    drops index 0 from the aggregation and discrimination terms.
    """
    # Paddle tensors have no ``.long()``; cast explicitly instead.
    training_mask = (training_mask > 0.5).astype('int64')
    kernel = (kernel > 0.5).astype('int64')
    instance = instance * training_mask
    # Shapes are passed as lists: ``(-1)`` is just the int -1, not a tuple.
    instance_kernel = paddle.reshape(instance * kernel, [-1])
    instance = paddle.reshape(instance, [-1])
    emb = paddle.reshape(emb, [self.feature_dim, -1])

    unique_labels = paddle.unique(instance_kernel)
    # ``Tensor.size`` is not callable in Paddle; read the length from shape.
    num_instance = unique_labels.shape[0]
    if num_instance <= 1:
        return 0

    # Mean embedding of every instance, taken over its kernel pixels.
    emb_mean = paddle.zeros([self.feature_dim, num_instance],
                            dtype='float32')
    for i, lb in enumerate(unique_labels):
        if lb == 0:
            continue
        ind_k = instance_kernel == lb
        emb_mean[:, i] = paddle.mean(emb[:, ind_k], axis=1)

    # Aggregation term: distance of each instance pixel to its mean,
    # ignoring distances below delta_v.
    l_agg = paddle.zeros([num_instance], dtype='float32')
    for i, lb in enumerate(unique_labels):
        if lb == 0:
            continue
        ind = instance == lb
        emb_ = emb[:, ind]
        dist = (emb_ - emb_mean[:, i:i + 1]).norm(p=2, axis=0)
        dist = F.relu(dist - self.delta_v)**2
        l_agg[i] = paddle.mean(paddle.log(dist + 1.0))
    l_agg = paddle.mean(l_agg[1:])

    if num_instance > 2:
        # Rows are per-instance mean embeddings: [num_instance, feature_dim].
        emb_rows = paddle.transpose(emb_mean, perm=[1, 0])
        # Row r holds instance (r % num_instance) ...
        emb_interleave = paddle.tile(emb_rows,
                                     repeat_times=[num_instance, 1])
        # ... while here row r holds instance (r // num_instance), so the
        # difference below enumerates every ordered pair of instances.
        # Tiling along columns before the reshape is what produces this
        # ordering; tiling along rows would make both operands identical.
        emb_band = paddle.reshape(
            paddle.tile(emb_rows, repeat_times=[1, num_instance]),
            [-1, self.feature_dim])

        # 1 for pairs of distinct instances, 0 on the diagonal and for any
        # pair that involves index 0.
        mask = 1 - paddle.eye(num_instance, dtype='int32')
        mask = paddle.reshape(mask, [-1, 1])
        mask = paddle.tile(mask, repeat_times=[1, self.feature_dim])
        mask = paddle.reshape(mask, [num_instance, num_instance, -1])
        mask[0, :, :] = 0
        mask[:, 0, :] = 0
        mask = paddle.reshape(mask, [num_instance * num_instance, -1])

        dist = emb_interleave - emb_band
        dist = paddle.reshape(dist[mask > 0],
                              [-1, self.feature_dim]).norm(p=2, axis=1)
        # Penalise pairs of means that are closer than 2 * delta_d.
        dist = F.relu(2 * self.delta_d - dist)**2
        l_dis = paddle.mean(paddle.log(dist + 1.0))
    else:
        l_dis = 0

    l_agg = self.weights[0] * l_agg
    l_dis = self.weights[1] * l_dis
    l_reg = paddle.mean(paddle.log(paddle.norm(emb_mean, 2, 0) + 1.0)) * 0.001
    loss = l_agg + l_dis + l_reg
    return loss
def eye(n, m):
    """Wrap the result of ``paddle.eye(n, m)`` in a ``Tensor``."""
    identity = paddle.eye(n, m)
    return Tensor(identity)