def forward(self, input):
        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })

        q = self.q_proj(input)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        k = self.k_proj(input)
        v = self.v_proj(input)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        # scaled dot-product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        if self.attn_mask is not None:
            product = product + self.attn_mask

        weights = F.softmax(product)

        if self.dropout_ratio:
            weights = F.dropout(weights,
                                self.dropout_ratio,
                                training=self.training,
                                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)
        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        return out
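For context, the sharding annotations above rely on two module-level globals. The setup below is a minimal sketch, assuming the dist_attr-style auto-parallel API that the snippet uses; the import path and the 2x2 process mesh are illustrative assumptions, not part of the original code.

# Minimal setup sketch for the annotations above (assumed, not from the source).
import paddle.distributed.auto_parallel as auto

# With "dp_mp", mesh axis 0 is the data-parallel axis and mesh axis 1 is the
# model-parallel axis, so dims_mapping [0, -1, -1] shards the input's batch
# dimension over axis 0 and [-1, 1] shards a weight's second dim over axis 1.
_global_parallel_strategy = "dp_mp"
_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]])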
Example #2
    def greedy_search(self,
                      src_word,
                      max_len=256,
                      waitk=-1,
                      caches=None,
                      bos_id=None):
        """
        greedy_search uses streaming reader. It doesn't need calling
        encoder many times, an a sub-sentence just needs calling encoder once.
        So, it needsprevious state(caches) and last one of generated
        tokens id last time.
        """
        src_max_len = paddle.shape(src_word)[-1]
        base_attn_bias = paddle.cast(
            src_word == self.bos_id,
            dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
        src_slf_attn_bias = base_attn_bias
        src_slf_attn_bias.stop_gradient = True
        trg_src_attn_bias = paddle.tile(base_attn_bias, [1, 1, 1, 1])
        src_pos = paddle.cast(src_word != self.bos_id,
                              dtype="int64") * paddle.arange(start=0,
                                                             end=src_max_len)
        src_emb = self.src_word_embedding(src_word)
        src_pos_emb = self.src_pos_embedding(src_pos)
        src_emb = src_emb + src_pos_emb
        enc_input = F.dropout(
            src_emb, p=self.dropout,
            training=self.training) if self.dropout else src_emb
        enc_outputs = [self.encoder(enc_input, src_mask=src_slf_attn_bias)]

        # constant number
        batch_size = enc_outputs[-1].shape[0]
        max_len = (enc_outputs[-1].shape[1] +
                   20) if max_len is None else max_len
        end_token_tensor = paddle.full(shape=[batch_size, 1],
                                       fill_value=self.eos_id,
                                       dtype="int64")

        predict_ids = []
        log_probs = paddle.full(shape=[batch_size, 1],
                                fill_value=0,
                                dtype="float32")
        if not bos_id:
            trg_word = paddle.full(shape=[batch_size, 1],
                                   fill_value=self.bos_id,
                                   dtype="int64")
        else:
            trg_word = paddle.full(shape=[batch_size, 1],
                                   fill_value=bos_id,
                                   dtype="int64")

        # init states (caches) for transformer
        if not caches:
            caches = self.decoder.gen_cache(enc_outputs[-1], do_zip=False)

        for i in range(max_len):
            trg_pos = paddle.full(shape=trg_word.shape,
                                  fill_value=i,
                                  dtype="int64")
            trg_emb = self.trg_word_embedding(trg_word)
            trg_pos_emb = self.trg_pos_embedding(trg_pos)
            trg_emb = trg_emb + trg_pos_emb
            dec_input = F.dropout(
                trg_emb, p=self.dropout,
                training=self.training) if self.dropout else trg_emb

            if waitk < 0 or i >= len(enc_outputs):
                # if decoding the full sentence, or this step is beyond the last
                # source step, read the whole source
                _e = enc_outputs[-1]
                dec_output, caches = self.decoder(
                    dec_input, [_e], None,
                    trg_src_attn_bias[:, :, :, :_e.shape[1]], caches)
            else:
                _e = enc_outputs[i]
                dec_output, caches = self.decoder(
                    dec_input, [_e], None,
                    trg_src_attn_bias[:, :, :, :_e.shape[1]], caches)

            dec_output = paddle.reshape(dec_output,
                                        shape=[-1, dec_output.shape[-1]])

            logits = self.linear(dec_output)
            step_log_probs = paddle.log(F.softmax(logits, axis=-1))
            log_probs = paddle.add(x=step_log_probs, y=log_probs)
            scores = log_probs
            topk_scores, topk_indices = paddle.topk(x=scores, k=1)

            finished = paddle.equal(topk_indices, end_token_tensor)
            trg_word = topk_indices
            log_probs = topk_scores

            predict_ids.append(topk_indices)

            if paddle.all(finished).numpy():
                break

        predict_ids = paddle.stack(predict_ids, axis=0)
        finished_seq = paddle.transpose(predict_ids, [1, 2, 0])
        finished_scores = topk_scores

        return finished_seq, finished_scores, caches
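A hypothetical streaming (wait-k) decoding loop built on the greedy_search above might look like the following sketch; `model` and the incremental `src_word` chunks are assumptions, not part of the original snippet.

# Hypothetical wait-k usage of greedy_search: the caches and the id of the last
# generated token are carried over between calls while the source prefix grows.
caches, last_token = None, None
for src_word in incremental_source_chunks:   # each chunk extends the source prefix
    seq, scores, caches = model.greedy_search(
        src_word, max_len=256, waitk=3, caches=caches, bos_id=last_token)
    last_token = int(seq[0, 0, -1])          # feed the last prediction back in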
Example #3
    def get_prediction(self, score, delta):
        bbox_prob = F.softmax(score)
        return delta, bbox_prob
Example #4
def main(args):
    """The main function 

    Args:
        args (args): configs

    Raises:
        ValueError: if the args is invalid.
    """
    model_config = json.load(open(args.model_config, 'r'))
    paddle.set_device("gpu" if args.use_cuda else "cpu")
    rank = 0
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        # the current process id
        rank = paddle.distributed.get_rank()

    train_dataset = SecondaryStructureDataset(base_path=args.train_data,
                                              mode='train',
                                              num_classes=3)
    train_loader = create_dataloader(
        train_dataset,
        mode='train',
        batch_size=args.batch_size,
        pad_token_id=ProteinTokenizer.padding_token_id)
    test_dataset = SecondaryStructureDataset(base_path=args.train_data,
                                             mode='test',
                                             num_classes=3)
    test_loader = create_dataloader(
        test_dataset,
        mode='test',
        batch_size=args.batch_size,
        pad_token_id=ProteinTokenizer.padding_token_id)

    if model_config["model_type"] == "transformer":
        model = TransformerSeqClassificationModel(
            vocab_size=len(ProteinTokenizer.vocab),
            num_class=model_config['class_num'],
            emb_dim=model_config['hidden_size'])
    elif model_config["model_type"] == "lstm":
        model = LstmSeqClassificationModel(vocab_size=len(
            ProteinTokenizer.vocab),
            num_class=model_config['class_num'],
            emb_dim=model_config['hidden_size'])
    else:
        raise ValueError("Model type {} is not available".format(model_config["model_type"]))

    if os.path.exists(args.init_model):
        param_state_dict = paddle.load(args.init_model)
        model.set_dict(param_state_dict)
        print("Loaded model parameters from %s" % args.init_model)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.NoamDecay(
            1 / (args.warmup_steps * (args.lr ** 2)), args.warmup_steps)
    else:
        lr_scheduler = args.lr

    max_grad_norm = 0.1
    grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=max_grad_norm)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=model_config['weight_decay'],
        grad_clip=grad_clip)

    criterion = paddle.nn.CrossEntropyLoss(ignore_index=-1)
    metric = ClassificationMetric()

    for epoch in range(args.epoch):
        loss_sum = 0
        model.train()
        start_time = time.time()
        for i, (texts, seq_lens, labels) in enumerate(train_loader, start=1):
            labels = labels.reshape([-1, 1])
            mask = labels != -1
            logits = model(texts, seq_lens)
            logits = logits.reshape([-1, logits.shape[-1]])
            # mask out the effect of 'PAD' tokens
            mask = paddle.cast(mask, dtype='float32')
            inf_tensor = paddle.full(shape=mask.shape,
                                     dtype='float32',
                                     fill_value=-1. * 1e12)
            logits = paddle.multiply(logits, mask) + paddle.multiply(
                inf_tensor, (1 - mask))
            probs = F.softmax(logits, axis=1)
            loss = criterion(probs, labels)
            loss_sum += loss.numpy()
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            probs = probs.numpy()
            labels = labels.numpy()
            metric.update(probs, labels)
            if i % 10 == 0:
                print('epoch %d, step %d, avg loss %.5f' %
                      (epoch, i, loss_sum / 10))
                metric.show()
                loss_sum = 0
                metric.clear()

        if rank == 0:
            print('Test:')
            avg_loss = eval(model, test_loader, criterion, metric)
            print("Average loss: %.5f" % avg_loss)

            print("Save model epoch%d." % epoch)
            param_path = os.path.join(args.model, 'epoch%d' % epoch,
                                      'saved_params.pdparams')
            opt_path = os.path.join(args.model, 'epoch%d' % epoch,
                                    'saved_opt.pdopt')
            paddle.save(model.state_dict(), param_path)
            paddle.save(optimizer.state_dict(), opt_path)
Example #5
def do_predict():
    paddle.set_device(args.device)

    no_entity_label = "O"
    ignore_label = -1

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoint {} does not exist".format(args.init_ckpt))
    else:
        state_dict = paddle.load(args.init_ckpt)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        sent = sent["text"].replace(" ", "\002")
        input_ids, token_type_ids, seq_len = convert_example_to_feature(
            [list(sent), []],
            tokenizer,
            max_seq_len=args.max_seq_len,
            is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]
            ),  # token_type_ids
        Stack()  # sequence lens
    ): fn(samples)
    # Separate the data into batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]
    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids, seq_lens = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=-1)
        probs_ids = paddle.argmax(probs, -1).numpy()
        probs = probs.numpy()
        for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(),
                                          seq_lens.tolist()):
            prob_one = [
                p_list[index][pid]
                for index, pid in enumerate(p_ids[1:seq_len - 1])
            ]
            label_one = [id2label[pid] for pid in p_ids[1:seq_len - 1]]
            results.append({"probs": prob_one, "labels": label_one})
    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
Example #6
    def embedded_gaussian(self, theta_x, phi_x):
        pairwise_weight = paddle.matmul(theta_x, phi_x)
        if self.use_scale:
            pairwise_weight /= theta_x.shape[-1]**0.5
        pairwise_weight = F.softmax(pairwise_weight, -1)
        return pairwise_weight
    def forward(self,
                query,
                key,
                value,
                attn_mask=None,
                use_cache=False,
                cache=None):
        r"""
        Applies multi-head attention to map queries and a set of key-value pairs
        to outputs.
        """
        key = query if key is None else key
        value = query if value is None else value
        # compute q, k, v
        if use_cache is False:
            if self.fuse:
                q, k, v = self._fuse_prepare_qkv(query)
            else:
                q, k, v = self._prepare_qkv(query, key, value, use_cache,
                                            cache)
        else:
            q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,
                                               cache)
        # scaled dot-product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        if attn_mask is not None:
            product = product + attn_mask

        weights = F.softmax(product)
        if self.dropout:
            weights = F.dropout(weights,
                                self.dropout,
                                training=self.training,
                                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        outs = [out]
        if self.need_weights:
            outs.append(weights)
        if use_cache:
            outs.append(cache)
        return out if len(outs) == 1 else tuple(outs)
Example #8
    def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None):
        qlen, bsz = w.shape[1], w.shape[0]

        if mems is not None:
            cat = paddle.concat([mems, w], 1)
            if self.normalize_before:
                w_heads = self.qkv_proj(self.layer_norm(cat))
            else:
                w_heads = self.qkv_proj(cat)
            w_head_q, w_head_k, w_head_v = paddle.chunk(w_heads,
                                                        chunks=3,
                                                        axis=-1)

            w_head_q = w_head_q[-qlen:]
        else:
            if self.normalize_before:
                w_heads = self.qkv_proj(self.layer_norm(w))
            else:
                w_heads = self.qkv_proj(w)
            w_head_q, w_head_k, w_head_v = paddle.chunk(w_heads,
                                                        chunks=3,
                                                        axis=-1)

        klen = w_head_k.shape[1]

        w_head_q = paddle.reshape(w_head_q,
                                  shape=[
                                      w_head_q.shape[0], w_head_q.shape[1],
                                      self.n_head, self.d_head
                                  ])
        w_head_k = paddle.reshape(w_head_k,
                                  shape=[
                                      w_head_k.shape[0], w_head_k.shape[1],
                                      self.n_head, self.d_head
                                  ])
        w_head_v = paddle.reshape(w_head_v,
                                  shape=[
                                      w_head_v.shape[0], w_head_v.shape[1],
                                      self.n_head, self.d_head
                                  ])

        if klen > r_emb.shape[0]:
            r_emb_pad = r_emb[0:1].expand(klen - r_emb.shape[0], -1, -1)
            r_emb = paddle.concat([r_emb_pad, r_emb], 0)
            r_bias_pad = r_bias[0:1].expand(klen - r_bias.shape[0], -1)
            r_bias = paddle.concat([r_bias_pad, r_bias], 0)
        else:
            r_emb = r_emb[-klen:]
            r_bias = r_bias[-klen:]

        rw_head_q = w_head_q + r_w_bias.unsqueeze([0])

        AC = einsum('bind,bjnd->bnij', rw_head_q, w_head_k)
        r_emb = r_emb.unsqueeze([0]).expand([bsz, -1, -1, -1])
        B_ = einsum('bind,bjnd->bnij', w_head_q, r_emb)
        D_ = r_bias.unsqueeze([0, 2])
        BD = self._rel_shift(B_ + D_)

        attn_score = AC + BD
        attn_score = attn_score * self.scale

        if attn_mask is not None:
            attn_score = attn_score - 1e30 * attn_mask

        attn_prob = F.softmax(attn_score, axis=-1)
        attn_prob = self.attn_drop(attn_prob)

        attn_vec = einsum('bnij,bjnd->bind', attn_prob, w_head_v)

        attn_vec = paddle.reshape(attn_vec,
                                  shape=[
                                      attn_vec.shape[0], attn_vec.shape[1],
                                      self.n_head * self.d_head
                                  ])

        attn_out = self.o_net(attn_vec)
        attn_out = self.drop(attn_out)

        if self.normalize_before:
            output = w + attn_out
        else:
            output = self.layer_norm(w + attn_out)

        return output
Example #9
    def forward(self, logit, label):
        """
        Forward computation.

        Args:
            logit (Tensor): Logit tensor, the data type is float32, float64. Shape is
                (N, C), where C is number of classes, and if shape is more than 2D, this
                is (N, C, D1, D2,..., Dk), k >= 1.
            label (Tensor): Label tensor, the data type is int64. Shape is (N), where each
                value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is
                (N, D1, D2,..., Dk), k >= 1.
        """
        if len(label.shape) != len(logit.shape):
            label = paddle.unsqueeze(label, 1)

        # get the label after ohem
        n, c, h, w = logit.shape
        label = label.reshape((-1, ))
        valid_mask = (label != self.ignore_index).astype('int64')
        num_valid = valid_mask.sum()
        label = label * valid_mask

        prob = F.softmax(logit, axis=1)
        prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1))

        if self.min_kept < num_valid and num_valid > 0:
            # make ignored positions take a value greater than 1
            prob = prob + (1 - valid_mask)

            # get the prob of relevant label
            label_onehot = F.one_hot(label, c)
            label_onehot = label_onehot.transpose((1, 0))
            prob = prob * label_onehot
            prob = paddle.sum(prob, axis=0)

            threshold = self.thresh
            if self.min_kept > 0:
                index = prob.argsort()
                threshold_index = index[min(len(index), self.min_kept) - 1]
                threshold_index = int(threshold_index.numpy()[0])
                if prob[threshold_index] > self.thresh:
                    threshold = prob[threshold_index]
                kept_mask = (prob < threshold).astype('int64')
                label = label * kept_mask
                valid_mask = valid_mask * kept_mask

        # set the invalid region to the ignore index
        label = label + (1 - valid_mask) * self.ignore_index

        label = label.reshape((n, 1, h, w))
        valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32')
        loss = F.softmax_with_cross_entropy(logit,
                                            label,
                                            ignore_index=self.ignore_index,
                                            axis=1)
        loss = loss * valid_mask
        avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS)

        label.stop_gradient = True
        valid_mask.stop_gradient = True
        return avg_loss
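The hard-example selection above raises the probability threshold only when the min_kept-th hardest pixel is already above `thresh`. The toy run below is a sketch with made-up numbers, assuming min_kept=2 and thresh=0.7.

# Toy sketch of the OHEM threshold selection used in the loss above.
import paddle

prob = paddle.to_tensor([0.9, 0.2, 0.8, 0.5])   # prob of the true class per pixel
min_kept, thresh = 2, 0.7

index = prob.argsort()                           # ascending: hardest pixels first
threshold_index = index[min(len(index), min_kept) - 1]
threshold = thresh
if prob[threshold_index] > thresh:               # raise the threshold only if needed
    threshold = prob[threshold_index]
kept_mask = (prob < threshold).astype('int64')   # here: [0, 1, 0, 1], the two hardest pixels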
Example #10
    def forward(self, hidden, target, keep_order=False):
        assert (hidden.shape[0] == target.shape[0])

        if self.num_clusters == 0:
            logit = self._compute_logits(hidden, self.out_layers_weight[0],
                                         self.out_layers_bias[0],
                                         self.out_projs[0])
            nll = -paddle.log(F.softmax(logit, axis=-1))
            idx = paddle.concat([
                paddle.arange(0, nll.shape[0]).unsqueeze([1]),
                target.unsqueeze(1)
            ],
                                axis=1)
            nll = paddle.gather_nd(nll, idx)
        else:
            weights, biases = [], []
            for i in range(len(self.cutoffs)):
                if self.div_val == 1:
                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
                    weight_i = self.out_layers_weight[0][l_idx:r_idx]
                    bias_i = self.out_layers_bias[0][l_idx:r_idx]
                else:
                    weight_i = self.out_layers_weight[i]
                    bias_i = self.out_layers_bias[i]

                if i == 0:
                    weight_i = paddle.concat([weight_i, self.cluster_weight],
                                             axis=0)
                    bias_i = paddle.concat([bias_i, self.cluster_bias], axis=0)

                weights.append(weight_i)
                biases.append(bias_i)

            head_weight, head_bias, head_proj = weights[0], biases[
                0], self.out_projs[0]

            head_logit = self._compute_logits(hidden, head_weight, head_bias,
                                              head_proj)
            head_logprob = paddle.log(F.softmax(head_logit, axis=-1))

            nll = paddle.zeros_like(target, dtype=hidden.dtype)

            offset = 0
            cutoff_values = [0] + self.cutoffs
            for i in range(len(cutoff_values) - 1):
                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]

                mask_i = paddle.cast(
                    target >= l_idx,
                    dtype=paddle.get_default_dtype()) * paddle.cast(
                        target < r_idx, dtype="int64")
                indices_i = paddle.nonzero(mask_i).squeeze([1])

                if paddle.numel(indices_i) == 0:
                    continue
                target_i = paddle.gather(target, indices_i, axis=0) - l_idx
                head_logprob_i = paddle.gather(head_logprob, indices_i, axis=0)
                if i == 0:
                    target_i_idx = paddle.concat([
                        paddle.arange(0, head_logprob_i.shape[0]).unsqueeze(
                            [1]),
                        target_i.unsqueeze([1])
                    ],
                                                 axis=1)
                    logprob_i = head_logprob_i.gather_nd(target_i_idx)
                else:
                    weight_i, bias_i, proj_i = weights[i], biases[
                        i], self.out_projs[i].weight if self.out_projs[
                            i] is not None else None

                    hidden_i = paddle.gather(hidden, indices_i, axis=0)

                    tail_logit_i = self._compute_logits(
                        hidden_i, weight_i, bias_i, proj_i)
                    tail_logprob_i = paddle.log(
                        F.softmax(tail_logit_i, axis=-1))

                    target_i_idx = paddle.concat([
                        paddle.arange(0, tail_logprob_i.shape[0]).unsqueeze(
                            [1]),
                        target_i.unsqueeze([1])
                    ],
                                                 axis=1)
                    logprob_i = tail_logprob_i.gather_nd(target_i_idx)

                    logprob_i = head_logprob_i[:, -i] + logprob_i

                if self.keep_order or keep_order:
                    nll = paddle.scatter(nll, indices_i, -logprob_i)
                else:
                    index = paddle.arange(offset, offset + logprob_i.shape[0],
                                          1)
                    nll = paddle.scatter(nll, index, -logprob_i)

                offset += logprob_i.shape[0]

        return nll
Example #11
    def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None):
        qlen, rlen, bsz = w.shape[1], r.shape[1], w.shape[0]

        if mems is not None:
            cat = paddle.concat([mems, w], axis=1)
            if self.normalize_before:
                w_heads = self.qkv_proj(self.layer_norm(cat))
            else:
                w_heads = self.qkv_proj(cat)
            r_head_k = self.r_proj(r)

            w_head_q, w_head_k, w_head_v = paddle.chunk(w_heads,
                                                        chunks=3,
                                                        axis=-1)

            w_head_q = w_head_q[:, -qlen:, :]
        else:
            if self.normalize_before:
                w_heads = self.qkv_proj(self.layer_norm(w))
            else:
                w_heads = self.qkv_proj(w)
            r_head_k = self.r_proj(r)

            w_head_q, w_head_k, w_head_v = paddle.chunk(w_heads,
                                                        chunks=3,
                                                        axis=-1)

        klen = w_head_k.shape[1]

        w_head_q = paddle.reshape(w_head_q,
                                  shape=[bsz, qlen, self.n_head, self.d_head])
        w_head_k = paddle.reshape(w_head_k,
                                  shape=[bsz, klen, self.n_head, self.d_head])
        w_head_v = paddle.reshape(w_head_v,
                                  shape=[bsz, klen, self.n_head, self.d_head])

        r_head_k = paddle.reshape(r_head_k,
                                  shape=[bsz, rlen, self.n_head, self.d_head])

        rw_head_q = w_head_q + r_w_bias

        AC = einsum('bind,bjnd->bnij', rw_head_q, w_head_k)
        rr_head_q = w_head_q + r_r_bias

        BD = einsum('bind,bjnd->bnij', rr_head_q, r_head_k)
        BD = self._rel_shift(BD)

        attn_score = AC + BD
        attn_score = attn_score * self.scale

        if attn_mask is not None:
            attn_score = attn_score - 1e30 * attn_mask

        attn_prob = F.softmax(attn_score, axis=-1)
        attn_prob = self.attn_drop(attn_prob)

        attn_vec = einsum('bnij,bjnd->bind', attn_prob, w_head_v)

        attn_vec = paddle.reshape(attn_vec,
                                  shape=[
                                      attn_vec.shape[0], attn_vec.shape[1],
                                      self.n_head * self.d_head
                                  ])

        attn_out = self.o_proj(attn_vec)
        attn_out = self.drop(attn_out)

        if self.normalize_before:
            output = w + attn_out
        else:
            output = self.layer_norm(w + attn_out)

        return output
Example #12
def do_train():
    paddle.set_device(args.device)
    set_seed(args.seed)

    train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])

    model = FasterErnieForSequenceClassification.from_pretrained(
        'ernie-1.0',
        num_classes=len(train_ds.label_list),
        max_seq_len=args.max_seq_length)

    train_data_loader = create_dataloader(
        train_ds, mode='train', batch_size=args.batch_size)
    dev_data_loader = create_dataloader(
        dev_ds, mode='dev', batch_size=args.batch_size)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    global_step = 0
    tic_train = time.time()
    total_train_time = 0
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            texts, labels = batch["text"], batch["label"]
            texts = to_tensor(texts)
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["fused_feedforward", "fused_attention"]):
                logits, predictions = model(texts)
                loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(logits, labels)
            metric.update(correct)
            acc = metric.accumulate()

            if args.use_amp:
                scaler.scale(loss).backward()
                scaler.minimize(optimizer, loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            global_step += 1
            if global_step % args.logging_steps == 0:
                time_diff = time.time() - tic_train
                total_train_time += time_diff
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       args.logging_steps / time_diff))
                tic_train = time.time()
            if global_step % args.save_steps == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model.save_pretrained(save_dir)
                tic_train = time.time()

    print("Speed: %.2f steps/s" % (global_step / total_train_time))
Example #13
    def run(self):
        args = self.args
        paddle.set_device(args.device)
        rank = paddle.distributed.get_rank()
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.init_parallel_env()

        set_seed(args.seed)

        train_ds = load_dataset(read, file_path=args.train_path, lazy=False)
        dev_ds = load_dataset(read, file_path=args.dev_path, lazy=False)

        model = paddlenlp.transformers.BertForSequenceClassification.from_pretrained(
            'bert-base-chinese', num_classes=2)
        tokenizer = paddlenlp.transformers.BertTokenizer.from_pretrained(
            'bert-base-chinese')

        trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             max_seq_length=args.max_seq_length)
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype="int64")  # label
        ): [data for data in fn(samples)]
        train_data_loader = create_dataloader(train_ds,
                                              mode='train',
                                              batch_size=args.batch_size,
                                              batchify_fn=batchify_fn,
                                              trans_fn=trans_func)
        dev_data_loader = create_dataloader(dev_ds,
                                            mode='dev',
                                            batch_size=args.batch_size,
                                            batchify_fn=batchify_fn,
                                            trans_fn=trans_func)

        if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
            state_dict = paddle.load(args.init_from_ckpt)
            model.set_dict(state_dict)
        model = paddle.DataParallel(model)

        num_training_steps = len(train_data_loader) * args.epochs

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)

        criterion = paddle.nn.loss.CrossEntropyLoss()
        metric = paddle.metric.Accuracy()

        global_step = 0
        tic_train = time.time()
        for epoch in range(1, args.epochs + 1):
            for step, batch in enumerate(train_data_loader, start=1):
                input_ids, token_type_ids, labels = batch
                logits = model(input_ids, token_type_ids)
                probs = functional.softmax(logits, axis=1)
                loss = criterion(probs, labels)

                correct = metric.compute(probs, labels)  # labels can be class indices or one-hot vectors
                metric.update(correct)  # update the count of correct predictions and the running total
                acc = metric.accumulate()

                global_step += 1
                if global_step % 10 == 0 and rank == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss, acc, 10 /
                           (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % 100 == 0 and rank == 0:  # save the model every 100 steps
                    save_dir = os.path.join(args.save_dir,
                                            "model_%d" % global_step)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    self.evaluate(model, criterion, metric, dev_data_loader)
                    model._layers.save_pretrained(save_dir)
                    tokenizer.save_pretrained(save_dir)
    def forward(self, input_ids, position_ids):
        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })

        input_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.word_embeddings.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.word_embeddings.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        embeddings = input_embeddings + position_embeddings
        embeddings = self.dropout1(embeddings)

        # Pre-norm
        target = self.norm1(embeddings)

        # The following is the attention part
        q = self.q_proj(target)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        k = self.k_proj(target)
        v = self.v_proj(target)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        # scaled dot-product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        if self.attn_mask is not None:
            product = product + self.attn_mask

        weights = F.softmax(product)

        if self.dropout_ratio:
            weights = F.dropout(weights,
                                self.dropout_ratio,
                                training=self.training,
                                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # Add residual
        residual = embeddings + self.dropout2(out)

        # Pre-norm
        out0 = self.norm2(residual)

        # The following is the MLP part
        out1 = self.linear0(out0)
        out2 = F.gelu(out1, approximate=True)
        out3 = self.linear1(out2)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # Add residual
        final = residual + self.dropout3(out3)
        return final
Example #15
def evaluate(model,
             eval_dataset,
             aug_eval=False,
             scales=1.0,
             flip_horizontal=False,
             flip_vertical=False,
             is_slide=False,
             stride=None,
             crop_size=None,
             num_workers=0,
             print_detail=True,
             auc_roc=False):
    """
    Launch evaluation.

    Args:
        model (nn.Layer): A semantic segmentation model.
        eval_dataset (paddle.io.Dataset): Used to read and process validation datasets.
        aug_eval (bool, optional): Whether to use multi-scale and flip augmentation for evaluation. Default: False.
        scales (list|float, optional): Scales for augmentation. It is valid when `aug_eval` is True. Default: 1.0.
        flip_horizontal (bool, optional): Whether to use horizontal flip augmentation. It is valid when `aug_eval` is True. Default: False.
        flip_vertical (bool, optional): Whether to use vertical flip augmentation. It is valid when `aug_eval` is True. Default: False.
        is_slide (bool, optional): Whether to evaluate by sliding window. Default: False.
        stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height.
            It should be provided when `is_slide` is True.
        crop_size (tuple|list, optional):  The crop size of sliding window, the first is width and the second is height.
            It should be provided when `is_slide` is True.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        print_detail (bool, optional): Whether to print detailed information about the evaluation process. Default: True.
        auc_roc (bool, optional): Whether to add the auc_roc metric. Default: False.

    Returns:
        float: The mIoU of validation datasets.
        float: The accuracy of validation datasets.
    """
    model.eval()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank
    if nranks > 1:
        # Initialize parallel environment if not done.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
    batch_sampler = paddle.io.DistributedBatchSampler(
        eval_dataset, batch_size=1, shuffle=False, drop_last=False)
    loader = paddle.io.DataLoader(
        eval_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    total_iters = len(loader)
    intersect_area_all = 0
    pred_area_all = 0
    label_area_all = 0
    logits_all = None
    label_all = None

    if print_detail:
        logger.info(
            "Start evaluating (total_samples: {}, total_iters: {})...".format(
                len(eval_dataset), total_iters))
    #TODO(chenguowei): fix log print error with multi-gpus
    progbar_val = progbar.Progbar(
        target=total_iters, verbose=1 if nranks < 2 else 2)
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    batch_start = time.time()
    with paddle.no_grad():
        for iter, (im, label) in enumerate(loader):
            reader_cost_averager.record(time.time() - batch_start)
            label = label.astype('int64')

            ori_shape = label.shape[-2:]
            if aug_eval:
                pred, logits = infer.aug_inference(
                    model,
                    im,
                    ori_shape=ori_shape,
                    transforms=eval_dataset.transforms.transforms,
                    scales=scales,
                    flip_horizontal=flip_horizontal,
                    flip_vertical=flip_vertical,
                    is_slide=is_slide,
                    stride=stride,
                    crop_size=crop_size)
            else:
                pred, logits = infer.inference(
                    model,
                    im,
                    ori_shape=ori_shape,
                    transforms=eval_dataset.transforms.transforms,
                    is_slide=is_slide,
                    stride=stride,
                    crop_size=crop_size)

            intersect_area, pred_area, label_area = metrics.calculate_area(
                pred,
                label,
                eval_dataset.num_classes,
                ignore_index=eval_dataset.ignore_index)

            # Gather from all ranks
            if nranks > 1:
                intersect_area_list = []
                pred_area_list = []
                label_area_list = []
                paddle.distributed.all_gather(intersect_area_list,
                                              intersect_area)
                paddle.distributed.all_gather(pred_area_list, pred_area)
                paddle.distributed.all_gather(label_area_list, label_area)

                # Some images have already been evaluated and should be eliminated in the last iter
                if (iter + 1) * nranks > len(eval_dataset):
                    valid = len(eval_dataset) - iter * nranks
                    intersect_area_list = intersect_area_list[:valid]
                    pred_area_list = pred_area_list[:valid]
                    label_area_list = label_area_list[:valid]

                for i in range(len(intersect_area_list)):
                    intersect_area_all = intersect_area_all + intersect_area_list[
                        i]
                    pred_area_all = pred_area_all + pred_area_list[i]
                    label_area_all = label_area_all + label_area_list[i]
            else:
                intersect_area_all = intersect_area_all + intersect_area
                pred_area_all = pred_area_all + pred_area
                label_area_all = label_area_all + label_area

                if auc_roc:
                    logits = F.softmax(logits, axis=1)
                    if logits_all is None:
                        logits_all = logits.numpy()
                        label_all = label.numpy()
                    else:
                        logits_all = np.concatenate(
                            [logits_all, logits.numpy()])  # (KN, C, H, W)
                        label_all = np.concatenate([label_all, label.numpy()])

            batch_cost_averager.record(
                time.time() - batch_start, num_samples=len(label))
            batch_cost = batch_cost_averager.get_average()
            reader_cost = reader_cost_averager.get_average()

            if local_rank == 0 and print_detail:
                progbar_val.update(iter + 1, [('batch_cost', batch_cost),
                                              ('reader cost', reader_cost)])
            reader_cost_averager.reset()
            batch_cost_averager.reset()
            batch_start = time.time()

    class_iou, miou = metrics.mean_iou(intersect_area_all, pred_area_all,
                                       label_area_all)
    class_acc, acc = metrics.accuracy(intersect_area_all, pred_area_all)
    kappa = metrics.kappa(intersect_area_all, pred_area_all, label_area_all)
    class_dice, mdice = metrics.dice(intersect_area_all, pred_area_all,
                                     label_area_all)

    if auc_roc:
        auc_roc = metrics.auc_roc(
            logits_all, label_all, num_classes=eval_dataset.num_classes)
        auc_infor = ' Auc_roc: {:.4f}'.format(auc_roc)

    if print_detail:
        infor = "[EVAL] #Images: {} mIoU: {:.4f} Acc: {:.4f} Kappa: {:.4f} Dice: {:.4f}".format(
            len(eval_dataset), miou, acc, kappa, mdice)
        infor = infor + auc_infor if auc_roc else infor
        logger.info(infor)
        logger.info("[EVAL] Class IoU: \n" + str(np.round(class_iou, 4)))
        logger.info("[EVAL] Class Acc: \n" + str(np.round(class_acc, 4)))
    return miou, acc, class_iou, class_acc, kappa
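A hypothetical call to the evaluate() helper above, with multi-scale and horizontal-flip augmentation enabled; `model` and `val_dataset` are assumptions standing in for a trained segmentation model and its validation dataset.

# Hypothetical invocation of evaluate() above.
miou, acc, class_iou, class_acc, kappa = evaluate(
    model,
    val_dataset,
    aug_eval=True,
    scales=[0.75, 1.0, 1.25],
    flip_horizontal=True,
    print_detail=True)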
Example #16
def do_train():
    set_seed(args.seed)
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    train_ds, dev_ds, test_ds = load_dataset("chnsenticorp",
                                             splits=["train", "dev", "test"])

    # If you want to use a bert/roberta/electra pretrained model,
    # model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_classes=2)
    # model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_classes=2)
    # model = ppnlp.transformers.ElectraForSequenceClassification.from_pretrained('chinese-electra-small', num_classes=2)
    model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(
        'ernie-tiny', num_classes=len(train_ds.label_list))

    # If you want to use a bert/roberta/electra pretrained tokenizer,
    # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese')
    # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext')
    # tokenizer = ppnlp.transformers.ElectraTokenizer.from_pretrained('chinese-electra-small')
    # ErnieTinyTokenizer is specific to the ernie-tiny pretrained model.
    tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
        'ernie-tiny')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)
    test_data_loader = create_dataloader(test_ds,
                                         mode='test',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and paddle.distributed.get_rank() == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc, 10 /
                       (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % 100 == 0 and paddle.distributed.get_rank() == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

    if paddle.distributed.get_rank() == 0:
        print('Evaluating on test data.')
        evaluate(model, criterion, metric, test_data_loader)
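
A toy illustration of what the `Tuple(Pad, Pad, Stack)` collation used above produces; the ids are made up, and the field layout (input ids, segment ids, label) mirrors the `batchify_fn` in this example:

from paddlenlp.data import Pad, Stack, Tuple

# Two hypothetical tokenized samples: (input_ids, token_type_ids, label).
samples = [
    ([1, 5, 7, 2], [0, 0, 0, 0], [1]),
    ([1, 9, 2],    [0, 0, 0],    [0]),
]
batchify = Tuple(
    Pad(axis=0, pad_val=0),   # input ids   -> ndarray of shape [2, 4], short row padded with 0
    Pad(axis=0, pad_val=0),   # segment ids -> ndarray of shape [2, 4]
    Stack(dtype="int64"),     # labels      -> ndarray of shape [2, 1]
)
input_ids, segment_ids, labels = batchify(samples)
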
Example #17
 def gaussian(self, theta_x, phi_x):
     pairwise_weight = paddle.matmul(theta_x, phi_x)
     pairwise_weight = F.softmax(pairwise_weight, axis=-1)
     return pairwise_weight
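
This `gaussian` helper is the embedded-Gaussian affinity of a non-local block: a matmul of the theta and phi projections followed by a softmax over the last axis. A minimal shape check, assuming (hypothetically) that `theta_x` is [N, H*W, C'] and `phi_x` is [N, C', H*W]:

import paddle
import paddle.nn.functional as F

theta_x = paddle.rand([2, 16, 8])    # [N, H*W, C'] -- made-up sizes
phi_x = paddle.rand([2, 8, 16])      # [N, C', H*W]

pairwise_weight = paddle.matmul(theta_x, phi_x)        # [N, H*W, H*W]
pairwise_weight = F.softmax(pairwise_weight, axis=-1)  # each row sums to 1
print(pairwise_weight.shape)                           # [2, 16, 16]
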
Example #18
    def forward(self,
                queries,
                keys,
                values,
                relation_k,
                relation_v,
                attn_bias=None,
                past_cache=None):
        """relational attention forward.
        seq_len in `shape` means num queries/keys/values of attention

        Args:
            queries (TYPE): shape = [batch, seq_len, num_heads * hidden]
            keys (TYPE): shape = queries.shape
            values (TYPE): shape = queries.shape
            relation_k (TYPE): shape = [batch, seq_len, seq_len, hidden]
            relation_v (TYPE): shape = relation_k.shape
            attn_bias (TYPE): used as sequence mask. Default is None
            past_cache (TYPE): Default is None

        Returns: TODO

        Raises: NULL
        """
        assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3
        #bsz, q_len, q_dim = queries.shape
        #bsz, k_len, k_dim = keys.shape
        #bsz, v_len, v_dim = values.shape
        #assert k_len == v_len

        q = self.q(queries)
        k = self.k(keys)
        v = self.v(values)

        cache = (k, v)
        if past_cache is not None:
            cached_k, cached_v = past_cache
            k = paddle.concat([cached_k, k], 1)
            v = paddle.concat([cached_v, v], 1)

        def _transpose(inputs):
            """reshape and transpose
            Args: inputs: shape = [batch, seq_len, heads * hidden]
            Returns: shape = [batch, heads, seq_len, hidden]
            """
            hidden_size = inputs.shape[-1] // self.n_head
            outputs = inputs.reshape([0, 0, self.n_head, hidden_size])
            return outputs.transpose([0, 2, 1, 3])

        q, k, v = [_transpose(x) for x in (q, k, v)]

        q = q.scale(self.d_key**-0.5)
        scores = relative_attention_logits(q, k, relation_k)
        if attn_bias is not None:
            scores += attn_bias
        scores = F.softmax(scores)
        scores = self.dropout(scores)

        out = relative_attention_values(scores, v, relation_v)
        # input: [batch, heads, seq_len, hidden]
        # output: [batch, seq_len, heads * hidden]
        out = out.transpose([0, 2, 1, 3])
        out = out.reshape([0, 0, out.shape[2] * out.shape[3]])
        out = self.o(out)
        return out, cache
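
The helpers `relative_attention_logits` and `relative_attention_values` are not shown in this snippet. Below is a minimal Paddle sketch of one common relation-aware formulation (RAT-SQL style); since `q` is already scaled by `d_key**-0.5` before the call, no extra scaling is applied here, and the actual implementation in the source repo may differ:

import paddle

def relative_attention_logits(query, key, relation):
    # query/key: [batch, heads, seq_len, hidden]; relation: [batch, seq_len, seq_len, hidden]
    qk = paddle.matmul(query, key, transpose_y=True)       # [b, h, q_len, k_len]
    q_t = query.transpose([0, 2, 1, 3])                    # [b, q_len, h, hidden]
    qr = paddle.matmul(q_t, relation, transpose_y=True)    # [b, q_len, h, k_len]
    return qk + qr.transpose([0, 2, 1, 3])                 # [b, h, q_len, k_len]

def relative_attention_values(weight, value, relation):
    # weight: [batch, heads, q_len, k_len]; value: [batch, heads, k_len, hidden]
    wv = paddle.matmul(weight, value)                      # [b, h, q_len, hidden]
    w_t = weight.transpose([0, 2, 1, 3])                   # [b, q_len, h, k_len]
    wr = paddle.matmul(w_t, relation)                      # [b, q_len, h, hidden]
    return wv + wr.transpose([0, 2, 1, 3])                 # [b, h, q_len, hidden]
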
Example #19
        def deep_match(item_his_eb, context_his_eb, mask, match_mask,
                       mid_his_batch, EMBEDDING_DIM, item_vectors, item_biases,
                       n_mid):
            query = context_his_eb
            query = self.query_layer(query)  # [1, 50, 64]
            # print(f'query_prelu: {query.shape}')
            query = self.query_prelu(query)

            inputs = paddle.concat(
                [
                    query, item_his_eb, query - item_his_eb,
                    query * item_his_eb
                ],
                axis=-1)  # B,T,E
            att_layer1 = self.att_layer1_layer(inputs)
            att_layer1 = F.sigmoid(att_layer1)
            att_layer2 = self.att_layer2_layer(att_layer1)
            att_layer2 = F.sigmoid(att_layer2)
            att_layer3 = self.att_layer3_layer(att_layer2)  # B,T,1
            scores = paddle.transpose(att_layer3, [0, 2, 1])  # B,1,T

            # mask
            bool_mask = paddle.equal(mask, paddle.ones_like(mask))  # B,T
            key_masks = paddle.unsqueeze(bool_mask, axis=1)  # B,1,T
            paddings = paddle.ones_like(scores) * (-2**32 + 1)
            scores = paddle.where(key_masks, scores, paddings)

            # tril
            scores_tile = paddle.tile(
                paddle.sum(scores, axis=1),
                [1, paddle.shape(scores)[-1]])  # B, T*T
            scores_tile = paddle.reshape(scores_tile, [
                -1, paddle.shape(scores)[-1], paddle.shape(scores)[-1]
            ])  # B, T, T
            diag_vals = paddle.ones_like(scores_tile)  # B, T, T
            tril = paddle.tril(diag_vals)
            paddings = paddle.ones_like(tril) * (-2**32 + 1)
            scores_tile = paddle.where(
                paddle.equal(tril, paddle.to_tensor(0.0)), paddings,
                scores_tile)  # B, T, T
            scores_tile = F.softmax(scores_tile)  # B, T, T
            att_dm_item_his_eb = paddle.matmul(scores_tile,
                                               item_his_eb)  # B, T, E

            dnn_layer1 = self.dnn_layer1_layer(att_dm_item_his_eb)
            # print(f'dnn_layer1_prelu: {dnn_layer1.shape}')
            dnn_layer1 = self.dnn_layer1_prelu(dnn_layer1)

            # target mask
            user_vector = dnn_layer1[:, -1, :]  # B, E
            user_vector2 = dnn_layer1[:, -2, :] * paddle.reshape(
                match_mask,
                [-1, paddle.shape(match_mask)[1], 1])[:, -2, :]  # B, E
            num_sampled = 2000
            labels = paddle.reshape(mid_his_batch[:, -1], [-1, 1])  # B, 1

            # not sample
            # [B, E] * [E_size, cate_size]
            logits = paddle.matmul(
                user_vector2, item_vectors, transpose_y=True)
            logits = paddle.add(logits, item_biases)
            loss = F.cross_entropy(input=logits, label=labels)

            return loss, user_vector, scores
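
The tril trick above builds a causal mask over the tiled score matrix: every position after the current step is pushed to a very large negative value before the softmax. A toy 3x3 view of the same masking (made-up scores):

import paddle
import paddle.nn.functional as F

scores_tile = paddle.rand([1, 3, 3])                     # hypothetical B, T, T scores
tril = paddle.tril(paddle.ones_like(scores_tile))        # lower triangle = 1
paddings = paddle.ones_like(scores_tile) * (-2**32 + 1)
masked = paddle.where(paddle.equal(tril, paddle.to_tensor(0.0)), paddings, scores_tile)
weights = F.softmax(masked)                              # upper-triangle entries get ~0 weight
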
Example #20
 def forward(self, fpn_fms, rcnn_rois, labels=None, bbox_targets=None):
     # stride: 64,32,16,8,4 -> 4, 8, 16, 32
     fpn_fms = fpn_fms[1:][::-1]
     stride = [4, 8, 16, 32]
     pool_features = roi_pooler(fpn_fms, rcnn_rois, stride, (7, 7),
                                "ROIAlignV2")
     flatten_feature = torch.flatten(pool_features, start_axis=1)
     flatten_feature = F.relu_(self.fc1(flatten_feature))
     flatten_feature = F.relu_(self.fc2(flatten_feature))
     pred_emd_cls_0 = self.emd_pred_cls_0(flatten_feature)
     pred_emd_delta_0 = self.emd_pred_delta_0(flatten_feature)
     pred_emd_cls_1 = self.emd_pred_cls_1(flatten_feature)
     pred_emd_delta_1 = self.emd_pred_delta_1(flatten_feature)
     pred_emd_scores_0 = F.softmax(pred_emd_cls_0, axis=-1)
     pred_emd_scores_1 = F.softmax(pred_emd_cls_1, axis=-1)
     # cons refine feature
     boxes_feature_0 = cat(
         (pred_emd_delta_0[:, 4:], pred_emd_scores_0[:, 1][:, None]),
         axis=1).repeat(1, 4)
     boxes_feature_1 = cat(
         (pred_emd_delta_1[:, 4:], pred_emd_scores_1[:, 1][:, None]),
         axis=1).repeat(1, 4)
     boxes_feature_0 = cat((flatten_feature, boxes_feature_0), axis=1)
     boxes_feature_1 = cat((flatten_feature, boxes_feature_1), axis=1)
     refine_feature_0 = F.relu_(self.fc3(boxes_feature_0))
     refine_feature_1 = F.relu_(self.fc3(boxes_feature_1))
     # refine
     pred_ref_cls_0 = self.ref_pred_cls_0(refine_feature_0)
     pred_ref_delta_0 = self.ref_pred_delta_0(refine_feature_0)
     pred_ref_cls_1 = self.ref_pred_cls_1(refine_feature_1)
     pred_ref_delta_1 = self.ref_pred_delta_1(refine_feature_1)
     if self.training:
         loss0 = emd_loss_softmax(pred_emd_delta_0, pred_emd_cls_0,
                                  pred_emd_delta_1, pred_emd_cls_1,
                                  bbox_targets, labels)
         loss1 = emd_loss_softmax(pred_emd_delta_1, pred_emd_cls_1,
                                  pred_emd_delta_0, pred_emd_cls_0,
                                  bbox_targets, labels)
         loss2 = emd_loss_softmax(pred_ref_delta_0, pred_ref_cls_0,
                                  pred_ref_delta_1, pred_ref_cls_1,
                                  bbox_targets, labels)
         loss3 = emd_loss_softmax(pred_ref_delta_1, pred_ref_cls_1,
                                  pred_ref_delta_0, pred_ref_cls_0,
                                  bbox_targets, labels)
         loss_rcnn = cat([loss0, loss1], axis=1)
         loss_ref = cat([loss2, loss3], axis=1)
         # requires_grad = False
         _, min_indices_rcnn = loss_rcnn.min(dim=1)
         _, min_indices_ref = loss_ref.min(dim=1)
         loss_rcnn = loss_rcnn[torch.arange(loss_rcnn.shape[0]),
                               min_indices_rcnn]
         loss_rcnn = loss_rcnn.mean()
         loss_ref = loss_ref[torch.arange(loss_ref.shape[0]),
                             min_indices_ref]
         loss_ref = loss_ref.mean()
         loss_dict = {}
         loss_dict['loss_rcnn_emd'] = loss_rcnn
         loss_dict['loss_ref_emd'] = loss_ref
         return loss_dict
     else:
         class_num = pred_ref_cls_0.shape[-1] - 1
         tag = torch.arange(class_num).type_as(pred_ref_cls_0) + 1
         tag = tag.repeat(pred_ref_cls_0.shape[0], 1).reshape(-1, 1)
         pred_scores_0 = F.softmax(pred_ref_cls_0,
                                   axis=-1)[:, 1:].reshape(-1, 1)
         pred_scores_1 = F.softmax(pred_ref_cls_1,
                                   axis=-1)[:, 1:].reshape(-1, 1)
         pred_delta_0 = pred_ref_delta_0[:, 4:].reshape(-1, 4)
         pred_delta_1 = pred_ref_delta_1[:, 4:].reshape(-1, 4)
         base_rois = rcnn_rois[:, 1:5].repeat(1, class_num).reshape(-1, 4)
         pred_bbox_0 = restore_bbox(base_rois, pred_delta_0, True)
         pred_bbox_1 = restore_bbox(base_rois, pred_delta_1, True)
         pred_bbox_0 = cat([pred_bbox_0, pred_scores_0, tag], axis=1)
         pred_bbox_1 = cat([pred_bbox_1, pred_scores_1, tag], axis=1)
         pred_bbox = cat((pred_bbox_0, pred_bbox_1), axis=1)
         return pred_bbox
Example #21
# cross-entropy cost function
criterion = paddle.nn.loss.CrossEntropyLoss()
# accuracy metric
metric = paddle.metric.Accuracy()
print(args)
# train
global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, pinyin_ids, labels = batch
        batch_size, length = input_ids.shape
        pinyin_ids = paddle.reshape(pinyin_ids, [batch_size, length, 8])
        logits = model(input_ids, pinyin_ids)
        loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc, 10 /
                   (time.time() - tic_train)))
            tic_train = time.time()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()
Example #22
    def sample(self,
               input_ids,
               logits_processors,
               max_length,
               pad_token_id,
               eos_token_id,
               top_k=None,
               top_p=None,
               temperature=None,
               min_tokens_to_keep=1,
               **model_kwargs):
        def TopKProcess(probs, top_k, min_tokens_to_keep):
            top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1])
            # Remove all tokens with a probability less than the last token of the top-k
            topk_probs, _ = paddle.topk(probs, k=top_k)
            probs = paddle.where(probs >= topk_probs[:, -1:], probs,
                                 paddle.full_like(probs, 0.0))
            return probs

        def TopPProcess(probs, top_p, min_tokens_to_keep):
            sorted_probs = paddle.sort(probs, descending=True)
            sorted_indices = paddle.argsort(probs, descending=True)
            cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)

            # Remove tokens with cumulative probs above top_p, but keep at
            # least min_tokens_to_keep tokens
            sorted_indices_to_remove = cumulative_probs > top_p
            if min_tokens_to_keep > 1:
                # Set 'min_tokens_to_keep - 1' because the first token is kept
                sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0
            # Keep the first token
            sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove,
                                                   dtype='int64')
            sorted_indices_to_remove[:, 1:] = (
                sorted_indices_to_remove[:, :-1].clone())
            sorted_indices_to_remove[:, 0] = 0

            # Scatter sorted tensors to original indexing
            sorted_indices = sorted_indices + paddle.arange(
                probs.shape[0]).unsqueeze(-1) * probs.shape[-1]
            condition = paddle.scatter(sorted_indices_to_remove.flatten(),
                                       sorted_indices.flatten(),
                                       sorted_indices_to_remove.flatten())
            condition = paddle.cast(condition, 'bool').reshape(probs.shape)
            probs = paddle.where(condition, paddle.full_like(probs, 0.0),
                                 probs)
            return probs

        batch_size, cur_len = input_ids.shape
        origin_len = cur_len
        unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')
        scores = paddle.full([batch_size, 1],
                             0.0,
                             dtype=paddle.get_default_dtype())

        while cur_len < max_length:
            # prepare model inputs & get model output
            model_inputs = self.prepare_inputs_for_generation(
                input_ids, **model_kwargs)
            outputs = self(**model_inputs)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            # [batch_size, vocab_size]
            logits = logits[:, -1, :]

            # pre-process distribution
            logits = self.adjust_logits_during_generation(logits)
            logits = logits_processors(input_ids, logits)

            # sample
            origin_probs = F.softmax(logits)
            origin_probs = paddle.log(origin_probs)
            if temperature is not None and temperature != 1.0:
                logits = logits / temperature
            probs = F.softmax(logits)
            if top_k is not None and top_k != 0:
                probs = TopKProcess(probs, top_k, min_tokens_to_keep)
            if top_p is not None and top_p < 1.0:
                probs = TopPProcess(probs, top_p, min_tokens_to_keep)
            next_tokens = paddle.multinomial(probs)
            next_scores = paddle.index_sample(origin_probs, next_tokens)

            if eos_token_id is not None:
                next_tokens = paddle.where(
                    unfinished_flag, next_tokens,
                    paddle.full_like(next_tokens, pad_token_id))

            scores = self.update_scores_for_generation(scores, next_scores,
                                                       cur_len - origin_len,
                                                       unfinished_flag)

            cur_len += 1
            input_ids = paddle.concat([input_ids, next_tokens], axis=1)

            if eos_token_id is not None:
                unfinished_flag = paddle.logical_and(
                    unfinished_flag, next_tokens != eos_token_id)

            # Stop when there is a </s> in all sentences
            if not paddle.any(unfinished_flag):
                break
            model_kwargs = self.update_model_kwargs_for_generation(
                outputs,
                model_kwargs,
                is_encoder_decoder=self.is_encoder_decoder)
        return input_ids[:, origin_len:], scores
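
A standalone sketch mirroring the `TopPProcess` logic above on a toy, already-sorted distribution (the numbers are made up): every token up to and including the first one whose cumulative probability crosses `top_p` survives, the rest are zeroed, and `paddle.multinomial` then samples among the survivors.

import paddle

probs = paddle.to_tensor([[0.50, 0.25, 0.15, 0.07, 0.03]])   # toy next-token distribution
top_p = 0.8

sorted_probs = paddle.sort(probs, descending=True)            # already sorted here
cumulative = paddle.cumsum(sorted_probs, axis=-1)             # [0.50, 0.75, 0.90, 0.97, 1.00]
# shift by one so the token that crosses top_p is kept, as in TopPProcess
remove = paddle.concat([paddle.zeros([1, 1], dtype='bool'),
                        (cumulative > top_p)[:, :-1]], axis=-1)
filtered = paddle.where(remove, paddle.zeros_like(probs), probs)
print(filtered.numpy())       # last two tokens are zeroed: [[0.5 0.25 0.15 0. 0.]]
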
Example #23
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
args = parser.parse_args()
# yapf: enable

if __name__ == '__main__':
    paddle.set_device(args.device)

    feature_extractor = LogMelSpectrogram(sr=16000,
                                          n_fft=512,
                                          hop_length=320,
                                          n_mels=64,
                                          f_min=50)
    model = SoundClassifier(backbone=cnn14(pretrained=False,
                                           extract_embedding=True),
                            num_class=len(ESC50.label_list))
    model.set_state_dict(paddle.load(args.checkpoint))
    model.eval()

    waveform, sr = load_audio(args.wav)
    feats = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
    logits = model(feats)
    probs = F.softmax(logits, axis=1).numpy()

    sorted_indices = (-probs[0]).argsort()

    msg = f'[{args.wav}]\n'
    for idx in sorted_indices[:args.top_k]:
        msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n'
    print(msg)
Example #24
    def beam_search(self, input_ids, beam_scorer, logits_processors,
                    max_length, pad_token_id, eos_token_id, **model_kwargs):
        batch_size = len(beam_scorer._beam_hyps)
        num_beams = beam_scorer.num_beams

        batch_beam_size, cur_len = input_ids.shape
        origin_len = cur_len

        assert (
            num_beams * batch_size == batch_beam_size
        ), "Batch dimension of `input_ids` should be {}, but received {}.".format(
            num_beams * batch_size, batch_beam_size)

        beam_scores = paddle.zeros((batch_size, num_beams),
                                   dtype=paddle.get_default_dtype())
        beam_scores[:, 1:] = -1e9
        beam_scores = paddle.reshape(beam_scores, [-1])

        while cur_len < max_length:
            # prepare model inputs & get model output
            model_inputs = self.prepare_inputs_for_generation(
                input_ids, **model_kwargs)
            outputs = self(**model_inputs)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            # [batch_size, vocab_size]
            logits = logits[:, -1, :]

            # pre-process distribution
            logits = self.adjust_logits_during_generation(logits)
            logits = logits_processors(input_ids, logits)

            # beam search
            # [batch_size * num_beams, vocab_size]
            next_scores = F.softmax(logits)
            next_scores = paddle.log(next_scores)

            next_scores = next_scores + beam_scores.unsqueeze(-1)
            # reshape for beam search
            vocab_size = next_scores.shape[-1]
            next_scores = next_scores.reshape(
                [batch_size, num_beams * vocab_size])

            next_scores, next_tokens = paddle.topk(next_scores,
                                                   2 * num_beams,
                                                   axis=1)

            next_indices = next_tokens // vocab_size
            next_tokens = next_tokens % vocab_size

            # stateless
            beam_outputs = beam_scorer.process(
                input_ids,
                next_scores,
                next_tokens,
                next_indices,
                origin_len=origin_len,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
            )
            beam_scores = beam_outputs["next_beam_scores"]
            beam_next_tokens = beam_outputs["next_beam_tokens"]
            beam_idx = beam_outputs["next_beam_indices"]

            cur_len += 1
            input_ids = paddle.concat([
                paddle.index_select(input_ids, beam_idx),
                beam_next_tokens.unsqueeze(-1)
            ],
                                      axis=-1)

            if beam_scorer.is_done:
                break
            model_kwargs = self.update_model_kwargs_for_generation(
                outputs,
                model_kwargs,
                is_encoder_decoder=self.is_encoder_decoder)
            if model_kwargs["cache"] is not None:
                # reorder the cache
                model_kwargs["cache"] = map_structure(
                    lambda x: paddle.index_select(x, beam_idx),
                    model_kwargs["cache"])

        pred_ids, scores = beam_scorer.finalize(input_ids,
                                                beam_scores,
                                                next_tokens,
                                                next_indices,
                                                pad_token_id=pad_token_id,
                                                eos_token_id=eos_token_id)
        return pred_ids[:, origin_len:], scores
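
The floor-division/modulo pair above recovers, from a flat top-k index over the [batch_size, num_beams * vocab_size] scores, which beam a candidate extends and which token it proposes. A tiny worked example with hypothetical sizes:

num_beams, vocab_size = 4, 50000
flat_index = 150007                      # one of the indices returned by paddle.topk
beam_index = flat_index // vocab_size    # 3 -> candidate extends beam 3
token_id = flat_index % vocab_size       # 7 -> proposed token id within the vocabulary
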
Example #25
    def forward(self,
                input_ids=None,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None,
                query_input_ids=None,
                query_token_type_ids=None,
                query_position_ids=None,
                query_attention_mask=None,
                title_input_ids=None,
                title_token_type_ids=None,
                title_position_ids=None,
                title_attention_mask=None,
                seq_lengths=None,
                labels=None):

        if self.task != 'text-matching':
            result = self.model(input_ids, token_type_ids, position_ids,
                                attention_mask)
        else:
            query_result = self.model(query_input_ids, query_token_type_ids,
                                      query_position_ids, query_attention_mask)
            title_result = self.model(title_input_ids, title_token_type_ids,
                                      title_position_ids, title_attention_mask)

        if self.task == 'seq-cls':
            logits = result
            probs = F.softmax(logits, axis=1)
            if labels is not None:
                loss = self.criterion(logits, labels)
                correct = self.metric.compute(probs, labels)
                acc = self.metric.update(correct)
                return probs, loss, {'acc': acc}
            return probs
        elif self.task == 'token-cls':
            logits = result
            token_level_probs = F.softmax(logits, axis=-1)
            preds = token_level_probs.argmax(axis=-1)
            if labels is not None:
                loss = self.criterion(logits, labels.unsqueeze(-1))
                num_infer_chunks, num_label_chunks, num_correct_chunks = \
                    self.metric.compute(None, seq_lengths, preds, labels)
                self.metric.update(num_infer_chunks.numpy(),
                                   num_label_chunks.numpy(),
                                   num_correct_chunks.numpy())
                _, _, f1_score = map(float, self.metric.accumulate())
                return token_level_probs, loss, {'f1_score': f1_score}
            return token_level_probs
        elif self.task == 'text-matching':
            query_token_embedding, _ = query_result
            query_token_embedding = self.dropout(query_token_embedding)
            query_attention_mask = paddle.unsqueeze(
                (query_input_ids != self.model.pad_token_id).astype(
                    self.model.pooler.dense.weight.dtype),
                axis=2)
            query_token_embedding = query_token_embedding * query_attention_mask
            query_sum_embedding = paddle.sum(query_token_embedding, axis=1)
            query_sum_mask = paddle.sum(query_attention_mask, axis=1)
            query_mean = query_sum_embedding / query_sum_mask

            title_token_embedding, _ = title_result
            title_token_embedding = self.dropout(title_token_embedding)
            title_attention_mask = paddle.unsqueeze(
                (title_input_ids != self.model.pad_token_id).astype(
                    self.model.pooler.dense.weight.dtype),
                axis=2)
            title_token_embedding = title_token_embedding * title_attention_mask
            title_sum_embedding = paddle.sum(title_token_embedding, axis=1)
            title_sum_mask = paddle.sum(title_attention_mask, axis=1)
            title_mean = title_sum_embedding / title_sum_mask

            sub = paddle.abs(paddle.subtract(query_mean, title_mean))
            projection = paddle.concat([query_mean, title_mean, sub], axis=-1)
            logits = self.classifier(projection)
            probs = F.softmax(logits)
            if labels is not None:
                loss = self.criterion(logits, labels)
                correct = self.metric.compute(probs, labels)
                acc = self.metric.update(correct)
                return probs, loss, {'acc': acc}
            return probs
        else:
            sequence_output, pooled_output = result
            return sequence_output, pooled_output
Example #26
    def GetBaselineOut(self):
        paddle.disable_static(place=paddle.CUDAPlace(0))
        tensor_query = paddle.to_tensor(self.query, stop_gradient=False)

        cache_kv = None
        if self.has_cache_kv:
            cache_kv = paddle.to_tensor(self.cache_kv, stop_gradient=False)

        if self.has_attn_mask:
            attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
        else:
            attn_mask = None
        residual = tensor_query

        ln1_out = tensor_query
        if self.pre_layer_norm:
            ln1_out = self.norm1(tensor_query)

        q = self.q_proj(ln1_out)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3])
        k = self.k_proj(ln1_out)
        v = self.v_proj(ln1_out)
        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        if self.has_cache_kv:
            # [1, B, n_head, cache_seq_len, head_dim]
            cache_k, cache_v = paddle.split(cache_kv, 2)
            cache_k = paddle.squeeze(cache_k, axis=0)
            cache_v = paddle.squeeze(cache_v, axis=0)
            # [B, n_head, cache_seq_len + seq_len, head_dim]
            # out_seq_len = cache_seq_len + seq_len
            k_out = paddle.concat([cache_k, k_out], axis=-2)
            v_out = paddle.concat([cache_v, v_out], axis=-2)

        # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
        # --> [B, n_head, seq_len, out_seq_len]
        qk_out = layers.matmul(x=q_out,
                               y=k_out,
                               transpose_y=True,
                               alpha=self.head_dim**-0.5)

        if attn_mask is not None:
            attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
            attn_mask_out = qk_out + attn_mask
            softmax_out = F.softmax(attn_mask_out)
        else:
            softmax_out = F.softmax(qk_out)

        if self.dropout_prob:
            dropout_out = F.dropout(softmax_out,
                                    self.dropout_prob,
                                    training=self.training,
                                    mode="upscale_in_train")
            # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim]
            # --> [B, n_head, seq_len, head_dim]
            qktv_out = tensor.matmul(dropout_out, v_out)
        else:
            qktv_out = tensor.matmul(softmax_out, v_out)

        fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3])
        out_linear_in = tensor.reshape(
            x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]])
        out = self.out_proj(out_linear_in)

        residual_out = residual + self.dropout(out)
        if not self.pre_layer_norm:
            final_out = self.norm1(residual_out)
        else:
            final_out = residual_out

        if self.has_cache_kv:
            return final_out

        paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
                                 retain_graph=True)
        return final_out, tensor_query.grad
Example #27
def search_mobilenetv2_block(config, args, image_size):
    image_shape = [3, image_size, image_size]
    transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
    if args.data == 'cifar10':
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')

    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')

    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if args.is_server:
        sa_nas = SANAS(config,
                       server_addr=(args.server_address, args.port),
                       search_steps=args.search_steps,
                       is_server=True)
    else:
        sa_nas = SANAS(config,
                       server_addr=(args.server_address, args.port),
                       search_steps=args.search_steps,
                       is_server=False)

    for step in range(args.search_steps):
        archs = sa_nas.next_archs()[0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        with static.program_guard(train_program, startup_program):
            data_shape = [None] + image_shape
            data = static.data(name='data', shape=data_shape, dtype='float32')
            label = static.data(name='label', shape=[None, 1], dtype='int64')
            if args.data == 'cifar10':
                paddle.assign(paddle.reshape(label, [-1, 1]), label)
            train_loader = paddle.io.DataLoader(train_dataset,
                                                places=places,
                                                feed_list=[data, label],
                                                drop_last=True,
                                                batch_size=args.batch_size,
                                                return_list=False,
                                                shuffle=True,
                                                use_shared_memory=True,
                                                num_workers=4)
            val_loader = paddle.io.DataLoader(val_dataset,
                                              places=place,
                                              feed_list=[data, label],
                                              drop_last=False,
                                              batch_size=args.batch_size,
                                              return_list=False,
                                              shuffle=False)
            data = conv_bn_layer(input=data,
                                 num_filters=32,
                                 filter_size=3,
                                 stride=2,
                                 padding='SAME',
                                 act='relu6',
                                 name='mobilenetv2_conv1')
            data = archs(data)[0]
            data = conv_bn_layer(input=data,
                                 num_filters=1280,
                                 filter_size=1,
                                 stride=1,
                                 padding='SAME',
                                 act='relu6',
                                 name='mobilenetv2_last_conv')
            data = F.adaptive_avg_pool2d(data,
                                         output_size=[1, 1],
                                         name='mobilenetv2_last_pool')
            output = static.nn.fc(
                x=data,
                size=args.class_dim,
                weight_attr=ParamAttr(name='mobilenetv2_fc_weights'),
                bias_attr=ParamAttr(name='mobilenetv2_fc_offset'))

            softmax_out = F.softmax(output)
            cost = F.cross_entropy(output, label=label)  # expects logits; softmax is applied internally
            avg_cost = paddle.mean(cost)
            acc_top1 = paddle.metric.accuracy(input=softmax_out,
                                              label=label,
                                              k=1)
            acc_top5 = paddle.metric.accuracy(input=softmax_out,
                                              label=label,
                                              k=5)
            test_program = train_program.clone(for_test=True)

            optimizer = paddle.optimizer.Momentum(
                learning_rate=0.1,
                momentum=0.9,
                weight_decay=paddle.regularizer.L2Decay(1e-4))
            optimizer.minimize(avg_cost)

        current_flops = flops(train_program)
        print('step: {}, current_flops: {}'.format(step, current_flops))
        if current_flops > int(321208544):
            continue

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(loss_name=avg_cost.name,
                                              build_strategy=build_strategy)
        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                batch_time = time.time() - s_time
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
                        .format(step, epoch_id, batch_id, outs[0], batch_time))

        reward = []
        for batch_id, data in enumerate(val_loader()):
            test_fetches = [avg_cost.name, acc_top1.name, acc_top5.name]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))

        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        sa_nas.reward(float(finally_reward[1]))
Example #28
    def forward(self, logits, label):
        """
        Forward computation.

        Args:
            logits (tuple|list): (seg_logit, edge_logit) Tensors, with data type float32 or float64.
                Shape is (N, C), where C is the number of classes; if more than 2D, the shape is
                (N, C, D1, D2, ..., Dk), k >= 1. For edge_logit, C = 1.
            label (Tensor): Label tensor with data type int64. Shape is (N, C), where each value
                satisfies 0 <= label[i] <= C-1; if more than 2D, the shape is
                (N, C, D1, D2, ..., Dk), k >= 1.
        """
        seg_logit, edge_logit = logits[0], logits[1]
        if len(label.shape) != len(seg_logit.shape):
            label = paddle.unsqueeze(label, 1)
        if edge_logit.shape != label.shape:
            raise ValueError(
                'The shape of edge_logit should equal to the label, but they are {} != {}'
                .format(edge_logit.shape, label.shape))

        # Filter out edge
        filler = paddle.ones_like(label) * self.ignore_index
        label = paddle.where(edge_logit > self.edge_threshold, label, filler)

        # ohem
        n, c, h, w = seg_logit.shape
        label = label.reshape((-1, ))
        valid_mask = (label != self.ignore_index).astype('int64')
        num_valid = valid_mask.sum()
        label = label * valid_mask

        prob = F.softmax(seg_logit, axis=1)
        prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1))

        if self.min_kept < num_valid and num_valid > 0:
            # push ignored positions above 1 so they are never selected as hard examples
            prob = prob + (1 - valid_mask)

            # get the prob of relevant label
            label_onehot = F.one_hot(label, c)
            label_onehot = label_onehot.transpose((1, 0))
            prob = prob * label_onehot
            prob = paddle.sum(prob, axis=0)

            threshold = self.thresh
            if self.min_kept > 0:
                index = prob.argsort()
                threshold_index = index[min(len(index), self.min_kept) - 1]
                threshold_index = int(threshold_index.numpy()[0])
                if prob[threshold_index] > self.thresh:
                    threshold = prob[threshold_index]
                kept_mask = (prob < threshold).astype('int64')
                label = label * kept_mask
                valid_mask = valid_mask * kept_mask
        # make the invalid region as ignore
        label = label + (1 - valid_mask) * self.ignore_index
        label = label.reshape((n, 1, h, w))
        valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32')

        loss = F.softmax_with_cross_entropy(
            seg_logit, label, ignore_index=self.ignore_index, axis=1)
        loss = loss * valid_mask
        avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS)

        label.stop_gradient = True
        valid_mask.stop_gradient = True
        return avg_loss
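
A toy NumPy view of the OHEM keep-rule used above (the probabilities are made up): with thresh=0.7 and min_kept=3, the threshold stays at 0.7 because the third-hardest pixel already falls below it, so only the three hard pixels contribute to the loss.

import numpy as np

prob = np.array([0.95, 0.40, 0.85, 0.10, 0.60, 0.90])   # prob of the ground-truth class per pixel
thresh, min_kept = 0.7, 3

index = prob.argsort()                                   # hardest (lowest prob) pixels first
threshold_index = index[min(len(index), min_kept) - 1]
threshold = prob[threshold_index] if prob[threshold_index] > thresh else thresh
kept_mask = prob < threshold                             # [False, True, False, True, True, False]
print(kept_mask.sum())                                   # 3
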
Example #29
def soft_cross_entropy(inp, target):
    inp_likelihood = F.log_softmax(inp, axis=-1)
    target_prob = F.softmax(target, axis=-1)
    return -1. * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1))
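
A hedged usage sketch of `soft_cross_entropy` for distillation, with random tensors standing in for real student/teacher logits:

import paddle

student_logits = paddle.randn([4, 10])   # hypothetical student outputs: batch of 4, 10 classes
teacher_logits = paddle.randn([4, 10])   # hypothetical teacher outputs
loss = soft_cross_entropy(student_logits, teacher_logits)
print(float(loss))                       # scalar soft-label loss
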
Example #30
def do_train(args):
    set_seed(args)
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset, dev_dataset, test_dataset = ppnlp.datasets.ChnSentiCorp.get_datasets(
        ['train', 'dev', 'test'])
    if args.model_name == 'ernie-tiny':
        # ErnieTinyTokenizer is specific to the ernie-tiny pretrained model.
        tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
            args.model_name)
    else:
        tokenizer = tokenizer_class.from_pretrained(args.model_name)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_dataset,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_dataset,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)
    test_data_loader = create_dataloader(test_dataset,
                                         mode='test',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    model = model_class.from_pretrained(args.model_name,
                                        num_classes=len(
                                            train_dataset.get_labels()))

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    num_warmup_steps = int(args.warmup_proption * num_training_steps)

    def get_lr_factor(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        else:
            return max(
                0.0,
                float(num_training_steps - current_step) /
                float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lr_lambda=lambda current_step: get_lr_factor(current_step))
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and paddle.distributed.get_rank() == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc, 10 /
                       (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % 100 == 0 and paddle.distributed.get_rank() == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

    if paddle.distributed.get_rank() == 0:
        print('Evaluating on test data.')
        evaluate(model, criterion, metric, test_data_loader)