Example #1
    def __init__(self, ernie, crf_lr=100):
        super().__init__()
        self.num_classes = ernie.num_classes
        self.ernie = ernie  # allow ernie to be config
        self.crf = LinearChainCrf(
            self.num_classes, crf_lr=crf_lr, with_start_stop_tag=False)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(
            self.crf.transitions, with_start_stop_tag=False)
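
The forward pass for this wrapper is not shown above. A minimal sketch of how the pieces are typically combined, assuming the wrapped ernie is a token-classification model that returns per-token logits and that lengths and optional labels are supplied by the caller:

    # Hedged sketch, not part of the original example.
    def forward(self, input_ids, token_type_ids=None, lengths=None, labels=None):
        # Per-token emission scores from the wrapped token-classification model.
        logits = self.ernie(input_ids, token_type_ids=token_type_ids)
        if labels is not None:
            # Training: negative log-likelihood of the gold tag sequences.
            loss = self.crf_loss(logits, lengths, labels)
            return loss
        # Inference: best-scoring tag path per sequence.
        _, prediction = self.viterbi_decoder(logits, lengths)
        return prediction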
Example #2
    def __init__(self,
                 word_emb_dim,
                 hidden_size,
                 vocab_size,
                 num_labels,
                 emb_lr=2.0,
                 crf_lr=0.2,
                 with_start_stop_tag=True):
        super(BiGruCrf, self).__init__()
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size
        self.num_labels = num_labels
        self.hidden_size = hidden_size
        self.emb_lr = emb_lr
        self.crf_lr = crf_lr
        self.init_bound = 0.1

        self.word_embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.word_emb_dim,
            weight_attr=paddle.ParamAttr(learning_rate=self.emb_lr,
                                         initializer=nn.initializer.Uniform(
                                             low=-self.init_bound,
                                             high=self.init_bound)))

        self.gru = nn.GRU(
            input_size=self.word_emb_dim,
            hidden_size=self.hidden_size,
            num_layers=2,
            direction='bidirectional',
            weight_ih_attr=paddle.ParamAttr(
                initializer=nn.initializer.Uniform(low=-self.init_bound,
                                                   high=self.init_bound),
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4)),
            weight_hh_attr=paddle.ParamAttr(
                initializer=nn.initializer.Uniform(low=-self.init_bound,
                                                   high=self.init_bound),
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4)))

        self.fc = nn.Linear(
            in_features=self.hidden_size * 2,
            out_features=self.num_labels + 2 \
                if with_start_stop_tag else self.num_labels,
            weight_attr=paddle.ParamAttr(
                initializer=nn.initializer.Uniform(
                    low=-self.init_bound, high=self.init_bound),
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4)))

        self.crf = LinearChainCrf(self.num_labels, self.crf_lr,
                                  with_start_stop_tag)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions,
                                              with_start_stop_tag)
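
The three CRF components used throughout these examples can also be exercised on their own. A self-contained sketch with random emissions, assuming the classes are importable from paddlenlp.layers as in recent PaddleNLP releases (tensor shapes are illustrative):

import paddle
from paddlenlp.layers import LinearChainCrf, LinearChainCrfLoss, ViterbiDecoder

batch_size, seq_len, num_labels = 2, 6, 4
emissions = paddle.rand([batch_size, seq_len, num_labels])     # per-token label scores
lengths = paddle.to_tensor([6, 4], dtype='int64')              # true lengths before padding
labels = paddle.randint(0, num_labels, [batch_size, seq_len])  # gold tag ids

crf = LinearChainCrf(num_labels, crf_lr=0.2, with_start_stop_tag=False)
crf_loss = LinearChainCrfLoss(crf)
decoder = ViterbiDecoder(crf.transitions, with_start_stop_tag=False)

loss = crf_loss(emissions, lengths, labels)   # negative log-likelihood, one value per sequence
scores, paths = decoder(emissions, lengths)   # best path score and best tag sequence per sample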
Example #3
    def __init__(self, ernie_ctm, num_tag, crf_lr=100):
        super(ErnieCtmWordtagModel, self).__init__()
        self.num_tag = num_tag
        self.ernie_ctm = ernie_ctm
        self.tag_classifier = nn.Linear(self.ernie_ctm.config["hidden_size"],
                                        self.num_tag)
        self.crf = LinearChainCrf(self.num_tag,
                                  crf_lr,
                                  with_start_stop_tag=False)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions, False)

        self.apply(self.init_weights)
Example #4
    def __init__(self,
                 emb_size,
                 hidden_size,
                 word_num,
                 label_num,
                 use_w2v_emb=False):
        super(BiGRUWithCRF, self).__init__()
        if use_w2v_emb:
            self.word_emb = TokenEmbedding(
                extended_vocab_path='./data/word.dic', unknown_token='OOV')
        else:
            self.word_emb = nn.Embedding(word_num, emb_size)
        self.gru = nn.GRU(emb_size,
                          hidden_size,
                          num_layers=2,
                          direction='bidirectional')
        self.fc = nn.Linear(hidden_size * 2, label_num + 2)  # BOS EOS
        self.crf = LinearChainCrf(label_num)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions)
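
The label_num + 2 in the fc layer above exists because LinearChainCrf defaults to with_start_stop_tag=True, which reserves two extra tags for the start and stop markers. A quick check (the shape is illustrative):

from paddlenlp.layers import LinearChainCrf

label_num = 5
crf = LinearChainCrf(label_num)   # with_start_stop_tag defaults to True
print(crf.transitions.shape)      # [7, 7] -> label_num + 2; the extra tags are START/STOP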
Example #5
    def __init__(self, skep, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.skep = skep  # allow skep to be config
        gru_hidden_size = 128

        self.gru = nn.GRU(self.skep.config["hidden_size"],
                          gru_hidden_size,
                          num_layers=2,
                          direction='bidirect')
        self.fc = nn.Linear(
            gru_hidden_size * 2,
            self.num_classes,
            weight_attr=paddle.ParamAttr(
                initializer=nn.initializer.Uniform(low=-0.1, high=0.1),
                regularizer=paddle.regularizer.L2Decay(coeff=1e-4)))
        self.crf = LinearChainCrf(self.num_classes,
                                  crf_lr=0.2,
                                  with_start_stop_tag=False)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions, False)
Example #6
def train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")

    # create dataset.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_dataset,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and its loss
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       train_dataset.vocab_size, train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf)
    chunk_evaluator = ChunkEvaluator(
        label_list=train_dataset.label_vocab.keys(), suffix=True)
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training
    callbacks = paddle.callbacks.ProgBarLogger(
        log_freq=10, verbose=3) if args.verbose else None
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              shuffle=True,
              callbacks=callbacks)
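
Example #6 reads every setting from an args namespace. One plausible argparse setup covering exactly the fields the function touches (default values here are illustrative, not taken from the source):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", type=str, default="./data")              # LacDataset root
parser.add_argument("--model_save_dir", type=str, default="./checkpoints")
parser.add_argument("--init_checkpoint", type=str, default=None)           # optional warm start
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--emb_dim", type=int, default=128)
parser.add_argument("--hidden_size", type=int, default=128)
parser.add_argument("--base_lr", type=float, default=1e-3)
parser.add_argument("--verbose", action="store_true")                      # enable ProgBarLogger
parser.add_argument("--n_gpu", type=int, default=1)                        # 0 selects CPU
args = parser.parse_args()
train(args)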
Example #7
def do_train(args):
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)
    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))

    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    model.crf_loss = LinearChainCrfLoss(
        LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False))

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'
            ),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
            ),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=tags_to_idx["O"], dtype='int64'),  # tags
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.num_train_epochs
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    metric = SequenceAccuracy()

    total_loss = 0
    global_step = 0

    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginnig")
        start_time = time.time()

        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags = batch

            loss, _ = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags)
            loss = loss.mean()
            total_loss += loss
            loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, total_loss / args.logging_steps,
                       speed))
                start_time = time.time()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % (global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
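
Example #7 relies on a load_dict helper that is not shown above. One plausible implementation, assuming tags.txt holds one tag per line:

def load_dict(dict_path):
    """Map each tag (one per line in the file) to its line index."""
    vocab = {}
    with open(dict_path, "r", encoding="utf-8") as f:
        for index, line in enumerate(f):
            vocab[line.strip()] = index
    return vocab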
Example #8
def train(args):
    if args.use_gpu:
        place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
        paddle.set_device("gpu")
    else:
        place = paddle.CPUPlace()
        paddle.set_device("cpu")

    # create dataset.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_dataset,
        batch_sampler=train_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=True)
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and its loss
    network = BiGruCrf(args.emb_dim, args.hidden_size, train_dataset.vocab_size,
                       train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((train_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for START and STOP
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training
    callback = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3)
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              drop_last=True,
              shuffle=True,
              callbacks=callback)