def __init__(self, ernie, crf_lr=100):
    super().__init__()
    self.num_classes = ernie.num_classes
    self.ernie = ernie  # allow ernie to be config
    self.crf = LinearChainCrf(
        self.num_classes, crf_lr=crf_lr, with_start_stop_tag=False)
    self.crf_loss = LinearChainCrfLoss(self.crf)
    self.viterbi_decoder = ViterbiDecoder(
        self.crf.transitions, with_start_stop_tag=False)
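# Hedged companion sketch (not in the original snippet): a forward pass this
# ERNIE + CRF __init__ could pair with. It assumes self.ernie returns per-token
# logits; the argument names are illustrative only.
def forward(self, input_ids, token_type_ids=None, lengths=None, labels=None):
    logits = self.ernie(input_ids, token_type_ids=token_type_ids)
    if labels is not None:
        # Training: negative log-likelihood of the gold tag sequence.
        return self.crf_loss(logits, lengths, labels)
    # Inference: Viterbi decoding over the emission scores.
    _, prediction = self.viterbi_decoder(logits, lengths)
    return prediction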
def __init__(self,
             word_emb_dim,
             hidden_size,
             vocab_size,
             num_labels,
             emb_lr=2.0,
             crf_lr=0.2,
             with_start_stop_tag=True):
    super(BiGruCrf, self).__init__()
    self.word_emb_dim = word_emb_dim
    self.vocab_size = vocab_size
    self.num_labels = num_labels
    self.hidden_size = hidden_size
    self.emb_lr = emb_lr
    self.crf_lr = crf_lr
    self.init_bound = 0.1

    self.word_embedding = nn.Embedding(
        num_embeddings=self.vocab_size,
        embedding_dim=self.word_emb_dim,
        weight_attr=paddle.ParamAttr(
            learning_rate=self.emb_lr,
            initializer=nn.initializer.Uniform(
                low=-self.init_bound, high=self.init_bound)))

    self.gru = nn.GRU(
        input_size=self.word_emb_dim,
        hidden_size=self.hidden_size,
        num_layers=2,
        direction='bidirectional',
        weight_ih_attr=paddle.ParamAttr(
            initializer=nn.initializer.Uniform(
                low=-self.init_bound, high=self.init_bound),
            regularizer=paddle.regularizer.L2Decay(coeff=1e-4)),
        weight_hh_attr=paddle.ParamAttr(
            initializer=nn.initializer.Uniform(
                low=-self.init_bound, high=self.init_bound),
            regularizer=paddle.regularizer.L2Decay(coeff=1e-4)))

    self.fc = nn.Linear(
        in_features=self.hidden_size * 2,
        out_features=self.num_labels + 2
        if with_start_stop_tag else self.num_labels,
        weight_attr=paddle.ParamAttr(
            initializer=nn.initializer.Uniform(
                low=-self.init_bound, high=self.init_bound),
            regularizer=paddle.regularizer.L2Decay(coeff=1e-4)))

    self.crf = LinearChainCrf(self.num_labels, self.crf_lr,
                              with_start_stop_tag)
    self.crf_loss = LinearChainCrfLoss(self.crf)
    self.viterbi_decoder = ViterbiDecoder(self.crf.transitions,
                                          with_start_stop_tag)
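# Hedged companion sketch: a forward pass matching the BiGruCrf __init__ above.
# The argument names (inputs, lengths, labels) are assumptions for illustration.
def forward(self, inputs, lengths, labels=None):
    word_embed = self.word_embedding(inputs)
    bigru_output, _ = self.gru(word_embed)
    emission = self.fc(bigru_output)
    if labels is not None:
        return self.crf_loss(emission, lengths, labels)
    _, prediction = self.viterbi_decoder(emission, lengths)
    return prediction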
def __init__(self, ernie_ctm, num_tag, crf_lr=100):
    super(ErnieCtmWordtagModel, self).__init__()
    self.num_tag = num_tag
    self.ernie_ctm = ernie_ctm
    self.tag_classifier = nn.Linear(self.ernie_ctm.config["hidden_size"],
                                    self.num_tag)
    self.crf = LinearChainCrf(self.num_tag, crf_lr, with_start_stop_tag=False)
    self.crf_loss = LinearChainCrfLoss(self.crf)
    self.viterbi_decoder = ViterbiDecoder(self.crf.transitions, False)
    self.apply(self.init_weights)
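# Hedged companion sketch: a forward pass consistent with how the model is
# called in do_train below (it returns (loss, logits) when tag_labels is given).
# The exact outputs of self.ernie_ctm are an assumption.
def forward(self, input_ids, token_type_ids=None, lengths=None, tag_labels=None):
    sequence_output = self.ernie_ctm(input_ids, token_type_ids)[0]
    seq_logits = self.tag_classifier(sequence_output)
    if tag_labels is not None:
        loss = self.crf_loss(seq_logits, lengths, tag_labels)
        return loss, seq_logits
    _, pred = self.viterbi_decoder(seq_logits, lengths)
    return pred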
def __init__(self, emb_size, hidden_size, word_num, label_num,
             use_w2v_emb=False):
    super(BiGRUWithCRF, self).__init__()
    if use_w2v_emb:
        self.word_emb = TokenEmbedding(
            extended_vocab_path='./data/word.dic', unknown_token='OOV')
    else:
        self.word_emb = nn.Embedding(word_num, emb_size)
    self.gru = nn.GRU(emb_size,
                      hidden_size,
                      num_layers=2,
                      direction='bidirectional')
    self.fc = nn.Linear(hidden_size * 2, label_num + 2)  # BOS EOS
    self.crf = LinearChainCrf(label_num)
    self.crf_loss = LinearChainCrfLoss(self.crf)
    self.viterbi_decoder = ViterbiDecoder(self.crf.transitions)
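# Hedged usage sketch: wrapping BiGRUWithCRF in paddle.Model for the high-level
# training API. word_vocab and label_vocab are assumed dict-like vocabularies
# that are not defined in the original snippet; sizes are placeholder values.
network = BiGRUWithCRF(emb_size=300, hidden_size=300,
                       word_num=len(word_vocab), label_num=len(label_vocab))
model = paddle.Model(network)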
def __init__(self, skep, num_classes):
    super().__init__()
    self.num_classes = num_classes
    self.skep = skep  # allow skep to be config
    gru_hidden_size = 128
    self.gru = nn.GRU(self.skep.config["hidden_size"],
                      gru_hidden_size,
                      num_layers=2,
                      direction='bidirect')
    self.fc = nn.Linear(
        gru_hidden_size * 2,
        self.num_classes,
        weight_attr=paddle.ParamAttr(
            initializer=nn.initializer.Uniform(low=-0.1, high=0.1),
            regularizer=paddle.regularizer.L2Decay(coeff=1e-4)))
    self.crf = LinearChainCrf(self.num_classes,
                              crf_lr=0.2,
                              with_start_stop_tag=False)
    self.crf_loss = LinearChainCrfLoss(self.crf)
    self.viterbi_decoder = ViterbiDecoder(self.crf.transitions, False)
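# Hedged companion sketch: a forward pass this SKEP + BiGRU + CRF __init__ could
# pair with. It assumes self.skep returns (sequence_output, pooled_output);
# the argument names are illustrative only.
def forward(self, input_ids, token_type_ids=None, seq_lens=None, labels=None):
    sequence_output, _ = self.skep(input_ids, token_type_ids=token_type_ids)
    bigru_output, _ = self.gru(sequence_output)
    emission = self.fc(bigru_output)
    if labels is not None:
        return self.crf_loss(emission, seq_lens, labels)
    _, prediction = self.viterbi_decoder(emission, seq_lens)
    return prediction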
def train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")

    # Create datasets.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create samplers and dataloaders.
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(dataset=train_dataset,
                                        batch_sampler=train_sampler,
                                        return_list=True,
                                        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(dataset=test_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          drop_last=False)
    test_loader = paddle.io.DataLoader(dataset=test_dataset,
                                       batch_sampler=test_sampler,
                                       return_list=True,
                                       collate_fn=batchify_fn)

    # Define the model network and its loss.
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       train_dataset.vocab_size, train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator.
    optimizer = paddle.optimizer.Adam(learning_rate=args.base_lr,
                                      parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf)
    chunk_evaluator = ChunkEvaluator(
        label_list=train_dataset.label_vocab.keys(), suffix=True)
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training.
    callbacks = paddle.callbacks.ProgBarLogger(
        log_freq=10, verbose=3) if args.verbose else None
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              shuffle=True,
              callbacks=callbacks)
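# Hedged usage sketch: the fields train() reads from args, collected in a plain
# namespace. The values are placeholders, not the project's defaults.
from argparse import Namespace

args = Namespace(n_gpu=1, data_dir='./data', batch_size=32, emb_dim=128,
                 hidden_size=128, base_lr=1e-3, epochs=10, verbose=True,
                 init_checkpoint=None, model_save_dir='./checkpoints')
train(args)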
def do_train(args):
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_custom_data,
                            filename=os.path.join(args.data_dir, "train.txt"),
                            is_test=False,
                            lazy=False)
    dev_ds = load_dataset(read_custom_data,
                          filename=os.path.join(args.data_dir, "dev.txt"),
                          is_test=False,
                          lazy=False)
    tags_to_idx = load_dict(os.path.join(args.data_dir, "tags.txt"))

    tokenizer = ErnieCtmTokenizer.from_pretrained("wordtag")
    model = ErnieCtmWordtagModel.from_pretrained("wordtag",
                                                 num_tag=len(tags_to_idx))
    model.crf_loss = LinearChainCrfLoss(
        LinearChainCrf(len(tags_to_idx), 0.1, with_start_stop_tag=False))

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_len=args.max_seq_len,
                         tags_to_idx=tags_to_idx)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
        Pad(axis=0, pad_val=tags_to_idx["O"], dtype='int64'),  # tags
    ): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode="train",
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode="dev",
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.num_train_epochs
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    num_train_optimization_steps = len(
        train_ds) / args.batch_size * args.num_train_epochs

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    logger.info("Total steps: %s" % num_training_steps)
    logger.info("WarmUp steps: %s" % warmup)

    metric = SequenceAccuracy()

    total_loss = 0
    global_step = 0

    for epoch in range(1, args.num_train_epochs + 1):
        logger.info(f"Epoch {epoch} beginning")
        start_time = time.time()
        for total_step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, seq_len, tags = batch

            loss, _ = model(input_ids,
                            token_type_ids,
                            lengths=seq_len,
                            tag_labels=tags)
            loss = loss.mean()
            total_loss += loss
            loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            lr_scheduler.step()

            if global_step % args.logging_steps == 0 and rank == 0:
                end_time = time.time()
                speed = float(args.logging_steps) / (end_time - start_time)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, total_loss / args.logging_steps,
                       speed))
                start_time = time.time()
                total_loss = 0

            if (global_step % args.save_steps == 0
                    or global_step == num_training_steps) and rank == 0:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)

        evaluate(model, metric, dev_data_loader, tags, tags_to_idx)
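# Hedged helper sketch: a load_dict of the kind do_train() above relies on for
# tags_to_idx, mapping each line of tags.txt to an integer id. The project's
# actual helper may differ.
def load_dict(dict_path):
    vocab = {}
    with open(dict_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            vocab[line.rstrip('\n')] = idx
    return vocab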
def train(args):
    if args.use_gpu:
        place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
        paddle.set_device("gpu")
    else:
        place = paddle.CPUPlace()
        paddle.set_device("cpu")

    # Create datasets.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create samplers and dataloaders.
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_dataset,
        batch_sampler=train_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=True)
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and its loss.
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       train_dataset.vocab_size, train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator.
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf.transitions)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((train_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for START and STOP
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training.
    callback = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3)
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              drop_last=True,
              shuffle=True,
              callbacks=callback)