Example #1
    def _get_trainer(self, models_folder):
        optimizer = optim.SGD(self.parameters(),
                              lr=self.config['lr'],
                              momentum=0.9)

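        # Training callbacks: gradient value clipping, extra evaluation on the test set, and optional linear LR warmup.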
        callbacks = []
        clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
        evaluate_callback = EvaluateCallback(
            self.data_bundle.get_dataset('test'))

        if self.config['warmup_steps'] > 0:
            warmup_callback = WarmupCallback(self.config['warmup_steps'],
                                             schedule='linear')
            callbacks.append(warmup_callback)
        callbacks.extend([clip_callback, evaluate_callback])

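        # Train on 'train', validate on 'dev' with span-level F1; the dev pass uses a 5x larger batch size.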
        return Trainer(self.data_bundle.get_dataset('train'),
                       self,
                       optimizer,
                       batch_size=self.config['batch_size'],
                       sampler=BucketSampler(),
                       num_workers=2,
                       n_epochs=100,
                       dev_data=self.data_bundle.get_dataset('dev'),
                       metrics=SpanFPreRecMetric(
                           tag_vocab=self.data_bundle.get_vocab('target'),
                           encoding_type=self.config['encoding_type']),
                       dev_batch_size=self.config['batch_size'] * 5,
                       callbacks=callbacks,
                       device=self.config['device'],
                       test_use_tqdm=False,
                       use_tqdm=True,
                       print_every=300,
                       save_path=models_folder)
Example #2
              after_norm=after_norm,
              attn_type=attn_type,
              bi_embed=bi_embed,
              bert_embed=bert_embed,
              fc_dropout=fc_dropout,
              pos_embed=pos_embed,
              scale=attn_type == 'transformer')

optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

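# Callbacks: gradient value clipping, evaluation on the test set, and optional linear warmup.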
callbacks = []
clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))

if warmup_steps > 0:
    warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
    callbacks.append(warmup_callback)
callbacks.extend([clip_callback, evaluate_callback])

trainer = Trainer(data_bundle.get_dataset('train'),
                  model,
                  optimizer,
                  batch_size=batch_size,
                  sampler=BucketSampler(),
                  num_workers=2,
                  n_epochs=n_epochs,
                  dev_data=data_bundle.get_dataset('dev'),
                  metrics=SpanFPreRecMetric(
                      tag_vocab=data_bundle.get_vocab('target'),
                      encoding_type=encoding_type),
                  dev_batch_size=batch_size,
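                  callbacks=callbacks)  # assumed continuation: close the truncated call and attach the callbacks built above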
Example #3
print(f"In total {len(word2bpes)} target words")
pad_id = data_bundle.pad_id

model = ENBertReverseDict(pre_name, word2bpes, pad_id=pad_id,
                          number_word_in_train=data_bundle.number_word_in_train)

if torch.cuda.is_available():
    model.cuda()

optimizer = optim.AdamW(model.parameters(), lr=lr)

data = {}
for name in ['seen', 'unseen', 'desc']:
    data[name] = data_bundle.get_dataset(name)

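# Callbacks: gradient clipping, linear warmup, and Fitlog logging on the 'seen', 'unseen' and 'desc' sets.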
callbacks = [GradientClipCallback(clip_type='value', clip_value=5), WarmupCallback(warmup=0.01, schedule='linear')]
callbacks.append(FitlogCallback(data=data, verbose=1))
train_data = data_bundle.get_dataset('train')
train_data.add_seq_len('input')

# from collections import Counter
# print(Counter(train_data.get_field('seq_len').content))
# exit(0)

sampler = BucketSampler()
clip_max_length(train_data, data_bundle)

trainer = Trainer(train_data=train_data, model=model,
                  optimizer=optimizer, loss=CrossEntropyLoss(),
                  batch_size=batch_size, sampler=sampler, drop_last=False, update_every=1,
                  num_workers=1, n_epochs=n_epochs, print_every=5,
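                  callbacks=callbacks)  # assumed continuation of the truncated call, attaching the callbacks built above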
Example #4
    word2bpes,
    pad_id=pad_id,
    number_word_in_train=data_bundle.number_word_in_train)

if torch.cuda.is_available():
    model.cuda()

optimizer = optim.AdamW(model.parameters(), lr=lr)

data = {}
for name in ['desc', 'question', 'seen_test', 'unseen_test']:
    data[name] = data_bundle.get_dataset(name)

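# Callbacks: gradient value clipping, linear warmup, and Fitlog logging on the extra evaluation sets.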
callbacks = [
    GradientClipCallback(clip_type='value'),
    WarmupCallback(warmup=0.1, schedule='linear')
]
callbacks.append(FitlogCallback(data=data, verbose=1))
train_data = data_bundle.get_dataset('train')
train_data.add_seq_len('input')
sampler = BucketSampler()

clip_max_length(train_data, data_bundle)

trainer = Trainer(train_data=train_data,
                  model=model,
                  optimizer=optimizer,
                  loss=CrossEntropyLoss(),
                  batch_size=batch_size,
                  sampler=sampler,
                  drop_last=False,
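                  callbacks=callbacks)  # assumed continuation: the original call is truncated; at minimum the callbacks above are attached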
Example #5
def main():
    args = parse_args()

    if args.debug:
        fitlog.debug()

    fitlog.set_log_dir(args.log_dir)
    fitlog.commit(__file__)
    fitlog.add_hyper_in_file(__file__)
    fitlog.add_hyper(args)
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_set, dev_set, test_set, temp_ent_vocab = load_fewrel_graph_data(
        data_dir=args.data_dir)

    print('data directory: {}'.format(args.data_dir))
    print('# of train samples: {}'.format(len(train_set)))
    print('# of dev samples: {}'.format(len(dev_set)))
    print('# of test samples: {}'.format(len(test_set)))

    ent_vocab, rel_vocab = load_ent_rel_vocabs(path='../')

    # load entity embeddings
    ent_index = []
    for k, v in temp_ent_vocab.items():
        ent_index.append(ent_vocab[k])
    ent_index = torch.tensor(ent_index)
    ent_emb = np.load(os.path.join(args.model_path, 'entities.npy'))
    ent_embedding = nn.Embedding.from_pretrained(torch.from_numpy(ent_emb))
    ent_emb = ent_embedding(ent_index.view(1, -1)).squeeze().detach()

    # load CoLAKE parameters
    config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
    model = CoLAKEForRE(config,
                        num_types=len(train_set.label_vocab),
                        ent_emb=ent_emb)
    states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
    model.load_state_dict(states_dict, strict=False)
    print('parameters below are randomly initialized:')
    for name, param in model.named_parameters():
        if name not in states_dict:
            print(name)

    # tie relation classification head
    rel_index = []
    for k, v in train_set.label_vocab.items():
        rel_index.append(rel_vocab[k])
    rel_index = torch.LongTensor(rel_index)
    rel_embeddings = nn.Embedding.from_pretrained(
        states_dict['rel_embeddings.weight'])
    rel_index = rel_index.cuda()
    rel_cls_weight = rel_embeddings(rel_index.view(1, -1)).squeeze()
    model.tie_rel_weights(rel_cls_weight)

    model.rel_head.dense.weight.data = states_dict['rel_lm_head.dense.weight']
    model.rel_head.dense.bias.data = states_dict['rel_lm_head.dense.bias']
    model.rel_head.layer_norm.weight.data = states_dict[
        'rel_lm_head.layer_norm.weight']
    model.rel_head.layer_norm.bias.data = states_dict[
        'rel_lm_head.layer_norm.bias']

    model.resize_token_embeddings(
        len(RobertaTokenizer.from_pretrained('roberta-base')) + 4)
    print('parameters of CoLAKE have been loaded.')

    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'embedding']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)

    metrics = [MacroMetric(pred='pred', target='target')]

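    # Standalone Tester over the test set; FitlogCallback runs it during training.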
    test_data_iter = TorchLoaderIter(dataset=test_set,
                                     batch_size=args.batch_size,
                                     sampler=RandomSampler(),
                                     num_workers=4,
                                     collate_fn=test_set.collate_fn)
    devices = list(range(torch.cuda.device_count()))
    tester = Tester(data=test_data_iter,
                    model=model,
                    metrics=metrics,
                    device=devices)
    # tester.test()

    fitlog_callback = FitlogCallback(tester=tester,
                                     log_loss_every=100,
                                     verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

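    # Split the batch for gradient accumulation: the effective batch size stays args.batch_size.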
    bsz = args.batch_size // args.grad_accumulation

    train_data_iter = TorchLoaderIter(dataset=train_set,
                                      batch_size=bsz,
                                      sampler=RandomSampler(),
                                      num_workers=4,
                                      collate_fn=train_set.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_set,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=4,
                                    collate_fn=dev_set.collate_fn)

    trainer = Trainer(
        train_data=train_data_iter,
        dev_data=dev_data_iter,
        model=model,
        optimizer=optimizer,
        loss=LossInForward(),
        batch_size=bsz,
        update_every=args.grad_accumulation,
        n_epochs=args.epoch,
        metrics=metrics,
        callbacks=[fitlog_callback, gradient_clip_callback, warmup_callback],
        device=devices,
        use_tqdm=True)

    trainer.train(load_best_model=False)
Example #6
data, bert_embed = get_data()

print(data)
model = BertParser(embed=bert_embed, num_label=len(data.get_vocab('char_labels')), arc_mlp_size=arc_mlp_size,
                   label_mlp_size=label_mlp_size, dropout=dropout,
                   use_greedy_infer=False,
                   app_index=0)

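# Both metrics are built from the index of the 'APP' label: parsing F1 and word-segmentation F1.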
metric1 = SegAppCharParseF1Metric(data.get_vocab('char_labels')['APP'])
metric2 = CWSMetric(data.get_vocab('char_labels')['APP'])
metrics = [metric1, metric2]

optimizer = optim.AdamW([param for param in model.parameters() if param.requires_grad], lr=lr,
                        weight_decay=1e-2)

sampler = BucketSampler(seq_len_field_name='seq_lens')
callbacks = []

warmup_callback = WarmupCallback(schedule='linear')

callbacks.append(warmup_callback)
callbacks.append(GradientClipCallback(clip_type='value', clip_value=5))
callbacks.append(EvaluateCallback(data.get_dataset('test')))

trainer = Trainer(data.datasets['train'], model, loss=None, metrics=metrics, n_epochs=n_epochs, batch_size=batch_size,
                  print_every=3,
                  validate_every=-1, dev_data=data.datasets['dev'], save_path=None, optimizer=optimizer,
                  check_code_level=0, metric_key='u_f1', sampler=sampler, num_workers=2, use_tqdm=True,
                  device=device, callbacks=callbacks, update_every=update_every, dev_batch_size=6)
trainer.train(load_best_model=False)
Example #7
        "weight_decay":
        1e-2,
    },
    {
        "params": [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        0.0,
    },
]
optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.lr)

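# Callbacks: linear LR warmup plus Fitlog logging.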
callbacks = []
callbacks.append(WarmupCallback(0.01, "linear"))
callbacks.append(FitlogCallback(
    # data_bundle.get_dataset('train')
))

import torch
import torch.nn.functional as F
from fastNLP import LossBase


class SmoothLoss(LossBase):
    def __init__(self, smooth_eps=0):
        super().__init__()
        self.smooth_eps = smooth_eps

    def get_loss(self, pred, target):
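        # Assumed sketch (the original body is truncated here): label-smoothed
        # cross-entropy; with smooth_eps == 0 it reduces to plain cross-entropy.
        if self.smooth_eps == 0:
            return F.cross_entropy(pred, target)
        log_probs = F.log_softmax(pred, dim=-1)
        nll = -log_probs.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
        smooth = -log_probs.mean(dim=-1)
        return ((1 - self.smooth_eps) * nll + self.smooth_eps * smooth).mean()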
Example #8
def main():
    if args.do_eval:
        torch.multiprocessing.set_start_method('spawn', force=True)

    if args.model == 'bert':

        model = BertCRF(embed, [data_bundle.get_vocab('target')],
                        encoding_type='bioes')

    else:
        model = StackedTransformersCRF(
            tag_vocabs=[data_bundle.get_vocab('target')],
            embed=embed,
            num_layers=num_layers,
            d_model=d_model,
            n_head=n_heads,
            feedforward_dim=dim_feedforward,
            dropout=trans_dropout,
            after_norm=after_norm,
            attn_type=attn_type,
            bi_embed=None,
            fc_dropout=fc_dropout,
            pos_embed=pos_embed,
            scale=attn_type == 'transformer')
        model = torch.nn.DataParallel(model)

    if args.do_eval:
        if os.path.exists(os.path.expanduser(args.saved_model)):
            print("Load checkpoint from {}".format(
                os.path.expanduser(args.saved_model)))
            model = torch.load(args.saved_model)
            model.to('cuda')
            print('model to CUDA')

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)

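    # Callbacks: gradient clipping, checkpointing (with fitlog recovery), test-set evaluation, optional warmup.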
    callbacks = []
    clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
    evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))
    checkpoint_callback = CheckPointCallback(os.path.join(
        directory, 'model.pth'),
                                             delete_when_train_finish=False,
                                             recovery_fitlog=True)

    if warmup_steps > 0:
        warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
        callbacks.append(warmup_callback)
    callbacks.extend([clip_callback, checkpoint_callback, evaluate_callback])

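    # Training branch: fit the model, then write dev and test predictions; with do_eval set, only predict (else branch below).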
    if not args.do_eval:
        trainer = Trainer(data_bundle.get_dataset('train'),
                          model,
                          optimizer,
                          batch_size=batch_size,
                          sampler=BucketSampler(),
                          num_workers=no_cpu,
                          n_epochs=args.n_epochs,
                          dev_data=data_bundle.get_dataset('dev'),
                          metrics=SpanFPreRecMetric(
                              tag_vocab=data_bundle.get_vocab('target'),
                              encoding_type=encoding_type),
                          dev_batch_size=batch_size,
                          callbacks=callbacks,
                          device=args.device,
                          test_use_tqdm=True,
                          use_tqdm=True,
                          print_every=300,
                          save_path=os.path.join(directory, 'best'))

        trainer.train(load_best_model=True)

        predictor = Predictor(model)
        predict(os.path.join(directory, 'predictions_dev.tsv'), data_bundle,
                predictor, 'dev')
        predict(os.path.join(directory, 'predictions_test.tsv'), data_bundle,
                predictor, 'test')

    else:
        print('Predicting')
        # predictions of multiple files
        torch.multiprocessing.freeze_support()
        model.share_memory()
        predictor = Predictor(model)

        if len(files) > multiprocessing.cpu_count():
            with torch.multiprocessing.Pool(processes=no_cpu) as p:
                with tqdm(total=len(files)) as pbar:
                    for i, _ in enumerate(
                            p.imap_unordered(
                                partial(predict,
                                        data_bundle=data_bundle,
                                        predictor=predictor,
                                        predict_on='train',
                                        do_eval=args.do_eval), files)):
                        pbar.update()
        else:
            for file in tqdm(files):
                predict(file, data_bundle, predictor, 'train', args.do_eval)