Example #1
    def _get_trainer(self, models_folder):
        optimizer = optim.SGD(self.parameters(),
                              lr=self.config['lr'],
                              momentum=0.9)

        callbacks = []
        clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
        evaluate_callback = EvaluateCallback(
            self.data_bundle.get_dataset('test'))

        if self.config['warmup_steps'] > 0:
            warmup_callback = WarmupCallback(self.config['warmup_steps'],
                                             schedule='linear')
            callbacks.append(warmup_callback)
        callbacks.extend([clip_callback, evaluate_callback])

        return Trainer(self.data_bundle.get_dataset('train'),
                       self,
                       optimizer,
                       batch_size=self.config['batch_size'],
                       sampler=BucketSampler(),
                       num_workers=2,
                       n_epochs=100,
                       dev_data=self.data_bundle.get_dataset('dev'),
                       metrics=SpanFPreRecMetric(
                           tag_vocab=self.data_bundle.get_vocab('target'),
                           encoding_type=self.config['encoding_type']),
                       dev_batch_size=self.config['batch_size'] * 5,
                       callbacks=callbacks,
                       device=self.config['device'],
                       test_use_tqdm=False,
                       use_tqdm=True,
                       print_every=300,
                       save_path=models_folder)
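
A brief usage sketch (an assumption, not part of the original example): _get_trainer returns a configured fastNLP Trainer, so the enclosing class would typically build it and call train(); the method name fit and the folder path are hypothetical.

    def fit(self, models_folder='checkpoints/'):
        # Hypothetical wrapper: builds the Trainer configured above and runs
        # training, restoring the best model according to the dev metric.
        trainer = self._get_trainer(models_folder)
        return trainer.train(load_best_model=True)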
Example #2
              d_model=d_model,
              n_head=n_heads,
              feedforward_dim=dim_feedforward,
              dropout=dropout,
              after_norm=after_norm,
              attn_type=attn_type,
              bi_embed=bi_embed,
              bert_embed=bert_embed,
              fc_dropout=fc_dropout,
              pos_embed=pos_embed,
              scale=attn_type == 'transformer')

optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

callbacks = []
clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))

if warmup_steps > 0:
    warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
    callbacks.append(warmup_callback)
callbacks.extend([clip_callback, evaluate_callback])

trainer = Trainer(data_bundle.get_dataset('train'),
                  model,
                  optimizer,
                  batch_size=batch_size,
                  sampler=BucketSampler(),
                  num_workers=2,
                  n_epochs=n_epochs,
                  dev_data=data_bundle.get_dataset('dev'),
Example #3

data, char_embed, word_embed = cache()

print(data)

embed = StackEmbedding([word_embed, char_embed])
model = CNNBiLSTMCRF(embed,
                     hidden_size=1200,
                     num_layers=1,
                     tag_vocab=data.vocabs[Const.TARGET],
                     encoding_type=encoding_type,
                     dropout=dropout)

callbacks = [
    GradientClipCallback(clip_value=5, clip_type='value'),
    EvaluateCallback(data.datasets['test'])
]

optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)
scheduler = LRScheduler(
    LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
callbacks.append(scheduler)

trainer = Trainer(train_data=data.get_dataset('train'),
                  model=model,
                  optimizer=optimizer,
                  sampler=BucketSampler(num_buckets=100),
                  device=0,
                  dev_data=data.get_dataset('dev'),
                  batch_size=batch_size,
Example #4
print(f"In total {len(word2bpes)} target words")
pad_id = data_bundle.pad_id

model = ENBertReverseDict(pre_name, word2bpes, pad_id=pad_id,
                          number_word_in_train=data_bundle.number_word_in_train)

if torch.cuda.is_available():
    model.cuda()

optimizer = optim.AdamW(model.parameters(), lr=lr)

data = {}
for name in ['seen', 'unseen', 'desc']:
    data[name] = data_bundle.get_dataset(name)

callbacks = [GradientClipCallback(clip_type='value', clip_value=5),
             WarmupCallback(warmup=0.01, schedule='linear')]
callbacks.append(FitlogCallback(data=data, verbose=1))
train_data = data_bundle.get_dataset('train')
train_data.add_seq_len('input')

# from collections import Counter
# print(Counter(train_data.get_field('seq_len').content))
# exit(0)

sampler = BucketSampler()
clip_max_length(train_data, data_bundle)

trainer = Trainer(train_data=train_data, model=model,
                  optimizer=optimizer, loss=CrossEntropyLoss(),
                  batch_size=batch_size, sampler=sampler, drop_last=False, update_every=1,
                  num_workers=1, n_epochs=n_epochs, print_every=5,
Example #5
optimizer = optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=lr,
    weight_decay=weight_decay,
    betas=[0.9, 0.9])

sampler = BucketSampler(seq_len_field_name='seq_lens')
callbacks = []
# scheduler = LambdaLR(optimizer, lr_lambda=lambda step:(0.75)**(step//5000))
scheduler = StepLR(optimizer, step_size=18, gamma=0.75)
# optim_callback = OptimizerCallback(optimizer, scheduler, update_every)
# callbacks.append(optim_callback)
scheduler_callback = LRScheduler(scheduler)
callbacks.append(scheduler_callback)
callbacks.append(GradientClipCallback(clip_type='value', clip_value=5))

tester = Tester(data=data.datasets['test'],
                model=model,
                metrics=metrics,
                batch_size=64,
                device=device,
                verbose=0)
dev_callback = DevCallback(tester)
callbacks.append(dev_callback)

trainer = Trainer(data.datasets['train'],
                  model,
                  loss=None,
                  metrics=metrics,
                  n_epochs=n_epochs,
Example #6
######### hyperparameters
device = 0

cache_fp = 'caches/{}.pkl'.format(data_name)
@cache_results(_cache_fp=cache_fp, _refresh=True)   # cache the result to cache_fp so later runs can load it directly instead of recomputing
def prepare_data():
    data_bundle = CWSShiftRelayPipe(dataset_name=data_name, L=L).process_from_file()
    # pretrained character embeddings and bigram embeddings
    char_embed = StaticEmbedding(data_bundle.get_vocab('chars'), dropout=0.5, word_dropout=0.01,
                                 model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt')
    bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), dropout=0.5, min_freq=3, word_dropout=0.01,
                                   model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt')

    return data_bundle, char_embed, bigram_embed

data, char_embed, bigram_embed = prepare_data()

model = ShiftRelayCWSModel(char_embed=char_embed, bigram_embed=bigram_embed,
                           hidden_size=hidden_size, num_layers=num_layers, drop_p=drop_p, L=L)

sampler = BucketSampler()
optimizer = Adam(model.parameters(), lr=lr)
clipper = GradientClipCallback(clip_value=5, clip_type='value')  # clip overly large gradients
evaluator = EvaluateCallback(data.get_dataset('test'))  # additionally evaluate on the test set
callbacks = [clipper, evaluator]

trainer = Trainer(data.get_dataset('train'), model, optimizer=optimizer, loss=None, batch_size=128, sampler=sampler,
                  update_every=1, n_epochs=10, print_every=5, dev_data=data.get_dataset('dev'), metrics=RelayMetric(),
                  metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks,
                  check_code_level=0, num_workers=1)
trainer.train()
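
A follow-up sketch (an assumption, not in the original script): after training, the same model could be scored on the test split with fastNLP's Tester, reusing RelayMetric and the same device.

from fastNLP import Tester  # assumed import location

tester = Tester(data=data.get_dataset('test'), model=model, metrics=RelayMetric(),
                batch_size=128, device=device, use_tqdm=True)
tester.test()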
Example #7
def train_mlt_single(args):
    global logger
    logger.info(args)
    task_lst, vocabs = utils.get_data(args.data_path)
    task_db = task_lst[args.task_id]
    train_data = task_db.train_set
    dev_data = task_db.dev_set
    test_data = task_db.test_set
    task_name = task_db.task_name

    if args.debug:
        train_data = train_data[:200]
        dev_data = dev_data[:200]
        test_data = test_data[:200]
        args.epochs = 3
        args.pruning_iter = 3

    summary_writer = SummaryWriter(
        log_dir=os.path.join(args.tb_path, "global/%s" % task_name)
    )

    logger.info("task name: {}, task id: {}".format(task_db.task_name, task_db.task_id))
    logger.info(
        "train len {}, dev len {}, test len {}".format(
            len(train_data), len(dev_data), len(test_data)
        )
    )

    # init model
    model = get_model(args, task_lst, vocabs)

    logger.info("model: \n{}".format(model))
    if args.init_weights is not None:
        utils.load_model(model, args.init_weights)

    if utils.need_acc(task_name):
        metrics = [AccuracyMetric(target="y"), MetricInForward(val_name="loss")]
        metric_key = "acc"

    else:
        metrics = [
            YangJieSpanMetric(
                tag_vocab=vocabs[task_name],
                pred="pred",
                target="y",
                seq_len="seq_len",
                encoding_type="bioes" if task_name == "ner" else "bio",
            ),
            MetricInForward(val_name="loss"),
        ]
        metric_key = "f"
    logger.info(metrics)

    need_cut_names = list(set([s.strip() for s in args.need_cut.split(",")]))
    prune_names = []
    for name, p in model.named_parameters():
        if not p.requires_grad or "bias" in name:
            continue
        for n in need_cut_names:
            if n in name:
                prune_names.append(name)
                break

    # get Pruning class
    pruner = Pruning(
        model, prune_names, final_rate=args.final_rate, pruning_iter=args.pruning_iter
    )
    if args.init_masks is not None:
        pruner.load(args.init_masks)
        pruner.apply_mask(pruner.remain_mask, pruner._model)
    # save checkpoint
    os.makedirs(args.save_path, exist_ok=True)

    logger.info('Saving init-weights to {}'.format(args.save_path))
    torch.save(
        model.cpu().state_dict(), os.path.join(args.save_path, "init_weights.th")
    )
    torch.save(args, os.path.join(args.save_path, "args.th"))
    # start training and pruning
    summary_writer.add_scalar("remain_rate", 100.0, 0)
    summary_writer.add_scalar("cutoff", 0.0, 0)

    if args.init_weights is not None:
        init_tester = Tester(
            test_data,
            model,
            metrics=metrics,
            batch_size=args.batch_size,
            num_workers=4,
            device="cuda",
            use_tqdm=False,
        )
        res = init_tester.test()
        logger.info("No init testing, Result: {}".format(res))
        del res, init_tester

    for prune_step in range(pruner.pruning_iter + 1):
        # reset optimizer every time
        optim_params = [p for p in model.parameters() if p.requires_grad]
        # utils.get_logger(__name__).debug(optim_params)
        utils.get_logger(__name__).debug(len(optim_params))
        optimizer = get_optim(args.optim, optim_params)
        # optimizer = TriOptim(optimizer, args.n_filters, args.warmup, args.decay)
        factor = pruner.cur_rate / 100.0
        factor = 1.0
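        # NOTE: the cur_rate-based scaling computed above is immediately
        # overridden to 1.0, so the learning rate is effectively left unscaled.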
        # print(factor, pruner.cur_rate)
        for pg in optimizer.param_groups:
            pg["lr"] = factor * pg["lr"]
        utils.get_logger(__name__).info(optimizer)

        trainer = Trainer(
            train_data,
            model,
            loss=LossInForward(),
            optimizer=optimizer,
            metric_key=metric_key,
            metrics=metrics,
            print_every=200,
            batch_size=args.batch_size,
            num_workers=4,
            n_epochs=args.epochs,
            dev_data=dev_data,
            save_path=None,
            sampler=fastNLP.BucketSampler(batch_size=args.batch_size),
            callbacks=[
                pruner,
                # LRStep(lstm.WarmupLinearSchedule(optimizer, args.warmup, int(len(train_data)/args.batch_size*args.epochs)))
                GradientClipCallback(clip_type="norm", clip_value=5),
                LRScheduler(
                    lr_scheduler=LambdaLR(optimizer, lambda ep: 1 / (1 + 0.05 * ep))
                ),
                LogCallback(path=os.path.join(args.tb_path, "No", str(prune_step))),
            ],
            use_tqdm=False,
            device="cuda",
            check_code_level=-1,
        )
        res = trainer.train()
        logger.info("No #{} training, Result: {}".format(pruner.prune_times, res))
        name, val = get_metric(res)
        summary_writer.add_scalar("pruning_dev_acc", val, prune_step)
        tester = Tester(
            test_data,
            model,
            metrics=metrics,
            batch_size=args.batch_size,
            num_workers=4,
            device="cuda",
            use_tqdm=False,
        )
        res = tester.test()
        logger.info("No #{} testing, Result: {}".format(pruner.prune_times, res))
        name, val = get_metric(res)
        summary_writer.add_scalar("pruning_test_acc", val, prune_step)

        # prune and save
        torch.save(
            model.state_dict(),
            os.path.join(
                args.save_path,
                "best_{}_{}.th".format(pruner.prune_times, pruner.cur_rate),
            ),
        )
        pruner.pruning_model()
        summary_writer.add_scalar("remain_rate", pruner.cur_rate, prune_step + 1)
        summary_writer.add_scalar("cutoff", pruner.last_cutoff, prune_step + 1)

        pruner.save(
            os.path.join(
                args.save_path, "{}_{}.th".format(pruner.prune_times, pruner.cur_rate)
            )
        )
Example #8
def train():
    args = parse_args()
    if args.debug:
        fitlog.debug()
        args.save_model = False
    # ================= define =================
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    word_mask_index = tokenizer.mask_token_id
    word_vocab_size = len(tokenizer)

    if get_local_rank() == 0:
        fitlog.set_log_dir(args.log_dir)
        fitlog.commit(__file__, fit_msg=args.name)
        fitlog.add_hyper_in_file(__file__)
        fitlog.add_hyper(args)

    # ================= load data =================
    dist.init_process_group('nccl')
    init_logger_dist()

    n_proc = dist.get_world_size()
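    # per-process micro-batch size: the global batch size split across
    # gradient-accumulation steps and distributed processes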
    bsz = args.batch_size // args.grad_accumulation // n_proc
    args.local_rank = get_local_rank()
    args.save_dir = os.path.join(args.save_dir,
                                 args.name) if args.save_model else None
    if args.save_dir is not None and os.path.exists(args.save_dir):
        raise RuntimeError('save_dir already exists.')
    logger.info('save directory: {}'.format(
        'None' if args.save_dir is None else args.save_dir))
    devices = list(range(torch.cuda.device_count()))
    NUM_WORKERS = 4

    ent_vocab, rel_vocab = load_ent_rel_vocabs()
    logger.info('# entities: {}'.format(len(ent_vocab)))
    logger.info('# relations: {}'.format(len(rel_vocab)))
    ent_freq = get_ent_freq()
    assert len(ent_vocab) == len(ent_freq), '{} {}'.format(
        len(ent_vocab), len(ent_freq))

    # collect the highest-numbered file in each subdirectory; these files are excluded from training below
    root = args.data_dir
    dirs = os.listdir(root)
    drop_files = []
    for dir in dirs:
        path = os.path.join(root, dir)
        max_idx = 0
        for file_name in os.listdir(path):
            if 'large' in file_name:
                continue
            max_idx = int(file_name) if int(file_name) > max_idx else max_idx
        drop_files.append(os.path.join(path, str(max_idx)))
    #####

    file_list = []
    for path, _, filenames in os.walk(args.data_dir):
        for filename in filenames:
            file = os.path.join(path, filename)
            if 'large' in file or file in drop_files:
                continue
            file_list.append(file)
    logger.info('used {} files in {}.'.format(len(file_list), args.data_dir))
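    # data_prop > 1 is treated as an absolute number of files, otherwise as a fraction of all files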
    if args.data_prop > 1:
        used_files = file_list[:int(args.data_prop)]
    else:
        used_files = file_list[:round(args.data_prop * len(file_list))]

    data = GraphOTFDataSet(used_files, n_proc, args.local_rank,
                           word_mask_index, word_vocab_size, args.n_negs,
                           ent_vocab, rel_vocab, ent_freq)
    dev_data = GraphDataSet(used_files[0], word_mask_index, word_vocab_size,
                            args.n_negs, ent_vocab, rel_vocab, ent_freq)

    sampler = OTFDistributedSampler(used_files, n_proc, get_local_rank())
    train_data_iter = TorchLoaderIter(dataset=data,
                                      batch_size=bsz,
                                      sampler=sampler,
                                      num_workers=NUM_WORKERS,
                                      collate_fn=data.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_data,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=dev_data.collate_fn)
    if args.test_data is not None:
        test_data = FewRelDevDataSet(path=args.test_data,
                                     label_vocab=rel_vocab,
                                     ent_vocab=ent_vocab)
        test_data_iter = TorchLoaderIter(dataset=test_data,
                                         batch_size=32,
                                         sampler=RandomSampler(),
                                         num_workers=NUM_WORKERS,
                                         collate_fn=test_data.collate_fn)

    if args.local_rank == 0:
        print('full wiki files: {}'.format(len(file_list)))
        print('used wiki files: {}'.format(len(used_files)))
        print('# of trained samples: {}'.format(len(data) * n_proc))
        print('# of trained entities: {}'.format(len(ent_vocab)))
        print('# of trained relations: {}'.format(len(rel_vocab)))

    # ================= prepare model =================
    logger.info('model init')
    if args.rel_emb is not None:  # load pretrained relation embeddings
        rel_emb = np.load(args.rel_emb)
        # add_embs = np.random.randn(3, rel_emb.shape[1])  # add <pad>, <mask>, <unk>
        # rel_emb = np.r_[add_embs, rel_emb]
        rel_emb = torch.from_numpy(rel_emb).float()
        assert rel_emb.shape[0] == len(rel_vocab), '{} {}'.format(
            rel_emb.shape[0], len(rel_vocab))
        # assert rel_emb.shape[1] == args.rel_dim
        logger.info('loaded pretrained relation embeddings. dim: {}'.format(
            rel_emb.shape[1]))
    else:
        rel_emb = None
    if args.model_name is not None:
        logger.info('further pre-train.')
        config = RobertaConfig.from_pretrained('roberta-base',
                                               type_vocab_size=3)
        model = CoLAKE(config=config,
                       num_ent=len(ent_vocab),
                       num_rel=len(rel_vocab),
                       ent_dim=args.ent_dim,
                       rel_dim=args.rel_dim,
                       ent_lr=args.ent_lr,
                       ip_config=args.ip_config,
                       rel_emb=None,
                       emb_name=args.emb_name)
        states_dict = torch.load(args.model_name)
        model.load_state_dict(states_dict, strict=True)
    else:
        model = CoLAKE.from_pretrained(
            'roberta-base',
            num_ent=len(ent_vocab),
            num_rel=len(rel_vocab),
            ent_lr=args.ent_lr,
            ip_config=args.ip_config,
            rel_emb=rel_emb,
            emb_name=args.emb_name,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
            'dist_{}'.format(args.local_rank))
        model.extend_type_embedding(token_type=3)
    # if args.local_rank == 0:
    #     for name, param in model.named_parameters():
    #         if param.requires_grad is True:
    #             print('{}: {}'.format(name, param.shape))

    # ================= train model =================
    # lr=1e-4 for peak value, lr=5e-5 for initial value
    logger.info('trainer init')
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias',
        'layer_norm.weight'
    ]
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    word_acc = WordMLMAccuracy(pred='word_pred',
                               target='masked_lm_labels',
                               seq_len='word_seq_len')
    ent_acc = EntityMLMAccuracy(pred='entity_pred',
                                target='ent_masked_lm_labels',
                                seq_len='ent_seq_len')
    rel_acc = RelationMLMAccuracy(pred='relation_pred',
                                  target='rel_masked_lm_labels',
                                  seq_len='rel_seq_len')
    metrics = [word_acc, ent_acc, rel_acc]

    if args.test_data is not None:
        test_metric = [rel_acc]
        tester = Tester(data=test_data_iter,
                        model=model,
                        metrics=test_metric,
                        device=list(range(torch.cuda.device_count())))
        # tester.test()
    else:
        tester = None

    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)
    # warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')
    fitlog_callback = MyFitlogCallback(tester=tester,
                                       log_loss_every=100,
                                       verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    emb_callback = EmbUpdateCallback(model.ent_embeddings)
    all_callbacks = [gradient_clip_callback, emb_callback]
    if args.save_dir is None:
        master_callbacks = [fitlog_callback]
    else:
        save_callback = SaveModelCallback(args.save_dir,
                                          model.ent_embeddings,
                                          only_params=True)
        master_callbacks = [fitlog_callback, save_callback]

    if args.do_test:
        states_dict = torch.load(os.path.join(args.save_dir,
                                              args.model_name)).state_dict()
        model.load_state_dict(states_dict)
        data_iter = TorchLoaderIter(dataset=data,
                                    batch_size=args.batch_size,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=data.collate_fn)
        tester = Tester(data=data_iter,
                        model=model,
                        metrics=metrics,
                        device=devices)
        tester.test()
    else:
        trainer = DistTrainer(train_data=train_data_iter,
                              dev_data=dev_data_iter,
                              model=model,
                              optimizer=optimizer,
                              loss=LossInForward(),
                              batch_size_per_gpu=bsz,
                              update_every=args.grad_accumulation,
                              n_epochs=args.epoch,
                              metrics=metrics,
                              callbacks_master=master_callbacks,
                              callbacks_all=all_callbacks,
                              validate_every=5000,
                              use_tqdm=True,
                              fp16='O1' if args.fp16 else '')
        trainer.train(load_best_model=False)
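
A launch sketch (the file name and GPU count below are placeholders, not from the source): because train() calls dist.init_process_group('nccl') and reads the local rank, the script is normally started through PyTorch's distributed launcher rather than run directly.

# Hypothetical launch command:
#   python -m torch.distributed.launch --nproc_per_node=4 pretrain_colake.py --name my_run
if __name__ == '__main__':
    train()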
Example #9
              num_cls=len(data.vocabs[Const.TARGET]),
              repeats=ops.repeats,
              num_layers=ops.num_layers,
              num_filters=ops.num_filters,
              kernel_size=3,
              use_crf=ops.use_crf,
              use_projection=True,
              block_loss=True,
              input_dropout=0.5,
              hidden_dropout=0.2,
              inner_dropout=0.2)

print(model)

callbacks = [
    GradientClipCallback(clip_value=ops.gradient_clip, clip_type='value'),
]
metrics = []
metrics.append(
    SpanFPreRecMetric(
        tag_vocab=data.vocabs[Const.TARGET],
        encoding_type=encoding_type,
        pred=Const.OUTPUT,
        target=Const.TARGET,
        seq_len=Const.INPUT_LEN,
    ))


class LossMetric(MetricBase):
    def __init__(self, loss=None):
        super(LossMetric, self).__init__()
Example #10

data = load_data()
print(data)

embed = BertEmbedding(data.get_vocab(Const.INPUT),
                      model_dir_or_name='en-base-cased',
                      pool_method='max',
                      requires_grad=True,
                      layers='11',
                      include_cls_sep=False,
                      dropout=0.5,
                      word_dropout=0.01)

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear'),
    EvaluateCallback(data.get_dataset('test'))
]

model = BertCRF(embed,
                tag_vocab=data.get_vocab('target'),
                encoding_type=encoding_type)
optimizer = AdamW(model.parameters(), lr=2e-5)

trainer = Trainer(train_data=data.datasets['train'],
                  model=model,
                  optimizer=optimizer,
                  sampler=BucketSampler(),
                  device=0,
                  dev_data=data.datasets['dev'],
Example #11
def main():
    args = parse_args()

    if args.debug:
        fitlog.debug()

    fitlog.set_log_dir(args.log_dir)
    fitlog.commit(__file__)
    fitlog.add_hyper_in_file(__file__)
    fitlog.add_hyper(args)
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_set, dev_set, test_set, temp_ent_vocab = load_fewrel_graph_data(
        data_dir=args.data_dir)

    print('data directory: {}'.format(args.data_dir))
    print('# of train samples: {}'.format(len(train_set)))
    print('# of dev samples: {}'.format(len(dev_set)))
    print('# of test samples: {}'.format(len(test_set)))

    ent_vocab, rel_vocab = load_ent_rel_vocabs(path='../')

    # load entity embeddings
    ent_index = []
    for k, v in temp_ent_vocab.items():
        ent_index.append(ent_vocab[k])
    ent_index = torch.tensor(ent_index)
    ent_emb = np.load(os.path.join(args.model_path, 'entities.npy'))
    ent_embedding = nn.Embedding.from_pretrained(torch.from_numpy(ent_emb))
    ent_emb = ent_embedding(ent_index.view(1, -1)).squeeze().detach()

    # load CoLAKE parameters
    config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
    model = CoLAKEForRE(config,
                        num_types=len(train_set.label_vocab),
                        ent_emb=ent_emb)
    states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
    model.load_state_dict(states_dict, strict=False)
    print('parameters below are randomly initialized:')
    for name, param in model.named_parameters():
        if name not in states_dict:
            print(name)

    # tie relation classification head
    rel_index = []
    for k, v in train_set.label_vocab.items():
        rel_index.append(rel_vocab[k])
    rel_index = torch.LongTensor(rel_index)
    rel_embeddings = nn.Embedding.from_pretrained(
        states_dict['rel_embeddings.weight'])
    rel_index = rel_index.cuda()
    rel_cls_weight = rel_embeddings(rel_index.view(1, -1)).squeeze()
    model.tie_rel_weights(rel_cls_weight)

    model.rel_head.dense.weight.data = states_dict['rel_lm_head.dense.weight']
    model.rel_head.dense.bias.data = states_dict['rel_lm_head.dense.bias']
    model.rel_head.layer_norm.weight.data = states_dict[
        'rel_lm_head.layer_norm.weight']
    model.rel_head.layer_norm.bias.data = states_dict[
        'rel_lm_head.layer_norm.bias']

    model.resize_token_embeddings(
        len(RobertaTokenizer.from_pretrained('roberta-base')) + 4)
    print('parameters of CoLAKE have been loaded.')

    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'embedding']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)

    metrics = [MacroMetric(pred='pred', target='target')]

    test_data_iter = TorchLoaderIter(dataset=test_set,
                                     batch_size=args.batch_size,
                                     sampler=RandomSampler(),
                                     num_workers=4,
                                     collate_fn=test_set.collate_fn)
    devices = list(range(torch.cuda.device_count()))
    tester = Tester(data=test_data_iter,
                    model=model,
                    metrics=metrics,
                    device=devices)
    # tester.test()

    fitlog_callback = FitlogCallback(tester=tester,
                                     log_loss_every=100,
                                     verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

    bsz = args.batch_size // args.grad_accumulation

    train_data_iter = TorchLoaderIter(dataset=train_set,
                                      batch_size=bsz,
                                      sampler=RandomSampler(),
                                      num_workers=4,
                                      collate_fn=train_set.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_set,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=4,
                                    collate_fn=dev_set.collate_fn)

    trainer = Trainer(
        train_data=train_data_iter,
        dev_data=dev_data_iter,
        model=model,
        optimizer=optimizer,
        loss=LossInForward(),
        batch_size=bsz,
        update_every=args.grad_accumulation,
        n_epochs=args.epoch,
        metrics=metrics,
        callbacks=[fitlog_callback, gradient_clip_callback, warmup_callback],
        device=devices,
        use_tqdm=True)

    trainer.train(load_best_model=False)
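
An entry-point sketch (an assumption; the excerpt ends with the body of main): the fine-tuning script would normally invoke main() under the standard guard.

if __name__ == '__main__':
    main()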
Example #12
from fastNLP.core.losses import CMRC2018Loss
from fastNLP.core.metrics import CMRC2018Metric
from fastNLP.io.pipe.qa import CMRC2018BertPipe
from fastNLP import Trainer, BucketSampler
from fastNLP import WarmupCallback, GradientClipCallback
from fastNLP.core.optimizer import AdamW


data_bundle = CMRC2018BertPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

print(data_bundle)

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn',
                      requires_grad=True, include_cls_sep=False, auto_truncate=True,
                      dropout=0.5, word_dropout=0.01)
model = BertForQuestionAnswering(embed)
loss = CMRC2018Loss()
metric = CMRC2018Metric()

wm_callback = WarmupCallback(schedule='linear')
gc_callback = GradientClipCallback(clip_value=1, clip_type='norm')
callbacks = [wm_callback, gc_callback]

optimizer = AdamW(model.parameters(), lr=5e-5)

trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
                  sampler=BucketSampler(seq_len_field_name='context_len'),
                  dev_data=data_bundle.get_dataset('dev'), metrics=metric,
                  callbacks=callbacks, device=0, batch_size=6, num_workers=2, n_epochs=2, print_every=1,
                  test_use_tqdm=False, update_every=10)
trainer.train(load_best_model=False)
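
A follow-up sketch (an assumption, not in the original script): since load_best_model=False is passed, a separate Tester run on the dev set reports the CMRC2018 scores of the final checkpoint.

from fastNLP import Tester  # assumed import location

tester = Tester(data=data_bundle.get_dataset('dev'), model=model, metrics=metric,
                batch_size=6, device=0, use_tqdm=False)
tester.test()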
Example #13
def main():
    if args.do_eval:
        torch.multiprocessing.set_start_method('spawn', force=True)

    if args.model == 'bert':
        model = BertCRF(embed, [data_bundle.get_vocab('target')],
                        encoding_type='bioes')
    else:
        model = StackedTransformersCRF(
            tag_vocabs=[data_bundle.get_vocab('target')],
            embed=embed,
            num_layers=num_layers,
            d_model=d_model,
            n_head=n_heads,
            feedforward_dim=dim_feedforward,
            dropout=trans_dropout,
            after_norm=after_norm,
            attn_type=attn_type,
            bi_embed=None,
            fc_dropout=fc_dropout,
            pos_embed=pos_embed,
            scale=attn_type == 'transformer')
        model = torch.nn.DataParallel(model)

    if args.do_eval:
        if os.path.exists(os.path.expanduser(args.saved_model)):
            print("Load checkpoint from {}".format(
                os.path.expanduser(args.saved_model)))
            model = torch.load(args.saved_model)
            model.to('cuda')
            print('model to CUDA')

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)

    callbacks = []
    clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
    evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))
    checkpoint_callback = CheckPointCallback(os.path.join(directory, 'model.pth'),
                                             delete_when_train_finish=False,
                                             recovery_fitlog=True)

    if warmup_steps > 0:
        warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
        callbacks.append(warmup_callback)
    callbacks.extend([clip_callback, checkpoint_callback, evaluate_callback])

    if not args.do_eval:
        trainer = Trainer(data_bundle.get_dataset('train'),
                          model,
                          optimizer,
                          batch_size=batch_size,
                          sampler=BucketSampler(),
                          num_workers=no_cpu,
                          n_epochs=args.n_epochs,
                          dev_data=data_bundle.get_dataset('dev'),
                          metrics=SpanFPreRecMetric(
                              tag_vocab=data_bundle.get_vocab('target'),
                              encoding_type=encoding_type),
                          dev_batch_size=batch_size,
                          callbacks=callbacks,
                          device=args.device,
                          test_use_tqdm=True,
                          use_tqdm=True,
                          print_every=300,
                          save_path=os.path.join(directory, 'best'))

        trainer.train(load_best_model=True)

        predictor = Predictor(model)
        predict(os.path.join(directory, 'predictions_dev.tsv'), data_bundle,
                predictor, 'dev')
        predict(os.path.join(directory, 'predictions_test.tsv'), data_bundle,
                predictor, 'test')

    else:
        print('Predicting')
        # predictions of multiple files
        torch.multiprocessing.freeze_support()
        model.share_memory()
        predictor = Predictor(model)

        if len(files) > multiprocessing.cpu_count():
            with torch.multiprocessing.Pool(processes=no_cpu) as p:
                with tqdm(total=len(files)) as pbar:
                    for i, _ in enumerate(
                            p.imap_unordered(
                                partial(predict,
                                        data_bundle=data_bundle,
                                        predictor=predictor,
                                        predict_on='train',
                                        do_eval=args.do_eval), files)):
                        pbar.update()
        else:
            for file in tqdm(files):
                predict(file, data_bundle, predictor, 'train', args.do_eval)