def create_cb():
    lrschedule_callback = LRScheduler(
        lr_scheduler=LambdaLR(optimizer, lambda ep: 1 / (1 + 0.05 * ep)))
    clip_callback = GradientClipCallback(clip_type='value', clip_value=2)
    save_dir = os.path.join(root_path, f'model/{args.data_type}',
                            f'fold{args.fold}')
    save_callback = SaveModelCallback(top=1, save_dir=save_dir)

    # the same base callbacks are used with and without cross-validation
    callbacks = [
        lrschedule_callback,
        clip_callback,
        save_callback,
    ]
    # callbacks.append(Unfreeze_Callback(embedding_param, args.fix_embed_epoch))

    if args.use_bert:
        if args.fix_bert_epoch != 0:
            callbacks.append(
                Unfreeze_Callback(model.lattice_embed, args.fix_bert_epoch))
        else:
            bert_embedding.requires_grad = True

    callbacks.append(EarlyStopCallback(args.early_stop))
    if args.warmup > 0 and args.model == 'transformer':
        callbacks.append(WarmupCallback(warmup=args.warmup))
    return callbacks
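
# Unfreeze_Callback above is project-specific, not part of fastNLP. Below is a
# minimal sketch of what it plausibly does, assuming fastNLP's Callback
# interface (self.epoch is the current 1-based epoch inside callbacks); the
# class body is an assumption, not the project's actual code.
from fastNLP import Callback


class Unfreeze_Callback(Callback):
    def __init__(self, embedding, fix_until_epoch):
        super().__init__()
        self.embedding = embedding
        self.fix_until_epoch = fix_until_epoch

    def on_epoch_begin(self):
        # keep the embedding frozen for the first fix_until_epoch epochs,
        # then let its parameters receive gradients again
        if self.epoch == self.fix_until_epoch + 1:
            for param in self.embedding.parameters():
                param.requires_grad = True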

def test_gradient_clip(self):
    data_set, model = prepare_env()
    trainer = Trainer(data_set,
                      model,
                      optimizer=SGD(lr=0.1),
                      loss=BCELoss(pred="predict", target="y"),
                      batch_size=32,
                      n_epochs=20,
                      print_every=50,
                      dev_data=data_set,
                      metrics=AccuracyMetric(pred="predict", target="y"),
                      use_tqdm=False,
                      callbacks=[GradientClipCallback(model.parameters(),
                                                      clip_value=2)],
                      check_code_level=2)
    trainer.train()
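
# In plain PyTorch terms, the callback's clipping step corresponds to the
# torch.nn.utils helpers below (a standalone sketch; the Linear module is a
# stand-in, not part of the test above).
import torch
import torch.nn as nn

toy_model = nn.Linear(4, 1)
toy_model(torch.randn(8, 4)).sum().backward()

# clip_type='norm' (the default) rescales gradients so their total norm <= 2
torch.nn.utils.clip_grad_norm_(toy_model.parameters(), max_norm=2)
# clip_type='value' clamps each gradient element into [-2, 2]
torch.nn.utils.clip_grad_value_(toy_model.parameters(), clip_value=2)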

def run3(self):
    # test callbacks, especially clip-norm
    set_rng_seed(100)
    data_set, model = prepare_env()
    trainer = DistTrainer(
        data_set,
        model,
        optimizer=None,
        loss=BCELoss(pred="predict", target="y"),
        n_epochs=3,
        print_every=50,
        callbacks_all=[GradientClipCallback()],
        callbacks_master=[EchoCallback('callbacks_master')])
    trainer.train()
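
# EchoCallback comes from fastNLP's test helpers; a minimal stand-in with
# similar observable behavior (an assumption about its internals) just logs
# when hooks fire, which makes it easy to verify that callbacks_master runs
# only on the master process while callbacks_all runs everywhere.
from fastNLP import Callback


class EchoCallbackSketch(Callback):
    def __init__(self, name):
        super().__init__()
        self.name = name

    def on_train_begin(self):
        print('{}: on_train_begin'.format(self.name), flush=True)

    def on_epoch_end(self):
        print('{}: on_epoch_end'.format(self.name), flush=True)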

# weight_decay=args.weight_decay)
optimizer = optim.SGD(param_,
                      lr=args.lr,
                      momentum=args.momentum,
                      weight_decay=args.weight_decay)

if 'msra' in args.dataset:
    datasets['dev'] = datasets['test']

fitlog_evaluate_dataset = {'test': datasets['test']}
if args.test_train:
    fitlog_evaluate_dataset['train'] = datasets['train']
evaluate_callback = FitlogCallback(fitlog_evaluate_dataset, verbose=1)

lrschedule_callback = LRScheduler(
    lr_scheduler=LambdaLR(optimizer, lambda ep: 1 / (1 + 0.05 * ep)))
clip_callback = GradientClipCallback(clip_type='value', clip_value=5)


# model.state_dict()
class CheckWeightCallback(Callback):
    def __init__(self, model):
        super().__init__()
        self.model_ = model

    def on_step_end(self):
        print('parameter weight:', flush=True)
        print(self.model_.state_dict()['encoder.layer_0.attn.w_q.weight'],
              flush=True)


callbacks = [
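
# A quick numeric check of the LambdaLR schedule used above: the factor
# 1 / (1 + 0.05 * ep) decays the base lr hyperbolically, reaching half the
# base lr at epoch 20 (standalone sketch).
for ep in (0, 1, 10, 20):
    print(ep, 1 / (1 + 0.05 * ep))
# 0 1.0
# 1 0.9523809523809523
# 10 0.6666666666666666
# 20 0.5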

print(config)


def cache():
    bundle = CoReferencePipe(config).process_from_file({
        'train': config.train_path,
        'dev': config.dev_path,
        'test': config.test_path
    })
    return bundle


data_bundle = cache()
print(data_bundle)
model = Model(data_bundle.get_vocab(Const.INPUTS(0)), config)
print(model)

loss = SoftmaxLoss()
metric = CRMetric()
optim = Adam(model.parameters(), lr=config.lr)
lr_decay_callback = LRCallback(optim.param_groups, config.lr_decay)

trainer = Trainer(model=model,
                  train_data=data_bundle.datasets["train"],
                  dev_data=data_bundle.datasets["dev"],
                  loss=loss,
                  metrics=metric,
                  check_code_level=-1,
                  sampler=None,
                  batch_size=1,
                  device=torch.device("cuda:" + config.cuda)
                  if torch.cuda.is_available() else None,
                  metric_key='f',
                  n_epochs=config.epoch,
                  optimizer=optim,
                  save_path=None,
                  callbacks=[lr_decay_callback,
                             GradientClipCallback(clip_value=5)])
print()
trainer.train()
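
# LRCallback here is defined in the example script rather than in fastNLP. A
# plausible minimal version (an assumption about its body, inferred from how
# it is constructed with optim.param_groups and config.lr_decay) multiplies
# each param group's lr by the decay factor at the end of every epoch.
from fastNLP import Callback


class LRCallback(Callback):
    def __init__(self, param_groups, decay_factor):
        super().__init__()
        self.param_groups = param_groups
        self.decay_factor = decay_factor

    def on_epoch_end(self):
        for group in self.param_groups:
            group['lr'] *= self.decay_factor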

loss = LossInForward()
optimizer = AdamW(
    [param for param in model.parameters() if param.requires_grad], lr=2e-5)
# metric = AccuracyMetric()
metric = SpanFPreRecMetric(
    tag_vocab=data_bundle.get_vocab(Const.TARGET),
    only_gross=False)  # with only_gross=False, per-label metrics are also reported
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # train on GPU if available; it is much faster
logger.info('device:{}'.format(device))

batch_size = 32
n_epochs = 10
early_stopping = 10

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear'),
    EarlyStopCallback(early_stopping)
]

trainer = Trainer(save_path=model_path,
                  train_data=data_bundle.get_dataset('train'),
                  model=model,
                  loss=loss,
                  optimizer=optimizer,
                  batch_size=batch_size,
                  n_epochs=n_epochs,
                  dev_data=data_bundle.get_dataset('dev'),
                  metrics=metric,
                  metric_key='f',
                  device=device,
                  callbacks=callbacks)
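
# Rough shape of WarmupCallback(warmup=0.1, schedule='linear') above (a
# sketch of the schedule, not fastNLP's internal code): the lr multiplier
# ramps from 0 to 1 over the first 10% of training steps, then decays
# linearly back to 0 by the final step.
def linear_warmup_factor(step, total_steps, warmup=0.1):
    warmup_steps = max(1, int(total_steps * warmup))
    if step < warmup_steps:
        return step / warmup_steps
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))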

    embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)],
                                model_dir_or_name='en-glove-840b-300d',
                                requires_grad=True,
                                normalize=False)
else:
    raise RuntimeError(f'NOT support {arg.embedding} embedding yet!')

# define model
model = ESIM(embedding, num_labels=len(data_bundle.vocabs[Const.TARGET]))

# define optimizer and callback
optimizer = Adamax(lr=arg.lr, params=model.parameters())
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)  # halve the lr every 10 epochs
callbacks = [
    GradientClipCallback(clip_value=10),  # equivalent to torch.nn.utils.clip_grad_norm_ with max_norm=10
    LRScheduler(scheduler),
]
if arg.task in ['snli']:
    # evaluate the test set every epoch when the task is snli
    callbacks.append(
        EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name]))

# define trainer
trainer = Trainer(train_data=data_bundle.datasets[arg.train_dataset_name],
                  model=model,
                  optimizer=optimizer,
                  loss=CrossEntropyLoss(),
                  batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
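
# Quick standalone check of the StepLR schedule above: with step_size=10 and
# gamma=0.5 the lr halves every 10 epochs (the 4e-4 base lr is illustrative).
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.Adamax(params, lr=4e-4)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.5)
for epoch in range(21):
    if epoch % 10 == 0:
        print(epoch, sched.get_last_lr())  # 0: 4e-4, 10: 2e-4, 20: 1e-4
    opt.step()
    sched.step()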