print('{} set size: {}'.format(data_key, len(dataloader.data[data_key])))  # printed once per split inside the data-loading loop above

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# TensorBoard writer for training/validation metrics
writer = SummaryWriter(log_dir)

# Joint intent classification + slot tagging model on top of BERT
model = JointBERT(config['model'], DEVICE, dataloader.tag_dim, dataloader.intent_dim, dataloader.intent_weight)
model.to(DEVICE)

if config['model']['finetune']:
    # Fine-tune BERT: exclude bias and LayerNorm weights from weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay) and p.requires_grad],
         'weight_decay': config['model']['weight_decay']},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay) and p.requires_grad],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=config['model']['learning_rate'],
                      eps=config['model']['adam_epsilon'])
    # Linear warmup followed by linear decay over max_step updates
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=config['model']['warmup_steps'],
                                                num_training_steps=config['model']['max_step'])
else:
    # Freeze all BERT parameters and train only the task-specific heads
    for n, p in model.named_parameters():
        if 'bert' in n:
            p.requires_grad = False
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                 lr=config['model']['learning_rate'])
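# --- Not part of the original script: the block above reads several fields from
# config['model']. A minimal sketch of that sub-config with illustrative values;
# the variable name and the concrete numbers are assumptions, only the key names
# are taken from the code above:
example_model_config = {
    'finetune': True,        # fine-tune BERT vs. freeze it and train only the heads
    'weight_decay': 0.01,    # applied to all params except bias/LayerNorm.weight
    'learning_rate': 1e-4,   # AdamW (finetune) or Adam (frozen BERT) learning rate
    'adam_epsilon': 1e-8,    # AdamW numerical-stability term
    'warmup_steps': 0,       # linear warmup steps for the LR scheduler
    'max_step': 10000,       # total training steps for the linear decay schedule
}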
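# --- Not part of the original script: a minimal sketch of how the objects built
# above (model, optimizer, scheduler, writer) are typically consumed per training
# step. The batch helper name (get_train_batch) and the loss-returning forward
# call are assumptions about this project's API, shown only to make the usage
# concrete; zero_grad/backward/clip_grad_norm_/step/add_scalar are the standard
# PyTorch and TensorBoard calls.
batch_size = 32  # illustrative value
for step in range(config['model']['max_step']):
    batch = dataloader.get_train_batch(batch_size)   # assumed helper name
    slot_loss, intent_loss = model(*batch)[-2:]      # assumed: forward also returns the two losses
    loss = slot_loss + intent_loss
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    if config['model']['finetune']:
        scheduler.step()  # the scheduler is only created on the fine-tuning branch
    writer.add_scalar('train/total_loss', loss.item(), global_step=step)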