def _get_trainer(self, models_folder):
    optimizer = optim.SGD(self.parameters(), lr=self.config['lr'], momentum=0.9)

    callbacks = []
    clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
    evaluate_callback = EvaluateCallback(self.data_bundle.get_dataset('test'))

    if self.config['warmup_steps'] > 0:
        warmup_callback = WarmupCallback(self.config['warmup_steps'], schedule='linear')
        callbacks.append(warmup_callback)
    callbacks.extend([clip_callback, evaluate_callback])

    return Trainer(self.data_bundle.get_dataset('train'), self, optimizer,
                   batch_size=self.config['batch_size'],
                   sampler=BucketSampler(), num_workers=2, n_epochs=100,
                   dev_data=self.data_bundle.get_dataset('dev'),
                   metrics=SpanFPreRecMetric(
                       tag_vocab=self.data_bundle.get_vocab('target'),
                       encoding_type=self.config['encoding_type']),
                   dev_batch_size=self.config['batch_size'] * 5,
                   callbacks=callbacks, device=self.config['device'],
                   test_use_tqdm=False, use_tqdm=True, print_every=300,
                   save_path=models_folder)
                  after_norm=after_norm, attn_type=attn_type,
                  bi_embed=bi_embed, bert_embed=bert_embed,
                  fc_dropout=fc_dropout, pos_embed=pos_embed,
                  scale=attn_type == 'transformer')

optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

callbacks = []
clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))

if warmup_steps > 0:
    warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
    callbacks.append(warmup_callback)
callbacks.extend([clip_callback, evaluate_callback])

trainer = Trainer(data_bundle.get_dataset('train'), model, optimizer,
                  batch_size=batch_size, sampler=BucketSampler(),
                  num_workers=2, n_epochs=n_epochs,
                  dev_data=data_bundle.get_dataset('dev'),
                  metrics=SpanFPreRecMetric(
                      tag_vocab=data_bundle.get_vocab('target'),
                      encoding_type=encoding_type),
                  dev_batch_size=batch_size,
print(f"In total {len(word2bpes)} target words") pad_id = data_bundle.pad_id model = ENBertReverseDict(pre_name, word2bpes, pad_id=pad_id, number_word_in_train=data_bundle.number_word_in_train) if torch.cuda.is_available(): model.cuda() optimizer = optim.AdamW(model.parameters(), lr=lr) data = {} for name in ['seen', 'unseen', 'desc']: data[name] = data_bundle.get_dataset(name) callbacks = [GradientClipCallback(clip_type='value', clip_value=5), WarmupCallback(warmup=0.01, schedule='linear')] callbacks.append(FitlogCallback(data=data, verbose=1)) train_data = data_bundle.get_dataset('train') train_data.add_seq_len('input') # from collections import Counter # print(Counter(train_data.get_field('seq_len').content)) # exit(0) sampler = BucketSampler() clip_max_length(train_data, data_bundle) trainer = Trainer(train_data=train_data, model=model, optimizer=optimizer, loss=CrossEntropyLoss(), batch_size=batch_size, sampler=sampler, drop_last=False, update_every=1, num_workers=1, n_epochs=n_epochs, print_every=5,
              word2bpes, pad_id=pad_id,
              number_word_in_train=data_bundle.number_word_in_train)
if torch.cuda.is_available():
    model.cuda()

optimizer = optim.AdamW(model.parameters(), lr=lr)

data = {}
for name in ['desc', 'question', 'seen_test', 'unseen_test']:
    data[name] = data_bundle.get_dataset(name)

callbacks = [
    GradientClipCallback(clip_type='value'),
    WarmupCallback(warmup=0.1, schedule='linear')
]
callbacks.append(FitlogCallback(data=data, verbose=1))

train_data = data_bundle.get_dataset('train')
train_data.add_seq_len('input')
sampler = BucketSampler()
clip_max_length(train_data, data_bundle)

trainer = Trainer(train_data=train_data, model=model, optimizer=optimizer,
                  loss=CrossEntropyLoss(), batch_size=batch_size,
                  sampler=sampler, drop_last=False,
def main():
    args = parse_args()
    if args.debug:
        fitlog.debug()
    fitlog.set_log_dir(args.log_dir)
    fitlog.commit(__file__)
    fitlog.add_hyper_in_file(__file__)
    fitlog.add_hyper(args)
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_set, dev_set, test_set, temp_ent_vocab = load_fewrel_graph_data(
        data_dir=args.data_dir)
    print('data directory: {}'.format(args.data_dir))
    print('# of train samples: {}'.format(len(train_set)))
    print('# of dev samples: {}'.format(len(dev_set)))
    print('# of test samples: {}'.format(len(test_set)))

    ent_vocab, rel_vocab = load_ent_rel_vocabs(path='../')

    # load entity embeddings
    ent_index = []
    for k, v in temp_ent_vocab.items():
        ent_index.append(ent_vocab[k])
    ent_index = torch.tensor(ent_index)
    ent_emb = np.load(os.path.join(args.model_path, 'entities.npy'))
    ent_embedding = nn.Embedding.from_pretrained(torch.from_numpy(ent_emb))
    ent_emb = ent_embedding(ent_index.view(1, -1)).squeeze().detach()

    # load CoLAKE parameters
    config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
    model = CoLAKEForRE(config,
                        num_types=len(train_set.label_vocab),
                        ent_emb=ent_emb)
    states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
    model.load_state_dict(states_dict, strict=False)
    print('parameters below are randomly initialized:')
    for name, param in model.named_parameters():
        if name not in states_dict:
            print(name)

    # tie relation classification head
    rel_index = []
    for k, v in train_set.label_vocab.items():
        rel_index.append(rel_vocab[k])
    rel_index = torch.LongTensor(rel_index)
    rel_embeddings = nn.Embedding.from_pretrained(
        states_dict['rel_embeddings.weight'])
    rel_index = rel_index.cuda()
    rel_cls_weight = rel_embeddings(rel_index.view(1, -1)).squeeze()
    model.tie_rel_weights(rel_cls_weight)
    model.rel_head.dense.weight.data = states_dict['rel_lm_head.dense.weight']
    model.rel_head.dense.bias.data = states_dict['rel_lm_head.dense.bias']
    model.rel_head.layer_norm.weight.data = states_dict[
        'rel_lm_head.layer_norm.weight']
    model.rel_head.layer_norm.bias.data = states_dict[
        'rel_lm_head.layer_norm.bias']
    model.resize_token_embeddings(
        len(RobertaTokenizer.from_pretrained('roberta-base')) + 4)
    print('parameters of CoLAKE have been loaded.')

    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'embedding']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)

    metrics = [MacroMetric(pred='pred', target='target')]

    test_data_iter = TorchLoaderIter(dataset=test_set,
                                     batch_size=args.batch_size,
                                     sampler=RandomSampler(),
                                     num_workers=4,
                                     collate_fn=test_set.collate_fn)
    devices = list(range(torch.cuda.device_count()))
    tester = Tester(data=test_data_iter,
                    model=model,
                    metrics=metrics,
                    device=devices)
    # tester.test()

    fitlog_callback = FitlogCallback(tester=tester,
                                     log_loss_every=100,
                                     verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1, clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

    bsz = args.batch_size // args.grad_accumulation
    train_data_iter = TorchLoaderIter(dataset=train_set,
                                      batch_size=bsz,
                                      sampler=RandomSampler(),
                                      num_workers=4,
                                      collate_fn=train_set.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_set,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=4,
                                    collate_fn=dev_set.collate_fn)

    trainer = Trainer(train_data=train_data_iter,
                      dev_data=dev_data_iter,
                      model=model,
                      optimizer=optimizer,
                      loss=LossInForward(),
                      batch_size=bsz,
                      update_every=args.grad_accumulation,
                      n_epochs=args.epoch,
                      metrics=metrics,
                      callbacks=[fitlog_callback, gradient_clip_callback, warmup_callback],
                      device=devices,
                      use_tqdm=True)
    trainer.train(load_best_model=False)
data, bert_embed = get_data()
print(data)

model = BertParser(embed=bert_embed,
                   num_label=len(data.get_vocab('char_labels')),
                   arc_mlp_size=arc_mlp_size,
                   label_mlp_size=label_mlp_size,
                   dropout=dropout,
                   use_greedy_infer=False,
                   app_index=0)

metric1 = SegAppCharParseF1Metric(data.get_vocab('char_labels')['APP'])
metric2 = CWSMetric(data.get_vocab('char_labels')['APP'])
metrics = [metric1, metric2]

optimizer = optim.AdamW([param for param in model.parameters() if param.requires_grad],
                        lr=lr, weight_decay=1e-2)

sampler = BucketSampler(seq_len_field_name='seq_lens')

callbacks = []
warmup_callback = WarmupCallback(schedule='linear')
callbacks.append(warmup_callback)
callbacks.append(GradientClipCallback(clip_type='value', clip_value=5))
callbacks.append(EvaluateCallback(data.get_dataset('test')))

trainer = Trainer(data.datasets['train'], model, loss=None, metrics=metrics,
                  n_epochs=n_epochs, batch_size=batch_size, print_every=3,
                  validate_every=-1, dev_data=data.datasets['dev'],
                  save_path=None, optimizer=optimizer, check_code_level=0,
                  metric_key='u_f1', sampler=sampler, num_workers=2,
                  use_tqdm=True, device=device, callbacks=callbacks,
                  update_every=update_every, dev_batch_size=6)
trainer.train(load_best_model=False)
"weight_decay": 1e-2, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.lr) callbacks = [] callbacks.append(WarmupCallback(0.01, "linear")) callbacks.append(FitlogCallback( # data_bundle.get_dataset('train') )) import torch import torch.nn.functional as F from fastNLP import LossBase class SmoothLoss(LossBase): def __init__(self, smooth_eps=0): super().__init__() self.smooth_eps = smooth_eps def get_loss(self, pred, target):
def main():
    if args.do_eval:
        torch.multiprocessing.set_start_method('spawn', force=True)

    if args.model == 'bert':
        model = BertCRF(embed, [data_bundle.get_vocab('target')],
                        encoding_type='bioes')
    else:
        model = StackedTransformersCRF(
            tag_vocabs=[data_bundle.get_vocab('target')],
            embed=embed,
            num_layers=num_layers,
            d_model=d_model,
            n_head=n_heads,
            feedforward_dim=dim_feedforward,
            dropout=trans_dropout,
            after_norm=after_norm,
            attn_type=attn_type,
            bi_embed=None,
            fc_dropout=fc_dropout,
            pos_embed=pos_embed,
            scale=attn_type == 'transformer')
    model = torch.nn.DataParallel(model)

    if args.do_eval:
        if os.path.exists(os.path.expanduser(args.saved_model)):
            print("Load checkpoint from {}".format(
                os.path.expanduser(args.saved_model)))
            model = torch.load(args.saved_model)
    model.to('cuda')
    print('model to CUDA')

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)

    callbacks = []
    clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
    evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))
    checkpoint_callback = CheckPointCallback(os.path.join(directory, 'model.pth'),
                                             delete_when_train_finish=False,
                                             recovery_fitlog=True)
    if warmup_steps > 0:
        warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
        callbacks.append(warmup_callback)
    callbacks.extend([clip_callback, checkpoint_callback, evaluate_callback])

    if not args.do_eval:
        trainer = Trainer(data_bundle.get_dataset('train'), model, optimizer,
                          batch_size=batch_size,
                          sampler=BucketSampler(),
                          num_workers=no_cpu,
                          n_epochs=args.n_epochs,
                          dev_data=data_bundle.get_dataset('dev'),
                          metrics=SpanFPreRecMetric(
                              tag_vocab=data_bundle.get_vocab('target'),
                              encoding_type=encoding_type),
                          dev_batch_size=batch_size,
                          callbacks=callbacks,
                          device=args.device,
                          test_use_tqdm=True,
                          use_tqdm=True,
                          print_every=300,
                          save_path=os.path.join(directory, 'best'))
        trainer.train(load_best_model=True)

        predictor = Predictor(model)
        predict(os.path.join(directory, 'predictions_dev.tsv'),
                data_bundle, predictor, 'dev')
        predict(os.path.join(directory, 'predictions_test.tsv'),
                data_bundle, predictor, 'test')
    else:
        print('Predicting')
        # predictions of multiple files
        torch.multiprocessing.freeze_support()
        model.share_memory()
        predictor = Predictor(model)
        if len(files) > multiprocessing.cpu_count():
            with torch.multiprocessing.Pool(processes=no_cpu) as p:
                with tqdm(total=len(files)) as pbar:
                    for i, _ in enumerate(
                            p.imap_unordered(
                                partial(predict,
                                        data_bundle=data_bundle,
                                        predictor=predictor,
                                        predict_on='train',
                                        do_eval=args.do_eval),
                                files)):
                        pbar.update()
        else:
            for file in tqdm(files):
                predict(file, data_bundle, predictor, 'train', args.do_eval)