# Per-fold training excerpt (the loop headers were lost; comments mark the levels).

        # Inside the per-batch training loop:
        loss.backward()
        optimizer.step()

    # Once per epoch: validate with test-time augmentation and checkpoint
    # whenever the score improves.
    score = validate_random(model, valid_dl, n_tta=10)
    if score > best_score:
        best_score = score
        model_state = {
            "model_name": 'freesound',
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "state_dict": model.state_dict(),
        }
        torch.save(model_state, model_state_path)
    scheduler.step(score)
    print(epoch, score)
    earlystopper.step(score)
    if earlystopper.stop:
        break

# After the epoch loop: reload the best checkpoint and write this fold's predictions.
print(best_score)
model_state = torch.load(model_state_path)
model.load_state_dict(model_state["state_dict"])
score, predictions = validate_search(
    model, train_meta.iloc[train_splits[fold][1]], train_cache)
print('score of loaded model: {}'.format(score))
sub.iloc[:, 1:] = predictions
sub.to_csv('train_focus_fold_{}_gpu_{}.csv'.format(fold, args.gpu), index=False)

# Accumulate reciprocal predictions across folds.
if fold == 0:
    full_predictions = 1 / predictions
else:
    full_predictions += 1 / predictions
    # Per-epoch validation pass (inside the epoch loop); no gradients needed.
    with torch.no_grad():
        for batch_idx, (img, mask, _) in enumerate(valid_loader):
            img = img.cuda()
            mask = mask.cuda()
            pred = model(img)
            loss = criterion(pred, mask)
            valid_loss.append(loss.item())

    print('[EPOCH {}/{}] Train Loss: {:.4f}; Valid Loss: {:.4f}'.format(
        epoch, args.epochs, np.mean(train_loss), np.mean(valid_loss)))

    # Early stopping on the mean validation loss; save whenever it improves
    # (bad_epochs == 0 means this epoch set a new best).
    flag, best, bad_epochs = es.step(torch.Tensor([np.mean(valid_loss)]))
    if flag:
        print('Early stopping criterion met')
        break
    else:
        if bad_epochs == 0:
            save_nets(nets, 'model')
            print('Saving current best model')
        print('Current Valid loss: {:.4f}; Current best: {:.4f}; Bad epochs: {}'.format(
            np.mean(valid_loss), best.item(), bad_epochs))

print('Training done... start evaluation')
with torch.no_grad():
class Trainer:
    def __init__(self, conf):
        self.conf = conf
        self.device = torch.device(f"cuda:{conf.gpu_id}")
        self.log = get_logger()
        torch.set_printoptions(precision=8)
        if conf.runid:
            conf.rundir = mkdir(conf.outdir / conf.runid)
        if not conf.rundir:
            conf.rundir = next_rundir(conf.outdir, log=self.log)
        self.rundir = conf.rundir
        dump_args(conf, conf.rundir / "conf.json")
        set_random_seed(conf.random_seed)
        if self.conf.use_bert:
            assert self.conf.lang in Bert.supported_langs, self.conf.lang
            self.bert = Bert(self.conf.bert_model_name, device=self.device)
        else:
            self.bert = None
        self.data = load_dataset(conf, conf.lang, bert=self.bert)
        _data = [self.data]
        for d in _data:
            self.log.info(
                f"{len(d.train_loader)} batches | bs {conf.batch_size}")
        self.model = self.get_model()
        self.optimizer = get_optim(conf, self.model)
        optimum = "min"
        if conf.lr_scheduler == "plateau":
            self.lr_scheduler = ReduceLROnPlateau(
                self.optimizer,
                factor=0.1,
                patience=2,
                mode=optimum,
                verbose=True)
        elif conf.lr_scheduler:
            raise ValueError("Unknown lr_scheduler: " + conf.lr_scheduler)
        self.losses = LossTrackers.from_names("loss", log=self.log)
        if (self.main_lang_data.tag == "ner"
                or self.conf.dataset.startswith("sr3de")):
            if self.data.is_multilingual:
                self.sentence_texts = {
                    split_name: self.main_lang_data.token_texts(split_name)
                    for split_name in ["dev", "test"]}
                self.conll_score = {
                    lang: ConllScore(tag_enc=self.main_lang_data.tag_enc)
                    for lang in self.data.dev}
                self.score = {
                    lang: Score(
                        "f1",
                        save_model=False,
                        log=self.log,
                        score_func=self.conll_score[lang],
                        add_mode="append")
                    for lang in self.data.dev}
                self.avg_score = Score(
                    "avg_f1", log=self.log, score_func="dummy",
                    add_mode="append")
            else:
                self.sentence_texts = {
                    split_name:
                    self.main_lang_data.token_texts(split_name)[:conf.max_eval_inst]
                    for split_name in ["dev", "test"]}
                self.conll_score = ConllScore(
                    tag_enc=self.main_lang_data.tag_enc)
                self.score = Score(
                    "f1", log=self.log, score_func=self.conll_score,
                    add_mode="append")
        else:
            if self.data.is_multilingual:
                self.score = {
                    lang: Score("acc", log=self.log)
                    for lang in self.data.dev}
                self.avg_score = Score(
                    "avg_acc", log=self.log, score_func="dummy",
                    add_mode="append")
            else:
                self.score = Score("acc", log=self.log)
        if conf.early_stop > 0:
            score_optimum = (
                "max" if (self.conf.dataset.startswith("wikiannmulti")
                          or self.data.is_multilingual)
                else self.score.optimum)
            self.early_stop = EarlyStopping(
                score_optimum,
                min_delta=conf.early_stop_min_delta,
                patience=conf.early_stop)
        else:
            self.early_stop = None
        self.epoch = 0

    def get_model(self):
        ntags = self.data.tag_enc.nlabels
        nshapes = self.data.shape_enc.nlabels
        nchars = self.data.char_enc.nlabels
        bpe_emb = emb_layer(
            self.data.bpemb.vectors,
            trainable=not self.conf.emb_fixed,
            use_weights=not self.conf.emb_random_init)
        if self.conf.use_fasttext:
            fasttext_file = self.conf.fasttext_emb_file.format(
                dataset=self.conf.dataset, lang=self.data.lang)
            fasttext_emb = emb_layer(
                load_word2vec_file(fasttext_file, add_unk=True),
                trainable=not self.conf.emb_fixed,
                use_weights=not self.conf.emb_random_init)
        else:
            fasttext_emb = None
        model = SequenceTagger(
            bpe_emb,
            ntags,
            self.conf,
            nchars=nchars,
            nshapes=nshapes,
            fasttext_emb=fasttext_emb,
            bert=self.bert,
            tag_enc=self.main_lang_data.tag_enc,
        ).to(self.device)
        self.log.info(f'model repr dim: {model.repr_dim}')
        if self.conf.model_file:
            self.log.info(f"loading model {self.conf.model_file}")
            model.load_state_dict(torch.load(self.conf.model_file))
            self.log.info(f"loaded model {self.conf.model_file}")
        return model

    def train(self, train_epoch, do_eval, do_test=None, eval_ds_name=None):
        try:
            for epoch in range(1, self.conf.max_epochs + 1):
                self.epoch = epoch
                self.model.train()
                train_epoch(epoch=epoch)
                self.losses.interval_end_log(epoch, ds_name="train")
                burnin_done = epoch >= self.conf.first_eval_epoch
                if burnin_done and not epoch % self.conf.eval_every:
                    score = self.do_eval(
                        do_eval, epoch=epoch, eval_ds_name=eval_ds_name)
                    if do_test:
                        self.do_eval(do_test, epoch=epoch, eval_ds_name="test")
                    if score is not None and self.early_stop:
                        if self.early_stop.step(score):
                            if epoch >= self.conf.min_epochs:
                                patience = self.early_stop.patience
                                self.log.info(
                                    f"Early stop after {patience} steps")
                                break
        except KeyboardInterrupt:
            self.log.info("Stopping training due to keyboard interrupt")

    def do_eval(self, eval_func, epoch=None, eval_ds_name=None):
        self.model.eval()
        eval_func(epoch=epoch)
        self.log_eval(ds_name=eval_ds_name, epoch=epoch)
        if self.data.is_multilingual:
            return self.avg_score.current
        return self.score.current

    def log_eval(self, ds_name=None, epoch=None):
        self.losses.interval_end(ds_name=ds_name)
        if self.data.is_multilingual:
            for lang in getattr(self.data, ds_name):
                if hasattr(self, "conll_score"):
                    self.conll_score[lang].sentences = \
                        self.sentence_texts[ds_name][lang]
                    fname = f"{epoch}.{ds_name}.{lang}.conll"
                    self.conll_score[lang].outfile = self.rundir / fname
                self.score[lang].update()
            avg_score = np.average(
                [score.current for score in self.score.values()])
            self.avg_score.update_log(
                model=self.model, rundir=self.rundir, epoch=epoch,
                score=avg_score)
        else:
            if hasattr(self, "conll_score"):
                self.conll_score.sentences = self.sentence_texts[ds_name]
                fname = f"{epoch}.{ds_name}.conll"
                self.conll_score.outfile = self.rundir / fname
            self.score.update_log(self.model, self.rundir, epoch)

    def save_model(self):
        model_file = self.rundir / f"model.e{self.epoch}.pt"
        save_model(self.model, model_file, self.log)

    @property
    def main_lang_data(self):
        return self.data[0] if isinstance(self.data, list) else self.data

    @property
    def batch_iter_train_multilang(self):
        main_lang_len = len(self.data[0].train_loader)
        max_sim_lang_len = int(self.conf.sim_lang_ratio * main_lang_len)

        def get_sim_lang_len(i):
            sim_lang_len = len(self.data[i].train_loader)
            return min(sim_lang_len, max_sim_lang_len)

        lang_idxs = [
            i for i, data in enumerate(self.data)
            for _ in range(main_lang_len if i == 0 else get_sim_lang_len(i))
        ]
        random.shuffle(lang_idxs)
        iters = [data.batch_iter_train for data in self.data]
        return ((i, next(iters[i])) for i in lang_idxs)
    # Validate every 50 epochs (inside the epoch loop).
    if epoch % 50 == 0:
        valid_loss, valid_incorrect = forward_pass(
            valid_loader, elisa_net, None, None, 'VALIDATE', epoch, args)
        valid_loss = np.mean(valid_loss)
        valid_accuracy = 1. - (sum(valid_incorrect) / len(valid_set))
        print('[EPOCH {}/{}][VALIDATE AVG] Loss: {:.4f}; ACCURACY: {:.4f}'.format(
            epoch, args.epochs, valid_loss, valid_accuracy))
        writer.add_scalars('Valid Loss', {'ELISANET': valid_loss}, epoch)
        writer.add_scalars('Valid ACCURACY', {'ELISANET': valid_accuracy}, epoch)

        # Early stopping on validation loss; save on improvement, and decay
        # the learning rate after every patience // 2 consecutive bad epochs.
        flag, best, bad_epochs = es.step(torch.Tensor([valid_loss]))
        if flag:
            print('Early stopping criterion met')
            break
        else:
            if bad_epochs == 0:
                save_nets([elisa_net], solver, epoch, args)
                best_epoch = epoch
                print('Saving best net weights')
            elif bad_epochs % (args.patience // 2) == 0:
                scheduler.step()
            print('[EPOCH {}] Current Valid Loss: {:.6f}; Best: {:.6f}; Bad epochs: {}; Best epoch: {}'
                  .format(epoch, valid_loss, best.item(), bad_epochs, best_epoch))