def run_train(self, model, dataloaders, num_epochs=10, callbacks=None):
    if callbacks is None:
        callbacks = []
    model.to(self.device)
    earlystop = EarlyStopping(patience=20, verbose=True,
                              checkpoint='checkpoint_effnet_large')
    tracker = MetricsTracker()
    batch_size = self.opt.batchSize
    for epoch in tqdm(range(num_epochs)):
        print('Epoch:', epoch)
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_corrects = 0
            dataloader = dataloaders[phase]
            for batch_idx, batch_data in tqdm(enumerate(dataloader),
                                              leave=False,
                                              total=len(dataloader)):
                data, target = batch_data['data'], batch_data['class']
                running_corrects_, running_loss_ = self.run_one_step(
                    model, batch_data, phase)
                running_corrects += running_corrects_
                running_loss += running_loss_
                accuracy = float(running_corrects) / ((batch_idx + 1) * batch_size)
                metrics = dict(accuracy=accuracy,
                               loss=running_loss_ / data.size(0))
                if batch_idx % self.opt.display_count == 0:
                    for cb in callbacks:
                        cb.update_metrics(metrics, phase)
            epoch_acc = running_corrects.double() / (len(dataloader) * batch_size)
            epoch_loss = running_loss / (len(dataloader) * batch_size)
            if phase == 'valid':
                tracker.end_valid_iteration(epoch_loss, epoch_acc.item())
                earlystop(epoch_loss, model)
            if phase == 'train':
                tracker.end_train_iteration(epoch_loss, epoch_acc.item())
            print('{} Accuracy: '.format(phase), epoch_acc.item())
        tracker.end_epoch()
        if earlystop.early_stop:
            print("Early stopping")
            break
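# NOTE: the EarlyStopping class used in run_train above is not shown in this snippet.
# A minimal sketch of the interface the loop assumes -- callable as earlystop(val_loss, model),
# exposing an .early_stop flag, and checkpointing the best weights under the given name --
# could look like the following. The '.pt' file suffix is an assumption.
import torch

class EarlyStopping:
    def __init__(self, patience=7, verbose=False, checkpoint='checkpoint'):
        self.patience = patience
        self.verbose = verbose
        self.checkpoint = checkpoint
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        if self.best_loss is None or val_loss < self.best_loss:
            # Improvement: reset the patience counter and checkpoint the model.
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), f'{self.checkpoint}.pt')
            if self.verbose:
                print(f'Validation loss improved to {val_loss:.6f}; checkpoint saved.')
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} / {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True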
def train(self, training_data: TrainingData) -> None:
    x_train, y_train, x_val, y_val, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data,
        full_question=args.full_question,
        create_runs=args.create_runs)
    self.class_to_i = class_to_i
    self.i_to_class = i_to_class

    print('Batchifying data')
    train_batches = batchify(x_train, y_train, shuffle=True)
    val_batches = batchify(x_val, y_val, shuffle=False)

    self.model = ElmoModel(len(i_to_class), dropout=self.dropout)
    self.model = self.model.to(self.device)
    print(f'Model:\n{self.model}')

    parameters = list(self.model.classifier.parameters())
    for mix in self.model.elmo._scalar_mixes:
        parameters.extend(list(mix.parameters()))
    self.optimizer = Adam(parameters)
    self.criterion = nn.CrossEntropyLoss()
    self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5,
                                                    verbose=True, mode='max')

    temp_prefix = get_tmp_filename()
    self.model_file = f'{temp_prefix}.pt'
    print(f'Saving model to: {self.model_file}')

    log = get(__name__)
    manager = TrainingManager([
        BaseLogger(log_func=log.info),
        TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1),
        MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model),
                        self.model_file, monitor='test_acc')
    ])

    log.info('Starting training')
    epoch = 0
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(train_batches)
        random.shuffle(train_batches)

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(val_batches, train=False)

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc)

        if stop_training:
            log.info(' '.join(reasons))
            break
        else:
            self.scheduler.step(test_acc)
        epoch += 1
    'lr': args.lr,
    'dropout': args.dropout
})

save_dir = 'save_model'
result_dir = save_dir + '/result/'
ckpt_dir = save_dir + '/checkpoints/'
makedirs(result_dir)
makedirs(ckpt_dir)
result_fn = result_dir + args.save + '.pkl'
ckpt_fn = ckpt_dir + args.save + '.pt'

# instantiate the early stopping object
patience = 7
early_stopping = EarlyStopping(ckpt_dir, args.save + '.pt',
                               patience=patience, verbose=True)

# Set the random seed manually for reproducibility.
if args.cuda:
    if not torch.cuda.is_available():
        print("WARNING: No CUDA device detected, switching to cpu device!")
        device = torch.device('cpu')
        torch.manual_seed(args.seed)
    else:
        device = torch.device('cuda')
        torch.cuda.manual_seed(args.seed)
else:
    if torch.cuda.is_available():
        print("WARNING: CUDA device detected, continuing to use cpu device!")
    device = torch.device('cpu')
class NNModel(object):
    def __init__(self, sess, args):
        self.x = tf.placeholder(tf.float32, [None, args.feature_dim], name="input")
        self.y = tf.placeholder(tf.float32, [None, args.output_dim], name="output")

        self.y_output = build_model(self.x, args.output_dim)
        self.y_prob = tf.nn.softmax(self.y_output, name='prob')
        self.y_pred = tf.argmax(self.y_prob, 1, name='pred')
        self.correct_predictions = tf.equal(self.y_pred, tf.math.argmax(self.y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_predictions, "float"),
                                       name="accuracy")
        self.loss_f = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.y_output,
                                                    labels=self.y))

        self.x_1 = tf.placeholder(tf.float32, [None, args.feature_dim], name="input_1")
        self.y_1_output = build_model(self.x_1, args.output_dim)
        self.loss_augment = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.y_1_output,
                                                    labels=self.y))

        # self.loss_y_y1, self.optimizer_f and self.saver are assumed to be defined
        # elsewhere in the full class; only the regularized objective is shown here.
        self.final_reg = tf.add(args.reg_param_1 * self.loss_y_y1,
                                args.reg_param_2 * self.loss_augment,
                                name="reg_final")
        self.loss_final = tf.add(self.final_reg, self.loss_f, name="loss_final")
        self.optimizer_final = tf.train.AdamOptimizer(args.lr, name="opt2").minimize(self.loss_final)

    def restore(self, sess, output_dir, model_name):
        model_filename = "{}/models/{}.cpkt".format(output_dir, model_name)
        if os.path.isfile(model_filename):
            print('restoring model from %s...' % model_filename)
            self.saver.restore(sess, model_filename)

    def save(self, sess, output_dir, model_name, overwrite=True):
        model_filename = "{}/models/{}.cpkt".format(output_dir, model_name)
        if not os.path.isfile(model_filename) or overwrite:
            print('saving model to %s...' % model_filename)
            self.saver.save(sess, model_filename)

    # train the base model
    def train_basemodel(self, sess, batch_x, batch_y, update_num):
        feed_dict = {self.x: batch_x, self.y: batch_y}
        _ = sess.run(self.optimizer_f, feed_dict=feed_dict)

    def standard_augmentation(self, sess, batch_x, batch_x1, batch_y):
        batch_x1 = am_util.augment_data_y1_modified(batch_x, S_30)
        feed_dict = {self.x: batch_x, self.y: batch_y, self.x_1: batch_x1}
        _ = sess.run(self.optimizer_final, feed_dict=feed_dict)

    # evaluate on the validation set, keep the summaries
    def val(self, sess, val_x, val_y):
        feed_dict = {self.x: val_x, self.y: val_y}
        # get all metrics here
        loss = sess.run(self.loss_f, feed_dict=feed_dict)
        print("validation loss = ", "{:.4f} | ".format(loss))
        return loss

    # test and save the best result
    def test(self, sess, test_x, test_y, output_dir):
        feed_dict = {self.x: test_x, self.y: test_y}
        loss, accuracy, pred_test_y = sess.run(
            [self.loss_f, self.accuracy, self.y_pred], feed_dict=feed_dict)
        print("test loss is =", "{:.4f} | ".format(loss),
              "test accuracy is =", "{:.4f} | ".format(accuracy))
        np.save(output_dir + '/pred_test_y.npy', pred_test_y)
        np.save(output_dir + '/test_y', test_y)
        with open(output_dir + '/Final_result.csv', 'a') as newFile:
            headers = ['epsilon', 'reg_param1', 'reg_param2',
                       'test_loss', 'test_accuracy']
            newFileWriter = csv.DictWriter(newFile, fieldnames=headers)
            newFileWriter.writeheader()
            newFileWriter.writerow({'epsilon': args.epsilon,
                                    'reg_param1': args.reg_param1,
                                    'reg_param2': args.reg_param2,
                                    'test_loss': loss,
                                    'test_accuracy': accuracy})
## 7. define the session
from util import EarlyStopping

config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
sess = tf.Session(config=config)

# get the model
model = NNModel(sess, args)
# get the early stop
early_stop = EarlyStopping(model)

init = tf.group(tf.global_variables_initializer(),
                tf.local_variables_initializer())
sess.run(init)

update_num = 0
is_early_stop = False

## base model training
for epoch in range(args.epochs):
    permutation = np.random.choice(train_num, train_num, replace=False)
    if is_early_stop:
        break
    for j in range(batch_num):
        update_num = update_num + 1
        batch_index = permutation[(j * args.batch_size):(j + 1) * args.batch_size]
        batch_x = train_x[batch_index, :]
        batch_y = train_y[batch_index, :]
        model.train_basemodel(sess, batch_x, batch_y, update_num)

        val_loss = model.val(sess, val_x, val_y)

        # Early stopping starts after enough updates
        if update_num > args.start_early_stop and update_num % args.display_step == 0:
            is_early_stop = early_stop(val_loss, args.stopping_criteria,
                                       args.model_name, update_num)
            print(is_early_stop)
            if is_early_stop:
                early_stop.restore(sess, args.model_name, update_num)
                # model.test(sess, test_x, test_y, output_dir)
                break
                      shuffle=True, drop_last=True, num_workers=4)
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size,
                      shuffle=True, drop_last=True, num_workers=4)

model = densenet161((1, 128, 128))
model.cuda()
criterion = BCEWithLogitsLoss().cuda()
optimizer = Adam(model.parameters(), lr=1e-3)
scheduler = MultiStepLR(optimizer, milestones=[30, 45, 60], gamma=0.1)
earlystopper = EarlyStopping(mode='max', min_delta=0.0001,
                             patience=15, percentage=False)

best_score = 0
model_state_path = os.path.join(
    Path.MODEL_DIR, 'densenet161_noisy_gpu_{}.tar'.format(args.gpu))

for epoch in range(MAX_ITERATIONS):
    model.train()
    for idx, ((x1, y1), (x2, y2)) in enumerate(zip(focus_dl, train_dl)):
        optimizer.zero_grad()
        x1, y1 = x1.float().cuda(), y1.float().cuda()
        x2, y2 = x2.float().cuda(), y2.float().cuda()
        x2, y2 = mixup_data(x2, y2, x2, y2, alpha=alpha)
        out2 = model(x2)
        out1 = model(x1)
        loss2 = criterion(out2, y2)
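# NOTE: mixup_data above is a project helper that is not shown here. A rough sketch of a
# standard mixup (Zhang et al., 2018) matching the call signature used above is given below;
# the permutation of the second batch and the Beta(alpha, alpha) weight are assumptions.
import numpy as np
import torch

def mixup_data(x_a, y_a, x_b, y_b, alpha=1.0):
    # Sample the mixing coefficient; alpha <= 0 disables mixing.
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    # Shuffle the second batch so samples are not mixed with themselves
    # when the same batch is passed twice, as in the loop above.
    perm = torch.randperm(x_b.size(0), device=x_b.device)
    x_mix = lam * x_a + (1.0 - lam) * x_b[perm]
    y_mix = lam * y_a + (1.0 - lam) * y_b[perm]
    return x_mix, y_mix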
f_1 = f_2 = np.arange(28)
f_1, f_2 = np.meshgrid(f_1, f_2)
f_1 = f_1.reshape(784, 1)
f_2 = f_2.reshape(784, 1)
fsd = np.concatenate((f_2, f_1), axis=1)
S = euclidean_distances(fsd)
S[S > 8] = 0

# 1. load base model
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
sess = tf.Session(config=config)

# get the model
model = NNModel(sess, args)
# get the early stop
early_stop = EarlyStopping(model)

init = tf.group(tf.global_variables_initializer(),
                tf.local_variables_initializer())
sess.run(init)

update_num = 0
args.batch_size = train_num
batch_num = int(train_num / args.batch_size)
is_early_stop = False

model.restore(sess, args.base_model_path, 'base_model')

## train the final model
model_name = 'final_model'
for epoch in range(args.epochs):
    permutation = np.random.choice(train_num, train_num, replace=False)
class Trainer():
    def __init__(self, conf):
        self.conf = conf
        self.device = torch.device(f"cuda:{conf.gpu_id}")
        self.log = get_logger()
        torch.set_printoptions(precision=8)
        if conf.runid:
            conf.rundir = mkdir(conf.outdir / conf.runid)
        if not conf.rundir:
            conf.rundir = next_rundir(conf.outdir, log=self.log)
        self.rundir = conf.rundir
        dump_args(conf, conf.rundir / "conf.json")
        set_random_seed(conf.random_seed)
        if self.conf.use_bert:
            assert self.conf.lang in Bert.supported_langs, self.conf.lang
            self.bert = Bert(self.conf.bert_model_name, device=self.device)
        else:
            self.bert = None
        self.data = load_dataset(conf, conf.lang, bert=self.bert)
        _data = [self.data]
        for d in _data:
            self.log.info(
                f"{len(d.train_loader)} batches | bs {conf.batch_size}")
        self.model = self.get_model()
        self.optimizer = get_optim(conf, self.model)
        optimum = "min"
        if conf.lr_scheduler == "plateau":
            self.lr_scheduler = ReduceLROnPlateau(self.optimizer, factor=0.1,
                                                  patience=2, mode=optimum,
                                                  verbose=True)
        elif conf.lr_scheduler:
            raise ValueError("Unknown lr_scheduler: " + conf.lr_scheduler)
        self.losses = LossTrackers.from_names("loss", log=self.log)
        if (self.main_lang_data.tag == "ner"
                or self.conf.dataset.startswith("sr3de")):
            if self.data.is_multilingual:
                self.sentence_texts = {
                    split_name: self.main_lang_data.token_texts(split_name)
                    for split_name in ["dev", "test"]}
                self.conll_score = {
                    lang: ConllScore(tag_enc=self.main_lang_data.tag_enc)
                    for lang in self.data.dev}
                self.score = {
                    lang: Score("f1", save_model=False, log=self.log,
                                score_func=self.conll_score[lang],
                                add_mode="append")
                    for lang in self.data.dev}
                self.avg_score = Score("avg_f1", log=self.log,
                                       score_func="dummy", add_mode="append")
            else:
                self.sentence_texts = {
                    split_name:
                    self.main_lang_data.token_texts(split_name)[:conf.max_eval_inst]
                    for split_name in ["dev", "test"]}
                self.conll_score = ConllScore(
                    tag_enc=self.main_lang_data.tag_enc)
                self.score = Score("f1", log=self.log,
                                   score_func=self.conll_score,
                                   add_mode="append")
        else:
            if self.data.is_multilingual:
                self.score = {
                    lang: Score("acc", log=self.log)
                    for lang in self.data.dev}
                self.avg_score = Score("avg_acc", log=self.log,
                                       score_func="dummy", add_mode="append")
            else:
                self.score = Score("acc", log=self.log)
        if conf.early_stop > 0:
            score_optimum = (
                "max"
                if (self.conf.dataset.startswith("wikiannmulti")
                    or self.data.is_multilingual)
                else self.score.optimum)
            self.early_stop = EarlyStopping(
                score_optimum,
                min_delta=conf.early_stop_min_delta,
                patience=conf.early_stop)
        else:
            self.early_stop = None
        self.epoch = 0

    def get_model(self):
        ntags = self.data.tag_enc.nlabels
        nshapes = self.data.shape_enc.nlabels
        nchars = self.data.char_enc.nlabels
        bpe_emb = emb_layer(self.data.bpemb.vectors,
                            trainable=not self.conf.emb_fixed,
                            use_weights=not self.conf.emb_random_init)
        if self.conf.use_fasttext:
            fasttext_file = self.conf.fasttext_emb_file.format(
                dataset=self.conf.dataset, lang=self.data.lang)
            fasttext_emb = emb_layer(
                load_word2vec_file(fasttext_file, add_unk=True),
                trainable=not self.conf.emb_fixed,
                use_weights=not self.conf.emb_random_init)
        else:
            fasttext_emb = None
        model = SequenceTagger(
            bpe_emb, ntags, self.conf,
            nchars=nchars, nshapes=nshapes,
            fasttext_emb=fasttext_emb,
            bert=self.bert,
            tag_enc=self.main_lang_data.tag_enc,
        ).to(self.device)
        self.log.info(f'model repr dim: {model.repr_dim}')
        if self.conf.model_file:
            self.log.info(f"loading model {self.conf.model_file}")
            model.load_state_dict(torch.load(self.conf.model_file))
            self.log.info(f"loaded model {self.conf.model_file}")
        return model

    def train(self, train_epoch, do_eval, do_test=None, eval_ds_name=None):
        try:
            for epoch in range(1, self.conf.max_epochs + 1):
                self.epoch = epoch
                self.model.train()
                train_epoch(epoch=epoch)
                self.losses.interval_end_log(epoch, ds_name="train")
                burnin_done = epoch >= self.conf.first_eval_epoch
                if burnin_done and not epoch % self.conf.eval_every:
                    score = self.do_eval(do_eval, epoch=epoch,
                                         eval_ds_name=eval_ds_name)
                    if do_test:
                        self.do_eval(do_test, epoch=epoch, eval_ds_name="test")
                    if score is not None and self.early_stop:
                        if self.early_stop.step(score):
                            if epoch >= self.conf.min_epochs:
                                patience = self.early_stop.patience
                                self.log.info(
                                    f"Early stop after {patience} steps")
                                break
        except KeyboardInterrupt:
            self.log.info("Stopping training due to keyboard interrupt")

    def do_eval(self, eval_func, epoch=None, eval_ds_name=None):
        self.model.eval()
        eval_func(epoch=epoch)
        self.log_eval(ds_name=eval_ds_name, epoch=epoch)
        if self.data.is_multilingual:
            return self.avg_score.current
        return self.score.current

    def log_eval(self, ds_name=None, epoch=None):
        self.losses.interval_end(ds_name=ds_name)
        if self.data.is_multilingual:
            for lang in getattr(self.data, ds_name):
                if hasattr(self, "conll_score"):
                    self.conll_score[lang].sentences = \
                        self.sentence_texts[ds_name][lang]
                    fname = f"{epoch}.{ds_name}.{lang}.conll"
                    self.conll_score[lang].outfile = self.rundir / fname
                self.score[lang].update()
            avg_score = np.average(
                [score.current for score in self.score.values()])
            self.avg_score.update_log(model=self.model, rundir=self.rundir,
                                      epoch=epoch, score=avg_score)
        else:
            if hasattr(self, "conll_score"):
                self.conll_score.sentences = self.sentence_texts[ds_name]
                fname = f"{epoch}.{ds_name}.conll"
                self.conll_score.outfile = self.rundir / fname
            self.score.update_log(self.model, self.rundir, epoch)

    def save_model(self):
        model_file = self.rundir / f"model.e{self.epoch}.pt"
        save_model(self.model, model_file, self.log)

    @property
    def main_lang_data(self):
        return self.data[0] if isinstance(self.data, list) else self.data

    @property
    def batch_iter_train_multilang(self):
        main_lang_len = len(self.data[0].train_loader)
        max_sim_lang_len = int(self.conf.sim_lang_ratio * main_lang_len)

        def get_sim_lang_len(i):
            sim_lang_len = len(self.data[i].train_loader)
            return min(sim_lang_len, max_sim_lang_len)

        lang_idxs = [
            i for i, data in enumerate(self.data)
            for _ in range(main_lang_len if i == 0 else get_sim_lang_len(i))
        ]
        random.shuffle(lang_idxs)
        iters = [data.batch_iter_train for data in self.data]
        return ((i, next(iters[i])) for i in lang_idxs)
def hp_search_optuna(trial: optuna.Trial):
    global gargs
    args = gargs
    # set config
    config = load_config(args)
    config['args'] = args
    logger.info("%s", config)
    # set path
    set_path(config)
    # create accelerator
    accelerator = Accelerator()
    config['accelerator'] = accelerator
    args.device = accelerator.device
    # set search spaces
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    bsz = trial.suggest_categorical('batch_size', [8, 16, 32, 64])
    seed = trial.suggest_int('seed', 17, 42)
    epochs = trial.suggest_int('epochs', 1, args.epoch)
    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config, hp_search_bsz=bsz)
    with temp_seed(seed):
        # prepare model
        model = prepare_model(config)
        # create optimizer, scheduler, summary writer
        model, optimizer, scheduler, writer = prepare_others(
            config, model, train_loader, lr=lr)
        # create secondary optimizer, scheduler
        _, optimizer_2nd, scheduler_2nd, _ = prepare_others(
            config, model, train_loader, lr=args.bert_lr_during_freezing)
        train_loader = accelerator.prepare(train_loader)
        valid_loader = accelerator.prepare(valid_loader)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['optimizer_2nd'] = optimizer_2nd
        config['scheduler_2nd'] = scheduler_2nd
        config['writer'] = writer

        total_batch_size = (args.batch_size * accelerator.num_processes *
                            args.gradient_accumulation_steps)
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_loader)}")
        logger.info(f"  Num Epochs = {args.epoch}")
        logger.info(f"  Instantaneous batch size per device = {args.batch_size}")
        logger.info(
            f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
        logger.info(
            f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
        logger.info(f"  Total optimization steps = {args.max_train_steps}")

        early_stopping = EarlyStopping(logger, patience=args.patience,
                                       measure='f1', verbose=1)
        best_eval_f1 = -float('inf')
        for epoch in range(epochs):
            eval_loss, eval_f1, best_eval_f1 = train_epoch(
                model, config, train_loader, valid_loader, epoch, best_eval_f1)
            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'):
                break
            if eval_f1 == best_eval_f1:
                early_stopping.reset(best_eval_f1)
            early_stopping.status()
            trial.report(eval_f1, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return eval_f1
def train(args):
    # set etc
    torch.autograd.set_detect_anomaly(False)
    # set config
    config = load_config(args)
    config['args'] = args
    logger.info("%s", config)
    # set path
    set_path(config)
    # create accelerator
    accelerator = Accelerator()
    config['accelerator'] = accelerator
    args.device = accelerator.device
    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config)
    with temp_seed(args.seed):
        # prepare model
        model = prepare_model(config)
        # create optimizer, scheduler, summary writer
        model, optimizer, scheduler, writer = prepare_others(
            config, model, train_loader)
        # create secondary optimizer, scheduler
        _, optimizer_2nd, scheduler_2nd, _ = prepare_others(
            config, model, train_loader, lr=args.bert_lr_during_freezing)
        train_loader = accelerator.prepare(train_loader)
        valid_loader = accelerator.prepare(valid_loader)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['optimizer_2nd'] = optimizer_2nd
        config['scheduler_2nd'] = scheduler_2nd
        config['writer'] = writer

        total_batch_size = (args.batch_size * accelerator.num_processes *
                            args.gradient_accumulation_steps)
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_loader)}")
        logger.info(f"  Num Epochs = {args.epoch}")
        logger.info(f"  Instantaneous batch size per device = {args.batch_size}")
        logger.info(
            f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
        logger.info(
            f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
        logger.info(f"  Total optimization steps = {args.max_train_steps}")

        # training
        early_stopping = EarlyStopping(logger, patience=args.patience,
                                       measure='f1', verbose=1)
        local_worse_epoch = 0
        best_eval_f1 = -float('inf')
        for epoch_i in range(args.epoch):
            epoch_st_time = time.time()
            eval_loss, eval_f1, best_eval_f1 = train_epoch(
                model, config, train_loader, valid_loader, epoch_i, best_eval_f1)
            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'):
                break
            if eval_f1 == best_eval_f1:
                early_stopping.reset(best_eval_f1)
            early_stopping.status()
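# NOTE: the EarlyStopping used in hp_search_optuna and train above (constructed with a logger
# and a metric name) is project-specific and not included here. A rough sketch that is
# consistent with the validate()/reset()/status() calls above is given below; the exact
# semantics, field names, and log format are assumptions.
class EarlyStopping:
    def __init__(self, logger, patience=7, measure='f1', verbose=0):
        self.logger = logger
        self.patience = patience
        self.measure = measure
        self.verbose = verbose
        self.best = -float('inf')
        self.counter = 0

    def validate(self, value, measure='f1'):
        # Return True when the monitored value has not improved for `patience` epochs.
        if value > self.best:
            self.best = value
            self.counter = 0
            return False
        self.counter += 1
        return self.counter >= self.patience

    def reset(self, best):
        # Called externally when a new best score has been recorded.
        self.best = best
        self.counter = 0

    def status(self):
        if self.verbose:
            self.logger.info(f"[EarlyStopping] best {self.measure}={self.best}, "
                             f"counter={self.counter}/{self.patience}")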
)
eval_loader = data.DataLoader(
    dataset=DataFolder('dataset/eval_images_256/',
                       'dataset/eval_masks_256/', 'evaluate'),
    batch_size=args.eval_batch_size,
    shuffle=False,
    num_workers=2
)

model = UNet(1, shrink=1).cuda()
nets = [model]
params = [{'params': net.parameters()} for net in nets]
solver = optim.Adam(params, lr=args.lr)
criterion = nn.CrossEntropyLoss()
es = EarlyStopping(min_delta=args.min_delta, patience=args.patience)

for epoch in range(1, args.epochs + 1):
    train_loss = []
    valid_loss = []
    for batch_idx, (img, mask, _) in enumerate(train_loader):
        solver.zero_grad()
        img = img.cuda()
        mask = mask.cuda()
        pred = model(img)
        loss = criterion(pred, mask)
# Inspect the trainable parameters
print("Trainable Parameter lists:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

loss_func = nn.BCELoss()
learning_rate = 2e-3
# learning_rate = 1e-5
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.5)
# scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[20, 50], gamma=0.1)
early_stopping = EarlyStopping(patience=10, verbose=False, path=model_save_path)

num_epoches = 100
print_every = 40
k_value = 5

print("Starting Training...")
for epoch in range(num_epoches):
    # scheduler.step()
    total = 0
    count = 0
    avg_loss = 0
    train_count = 0
    train_total = 0
    step = 0
                                 patience=5,
                                 verbose=1,
                                 epsilon=1e-8,
                                 cooldown=0,
                                 min_lr=0,
                                 eps=1e-8)
train_monitor = TrainingMonitor(file_dir='pics', arch=param.model_name)
if param.task == 'cue':
    model_checkpoint = ModelCheckpoint(
        checkpoint_dir=f'D:/Dev/Bert/{param.model_name}',
        monitor='val_cue_f1',
        mode='max',
        arch=param.model_name)
    early_stopping = EarlyStopping(patience=param.early_stop_thres,
                                   monitor='val_cue_f1',
                                   mode='max')
    trainer = CueTrainer(n_gpu=1,
                         model=model,
                         logger=global_logger,
                         optimizer=optimizer,
                         lr_scheduler=lr_scheduler,
                         label2id=cue_label2id,
                         criterion=criterion,
                         training_monitor=train_monitor,
                         model_checkpoint=model_checkpoint,
                         resume_path=None,
                         grad_clip=5.0,
                         gradient_accumulation_steps=1,
                         early_stopping=early_stopping)
elif param.task == 'scope':
eval_set = ElisaDataset('elisadata/standard', 'EVALUATE')
eval_loader = data.DataLoader(dataset=eval_set,
                              batch_size=args.eval_batch_size,
                              shuffle=False, num_workers=0)

elisa_net = network.ElisaNet(args.c_feat).cuda()
params = [{'params': elisa_net.parameters()}]
solver = optim.Adam(params, lr=args.lr)
lmda = lambda x: 0.5  # TODO: can change this based on bad_epochs
scheduler = LS.MultiplicativeLR(solver, lr_lambda=lmda)
es = EarlyStopping(mode=args.es_mode, min_delta=args.loss_delta,
                   patience=args.patience)

epoch = 0
if args.resume_epoch != 0:
    load_weights([elisa_net], solver, args.resume_epoch, args)
    epoch = args.resume_epoch
    solver = lr_resume(solver, args.lr_resume)
    print('Loaded weights from epoch {}'.format(args.resume_epoch))

while epoch < args.epochs and not args.eval:
    epoch += 1
    train_loss, _ = forward_pass(train_loader, elisa_net, solver,
                                 scheduler, 'TRAIN', epoch, args)
def train(self, training_data: TrainingData) -> None:
    x_train, y_train, x_val, y_val, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data,
        full_question=args.full_question,
        create_runs=args.create_runs,
        map_pattern=args.map_pattern,
        wiki_links=args.wiki_links,
        use_es_highlight=args.use_es_highlight)
    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.map_pattern = args.map_pattern
    self.wiki_links = args.wiki_links
    self.use_es_highlight = args.use_es_highlight
    self.full_question = args.full_question
    self.use_wiki = args.use_wiki

    log = get(__name__, "dan.log")
    log.info('Batchifying data')
    vocab = ['<unk>', '<eos>'] + sorted(vocab)
    word_to_i = {x: i for i, x in enumerate(vocab)}
    self.word_to_i = word_to_i
    log.info('Vocab len: ' + str(len(self.word_to_i)))

    train_sampler = RandomSampler(list(zip(x_train, y_train)))
    dev_sampler = RandomSampler(list(zip(x_val, y_val)))
    dev_loader = DataLoader(list(zip(x_val, y_val)),
                            batch_size=args.batch_size,
                            sampler=dev_sampler,
                            num_workers=0,
                            collate_fn=self.batchify)
    train_loader = DataLoader(list(zip(x_train, y_train)),
                              batch_size=args.batch_size,
                              sampler=train_sampler,
                              num_workers=0,
                              collate_fn=self.batchify)

    self.model = DanModel(len(i_to_class), len(vocab))
    self.model = self.model.to(self.device)

    log.info(f'Loading GloVe')
    glove_word2idx, glove_vectors = load_glove("glove/glove.6B.300d.txt")
    for word, emb_index in word_to_i.items():
        if word.lower() in glove_word2idx:
            glove_index = glove_word2idx[word.lower()]
            glove_vec = torch.FloatTensor(glove_vectors[glove_index])
            glove_vec = glove_vec.cuda()
            self.model.text_embeddings.weight.data[emb_index, :].set_(glove_vec)

    log.info(f'Model:\n{self.model}')
    self.optimizer = Adam(self.model.parameters())
    self.criterion = nn.CrossEntropyLoss()
    self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5,
                                                    verbose=True, mode='max')

    temp_prefix = get_tmp_filename()
    self.model_file = f'{temp_prefix}.pt'
    print(f'Saving model to: {self.model_file}')

    manager = TrainingManager([
        BaseLogger(log_func=log.info),
        TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1),
        MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model),
                        self.model_file, monitor='test_acc')
    ])

    log.info('Starting training')
    epoch = 0
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(train_loader)

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(dev_loader, train=False)

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc)

        if stop_training:
            log.info(' '.join(reasons))
            break
        else:
            self.scheduler.step(test_acc)
        epoch += 1
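# NOTE: load_glove above is an external helper that is not shown in this snippet. A minimal
# sketch of the behaviour the loop assumes (a word-to-row-index dict plus an array of 300-d
# vectors, read from the standard GloVe text format) could look like this; the exact return
# types are assumptions.
import numpy as np

def load_glove(path):
    word2idx = {}
    vectors = []
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Each line is: word v1 v2 ... v300 (space-separated).
            parts = line.rstrip().split(' ')
            word2idx[parts[0]] = i
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    return word2idx, np.stack(vectors)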