def fit(x, y, z, dev_x, dev_y, dev_z, lr, decay_weight, n_epochs=n_epochs):
    train_K = np.load(ROOT_PATH + '/mendelian_precomp/{}_train_K.npy'.format(sname))
    dev_K = np.load(ROOT_PATH + '/mendelian_precomp/{}_dev_K.npy'.format(sname))
    train_K = torch.from_numpy(train_K).float()
    dev_K = torch.from_numpy(dev_K).float()
    n_data = x.shape[0]
    net = Net(x.shape[1])
    es = EarlyStopping(patience=5)
    optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight)
    for epoch in range(n_epochs):
        permutation = torch.randperm(n_data)
        for i in range(0, n_data, batch_size):
            indices = permutation[i:i + batch_size]
            batch_x, batch_y = x[indices], y[indices]  # training loop

            def closure():
                optimizer.zero_grad()
                pred_y = net(batch_x)
                loss = my_loss(pred_y, batch_y, indices, train_K)
                loss.backward()
                return loss

            optimizer.step(closure)  # does the update
        if epoch % 5 == 0 and epoch >= 5 and dev_x is not None:  # 5, 10 for small; 5, 50 for large
            g_pred = net(test.x.float())
            test_err = ((g_pred - test.g.float()) ** 2).mean()
            dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
            print('test', test_err, 'dev', dev_err)
            if es.step(dev_err):
                break
    return es.best, epoch, net
def fit(x,y,z,dev_x,dev_y,dev_z,a,lr,decay_weight, ax, y_axz, w_samples, n_epochs=n_epochs): if 'mnist' in sname: train_K = torch.eye(x.shape[0]) else: train_K = (kernel(z, None, a, 1)+kernel(z, None, a/10, 1)+kernel(z, None, a*10, 1))/3 if dev_z is not None: if 'mnist' in sname: dev_K = torch.eye(x.shape[0]) else: dev_K = (kernel(dev_z, None, a, 1)+kernel(dev_z, None, a/10, 1)+kernel(dev_z, None, a*10, 1))/3 n_data = x.shape[0] net = FCNN(x.shape[1]) if sname not in ['mnist_x', 'mnist_xz'] else CNN() es = EarlyStopping(patience=10) # 10 for small optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight) test_errs, dev_errs, exp_errs, mse_s = [], [], [], [] for epoch in range(n_epochs): permutation = torch.randperm(n_data) for i in range(0, n_data, batch_size): indices = permutation[i:i+batch_size] batch_x, batch_y = x[indices], y[indices] # training loop def closure(): optimizer.zero_grad() pred_y = net(batch_x) loss = my_loss(pred_y, batch_y, indices, train_K) loss.backward() return loss optimizer.step(closure) # Does the update if epoch % 5 == 0 and epoch >= 50 and dev_x is not None: # 5, 10 for small # 5,50 for large g_pred = net(test_X) # TODO: is it supposed to be test_X here? A: yes I think so. test_err = ((g_pred-test_Y)**2).mean() # TODO: why isn't this loss reweighted? A: because it is supposed to measure the agreement between prediction and labels. if epoch == 50 and 'mnist' in sname: if z.shape[1] > 100: train_K = np.load(ROOT_PATH+'/mnist_precomp/{}_train_K0.npy'.format(sname)) train_K = (torch.exp(-train_K/a**2/2)+torch.exp(-train_K/a**2*50)+torch.exp(-train_K/a**2/200))/3 dev_K = np.load(ROOT_PATH+'/mnist_precomp/{}_dev_K0.npy'.format(sname)) dev_K = (torch.exp(-dev_K/a**2/2)+torch.exp(-dev_K/a**2*50)+torch.exp(-dev_K/a**2/200))/3 else: train_K = (kernel(z, None, a, 1)+kernel(z, None, a/10, 1)+kernel(z, None, a*10, 1))/3 dev_K = (kernel(dev_z, None, a, 1)+kernel(dev_z, None, a/10, 1)+kernel(dev_z, None, a*10, 1))/3 dev_err = my_loss(net(dev_x), dev_y, None, dev_K) err_in_expectation, mse = conditional_expected_loss(net=net, ax=ax, w_samples=w_samples, y_samples=y_samples, y_axz=y_axz, x_on=False) print('test', test_err, 'dev', dev_err, 'err_in_expectation', err_in_expectation, 'mse: ', mse) test_errs.append(test_err) dev_errs.append(dev_err) exp_errs.append(err_in_expectation) mse_s.append(mse) if es.step(dev_err): break losses = {'test': test_errs, 'dev': dev_errs, 'exp': exp_errs, 'mse_': mse_s} return es.best, epoch, net, losses
def make_lda_model(self, sentences: list, threshold_remove_doc_freq_rate_over_this: float,
                   rate_of_valid: float, num_topics=20, passes=200, patience=5,
                   must_move_this_rate=0.03, round_check_convergence=3) -> bool:
    """
    Wrapper that builds an LDA model.
    Everything LDA-related is saved automatically via joblib.
    `passes` is the number of training iterations; if it is too small, quality is very poor.
    A numeric quality metric is printed, so if the model does not look converged, increase `passes` and rerun.
    reference:
    [a quick read with plenty of math](http://acro-engineer.hatenablog.com/entry/2017/12/11/120000)
    [for serious readers](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf)
    :param sentences: list of str, the document collection (one sentence per element)
    :param threshold_remove_doc_freq_rate_over_this: float (0-1), words whose document-frequency rate is at or above this value are treated as stop words
    :param rate_of_valid: float, greater than 0 and less than 1, fraction of the data held out as validation for the convergence check
    :param num_topics: int, number of topics to assume; see the references for what this means
    :param passes: int, number of LDA training passes; more is usually better
    :param patience: int, stop if the convergence metric has not moved for `patience` checks
    :param must_move_this_rate: float, greater than 0 and less than 1, relative change below which the metric is considered not to have moved
    :param round_check_convergence: int, how often to check convergence; the check is expensive, so it runs once every `round_check_convergence` passes
    :return:
    """
    # documents -> list of tokens; tokens must not be high-frequency words
    self.make_tokens_list(sentences=sentences,
                          threshold_remove_doc_freq_rate_over_this=threshold_remove_doc_freq_rate_over_this)
    # gensim dictionary; prune_at is set so the full vocabulary is kept
    # (without it, gensim would probably truncate at around 10,000 words)
    self.dictionary = Dictionary(self.tokens_list, prune_at=self.num_tokens_variation)
    # tokens -> corpus
    corpus = [self.dictionary.doc2bow(tokens) for tokens in self.tokens_list]
    # prepare LDA
    # make LDA
    # ## for early stopping
    train, valid = split_train_valid(corpus=corpus, rate_of_valid=rate_of_valid)
    early_stopping = EarlyStopping(patience=patience, must_move_this_rate=must_move_this_rate)
    # ## make
    model_lda = LdaMulticore(corpus=train, num_topics=num_topics, id2word=self.dictionary,
                             workers=self.num_process, passes=1, eval_every=round_check_convergence)
    _ = early_stopping.is_converged(model=model_lda, valid_corpus=valid)
    # ## train model
    for i_loop in tqdm(range(1, passes), desc="lda learning @ lda"):
        model_lda.update(train)
        if i_loop % round_check_convergence == 0:
            if early_stopping.is_converged(model=model_lda, valid_corpus=valid):
                break
    # show convergence
    self.log = early_stopping.log
    # save
    joblib.dump(self.dict_is_high_freq_token, f"{self.path_to_save}dict_is_stops.joblib")
    joblib.dump(model_lda, f"{self.path_to_save}model_LDA.joblib")
    joblib.dump(self.dictionary, f"{self.path_to_save}dictionary_LDA.joblib")
    return True
def run(self, num_epochs, patience):
    early_stopping = (patience >= 1)
    if early_stopping:
        from early_stopping import EarlyStopping
        self.stopper = EarlyStopping(patience=patience)
    self.initClassifier()
    self.dataset.train()
    self.train_loader = DataLoader(self.dataset, self.batch_size, shuffle=True, num_workers=0)
    self.optimizer = optim.Adam(self.classifier.parameters(), lr=1e-3, weight_decay=1e-1, amsgrad=False)
    self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min', patience=2, cooldown=3, factor=0.5)
    self.log_loss = torch.nn.BCEWithLogitsLoss()
    self.pbar = progressbar(range(num_epochs))
    for ep in self.pbar:
        if early_stopping:
            with torch.no_grad():
                shouldStop = self.test()
            if shouldStop:
                self.pbar.close()
                break
        self.train()
    return self.classifier
def do_train(model, config, train_data, dev_data):
    early_stopping = EarlyStopping(patience=10, measure='f1', verbose=1)
    maximum = 0
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(config=session_conf)
    feed_dict = {model.wrd_embeddings_init: config.embvec.wrd_embeddings}
    sess.run(tf.global_variables_initializer(), feed_dict=feed_dict)  # feed large embedding data
    saver = tf.train.Saver()
    if config.restore is not None:
        saver.restore(sess, config.restore)
        print('model restored')
    # summary setting
    loss_summary = tf.summary.scalar('loss', model.loss)
    acc_summary = tf.summary.scalar('accuracy', model.accuracy)
    train_summary_op = tf.summary.merge([loss_summary, acc_summary])
    train_summary_dir = os.path.join(config.summary_dir, 'summaries', 'train')
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
    dev_summary_dir = os.path.join(config.summary_dir, 'summaries', 'dev')
    dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
    for e in range(config.epoch):
        train_step(sess, model, config, train_data, train_summary_op, train_summary_writer)
        m = dev_step(sess, model, config, dev_data, dev_summary_writer, e)
        # early stopping
        if early_stopping.validate(m, measure='f1'):
            break
        if m > maximum:
            print('new best f1 score! : %s' % m)
            maximum = m
            # save best model
            save_path = saver.save(sess, config.checkpoint_dir + '/' + 'ner_model')
            print('max model saved in file: %s' % save_path)
            tf.train.write_graph(sess.graph, '.', config.checkpoint_dir + '/' + 'graph.pb', as_text=False)
            tf.train.write_graph(sess.graph, '.', config.checkpoint_dir + '/' + 'graph.pb_txt', as_text=True)
    sess.close()
def fit(model, train_data, dev_data):
    """Do actual training.
    """
    def get_summary_setting(model):
        config = model.config
        sess = model.sess
        loss_summary = tf.summary.scalar('loss', model.loss)
        acc_summary = tf.summary.scalar('accuracy', model.accuracy)
        f1_summary = tf.summary.scalar('f1', model.f1)
        lr_summary = tf.summary.scalar('learning_rate', model.learning_rate)
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, f1_summary, lr_summary])
        train_summary_dir = os.path.join(config.summary_dir, 'summaries', 'train')
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
        dev_summary_dir = os.path.join(config.summary_dir, 'summaries', 'dev')
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
        return train_summary_op, train_summary_writer, dev_summary_writer

    config = model.config
    sess = model.sess
    # restore previous model if provided
    saver = tf.train.Saver()
    if config.restore is not None:
        saver.restore(sess, config.restore)
        tf.logging.debug('model restored')
    # summary setting
    train_summary_op, train_summary_writer, dev_summary_writer = get_summary_setting(model)
    # train and evaluate
    early_stopping = EarlyStopping(patience=10, measure='f1', verbose=1)
    max_seqeval_f1 = 0
    for e in range(config.epoch):
        train_step(model, train_data, train_summary_op, train_summary_writer)
        seqeval_f1, avg_f1 = dev_step(model, dev_data, dev_summary_writer, e)
        # early stopping
        if early_stopping.validate(seqeval_f1, measure='f1'):
            break
        if seqeval_f1 > max_seqeval_f1:
            tf.logging.debug('new best f1 score! : %s' % seqeval_f1)
            max_seqeval_f1 = seqeval_f1
            # save best model
            save_path = saver.save(sess, config.checkpoint_dir + '/' + 'ner_model')
            tf.logging.debug('max model saved in file: %s' % save_path)
            tf.train.write_graph(sess.graph, '.', config.checkpoint_dir + '/' + 'graph.pb', as_text=False)
            tf.train.write_graph(sess.graph, '.', config.checkpoint_dir + '/' + 'graph.pb_txt', as_text=True)
            early_stopping.reset(max_seqeval_f1)
        early_stopping.status()
    sess.close()
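# The two TensorFlow NER training loops above use an EarlyStopping interface in which
# validate(value, measure=...) returns True once the measure has failed to improve for
# `patience` evaluations, reset(best) re-arms the counter after a new best score, and
# status() prints the current state. The class below is an assumed, illustrative sketch
# of that interface, not the original implementation used by those scripts.
class EarlyStopping:
    """Measure-based early stopping with validate/reset/status (illustrative sketch)."""

    def __init__(self, patience=10, measure='f1', verbose=0):
        self.patience = patience
        self.measure = measure
        self.verbose = verbose
        self.best = float('inf') if measure == 'loss' else -float('inf')
        self.bad_steps = 0

    def validate(self, value, measure='f1'):
        # Count evaluations that do not beat the tracked best; True means "stop training".
        improved = value < self.best if measure == 'loss' else value > self.best
        if improved:
            self.bad_steps = 0
        else:
            self.bad_steps += 1
        if self.verbose:
            print('early stopping: %d / %d checks without improvement' % (self.bad_steps, self.patience))
        return self.bad_steps >= self.patience

    def reset(self, best):
        # Called by the training loop after it saves a new best model.
        self.best = best
        self.bad_steps = 0

    def status(self):
        print('best %s: %s, checks without improvement: %d' % (self.measure, str(self.best), self.bad_steps))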
def train(model, train_iterator, valid_iterator, test_iterator, optimizer, criterion,
          clip=1, short_train=True, n_epochs=10, teacher_force=0.5, eval_words=None, patience=3):
    early_stopping = EarlyStopping(patience=patience, verbose=False, filename='cache/checkpoint.pt')
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train_epoch(model, train_iterator, optimizer, criterion, clip, short_train,
                                 teacher_force=teacher_force)
        valid_loss, valid_accuracy = evaluate(model, valid_iterator, criterion, eval_words=eval_words)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3E}')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3E}')
        print(f'\t Val. Accuracy: {valid_accuracy:.3f}')
        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print("Early stopping, reloading checkpoint model")
            model.load_state_dict(torch.load('cache/checkpoint.pt'))
            break
    test_loss, test_accuracy = evaluate(model, test_iterator, criterion, eval_words=eval_words)
    print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3E} |')
    print(f'| Test Accuracy: {test_accuracy:.3f}')
def hp_search(trial: optuna.Trial):
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))
    global gopt
    opt = gopt
    # set config
    config = load_config(opt)
    config['opt'] = opt
    logger.info("%s", config)
    # set path
    set_path(config)
    # set search spaces
    lr = trial.suggest_loguniform('lr', 1e-6, 1e-3)  # .suggest_float('lr', 1e-6, 1e-3, log=True)
    bsz = trial.suggest_categorical('batch_size', [32, 64, 128])
    seed = trial.suggest_int('seed', 17, 42)
    epochs = trial.suggest_int('epochs', 1, opt.epoch)
    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config, hp_search_bsz=bsz)
    with temp_seed(seed):
        # prepare model
        model = prepare_model(config)
        # create optimizer, scheduler, summary writer, scaler
        optimizer, scheduler, writer, scaler = prepare_osws(config, model, train_loader, hp_search_lr=lr)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['writer'] = writer
        config['scaler'] = scaler
        early_stopping = EarlyStopping(logger, patience=opt.patience, measure=opt.measure, verbose=1)
        best_eval_measure = float('inf') if opt.measure == 'loss' else -float('inf')
        for epoch in range(epochs):
            eval_loss, eval_acc = train_epoch(model, config, train_loader, valid_loader, epoch)
            if opt.measure == 'loss':
                eval_measure = eval_loss
            else:
                eval_measure = eval_acc
            # early stopping
            if early_stopping.validate(eval_measure, measure=opt.measure):
                break
            if opt.measure == 'loss':
                is_best = eval_measure < best_eval_measure
            else:
                is_best = eval_measure > best_eval_measure
            if is_best:
                best_eval_measure = eval_measure
                early_stopping.reset(best_eval_measure)
            early_stopping.status()
            trial.report(eval_acc, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return eval_acc
def main():
    # is_training = True
    model = MODEL_DISPATCHER[BASE_MODEL](training=True)
    print(model)
    model = model.to(DEVICE)
    EarlyStoppingObject = EarlyStopping()
    Training_Dataset = BengaliAiDataset(folds=TRAINING_FOLDS,
                                        img_height=IMG_HEIGHT,
                                        img_width=IMG_WIDTH,
                                        mean=MODEL_MEAN,
                                        std=MODEL_STD)
    Train_DataLoader = torch.utils.data.DataLoader(dataset=Training_Dataset,
                                                   batch_size=TRAIN_BATCH_SIZE,
                                                   shuffle=True,
                                                   num_workers=4)
    Validation_Dataset = BengaliAiDataset(folds=VALIDATION_FOLDS,
                                          img_height=IMG_HEIGHT,
                                          img_width=IMG_WIDTH,
                                          mean=MODEL_MEAN,
                                          std=MODEL_STD)
    Validation_DataLoader = torch.utils.data.DataLoader(dataset=Validation_Dataset,
                                                        batch_size=TEST_BATCH_SIZE,
                                                        shuffle=False,
                                                        num_workers=4)
    optimiser = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimiser, mode="min",
                                                           patience=5, factor=0.3,
                                                           verbose=True)
    for epoch in range(EPOCHS):
        train(Training_Dataset, Train_DataLoader, model, optimiser)
        validationScore = evaluate(Validation_Dataset, Validation_DataLoader, model, optimiser)
        scheduler.step(validationScore)
        print(f"EPOCH : {epoch} VALIDATION SCORE : {validationScore}")
        # torch.save(model.state_dict(), f"../input/output_models/{BASE_MODEL}_fold{VALIDATION_FOLDS[0]}.bin")
        EarlyStoppingObject(validationScore, model,
                            f"../input/output_models/{BASE_MODEL}_fold{VALIDATION_FOLDS[0]}.bin")
def main():
    # Load experiment configuration
    config = load_config()
    manual_seed = config.get('manual_seed', None)
    if manual_seed is not None:
        torch.manual_seed(manual_seed)
        # see https://pytorch.org/docs/stable/notes/randomness.html
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    # Create the model
    device = config['training']['device']
    model = get_model(config)
    learning_rate = config['training']['learning_rate']
    # momentum = config['training']['momentum']
    wd = config['training']['wd']
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=wd)  # betas=(0.9, 0.999), eps=1e-08, amsgrad=False
    step_size = config['training']['step_size']
    gamma = config['training']['gamma']
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma, last_epoch=-1)
    patience = config['training']['patience']
    delta = config['training']['delta']
    early_stopping = EarlyStopping(patience=patience, verbose=True, delta=delta, checkpoint_path=None)
    # Create loss criterion
    loss_type = config['loss']['loss_type']
    w0, w1 = config['loss']['w0'], config['loss']['w1']
    ce_weights = [w0, w1]
    dce_w = config['loss']['dce_w']
    nll = config['loss']['nll']
    criterion = DiceCrossEntropyLoss(loss=loss_type, logging_name=None, ce_weights=ce_weights,
                                     dce_weight=dce_w, nll=nll)
    # Start training
    train(model, config, optimizer, scheduler, criterion, early_stopping=None)
def train(opt):
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))
    # set etc
    torch.autograd.set_detect_anomaly(True)
    # set config
    config = load_config(opt)
    config['opt'] = opt
    logger.info("%s", config)
    # set path
    set_path(config)
    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config)
    with temp_seed(opt.seed):
        # prepare model
        model = prepare_model(config)
        # create optimizer, scheduler, summary writer, scaler
        optimizer, scheduler, writer, scaler = prepare_osws(config, model, train_loader)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['writer'] = writer
        config['scaler'] = scaler
        # training
        early_stopping = EarlyStopping(logger, patience=opt.patience, measure='f1', verbose=1)
        local_worse_epoch = 0
        best_eval_f1 = -float('inf')
        for epoch_i in range(opt.epoch):
            epoch_st_time = time.time()
            eval_loss, eval_f1, best_eval_f1 = train_epoch(model, config, train_loader, valid_loader,
                                                           epoch_i, best_eval_f1)
            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'):
                break
            if eval_f1 == best_eval_f1:
                early_stopping.reset(best_eval_f1)
            early_stopping.status()
def main():
    parser = define_args()
    args = parser.parse_args()
    num_classes = 17
    cudnn.benchmark = True
    print('Training model(s) for folds: {}'.format(args.folds))
    for fold in args.folds:
        model, bootstrap_params, full_params = create_model(num_classes, args.lr)
        criterion = torch.nn.MultiLabelSoftMarginLoss()
        if torch.cuda.is_available():
            model = model.cuda()
            criterion = criterion.cuda()
        bootstrap_optimizer = torch.optim.Adam(bootstrap_params, args.lr)
        optimizer = torch.optim.Adam(full_params, args.lr)
        train_loader, val_loader = create_data_pipeline(fold, args)
        tuner = Tuner(model, criterion, bootstrap_optimizer, optimizer,
                      tag='fold_{}'.format(fold),
                      early_stopping=EarlyStopping(mode='max', threshold_mode='abs', patience=7))
        if args.resume:
            if os.path.isfile(args.resume):
                tuner.restore_checkpoint(args.resume)
        tuner.run(train_loader, val_loader)
def train(model, train_iterator, valid_iterator, test_iterator, optimizer, criterion,
          model_checkpoint, device, clip=1, short_train=True, n_epochs=50, patience=3):
    early_stopping = EarlyStopping(patience=patience, verbose=False, filename=model_checkpoint)
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train_epoch(model, train_iterator, optimizer, criterion, clip, device, short_train)
        valid_loss = evaluate(model, valid_iterator, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3E}')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3E}')
        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print("Early stopping, reloading checkpoint model")
            model.load_state_dict(torch.load(model_checkpoint))
            break
    test_loss = evaluate(model, test_iterator, criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3E} |')
def train(model, train_iter, val_iter, test_iter, optimizer, criterion, n_epochs,
          short_train, checkpoint_name, patience):
    early_stopping = EarlyStopping(filename=checkpoint_name, patience=patience)
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train_epoch(model, train_iter, optimizer, criterion, short_train)
        val_loss, val_acc = evaluate(model, val_iter, criterion)
        end_time = time.time()
        epoch_min, epoch_sec = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch + 1:02} | Time: {epoch_min}m {epoch_sec}s')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {val_loss:.3f} | Val. Accuracy {val_acc:.3f}')
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping, reloading checkpoint model")
            model.load_state_dict(torch.load(checkpoint_name))
            break
    test_loss, test_acc = evaluate(model, test_iter, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Accuracy: {test_acc:.3f}')
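# The seq2seq training loops above call an EarlyStopping object as early_stopping(val_loss, model),
# then read early_stopping.early_stop and reload the checkpoint written to `filename`. A minimal
# sketch of such a helper is given below; it is an assumed implementation for illustration (the
# original checkpoint-based class is not shown here), saving a checkpoint whenever the validation
# loss improves and raising the early_stop flag after `patience` epochs without improvement.
import torch


class EarlyStopping:
    """Minimal loss-based early stopping with checkpointing (illustrative sketch)."""

    def __init__(self, patience=3, verbose=False, filename='checkpoint.pt', delta=0.0):
        self.patience = patience
        self.verbose = verbose
        self.filename = filename
        self.delta = delta
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.delta:
            # Improvement: save the model and reset the patience counter.
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.filename)
            if self.verbose:
                print(f'Validation loss improved to {val_loss:.6f}; checkpoint saved.')
        else:
            # No improvement: count toward patience.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True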
def __init__(self, trainLoader, testLoader, model, epoch=100, eps=1e-3, savePath="./"):
    self.trainLoader = trainLoader
    self.testLoader = testLoader
    self.model = model
    # self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)
    self.optimizer = torch.optim.Adam(self.model.parameters())
    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode="min",
                                                                patience=3, threshold=0.0001)
    self.start_index = 1
    self.epoch = epoch
    self.eps = eps
    self.vis = visdom.Visdom(env="imageNet")
    self.interval = 1
    self.checker = EarlyStopping(delta=self.eps)
    self.device = device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.currentId = self.start_index
    self.savePath = savePath
def train_model(train_iterator, val_iterator, test_iterator): hidden_size = 8 vocab_size = len(train_iterator.word2index) n_extra_feat = 10 output_size = 2 n_layers = 1 dropout = 0.5 learning_rate = 0.001 epochs = 40 spatial_dropout = True bidirectional = True # Load the weights matrix weights = np.load('glove/weights-biGRU-glove.npy') # Check whether system supports CUDA CUDA = torch.cuda.is_available() model = BiGRU(hidden_size, vocab_size, n_extra_feat, weights, output_size, n_layers, dropout, spatial_dropout, bidirectional) # Move the model to GPU if possible if CUDA: model.cuda() model.add_loss_fn(nn.NLLLoss()) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) model.add_optimizer(optimizer) device = torch.device('cuda' if CUDA else 'cpu') model.add_device(device) # Instantiate the EarlyStopping early_stop = EarlyStopping(wait_epochs=2) train_losses_list, train_avg_loss_list, train_accuracy_list = [], [], [] eval_avg_loss_list, eval_accuracy_list, conf_matrix_list = [], [], [] for epoch in range(epochs): print('\nStart epoch [{}/{}]'.format(epoch + 1, epochs)) train_losses, train_avg_loss, train_accuracy = model.train_model( train_iterator) train_losses_list.append(train_losses) train_avg_loss_list.append(train_avg_loss) train_accuracy_list.append(train_accuracy) _, eval_avg_loss, eval_accuracy, conf_matrix = model.evaluate_model( val_iterator) eval_avg_loss_list.append(eval_avg_loss) eval_accuracy_list.append(eval_accuracy) conf_matrix_list.append(conf_matrix) print( '\nEpoch [{}/{}]: Train accuracy: {:.3f}. Train loss: {:.4f}. Evaluation accuracy: {:.3f}. Evaluation loss: {:.4f}' \ .format(epoch + 1, epochs, train_accuracy, train_avg_loss, eval_accuracy, eval_avg_loss)) if early_stop.stop(eval_avg_loss, model, delta=0.003): break _, test_avg_loss, test_accuracy, test_conf_matrix = model.evaluate_model( test_iterator) print('Test accuracy: {:.3f}. Test error: {:.3f}'.format( test_accuracy, test_avg_loss))
#                      lr=params.lr,
#                      warmup=warmup_proportion,
#                      t_total=num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=params.lr, correct_bias=True)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(warmup_proportion * num_train_optimization_steps),
    num_training_steps=num_train_optimization_steps)
criterion = nn.CrossEntropyLoss(ignore_index=0)
binary_criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([3932 / 14263]).to(dev))

avg_train_losses = []
avg_valid_losses = []
# initialize the early_stopping object
early_stopping = EarlyStopping(patience=params.patience, verbose=True)

for epoch in range(1, params.n_epochs + 1):
    # print("For epoch {} cached is {}\n allocated is {}".format(epoch,
    #       torch.cuda.memory_cached(0), torch.cuda.memory_allocated(0)))
    print(f"=========eval at epoch={epoch}=========")
    if not os.path.exists('checkpoints'):
        os.makedirs('checkpoints')
    if not os.path.exists('results'):
        os.makedirs('results')
    fname = os.path.join('checkpoints', 'epoch_{}_'.format(epoch) + params.run)
    spath = os.path.join('checkpoints', 'epoch_{}_'.format(epoch) + params.run + ".pt")
    # print("For epoch {} cached is {}\n allocated is {}".format(epoch, torch.cuda.memory_cached(0), torch.cuda.memory_allocated(0)))
device = torch.device("cuda:0") unet = UnetResnet34().to(device) criterion = BinaryFocalLoss2d() optimizer = torch.optim.SGD(unet.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = ReduceLROnPlateau(optimizer, 'max', factor=args.rop_reduce_factor, patience=args.rop_patience, verbose=True) early_stopping = EarlyStopping(args.early_stopping_patience, mode='max') for epoch in range(args.epochs): unet.train() train_loss = [] for images, masks in tqdm(train_loader): optimizer.zero_grad() images, masks = images.to(device), masks.to(device) prediction = unet(images) predicted_mask = prediction.squeeze(1) masks = masks.squeeze(1) loss = criterion(predicted_mask, masks) train_loss.append(loss.item())
def run(fold, args): if args.sz: print(f"Images will be resized to {args.sz}") args.sz = int(args.sz) # get training and valid data df = pd.read_csv(args.training_folds_csv) if args.loss == 'crossentropy' and not args.isic2019: diag_to_ix = { v: i for i, v in enumerate(sorted(list(set(df.diagnosis)))) } ix_to_diag = {v: i for i, v in diag_to_ix.items()} if args.external_csv_path: df_external = pd.read_csv(args.external_csv_path) df_train = df.query(f"kfold != {fold}").reset_index(drop=True) df_valid = df.query(f"kfold == {fold}").reset_index(drop=True) print( f"Running for K-Fold {fold}; train_df: {df_train.shape}, valid_df: {df_valid.shape}" ) # calculate weights for NN loss weights = len(df) / df.target.value_counts().values class_weights = torch.FloatTensor(weights) if args.loss == 'weighted_bce': print(f"assigning weights {weights} to loss fn.") if args.loss == 'focal_loss': print("Focal loss will be used for training.") if args.loss == 'weighted_cross_entropy': print(f"assigning weights {weights} to loss fn.") # create model if 'efficient_net' in args.model_name: model = MODEL_DISPATCHER[args.model_name]( pretrained=args.pretrained, arch_name=args.arch_name, ce=(args.loss == 'crossentropy' or args.loss == 'weighted_cross_entropy' or args.load_pretrained_2019)) else: model = MODEL_DISPATCHER[args.model_name](pretrained=args.pretrained) if args.model_path is not None: print( f"Loading pretrained model and updating final layer from {args.model_path}" ) model.load_state_dict(torch.load(args.model_path)) nftrs = model.base_model._fc.in_features model.base_model._fc = nn.Linear(nftrs, 1) meta_array = None if args.use_metadata: # create meta array sex_dummy_train = pd.get_dummies(df_train['sex'])[['male', 'female']] site_dummy_train = pd.get_dummies( df_train['anatom_site_general_challenge'])[[ 'head/neck', 'lower extremity', 'oral/genital', 'palms/soles', 'torso', 'upper extremity' ]] assert max(df_train.age_approx) < 100 age_train = df_train.age_approx.fillna(-5) / 100 meta_array = pd.concat([sex_dummy_train, site_dummy_train, age_train], axis=1).values # modify model forward if args.freeze_cnn: model.load_state_dict(torch.load(args.model_path)) # update the forward pass model = modify_model(model, args) # freeze cnn if args.freeze_cnn: print("\nFreezing CNN layers!\n") for param in model.base_model.parameters(): param.requires_grad = False # add external meta to meta array if args.external_csv_path: sex_dummy_ext = pd.get_dummies( df_external['sex'])[['male', 'female']] df_external[ 'anatom_site_general'] = df_external.anatom_site_general.replace( { 'anterior torso': 'torso', 'lateral torso': 'torso', 'posterior torso': 'torso' }) site_dummy_ext = pd.get_dummies( df_external['anatom_site_general'])[[ 'head/neck', 'lower extremity', 'oral/genital', 'palms/soles', 'torso', 'upper extremity' ]] assert max(df_external.age_approx) < 100 age_ext = df_external.age_approx.fillna(-5) / 100 meta_array = np.concatenate([ meta_array, pd.concat([sex_dummy_ext, site_dummy_ext, age_ext], axis=1).values ]) assert meta_array.shape[1] == 9 model = model.to(args.device) train_aug = albumentations.Compose([ albumentations.RandomScale(0.07), albumentations.Rotate(50), albumentations.RandomBrightnessContrast(0.15, 0.1), albumentations.Flip(p=0.5), albumentations.IAAAffine(shear=0.1), albumentations.RandomCrop(args.sz, args.sz) if args.sz else albumentations.NoOp(), albumentations.OneOf([ albumentations.Cutout(random.randint(1, 8), 16, 16), albumentations.CoarseDropout(random.randint(1, 8), 16, 16) ]), 
albumentations.Normalize(always_apply=True) ]) valid_aug = albumentations.Compose([ albumentations.CenterCrop(args.sz, args.sz) if args.sz else albumentations.NoOp(), albumentations.Normalize(always_apply=True), ]) print(f"\nUsing train augmentations: {train_aug}\n") # get train and valid images & targets and add external data if required (external data only contains melonama data) train_images = df_train.image_name.tolist() if args.external_csv_path: external_images = df_external.image.tolist() if args.exclude_outliers_2019: # from EDA notebook external_images = np.load( f'/home/ubuntu/repos/kaggle/melonama/data/external/clean_external_2019_{args.sz}.npy' ).tolist() print( f"\n\n{len(external_images)} external images will be added to each training fold." ) train_images = train_images + external_images if args.use_pseudo_labels: test_df = pd.read_csv( '/home/ubuntu/repos/kaggle/melonama/data/test.csv') test_images = test_df.image_name.tolist() if args.pseudo_images_path: test_images = list( np.load(args.pseudo_images_path, allow_pickle=True)) print( f"\n\n{len(test_images)} test images will be added to each training fold." ) train_images = train_images + test_images train_image_paths = [ os.path.join(args.train_data_dir, image_name + '.jpg') for image_name in train_images ] train_targets = df_train.target if not args.external_csv_path else np.concatenate( [df_train.target.values, np.ones(len(external_images))]) if args.use_pseudo_labels: train_targets = np.concatenate([ train_targets, np.load(args.pseudo_labels_path, allow_pickle=True) ]) if args.loss == 'crossentropy': df_train['diagnosis'] = df_train.diagnosis.map(diag_to_ix) train_targets = df_train.diagnosis.values assert len(train_image_paths) == len( train_targets ), "Length of train images {} doesnt match length of targets {}".format( len(train_images), len(train_targets)) # same for valid dataframe valid_images = df_valid.image_name.tolist() valid_image_paths = [ os.path.join(args.train_data_dir, image_name + '.jpg') for image_name in valid_images ] valid_targets = df_valid.target if args.loss == 'crossentropy': df_valid['diagnosis'] = df_valid.diagnosis.map(diag_to_ix) valid_targets = df_valid.diagnosis.values print( f"\n\n Total Train images: {len(train_image_paths)}, Total val: {len(valid_image_paths)}\n\n" ) # create train and valid dataset, dont use color constancy as already preprocessed in directory train_dataset = MelonamaDataset(train_image_paths, train_targets, train_aug, cc=args.cc, meta_array=meta_array) valid_dataset = MelonamaDataset(valid_image_paths, valid_targets, valid_aug, cc=args.cc, meta_array=meta_array) # create dataloaders train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=4) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=args.valid_batch_size, shuffle=False, num_workers=4) # create optimizer and scheduler for training optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=[3, 5, 6, 7, 8, 9, 10, 11, 13, 15], gamma=0.5) es = EarlyStopping(patience=3, mode='min' if args.metric == 'valid_loss' else 'max') for epoch in range(args.epochs): train_loss = train_one_epoch( args, train_loader, model, optimizer, weights=None if not args.loss.startswith('weighted') else class_weights) preds, valid_loss = evaluate(args, valid_loader, model) predictions = np.vstack(preds).ravel() if args.loss == 'crossentropy' or args.loss == 
'weighted_cross_entropy': accuracy = metrics.accuracy_score(valid_targets, predictions) else: auc = metrics.roc_auc_score(valid_targets, predictions) preds_df = pd.DataFrame({ 'predictions': predictions, 'targets': valid_targets, 'valid_image_paths': valid_image_paths }) print( f"Epoch: {epoch}, Train loss: {train_loss}, Valid loss: {valid_loss}, Valid Score: {locals()[f'{args.metric}']}" ) scheduler.step() for param_group in optimizer.param_groups: print(f"Current Learning Rate: {param_group['lr']}") es(locals()[f"{args.metric}"], model, model_path= f"/home/ubuntu/repos/kaggle/melonama/models/{syd_now.strftime(r'%d%m%y')}/{args.arch_name}_fold_{fold}_{args.sz}_{locals()[f'{args.metric}']}.bin", preds_df=preds_df, df_path= f"/home/ubuntu/repos/kaggle/melonama/valid_preds/{syd_now.strftime(r'%d%m%y')}/{args.arch_name}_fold_{fold}_{args.sz}_{locals()[f'{args.metric}']}.bin", args=args) if es.early_stop: return preds_df
def train(fold): training_data_path = "/home/dragoshh1984/repos/kaggle/datasets/melanomia_classification/512x512-dataset-melanoma/512x512-dataset-melanoma" model_path = "/home/dragoshh1984/repos/kaggle/melanomia-classification" df = pd.read_csv( "/home/dragoshh1984/repos/kaggle/datasets/melanomia_classification/new_train.csv" ) # defines device = "cuda" epochs = 20 train_bs = 16 valid_bs = 16 # for this model mean = (0.485, 0.456, 0.406) std = (0.229, 0.224, 0.225) # data for training df_train = df[df.fold != fold].reset_index(drop=True) df_valid = df[df.fold == fold].reset_index(drop=True) # augmentations train_aug = albumentations.Compose([ albumentations.RandomResizedCrop(224, 224, (0.7, 1.0)), albumentations.HorizontalFlip(), albumentations.VerticalFlip(), albumentations.Cutout(), albumentations.RandomBrightness(), albumentations.RandomContrast(), albumentations.Rotate(), albumentations.RandomScale(), albumentations.PadIfNeeded(300, 300), albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True), ]) valid_aug = albumentations.Compose([ albumentations.RandomResizedCrop(224, 224, (0.7, 1.0)), albumentations.HorizontalFlip(), albumentations.VerticalFlip(), albumentations.Cutout(), albumentations.RandomBrightness(), albumentations.RandomContrast(), albumentations.Rotate(), albumentations.RandomScale(), albumentations.PadIfNeeded(300, 300), albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True), ]) train_images = df_train.image_id.values.tolist() train_images = [ os.path.join(training_data_path, i + ".jpg") for i in train_images ] train_metada = df_train.drop([ "fold", "target", "image_id", "patient_id", "source", "stratify_group" ], axis=1).values.tolist() train_targets = df_train.target.values valid_images = df_valid.image_id.values.tolist() valid_images = [ os.path.join(training_data_path, i + ".jpg") for i in valid_images ] valid_metadata = df_valid.drop([ "fold", "target", "image_id", "patient_id", "source", "stratify_group" ], axis=1).values.tolist() valid_targets = df_valid.target.values # datasets training_dataset = ClassificationLoader(image_paths=train_images, metadata=train_metada, targets=train_targets, resize=None, augmentations=train_aug) # loaders train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=train_bs, shuffle=True, num_workers=4) valid_dataset = ClassificationLoader(image_paths=valid_images, metadata=valid_metadata, targets=valid_targets, resize=None, augmentations=valid_aug) valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=valid_bs, shuffle=False, num_workers=4) model = EfficientNet_tabular(pretrained="imagenet") model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) # max for auc metric scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, mode="max") # early stopping es = EarlyStopping(patience=3, mode="max") # import pdb; pdb.set_trace() for epoch in range(epochs): training_loss = Engine.train(train_loader, model, optimizer, device) predictions, valid_loss = Engine.evaluate(valid_loader, model, device) # import pdb; pdb.set_trace() predictions = np.vstack((predictions)).ravel() auc = metrics.roc_auc_score(valid_targets, predictions) scheduler.step(auc) print(f"epoch={epoch}, auc={auc}") es(auc, model, os.path.join(model_path, f"model{fold}.bin")) if es.early_stop: print("early stopping") break
def train(opt):
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))
    # set etc
    torch.autograd.set_detect_anomaly(True)
    # set config
    config = load_config(opt)
    config['opt'] = opt
    logger.info("%s", config)
    # set path
    set_path(config)
    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config)
    with temp_seed(opt.seed):
        # prepare model
        model = prepare_model(config)
        # create optimizer, scheduler, summary writer, scaler
        optimizer, scheduler, writer, scaler = prepare_osws(config, model, train_loader)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['writer'] = writer
        config['scaler'] = scaler
        # training
        early_stopping = EarlyStopping(logger, patience=opt.patience, measure='f1', verbose=1)
        local_worse_steps = 0
        prev_eval_f1 = -float('inf')
        best_eval_f1 = -float('inf')
        for epoch_i in range(opt.epoch):
            epoch_st_time = time.time()
            eval_loss, eval_f1 = train_epoch(model, config, train_loader, valid_loader, epoch_i)
            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'):
                break
            if eval_f1 > best_eval_f1:
                best_eval_f1 = eval_f1
                if opt.save_path:
                    logger.info("[Best model saved] : {:10.6f}".format(best_eval_f1))
                    save_model(config, model)
                    # save finetuned bert model/config/tokenizer
                    if config['emb_class'] in ['bert', 'distilbert', 'albert', 'roberta', 'bart', 'electra']:
                        if not os.path.exists(opt.bert_output_dir):
                            os.makedirs(opt.bert_output_dir)
                        model.bert_tokenizer.save_pretrained(opt.bert_output_dir)
                        model.bert_model.save_pretrained(opt.bert_output_dir)
                early_stopping.reset(best_eval_f1)
            early_stopping.status()
            # begin: scheduling, apply rate decay when the measure (ex, loss) keeps getting worse for the number of decay epoch steps.
            if prev_eval_f1 >= eval_f1:
                local_worse_steps += 1
            else:
                local_worse_steps = 0
            logger.info('Scheduler: local_worse_steps / opt.lr_decay_steps = %d / %d' %
                        (local_worse_steps, opt.lr_decay_steps))
            if not opt.use_transformers_optimizer and \
               epoch_i > opt.warmup_epoch and \
               (local_worse_steps >= opt.lr_decay_steps or early_stopping.step() > opt.lr_decay_steps):
                scheduler.step()
                local_worse_steps = 0
            prev_eval_f1 = eval_f1
def train(self): train_sampler = RandomSampler(self.train_dataset) train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size) writer = SummaryWriter(log_dir = self.args.model_dir) if self.args.max_steps > 0: t_total = self.args.max_steps self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': self.args.weight_decay}, {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(self.train_dataset)) logger.info(" Num Epochs = %d", self.args.num_train_epochs) logger.info(" Total train batch size = %d", self.args.train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) logger.info(" Logging steps = %d", self.args.logging_steps) logger.info(" Save steps = %d", self.args.save_steps) global_step = 0 tr_loss = 0.0 self.model.zero_grad() train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch") early_stopping = EarlyStopping(patience = self.args.early_stopping, verbose = True) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", position=0, leave=True) print("\nEpoch", _) for step, batch in enumerate(epoch_iterator): self.model.train() batch = tuple(t.to(self.device) for t in batch) # GPU or CPU inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'intent_label_ids': batch[3], 'slot_labels_ids': batch[4]} if self.args.model_type != 'distilbert': inputs['token_type_ids'] = batch[2] outputs = self.model(**inputs) loss = outputs[0] if self.args.gradient_accumulation_steps > 1: loss = loss / self.args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % self.args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule self.model.zero_grad() global_step += 1 if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0: print('\nTuning metrics:', self.args.tuning_metric) results = self.evaluate("dev") writer.add_scalar("Loss/validation", results['loss'], _) writer.add_scalar("Intent Accuracy/validation", results['intent_acc'], _) writer.add_scalar("Slot F1/validation", results['slot_f1'], _) writer.add_scalar("Mean Intent Slot", results['mean_intent_slot'], _) writer.add_scalar("Sentence Accuracy/validation", results['semantic_frame_acc'], _) early_stopping(results[self.args.tuning_metric], self.model, self.args) if early_stopping.early_stop: print("Early stopping") break # if self.args.save_steps > 0 and global_step % self.args.save_steps == 0: # self.save_model() if 0 < self.args.max_steps < global_step: epoch_iterator.close() break if 0 < 
self.args.max_steps < global_step or early_stopping.early_stop: train_iterator.close() break writer.add_scalar("Loss/train", tr_loss / global_step, _) return global_step, tr_loss / global_step
def train(): lr = 0.002 lambd = 0.001 MAX_EPOCHS = 35 bs = 32 fp = open('../../../../4.training_data_all_4_1/data.pkl', 'rb') data = pickle.load(fp) X_train_pos = data['X_train_pos'] X_train_neg_all = data['X_train_neg'] X_valid_pos = data['X_valid_pos'] X_valid_neg_all = data['X_valid_neg'] x_pos_train = X_train_pos train_pos_num = len(x_pos_train) train_neg_num = multi * train_pos_num train_num = train_pos_num + train_neg_num Y_pos_train = np.ones((train_pos_num, 1), dtype=np.float32) Y_neg_train = np.zeros((train_neg_num, 1), dtype=np.float32) Y_train = np.concatenate((Y_pos_train, Y_neg_train)) x_pos_valid = X_valid_pos valid_pos_num = len(x_pos_valid) valid_neg_num = multi * valid_pos_num valid_num = valid_pos_num + valid_neg_num Y_pos_valid = np.ones((valid_pos_num, 1), dtype=np.float32) Y_neg_valid = np.zeros((valid_neg_num, 1), dtype=np.float32) Y_valid = np.concatenate((Y_pos_valid, Y_neg_valid)) print('train_sequence:', train_pos_num + train_neg_num) print('valid_sequence:', valid_neg_num + valid_pos_num) for m in range(10): print('model {}'.format(m)) savedir = './model_file/model_' + str(m) if not os.path.exists(savedir): os.makedirs(savedir) np.random.shuffle(X_train_neg_all) np.random.shuffle(X_valid_neg_all) x_neg_valid = X_valid_neg_all[:valid_neg_num] x_valid = np.vstack((x_pos_valid, x_neg_valid)) net = Net() label_loss = nn.BCELoss(reduction='none') if USE_CUDA: net = net.cuda() optimizer = torch.optim.Adam(net.parameters(), lr=lr) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8) earlystop = EarlyStopping(patience=10) valid_dataset = TensorDataset( torch.from_numpy(x_valid).cuda(), torch.from_numpy(Y_valid).cuda()) valid_loader = DataLoader(dataset=valid_dataset, batch_size=bs, shuffle=True) for epochs in range(MAX_EPOCHS): np.random.shuffle(X_train_neg_all) x_neg_train = X_train_neg_all[:train_neg_num] x_train = np.vstack((x_pos_train, x_neg_train)) train_dataset = TensorDataset( torch.from_numpy(x_train).cuda(), torch.from_numpy(Y_train).cuda()) train_loader = DataLoader(dataset=train_dataset, batch_size=bs, shuffle=True) Loss = 0 net.train() for i, (x, y) in enumerate(train_loader): x = Variable(x) y = Variable(y) bss = x.size(0) output = net(x) #loss=torch.mean(label_loss(output,y)+lambd*pterm) loss = torch.mean(label_loss(output, y)) if USE_CUDA: Loss += loss.cpu().data.numpy() * bss else: Loss += loss.data.numpy() optimizer.zero_grad() loss.backward() optimizer.step() if i % 20 == 0 and i != 0: pass #print('\rEpoch {}, process {}, loss {}, loss1{}, loss2{}'.format(epochs,i*bs/x_train.shape[0],Loss/i,Loss1/i,Loss2/i),end='') Loss /= train_num prob = [] Y = [] validloss = 0 net.eval() for i, (x, y) in enumerate(valid_loader): Y.append(y.cpu().data.numpy()) x = Variable(x) y = Variable(y) bss = x.size(0) with torch.no_grad(): output = net(x) prob.append(output.cpu().data.numpy()) #loss=torch.mean(label_loss(output,y)+lambd*pterm) loss = torch.mean(label_loss(output, y)) validloss += loss.cpu().data.numpy() * bss validloss /= valid_num prob = np.concatenate(prob) Y = np.concatenate(Y) vfpr, vtpr, vthresholds = metrics.roc_curve(Y, prob, pos_label=1) vauc = metrics.auc(vfpr, vtpr) print('Epoch {}, trainloss {}, validloss {}, vauc: {}'.format( epochs, Loss, validloss, vauc)) #print(' vauc: {}'.format(vauc)) earlystop(validloss, net, savedir) if earlystop.early_stop: print('early_stopping at {}'.format(epochs)) break scheduler.step()
    conv3_filter_size=(2, 2),
    pool3_pool_size=(2, 2),
    dropout3_p=0.3,
    hidden4_num_units=1000,
    dropout4_p=0.5,
    hidden5_num_units=1000,
    output_num_units=30,
    output_nonlinearity=None,
    update_learning_rate=theano.shared(float32(0.03)),
    update_momentum=theano.shared(float32(0.9)),
    regression=True,
    batch_iterator_train=FlipBatchIterator(batch_size=128),
    on_epoch_finished=[
        AdjustVariable('update_learning_rate', start=0.03, stop=0.0001),
        AdjustVariable('update_momentum', start=0.9, stop=0.999),
        EarlyStopping(patience=200),
    ],
    max_epochs=10000,
    verbose=1)

X, y = load2d()  # load 2D data
net8.fit(X, y)

# Training for 10000 epochs will take a while.  We'll pickle the
# trained model so that we can load it back later:
from pickle import dump
with open("net8_{0}.pickle".format(sys.argv[1]), 'wb') as f:
    dump(net8, f, -1)
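# In the nolearn/Lasagne configuration above, EarlyStopping is an on_epoch_finished callback
# that is invoked with (nn, train_history) after every epoch. The sketch below is an assumed
# implementation of such a callback for illustration only (the class actually used by net8 is
# not shown here); get_all_params_values/load_params_from are methods of nolearn's NeuralNet,
# and raising StopIteration is how a callback asks nolearn to stop training.
import numpy as np


class EarlyStopping(object):
    def __init__(self, patience=100):
        self.patience = patience
        self.best_valid = np.inf
        self.best_valid_epoch = 0
        self.best_weights = None

    def __call__(self, nn, train_history):
        # train_history is a list of per-epoch dicts maintained by nolearn.
        current_valid = train_history[-1]['valid_loss']
        current_epoch = train_history[-1]['epoch']
        if current_valid < self.best_valid:
            # New best validation loss: remember the epoch and the current weights.
            self.best_valid = current_valid
            self.best_valid_epoch = current_epoch
            self.best_weights = nn.get_all_params_values()
        elif self.best_valid_epoch + self.patience < current_epoch:
            # No improvement for `patience` epochs: restore best weights and stop.
            print("Early stopping.")
            print("Best valid loss was {:.6f} at epoch {}.".format(self.best_valid, self.best_valid_epoch))
            nn.load_params_from(self.best_weights)
            raise StopIteration()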
def train(self, train_param, verbose=False): # unpack model = train_param['model'] data = train_param['data'] loss_param = train_param['loss'] loss_param['train_loss'] = train_param['train_loss'] loss_param['test_loss'] = train_param['test_loss'] # write loss file loss_file = open(self.path_log + 'loss_' + self.current_hash + '.txt', "a") loss_file.write( '\n==========================================================================\n' ) # get training data loader train_batch_size = math.ceil( data.y_train_tensor.size(0) / train_param['num_batch']) data_train_loader = DataLoader(list( zip(data.X_train_tensor, data.y_train_tensor, data.X_train_origin)), batch_size=train_batch_size, shuffle=True) # get test data loader val_batch_size = math.ceil( data.y_val_tensor.size(0) / train_param['num_batch']) data_val_loader = DataLoader(list( zip(data.X_val_tensor, data.y_val_tensor, data.X_val_origin)), batch_size=val_batch_size, shuffle=True) # get test data loader test_batch_size = math.ceil( data.y_test_tensor.size(0) / train_param['num_batch']) data_test_loader = DataLoader(list( zip(data.X_test_tensor, data.y_test_tensor, data.X_test_origin)), batch_size=test_batch_size, shuffle=True) # Optimizer optimizer = train_param['optimizer']( model.parameters(), weight_decay=train_param['L2_reg']) # cyclical scheduler if train_param['cyclical']: scheduler = torch.optim.lr_scheduler.CyclicLR( optimizer, **train_param['cyclical']) # MSE Loss criterion = torch.nn.MSELoss() # path to save model path_to_model = self.path_model + '%s.pt' % self.current_hash # for early stopping if train_param['early_stopping']: train_param['early_stopping']['saved_model'] = path_to_model early_stopping = EarlyStopping(**train_param['early_stopping']) # For Recording Losses NUMEPOCHS = train_param['epochs'] train_losses = np.zeros(NUMEPOCHS) val_losses = np.zeros(NUMEPOCHS) test_losses = np.zeros(NUMEPOCHS) train_phy_losses = np.zeros(NUMEPOCHS) val_phy_losses = np.zeros(NUMEPOCHS) test_phy_losses = np.zeros(NUMEPOCHS) train_norm_phy_losses = np.zeros(NUMEPOCHS) val_norm_phy_losses = np.zeros(NUMEPOCHS) test_norm_phy_losses = np.zeros(NUMEPOCHS) train_e_losses = np.zeros(NUMEPOCHS) val_e_losses = np.zeros(NUMEPOCHS) test_e_losses = np.zeros(NUMEPOCHS) train_all_losses = np.zeros(NUMEPOCHS) val_all_losses = np.zeros(NUMEPOCHS) test_all_losses = np.zeros(NUMEPOCHS) lambda_s_train = np.zeros(NUMEPOCHS) lambda_e_train = np.zeros(NUMEPOCHS) lambda_s_test = np.zeros(NUMEPOCHS) lambda_e_test = np.zeros(NUMEPOCHS) # write log file task_timestamp = self.str_now() statistics = \ """ ========================================================================== Action: training model. 
Time: %s Task Id: %s Number of Epochs: %d Train Batch Size: %d Test Batch Size: %d Optimizer: %s Training Loss: %s Test Loss: %s -------------------------------------------------------------------------- """ % \ ( task_timestamp, self.current_hash, NUMEPOCHS, train_batch_size, test_batch_size, optimizer, train_param['train_loss'], train_param['test_loss'], ) if verbose: print(statistics) # write log file log_file = open(self.path_log + 'log_' + self.current_hash + '.txt', "a") log_file.write(statistics) log_file.close() loss_file.write('Time: %s\n' % task_timestamp) loss_file.write('Task Id: %s\n' % self.current_hash) # training loss_file.write( 'Epoch \t Training \t Test \t\t Loss-Phy \t Loss-E \t Anealing Factor\n' ) e_coff = loss_param['lambda_e0'] s_coff = loss_param['lambda_s'] if loss_param['noise']: noise_param = loss_param['noise'] mode = noise_param['mode'] mean = noise_param['mean'] var = noise_param['var'] noise_decay = noise_param['decay'] if loss_param['cyclical']: cyclic_param = loss_param['cyclical'] cyclic_mode = cyclic_param['mode'] cyclic_mean = cyclic_param['mean'] amp = cyclic_param['amp'] period = cyclic_param['period'] cyclic_decay = cyclic_param['decay'] # the progress bar if self.master_bar is not None: child_bar = progress_bar(range(NUMEPOCHS), parent=self.master_bar) self.master_bar.names = ['train', 'val', 'test'] else: child_bar = range(NUMEPOCHS) # record when training started train_time = 0 if 'break_loop_early' in train_param: if train_param['break_loop_early'] == False: stopped_epoch = NUMEPOCHS - 1 # save initial state of the model torch.save(model.state_dict(), self.path_state + 'state_0.pt') for epoch in child_bar: model.train() start_time = time.time() # start recording time if train_param['train_loss'] != []: for batchX, batchY, batchH in data_train_loader: # Forward pass outputs = model(batchX) # add noise if loss_param['noise']: s_coff = loss_param['lambda_s'] noise = self.generate_noise(mode=mode, mean=mean, var=var) s_coff += noise s_coff = max(0, s_coff) # add noise if loss_param['cyclical']: s_coff = loss_param['lambda_s'] cyclic = self.cyclical(mode=cyclic_mode, epoch=epoch, mean=cyclic_mean, amp=amp, period=period) s_coff += cyclic s_coff = max(0, s_coff) lambda_s_train[epoch] = s_coff lambda_e_train[epoch] = e_coff # calculate gradients and save (loss_all, loss_mse, loss_phy, loss_phy_norm, loss_e, loss_se) = self.loss_for_grad(data, train_param['train_loss'], outputs=outputs, e_coff=e_coff, s_coff=s_coff, batchX=batchX, batchY=batchY, batchH=batchH, norm=loss_param['norm_wf']) grad_all = self.calc_gradient( loss=loss_all, model=model, save_name='train_all_%d.pkl' % (epoch + 1)) grad_mse = self.calc_gradient( loss=loss_mse, model=model, save_name='train_mse_%d.pkl' % (epoch + 1)) grad_phy = self.calc_gradient(loss=loss_phy, model=model, save_name='train_s_%d.pkl' % (epoch + 1)) grad_phy_norm = self.calc_gradient( loss=loss_phy_norm, model=model, save_name='train_train_s_norm_%d.pkl' % (epoch + 1)) grad_e = self.calc_gradient(loss=loss_e, model=model, save_name='train_e_%d.pkl' % (epoch + 1)) grad_se = self.calc_gradient(loss=loss_se, model=model, save_name='train_se_%d.pkl' % (epoch + 1)) # Backward and optimize optimizer.zero_grad() loss = self.loss_func(data, train_param['train_loss'], outputs=outputs, e_coff=e_coff, s_coff=s_coff, batchX=batchX, batchY=batchY, batchH=batchH, norm=loss_param['norm_wf'])[0] loss.backward() if train_param['cyclical']: scheduler.step() else: optimizer.step() if train_param['test_loss'] != []: for batchX, batchY, 
batchH in data_test_loader: # Forward pass outputs = model(batchX) # add noise if loss_param['noise']: s_coff = loss_param['lambda_s'] noise = self.generate_noise(mode=mode, mean=mean, var=var) s_coff += noise s_coff = max(0, s_coff) # add noise if loss_param['cyclical']: s_coff = loss_param['lambda_s'] cyclic = self.cyclical(mode=cyclic_mode, epoch=epoch, mean=cyclic_mean, amp=amp, period=period) s_coff += cyclic s_coff = max(0, s_coff) lambda_s_test[epoch] = s_coff lambda_e_test[epoch] = e_coff # calculate gradients and save (loss_all, loss_mse, loss_phy, loss_phy_norm, loss_e, loss_se) = self.loss_for_grad( data, train_param['test_loss'], outputs=outputs, e_coff=e_coff, s_coff=s_coff, batchX=batchX, batchY=None, batchH=batchH, norm=loss_param['norm_wf'], ) grad_all = self.calc_gradient(loss=loss_all, model=model, save_name='test_all_%d.pkl' % (epoch + 1)) grad_mse = self.calc_gradient(loss=loss_mse, model=model, save_name='test_mse_%d.pkl' % (epoch + 1)) grad_phy = self.calc_gradient(loss=loss_phy, model=model, save_name='test_s_%d.pkl' % (epoch + 1)) grad_phy_norm = self.calc_gradient( loss=loss_phy_norm, model=model, save_name='test_s_norm_%d.pkl' % (epoch + 1)) grad_e = self.calc_gradient(loss=loss_e, model=model, save_name='test_e_%d.pkl' % (epoch + 1)) grad_se = self.calc_gradient(loss=loss_se, model=model, save_name='test_se_%d.pkl' % (epoch + 1)) loss = self.loss_func( data, train_param['test_loss'], outputs=outputs, e_coff=e_coff, s_coff=s_coff, batchX=batchX, batchY=None, batchH=batchH, norm=loss_param['norm_wf'], )[0] # Backward and optimize optimizer.zero_grad() loss.backward() if train_param['cyclical']: scheduler.step() else: optimizer.step() end_time = time.time() # end recording time train_time += end_time - start_time # accumulate training time # record the loss history model.eval() # save initial state of the model torch.save(model.state_dict(), self.path_state + 'state_%d.pt' % (epoch + 1)) train_losses[epoch] = criterion(model(data.X_train_tensor), data.y_train_tensor).item() val_losses[epoch] = criterion(model(data.X_val_tensor), data.y_val_tensor).item() test_losses[epoch] = criterion(model(data.X_test_tensor), data.y_test_tensor).item() s_coff = loss_param['lambda_s'] # train losses (loss, train_phy_losses[epoch], train_norm_phy_losses[epoch], train_e_losses[epoch]) = self.loss_func( data, train_param['train_loss'], outputs=model(data.X_train_tensor), e_coff=e_coff, s_coff=s_coff, batchX=data.X_train_tensor, batchY=data.y_train_tensor, batchH=data.X_train_origin, norm=loss_param['norm_wf']) train_all_losses[epoch] = float(loss) # val losses (loss, val_phy_losses[epoch], val_norm_phy_losses[epoch], val_e_losses[epoch]) = self.loss_func(data, train_param['test_loss'], outputs=model( data.X_val_tensor), e_coff=e_coff, s_coff=s_coff, batchX=data.X_val_tensor, batchY=data.y_val_tensor, batchH=data.X_val_origin, norm=loss_param['norm_wf']) val_all_losses[epoch] = float(loss) # test losses (loss, test_phy_losses[epoch], test_norm_phy_losses[epoch], test_e_losses[epoch]) = self.loss_func(data, train_param['test_loss'], outputs=model( data.X_test_tensor), e_coff=e_coff, s_coff=s_coff, batchX=data.X_test_tensor, batchY=data.y_test_tensor, batchH=data.X_test_origin, norm=loss_param['norm_wf']) test_all_losses = float(loss) if epoch % loss_param['anneal_interval'] == 0: e_coff *= loss_param['anneal_factor'] if loss_param['noise']: var *= noise_decay if loss_param['cyclical']: amp *= cyclic_decay if epoch % train_param['print_interval'] == 0: loss_file.write( '%d \t %.8f \t 
%.8f \t %.8f \t %.8f \t %.8f\n' % (epoch, train_losses[epoch], test_losses[epoch], val_phy_losses[epoch], val_e_losses[epoch], e_coff)) # plot loss curve if epoch % 1 == 0 and self.master_bar is not None and self.plot_flag: y_upper_bound = max(train_losses.max(), val_losses[0].max(), test_losses[0].max()) x_axis = np.arange(epoch + 1) + 1 graphs = [[x_axis, train_losses[:epoch + 1]], [x_axis, val_losses[:epoch + 1]], [x_axis, test_losses[:epoch + 1]]] x_bounds = [0, NUMEPOCHS] y_bounds = [0.0, y_upper_bound] self.master_bar.update_graph(graphs, x_bounds, y_bounds) # early stopping if train_param['early_stopping']: early_stopping(val_losses[epoch], model) if early_stopping.early_stop: if 'break_loop_early' in train_param: if train_param['break_loop_early'] == True: break else: stopped_epoch = min(epoch, stopped_epoch) else: break # record when training stopped and calculate time time_per_epoch = train_time / epoch if 'break_loop_early' in train_param: if train_param['break_loop_early'] == False: epoch = stopped_epoch # print loss in log files if verbose and self.master_bar is not None: self.master_bar.write('Training stopped at %d/%d.' % (epoch, NUMEPOCHS)) loss_file.write('Training stopped at %d/%d.' % (epoch, NUMEPOCHS)) loss_file.write('Training time: %f seconds.' % train_time) loss_file.write('\nTraining Complete') loss_file.write( '\n--------------------------------------------------------------------------\n' ) loss_file.close() # data frame for losses df_loss = pd.DataFrame({ 'train_mse': train_losses, 'val_mse': train_losses, 'test_mse': test_losses, 'train_phy': train_phy_losses, 'val_phy': val_phy_losses, 'test_phy': test_phy_losses, 'train_norm_phy': train_norm_phy_losses, 'train_e': train_e_losses, 'val_e': val_e_losses, 'test_e': test_e_losses, 'val_norm_phy': val_norm_phy_losses, 'test_norm_phy': test_norm_phy_losses, 'train_all': train_all_losses, 'val_all': val_all_losses, 'test_all': test_all_losses, 'lambda_s_train': lambda_s_train, 'lambda_s_test': lambda_s_test, 'lambda_e_train': lambda_e_train, 'lambda_e_test': lambda_e_test }) df_loss.to_csv(self.path_out + "losses_" + self.current_hash + ".csv", index=False) # training statistics to return train_stats = { 'epoch': epoch, 'train_time': train_time, 'time_per_epoch': time_per_epoch } # save or load model if train_param['early_stopping']: model.load_state_dict(torch.load(path_to_model)) else: torch.save(model.state_dict(), path_to_model) return model, train_stats
}] optimizer = BertAdam(optimizer_grouped_parameters, lr=hp.lr, warmup=warmup_proportion, t_total=num_train_optimization_steps) criterion = nn.CrossEntropyLoss(ignore_index=0) binary_criterion = nn.BCEWithLogitsLoss( pos_weight=torch.Tensor([3932 / 14263])) avg_train_losses = [] avg_valid_losses = [] # initialize the early_stopping object early_stopping = EarlyStopping(patience=hp.patience, verbose=True) for epoch in range(1, hp.n_epochs + 1): print(f"=========eval at epoch={epoch}=========") if not os.path.exists('checkpoints'): os.makedirs('checkpoints') if not os.path.exists('results'): os.makedirs('results') fname = os.path.join('checkpoints', timestr) spath = os.path.join('checkpoints', timestr + ".pt") train_loss = train(model, train_iter, optimizer, criterion, binary_criterion) avg_train_losses.append(train_loss.item()) precision, recall, f1, valid_loss = eval(model, eval_iter, fname, criterion, binary_criterion) avg_valid_losses.append(valid_loss.item())
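These snippets all depend on an EarlyStopping object that is not defined here, and they use it with slightly different conventions (early_stopping(metric, model) above, es.step(metric) in other excerpts, sometimes monitoring loss and sometimes accuracy or F1). A minimal sketch of the loss-monitoring, checkpointing variant assumed by this snippet; the attribute names and the checkpoint path are assumptions:

import numpy as np
import torch

class EarlyStopping:
    """Stop training when the monitored validation loss has not improved for `patience` epochs."""

    def __init__(self, patience=7, verbose=False, delta=0.0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.path = path          # where the best weights are checkpointed (assumed)
        self.counter = 0
        self.best_loss = np.inf
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.delta:
            # Improvement: reset the counter and checkpoint the model.
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.path)
            if self.verbose:
                print('Validation loss improved to %.6f; checkpoint saved.' % val_loss)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True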
def fit(path, epochs=30): """ Args: epochs: Number of training epochs. The BERT authors recommend between 2 and 4; the default of 30 used here is much larger and may over-fit the training data. """ train_dataloader, validation_dataloader = dataloader(path) model, optimizer = get_model() early_stopping = EarlyStopping() # Tell pytorch where to run this model (the GPU call is commented out; this runs on the CPU). # model.cuda() model.cpu() # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") device = torch.device("cpu") seed_val = 42 random.seed(seed_val) np.random.seed(seed_val) torch.manual_seed(seed_val) torch.cuda.manual_seed_all(seed_val) # Measure the total training time for the whole run. total_t0 = time.time() # For each epoch... for epoch_i in range(0, epochs): # ======================================== # Training # ======================================== # Perform one full pass over the training set. print("") print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) print('Training...') # Measure how long the training epoch takes. t0 = time.time() # Reset the total loss for this epoch. total_train_loss = 0 # Put the model into training mode. Don't be misled--the call to # `train` just changes the *mode*, it doesn't *perform* the training. # `dropout` and `batchnorm` layers behave differently during training # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch) model.train() # For each batch of training data... for step, batch in enumerate(tqdm(train_dataloader)): # Progress update every 40 batches. if step % 40 == 0 and not step == 0: # Calculate elapsed time in minutes. elapsed = format_time(time.time() - t0) # Report progress. print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format( step, len(train_dataloader), elapsed)) # Unpack this training batch from our dataloader. # # As we unpack the batch, we'll also copy each tensor to the GPU using the # `to` method. # # `batch` contains three pytorch tensors: # [0]: input ids # [1]: attention masks # [2]: labels b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) # Always clear any previously calculated gradients before performing a # backward pass. PyTorch doesn't do this automatically because # accumulating the gradients is "convenient while training RNNs". # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) model.zero_grad() # Perform a forward pass (evaluate the model on this training batch). # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification # It returns different numbers of parameters depending on what arguments # are given and what flags are set. For our usage here, it returns # the loss (because we provided labels) and the "logits"--the model # outputs prior to activation. loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) # Accumulate the training loss over all of the batches so that we can # calculate the average loss at the end. `loss` is a Tensor containing a # single value; the `.item()` function just returns the Python value # from the tensor. total_train_loss += loss.item() # Perform a backward pass to calculate the gradients. loss.backward() # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. 
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Update parameters and take a step using the computed gradient. # The optimizer dictates the "update rule"--how the parameters are # modified based on their gradients, the learning rate, etc. optimizer.step() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) # Measure how long this epoch took. training_time = format_time(time.time() - t0) print("") print(" Average training loss: {0:.2f}".format(avg_train_loss)) print(" Training epoch took: {:}".format(training_time)) # ======================================== # Validation # ======================================== # After the completion of each training epoch, measure our performance on # our validation set. print("") print("Running Validation...") t0 = time.time() # Put the model in evaluation mode--the dropout layers behave differently # during evaluation. model.eval() # Tracking variables total_eval_accuracy = 0 total_eval_loss = 0 # Evaluate data for one epoch for batch in validation_dataloader: # Unpack this validation batch from our dataloader. # # As we unpack the batch, we'll also copy each tensor to the GPU using # the `to` method. # # `batch` contains three pytorch tensors: # [0]: input ids # [1]: attention masks # [2]: labels b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) # Tell pytorch not to bother with constructing the compute graph during # the forward pass, since this is only needed for backprop (training). with torch.no_grad(): # Forward pass, calculate logit predictions. # token_type_ids is the same as the "segment ids", which # differentiates sentence 1 and 2 in 2-sentence tasks. # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification # Get the "logits" output by the model. The "logits" are the output # values prior to applying an activation function like the softmax. (loss, logits) = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) # Accumulate the validation loss. total_eval_loss += loss.item() # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # Calculate the jaccard for this batch of validation sentences, and # accumulate it over all batches. total_eval_accuracy += jaccard(logits, label_ids) # Report the final accuracy for this validation run. avg_val_accuracy = total_eval_accuracy / len(validation_dataloader) print(" Accuracy: {0:.2f}".format(avg_val_accuracy)) # Calculate the average loss over all of the batches. avg_val_loss = total_eval_loss / len(validation_dataloader) # Measure how long the validation run took. validation_time = format_time(time.time() - t0) print(" Validation Loss: {0:.2f}".format(avg_val_loss)) print(" Validation took: {:}".format(validation_time)) # Add early stopping early_stopping(avg_val_accuracy, model) if early_stopping.early_stop: print("Early stopping") break print("") print("Training complete!") print("Total training took {:} (h:mm:ss)".format( format_time(time.time() - total_t0)))
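This fit function relies on two helpers that are not shown: format_time for pretty-printing elapsed seconds and jaccard for the per-batch validation metric. Plausible minimal versions are sketched below; the exact metric the original code computes is an assumption (here, a micro-averaged Jaccard score over argmax predictions):

import datetime
import numpy as np
from sklearn.metrics import jaccard_score

def format_time(elapsed):
    """Format a span of seconds as hh:mm:ss."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))

def jaccard(logits, label_ids):
    """Hypothetical per-batch metric: micro-averaged Jaccard between argmax predictions and labels."""
    preds = np.argmax(logits, axis=-1).flatten()
    labels = label_ids.flatten()
    return jaccard_score(labels, preds, average='micro')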
# gpu_used = int(get_free_gpu()) model = torchfcn.models.AutoEncoderConv3().cuda() model.apply(weight_init) summary(model, input_size=(1, 256, 256)) model = nn.DataParallel(model) if args.loss == "MSE": criterion = nn.MSELoss() if args.optimiser == "Adam": optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) else: optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr) early_stopping = EarlyStopping(patience=4) def train(epoch): model.train() train_loss = 0 count = 0 for i, (_, img) in tqdm.tqdm(enumerate(train_loader), total=len(train_loader), desc='Train epoch=%d' % epoch, ncols=80, leave=False): img = img.float() img = img[:, np.newaxis, :, :] img = Variable(img.cuda())
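model.apply(weight_init) above walks every submodule and passes it to a weight_init function that is not included in this excerpt. A hypothetical sketch of such an initializer, assuming Kaiming initialization for convolutions and Xavier for linear layers:

import torch.nn as nn

def weight_init(m):
    """Hypothetical initializer applied recursively via model.apply(weight_init)."""
    if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)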
print("Using", torch.cuda.device_count(), "NVIDIA 1080TI GPUs!") if GPU_SELECT == 1: device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") print("Using one (the second) NVIDIA 1080TI GPU!") if GPU_SELECT == 0: device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("Using one (the first) NVIDIA 1080TI GPU!") # In[2]: from early_stopping import EarlyStopping from dataset3 import dataset early_stopping = EarlyStopping( patience=patience, verbose=True) # initialize the early_stopping object # Counter for the execution time start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) start.record() if OPTIMIZATION_PLUGIN == 'Bayesian': from bayes_opt import BayesianOptimization #def black_box_function(x, y): def objective(SCI_RELU, SCI_BIAS, SCI_loss_type, SCI_optimizer, SCI_LR, SCI_MM, SCI_REGULARIZATION, SCI_EPOCHS, SCI_BATCH_SIZE, SCI_DROPOUT, SCI_L_SECOND, SCI_BN_MOMENTUM, SCI_SGD_MOMENTUM, SCI_BN_EPS, SCI_BN_STATS, SCI_LAST_LAYER, SCI_ACT_LAYER):
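The body of objective is cut off above; once it returns a scalar score, bayes_opt would wrap it roughly as follows. This is a self-contained toy sketch of the library's API: the real objective takes all the SCI_* hyperparameters (each needing an entry in pbounds, with categorical choices encoded as floats and decoded inside the function), and the bounds shown here are illustrative only, not the values used in the original experiment.

from bayes_opt import BayesianOptimization

def toy_objective(SCI_LR, SCI_DROPOUT):
    """Toy stand-in for the real objective: returns a score to maximise."""
    return -((SCI_LR - 0.01) ** 2) - ((SCI_DROPOUT - 0.2) ** 2)

pbounds = {'SCI_LR': (1e-5, 1e-1), 'SCI_DROPOUT': (0.0, 0.5)}  # illustrative bounds only
bayes_opt_search = BayesianOptimization(f=toy_objective, pbounds=pbounds, random_state=1)
bayes_opt_search.maximize(init_points=5, n_iter=25)
print(bayes_opt_search.max)  # best score and the hyperparameters that produced it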
def train_pytorch(**kwargs): CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True) # Calling logging.basicConfig attaches a root logger to the process, so that log records from loggers in other modules show up in the console # (child loggers propagate to the root logger, which prints through its own StreamHandler). # Without logging.basicConfig, every child logger would need its own StreamHandler, which is tedious. logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO) formater = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') # Print logs to the terminal. # stream_handler = logging.StreamHandler() # stream_handler.setFormatter(formater) # # Save logs to file. log_path = CHECKPOINT_PATH / 'train.log' file_handler = logging.FileHandler(filename=log_path, mode='w', encoding='utf-8') file_handler.setFormatter(formater) # logger.addHandler(stream_handler) logger.addHandler(file_handler) inputs = kwargs['inputs'] outputs = kwargs['outputs'] # test_inputs = kwargs['test_inputs'] gkf = GroupKFold(n_splits=kwargs['n_splits']).split(X=df_train.q2, groups=df_train.id) # sss = StratifiedShuffleSplit(n_splits=kwargs['n_splits'], test_size=0.2, random_state=RANDOM_SEED).split(X=df_train.q2, # y=df_train.label) # skf = StratifiedKFold(n_splits=kwargs['n_splits'], shuffle=True, random_state=RANDOM_SEED).split(X=df_train.q2, y=outputs) # oof = np.zeros((len(df_train),1)) # all_pred = np.zeros(shape=(len(df_train), 2)) # classification task all_pred = np.zeros(shape=(len(df_train))) # regression task all_true = np.zeros(shape=(len(df_train))) for fold, (train_idx, valid_idx) in enumerate(gkf): # for fold, (train_idx, valid_idx) in enumerate(skf): logger.info(f'Fold No. {fold}') train_inputs = [inputs[i][train_idx] for i in range(len(inputs))] train_outputs = outputs[train_idx] train_qa_id = df_train[['id', 'id_sub', 'label']].iloc[train_idx] # =============================================================== # Data augmentation via back-translation (only positive samples are augmented) # Get the (id, id_sub) pairs of the training-fold samples # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()]) # # Keep the augmented samples whose (id, id_sub) appear in the training fold # mask = df_train_ex[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1) # df_train_fold = df_train_ex[mask] # Get the (id, id_sub) pairs of the training-fold samples # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()]) # # Keep the augmented samples whose (id, id_sub) appear in the training fold # mask = df_train_aug[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1) # df_train_fold = df_train_aug[mask] # train_inputs, train_inputs_overlap = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH) # train_outputs = compute_output_arrays(df_train_fold, output_categories) # df_train_fold = df_train.iloc[train_idx] # train_q_aug = [] # for x in tqdm(df_train_fold['q1']): # train_q_aug.append(eda_one(x)) # train_a_aug = [] # for x in tqdm(df_train_fold['q2']): # train_a_aug.append(eda_one(x)) # df_train_fold = pd.DataFrame(data={'q1': train_q_aug, 'q2': train_a_aug}) # train_inputs, train_inputs_overlap = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH) # train_outputs = compute_output_arrays(df_train_fold, output_categories) # Add the Anjuke data to the training set # train_inputs = [np.concatenate([train_inputs[i], anjuke_inputs[i]], axis=0) for i in range(len(inputs))] # train_outputs = np.concatenate([train_outputs, anjuke_outputs], axis=0) # ================================================================ valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))] valid_outputs = outputs[valid_idx] valid_qa_id = df_train[['id', 'id_sub', 
'label']].iloc[valid_idx] train_set = HouseDataset(train_inputs, train_outputs, train_qa_id) valid_set = HouseDataset(valid_inputs, valid_outputs, valid_qa_id) # test_set = HouseDataset(test_inputs, np.zeros_like(test_inputs[0])) # the test set has no labels logger.info('Train set size: {}, valid set size {}'.format( len(train_set), len(valid_set))) train_loader = DataLoader( train_set, batch_size=kwargs['batch_size'], # shuffle=True # set to True when training as classification ) valid_loader = DataLoader(valid_set, batch_size=kwargs['valid_batch_size']) # test_loader = DataLoader(test_set, # batch_size=512) device = torch.device(f"cuda:{kwargs['device']}") # model = BertForHouseQA().cuda(device) model = torch.nn.DataParallel(BertForHouseQA(), device_ids=[1, 2, 3]).cuda(device) # Find and load the checkpoint file with the highest score # best_score_ = max([float(x.name[len(MODEL_NAME)+1:-3]) for x in CHECKPOINT_PATH.iterdir() if x.is_file()]) # best_ckpt_path = CHECKPOINT_PATH/f'{MODEL_NAME}_{best_score_}.pt' # ckpt = torch.load(best_ckpt_path) # model.load_state_dict(ckpt['model_state_dict']) # Load a point-wise model and continue training pair-wise # or load the Anjuke model # ===================================================== # org_model = BertForHouseQA().cuda(device) # time_str = '2020-11-18-12:49:44' # org_ckpt_path = DATA_PATH / f"model_record/{MODEL_NAME}/{time_str}" # org_ckpt_path = DATA_PATH / f'anjuke/model_record/{MODEL_NAME}/{time_str}' # org_ckpt_paths = [x for x in org_ckpt_path.iterdir() if x.is_file() and x.suffix == '.pt'] # prefix = f'{MODEL_NAME}_' # best_ckpt_path = [x for x in org_ckpt_paths if str(x.name).startswith(prefix)][0] # ckpt = torch.load(best_ckpt_path) # org_model.load_state_dict(ckpt['model_state_dict']) # model = BertClsToReg(org_model).cuda(device) # model = BertClsToCls(org_model).cuda(device) # ===================================================== # List all modules inside the model. logger.info('Model modules:') for i, m in enumerate(model.named_children()): logger.info('{} -> {}'.format(i, m)) # # Get the number of total parameters. # total_params = sum(p.numel() for p in model.parameters()) # trainable_params = sum(p.numel() # for p in model.parameters() if p.requires_grad) # logger.info("Total params: {:,}".format(total_params)) # logger.info("Trainable params: {:,}".format(trainable_params)) # Use hinge loss criterion = torch.nn.MarginRankingLoss(margin=1.0) # criterion = torch.nn.MSELoss() # criterion = torch.nn.CrossEntropyLoss() # criterion_scl = SupConLoss(temperature=0.1, device=device) # optimizer = torch.optim.Adam( # model.parameters(), lr=kwargs['lr'], weight_decay=kwargs['weight_decay']) optimizer = transformers.AdamW(model.parameters(), lr=kwargs['lr'], weight_decay=kwargs['weight_decay']) logger.info('Optimizer:') logger.info(optimizer) # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, # mode='min', # patience=int(kwargs['patience']/2), # verbose=True # ) scheduler = transformers.get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=4, num_training_steps=kwargs['epoch']) # best_score = 0.0 stopper = EarlyStopping(patience=kwargs['patience'], mode='max') ckpt_path = None for epoch in range(kwargs['epoch']): pass # =======================Training=========================== # Set model to train mode. 
model.train() steps = int(np.ceil(len(train_set) / kwargs['batch_size'])) pbar = tqdm(desc='Epoch {}, loss {}'.format(epoch, 'NAN'), total=steps) for i, sample in enumerate(train_loader): x, y = sample[0].cuda(device).long(), sample[1].cuda( device).long() optimizer.zero_grad() feat, model_outputs = model(x) # [batch_size, 2] # CrossEntropy # loss = criterion(model_outputs, y) # MSE # loss = criterion(model_outputs, y.float().unsqueeze(-1)) # hinge loss train_qa_id_sub = sample[2].cpu().detach().numpy() loss = get_hinge_loss(model_outputs, train_qa_id_sub, criterion) # supervised contrastive loss (SCL) # feat = F.normalize(feat, dim=-1).unsqueeze(1) # scl = criterion_scl(feat, y) # scl_weight = 0.3 # loss = (1-scl_weight)*loss + scl_weight*scl # loss += scl loss.backward() optimizer.step() pbar.set_description('Epoch {}, train loss {:.4f}'.format( epoch, loss.item())) pbar.update() pbar.close() # ========================================================= # =======================Validation======================== # Set model to evaluation mode. model.eval() with torch.no_grad(): # Validation step valid_loss = [] valid_pred = [] valid_true = [] steps = int( np.ceil(len(valid_set) / kwargs['valid_batch_size'])) pbar = tqdm(desc='Validating', total=steps) for i, sample in enumerate(valid_loader): y_true_local = sample[1].numpy() x, y_true = sample[0].cuda(device).long(), sample[1].cuda( device).long() feat, model_outputs = model(x) # MSELoss # loss = criterion(model_outputs, y_true.float().unsqueeze(-1)).cpu().detach().item() # HingeLoss valid_qa_id_sub = sample[2].cpu().detach().numpy() loss = get_hinge_loss(model_outputs, valid_qa_id_sub, criterion).cpu().detach().item() y_pred = model_outputs.cpu().detach().squeeze(-1).numpy() # CrossEntropy # loss = criterion( # model_outputs, y_true).cpu().detach().item() # y_pred = F.softmax( # model_outputs.cpu().detach(), dim=1).numpy() valid_loss.append(loss) valid_pred.append(y_pred) valid_true.append(y_true_local) pbar.update() pbar.close() valid_loss = np.asarray(valid_loss).mean() valid_pred = np.concatenate(valid_pred, axis=0) valid_true = np.concatenate(valid_true, axis=0) # If using the regression model valid_f1, thr = search_f1(valid_true, valid_pred) logger.info("Epoch {}, valid loss {:.5f}, valid f1 {:.4f}".format( epoch, valid_loss, valid_f1)) # If using the classification model # valid_pred_label = np.argmax(valid_pred, axis=1) # valid_auc = roc_auc_score(valid_true, valid_pred_label) # valid_p, valid_r, valid_f1, _ = precision_recall_fscore_support( # valid_true, valid_pred_label, average='binary') # logger.info( # "Epoch {}, valid loss {:.5f}, valid P {:.4f}, valid R {:.4f}, valid f1 {:.4f}, valid auc {:.4f}".format( # epoch, valid_loss, valid_p, valid_r, valid_f1, valid_auc) # ) # logger.info('Confusion Matrix: ') # logger.info(confusion_matrix(y_true=valid_true, # y_pred=valid_pred_label, normalize='all')) # Step the LR scheduler (the active cosine schedule takes no metric; the commented-out ReduceLROnPlateau would take valid_f1). scheduler.step() stop_flag, best_flag = stopper.step(valid_f1) if best_flag: # delete the previously saved checkpoint if ckpt_path is not None: ckpt_path.unlink() ckpt_path = CHECKPOINT_PATH / \ f"{MODEL_NAME}_{fold}_{epoch}_{stopper.best_score}.pt" # save the current best model torch.save( { "model_name": "BertForHouseQA", "epoch": epoch, "valid_loss": valid_loss, "valid_f1": valid_f1, "model_state_dict": model.state_dict(), "train_idx": train_idx, "valid_idx": valid_idx, "fold": fold, # "optimizer_state_dict": optimizer.state_dict(), "thr": thr # 'scheduler_state_dict': scheduler.state_dict() }, f=ckpt_path, ) logger.info("A best score! 
Saved to checkpoints.") # Save this validation fold's predictions for the final F1 evaluation over the whole training set all_pred[valid_idx] = valid_pred all_true[valid_idx] = valid_true if stop_flag: logger.info("Stop training due to early stopping.") # stop training break # Save this validation fold's predictions for the final F1 evaluation over the whole training set # oof[valid_idx] = valid_pred # valid_f1, _ = search_f1(valid_outputs, valid_pred) # find the best classification threshold and F1 score # print('Valid f1 score = ', valid_f1) # ========================================================== # After all folds, evaluate on the whole training set # CrossEntropy # all_pred = np.argmax(all_pred, axis=1) # all_auc = roc_auc_score(all_true, all_pred) # all_p, all_r, all_f1, _ = precision_recall_fscore_support( # all_true, all_pred, average='binary') # logger.info( # "all P {:.4f}, all R {:.4f}, all f1 {:.4f}, all auc {:.4f}".format( # all_p, all_r, all_f1, all_auc) # ) # logger.info('Confusion Matrix: ') # logger.info(confusion_matrix(y_true=all_true, # y_pred=all_pred, normalize='all')) # MSELoss all_f1, all_thr = search_f1(all_true, all_pred) logger.info("All f1 {:.4f}, all thr {:.4f}".format(all_f1, all_thr)) return all_f1, CHECKPOINT_PATH
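Both the per-fold validation and the final evaluation call search_f1, and the training step calls get_hinge_loss; neither is defined in this excerpt. get_hinge_loss presumably pairs positive and negative answers sharing a question id and feeds their score difference to the MarginRankingLoss; search_f1 presumably scans thresholds over the regression outputs. A minimal sketch of the threshold search only, assuming a fixed grid and sklearn's f1_score:

import numpy as np
from sklearn.metrics import f1_score

def search_f1(y_true, y_score, grid=np.arange(0.05, 0.95, 0.01)):
    """Scan candidate thresholds over regression scores and return (best_f1, best_threshold)."""
    best_f1, best_thr = 0.0, 0.5
    for thr in grid:
        f1 = f1_score(y_true, (y_score >= thr).astype(int))
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return best_f1, best_thr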