def run(config_file):
    config = load_config(config_file)

    os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            task='cls'
        )
        for phase in ['train', 'valid']
    }

    # create model
    model = CustomNet(config.model.encoder, config.data.num_classes)

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.base_params(), 'lr': config.optimizer.params.encoder_lr},
        {'params': model.fresh_params(), 'lr': config.optimizer.params.decoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [MultiClassAccuracyCallback(threshold=0.5), F1ScoreCallback()]

    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir + '/checkpoints/best_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )

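# A minimal, hypothetical command-line entry point for run() above. The flag
# name and default config path are assumptions for illustration; they are not
# part of the original script.
if __name__ == '__main__':
    import argparse

    cli = argparse.ArgumentParser(description='train a classification model')
    cli.add_argument('--config', default='configs/cls/base.yml',
                     help='path to a YAML config file (assumed layout)')
    cli_args = cli.parse_args()
    run(cli_args.config)
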
def __init__(self, cfg):
    self.cfg = cfg

    train_dataset = getDataSet(cfg['data']['train'], 'train', cfg['data']['scale'])
    self.train_loader = DataLoader(
        train_dataset,
        cfg['data']['train']['batch_size'],
        shuffle=True,
        num_workers=cfg['data']['train']['n_workers'])

    val_dataset = getDataSet(cfg['data']['val'], 'val', cfg['data']['scale'])
    self.val_loader = DataLoader(
        val_dataset, 1,
        shuffle=False,
        num_workers=cfg['data']['val']['n_workers'])

    self.records = {'Epoch': [], 'PSNR': [], 'SSIM': []}
    self.log_dir = os.path.join(
        cfg['output_dir'], cfg['name'],
        time.strftime('%Y-%m-%d-%H:%M:%S', time.localtime(time.time())))
    self.logger = utils.Logger(os.path.join(self.log_dir, 'info.log'))
    self.max_epochs = cfg['schedule']['num_epochs']
    self.checkpoint_dir = os.path.join(self.log_dir, 'checkpoint')
    if not os.path.exists(self.checkpoint_dir):
        os.makedirs(self.checkpoint_dir)
    self.epoch = 1

    save_config(cfg, os.path.join(self.log_dir, 'config.yml'))
    self.logger.log('Train dataset has {} images and {} batches.'.format(
        len(train_dataset), len(self.train_loader)))
    self.logger.log('Val dataset has {} images and {} batches.'.format(
        len(val_dataset), len(self.val_loader)))

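# Hypothetical usage of the constructor above: load a YAML config and build the
# object. The class name 'Trainer' and the config path are assumptions for
# illustration, not part of the original code.
import yaml

with open('configs/sr_baseline.yml') as f:
    cfg = yaml.safe_load(f)

trainer = Trainer(cfg)
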
def val(self, model, sess, global_step):
    # load latest checkpoint
    model.load(sess)

    # initialize loss and score
    losses = list()
    scores = list()

    # get validation data
    val_iterator = self.data.get_val_iterator(self.batch_size)

    # define loop
    num_batches_per_epoch = (self.data.val_size - 1) // self.batch_size + 1
    for step in tqdm(range(1, num_batches_per_epoch + 1)):
        batch_A, batch_B, \
            batch_sentence_diff, batch_extra_features, labels = next(val_iterator)
        cur_batch_length = len(batch_A)

        feed_dict = {
            model.input_A: batch_A,
            model.input_B: batch_B,
            model.dropout_keep_prob: 1,
            model.sentence_vector_diff: batch_sentence_diff,
            model.extra_features: batch_extra_features,
            model.labels: [int(l) for l in labels]
        }

        loss, score, _ = model.val(sess, feed_dict=feed_dict)
        losses.append(loss)
        scores.append(score)

    val_loss = np.mean(losses)
    val_score = np.mean(scores)

    # summarize val loss and score
    self.summary_writer.summarize(global_step,
                                  summarizer="val",
                                  summaries_dict={
                                      "score": np.array(val_score),
                                      "loss": np.array(val_loss)
                                  })

    # save as best model if it is the best loss so far
    best_loss = float(getattr(self.config, "best_loss", 1e+5))
    if val_loss < best_loss:
        self.logger.warn(
            "[Step {}] Saving for best loss : {:.5f} -> {:.5f}".format(
                global_step, best_loss, val_loss))
        model.save(
            sess,
            os.path.join(self.checkpoint_dir, "best_loss", "best_loss.ckpt"))
        setattr(self.config, "best_loss", "{:.5f}".format(val_loss))

        # save best config
        setattr(self.config, "best_step", str(self.global_step))
        setattr(self.config, "best_epoch", str(self.cur_epoch))
        save_config(self.config.checkpoint_dir, self.config)

    return val_loss, val_score

def val(self, model, sess, global_step):
    # Load latest checkpoint
    model.load(sess)
    sess.run(model.data_iterator.initializer)

    # Initialize loss and score
    losses = list()
    scores = list()

    # Define loop
    num_batches_per_epoch = (self.data.val_size - 1) // self.batch_size + 1
    loop = tqdm(range(1, num_batches_per_epoch + 1))
    for step in loop:
        feed_dict = {
            model.lstm_dropout_keep_prob: 1,
            model.num_negative_samples: 4,
            model.embed_dropout_keep_prob: 1,
            model.dense_dropout_keep_prob: 1
        }

        loss, score = sess.run([model.loss, model.accuracy], feed_dict=feed_dict)
        losses.append(loss)
        scores.append(score)

    val_loss = np.mean(losses)
    val_score = np.mean(scores)

    # Summarize val loss and score
    self.summary_writer.summarize(global_step,
                                  summarizer="val",
                                  summaries_dict={
                                      "score": np.array(val_score),
                                      "loss": np.array(val_loss)
                                  })

    # Save as best model if it is the best loss so far
    best_loss = float(getattr(self.config, "best_loss", 1e+5))
    if val_loss < best_loss:
        self.logger.warn(
            "[Step {}] Saving for best loss : {:.5f} -> {:.5f}".format(
                global_step, best_loss, val_loss))
        model.save(
            sess,
            os.path.join(self.checkpoint_dir, "best_loss", "best_loss.ckpt"))
        setattr(self.config, "best_loss", "{:.5f}".format(val_loss))

        # Save best config
        setattr(self.config, "best_step", str(self.global_step))
        setattr(self.config, "best_epoch", str(self.cur_epoch))
        save_config(self.config.checkpoint_dir, self.config)

    return val_loss, val_score

def val(self, model, sess, global_step):
    # load latest checkpoint
    model.load(sess)

    # initialize loss and score
    losses = list()
    scores = list()

    # get validation data
    val_iterator = self.data.get_val_iterator(self.batch_size)

    # define loop
    num_batches_per_epoch = (self.data.val_size - 1) // self.batch_size + 1
    loop = tqdm(range(1, num_batches_per_epoch + 1))
    for step in loop:
        val_queries, val_replies, val_queries_lengths, val_replies_lengths = next(val_iterator)

        feed_dict = {
            model.input_queries: val_queries,
            model.input_replies: val_replies,
            model.queries_lengths: val_queries_lengths,
            model.replies_lengths: val_replies_lengths,
            model.dropout_keep_prob: 1,
            model.num_negative_samples: self.config.num_negative_samples
        }

        loss, score, _ = model.val(sess, feed_dict=feed_dict)
        losses.append(loss)
        scores.append(score)

    val_loss = np.mean(losses)
    val_score = np.mean(scores)

    # summarize val loss and score
    self.summary_writer.summarize(global_step,
                                  summarizer="val",
                                  summaries_dict={
                                      "score": np.array(val_score),
                                      "loss": np.array(val_loss)
                                  })

    # save as best model if it is the best loss so far
    best_loss = float(getattr(self.config, "best_loss", 1e+5))
    if val_loss < best_loss:
        self.logger.warn(
            "[Step {}] Saving for best loss : {:.5f} -> {:.5f}".format(
                global_step, best_loss, val_loss))
        model.save(
            sess,
            os.path.join(self.checkpoint_dir, "best_loss", "best_loss.ckpt"))
        setattr(self.config, "best_loss", "{:.5f}".format(val_loss))

        # save best config
        setattr(self.config, "best_step", str(self.global_step))
        setattr(self.config, "best_epoch", str(self.cur_epoch))
        save_config(self.config.checkpoint_dir, self.config)

    return val_loss, val_score

def run_once(args):
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Get data loaders
    data_loader = get_data_loader(cfg.data_loader)
    train_data = data_loader.get_train_loader()
    test_data = data_loader.get_test_loader()

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    if cfg.checkpoint != "":
        resume_path = path + "/" + cfg.checkpoint
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    # -- Should have some kind of reporting agent
    # TODO Implement reporting agent

    # -- Init finished
    save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    eval_freq = cfg.train.eval_freq
    no_epochs = cfg.train.no_epochs - agent.get_train_epoch()
    for epoch in range(no_epochs):
        log.info("Train epoch: {}".format(epoch))
        agent.train(train_data)
        if epoch % eval_freq == 0:
            agent.test(test_data)
        print("Finished an epoch :D")

    with open(path + "/loss_values_train", "wb") as f:
        pickle.dump(agent.loss_values_train, f)
    with open(path + "/loss_values_test", "wb") as f:
        pickle.dump(agent.loss_values_test, f)

    agent.eval_agent()

def run_once(args):
    cfg, run_id, path = args

    # -- Set seed
    cfg.general.seed = utils.set_seed(cfg.general.seed)

    # -- Resume agent and metrics if checkpoints are available
    # TODO Resume
    if cfg.checkpoint != "":
        resume_path = path + "/" + cfg.checkpoint
        log.info("Resuming training ...")
        cfg.agent.resume = resume_path

    # -- Get agent
    agent = get_agent(cfg.agent)

    # -- Should have some kind of reporting agent
    # TODO Implement reporting agent

    # -- Init finished
    save_config(os.path.join(cfg.general.common.save_path, "ran_cfg"), cfg)

    agent.eval_agent()

def main():

    args = parser.parse_args()

    ##################################################
    # DATASET
    ##################################################
    if args.model_save_path is not None:
        # Load a config file (.yml)
        params = load_config(args.config_path)
    # NOTE: Retrain the saved model from the last checkpoint
    elif args.saved_model_path is not None:
        params = load_config(os.path.join(args.saved_model_path, 'config.yml'))
    else:
        raise ValueError("Set model_save_path or saved_model_path.")

    # Load dataset
    train_data = Dataset(
        data_save_path=args.data_save_path, backend=params['backend'],
        input_channel=params['input_channel'], use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'], data_type='train',
        data_size=params['data_size'], label_type=params['label_type'],
        batch_size=params['batch_size'], max_epoch=params['num_epoch'],
        splice=params['splice'], num_stack=params['num_stack'],
        num_skip=params['num_skip'], sort_utt=True,
        sort_stop_epoch=params['sort_stop_epoch'], tool=params['tool'],
        num_enque=None, dynamic_batching=params['dynamic_batching'])
    dev_clean_data = Dataset(
        data_save_path=args.data_save_path, backend=params['backend'],
        input_channel=params['input_channel'], use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'], data_type='dev_clean',
        data_size=params['data_size'], label_type=params['label_type'],
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=True, tool=params['tool'])
    dev_other_data = Dataset(
        data_save_path=args.data_save_path, backend=params['backend'],
        input_channel=params['input_channel'], use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'], data_type='dev_other',
        data_size=params['data_size'], label_type=params['label_type'],
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        shuffle=True, tool=params['tool'])
    test_clean_data = Dataset(
        data_save_path=args.data_save_path, backend=params['backend'],
        input_channel=params['input_channel'], use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'], data_type='test_clean',
        data_size=params['data_size'], label_type=params['label_type'],
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        tool=params['tool'])
    test_other_data = Dataset(
        data_save_path=args.data_save_path, backend=params['backend'],
        input_channel=params['input_channel'], use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'], data_type='test_other',
        data_size=params['data_size'], label_type=params['label_type'],
        batch_size=params['batch_size'], splice=params['splice'],
        num_stack=params['num_stack'], num_skip=params['num_skip'],
        tool=params['tool'])

    params['num_classes'] = train_data.num_classes

    ##################################################
    # MODEL
    ##################################################
    # Model setting
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    if args.model_save_path is not None:
        # Set save path
        save_path = mkdir_join(args.model_save_path, params['backend'],
                               params['model_type'], params['label_type'],
                               params['data_size'], model.name)
        model.set_save_path(save_path)

        # Save config file
        save_config(config_path=args.config_path, save_path=model.save_path)

        # Setting for logging
        logger = set_logger(model.save_path)

        if os.path.isdir(params['char_init']):
            # NOTE: Start training from the pre-trained character model
            model.load_checkpoint(save_path=params['char_init'],
                                  epoch=-1,
                                  load_pretrained_model=True)

        # Count total parameters
        for name in sorted(list(model.num_params_dict.keys())):
            num_params = model.num_params_dict[name]
            logger.info("%s %d" % (name, num_params))
        logger.info("Total %.3f M parameters" %
                    (model.total_parameters / 1000000))

        # Define optimizer
        model.set_optimizer(optimizer=params['optimizer'],
                            learning_rate_init=float(params['learning_rate']),
                            weight_decay=float(params['weight_decay']),
                            clip_grad_norm=params['clip_grad_norm'],
                            lr_schedule=False,
                            factor=params['decay_rate'],
                            patience_epoch=params['decay_patient_epoch'])

        epoch, step = 1, 0
        learning_rate = float(params['learning_rate'])
        metric_dev_best = 1

    # NOTE: Retrain the saved model from the last checkpoint
    elif args.saved_model_path is not None:
        # Set save path
        model.save_path = args.saved_model_path

        # Setting for logging
        logger = set_logger(model.save_path, restart=True)

        # Define optimizer
        model.set_optimizer(
            optimizer=params['optimizer'],
            learning_rate_init=float(params['learning_rate']),  # on-the-fly
            weight_decay=float(params['weight_decay']),
            clip_grad_norm=params['clip_grad_norm'],
            lr_schedule=False,
            factor=params['decay_rate'],
            patience_epoch=params['decay_patient_epoch'])

        # Restore the last saved model
        epoch, step, learning_rate, metric_dev_best = model.load_checkpoint(
            save_path=args.saved_model_path, epoch=-1, restart=True)

    else:
        raise ValueError("Set model_save_path or saved_model_path.")

    train_data.epoch = epoch - 1

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    logger.info('PID: %s' % os.getpid())
    logger.info('USERNAME: %s' % os.uname()[1])

    # Set process name
    setproctitle('libri_' + params['backend'] + '_' + params['model_type'] +
                 '_' + params['label_type'] + '_' + params['data_size'])

    ##################################################
    # TRAINING LOOP
    ##################################################
    # Define learning rate controller
    lr_controller = Controller(
        learning_rate_init=learning_rate,
        backend=params['backend'],
        decay_start_epoch=params['decay_start_epoch'],
        decay_rate=params['decay_rate'],
        decay_patient_epoch=params['decay_patient_epoch'],
        lower_better=True)

    # Setting for tensorboard
    if params['backend'] == 'pytorch':
        tf_writer = SummaryWriter(model.save_path)

    # Train model
    csv_steps, csv_loss_train, csv_loss_dev = [], [], []
    start_time_train = time.time()
    start_time_epoch = time.time()
    start_time_step = time.time()
    not_improved_epoch = 0
    best_model = model
    loss_train_mean = 0.
    pbar_epoch = tqdm(total=len(train_data))
    while True:
        # Compute loss in the training set (including parameter update)
        batch_train, is_new_epoch = train_data.next()
        model, loss_train_val = train_step(model, batch_train,
                                           params['clip_grad_norm'],
                                           backend=params['backend'])
        loss_train_mean += loss_train_val

        pbar_epoch.update(len(batch_train['xs']))

        if (step + 1) % params['print_step'] == 0:
            # Compute loss in the dev set
            batch_dev = dev_clean_data.next()[0]
            loss_dev = model(batch_dev['xs'], batch_dev['ys'],
                             batch_dev['x_lens'], batch_dev['y_lens'],
                             is_eval=True)

            loss_train_mean /= params['print_step']
            csv_steps.append(step)
            csv_loss_train.append(loss_train_mean)
            csv_loss_dev.append(loss_dev)

            # Logging by tensorboard
            if params['backend'] == 'pytorch':
                tf_writer.add_scalar('train/loss', loss_train_mean, step + 1)
                tf_writer.add_scalar('dev/loss', loss_dev, step + 1)
                for name, param in model.named_parameters():
                    name = name.replace('.', '/')
                    tf_writer.add_histogram(name,
                                            param.data.cpu().numpy(), step + 1)
                    tf_writer.add_histogram(name + '/grad',
                                            param.grad.data.cpu().numpy(),
                                            step + 1)

            duration_step = time.time() - start_time_step
            logger.info(
                "...Step:%d(epoch:%.3f) loss:%.3f(%.3f)/lr:%.5f/batch:%d/x_lens:%d (%.3f min)" %
                (step + 1, train_data.epoch_detail, loss_train_mean, loss_dev,
                 learning_rate, train_data.current_batch_size,
                 max(batch_train['x_lens']) * params['num_stack'],
                 duration_step / 60))
            start_time_step = time.time()
            loss_train_mean = 0.
        step += 1

        # Save checkpoint and evaluate model per epoch
        if is_new_epoch:
            duration_epoch = time.time() - start_time_epoch
            logger.info('===== EPOCH:%d (%.3f min) =====' %
                        (epoch, duration_epoch / 60))

            # Save figure of loss
            plot_loss(csv_loss_train, csv_loss_dev, csv_steps,
                      save_path=model.save_path)

            if epoch < params['eval_start_epoch']:
                # Save the model
                model.save_checkpoint(model.save_path, epoch, step,
                                      learning_rate, metric_dev_best)
            else:
                start_time_eval = time.time()

                # dev
                if 'word' in params['label_type']:
                    metric_dev_epoch, _ = do_eval_wer(
                        models=[model],
                        dataset=dev_clean_data,
                        beam_width=1,
                        max_decode_len=MAX_DECODE_LEN_WORD,
                        eval_batch_size=1)
                    logger.info('  WER (dev-clean): %.3f %%' %
                                (metric_dev_epoch * 100))
                else:
                    metric_dev_epoch, wer_dev_clean_epoch, _ = do_eval_cer(
                        models=[model],
                        dataset=dev_clean_data,
                        beam_width=1,
                        max_decode_len=MAX_DECODE_LEN_CHAR,
                        eval_batch_size=1)
                    logger.info('  CER / WER (dev-clean): %.3f %% / %.3f %%' %
                                ((metric_dev_epoch * 100),
                                 (wer_dev_clean_epoch * 100)))

                if metric_dev_epoch < metric_dev_best:
                    metric_dev_best = metric_dev_epoch
                    not_improved_epoch = 0
                    best_model = copy.deepcopy(model)
                    logger.info('||||| Best Score |||||')

                    # Save the model
                    model.save_checkpoint(model.save_path, epoch, step,
                                          learning_rate, metric_dev_best)

                    # dev-other & test
                    if 'word' in params['label_type']:
                        metric_dev_other_epoch, _ = do_eval_wer(
                            models=[model],
                            dataset=dev_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (dev-other): %.3f %%' %
                                    (metric_dev_other_epoch * 100))

                        wer_test_clean, _ = do_eval_wer(
                            models=[model],
                            dataset=test_clean_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (test-clean): %.3f %%' %
                                    (wer_test_clean * 100))

                        wer_test_other, _ = do_eval_wer(
                            models=[model],
                            dataset=test_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_WORD,
                            eval_batch_size=1)
                        logger.info('  WER (test-other): %.3f %%' %
                                    (wer_test_other * 100))
                        logger.info('  WER (test-mean): %.3f %%' %
                                    ((wer_test_clean + wer_test_other) * 100 / 2))
                    else:
                        metric_dev_other_epoch, wer_dev_other_epoch, _ = do_eval_cer(
                            models=[model],
                            dataset=dev_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info('  CER / WER (dev-other): %.3f %% / %.3f %%' %
                                    ((metric_dev_other_epoch * 100),
                                     (wer_dev_other_epoch * 100)))

                        cer_test_clean, wer_test_clean, _ = do_eval_cer(
                            models=[model],
                            dataset=test_clean_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info('  CER / WER (test-clean): %.3f %% / %.3f %%' %
                                    ((cer_test_clean * 100),
                                     (wer_test_clean * 100)))

                        cer_test_other, wer_test_other, _ = do_eval_cer(
                            models=[model],
                            dataset=test_other_data,
                            beam_width=1,
                            max_decode_len=MAX_DECODE_LEN_CHAR,
                            eval_batch_size=1)
                        logger.info('  CER / WER (test-other): %.3f %% / %.3f %%' %
                                    ((cer_test_other * 100),
                                     (wer_test_other * 100)))
                        logger.info('  CER / WER (test-mean): %.3f %% / %.3f %%' %
                                    (((cer_test_clean + cer_test_other) * 100 / 2),
                                     ((wer_test_clean + wer_test_other) * 100 / 2)))
                else:
                    not_improved_epoch += 1

                duration_eval = time.time() - start_time_eval
                logger.info('Evaluation time: %.3f min' % (duration_eval / 60))

                # Early stopping
                if not_improved_epoch == params['not_improved_patient_epoch']:
                    break

                # Update learning rate
                model.optimizer, learning_rate = lr_controller.decay_lr(
                    optimizer=model.optimizer,
                    learning_rate=learning_rate,
                    epoch=epoch,
                    value=metric_dev_epoch)

                if epoch == params['convert_to_sgd_epoch']:
                    # Convert to fine-tuning stage
                    model.set_optimizer(
                        'sgd',
                        learning_rate_init=learning_rate,
                        weight_decay=float(params['weight_decay']),
                        clip_grad_norm=params['clip_grad_norm'],
                        lr_schedule=False,
                        factor=params['decay_rate'],
                        patience_epoch=params['decay_patient_epoch'])
                    logger.info('========== Convert to SGD ==========')

                    # Inject Gaussian noise to all parameters
                    if float(params['weight_noise_std']) > 0:
                        model.weight_noise_injection = True

            pbar_epoch = tqdm(total=len(train_data))
            print('========== EPOCH:%d (%.3f min) ==========' %
                  (epoch, duration_epoch / 60))

            if epoch == params['num_epoch']:
                break

            start_time_step = time.time()
            start_time_epoch = time.time()
            epoch += 1

    # TODO: evaluate the best model by beam search here

    duration_train = time.time() - start_time_train
    logger.info('Total time: %.3f hour' % (duration_train / 3600))

    if params['backend'] == 'pytorch':
        tf_writer.close()

    # Training was finished correctly
    with open(os.path.join(model.save_path, 'COMPLETE'), 'w') as f:
        f.write('')

def main():
    global parser, args

    # arguments
    parser = argparse.ArgumentParser(description='byol-lightning-test')

    # Architecture & hyper-parameters
    parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet',
                        help='model architecture: [resnet, ...] (default: resnet)')
    parser.add_argument('--depth', type=int, default=18, help='Model depth.')
    parser.add_argument('-c', '--checkpoint', default='../checkpoints', type=str,
                        metavar='PATH',
                        help='path to save checkpoint (default: checkpoint)')
    parser.add_argument('--epoch', type=int, default=100,
                        help='number of epochs')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='batch size')
    parser.add_argument('--lr', '--learning-rate', default=1, type=float,
                        metavar='LR', help='initial learning rate')
    parser.add_argument('--num-classes', type=int, default=100,
                        help='number of classes')
    parser.add_argument('--from-scratch', action='store_true', default=False,
                        help='train from scratch instead of loading a pre-trained model')
    parser.add_argument('--tune-all', action='store_true', default=False,
                        help='fine-tune all layers instead of only the final layer')

    # Device options
    parser.add_argument('--manualSeed', type=int, help='manual seed')
    parser.add_argument('--gpu-id', default='0', type=str,
                        help='id(s) for CUDA_VISIBLE_DEVICES')
    parser.add_argument('--model-path', '--mp', type=str,
                        help='byol trained model path')

    # Paths
    parser.add_argument('-d', '--dataset', default='neu', type=str)
    parser.add_argument('--image_folder', type=str, required=True,
                        help='path to your folder of images for self-supervised learning')
    parser.add_argument('--board-path', '--bp', default='../board', type=str,
                        help='tensorboardx path')
    parser.add_argument('--board-tag', '--tg', default='fine-tuned', type=str,
                        help='tensorboardx writer tag')

    args = parser.parse_args()

    # Use CUDA
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    use_cuda = torch.cuda.is_available()

    # Random seed
    if args.manualSeed is None:
        args.manualSeed = random.randint(1, 10000)
    random.seed(args.manualSeed)
    np.random.seed(args.manualSeed)
    if use_cuda:
        torch.cuda.manual_seed_all(args.manualSeed)

    # constants
    args.image_size = 256
    args.workers = multiprocessing.cpu_count()

    args.task_time = datetime.now().isoformat()
    output_name = "{}{:d}-bs{:d}-lr{:.5f}-{}".format(args.arch, args.depth,
                                                     args.batch_size, args.lr,
                                                     args.board_tag)
    args.checkpoint = os.path.join(args.checkpoint, args.dataset, output_name,
                                   args.task_time)
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)
    config.save_config(args, os.path.join(args.checkpoint, "config.txt"))

    writer_train = SummaryWriter(log_dir=os.path.join(
        args.board_path, args.dataset, output_name, args.task_time, "train"))
    writer_test = SummaryWriter(log_dir=os.path.join(
        args.board_path, args.dataset, output_name, args.task_time, "test"))

    if args.arch == "resnet":
        if args.depth == 18:
            model = models.resnet18(pretrained=False).cuda()
        elif args.depth == 34:
            model = models.resnet34(pretrained=False).cuda()
        elif args.depth == 50:
            model = models.resnet50(pretrained=False).cuda()
        elif args.depth == 101:
            model = models.resnet101(pretrained=False).cuda()
        else:
            raise ValueError("Not supported depth")

    if not args.from_scratch:
        checkpoint = torch.load(args.model_path)
        model.load_state_dict(checkpoint)

    print("\t==> Fine tune full layers? : {}".format(str(args.tune_all)))

    # Simple manual fine-tuning logic:
    # if tune_all == False, only the last layer will be fine-tuned
    if not args.tune_all:
        params = model.parameters()
        for param in params:
            param.requires_grad = False

    model.num_classes = args.num_classes
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, args.num_classes)
    model = torch.nn.DataParallel(model).cuda()

    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss().cuda()
    softmax = nn.Softmax(1).cuda()

    # Data loading code
    traindir = os.path.join(args.image_folder, 'train')
    testdir = os.path.join(args.image_folder, 'test')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    trainloader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            traindir,
            transforms.Compose([
                transforms.Resize(args.image_size),
                transforms.RandomHorizontalFlip(),
                transforms.RandomVerticalFlip(),
                transforms.ColorJitter(0.4, 0.4, 0.4),
                transforms.ToTensor(),
                # normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)
    testloader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            testdir,
            transforms.Compose([
                transforms.Resize(args.image_size),
                transforms.ToTensor(),
                # normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    losses_train = AverageMeter()
    top1_train = AverageMeter()
    top5_train = AverageMeter()
    losses_test = AverageMeter()
    top1_test = AverageMeter()
    top5_test = AverageMeter()

    for epoch in range(args.epoch):
        bar_train = Bar('Processing', max=len(trainloader))
        bar_test = Bar('Processing', max=len(testloader))
        train(model, criterion, opt, softmax, bar_train, epoch, trainloader,
              losses_train, top1_train, top5_train, writer_train)
        test(model, criterion, softmax, bar_test, epoch, testloader,
             losses_test, top1_test, top5_test, writer_test)

    # save your improved network
    torch.save(model.state_dict(),
               os.path.join(args.checkpoint, 'byol-finetune.pt'))

# log_dir=os.path.join(args.board_path, args.dataset,
#                      "{}-{}{:d}-bs{:d}-lr{:.5f}-{}".format(args.arch,
#                                                            args.depth,
#                                                            args.batch_size,
#                                                            args.lr,
#                                                            args.board_tag),
#                      task_time, "train"))

args.task_time = datetime.now().isoformat()
output_name = "{}{:d}-bs{:d}-lr{:.5f}-{}".format(args.arch, args.depth,
                                                 args.batch_size, args.lr,
                                                 args.board_tag)
args.checkpoint = os.path.join(args.checkpoint, args.dataset, output_name,
                               args.task_time)
if not os.path.isdir(args.checkpoint):
    mkdir_p(args.checkpoint)
config.save_config(args, os.path.join(args.checkpoint, "config.txt"))

writer_train = SummaryWriter(
    log_dir=os.path.join(args.board_path, args.dataset, output_name,
                         args.task_time))

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])


def expand_greyscale(t):
    return t.expand(3, -1, -1)


class ImagesDataset(Dataset):
    def __init__(self, folder, image_size):
        super().__init__()

def run(config_file):
    config = load_config(config_file)

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre trained encoder
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(),
         'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(),
         'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [DiceCallback(), IouCallback()]

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/best_full.pth'))

    if config.train.mixup:
        callbacks.append(MixupCallback())

    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )

def val(self, model, sess, global_step):
    # load latest checkpoint
    model.load(sess)
    sess.run(model.data_iterator.initializer)

    # initialize loss and score
    losses = list()
    val_queries, val_replies, val_generated_replies = list(), list(), list()

    # define loop
    num_batches_per_epoch = (self.data.val_size - 1) // self.batch_size + 1
    loop = tqdm(range(1, num_batches_per_epoch + 1))
    for step in loop:
        feed_dict = {model.dropout_keep_prob: 1}

        queries, replies, generated_replies, loss = model.val(
            sess, feed_dict=feed_dict)

        queries = [
            " ".join([token.decode("utf-8") for token in query_tokens])
            for query_tokens in queries
        ]
        replies = [
            " ".join([token.decode("utf-8") for token in reply_tokens])
            for reply_tokens in replies
        ]
        generated_replies = [
            " ".join([token.decode("utf-8") for token in generated_reply_tokens])
            for generated_reply_tokens in generated_replies
        ]

        val_queries.extend(queries)
        val_replies.extend(replies)
        val_generated_replies.extend(generated_replies)
        losses.append(loss)

    val_loss = np.mean(losses)

    # summarize val loss and score
    self.summary_writer.summarize(
        global_step,
        summarizer="val",
        summaries_dict={"loss": np.array(val_loss)})

    # display some generated samples
    random_indices = sorted(
        np.random.choice(100, 10, replace=False).tolist())
    for idx in random_indices:
        self.logger.info(
            self.generation_summary.format(val_queries[idx],
                                           val_replies[idx],
                                           val_generated_replies[idx]))

    # save as best model if it is best loss
    best_loss = float(getattr(self.config, "best_loss", 1e+5))
    if val_loss < best_loss:
        self.logger.warn(
            "[Step {}] Saving for best loss : {:.5f} -> {:.5f}".format(
                global_step, best_loss, val_loss))
        model.save(
            sess,
            os.path.join(self.checkpoint_dir, "best_loss", "best_loss.ckpt"))
        setattr(self.config, "best_loss", "{:.5f}".format(val_loss))

        # save best config
        setattr(self.config, "best_step", str(self.global_step))
        setattr(self.config, "best_epoch", str(self.cur_epoch))
        save_config(self.config.checkpoint_dir, self.config)

        with open(
                os.path.join(self.checkpoint_dir, "best_loss",
                             "generated_result.txt"), "w") as f:
            for query, reply, generated_reply in zip(
                    val_queries, val_replies, val_generated_replies):
                f.write("{}\t{}\t{}\n".format(query, reply, generated_reply))

    return val_loss

def run(config_file):
    config = load_config(config_file)

    # set up the environment flags for working with the KAGGLE GPU or COLAB_GPU
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    # save the configuration to the working dir
    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    # Enter the GPUs you have
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    # our dataset has an explicit validation folder, use that later
    all_transforms['valid'] = get_transforms(config.transforms.test)

    print("before rajat config", config.data.height, config.data.width)

    # fetch the dataloaders we need
    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           img_size=(config.data.height, config.data.width),
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # creating the segmentation model with pre-trained encoder
    '''
    dumping the parameters for the smp library
    encoder_name: str = "resnet34",
    encoder_depth: int = 5,
    encoder_weights: str = "imagenet",
    decoder_use_batchnorm: bool = True,
    decoder_channels: List[int] = (256, 128, 64, 32, 16),
    decoder_attention_type: Optional[str] = None,
    in_channels: int = 3,
    classes: int = 1,
    activation: Optional[Union[str, callable]] = None,
    aux_params: Optional[dict] = None,
    '''
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # fetch the loss
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(),
         'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(),
         'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    '''
    dumping the catalyst supervised runner
    https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    model (Model): Torch model object
    device (Device): Torch device
    input_key (str): Key in batch dict mapping for model input
    output_key (str): Key in output dict model output will be stored under
    input_target_key (str): Key in batch dict mapping for target
    '''
    runner = SupervisedRunner(model=model, device=get_device())

    # @pavel,srk,rajat,vladimir,pudae check the IoU and the Dice callbacks
    callbacks = [DiceCallback(), IouCallback()]

    # adding patience
    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    # thanks for handling the distributed training
    '''
    we are gonna take zero_grad after accumulation_steps
    '''
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend([
            CriterionCallback(),
            OptimizerCallback(accumulation_steps=accumulation_steps)
        ])

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))

    '''
    pudae, please add the callback
    https://arxiv.org/pdf/1710.09412.pdf
    **srk adding the mixup callback
    '''
    if config.train.mixup:
        callbacks.append(MixupCallback())

    # @rajat implemented cutmix, a weighted combination of cutout and mixup
    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    '''
    rajat introducing the training loop
    https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    take care of the nvidia fp16 precision
    '''
    print(config.work_dir)
    print(config.train.minimize_metric)
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=False,
    )

def run(config_file):
    config = load_config(config_file)

    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            debug=config.debug
        )
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre trained encoder
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(),
         'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(),
         'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    callbacks = [DiceCallback(), IouCallback()]

    if config.train.early_stop_patience > 0:
        callbacks.append(EarlyStoppingCallback(
            patience=config.train.early_stop_patience))

    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [CriterionCallback(),
             OptimizerCallback(accumulation_steps=accumulation_steps)]
        )

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(CheckpointCallback(
            resume=config.work_dir + '/checkpoints/last_full.pth'))

    if config.train.mixup:
        callbacks.append(MixupCallback())

    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )

def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name', type=str, default='',
                        help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes', type=str, nargs='*',
                        default=['task', 'arch', 'lr'])
    parser.add_argument('--torch-file-system', action='store_true')

    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)

    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)

    update_namespace(args, config_dict)

    # set sharing strategy file system in case /dev/shm/ limits are small
    if args.torch_file_system:
        torch.multiprocessing.set_sharing_strategy('file_system')

    training_name = get_training_name(args)
    base_save_dir = generate_save_dir(args, training_name, sys.argv[1:])

    setattr(args, 'training_name', training_name)
    setattr(args, 'save_dir', os.path.join(base_save_dir, 'checkpoints'))
    setattr(args, 'tensorboard_logdir', os.path.join(base_save_dir, 'tensorboard'))

    save_config(vars(args), base_save_dir)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info(
                'NOTE: you may get faster training with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
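
# Sketch of how a fairseq-style training script is usually launched: cli_main()
# is invoked from a __main__ guard. This guard is an assumption about the
# surrounding module, not code taken from the original file.
if __name__ == '__main__':
    cli_main()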