def train(restore, is_master=True):
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    with strategy.scope():
        encoders = get_encoders()
        dataset = get_dataset(encoders)
        train_data = dataset.batch(config.BATCH_SIZE)

        _, generator = get_generator(encoders)
        checkpoint_path = path.join(config.CHECKPOINT_DIR, "keras", "generator.ckpt")
        if restore:
            generator.load_weights(checkpoint_path)

        callbacks = []
        if is_master:
            generator.summary()
            stats_filename = datetime.now().strftime("%Y%m%d_%H%M") + ".csv"
            callbacks = [
                K.callbacks.CSVLogger(
                    path.join(config.LOG_DIR, "stats", stats_filename)),
                # K.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True),
                EvaluationLogger(generator, dataset, encoders)
            ]

        initial_epoch = generator.optimizer.iterations.numpy() // config.STEPS_PER_EPOCH
        generator.fit(train_data,
                      epochs=config.NUM_EPOCHS,
                      initial_epoch=initial_epoch,
                      steps_per_epoch=config.STEPS_PER_EPOCH,
                      callbacks=callbacks)
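# Hedged sketch (not from this codebase): EvaluationLogger is used as a Keras callback
# above and below but is not defined in this section. Assuming it follows the standard
# tf.keras callback interface, a minimal placeholder could look like the following; the
# constructor arguments and the on_epoch_end body are assumptions for illustration only.
import tensorflow as tf

class EvaluationLoggerSketch(tf.keras.callbacks.Callback):
    def __init__(self, generator, dataset, encoders):
        super().__init__()
        self.generator = generator
        self.dataset = dataset
        self.encoders = encoders

    def on_epoch_end(self, epoch, logs=None):
        # e.g. run the generator on a fixed batch and log or save the outputs.
        pass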
def _get_dataset(name, batch_size, buffer_size, partial):
    dataset_path = _util.get_rel_datasets_path(name)
    _util.ensure_dir(dataset_path)

    return get_dataset(dataset_path,
                       batch_size=batch_size,
                       buffer_size=buffer_size,
                       partial=partial).map(_only_cropped_scan)
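# Hedged sketch (assumption): `_only_cropped_scan` is mapped over the dataset above but is
# not defined in this section. Elsewhere in this section each element is indexed as
# sample[0][0] (scan), sample[0][1] (second array), and sample[1] (label), so one plausible
# form simply drops everything except the scan tensor used as model input. The exact tuple
# layout is an assumption.
def _only_cropped_scan_sketch(features, label):
    # features is assumed to be a (scan, aux) tuple; keep only the scan.
    return features[0]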
def main():
    args = get_arguments()
    logger.info(args)
    target_size = args.get('size')
    padding_ratio = args.get('padding')

    dataset = get_dataset(args.get('dataset', None))
    print(dataset.root_path)
    img_paths = dataset.df_meta[dataset.img_colname].to_list()
    if dataset.relative_path:
        img_paths = [os.path.join(dataset.root_path, p) for p in img_paths]

    new_dataset_name = f'{dataset.name()}_{target_size}_pad={padding_ratio}'
    root_path = os.path.join(environments.DATASET_DIR, new_dataset_name)
    os.makedirs(root_path, exist_ok=True)

    if args.get('gpu', False):
        logger.info('use gpu')
        ctx = mx.gpu()
    else:
        ctx = mx.cpu()

    detector = MtcnnDetector(minsize=100, num_worker=1, ctx=ctx)

    results = []
    for p in tqdm(img_paths, total=len(img_paths)):
        img = cv2.imread(p)
        dirname = p.split('/')[-2]
        filename = p.split('/')[-1].split('.')[0]
        dir_path = os.path.join(root_path, dirname)
        os.makedirs(dir_path, exist_ok=True)

        try:
            clipped, prob = clip_most_humanise_image(
                detector, img, target_size=target_size, padding_ratio=padding_ratio)
            new_filename = f'{filename}_{prob:.3f}.jpg'
        except NotDetectionError:
            # If nothing could be detected, crop from the image center instead.
            img = Image.fromarray(img)
            prob = -1
            clipped = clop_center(img, target_shape=(target_size, target_size))
            clipped = np.array(clipped)
            new_filename = f'{filename}_not-detected.jpg'

        new_path = os.path.join(dir_path, new_filename)
        cv2.imwrite(new_path, clipped)
        results.append([os.path.relpath(new_path, root_path), prob])

    df_meta = pd.DataFrame(results, columns=['img_path', 'prob'])
    df_meta['origin_path'] = dataset.df_meta[dataset.img_colname]
    df_meta[dataset.label_colname] = dataset.df_meta[dataset.label_colname]
    df_meta.to_csv(os.path.join(root_path, 'meta.csv'), index=False)
def vae_train(config):
    config['outdir'].mkdir(parents=True, exist_ok=True)

    # get device
    if config['use_gpu']:
        device = torch.device('cuda')
        # moving a tensor to GPU
        # useful at BUT cluster to prevent someone from getting the same GPU
        fake = torch.Tensor([1]).to(device)
    else:
        device = torch.device('cpu')

    dataset_class = get_dataset(config['dataset_type'])

    # compute or load mean and std of dataset
    trans = lambda x: logspec(x, **config['spectrum_conf'])
    dataset = dataset_class(config['dataset'], transform=trans)
    dataloader_meanstd = DataLoader(dataset)
    meanstd_norm = get_meanstd_norm(config['meanstd_norm_file'], dataloader_meanstd)

    # load the dataset
    trans = lambda x: meanstd_norm(logspec(x, **config['spectrum_conf']))
    dataset = dataset_class(config['dataset'], transform=trans)
    dataloader_train = DataLoader(dataset,
                                  batch_size=config['batch_size'],
                                  collate_fn=PadCollate(),
                                  shuffle=True)

    # create the model
    model = SeqVAESpeaker(**config['vae_conf']).to(device)

    # store model config
    with open(config['outdir'] / 'vae_config', 'w') as f:
        json.dump(config['vae_conf'], f, indent=2)

    # load loss function
    if config['vae_objective'] == 'elbo':
        loss = ELBOLoss(model).to(device)
    elif config['vae_objective'] == 'elbo_speakerid':
        loss = ELBOSpeakerLoss(model, config['speaker_loss_weight']).to(device)
    else:
        raise KeyError(f'Unknown objective {config["vae_objective"]}')

    # run training
    trainer = Trainer(model, loss, dataloader_train, config['outdir'],
                      device=device, **config['optimizer_conf'])
    trainer.run()
def train(restore):
    encoders = get_encoders()
    dataset = get_dataset(encoders, difficulty=10)
    text_rnn, generator, discriminator, gan = get_models(encoders)
    checkpoint_path = path.join(config.CHECKPOINT_DIR, "keras", "text_rnn.ckpt")
    if restore:
        text_rnn.load_weights(checkpoint_path)

    logger = EvaluationLogger(generator, dataset, encoders)
    accumulator = MetricsAccumulator(path.join(config.LOG_DIR, "stats"))
    _train_on_batch_f = _get_train_on_batch_f(generator, discriminator, gan, accumulator)

    difficulty = 10
    dataset = get_dataset(encoders, difficulty)
    train_data = dataset.batch(config.BATCH_SIZE).take(config.STEPS_PER_EPOCH)

    for epoch in range(config.NUM_EPOCHS):
        # if epoch >= 500 and epoch % 10 == 0:
        #     difficulty += 1
        #     dataset = get_dataset(encoders, difficulty)
        #     train_data = dataset.batch(config.BATCH_SIZE).take(config.STEPS_PER_EPOCH)
        start_time = time.time()
        discr_only_steps = 0  # if epoch < 500 else 1
        for b, (text_inputs_dict, images) in enumerate(train_data):
            print(f"{b} completed", end="\r")
            train_part = TRAIN_D if epoch < 5 else \
                TRAIN_GD if b % (discr_only_steps + 1) == 0 else TRAIN_D
            _train_on_batch_f(text_inputs_dict, images, train_part)

        accumulator.accumulate(epoch)
        logger.on_epoch_end(epoch)
        logging.info(
            "Done with epoch %s took %ss (difficulty=%s; discr_only_steps=%s)",
            epoch, round(time.time() - start_time, 2), difficulty, discr_only_steps)
def standardize(dataset: str):
    """
    Standardizes a dataset to zero mean and unit variance using statistics
    computed over its train split.

    :param dataset: Name of an existing dataset containing train/dev/test splits.
    """
    assert isinstance(dataset, str) and len(dataset)

    tf.enable_eager_execution()

    train_path = _util.get_rel_datasets_path(dataset, "train")
    _util.ensure_dir(train_path)

    dataset_path = _util.get_rel_datasets_path(dataset)

    standardized_name = _get_standardized_name(dataset)
    standardized_path = _util.get_rel_datasets_path(standardized_name)
    # _util.ensure_path_free(standardized_path, empty_ok=True)
    # _util.mkdir(standardized_path)

    train_data = _dataset.get_dataset(train_path, partial=True)
    train_iter = train_data.repeat().make_one_shot_iterator()
    train_records = _dataset.get_records(train_path, partial=True)

    # Compute sample mean over train
    total = train_iter.next()[0][0]
    for _ in tqdm(train_records[1:]):
        sample = train_iter.next()
        total += sample[0][0]
    mean = total / len(train_records)

    # Second pass: accumulate squared deviations from the mean.
    total = tf.square(train_iter.next()[0][0] - mean)
    for _ in tqdm(train_records[1:]):
        sample = train_iter.next()
        scan = sample[0][0]
        total += tf.square(scan - mean)
    # Average over both voxels and records before taking the square root.
    std = tf.sqrt(tf.reduce_mean(total) / len(train_records))

    _standardize_dataset(train_path, dataset, mean, std)
    _standardize_dataset(_util.get_rel_datasets_path(dataset, "dev"), dataset, mean, std)
    _standardize_dataset(_util.get_rel_datasets_path(dataset, "test"), dataset, mean, std)

    _dataset.save_shape(standardized_path, _dataset.load_shape(dataset_path))
    _dataset.save_mean(standardized_path, mean.numpy())
    _dataset.save_std(standardized_path, std.numpy())
def test_simple_cnn_model():
    args = update_args(cfg_file='simple_cnn')
    device = 'cpu'

    train_data_dict = get_dataset(args).train_data_dict
    train_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['train_sampler'],
    )
    validation_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['validation_sampler'],
    )

    model = get_model(
        args=args,
        device=device,
        hparams={
            'learning rate': args.TRAIN.LR,
            'batch size': args.TRAIN.BATCH_SIZE,
        },
    )

    for data, labels in train_dataloader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        assert data.shape[0] == labels.shape[0] == outputs.shape[0] == args.TRAIN.BATCH_SIZE
        assert data.shape[1] == 3
        assert outputs.shape[1] == len(args.DATA.CLASSES)
        break

    for data, labels in validation_dataloader:
        data, labels = data.to(device), labels.to(device)
        outputs = model(data)
        assert data.shape[0] == labels.shape[0] == outputs.shape[0] == args.TRAIN.BATCH_SIZE
        assert data.shape[1] == 3
        assert outputs.shape[1] == len(args.DATA.CLASSES)
        break
def visualize(dataset: str):
    """
    Displays a single scan from the given dataset.

    :param dataset: Name of an existing dataset to visualize.
    """
    assert isinstance(dataset, str) and len(dataset)

    tf.enable_eager_execution()

    dataset_path = _util.get_rel_datasets_path(dataset)
    _util.ensure_dir(dataset_path)

    data = _dataset.get_dataset(dataset_path, 1, 1, partial=True)
    scan = data.make_one_shot_iterator().next()[0][0].numpy()
    show_scan(scan.squeeze(), "")
def downsample(dataset: str, shape: List[int], partial=False):
    """
    Crops and resizes every scan in a dataset to the given shape, writing the
    result as a new "<dataset>_resized" dataset.

    :param dataset: Name of an existing dataset to downsample.
    :param shape: Target 3-D output shape, as three ints.
    :param partial: Whether to load partial records.
    """
    assert isinstance(dataset, str) and len(dataset)
    assert isinstance(shape, list) and all(isinstance(s, int) for s in shape) and len(shape) == 3
    assert isinstance(partial, bool)

    tf.enable_eager_execution()

    dataset_path = _util.get_rel_datasets_path(dataset)
    _util.ensure_dir(dataset_path)

    data = _dataset.get_dataset(dataset_path, 1, 8, partial=partial)

    resized_dataset = "{}_resized".format(dataset)
    resized_path = _util.get_rel_datasets_path(resized_dataset)
    _util.ensure_path_free(resized_path)
    _util.mkdir(resized_path)

    data_iter = data.make_one_shot_iterator()
    records = _dataset.get_records(dataset_path, partial)
    for record in tqdm(records):
        record = record.replace(dataset, resized_dataset)

        sample = data_iter.next()
        scan = sample[0][0].numpy().squeeze()
        # show_scan(scan, "Original")

        crop = crop_image(scan, 1e-5)
        # show_scan(crop, "Crop")

        factors = [s / d for d, s in zip(crop.shape, shape)]
        resized = ndimage.zoom(crop, zoom=factors, order=4)
        # show_scan(resized, "Resized")

        _dataset.write_record(record, resized, sample[0][1].numpy().squeeze(), sample[1].numpy())

    _dataset.save_shape(resized_path, shape)
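# Hedged sketch (assumption): `crop_image(scan, tol)` used above is not shown in this
# section. Based on how it is used (tighten the volume before zooming to the target shape),
# one plausible implementation crops the scan to the bounding box of voxels above the
# tolerance; this is illustration only, not the source's implementation.
import numpy as np

def crop_image_sketch(scan: np.ndarray, tol: float) -> np.ndarray:
    coords = np.argwhere(scan > tol)
    # First and one-past-last occupied index along each axis.
    lo = coords.min(axis=0)
    hi = coords.max(axis=0) + 1
    return scan[lo[0]:hi[0], lo[1]:hi[1], lo[2]:hi[2]]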
def get_dataloader(cfg: object, mode: str) -> tuple:
    """Get dataloader function

    This is a function to get dataloaders.
    Get the dataset, then build the dataloaders.

    Args:
        cfg: Config.
        mode: Mode.
            trainval: For training and validation.
            test: For test.

    Returns:
        Tuple of dataloaders.
    """
    log.info(f"Loading {cfg.data.dataset.name} dataset...")
    dataset = get_dataset(cfg, mode)
    sampler = get_sampler(cfg, mode, dataset)

    if mode == "trainval":
        train_dataloader = DataLoader(cfg, dataset=dataset.train, sampler=sampler.train)
        val_dataloader = DataLoader(cfg, dataset=dataset.val, sampler=sampler.val)
        dataloaders = (train_dataloader, val_dataloader)
    elif mode == "test":
        test_dataloader = DataLoader(cfg, dataset=dataset.test, sampler=sampler.test)
        # Keep the declared return type: a one-element tuple, not a bare DataLoader.
        dataloaders = (test_dataloader,)

    log.info(f"Successfully loaded {cfg.data.dataset.name} dataset.")
    return dataloaders
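# Minimal usage sketch for get_dataloader; `cfg` is assumed to be the same config object
# referenced above (with a data.dataset.name field), so its construction is not shown here.
def load_loaders_sketch(cfg):
    train_loader, val_loader = get_dataloader(cfg, mode="trainval")
    (test_loader,) = get_dataloader(cfg, mode="test")
    return train_loader, val_loader, test_loader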
def _standardize_dataset(dataset_path, dataset, mean, std):
    data = _dataset.get_dataset(dataset_path, partial=True).make_one_shot_iterator()
    records = _dataset.get_records(dataset_path, partial=True)

    standardized_name = _get_standardized_name(dataset)
    standardized_path = dataset_path.replace(dataset, standardized_name)
    _util.ensure_path_free(standardized_path, empty_ok=True)
    _util.mkdir(standardized_path)

    for record in tqdm(records):
        record = record.replace(dataset, standardized_name)

        sample = data.next()
        scan = sample[0][0]
        # show_scan(scan.numpy().squeeze(), "Original")

        standardized = (scan - mean) / std
        # show_scan(standardized.numpy().squeeze(), "Standardized")

        _dataset.write_record(record, standardized.numpy().squeeze(),
                              sample[0][1].numpy().squeeze(), sample[1].numpy())
def train(restore):
    encoders = get_encoders()
    dataset = get_dataset(encoders)
    text_rnn, generator = get_generator(encoders)
    checkpoint_path = path.join(config.CHECKPOINT_DIR, "keras", "generator.ckpt")
    if restore:
        generator.load_weights(checkpoint_path)

    stats_filename = datetime.now().strftime('%Y%m%d_%H%M') + ".csv"
    callbacks = [
        # K.callbacks.TensorBoard(path.join(config.LOG_DIR, "tf_boards")),
        K.callbacks.CSVLogger(
            path.join(config.LOG_DIR, "stats", stats_filename)),
        K.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True),
        EvaluationLogger(generator, dataset, encoders)
    ]

    # https://github.com/keras-team/keras/issues/1872#issuecomment-572606922
    initial_epoch = generator.optimizer.iterations.numpy() // config.STEPS_PER_EPOCH

    train_data = dataset.batch(config.BATCH_SIZE).take(config.STEPS_PER_EPOCH)
    # val_data = dataset.batch(config.BATCH_SIZE).take(8)
    generator.fit(
        train_data,
        epochs=config.NUM_EPOCHS,
        initial_epoch=initial_epoch,
        # validation_data=val_data,
        callbacks=callbacks)

    checkpoint_path = path.join(config.CHECKPOINT_DIR, "keras", "text_rnn.ckpt")
    text_rnn.save_weights(checkpoint_path)
def noise_models_train(config):
    config['outdir'].mkdir(parents=True, exist_ok=True)

    dataset_class = get_dataset(config['dataset_type'])

    # compute or load mean and std of dataset
    trans = lambda x: logspec(x, **config['spectrum_conf'])
    dataset = dataset_class(config['dataset'], transform=trans)
    dataloader_meanstd = DataLoader(dataset)
    meanstd_norm = get_meanstd_norm(config['meanstd_norm_file'], dataloader_meanstd)

    # load the dataset
    dataset = dataset_class(config['dataset'])

    for low, high in config['snrs']:
        for use_norm in [True, False]:
            logging.info(
                f'Noise model for SNR {low}-{high} dB, use norm: {use_norm}')
            name = f'snr_{low}_{high}'
            name = f'{name}_wonorm' if not use_norm else name
            get_noise_stats(config['outdir'] / name, (low, high), dataset,
                            config['spectrum_conf'],
                            meanstd_norm=meanstd_norm if use_norm else None)
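# Hedged sketch for context (not the source's `get_noise_stats`, which is not shown here):
# mixing noise into a signal at a target SNR drawn from the [low, high] dB range above
# conventionally scales the noise so that 10*log10(P_signal / P_noise) equals the target.
import numpy as np

def scale_noise_to_snr(signal: np.ndarray, noise: np.ndarray, snr_db: float) -> np.ndarray:
    p_signal = np.mean(signal ** 2)
    p_noise = np.mean(noise ** 2)
    # Factor applied to the noise so the mixture reaches the requested SNR.
    factor = np.sqrt(p_signal / (p_noise * 10 ** (snr_db / 10)))
    return noise * factor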
class LinearMetrics(nn.Module):

    def __init__(self, input_dim, out_dim):
        super(LinearMetrics, self).__init__()
        self.linear = nn.Linear(input_dim, out_dim)

    def forward(self, x, label):
        return self.linear(x)


if __name__ == '__main__':
    opt = Config()
    device = torch.device("cuda")

    train_dataset = get_dataset(Config.dataset, phase='train',
                                input_shape=environments.INPUT_SHAPE)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=opt.train_batch_size,
                                   shuffle=True,
                                   num_workers=opt.num_workers)
    logger.info('{} train iters per epoch:'.format(len(train_loader)))

    if opt.loss == 'focal_loss':
        criterion = FocalLoss(gamma=2)
    elif opt.loss == 'logloss':
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise ValueError()
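# Hedged usage note: LinearMetrics.forward(x, label) ignores `label`, which presumably keeps
# it call-compatible with label-dependent metric heads (e.g. margin-based ones) used in the
# same training loop. The dimensions below are assumptions for illustration only:
# metric_fc = LinearMetrics(input_dim=512, out_dim=n_classes).to(device)
# logits = metric_fc(features, labels)   # labels accepted but unused by this head
# loss = criterion(logits, labels)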
def main(
    args,
    args_file_path: str,
    tmp_results_dir: str,
    train_log_file_path: str,
) -> None:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    train_data_dict = get_dataset(args).train_data_dict
    train_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['train_sampler'],
    )
    validation_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['validation_sampler'],
    )

    model = get_model(
        args=args,
        device=device,
        hparams={
            'learning rate': args.TRAIN.LR,
            'batch size': args.TRAIN.BATCH_SIZE,
        },
    ).to(device)

    mlflow_logger = MLFlowLogger(experiment_name=args.MLFLOW.EXPERIMENT_NAME)
    checkpoint_callback = ModelCheckpoint(monitor='validation_accuracy')
    trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        distributed_backend=args.TRAIN.DISTRIBUTED_BACKEND,
        gpus=args.TRAIN.GPUS,
        logger=mlflow_logger,
        max_epochs=args.TRAIN.MAX_EPOCHS,
        replace_sampler_ddp=False,
    )

    try:
        exist_error = False
        trainer.fit(model, train_dataloader, validation_dataloader)
    except Exception:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            error_file_path = join(tmp_results_dir, 'error_log.txt')
            with open(error_file_path, 'w') as f:
                traceback.print_exc(file=f)
            exist_error = True
            print()
            print('Failed to train. See error_log.txt on mlflow.')
            print(f'Experiment name: {args.MLFLOW.EXPERIMENT_NAME}')
            print(f'Run id: {run_id}')
            sys.exit(1)
    finally:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            with open(args_file_path, 'w') as f:
                with redirect_stdout(f):
                    print(args.dump())

            mlflow_client = MlflowClient()
            mlflow_client.log_artifact(run_id, args_file_path)
            mlflow_client.log_artifact(run_id, train_log_file_path)
            if exist_error:
                mlflow_client.log_artifact(run_id, error_file_path)

        rmtree(tmp_results_dir, ignore_errors=True)
def objective(trial, args, tmp_results_dir: str) -> float:
    timestamp = datetime.today().strftime('%Y-%m-%d-%H:%M:%S')
    cur_tmp_results_dir = join(tmp_results_dir, timestamp)
    makedirs(cur_tmp_results_dir, exist_ok=True)
    args_file_path = join(cur_tmp_results_dir, 'args.yaml')
    train_log_file_path = join(cur_tmp_results_dir, 'log.txt')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    train_data_dict = get_dataset(args).train_data_dict
    train_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['train_sampler'],
    )
    validation_dataloader = get_dataloader(
        batch_size=args.TRAIN.BATCH_SIZE,
        dataset=train_data_dict['dataset'],
        num_workers=args.DATA.NUM_WORKERS,
        sampler=train_data_dict['validation_sampler'],
    )

    model = get_model(
        args=args,
        device=device,
        trial=trial,
        # Hyperparameters decided by the trial do not need to be listed here.
        # The learning rate is chosen via the trial and recorded in the model's
        # configure_optimizers method.
        hparams={
            'batch size': args.TRAIN.BATCH_SIZE,
        },
    ).to(device)

    mlflow_logger = MLFlowLogger(experiment_name=args.MLFLOW.EXPERIMENT_NAME)
    checkpoint_callback = ModelCheckpoint(monitor='validation_accuracy')
    trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        distributed_backend=args.TRAIN.DISTRIBUTED_BACKEND,
        gpus=args.TRAIN.GPUS,
        logger=mlflow_logger,
        max_epochs=args.TRAIN.MAX_EPOCHS,
        replace_sampler_ddp=False,
    )

    try:
        exist_error = False
        print(f'To see training logs, you can check {train_log_file_path}')
        with open(train_log_file_path, 'w') as f:
            with redirect_stdout(f):
                trainer.fit(model, train_dataloader, validation_dataloader)
    except Exception:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            error_file_path = join(tmp_results_dir, 'error_log.txt')
            with open(error_file_path, 'w') as f:
                traceback.print_exc(file=f)
            exist_error = True
            print()
            print('Failed to train. See error_log.txt on mlflow.')
            print(f'Experiment name: {args.MLFLOW.EXPERIMENT_NAME}')
            print(f'Run id: {run_id}')
            sys.exit(1)
    finally:
        run_id = mlflow_logger.run_id
        if run_id is not None:
            with open(args_file_path, 'w') as f:
                with redirect_stdout(f):
                    print(args.dump())

            mlflow_client = MlflowClient()
            mlflow_client.log_artifact(run_id, args_file_path)
            mlflow_client.log_artifact(run_id, train_log_file_path)
            if exist_error:
                mlflow_client.log_artifact(run_id, error_file_path)

        rmtree(cur_tmp_results_dir, ignore_errors=True)

    return checkpoint_callback.best_model_score
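# Hedged usage sketch (not from the source): `objective` returns the best monitored
# validation score, so it would typically be maximized with an Optuna study along these
# lines; the trial count and temporary-directory name are assumptions.
import optuna

def run_search_sketch(args, tmp_results_dir='results_tmp', n_trials=20):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, args, tmp_results_dir), n_trials=n_trials)
    return study.best_value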
def input_fn():
    dataset = get_dataset(encoders)
    return dataset.batch(config.BATCH_SIZE)
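# Hedged note: an argument-less input_fn of this shape is typically handed to the
# tf.estimator API; whether this codebase actually does so is an assumption.
# estimator.train(input_fn=input_fn, steps=config.STEPS_PER_EPOCH)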
def main(base_path, set_name=None, writer=None):
    """
    Main train/eval loop: builds the requested dataloaders, then either trains the
    model or iterates over all evaluation samples and saves the corresponding predictions.
    """
    # default value
    if set_name is None:
        set_name = ['evaluation']

    if 'training' in set_name:
        # initialize train datasets
        train_loaders = []
        if args.controlled_exp:
            # Use subsets of the datasets so that the final dataset size is constant.
            limit_size = int(args.controlled_size / len(args.train_datasets))
        else:
            limit_size = None

        for dat_name in args.train_datasets:
            # iteration = min(dataset_len) / batch_size; each dataset is consumed at batch size
            if dat_name == 'FreiHand':
                if len(args.train_queries_frei) > 0:
                    train_queries = args.train_queries_frei
                else:
                    train_queries = args.train_queries
                base_path = args.freihand_base_path
            elif dat_name == 'RHD':
                if len(args.train_queries_rhd) > 0:
                    train_queries = args.train_queries_rhd
                else:
                    train_queries = args.train_queries
                base_path = args.rhd_base_path
            elif (dat_name == 'Obman') or (dat_name == 'Obman_hand'):
                train_queries = args.train_queries
            elif dat_name == 'HO3D':
                if len(args.train_queries_ho3d) > 0:
                    train_queries = args.train_queries_ho3d
                else:
                    train_queries = args.train_queries
                base_path = args.ho3d_base_path

            train_dat = get_dataset(
                dat_name,
                'training',
                base_path,
                queries=train_queries,
                train=True,
                limit_size=limit_size,
                # transform=transforms.Compose([transforms.Rescale(256), transforms.ToTensor()]),
            )
            print("Training dataset size: {}".format(len(train_dat)))

            # Initialize train dataloader
            train_loader0 = torch.utils.data.DataLoader(
                train_dat,
                batch_size=args.train_batch,
                shuffle=True,
                num_workers=args.num_workers,
                pin_memory=True,
                drop_last=True,
            )
            train_loaders.append(train_loader0)
        train_loader = ConcatDataloader(train_loaders)

    # Validation loaders are built for both training and evaluation runs.
    val_loaders = []
    for dat_name_val in args.val_datasets:
        if dat_name_val == 'FreiHand':
            val_queries = args.val_queries
            base_path = args.freihand_base_path
        elif dat_name_val == 'RHD':
            val_queries = args.val_queries
            base_path = args.rhd_base_path
        elif dat_name_val == 'HO3D':
            val_queries = args.val_queries
            base_path = args.ho3d_base_path

        val_dat = get_dataset(
            dat_name_val,
            'evaluation',
            base_path,
            queries=val_queries,
            train=False,
        )
        print("Validation dataset size: {}".format(len(val_dat)))

        val_loader = torch.utils.data.DataLoader(
            val_dat,
            batch_size=args.val_batch,
            shuffle=False,
            num_workers=args.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        val_loaders.append(val_loader)
    val_loader = ConcatDataloader(val_loaders)

    if len(args.train_datasets) == 1:
        dat_name = args.train_datasets[0]
    else:
        dat_name = args.train_datasets

    if 'training' in set_name:
        if args.optimizer == "Adam":
            optimizer = optim.Adam(model.parameters(), lr=args.init_lr,
                                   betas=(0.9, 0.999), weight_decay=0)
        if args.optimizer == "AdamW":
            # Use the decoupled-weight-decay optimizer to match the flag.
            optimizer = optim.AdamW(model.parameters(), lr=args.init_lr,
                                    betas=(0.9, 0.999), eps=1e-08,
                                    weight_decay=0.01, amsgrad=False)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                   milestones=args.lr_steps,
                                                   gamma=args.lr_gamma)

        for epoch in range(1, args.total_epochs + 1):
            mode_train = True
            requires = args.train_requires
            TrainVal(mode_train, dat_name, epoch + current_epoch, train_loader,
                     model, optimizer, requires, args, writer)
            torch.cuda.empty_cache()

            # Save parameters and run the test pass at the save interval.
            if (epoch + current_epoch) % args.save_interval == 0:
                mode_train = False
                requires = args.test_requires
                args.train_batch = args.val_batch
                print('For test part:')
                TrainVal(mode_train, dat_name_val, epoch + current_epoch, val_loader,
                         model, optimizer, requires, args, writer)
                torch.cuda.empty_cache()
                save_model(model, optimizer, epoch, current_epoch, args)
            scheduler.step()

    elif 'evaluation' in set_name:
        mode_train = False
        requires = args.test_requires
        optimizer = optim.Adam(model.parameters(), lr=args.init_lr,
                               betas=(0.9, 0.999), weight_decay=0)
        TrainVal(mode_train, dat_name_val, current_epoch, val_loader, model,
                 None, requires, args, writer)
        print("Finished writing predictions. Good luck!")

    print("Done!")
def train(dataset: str, epochs: int, batch_size: int, buffer_size: int, lr: float,
          l2_reg=0., tv_reg=0., ssim_loss=0., sobel_loss=0.):
    """
    Trains an Autoencoder using the specified parameters.

    :param dataset: Existing dataset over which to train. Must contain train, dev,
        {mean,std}.pickle, shape.json
    :param epochs: Number of iterations over training data before termination.
    :param batch_size: Number of training samples per batch.
    :param buffer_size: Number of batches to prefetch.
    :param lr: Adam optimization initial learning rate.
    :param l2_reg: L2 regularization coefficient for kernel weights.
    :param tv_reg: Total Variation regularization coefficient for data.
    :param ssim_loss: SSIM regularization coefficient for data.
    :param sobel_loss: L2 regularization coefficient for data Sobel difference.
    """
    assert isinstance(dataset, str) and len(dataset)
    assert isinstance(epochs, int) and epochs > 0
    assert isinstance(batch_size, int) and batch_size > 0
    assert isinstance(buffer_size, int) and buffer_size > 0
    assert isinstance(lr, float) and lr > 0
    assert isinstance(l2_reg, float) and l2_reg >= 0
    assert isinstance(tv_reg, float) and tv_reg >= 0
    assert isinstance(ssim_loss, float) and ssim_loss >= 0
    assert isinstance(sobel_loss, float) and sobel_loss >= 0

    # Load and ensure required paths.
    weights_path = _util.get_weights_path_by_param(
        model="autoencoder",
        dataset=dataset,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        l2_reg=l2_reg,
        tv_reg=tv_reg,
        ssim_loss=ssim_loss,
        sobel_loss=sobel_loss)
    log_path = os.path.join(weights_path, "logs")
    _util.ensure_path_free(log_path, empty_ok=True)
    _util.mkdir(log_path)

    dataset_path = _util.get_rel_datasets_path(dataset)
    _util.ensure_dir(dataset_path)

    # Load model and input shape.
    shape = _dataset.load_shape(dataset_path)
    mean = _dataset.load_mean(dataset_path)
    std = _dataset.load_std(dataset_path)
    model = Autoencoder(l2_reg)

    # Create input/output placeholders.
    inp = tf.image.per_image_standardization(
        tf.placeholder(tf.float32, shape=[None, *shape]))
    out = model.call(inp)

    # Initialize loss functions.
    total_loss, l2_loss, l2_reg, tv_reg, ssim_loss, sobel_loss = \
        _get_losses(inp, out, batch_size, model.losses, l2_reg, tv_reg, ssim_loss, sobel_loss)

    # Configure training operation.
    train_op = _get_train_op(total_loss, lr)

    # Load datasets
    train_dataset = (_dataset.get_dataset(
        os.path.join(dataset_path, "train"), partial=True).map(
            _only_cropped_scan).batch(batch_size).prefetch(buffer_size))
    dev_dataset = (_dataset.get_dataset(
        os.path.join(dataset_path, "dev"), partial=True).map(
            _only_cropped_scan).batch(batch_size).prefetch(buffer_size))

    # Setup logging and weight saving.
    _tboard.configure(log_path, flush_secs=2)
    saver = tf.train.Saver()

    # Initialize training loop variables.
    best_dev_loss, dev_loss = np.inf, np.inf

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        _logger.info("Counting datasets...")
        train_batches = dataset_iter_len(
            sess, train_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tTrain samples: {}".format(train_batches))
        dev_batches = dataset_iter_len(
            sess, dev_dataset.make_one_shot_iterator().get_next())
        _logger.info("\tDev samples: {}".format(dev_batches))

        train_loss = total_loss / train_batches
        dev_loss = total_loss / dev_batches

        train_dataset = (_dataset.get_dataset(
            os.path.join(dataset_path, "train"), partial=True).map(
                _only_cropped_scan).batch(batch_size).prefetch(buffer_size))

        for epoch in tqdm(range(epochs)):
            train_iter = train_dataset.make_one_shot_iterator().get_next()

            losses = defaultdict(float)
            for _ in range(train_batches):
                sample = sess.run(train_iter)

                _, _train_loss, _l2_loss, _l2_reg, _tv_reg, _ssim_loss, _sobel_loss = \
                    sess.run(
                        [train_op, train_loss, l2_loss, l2_reg, tv_reg, ssim_loss, sobel_loss],
                        feed_dict={inp: sample})

                losses["train/loss/total"] += _train_loss
                losses["train/loss/l2_loss"] += _l2_loss
                losses["train/reg/l2"] += _l2_reg
                losses["train/reg/tv"] += _tv_reg
                losses["train/loss/ssim"] += _ssim_loss
                losses["train/loss/sobel"] += _sobel_loss

            # Increment before doing anything else to avoid zero-indexed epochs.
            epoch += 1

            # Log training losses to tensorboard.
            for name, val in losses.items():
                _tboard.log_value(name, val, step=epoch)
            _logger.info("Epoch {}: train loss {}".format(
                epoch, losses["train/loss/total"]))

            # Compute dev metrics every 2 epochs.
            if epoch < 2 or epoch % 2 == 0:
                losses.clear()

                # Compute and log dev loss
                _dev_loss, _l2_loss, _l2_reg, _tv_reg, _ssim_loss, _sobel_loss = \
                    _get_dev_loss(sess, inp, dev_dataset, dev_batches, dev_loss,
                                  l2_loss, l2_reg, tv_reg, ssim_loss, sobel_loss)

                # Log dev losses to tensorboard.
                _logger.info("Epoch {}: dev loss {}".format(epoch, _dev_loss))
                _tboard.log_value("dev/loss/total", _dev_loss, step=epoch)
                _tboard.log_value("dev/loss/l2_loss", _l2_loss, step=epoch)
                _tboard.log_value("dev/reg/l2", _l2_reg, step=epoch)
                _tboard.log_value("dev/reg/tv", _tv_reg, step=epoch)
                _tboard.log_value("dev/loss/ssim", _ssim_loss, step=epoch)
                _tboard.log_value("dev/loss/sobel", _sobel_loss, step=epoch)

                # Save best model.
                if _dev_loss < best_dev_loss:
                    save_path = saver.save(
                        sess, os.path.join(weights_path, "{}.ckpt".format(epoch)))
                    _logger.info("Saved new best model to {}".format(save_path))
                    best_dev_loss = _dev_loss

                # Plot some reconstruction images
                _logger.info("Generating reconstruction plots...")
                _log_reconstruction_imgs("eval", sess, train_dataset, inp, out,
                                         epoch, mean, std)
                _log_reconstruction_imgs("train", sess, train_dataset, inp, out,
                                         epoch, mean, std)
                    type=str,
                    default='checkpoints',
                    help='save model directory')
opt = parser.parse_args()
print(opt)

logging.basicConfig(
    level=logging.INFO,                  # logging level
    format='%(asctime)s: %(message)s',   # show timestamp and message
    stream=sys.stdout                    # send log output to stdout
)

cudnn.benchmark = True

logging.info('=========== Starting Training ============')

train_data, test_data, char_to_index, index_to_char, n_class = get_dataset(opt)
net = Attention_ocr(use_gpu=opt.use_gpu, NUM_CLASS=n_class)
optimizer = torch.optim.Adam(net.parameters(), lr=opt.lr, betas=(0.9, 0.999))
criterion = losses.Attention_loss()
net = torch.nn.DataParallel(net)
net = net.cuda()
model = Train_Engine(net)
model.fit(index_to_char,
          train_data=train_data,
          test_data=test_data,
          optimizer=optimizer,
          criterion=criterion,
          epochs=opt.epochs,