def train(): """Train function.""" args = get_args("train") if args.need_profiler: from mindspore.profiler.profiling import Profiler profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True) ds = create_dataset(args) G_A = get_generator(args) G_B = get_generator(args) D_A = get_discriminator(args) D_B = get_discriminator(args) load_ckpt(args, G_A, G_B, D_A, D_B) imgae_pool_A = ImagePool(args.pool_size) imgae_pool_B = ImagePool(args.pool_size) generator = Generator(G_A, G_B, args.lambda_idt > 0) loss_D = DiscriminatorLoss(args, D_A, D_B) loss_G = GeneratorLoss(args, generator, D_A, D_B) optimizer_G = nn.Adam(generator.trainable_params(), get_lr(args), beta1=args.beta1) optimizer_D = nn.Adam(loss_D.trainable_params(), get_lr(args), beta1=args.beta1) net_G = TrainOneStepG(loss_G, generator, optimizer_G) net_D = TrainOneStepD(loss_D, optimizer_D) data_loader = ds.create_dict_iterator() reporter = Reporter(args) reporter.info('==========start training===============') for _ in range(args.max_epoch): reporter.epoch_start() for data in data_loader: img_A = data["image_A"] img_B = data["image_B"] res_G = net_G(img_A, img_B) fake_A = res_G[0] fake_B = res_G[1] res_D = net_D(img_A, img_B, imgae_pool_A.query(fake_A), imgae_pool_B.query(fake_B)) reporter.step_end(res_G, res_D) reporter.visualizer(img_A, img_B, fake_A, fake_B) reporter.epoch_end(net_G) if args.need_profiler: profiler.analyse() break reporter.info('==========end training===============')
def predict(): """Predict function.""" args = get_args("predict") G_A = get_generator(args) G_B = get_generator(args) # Use BatchNorm2d with batchsize=1, affine=False, training=True instead of InstanceNorm2d # Use real mean and varance rather than moving_men and moving_varance in BatchNorm2d G_A.set_train(True) G_B.set_train(True) load_ckpt(args, G_A, G_B) imgs_out = os.path.join(args.outputs_dir, "predict") if not os.path.exists(imgs_out): os.makedirs(imgs_out) if not os.path.exists(os.path.join(imgs_out, "fake_A")): os.makedirs(os.path.join(imgs_out, "fake_A")) if not os.path.exists(os.path.join(imgs_out, "fake_B")): os.makedirs(os.path.join(imgs_out, "fake_B")) args.data_dir = 'testA' ds = create_dataset(args) reporter = Reporter(args) reporter.start_predict("A to B") for data in ds.create_dict_iterator(output_numpy=True): img_A = Tensor(data["image"]) path_A = str(data["image_name"][0], encoding="utf-8") fake_B = G_A(img_A) save_image(fake_B, os.path.join(imgs_out, "fake_B", path_A)) reporter.info('save fake_B at %s', os.path.join(imgs_out, "fake_B", path_A)) reporter.end_predict() args.data_dir = 'testB' ds = create_dataset(args) reporter.dataset_size = args.dataset_size reporter.start_predict("B to A") for data in ds.create_dict_iterator(output_numpy=True): img_B = Tensor(data["image"]) path_B = str(data["image_name"][0], encoding="utf-8") fake_A = G_B(img_B) save_image(fake_A, os.path.join(imgs_out, "fake_A", path_B)) reporter.info('save fake_A at %s', os.path.join(imgs_out, "fake_A", path_B)) reporter.end_predict()
    # Assumption: this excerpt begins mid optimizer selection; the "Adam" guard
    # below is reconstructed to match the following elif branch.
    if config.optim == "Adam":
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=lr, weight_decay=config.weight_decay)
    elif config.optim == "SGD":
        optimizer = torch.optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=lr, momentum=config.momentum, weight_decay=config.weight_decay)

    start_iter = 0
    if config.resume:
        print("Loading the trained params and the state of optimizer...")
        start_iter = load_ckpt(config.resume,
                               [("model", model)],
                               [("optimizer", optimizer)])
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr
        print("Starting from iter ", start_iter)

    trainer = Trainer(start_iter, config, device, model, dataset_train,
                      dataset_val, criterion, optimizer, experiment=experiment)
    if config.comet:
        with experiment.train():
            trainer.iterate()
    else:
        trainer.iterate()

# Set the configuration for testing
elif config.mode == "test":

def main(config):
    CASE_NUM = config['case_num']
    DATASET = config['dataset']
    NORMALIZATION = config['normalization']
    BATCH_SIZE = config['batch_size']
    MAX_EPOCH = config['max_epoch']
    OPTIM_TYPE = config['optimizer']
    LR = config['learning_rate']
    LR_STEP = config['lr_step']
    LR_DECAY = config['lr_decay']
    L2_DECAY = config['l2_decay']
    TB_STATE = config['use_tensorboard']
    MODEL_NAME = config['model_name']
    ALPHA = config['alpha']
    BETA = config['beta']
    GAMMA = config['gamma']
    PHI = config['phi']
    LOSS_FN = config['loss_fn']
    KERNEL_SIZE = config['kernel_size']

    result_dir = RESULT_ROOT_DIR + '/' + str(CASE_NUM)
    ckpt_path = result_dir + '/' + 'checkpoint.pt'

    #%% Select data and construct the test loader
    data_fname, data_dim = select_data(DATASET)
    data_path = '../data/' + data_fname
    data_test = NLUDataset(data_path, mode='test', random_seed=42)
    dataloader_test = DataLoader(data_test, batch_size=BATCH_SIZE,
                                 shuffle=True, num_workers=4)
    classes = data_test.labels
    num_classes = len(classes)

    #%% Initialize the model and loss
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    if device == 'cuda':
        print('Using GPU, %s' % torch.cuda.get_device_name(0))
    net = select_model(MODEL_NAME, data_dim, KERNEL_SIZE, num_classes, ALPHA, BETA, PHI)
    net.to(device)
    loss_fn = select_loss(LOSS_FN)

    #%% Load the checkpoint and evaluate on the test set
    net, best_validation_acc = load_ckpt(ckpt_path, net)
    start_time = time.time()
    test_acc, test_loss = evaluate(dataloader_test, device, net, loss_fn)
    curr_time = time.time()
    ttt = curr_time - start_time
    tt1 = ttt / data_test.__len__()
    print('########################################################')
    print('# Test accuracy of %d: %.4f' % (CASE_NUM, test_acc))
    print("# Average %.6f s to process one input" % (tt1))
    print('########################################################')

def main():
    arg = args()
    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)
    assert arg.exp_name.split('/')[0] == 'o', \
        "'o' is the directory of experiment, --exp_name o/..."
    output_dir = arg.exp_name
    if arg.local_rank == 0:
        save_scripts_in_exp_dir(output_dir)
    logger = logging_set(output_dir, arg.local_rank)
    logger.info(arg)
    logger.info('\n================ experiment name:[{}] ===================\n'.format(arg.exp_name))

    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)

    config = edict(yaml.load(open(arg.cfg, 'r')))
    if arg.search:
        assert arg.search in ['None', 'sync', 'random', 'second_order_gradient', 'first_order_gradient']
        config.train.arch_search_strategy = arg.search
    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize
    config.num_workers = arg.num_workers

    print('GPU memory : \ntotal | used\n',
          os.popen('nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader').read())
    logger.info('------------------------------ configuration ---------------------------')
    logger.info('\n==> available {} GPUs , use numbers are {} device is {}\n'.format(
        torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"], torch.cuda.current_device()))
    # torch.cuda._initialized = True
    logger.info(pprint.pformat(config))
    logger.info('------------------------------- -------- ----------------------------')

    best = 0
    criterion = MSELoss()
    Arch = bulid_up_network(config, criterion)
    if config.train.arch_search_strategy == 'random':
        logger.info("==>random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)
        Arch.arch_parameters_random_search()
    if arg.param_flop:
        Arch._print_info()

    if len(arg.gpu) > 1:
        use_multi_gpu = True
        if arg.distributed:
            torch.distributed.init_process_group(backend="nccl")
            # torch.distributed.init_process_group(backend="nccl", init_method='env://')
            local_rank = torch.distributed.get_rank()
            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            Arch.to(device)
            Arch = torch.nn.parallel.DistributedDataParallel(
                Arch, device_ids=[local_rank], output_device=local_rank,
                find_unused_parameters=True)
            logger.info("local rank = {}".format(local_rank))
        else:
            Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        use_multi_gpu = False
        Arch = Arch.cuda()

    Search = Search_Arch(Arch.module, config) if use_multi_gpu else Search_Arch(Arch, config)  # Arch.module for nn.DataParallel
    search_strategy = config.train.arch_search_strategy

    if not arg.distributed:
        train_queue, arch_queue, valid_queue = Dataloaders(search_strategy, config, arg)
    else:
        train_queue, arch_queue, valid_queue, train_sampler_dist = Dataloaders(search_strategy, config, arg)
    # Note: if the search strategy is `None` or `SYNC`, the arch_queue is None!
    logger.info("\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in ['first_order_gradient', 'random', 'None', 'second_order_gradient', 'sync']

    if search_strategy == 'sync':
        # arch_parameters is also registered among the model's parameters,
        # so the weight optimizer will also update the arch_parameters
        logger.info("sync: The arch_parameters is also optimized by the weight-optimizer synchronously")
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )
    else:
        # if the search strategy is None, random, second_order_gradient and so on,
        # the arch_parameters are filtered out of the weight-optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )

    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config.train.lr_step_size,
    #                                             gamma=config.train.lr_decay_gamma)
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=config.train.epoch_end, eta_min=config.train.w_lr_cosine_end)

    logger.info("\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+==")
    begin, end = config.train.epoch_begin, config.train.epoch_end
    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler, output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler, output_dir, logger)

    for epoch in range(begin, end):
        lr = scheduler.get_lr()[0]
        logger.info('==>time:({})--training...... current learning rate is {:.7f}'.format(
            datetime.datetime.now(), lr))
        if arg.distributed:
            train_sampler_dist.set_epoch(epoch)
            # valid_sampler_dist.set_epoch(epoch)

        train(epoch, train_queue, arch_queue, Arch, Search, criterion, optimizer,
              lr, search_strategy, output_dir, logger, config, arg)
        scheduler.step()

        if not arg.distributed or (arg.distributed and arg.local_rank == 0):
            eval_results = evaluate(Arch, valid_queue, config, output_dir)
            if use_multi_gpu:
                best = save_model(epoch, best, eval_results, Arch.module, optimizer,
                                  scheduler, output_dir, logger)
            else:
                best = save_model(epoch, best, eval_results, Arch, optimizer,
                                  scheduler, output_dir, logger)
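
The `filter_arch_parameters` helper used above is defined elsewhere in the repository and is not shown in this excerpt. A rough, hypothetical sketch of what such a filter could look like is given below; it assumes the model exposes its architecture parameters through a DARTS-style `arch_parameters()` method, which is an assumption, not something confirmed by the code above.

def filter_arch_parameters(model):
    """Hypothetical sketch (not the repo's definition): return only the weight
    parameters, excluding the architecture parameters, for the weight optimizer.
    Assumes a DARTS-style `arch_parameters()` accessor on the unwrapped model."""
    net = model.module if hasattr(model, 'module') else model  # unwrap DataParallel/DDP
    arch_param_ids = {id(p) for p in net.arch_parameters()}
    return [p for p in model.parameters() if id(p) not in arch_param_ids]
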
def train_main():
    args = parse_args()

    # directory for storing weights and other training related files
    training_starttime = datetime.now().strftime("%d_%m_%Y-%H_%M_%S-%f")
    ckpt_dir = os.path.join(args.results_dir, args.dataset,
                            f'checkpoints_{training_starttime}')
    os.makedirs(ckpt_dir, exist_ok=True)
    os.makedirs(os.path.join(ckpt_dir, 'confusion_matrices'), exist_ok=True)

    with open(os.path.join(ckpt_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    with open(os.path.join(ckpt_dir, 'argsv.txt'), 'w') as f:
        f.write(' '.join(sys.argv))
        f.write('\n')

    # when using multi scale supervision the label needs to be downsampled.
    label_downsampling_rates = [8, 16, 32]

    # data preparation ---------------------------------------------------------
    data_loaders = prepare_data(args, ckpt_dir)

    if args.valid_full_res:
        train_loader, valid_loader, valid_loader_full_res = data_loaders
    else:
        train_loader, valid_loader = data_loaders
        valid_loader_full_res = None

    cameras = train_loader.dataset.cameras
    n_classes_without_void = train_loader.dataset.n_classes_without_void
    if args.class_weighting != 'None':
        class_weighting = train_loader.dataset.compute_class_weights(
            weight_mode=args.class_weighting,
            c=args.c_for_logarithmic_weighting)
    else:
        class_weighting = np.ones(n_classes_without_void)

    # model building -----------------------------------------------------------
    model, device = build_model(args, n_classes=n_classes_without_void)

    if args.freeze > 0:
        print('Freeze everything but the output layer(s).')
        for name, param in model.named_parameters():
            if 'out' not in name:
                param.requires_grad = False

    # loss, optimizer, learning rate scheduler, csvlogger ----------------------
    # loss functions (only loss_function_train is really needed.
    # The other loss functions are just there to compare valid loss to
    # train loss)
    loss_function_train = \
        utils.CrossEntropyLoss2d(weight=class_weighting, device=device)

    pixel_sum_valid_data = valid_loader.dataset.compute_class_weights(
        weight_mode='linear')
    pixel_sum_valid_data_weighted = \
        np.sum(pixel_sum_valid_data * class_weighting)
    loss_function_valid = utils.CrossEntropyLoss2dForValidData(
        weight=class_weighting,
        weighted_pixel_sum=pixel_sum_valid_data_weighted,
        device=device)
    loss_function_valid_unweighted = \
        utils.CrossEntropyLoss2dForValidDataUnweighted(device=device)

    optimizer = get_optimizer(args, model)

    # in this script lr_scheduler.step() is only called once per epoch
    lr_scheduler = OneCycleLR(
        optimizer,
        max_lr=[i['lr'] for i in optimizer.param_groups],
        total_steps=args.epochs,
        div_factor=25,
        pct_start=0.1,
        anneal_strategy='cos',
        final_div_factor=1e4)

    # load checkpoint if parameter last_ckpt is provided
    if args.last_ckpt:
        ckpt_path = os.path.join(ckpt_dir, args.last_ckpt)
        epoch_last_ckpt, best_miou, best_miou_epoch = \
            load_ckpt(model, optimizer, ckpt_path, device)
        start_epoch = epoch_last_ckpt + 1
    else:
        start_epoch = 0
        best_miou = 0
        best_miou_epoch = 0

    valid_split = valid_loader.dataset.split

    # build the log keys for the csv log file and for the web logger
    log_keys = [f'mIoU_{valid_split}']
    if args.valid_full_res:
        log_keys.append(f'mIoU_{valid_split}_full-res')
        best_miou_full_res = 0

    log_keys_for_csv = log_keys.copy()
    # mIoU for each camera
    for camera in cameras:
        log_keys_for_csv.append(f'mIoU_{valid_split}_{camera}')
        if args.valid_full_res:
            log_keys_for_csv.append(f'mIoU_{valid_split}_full-res_{camera}')
    log_keys_for_csv.append('epoch')
    for i in range(len(lr_scheduler.get_lr())):
        log_keys_for_csv.append('lr_{}'.format(i))
    log_keys_for_csv.extend(['loss_train_total', 'loss_train_full_size'])
    for rate in label_downsampling_rates:
        log_keys_for_csv.append('loss_train_down_{}'.format(rate))
    log_keys_for_csv.extend(['time_training', 'time_validation',
                             'time_confusion_matrix', 'time_forward',
                             'time_post_processing', 'time_copy_to_gpu'])

    valid_names = [valid_split]
    if args.valid_full_res:
        valid_names.append(valid_split + '_full-res')

    for valid_name in valid_names:
        # iou for every class
        for i in range(n_classes_without_void):
            log_keys_for_csv.append(f'IoU_{valid_name}_class_{i}')
        log_keys_for_csv.append(f'loss_{valid_name}')
        if loss_function_valid_unweighted is not None:
            log_keys_for_csv.append(f'loss_{valid_name}_unweighted')

    csvlogger = CSVLogger(log_keys_for_csv,
                          os.path.join(ckpt_dir, 'logs.csv'),
                          append=True)

    # one confusion matrix per camera and one for whole valid data
    confusion_matrices = dict()
    for camera in cameras:
        confusion_matrices[camera] = \
            ConfusionMatrixTensorflow(n_classes_without_void)
    confusion_matrices['all'] = \
        ConfusionMatrixTensorflow(n_classes_without_void)

    # start training -----------------------------------------------------------
    for epoch in range(int(start_epoch), args.epochs):
        # unfreeze
        if args.freeze == epoch and args.finetune is None:
            print('Unfreezing')
            for param in model.parameters():
                param.requires_grad = True

        logs = train_one_epoch(
            model, train_loader, device, optimizer, loss_function_train,
            epoch, lr_scheduler, args.modality, label_downsampling_rates,
            debug_mode=args.debug)

        # validation after every epoch -----------------------------------------
        miou, logs = validate(
            model, valid_loader, device, cameras, confusion_matrices,
            args.modality, loss_function_valid, logs, ckpt_dir, epoch,
            loss_function_valid_unweighted, debug_mode=args.debug)

        if args.valid_full_res:
            miou_full_res, logs = validate(
                model, valid_loader_full_res, device, cameras,
                confusion_matrices, args.modality, loss_function_valid, logs,
                ckpt_dir, epoch, loss_function_valid_unweighted,
                add_log_key='_full-res', debug_mode=args.debug)

        logs.pop('time', None)
        csvlogger.write_logs(logs)

        # save weights
        print(miou['all'])
        save_current_checkpoint = False
        if miou['all'] > best_miou:
            best_miou = miou['all']
            best_miou_epoch = epoch
            save_current_checkpoint = True
        if args.valid_full_res and miou_full_res['all'] > best_miou_full_res:
            best_miou_full_res = miou_full_res['all']
            best_miou_full_res_epoch = epoch
            save_current_checkpoint = True

        # don't save weights for the first 10 epochs as mIoU is likely getting
        # better anyway
        if epoch >= 10 and save_current_checkpoint is True:
            save_ckpt(ckpt_dir, model, optimizer, epoch)

        # save / overwrite latest weights (useful for resuming training)
        save_ckpt_every_epoch(ckpt_dir, model, optimizer, epoch, best_miou,
                              best_miou_epoch)

    # write a finish file with the best miou values in order to overview the
    # training results quickly
    with open(os.path.join(ckpt_dir, 'finished.txt'), 'w') as f:
        f.write('best miou: {}\n'.format(best_miou))
        f.write('best miou epoch: {}\n'.format(best_miou_epoch))
        if args.valid_full_res:
            f.write(f'best miou full res: {best_miou_full_res}\n')
            f.write(f'best miou full res epoch: {best_miou_full_res_epoch}\n')

    print("Training completed")

# limitations under the License.
# ============================================================================
"""export file."""
import numpy as np

from mindspore import context, Tensor
from mindspore.train.serialization import export

from src.models import get_generator
from src.utils import get_args, load_ckpt

args = get_args("export")
context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)

if __name__ == '__main__':
    G_A = get_generator(args)
    G_B = get_generator(args)
    # Use BatchNorm2d with batchsize=1, affine=False, training=True instead of InstanceNorm2d
    # Use real mean and variance rather than moving_mean and moving_variance in BatchNorm2d
    G_A.set_train(True)
    G_B.set_train(True)
    load_ckpt(args, G_A, G_B)

    input_shp = [1, 3, args.image_size, args.image_size]
    input_array = Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32))
    G_A_file = f"{args.file_name}_BtoA"
    export(G_A, input_array, file_name=G_A_file, file_format=args.file_format)
    G_B_file = f"{args.file_name}_AtoB"
    export(G_B, input_array, file_name=G_B_file, file_format=args.file_format)

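
The BatchNorm-for-InstanceNorm comment above can be checked in isolation: with a batch of one, batch statistics reduce to per-image, per-channel statistics. Below is a minimal sanity-check sketch written against the PyTorch API purely for illustration (the snippets above use MindSpore); module names and shapes are chosen for the example only.

import torch
import torch.nn as nn

# a single image, so batch statistics coincide with instance statistics
x = torch.randn(1, 3, 64, 64)
bn = nn.BatchNorm2d(3, affine=False, track_running_stats=False).train()
inorm = nn.InstanceNorm2d(3, affine=False)
print(torch.allclose(bn(x), inorm(x), atol=1e-5))  # expected: True
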
def main(config):
    CASE_NUM = config['case_num']
    DATASET = config['dataset']
    NORMALIZATION = config['normalization']
    BATCH_SIZE = config['batch_size']
    MAX_EPOCH = config['max_epoch']
    OPTIM_TYPE = config['optimizer']
    LR = config['learning_rate']
    LR_STEP = config['lr_step']
    LR_DECAY = config['lr_decay']
    L2_DECAY = config['l2_decay']
    TB_STATE = config['use_tensorboard']
    MODEL_NAME = config['model_name']
    ALPHA = config['alpha']
    BETA = config['beta']
    GAMMA = config['gamma']
    PHI = config['phi']
    LOSS_FN = config['loss_fn']
    KERNEL_SIZE = config['kernel_size']

    result_dir = make_dir(RESULT_ROOT_DIR, str(CASE_NUM), overwrite=args.overwrite)
    ckpt_path = result_dir + '/' + 'checkpoint.pt'

    # =============================================== Select data and construct loaders
    data_fname, data_dim = select_data(DATASET)
    data_path = '../data/' + data_fname
    data_train = NLUDataset(data_path, mode='train', normalization=NORMALIZATION, random_seed=42)
    dataloader_train = DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    data_valid = NLUDataset(data_path, mode='valid', normalization=NORMALIZATION, random_seed=42)
    dataloader_valid = DataLoader(data_valid, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    data_test = NLUDataset(data_path, mode='test', normalization=NORMALIZATION, random_seed=42)
    dataloader_test = DataLoader(data_test, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    num_train_samples = data_train.__len__()
    classes = data_train.labels
    num_classes = len(classes)

    # =============================================== Initialize model and optimizer
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    if device == 'cuda':
        print('Using GPU, %s' % torch.cuda.get_device_name(0))
    net = select_model(MODEL_NAME, data_dim, KERNEL_SIZE, num_classes, ALPHA, BETA, PHI)
    net.to(device)
    loss_fn = select_loss(LOSS_FN)
    optimizer = select_optimizer(OPTIM_TYPE, net.parameters(), LR, L2_DECAY)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=LR_STEP, gamma=LR_DECAY)

    # =============================================== Train
    it = 0
    train_losses, valid_losses, valid_accs = {}, {}, {}
    best_validation_acc = 0
    log_term = 5
    for epoch in range(MAX_EPOCH):
        # ------------------------------------------------ One epoch start
        one_epoch_start = time.time()
        print('Epoch {} / Learning Rate: {:.0e}'.format(epoch, scheduler.get_lr()[0]))

        # ------------------------------------------------ Train
        train_losses, it, net, optimizer, scheduler = \
            train_1epoch(dataloader_train, device, train_losses, it, net,
                         loss_fn, optimizer, scheduler, log_every=log_term)

        # ------------------------------------------------ Validation
        valid_acc, valid_loss = evaluate(dataloader_valid, device, net, loss_fn)
        valid_losses[it] = valid_loss
        valid_accs[it] = valid_acc

        # ------------------------------------------------ Save model
        saved = ''
        if valid_acc > best_validation_acc:
            best_validation_acc = valid_acc
            saved = save_ckpt(ckpt_path, net, best_validation_acc)
        print('Epoch {} / Valid loss: {:.4f}, Valid acc: {:.4f} {}'.format(
            epoch, valid_loss, valid_acc, saved))

        # ------------------------------------------------ One epoch end
        curr_time = time.time()
        print("One epoch time = %.2f s" % (curr_time - one_epoch_start))
        print('#------------------------------------------------------#')

    save_train_log(result_dir, train_losses, valid_losses, valid_accs, best_validation_acc)

    # =============================================== Test
    net, best_validation_acc = load_ckpt(ckpt_path, net)
    test_acc, test_loss = evaluate(dataloader_test, device, net, loss_fn)
    return test_acc

def main():
    arg = args()
    if not os.path.exists(arg.exp_name):
        os.makedirs(arg.exp_name)
    assert arg.exp_name.split('/')[0] == 'o', \
        "'o' is the directory of experiment, --exp_name o/..."
    output_dir = arg.exp_name
    save_scripts_in_exp_dir(output_dir)
    logger = logging_set(output_dir)
    logger.info('\n================ experiment name:[{}] ===================\n'.format(arg.exp_name))

    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)

    config = edict(yaml.load(open(arg.cfg, 'r')))
    if arg.search:
        assert arg.search in ['None', 'sync', 'random', 'second_order_gradient', 'first_order_gradient']
        config.train.arch_search_strategy = arg.search
    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize
    config.num_workers = arg.num_workers

    print('GPU memory : \ntotal | used\n',
          os.popen('nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader').read())
    logger.info('------------------------------ configuration ---------------------------')
    logger.info('\n==> available {} GPUs , use numbers are {} device is {}\n'.format(
        torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"], torch.cuda.current_device()))
    # torch.cuda._initialized = True
    logger.info(pprint.pformat(config))
    logger.info('------------------------------- -------- ----------------------------')

    criterion = MSELoss()
    Arch = bulid_up_network(config, criterion)
    if config.train.arch_search_strategy == 'random':
        logger.info("==>random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)
        Arch.arch_parameters_random_search()
    if arg.param_flop:
        Arch._print_info()

    # dump_input = torch.rand((1, 3, 128, 128))
    # graph = SummaryWriter(output_dir + '/log')
    # graph.add_graph(Arch, (dump_input, ))

    if len(arg.gpu) > 1:
        use_multi_gpu = True
        Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        use_multi_gpu = False
        Arch = Arch.cuda()

    Search = Search_Arch(Arch.module, config) if use_multi_gpu else Search_Arch(Arch, config)  # Arch.module for nn.DataParallel
    search_strategy = config.train.arch_search_strategy
    train_queue, arch_queue, valid_queue = Dataloaders(search_strategy, config, arg)
    # Note: if the search strategy is `None` or `SYNC`, the arch_queue is None!
    logger.info("\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in ['first_order_gradient', 'random', 'None', 'second_order_gradient', 'sync']

    if search_strategy == 'sync':
        # arch_parameters is also registered among the model's parameters,
        # so the weight optimizer will also update the arch_parameters
        logger.info("sync: The arch_parameters is also optimized by the weight-optimizer synchronously")
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )
    else:
        # if the search strategy is None, random, second_order_gradient and so on,
        # the arch_parameters are filtered out of the weight-optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )

    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config.train.lr_step_size,
    #                                             gamma=config.train.lr_decay_gamma)
    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=config.train.epoch_end, eta_min=config.train.w_lr_cosine_end)

    # best_result
    best = 0
    logger.info("\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+==")
    begin, end = config.train.epoch_begin, config.train.epoch_end
    if arg.load_ckpt:
        if use_multi_gpu:
            begin, best = load_ckpt(Arch.module, optimizer, scheduler, output_dir, logger)
        else:
            begin, best = load_ckpt(Arch, optimizer, scheduler, output_dir, logger)

    for epoch in range(begin, end):
        lr = scheduler.get_lr()[0]
        logger.info('==>time:({})--training...... current learning rate is {:.7f}'.format(
            datetime.datetime.now(), lr))

        train(epoch, train_queue, arch_queue, Arch, Search, criterion, optimizer,
              lr, search_strategy, output_dir, logger, config, arg)
        scheduler.step()

        eval_results = evaluate(Arch, valid_queue, config, output_dir)
        if use_multi_gpu:
            best = save_model(epoch, best, eval_results, Arch.module, optimizer,
                              scheduler, output_dir, logger)
        else:
            best = save_model(epoch, best, eval_results, Arch, optimizer,
                              scheduler, output_dir, logger)

        ## visualize_heatamp
        if arg.visualize and epoch % 5 == 0:
            for i in range(len(valid_queue.dataset)):
                if valid_queue.dataset[i][1] != 185250:  # choose an image_id
                    continue
                print(valid_queue.dataset[i][1])
                sample = valid_queue.dataset[i]
                img = sample[0].unsqueeze(0)
                # samples = next(iter(valid_dataloader))
                # img = samples[0]
                output = Arch(img)
                print(img.size(), output.size())
                visualize_heatamp(img, output, 'heatmaps', show_img=False)
                break