def main(argv):
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-l', '--level', dest='level', type=int,
                            help='Provide number of the level to solve', default=0)
    arg_parser.add_argument('-v', '--verbose', dest='verbose', action="store_true",
                            help='Set verbosity level to DEBUG', default=False)
    args = arg_parser.parse_args(argv)
    logger = setup_logger(verbose=args.verbose)
    level = args.level
    _valid_level(level, logger)
    _solve(level, logger)
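# The snippets in this section all assume a project-level `setup_logger` helper
# whose signature varies per repo (a verbose flag here, a log path or a rank
# elsewhere). A minimal sketch of the flag-based variant used above, purely to
# illustrate the assumed behavior; it is not the original implementation:
import logging

def setup_logger(verbose=False):
    """Hypothetical helper: configure and return a console logger."""
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG if verbose else logging.INFO)
    if not logger.handlers:  # avoid adding duplicate handlers on repeated calls
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        logger.addHandler(handler)
    return logger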
def train(solved_score, population_size, elite_size, num_proc, log_video_rate):
    setup_logger()
    manager = mp.Manager()
    work_queue = manager.Queue()
    results_queue = manager.Queue()

    # Random Search 1st generation
    start_time = time.time()
    env = create_environment()
    population = create_population(env, population_size)
    print(population[0])
    elite, top_scores = get_top_performers_from_random_population(
        env, population, elite_size)
    elapsed_time = time.time() - start_time
    log_generation_stats(1, top_scores, elapsed_time)

    # 2nd -> inf generation: Mutate Top Performers (classic GA)
    ma_reward = 0
    spawn_processes(num_proc, work_fn=mutate_and_evaluate_task,
                    args=(elite, work_queue, results_queue))
    for generation in count(start=2, step=1):
        start_time = time.time()
        spawn_mutation_work(work_queue, elite_size, population_size)
        evaluated_population = collect_results(results_queue, size=population_size)
        top_scores = get_top_performers(evaluated_population, elite, elite_size)
        elapsed_time = time.time() - start_time
        if generation % log_video_rate == 0:
            record_evaluation_video(elite[0], env)
        log_generation_stats(generation, top_scores, elapsed_time)
        ma_reward = 0.7 * ma_reward + 0.3 * top_scores.mean()
        if ma_reward >= solved_score:
            print(f"Solved in {generation} generations")
            kill_processes(work_queue, num_proc)
            break
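# `spawn_processes`, `mutate_and_evaluate_task` and `kill_processes` are not shown
# above. A common implementation of this work-queue pattern shuts the pool down by
# posting one sentinel per worker; a minimal sketch under that assumption (the
# helper names and the `mutate_and_evaluate` call are illustrative, not the
# original code):
import multiprocessing as mp

_STOP = None  # sentinel value signalling a worker to exit

def spawn_processes(num_proc, work_fn, args):
    procs = [mp.Process(target=work_fn, args=args) for _ in range(num_proc)]
    for p in procs:
        p.start()
    return procs

def mutate_and_evaluate_task(elite, work_queue, results_queue):
    while True:
        item = work_queue.get()
        if item is _STOP:  # sentinel received: stop consuming work
            break
        results_queue.put(mutate_and_evaluate(elite, item))  # assumed project helper

def kill_processes(work_queue, num_proc):
    for _ in range(num_proc):  # one sentinel per worker process
        work_queue.put(_STOP)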
def __init__(self, url=None):
    self.soup = None
    self.url = url
    self.logger = setup_logger()
    self.logger.info('Solving %s located at %s' %
                     (self.__class__.__name__.lower(), url))
    cache_path = '/tmp/%s' % self.__class__.__name__
    self.logger.debug('Setting up HTTP cache: %s.sqlite' % cache_path)
    requests_cache.install_cache(cache_path, extension='.sqlite')
    self.logger.debug('Requesting HTTP and dumping to BeautifulSoup')
    self.requests_result = requests.get(self.url)
    self.soup = BeautifulSoup(self.requests_result.text)
def main(): logger = setup_logger("eval_stats", __file__, FLAGS.dir, filename="summary.log") lbs = [] for seed in range(1234, 1234 + 10): filename = os.path.join(FLAGS.dir, "eval.log.{}".format(seed)) with open(filename, "rb") as f: text = f.read().decode("utf-8") lb = float(text.strip().split("\n")[-1].split("=")[-1].strip()) logger.info(str(lb)) lbs.append(lb) logger.info("{}+-{}".format(np.mean(lbs), np.std(lbs)))
def main(model_path, backbone, scale, path, save_path, gpu_id):
    device = torch.device("cuda:" + str(gpu_id))
    logger = setup_logger(os.path.join(config.output_dir, 'test_log'))
    logger.info(config.print())
    if os.path.exists(save_path):
        shutil.rmtree(save_path, ignore_errors=True)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_img_folder = os.path.join(save_path, 'img')
    if not os.path.exists(save_img_folder):
        os.makedirs(save_img_folder)
    save_txt_folder = os.path.join(save_path, 'result')
    if not os.path.exists(save_txt_folder):
        os.makedirs(save_txt_folder)
    img_paths = [os.path.join(path, x) for x in os.listdir(path)]
    net = PSENet(backbone=backbone, pretrained=config.pretrained, result_num=config.n)
    model = Pytorch_model(model_path, net=net, scale=scale, gpu_id=gpu_id)
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:  # wrap once; the second, unconditional DataParallel wrap was redundant
        model = nn.DataParallel(model)
    model = model.to(device)
    recall, precision, f1 = merge_eval(model=model,
                                       save_path=os.path.join(config.output_dir, 'output'),
                                       test_path=config.testroot,
                                       device=device,
                                       base_path=config.base_path,
                                       use_sub=config.use_sub)
    logger.info('test: recall: {:.6f}, precision: {:.6f}, f1: {:.6f}'.format(
        recall, precision, f1))
    # total_frame = 0.0
    # total_time = 0.0
    # for img_path in tqdm(img_paths):
    #     img_name = os.path.basename(img_path).split('.')[0]
    #     save_name = os.path.join(save_txt_folder, 'res_' + img_name + '.txt')
    #     _, boxes_list, t = model.predict(img_path)
    #     total_frame += 1
    #     total_time += t
    #     # img = draw_bbox(img_path, boxes_list, color=(0, 0, 255))
    #     # cv2.imwrite(os.path.join(save_img_folder, '{}.jpg'.format(img_name)), img)
    #     np.savetxt(save_name, boxes_list.reshape(-1, 8), delimiter=',', fmt='%d')
    # print('fps:{}'.format(total_frame / total_time))
    return save_txt_folder
def main():
    config_path = sys.argv[1]
    opt = util.load_yaml(config_path)

    if opt['path']['resume_state']:  # resuming training
        resume_state = torch.load(opt['path']['resume_state'])
    else:
        resume_state = None

    util.mkdir(opt['path']['log'])
    util.setup_logger(None, opt['path']['log'], 'train', level=logging.INFO, screen=True)
    util.setup_logger('val', opt['path']['log'], 'val', level=logging.INFO)
    logger = logging.getLogger('base')
    set_random_seed(0)

    # tensorboard log
    writer = SummaryWriter(log_dir=opt['path']['tb_logger'])

    torch.backends.cudnn.benchmark = True  # fixed typo: was "benckmark", which silently did nothing

    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = data.create_dataset(dataset_opt, phase)
            train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size']))
            logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                len(train_set), train_size))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                total_epochs, total_iters))
            train_loader = data.create_dataloader(train_set, dataset_opt, phase)
        elif phase == 'valid':
            val_set = data.create_dataset(dataset_opt, phase)
            val_loader = data.create_dataloader(val_set, dataset_opt, phase)
            logger.info('Number of validation images in [{:s}]: {:d}'.format(
                dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None

    # create model
    model = Model(opt)

    # resume training
    if resume_state:
        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.load_model(current_step)
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    # training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs):
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break
            # update learning rate
            model.update_learning_rate()

            # training
            model.train(train_data, current_step)

            # log
            if current_step % opt['train']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> '.format(
                    epoch, current_step, model.get_current_learning_rate())
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    writer.add_scalar(k, v, current_step)
                logger.info(message)

            if current_step % opt['train']['val_freq'] == 0:
                psnr, ssim = model.validate(val_loader, current_step)
                # log
                logger.info('# Validation # PSNR: {:.4e} SSIM: {:.4e}'.format(psnr, ssim))
                logger_val = logging.getLogger('val')  # validation logger
                logger_val.info('<epoch:{:3d}, iter:{:8,d}> psnr: {:.4e} ssim: {:.4e}'.format(
                    epoch, current_step, psnr, ssim))
                # tensorboard logger
                writer.add_scalar('VAL_PSNR', psnr, current_step)
                writer.add_scalar('VAL_SSIM', ssim, current_step)

            # save models and training states
            if current_step % opt['train']['save_step'] == 0:
                logger.info('Saving models and training states.')
                model.save_model(epoch, current_step)
import datetime as DT
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

import utils.utils as utils

# TODO: functions with calc_daily mean and stuff.. are all almost the same.. generalize.
# TODO: need to remove outliers before calculating mean.

pd.set_option('display.max_rows', 200000)  # so pandas prints more rows

# --- logging - always cleans the log when importing and executing this file
import logging
utils.setup_logger('logger_clean', r'logs/clean.log')
logger = logging.getLogger('logger_clean')

# --- global variables
# Global variables were used because of the .apply() function used inside
# calc_stats; otherwise these two would have been normal parameters.
# They receive their values inside 'main()'.
global start_year
global end_year


# --- START Functions
def clean_datetime_site_daily(df):
    """
    Clears data in the datetime and site fields of a given pandas dataframe.
    site becomes: 'site', containing only the site code
    datetime becomes: 'date', containing only year-month-day
    value is repeated.
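# The docstring above is cut off here. As a sketch only, the transformation it
# describes could look like the following, assuming raw columns named 'site',
# 'datetime' and 'value' (the column names are assumptions, not taken from the
# original file):
def _clean_datetime_site_daily_sketch(df):
    out = pd.DataFrame()
    out['site'] = df['site'].astype(str).str.extract(r'(\w+)', expand=False)  # keep only the site code
    out['date'] = pd.to_datetime(df['datetime']).dt.date                      # keep only year-month-day
    out['value'] = df['value']                                                 # value is repeated as-is
    return out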
from __future__ import print_function

from pdnn.run_DNN import run_DNN
from pdnn.run_RBM import run_RBM
from pdnn.run_SDA import run_SDA
from pdnn.eval_DNN import eval_DNN
import json
from utils.utils import setup_logger

MNIST_CONF = json.load(open("configs/unittest_mnist.json"))
MAX_ITERS = 2
setup_logger(None)


def banner(s):
    print("***********************" + s + "*************************")


def test_hi():
    print("hi")


def test_rbm_dnn():
    banner("rbm dnn")
    mnist_conf = MNIST_CONF.copy()
    mnist_conf["train_rbm"]["max_iters"] = MAX_ITERS
    run_RBM(mnist_conf)
    mnist_conf["train_dnn"]["max_iters"] = MAX_ITERS
    mnist_conf["init_dnn"] = {
        "filename": "temp/rbm/final.nnet",
def main(): tf.set_random_seed(1234) np.random.seed(1234) # Load celebA data_path = os.path.join('data', 'celebA', 'img_align_celeba.zip') celeba = dataset.CelebADataset(data_path) x = tf.placeholder(tf.float32, shape=[None] + celeba.data_dims, name='x') n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles') n = tf.shape(x)[0] def log_joint(observed): model, _ = vae(observed, n, n_z, n_particles) log_pz, log_px_z = model.local_log_prob(['z', 'x']) return log_pz + log_px_z variational = q_net(x, n_z, n_particles) qz_samples, log_qz = variational.query('z', outputs=True, local_log_prob=True) lower_bound = zs.variational.elbo(log_joint, observed={'x': x}, latent={'z': [qz_samples, log_qz]}, axis=0) cost = tf.reduce_mean(lower_bound.sgvb()) lower_bound = tf.reduce_mean(lower_bound) model, _ = vae({'z': qz_samples}, n, n_z, n_particles) log_pz = model.local_log_prob('z') kl_term = tf.reduce_mean(log_qz - log_pz) # cost = kl_term optimizer = tf.train.AdamOptimizer(3e-4) infer_op = optimizer.minimize(cost) # Generate images n_gen = 100 _, x_mean = vae({}, n_gen, n_z, None) x_gen = tf.reshape(x_mean, [-1] + celeba.data_dims) # Interpolation # [n, n_z] x_start = x[:8] x_end = x[8:16] z_start = qz_samples[0, :8, :] z_end = qz_samples[0, 8:16, :] # [1, 8, 1] alpha = tf.reshape(tf.linspace(0., 1., 8), [1, 8, 1]) # [n, 1, n_z] z_start = tf.expand_dims(z_start, 1) z_end = tf.expand_dims(z_end, 1) # [n, 8, n_z] z_interp = alpha * z_start + (1. - alpha) * z_end z_interp = tf.reshape(z_interp, [-1, n_z]) _, x_interp = vae({'z': z_interp}, 64, n_z, None) x_interp = tf.reshape(x_interp, [-1] + celeba.data_dims) # Define training parameters epochs = 25 batch_size = 64 iters = celeba.train_size // batch_size save_image_freq = 1 print_freq = 100 save_model_freq = 5 test_freq = 1 test_batch_size = 500 test_iters = celeba.test_size // test_batch_size result_path = "results/vae_celeba_" + time.strftime("%Y%m%d_%H%M%S") saver = tf.train.Saver(max_to_keep=10) logger = setup_logger('vae_celeba', __file__, result_path) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) # Restore from the latest checkpoint ckpt_file = tf.train.latest_checkpoint(result_path) begin_epoch = 1 if ckpt_file is not None: logger.info('Restoring model from {}...'.format(ckpt_file)) begin_epoch = int(ckpt_file.split('.')[-2]) + 1 saver.restore(sess, ckpt_file) for epoch in range(begin_epoch, epochs + 1): lbs = [] kls = [] time_iters = [] for t in range(1, iters + 1): time_iter = -time.time() x_batch = celeba.next_batch(batch_size) _, lb, kl = sess.run([infer_op, lower_bound, kl_term], feed_dict={ x: x_batch, n_particles: 1 }) # logger.info('Iter {}: lb = {}'.format(t, lb)) lbs.append(lb) kls.append(kl) time_iter += time.time() time_iters.append(time_iter) if t % print_freq == 0: logger.info( 'Epoch={} Iter={} ({}s): lb = {}, kl = {}'.format( epoch, t, np.mean(time_iters), np.mean(lbs[-print_freq:]), np.mean(kls[-print_freq:]))) time_iters = [] logger.info('>> Epoch {}: Lower bound = {}, kl = {}'.format( epoch, np.mean(lbs), np.mean(kls))) interp_images = [] start_images = [] end_images = [] if epoch % test_freq == 0: time_test = -time.time() test_lbs = [] for t in range(test_iters): test_x_batch = celeba.next_test_batch(test_batch_size) test_lb, interp_image, start_image, end_image = sess.run( [lower_bound, x_interp, x_start, x_end], feed_dict={ x: test_x_batch, n_particles: 1 }) test_lbs.append(test_lb) interp_images.append(interp_image) start_images.append(start_image) end_images.append(end_image) time_test 
+= time.time() logger.info('>>> TEST ({:.1f}s)'.format(time_test)) logger.info('>> Test lower bound = {}'.format( np.mean(test_lbs))) logger.info('Saving interpolations...') interp_name = os.path.join(result_path, "interp.epoch.{}.png".format(epoch)) save_image_collections(interp_images[0], interp_name, scale_each=True, shape=(8, 8)) if epoch == 1: save_image_collections(start_images[0], interp_name + ".start.png", scale_each=True, shape=(8, 1)) save_image_collections(end_images[0], interp_name + ".end.png", scale_each=True, shape=(8, 1)) if epoch % save_image_freq == 0: logger.info('Saving images...') images = sess.run(x_gen) name = os.path.join(result_path, "vae.epoch.{}.png".format(epoch)) save_image_collections(images, name, scale_each=True) if epoch % save_model_freq == 0: logger.info('Saving model...') save_path = os.path.join(result_path, "vae.epoch.{}.ckpt".format(epoch)) if not os.path.exists(os.path.dirname(save_path)): os.makedirs(os.path.dirname(save_path)) saver.save(sess, save_path) logger.info('Done')
    image_name = img_path.split('/')[-1].split('.')[0]
    write_result_as_txt(image_name, boxes_list, save_path)

    # recall precision f1
    gt_path = os.path.join(test_path, 'gt/Test')
    fid_path = os.path.join(workspace, 'res_tt.txt')
    shutil.rmtree(fid_path, ignore_errors=True)
    precision, recall, hmean = evl_totaltext(save_path, gt_path, fid_path)
    # f_score_new = getresult(save_path, config.gt_name)
    return precision, recall, hmean


if __name__ == "__main__":
    config.workspace = os.path.join(config.workspace_dir, config.exp_name)
    logger = setup_logger(os.path.join(config.workspace, 'test_log'))
    logger.info(config.print())
    # best_save_path = '{}/Best_model_0.632154.pth'.format(config.workspace)
    best_save_path = "/data/glusterfs_cv_04/11121171/CVPR_Text/PSENet_file/Total_Text/Best_model_0.787389.pth"
    # writer = SummaryWriter(config.output_dir)
    model = PSENet(backbone=config.backbone,
                   pretrained=config.pretrained,
                   result_num=config.kernel_num,
                   scale=config.scale)
    num_gpus = torch.cuda.device_count()
    device = torch.device("cuda:0")
    # if num_gpus > 1:
    #     model = nn.DataParallel(model)
    model = model.to(device)
ais = AIS(ais_log_prior, log_joint, {'z': pz_samples}, hmc,
          observed={'x': x_obs}, latent={'z': z},
          n_chains=test_n_chains,
          n_temperatures=test_n_temperatures)

model_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="model")
variational_var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="variational")
saver = tf.train.Saver(max_to_keep=10,
                       var_list=model_var_list + variational_var_list)
logger = setup_logger("vae_eval", __file__, result_path,
                      filename="eval.log.{}".format(seed))

# Run the evaluation
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Restore from the latest checkpoint
    ckpt_file = tf.train.latest_checkpoint(result_path)
    if ckpt_file is not None:
        logger.info('Restoring model from {}...'.format(ckpt_file))
        saver.restore(sess, ckpt_file)

    # AIS evaluation
    logger.info('Start evaluation...')
    time_ais = -time.time()
from pdnn.run_DNN import run_DNN
from pdnn.run_RBM import run_RBM
from pdnn.run_SDA import run_SDA
from pdnn.eval_DNN import eval_DNN
import json
from utils.utils import setup_logger

MNIST_CONF = json.load(open("configs/unittest_mnist.json"))
MAX_ITERS = 2
setup_logger(None)


def banner(s):
    print("***********************" + s + "*************************")


def test_hi():
    print("hi")


def test_rbm_dnn():
    banner("rbm dnn")
    mnist_conf = MNIST_CONF.copy()
    mnist_conf["train_rbm"]["max_iters"] = MAX_ITERS
    run_RBM(mnist_conf)
    mnist_conf["train_dnn"]["max_iters"] = MAX_ITERS
    mnist_conf["init_dnn"] = {
        "filename": "temp/rbm/final.nnet",
        "num_hidden_layers": -1,
        "with_final": 1
    }
    run_DNN(mnist_conf)
def main_worker(gpus, ngpus_per_node, args, final_output_dir, tb_log_dir): # cudnn related setting cudnn.benchmark = cfg.CUDNN.BENCHMARK torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED #os.environ['CUDA_VISIBLE_DEVICES']=gpus # Parallel setting print("Use GPU: {} for training".format(gpus)) update_config(cfg, args) #test(cfg, args) # logger setting logger, _ = setup_logger(final_output_dir, args.rank, 'train') writer_dict = { 'writer': SummaryWriter(log_dir=tb_log_dir), 'train_global_steps': 0, 'valid_global_steps': 0, } # model initilization model = eval(cfg.MODEL.NAME + '.get_pose_net')(cfg, is_train=True) # load pretrained model before DDP initialization checkpoint_file = os.path.join(final_output_dir, 'model_best.pth.tar') if cfg.AUTO_RESUME: if os.path.exists(checkpoint_file): checkpoint = torch.load(checkpoint_file, map_location='cpu') state_dict = checkpoint['state_dict'] for key in list(state_dict.keys()): new_key = key.replace("module.", "") state_dict[new_key] = state_dict.pop(key) model.load_state_dict(state_dict) logger.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_file, checkpoint['epoch'])) elif cfg.MODEL.HRNET_PRETRAINED: logger.info("=> loading a pretrained model '{}'".format( cfg.MODEL.PRETRAINED)) checkpoint = torch.load(cfg.MODEL.HRNET_PRETRAINED, map_location='cpu') state_dict = checkpoint['state_dict'] for key in list(state_dict.keys()): new_key = key.replace("module.", "") state_dict[new_key] = state_dict.pop(key) model.load_state_dict(state_dict) # copy model file this_dir = os.path.dirname(__file__) shutil.copy2( os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'), final_output_dir) # copy configuration file config_dir = args.cfg shutil.copy2(os.path.join(args.cfg), final_output_dir) # calculate GFLOPS dump_input = torch.rand( (1, 3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0])) logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE)) #ops, params = get_model_complexity_info( # model, (3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0]), # as_strings=True, print_per_layer_stat=True, verbose=True) # FP16 SETTING if cfg.FP16.ENABLED: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if cfg.FP16.STATIC_LOSS_SCALE != 1.0: if not cfg.FP16.ENABLED: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) if cfg.FP16.ENABLED: model = network_to_half(model) if cfg.MODEL.SYNC_BN and not cfg.cfg.DISTRIBUTED: print( 'Warning: Sync BatchNorm is only supported in distributed training.' ) # Distributed Computing master = True if cfg.DISTRIBUTED: # This block is not available args.local_rank += int(gpus[0]) print('This process is using GPU', args.local_rank) device = args.local_rank master = device == int(gpus[0]) dist.init_process_group(backend='nccl') if cfg.MODEL.SYNC_BN: model = nn.SyncBatchNorm.convert_sync_batchnorm(model) # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. 
if gpus is not None: torch.cuda.set_device(device) model.cuda(device) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have # workers = int(workers / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[device], output_device=device, find_unused_parameters=True) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) else: # implement this block gpu_ids = eval('[' + gpus + ']') device = gpu_ids[0] print('This process is using GPU', str(device)) model = torch.nn.DataParallel(model, gpu_ids).cuda(device) # Prepare loss functions criterion = {} if cfg.LOSS.WITH_HEATMAP_LOSS: criterion['heatmap_loss'] = HeatmapLoss().cuda() if cfg.LOSS.WITH_POSE2D_LOSS: criterion['pose2d_loss'] = JointsMSELoss().cuda() if cfg.LOSS.WITH_BONE_LOSS: criterion['bone_loss'] = BoneLengthLoss().cuda() if cfg.LOSS.WITH_JOINTANGLE_LOSS: criterion['jointangle_loss'] = JointAngleLoss().cuda() best_perf = 1e9 best_model = False last_epoch = -1 # optimizer must be initilized after model initilization optimizer = get_optimizer(cfg, model) if cfg.FP16.ENABLED: optimizer = FP16_Optimizer( optimizer, static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE, dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE, verbose=False) begin_epoch = cfg.TRAIN.BEGIN_EPOCH if not cfg.AUTO_RESUME and cfg.MODEL.HRNET_PRETRAINED: optimizer.load_state_dict(checkpoint['optimizer']) if cfg.AUTO_RESUME and os.path.exists(checkpoint_file): begin_epoch = checkpoint['epoch'] best_perf = checkpoint['loss'] optimizer.load_state_dict(checkpoint['optimizer']) if 'train_global_steps' in checkpoint.keys() and \ 'valid_global_steps' in checkpoint.keys(): writer_dict['train_global_steps'] = checkpoint[ 'train_global_steps'] writer_dict['valid_global_steps'] = checkpoint[ 'valid_global_steps'] if cfg.FP16.ENABLED: logger.info("=> Using FP16 mode") lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, last_epoch=begin_epoch) elif cfg.TRAIN.LR_SCHEDULE == 'warmup': from utils.utils import get_linear_schedule_with_warmup lr_scheduler = get_linear_schedule_with_warmup( optimizer=optimizer, num_warmup_steps=cfg.TRAIN.WARMUP_EPOCHS, num_training_steps=cfg.TRAIN.END_EPOCH - cfg.TRAIN.BEGIN_EPOCH, last_epoch=begin_epoch) elif cfg.TRAIN.LR_SCHEDULE == 'multi_step': lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, last_epoch=begin_epoch) else: print('Unknown learning rate schedule!') exit() # Data loading code train_loader_dict = make_dataloader(cfg, is_train=True, distributed=cfg.DISTRIBUTED) valid_loader_dict = make_dataloader(cfg, is_train=False, distributed=cfg.DISTRIBUTED) for i, (dataset_name, train_loader) in enumerate(train_loader_dict.items()): logger.info( 'Training Loader {}/{}:\n'.format(i + 1, len(train_loader_dict)) + str(train_loader.dataset)) for i, (dataset_name, valid_loader) in enumerate(valid_loader_dict.items()): logger.info('Validation Loader {}/{}:\n'.format( i + 1, len(valid_loader_dict)) + str(valid_loader.dataset)) #writer_dict['writer'].add_graph(model, (dump_input, )) """ Start training """ start_time = time.time() with torch.autograd.set_detect_anomaly(True): for epoch in range(begin_epoch + 1, cfg.TRAIN.END_EPOCH + 1): epoch_start_time = time.time() # shuffle datasets with the 
sample random seed if cfg.DISTRIBUTED: for data_loader in train_loader_dict.values(): data_loader.sampler.set_epoch(epoch) # train for one epoch # get_last_lr() returns a list logger.info('Start training [{}/{}] lr: {:.4e}'.format( epoch, cfg.TRAIN.END_EPOCH - cfg.TRAIN.BEGIN_EPOCH, lr_scheduler.get_last_lr()[0])) train(cfg, args, master, train_loader_dict, model, criterion, optimizer, epoch, final_output_dir, tb_log_dir, writer_dict, logger, fp16=cfg.FP16.ENABLED, device=device) # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`. lr_scheduler.step() # evaluate on validation set if not cfg.WITHOUT_EVAL: logger.info('Start evaluating [{}/{}]'.format( epoch, cfg.TRAIN.END_EPOCH - 1)) with torch.no_grad(): recorder = validate(cfg, args, master, valid_loader_dict, model, criterion, final_output_dir, tb_log_dir, writer_dict, logger, device=device) val_total_loss = recorder.avg_total_loss best_model = False if val_total_loss < best_perf: logger.info( 'This epoch yielded a better model with total loss {:.4f} < {:.4f}.' .format(val_total_loss, best_perf)) best_perf = val_total_loss best_model = True else: val_total_loss = 0 best_model = True if master: logger.info( '=> saving checkpoint to {}'.format(final_output_dir)) save_checkpoint( { 'epoch': epoch, 'model': cfg.EXP_NAME + '.' + cfg.MODEL.NAME, 'state_dict': model.state_dict(), 'loss': val_total_loss, 'optimizer': optimizer.state_dict(), 'train_global_steps': writer_dict['train_global_steps'], 'valid_global_steps': writer_dict['valid_global_steps'] }, best_model, final_output_dir) print('\nEpoch {} spent {:.2f} hours\n'.format( epoch, (time.time() - epoch_start_time) / 3600)) #if epoch == 3:break if master: final_model_state_file = os.path.join( final_output_dir, 'final_state{}.pth.tar'.format(gpus)) logger.info( '=> saving final model state to {}'.format(final_model_state_file)) torch.save(model.state_dict(), final_model_state_file) writer_dict['writer'].close() print( '\n[Training Accomplished] {} epochs spent {:.2f} hours\n'.format( cfg.TRAIN.END_EPOCH - begin_epoch + 1, (time.time() - start_time) / 3600))
def __init__(self, config, model, criterion, metric_cls, train_loader, validate_loader, post_process=None): config['trainer']['output_dir'] = os.path.join( str(pathlib.Path(os.path.abspath(__name__)).parent), config['trainer']['output_dir']) config['name'] = config['name'] + '_' + model.name self.save_dir = os.path.join(config['trainer']['output_dir'], config['name']) self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint') if config['trainer']['resume_checkpoint'] == '' and config['trainer'][ 'finetune_checkpoint'] == '': shutil.rmtree(self.save_dir, ignore_errors=True) if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) self.global_step = 0 self.start_epoch = 0 self.config = config self.model = model self.criterion = criterion self.metric_cls = metric_cls # logger and tensorboard self.epochs = self.config['trainer']['epochs'] self.log_iter = self.config['trainer']['log_iter'] self.tensorboard_enable = self.config['trainer']['tensorboard'] if config['local_rank'] == 0: anyconfig.dump(config, os.path.join(self.save_dir, 'config.yaml')) self.logger = setup_logger(os.path.join(self.save_dir, 'train.log')) self.logger_info(pformat(self.config)) # device torch.manual_seed(self.config['trainer']['seed']) # 为CPU设置随机种子 if torch.cuda.device_count() > 0 and torch.cuda.is_available(): self.with_cuda = True torch.backends.cudnn.benchmark = True self.device = torch.device("cuda") torch.cuda.manual_seed( self.config['trainer']['seed']) # 为当前GPU设置随机种子 torch.cuda.manual_seed_all( self.config['trainer']['seed']) # 为所有GPU设置随机种子 else: self.with_cuda = False self.device = torch.device("cpu") self.logger_info('train with device {} and pytorch {}'.format( self.device, torch.__version__)) self.optimizer = self._initialize('optimizer', torch.optim, model.parameters()) # resume or finetune if self.config['trainer']['resume_checkpoint'] != '': self._load_checkpoint(self.config['trainer']['resume_checkpoint'], resume=True) elif self.config['trainer']['finetune_checkpoint'] != '': self._load_checkpoint( self.config['trainer']['finetune_checkpoint'], resume=False) if self.config['lr_scheduler']['type'] != 'WarmupPolyLR': self.scheduler = self._initialize('lr_scheduler', torch.optim.lr_scheduler, self.optimizer) self.metrics = { 'recall': 0, 'precision': 0, 'hmean': 0, 'train_loss': float('inf'), 'best_model_epoch': 0 } self.model.to(self.device) # 分布式训练 if torch.cuda.device_count() > 1: local_rank = config['local_rank'] self.model = torch.nn.parallel.DistributedDataParallel( self.model, device_ids=[local_rank], output_device=local_rank, broadcast_buffers=False, find_unused_parameters=True) self.show_images_iter = self.config['trainer']['show_images_iter'] self.train_loader = train_loader if validate_loader is not None: assert post_process is not None self.validate_loader = validate_loader self.post_process = post_process self.train_loader_len = len(train_loader) if self.config['lr_scheduler']['type'] == 'WarmupPolyLR': warmup_iters = config['lr_scheduler']['args'][ 'warmup_epoch'] * self.train_loader_len if self.start_epoch > 1: self.config['lr_scheduler']['args']['last_epoch'] = ( self.start_epoch - 1) * self.train_loader_len self.scheduler = WarmupPolyLR(self.optimizer, max_iters=self.epochs * self.train_loader_len, warmup_iters=warmup_iters, **config['lr_scheduler']['args']) if self.validate_loader is not None: self.logger_info( 'train dataset has {} samples,{} in dataloader, validate dataset has {} samples,{} in dataloader' .format(len(self.train_loader.dataset), 
self.train_loader_len, len(self.validate_loader.dataset), len(self.validate_loader))) else: self.logger_info( 'train dataset has {} samples,{} in dataloader'.format( len(self.train_loader.dataset), self.train_loader_len)) if self.tensorboard_enable and config['local_rank'] == 0: from torch.utils.tensorboard import SummaryWriter self.writer = SummaryWriter(self.save_dir) try: dummy_input = torch.zeros(1, 3, 640, 640).to(self.device) self.writer.add_graph(self.model, dummy_input) torch.cuda.empty_cache() except: import traceback self.logger.error(traceback.format_exc()) self.logger.warn('add graph to tensorboard failed')
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, plot_importance
from sklearn.preprocessing import MinMaxScaler

# np.set_printoptions(threshold=np.nan)
pd.set_option('display.max_rows', 200000)

# --- logging - always cleans the log when importing and executing this file
import logging
utils.setup_logger('logger_feat_extract', r'logs/feat_extract.log')
logger = logging.getLogger('logger_feat_extract')

# --- measuring time
import time

# --- global variables
global start_year
global end_year


def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
""" This file constains methods to interpolate and handle with some outliers. """ # --- general imports import datetime as DT import matplotlib as plt import numpy as np import os import pandas as pd import utils.utils as utils pd.set_option('display.max_rows', 200000) # --- logging - always cleans the log when importing and executing this file import logging utils.setup_logger('logger_refine', r'logs/refine.log') logger = logging.getLogger('logger_refine') # --- global variables global start_year global end_year # --- START Functions def handle_outliers(df): """ There are no silver bullets for this issue... TODO: maybe outliers should be handled before calculating the averages* Removes negative values from readings that should not have negative values """ logger.warning('Removing outliers (negative values and putting 0)')
def main_worker(gpu, ngpus_per_node, args, final_output_dir, tb_log_dir): args.gpu = gpu args.rank = args.rank * ngpus_per_node + gpu print('Init process group: dist_url: {}, world_size: {}, rank: {}'.format(cfg.DIST_URL, args.world_size, args.rank)) dist.init_process_group(backend=cfg.DIST_BACKEND, init_method=cfg.DIST_URL, world_size=args.world_size, rank=args.rank) update_config(cfg, args) # setup logger logger, _ = setup_logger(final_output_dir, args.rank, 'train') model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')(cfg, is_train=True) logger.info(get_model_summary(model, torch.zeros(1, 3, *cfg.MODEL.IMAGE_SIZE))) # copy model file if not cfg.MULTIPROCESSING_DISTRIBUTED or (cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0): this_dir = os.path.dirname(__file__) shutil.copy2(os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'), final_output_dir) writer_dict = { 'writer': SummaryWriter(log_dir=tb_log_dir), 'train_global_steps': 0, 'valid_global_steps': 0, } if not cfg.MULTIPROCESSING_DISTRIBUTED or (cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0): dump_input = torch.rand((1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0])) writer_dict['writer'].add_graph(model, (dump_input, )) # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE)) if cfg.MODEL.SYNC_BN: model = nn.SyncBatchNorm.convert_sync_batchnorm(model) torch.cuda.set_device(args.gpu) model.cuda(args.gpu) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) # define loss function (criterion) and optimizer criterion = JointsMSELoss(use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda(args.gpu) # Data loading code train_dataset = eval('dataset.'+cfg.DATASET.DATASET)( cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True, transforms.Compose([ transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ]) ) valid_dataset = eval('dataset.'+cfg.DATASET.DATASET)( cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False, transforms.Compose([ transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ]) ) train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU*len(cfg.GPUS), shuffle=(train_sampler is None), num_workers=cfg.WORKERS, pin_memory=cfg.PIN_MEMORY, sampler=train_sampler ) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=cfg.TEST.BATCH_SIZE_PER_GPU*len(cfg.GPUS), shuffle=False, num_workers=cfg.WORKERS, pin_memory=cfg.PIN_MEMORY ) logger.info(train_loader.dataset) best_perf = -1 best_model = False last_epoch = -1 optimizer = get_optimizer(cfg, model) begin_epoch = cfg.TRAIN.BEGIN_EPOCH checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth') if cfg.AUTO_RESUME and os.path.exists(checkpoint_file): logger.info("=> loading checkpoint '{}'".format(checkpoint_file)) checkpoint = torch.load(checkpoint_file) begin_epoch = checkpoint['epoch'] best_perf = checkpoint['perf'] last_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger.info("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_file, checkpoint['epoch'])) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR, last_epoch=last_epoch) for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH): # train for one epoch train(cfg, train_loader, 
model, criterion, optimizer, epoch, final_output_dir, tb_log_dir, writer_dict) # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`. lr_scheduler.step() # evaluate on validation set perf_indicator = validate( args, cfg, valid_loader, valid_dataset, model, criterion, final_output_dir, tb_log_dir, writer_dict ) if perf_indicator >= best_perf: best_perf = perf_indicator best_model = True else: best_model = False if not cfg.MULTIPROCESSING_DISTRIBUTED or ( cfg.MULTIPROCESSING_DISTRIBUTED and args.rank == 0 ): logger.info('=> saving checkpoint to {}'.format(final_output_dir)) save_checkpoint({ 'epoch': epoch + 1, 'model': cfg.MODEL.NAME, 'state_dict': model.state_dict(), 'best_state_dict': model.module.state_dict(), 'perf': perf_indicator, 'optimizer': optimizer.state_dict(), }, best_model, final_output_dir) final_model_state_file = os.path.join( final_output_dir, 'final_state{}.pth.tar'.format(gpu) ) logger.info('saving final model state to {}'.format( final_model_state_file)) torch.save(model.module.state_dict(), final_model_state_file) writer_dict['writer'].close()
def main(): config.workspace = os.path.join(config.workspace_dir, config.exp_name) if config.restart_training: shutil.rmtree(config.workspace, ignore_errors=True) if not os.path.exists(config.workspace): os.makedirs(config.workspace) shutil.rmtree(os.path.join(config.workspace, 'train_log'), ignore_errors=True) logger = setup_logger(os.path.join(config.workspace, 'train_log')) logger.info(config.print()) torch.manual_seed(config.seed) # 为CPU设置随机种子 if config.gpu_id is not None and torch.cuda.is_available(): torch.backends.cudnn.benchmark = True logger.info('train with gpu {} and pytorch {}'.format( config.gpu_id, torch.__version__)) device = torch.device("cuda:0") torch.cuda.manual_seed(config.seed) # 为当前GPU设置随机种子 torch.cuda.manual_seed_all(config.seed) # 为所有GPU设置随机种子 else: logger.info('train with cpu and pytorch {}'.format(torch.__version__)) device = torch.device("cpu") train_data = ICDAR17(config.trainroot, data_shape=config.data_shape, n=config.kernel_num, m=config.min_scale) train_loader = Data.DataLoader(dataset=train_data, batch_size=config.train_batch_size, shuffle=True, num_workers=int(config.workers)) # writer = SummaryWriter(config.output_dir) model = PSENet(backbone=config.backbone, pretrained=config.pretrained, result_num=config.kernel_num, scale=config.scale) if not config.pretrained and not config.restart_training: model.apply(weights_init) num_gpus = torch.cuda.device_count() if num_gpus > 1: model = nn.DataParallel(model) model = model.to(device) criterion = PSELoss(Lambda=config.Lambda, ratio=config.OHEM_ratio, reduction='mean') # optimizer = torch.optim.SGD(models.parameters(), lr=config.lr, momentum=0.99) optimizer = torch.optim.Adam(model.parameters(), lr=config.lr) if config.checkpoint != '' and not config.restart_training: start_epoch = load_checkpoint(config.checkpoint, model, logger, device, optimizer) start_epoch += 1 scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, config.lr_decay_step, gamma=config.lr_gamma, last_epoch=start_epoch) else: start_epoch = config.start_epoch scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step, gamma=config.lr_gamma) all_step = len(train_loader) logger.info('train dataset has {} samples,{} in dataloader'.format( train_data.__len__(), all_step)) epoch = 0 f1 = 0 try: for epoch in range(start_epoch, config.epochs): start = time.time() train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader, device, criterion, epoch, all_step, logger) logger.info( '[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format( epoch, config.epochs, train_loss, time.time() - start, lr)) save_path = '{}/epoch_model.pth'.format(config.workspace) save_checkpoint(save_path, model, optimizer, epoch, logger) if epoch >= 50 and epoch % 10 == 0: f_score_new = eval(model, os.path.join(config.workspace, 'output'), config.testroot, device) logger.info(' ---------------------------------------') logger.info(' test: f_score : {:.6f}'.format(f_score_new)) logger.info(' ---------------------------------------') if f_score_new > f1: f1 = f_score_new best_save_path = '{}/Best_model_{:.6f}.pth'.format( config.workspace, f1) save_checkpoint(best_save_path, model, optimizer, epoch, logger) # writer.add_scalar(tag='Test/recall', scalar_value=recall, global_step=epoch) # writer.add_scalar(tag='Test/precision', scalar_value=precision, global_step=epoch) # writer.add_scalar(tag='Test/f1', scalar_value=f1, global_step=epoch) # writer.close() except KeyboardInterrupt: save_checkpoint('{}/final.pth'.format(config.workspace), model, 
optimizer, epoch, logger)
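# `save_checkpoint` / `load_checkpoint` used in the training scripts above are
# project helpers that are not shown; a minimal sketch matching the call sites
# (the signatures are inferred from those calls, so treat the details as
# assumptions rather than the original code):
def save_checkpoint(path, model, optimizer, epoch, logger):
    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch}, path)
    logger.info('Saved checkpoint to {}'.format(path))

def load_checkpoint(path, model, logger, device, optimizer=None):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    logger.info('Loaded checkpoint from {} (epoch {})'.format(path, checkpoint['epoch']))
    return checkpoint['epoch']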
def main(): tf.set_random_seed(1234) np.random.seed(1234) # Load celebA data_path = os.path.join('data', 'celebA', 'img_align_celeba.zip') celeba = dataset.CelebADataset(data_path) x = tf.placeholder(tf.float32, shape=[None] + celeba.data_dims, name='x') n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles') n = tf.shape(x)[0] qz_samples = q_net(x, n_z, n_particles) # Use a single particle for the reconstruction term observed = {'x': x, 'z': qz_samples[:1]} model, z, _ = vae(observed, n, n_z, 1) # [1, n] log_px_qz = model.local_log_prob('x') eq_ll = tf.reduce_mean(log_px_qz) # [n_particles, n] log_p_qz = z.log_prob(qz_samples) eq_joint = eq_ll + tf.reduce_mean(log_p_qz) if FLAGS.estimator == "stein": estimator = SteinScoreEstimator(eta=eta) elif FLAGS.estimator == "spectral": estimator = SpectralScoreEstimator(n_eigen=None, eta=None, n_eigen_threshold=0.99) else: raise ValueError("The chosen estimator is not recognized.") optimizer = tf.train.AdamOptimizer(3e-4) entropy_grads_and_vars = entropy_gradients( optimizer, estimator, tf.transpose(qz_samples, [1, 0, 2])) entropy_dict = dict([(v, g) for g, v in entropy_grads_and_vars if g is not None]) joint_grads_and_vars = optimizer.compute_gradients(-eq_joint) joint_dict = dict([(v, g) for g, v in joint_grads_and_vars if g is not None]) def combine_grads(v): ret = 0. if v in entropy_dict: ret += -entropy_dict[v] if v in joint_dict: ret += joint_dict[v] return ret grads_and_vars = [(combine_grads(v), v) for v in tf.trainable_variables()] infer_op = optimizer.apply_gradients(grads_and_vars) # Generate images n_gen = 100 _, _, x_mean = vae({}, n_gen, n_z, None) x_gen = tf.reshape(x_mean, [-1] + celeba.data_dims) # Interpolation # [n, n_z] x_start = x[:8] x_end = x[8:16] z_start = qz_samples[0, :8, :] z_end = qz_samples[0, 8:16, :] # [1, 8, 1] alpha = tf.reshape(tf.linspace(0., 1., 8), [1, 8, 1]) # [n, 1, n_z] z_start = tf.expand_dims(z_start, 1) z_end = tf.expand_dims(z_end, 1) # [n, 8, n_z] z_interp = alpha * z_start + (1. 
- alpha) * z_end z_interp = tf.reshape(z_interp, [-1, n_z]) _, _, x_interp = vae({'z': z_interp}, 64, n_z, None) x_interp = tf.reshape(x_interp, [-1] + celeba.data_dims) # Define training parameters epochs = 25 batch_size = 64 iters = celeba.train_size // batch_size save_image_freq = 1 print_freq = 100 save_model_freq = 5 test_freq = 1 test_batch_size = 500 test_iters = celeba.test_size // test_batch_size result_path = "results/vae_celeba_" + FLAGS.estimator + \ time.strftime("_%Y%m%d_%H%M%S") saver = tf.train.Saver(max_to_keep=10) logger = setup_logger('vae_celeba_' + FLAGS.estimator, __file__, result_path) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) # Restore from the latest checkpoint ckpt_file = tf.train.latest_checkpoint(result_path) begin_epoch = 1 if ckpt_file is not None: logger.info('Restoring model from {}...'.format(ckpt_file)) begin_epoch = int(ckpt_file.split('.')[-2]) + 1 saver.restore(sess, ckpt_file) for epoch in range(begin_epoch, epochs + 1): eq_joints = [] time_iters = [] for t in range(1, iters + 1): time_iter = -time.time() x_batch = celeba.next_batch(batch_size) _, eq_joint_ = sess.run( [infer_op, eq_joint], feed_dict={x: x_batch, n_particles: n_est}) # logger.info('Iter {}: lb = {}, kl = {}, true_kl = {}' # .format(t, lb, kl, true_kl)) eq_joints.append(eq_joint_) time_iter += time.time() time_iters.append(time_iter) if t % print_freq == 0: logger.info( 'Epoch={} Iter={} ({}s): log joint = {}' .format(epoch, t, np.mean(time_iters), np.mean(eq_joints[-print_freq:]))) logger.info( 'Epoch {}: log joint = {}' .format(epoch, np.mean(eq_joints))) interp_images = [] start_images = [] end_images = [] if epoch % test_freq == 0: time_test = -time.time() test_eq_joints = [] for t in range(test_iters): test_x_batch = celeba.next_test_batch(test_batch_size) test_eq_joint, interp_image, start_image, end_image = \ sess.run( [eq_joint, x_interp, x_start, x_end], feed_dict={x: test_x_batch, n_particles: n_est}) test_eq_joints.append(test_eq_joint) interp_images.append(interp_image) start_images.append(start_image) end_images.append(end_image) time_test += time.time() logger.info('>>> TEST ({:.1f}s)'.format(time_test)) logger.info('>> Test log joint = {}' .format(np.mean(test_eq_joints))) logger.info('Saving interpolations...') interp_name = os.path.join(result_path, "interp.epoch.{}.png".format(epoch)) save_image_collections(interp_images[0], interp_name, scale_each=True, shape=(8, 8)) if epoch == 1: save_image_collections( start_images[0], interp_name + ".start.png", scale_each=True, shape=(8, 1)) save_image_collections( end_images[0], interp_name + ".end.png", scale_each=True, shape=(8, 1)) if epoch % save_image_freq == 0: logger.info('Saving images...') images = sess.run(x_gen) name = os.path.join(result_path, "vae.epoch.{}.png".format(epoch)) save_image_collections(images, name, scale_each=True) if epoch % save_model_freq == 0: logger.info('Saving model...') save_path = os.path.join(result_path, "vae.epoch.{}.ckpt".format(epoch)) if not os.path.exists(os.path.dirname(save_path)): os.makedirs(os.path.dirname(save_path)) saver.save(sess, save_path) logger.info('Done')
def main(): tf.set_random_seed(1234) np.random.seed(1234) # Load MNIST data_path = os.path.join('data', 'mnist.pkl.gz') x_train, t_train, x_valid, t_valid, x_test, t_test = \ dataset.load_mnist_realval(data_path) x_train = np.vstack([x_train, x_valid]) x_test = np.random.binomial(1, x_test, size=x_test.shape) n_x = x_train.shape[1] n_z = FLAGS.n_z n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles') x_input = tf.placeholder(tf.float32, shape=[None, n_x], name='x') x = tf.to_int32(tf.random_uniform(tf.shape(x_input)) <= x_input) learning_rate_ph = tf.placeholder(tf.float32, shape=[], name='lr') optimizer = tf.train.AdamOptimizer(learning_rate_ph, beta1=0.5) def build_tower_graph(x, id_): tower_x = x[id_ * tf.shape(x)[0] // FLAGS.num_gpus:(id_ + 1) * tf.shape(x)[0] // FLAGS.num_gpus] n = tf.shape(tower_x)[0] # qz_samples: [n_particles, n, n_z] qz_samples = q_net(tower_x, n_z, n_particles) # Use a single particle for the reconstruction term observed = {'x': tower_x, 'z': qz_samples[:1]} model, z, _ = vae(observed, n, n_x, n_z, 1) # log_px_qz: [1, n] log_px_qz = model.local_log_prob('x') eq_ll = tf.reduce_mean(log_px_qz) # log_p_qz: [n_particles, n] log_p_qz = z.log_prob(qz_samples) eq_joint = eq_ll + tf.reduce_mean(log_p_qz) if FLAGS.estimator == "stein": estimator = SteinScoreEstimator(eta=eta) elif FLAGS.estimator == "spectral": estimator = SpectralScoreEstimator(n_eigen=None, eta=None, n_eigen_threshold=0.99) else: raise ValueError("The chosen estimator is not recognized.") qzs = tf.transpose(qz_samples, [1, 0, 2]) dlog_q = estimator.compute_gradients(qzs) entropy_surrogate = tf.reduce_mean( tf.reduce_sum(tf.stop_gradient(-dlog_q) * qzs, -1)) cost = -eq_joint - entropy_surrogate grads_and_vars = optimizer.compute_gradients(cost) return grads_and_vars, eq_joint tower_losses = [] tower_grads = [] for i in range(FLAGS.num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('tower_%d' % i): grads, tower_eq_joint = build_tower_graph(x, i) tower_losses.append([tower_eq_joint]) tower_grads.append(grads) eq_joint = average_losses(tower_losses)[0] grads = average_gradients(tower_grads) infer_op = optimizer.apply_gradients(grads) # Generate images n_gen = 100 _, _, x_logits = vae({}, n_gen, n_x, n_z, 1) x_gen = tf.reshape(tf.sigmoid(x_logits), [-1, 28, 28, 1]) # Define training parameters learning_rate = 1e-4 epochs = 3000 batch_size = 128 iters = x_train.shape[0] // batch_size save_image_freq = 10 save_model_freq = 100 test_freq = 10 test_batch_size = 400 test_iters = x_test.shape[0] // test_batch_size result_path = "results/vae_conv_{}_{}".format( n_z, FLAGS.estimator) + time.strftime("_%Y%m%d_%H%M%S") saver = tf.train.Saver(max_to_keep=10) logger = setup_logger('vae_conv_' + FLAGS.estimator, __file__, result_path) with create_session(FLAGS.log_device_placement) as sess: sess.run(tf.global_variables_initializer()) # Restore from the latest checkpoint ckpt_file = tf.train.latest_checkpoint(result_path) begin_epoch = 1 if ckpt_file is not None: logger.info('Restoring model from {}...'.format(ckpt_file)) begin_epoch = int(ckpt_file.split('.')[-2]) + 1 saver.restore(sess, ckpt_file) for epoch in range(begin_epoch, epochs + 1): time_epoch = -time.time() np.random.shuffle(x_train) eq_joints = [] for t in range(iters): x_batch = x_train[t * batch_size:(t + 1) * batch_size] _, eq_joint_ = sess.run( [infer_op, eq_joint], feed_dict={ x_input: x_batch, learning_rate_ph: learning_rate, n_particles: n_est }, ) eq_joints.append(eq_joint_) time_epoch += time.time() logger.info('Epoch {} 
({:.1f}s): log joint = {}'.format( epoch, time_epoch, np.mean(eq_joints))) if epoch % test_freq == 0: time_test = -time.time() test_eq_joints = [] for t in range(test_iters): test_x_batch = x_test[t * test_batch_size:(t + 1) * test_batch_size] test_eq_joint = sess.run(eq_joint, feed_dict={ x: test_x_batch, n_particles: n_est }) test_eq_joints.append(test_eq_joint) time_test += time.time() logger.info('>>> TEST ({:.1f}s)'.format(time_test)) logger.info('>> Test log joint = {}'.format( np.mean(test_eq_joints))) if epoch % save_image_freq == 0: logger.info('Saving images...') images = sess.run(x_gen) name = os.path.join(result_path, "vae.epoch.{}.png".format(epoch)) save_image_collections(images, name) if epoch % save_model_freq == 0: logger.info('Saving model...') save_path = os.path.join(result_path, "vae.epoch.{}.ckpt".format(epoch)) if not os.path.exists(os.path.dirname(save_path)): os.makedirs(os.path.dirname(save_path)) saver.save(sess, save_path) logger.info('Done')
def main(): config.workspace = os.path.join(config.workspace_dir, config.exp_name) # if config.restart_training: # shutil.rmtree(config.workspace, ignore_errors=True) if not os.path.exists(config.workspace): os.makedirs(config.workspace) logger = setup_logger(os.path.join(config.workspace, 'train_log')) logger.info(config.print()) torch.manual_seed(config.seed) # 为CPU设置随机种子 if config.gpu_id is not None and torch.cuda.is_available(): torch.backends.cudnn.benchmark = True logger.info('train with gpu {} and pytorch {}'.format( config.gpu_id, torch.__version__)) device = torch.device("cuda:0") torch.cuda.manual_seed(config.seed) # 为当前GPU设置随机种子 torch.cuda.manual_seed_all(config.seed) # 为所有GPU设置随机种子 else: logger.info('train with cpu and pytorch {}'.format(torch.__version__)) device = torch.device("cpu") train_data = ICDAR15(config.trainroot, config.is_pseudo, data_shape=config.data_shape, n=config.kernel_num, m=config.m) train_loader = Data.DataLoader(dataset=train_data, batch_size=config.train_batch_size, shuffle=True, num_workers=int(config.workers)) # writer = SummaryWriter(config.output_dir) model = PSENet(backbone=config.backbone, pretrained=config.pretrained, result_num=config.kernel_num, scale=config.scale) if not config.pretrained and not config.restart_training: model.apply(weights_init) num_gpus = torch.cuda.device_count() if num_gpus > 1: model = nn.DataParallel(model) model = model.to(device) criterion = PSELoss(Lambda=config.Lambda, ratio=config.OHEM_ratio, reduction='mean') optimizer = torch.optim.Adam(model.parameters(), lr=config.lr) if config.checkpoint != '' and not config.restart_training: start_epoch = load_checkpoint(config.checkpoint, model, logger, device, optimizer) start_epoch += 1 scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, config.lr_decay_step, gamma=config.lr_gamma, last_epoch=start_epoch) logger.info('resume from {}, epoch={}'.format(config.checkpoint, start_epoch)) else: start_epoch = config.start_epoch scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step, gamma=config.lr_gamma) all_step = len(train_loader) logger.info('train dataset has {} samples,{} in dataloader'.format( train_data.__len__(), all_step)) epoch = 0 f1 = 0 for epoch in range(start_epoch, config.epochs): start = time.time() train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader, device, criterion, epoch, all_step, logger) logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format( epoch, config.epochs, train_loss, time.time() - start, lr)) if epoch % config.save_interval == 0: save_path = '{}/epoch_{}.pth'.format(config.workspace, epoch) latest_path = '{}/latest.pth'.format(config.workspace) save_checkpoint(save_path, model, optimizer, epoch, logger) save_checkpoint(latest_path, model, optimizer, epoch, logger)
def main():
    if config.output_dir is None:
        config.output_dir = 'output'
    if config.restart_training:
        shutil.rmtree(config.output_dir, ignore_errors=True)
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    logger = setup_logger(os.path.join(config.output_dir, 'train_log'))
    logger.info(config.print())

    torch.manual_seed(config.seed)  # set the random seed for the CPU
    if config.gpu_id is not None and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logger.info('train with gpu {} and pytorch {}'.format(config.gpu_id, torch.__version__))
        device = torch.device("cuda:0")
        torch.cuda.manual_seed(config.seed)  # set the random seed for the current GPU
        torch.cuda.manual_seed_all(config.seed)  # set the random seed for all GPUs
    else:
        logger.info('train with cpu and pytorch {}'.format(torch.__version__))
        device = torch.device("cpu")

    train_data = MyDataset(config.trainroot, config.MIN_LEN, config.MAX_LEN,
                           transform=transforms.ToTensor())
    train_loader = Data.DataLoader(dataset=train_data, batch_size=config.train_batch_size,
                                   shuffle=True, num_workers=int(config.workers))

    writer = SummaryWriter(config.output_dir)
    model = CTPN_Model(pretrained=config.pretrained)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    dummy_input = torch.zeros(1, 3, 600, 800).to(device)
    writer.add_graph(model=model, input_to_model=dummy_input)
    criterion = CTPNLoss(device)
    # optimizer = torch.optim.SGD(model.parameters(), lr=config.lr, momentum=0.99)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if config.checkpoint != '' and not config.restart_training:
        print('Loading Checkpoint...')
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device)  # fixed typo: was config.ch9eckpoint
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step,
                                                         gamma=config.lr_gamma, last_epoch=start_epoch)
    else:
        start_epoch = config.start_epoch
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step,
                                                         gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} in dataloader'.format(train_data.__len__(), all_step))
    epoch = 0
    best_model = {'loss': float('inf')}
    try:
        for epoch in range(start_epoch, config.epochs):
            start = time.time()
            train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader, device,
                                         criterion, epoch, all_step, writer, logger)
            logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
                epoch, config.epochs, train_loss, time.time() - start, lr))
            # if (0.3 < train_loss < 0.4 and epoch % 1 == 0) or train_loss < 0.3:
            if epoch % 10 == 0 or train_loss < best_model['loss']:
                net_save_path = '{}/PSENet_{}_loss{:.6f}.pth'.format(config.output_dir, epoch, train_loss)
                save_checkpoint(net_save_path, model, optimizer, epoch, logger)
                if train_loss < best_model['loss']:
                    best_model['loss'] = train_loss
                    if 'model' in best_model:
                        os.remove(best_model['model'])
                    best_model['model'] = net_save_path
                    shutil.copy(best_model['model'],
                                '{}/best_loss{:.6f}.pth'.format(config.output_dir, best_model['loss']))
        writer.close()
    except KeyboardInterrupt:
        pass
    finally:
        if best_model.get('model'):  # guard: no best model exists if training was interrupted early
            shutil.copy(best_model['model'],
                        '{}/best_loss{:.6f}.pth'.format(config.output_dir, best_model['loss']))
            logger.info(best_model)
def main():
    parser = argparse.ArgumentParser(description='AN')
    parser.add_argument('--name', default='bn_smaller_batch', type=str)
    ## data setting
    parser.add_argument('--root', default='/scratch/local/ssd/datasets', type=str)
    parser.add_argument('--train_dataset', default='synthtext', type=str)
    parser.add_argument('--test_dataset', default='ic03', type=str)
    parser.add_argument('--vis_gt', default=False, type=bool)
    parser.add_argument('--vis_gt_path', default='/users/czhang/data/vis', type=str)
    parser.add_argument('--load_width', default=256, type=int)
    parser.add_argument('--load_height', default=32, type=int)
    parser.add_argument("--gpus", dest="gpu", default="0", type=str)
    parser.add_argument('--min_gt_len', default=3, type=int)
    parser.add_argument("--aug", dest="aug", action='store_true')
    parser.add_argument("--RA", dest="repeated_aug", default='1', type=int)
    ## model setting
    parser.add_argument('--alphabet', default=' 0123456789abcdefghijklmnopqrstuvwxyz', type=str)
    # parser.add_argument('--ignore_case', default=True, type=bool)
    parser.add_argument('--max_len', default=65, type=int)
    parser.add_argument("--cv", dest="context_vector", action='store_true')
    ## optim setting
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--resume_i', default=0, type=int)
    parser.add_argument('--resume_j', default=0, type=int)
    parser.add_argument('--cl_weight', default=1, type=int, help='center loss weight')
    parser.add_argument('--num_workers', default=64, type=int)
    parser.add_argument('--lr', default=1.0, type=float)
    parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
    parser.add_argument('--momentum', default=0.9, type=float)
    parser.add_argument('--weight_decay', default=1e-5, type=float)
    parser.add_argument('--gamma', default=0.1, type=float)
    parser.add_argument('--optim', default='adadelta', type=str, help='sgd, adam, adadelta')
    # parser.add_argument('--clip_grad', default=False, type=bool)
    parser.add_argument('--max_norm', default=400, type=int, help='Norm cutoff to prevent explosion of gradients')
    parser.add_argument('--max_epoches', default=1000, type=int)
    # parser.add_argument('--adjust_lr', default='800, 1600', type=str)
    ## output setting
    parser.add_argument('--log_iter', default=10, type=int)
    parser.add_argument('--eval_iter', default=2500, type=int)
    parser.add_argument('--save_iter', default=2500, type=int)
    parser.add_argument('--save_folder', default='/users/czhang/data/FAN/', type=str)
    parser.add_argument('--tbx_folder', default='/users/czhang/data/FAN/tbx', type=str)
    parser.add_argument('--eval_vis_num', default=15, type=int)
    parser.add_argument('--max_iter', default=2000000, type=int)

    args = parser.parse_args()
    args.save_folder = osp.join(args.save_folder, args.name)
    if not osp.exists(args.save_folder):
        os.mkdir(args.save_folder)
    tbx_dir = osp.join(args.tbx_folder, args.name)
    if not osp.exists(args.tbx_folder):
        os.mkdir(args.tbx_folder)
    if not osp.exists(tbx_dir):
        os.mkdir(tbx_dir)
    writer = SummaryWriter(tbx_dir)
    log_file_path = args.save_folder + '/' + time.strftime('%Y%m%d_%H%M%S') + '.log'

    args.nClasses = len(args.alphabet)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device("cuda:0")
    setup_logger(log_file_path)
    print_args(args)
    torch.set_default_tensor_type('torch.FloatTensor')

    ## setup converter
    converter = strLabelConverter(args.alphabet)

    ## setup dataset
    logging.info('model will be trained on %s' % (args.train_dataset))
    trainset = SynthLoader(args, args.train_dataset, converter, aug=args.aug)
    logging.info('%d training samples' % (trainset.__len__()))
    train_loader = data.DataLoader(trainset, args.batch_size, num_workers=args.num_workers,
                                   shuffle=True, collate_fn=text_collate, pin_memory=True)
    logging.info('model will be evaluated on %s' % (args.test_dataset))
    testset = SceneLoader(args, args.test_dataset, False)
    logging.info('%d test samples' % (testset.__len__()))
    test_loader = data.DataLoader(testset, 1, num_workers=args.num_workers, shuffle=False, pin_memory=True)

    ## setup model
    net = AN(args)
    net = torch.nn.DataParallel(net).to(device)
    centers = None
    if args.resume_i != 0 or args.resume_j != 0:
        resume_file = osp.join(args.save_folder, str(args.resume_i) + '_' + str(args.resume_j) + '.pth')
        logging.info('Resuming training, loading {}...'.format(resume_file))
        checkpoint = torch.load(resume_file)
        # net.load_state_dict(checkpoint)
        net.load_state_dict(checkpoint['model_state_dict'])
        centers = checkpoint['class_centers']

    ## setup criterion
    criterion = nn.CrossEntropyLoss()
    criterion2 = CenterLoss(device, centers)

    ## setup optimizer
    if args.cl_weight != 0:
        parameters = list(net.parameters()) + list(criterion2.parameters())
    else:
        parameters = net.parameters()
    if args.optim == 'sgd':
        optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
        logging.info('model will be optimized by sgd')
    elif args.optim == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.weight_decay)
        logging.info('model will be optimized by adam')
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=args.weight_decay)
        logging.info('model will be optimized by adadelta')
    else:
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.weight_decay)
        logging.info('model will be optimized by adam')

    ## train model
    cudnn.benchmark = True
    net.train()
    iter_counter = args.resume_j + 1
    acc_max = 0
    running_loss, running_cenloss, running_croloss = 0., 0., 0.
    for i in range(args.max_epoches):
        i = args.resume_i + i
        t0 = time.time()
        for j, batch_samples in enumerate(train_loader):
            j = args.resume_j + j + 1
            imgs, labels, paths = batch_samples
            imgs = Variable(imgs.float()).to(device)
            labels = Variable(labels.long()).to(device)  # [batch*len]
            if args.context_vector or args.cl_weight != 0:
                preds, gts = net(imgs, labels)  # [batch,len,classes]
                masks = mask(args, labels.view(args.batch_size, args.max_len), device)
                center_loss = criterion2(gts, labels, masks)
                running_cenloss += center_loss.item()
            else:
                preds = net(imgs, labels)
                center_loss = 0
            ce_loss = criterion(preds.view(-1, args.nClasses), labels.view(-1))
            loss = ce_loss + 0.01 * args.cl_weight * center_loss
            optimizer.zero_grad()
            loss.backward()
            if args.cl_weight != 0:
                for param in criterion2.parameters():
                    # update class centers
                    # remove the effect of lambda on updating centers
                    # lr of center loss set to 0.5 of the model lr
                    param.grad.data *= (0.5 / (0.01 * args.cl_weight))
            torch.nn.utils.clip_grad_norm_(net.parameters(), args.max_norm)
            optimizer.step()
            running_loss += loss.item()
            running_croloss += ce_loss.item()

            if iter_counter % args.log_iter == 0:
                t1 = time.time()
                acc, pred_samples, label_samples = lex_free_acc(preds, labels, converter)
                print('epoch:%3d iter:%6d loss:%4.6f acc:%4.6f %4.6fs/batch'
                      % (i, j, running_loss / args.log_iter, acc, (t1 - t0) / args.log_iter))
                writer.add_scalar('train/train_word_accuracy', acc, j)
                writer.add_scalar('train/train_loss', running_loss / args.log_iter, j)
                if args.cl_weight != 0:
                    writer.add_scalar('train/train_ce_loss', running_croloss / args.log_iter, j)
                    writer.add_scalar('train/train_center_loss', running_cenloss / args.log_iter, j)
                if iter_counter % (100 * args.log_iter) == 0:
                    visual_img = imgs[0, :, :, :].unsqueeze(0)
                    writer.add_image('train/train_im', visual_img, j)
                    visual_txt = 'gt: ' + str(label_samples[0]) + ' ----- pred: ' + str(pred_samples[0])
                    writer.add_text('train/train_txt', visual_txt, j)
                t0 = time.time()
                running_loss, running_cenloss, running_croloss = 0., 0., 0.

            if iter_counter % args.save_iter == 0:
                print('Saving state, epoch: %d iter:%d' % (i, j))
                torch.save({'model_state_dict': net.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'class_centers': criterion2.centers},
                           args.save_folder + '/' + repr(i) + '_' + repr(j) + '.pth')

            if iter_counter % args.eval_iter == 0:
                ## eval model
                net.eval()
                n_correct = 0
                skip_counter = 0
                for index, sample in enumerate(test_loader):
                    imgs, gt_strs, lexicon50, lexicon1k, lexiconfull, img_paths = sample
                    gt_str = gt_strs[0]
                    if len(gt_str) < args.min_gt_len or not gt_str.isalnum():
                        skip_counter += 1
                        continue
                    imgs = Variable(imgs).cuda()
                    gt_ind, _ = converter.encode(gt_str)
                    gt_ind = torch.IntTensor((gt_ind + [0] * args.max_len)[:args.max_len])
                    if args.context_vector or args.cl_weight != 0:
                        preds, _ = net(imgs, gt_ind)
                    else:
                        preds = net(imgs, gt_ind)
                    correct, pred_str, _ = lex_free_acc(preds, gt_ind, converter)
                    n_correct += correct
                acc = n_correct * 1.0 / (testset.__len__() - skip_counter)
                if acc > acc_max:
                    acc_max = acc
                logging.info('accuracy=%f acc_max=%f' % (acc, acc_max))
                writer.add_scalar('val/val_word_accuracy', acc, j)
                net.train()

            if iter_counter > args.max_iter:
                break
            iter_counter += 1

    torch.save(net.state_dict(), args.save_folder + '/final_0.pth')
    logging.info('The training stage on %s is over!!!' % (args.train_dataset))
def main():
    if config.output_dir is None:
        config.output_dir = 'output'
    if config.restart_training:
        shutil.rmtree(config.output_dir, ignore_errors=True)
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    logger = setup_logger(os.path.join(config.output_dir, 'train_log'))
    logger.info(config.print())

    torch.manual_seed(config.seed)  # set random seed for CPU
    if config.gpu_id is not None and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logger.info('train with gpu {} and pytorch {}'.format(config.gpu_id, torch.__version__))
        device = torch.device("cuda:0")
        torch.cuda.manual_seed(config.seed)  # set random seed for the current GPU
        torch.cuda.manual_seed_all(config.seed)  # set random seed for all GPUs
    else:
        logger.info('train with cpu and pytorch {}'.format(torch.__version__))
        device = torch.device("cpu")

    train_data = MyDataset(args.train_dir, data_shape=config.data_shape, n=config.n, m=config.m,
                           transform=transforms.ToTensor())
    train_loader = Data.DataLoader(dataset=train_data, batch_size=args.batch_size, shuffle=True,
                                   num_workers=int(config.workers))

    writer = SummaryWriter(config.output_dir)
    model = PSENet(backbone=config.backbone, pretrained=config.pretrained, result_num=config.n, scale=config.scale)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)
    if args.resume_model:
        resume_model(model, args.resume_model)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    # dummy_input = torch.autograd.Variable(torch.Tensor(1, 3, 600, 800).to(device))
    # writer.add_graph(models=models, input_to_model=dummy_input)

    criterion = PSELoss(Lambda=config.Lambda, ratio=config.OHEM_ratio, reduction='mean')
    # optimizer = torch.optim.SGD(models.parameters(), lr=config.lr, momentum=0.99)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    if config.checkpoint != '' and not config.restart_training:
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device, optimizer)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step,
                                                         gamma=config.lr_gamma, last_epoch=start_epoch)
    else:
        start_epoch = config.start_epoch
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step, gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} in dataloader'.format(train_data.__len__(), all_step))
    epoch = 0
    best_model = {'recall': 0, 'precision': 0, 'f1': 0, 'models': ''}
    try:
        for epoch in range(start_epoch, args.epochs):
            start = time.time()
            train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader, device, criterion,
                                         epoch, all_step, writer, logger)
            logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
                epoch, config.epochs, train_loss, time.time() - start, lr))
            if epoch % args.save_per_epoch == 0:
                save_model(model, epoch)
        writer.close()
    except KeyboardInterrupt:
        save_checkpoint('{}/final.pth'.format(config.output_dir), model, optimizer, epoch, logger)
    finally:
        if best_model['models']:
            logger.info(best_model)
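The save_checkpoint / load_checkpoint helpers called above are defined elsewhere in each project. A minimal sketch that matches the call sites shown here (save_checkpoint(path, model, optimizer, epoch, logger); load_checkpoint(path, model, logger, device, optimizer) returning the stored epoch) could look like the following; the real helpers may store additional state:

# Hypothetical helpers matching the call sites above; not the projects' actual implementations.
import torch

def save_checkpoint(checkpoint_path, model, optimizer, epoch, logger):
    state = {'state_dict': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'epoch': epoch}
    torch.save(state, checkpoint_path)
    logger.info('checkpoint saved to {}'.format(checkpoint_path))

def load_checkpoint(checkpoint_path, model, logger, device, optimizer=None):
    state = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optimizer'])
    logger.info('checkpoint loaded from {}'.format(checkpoint_path))
    return state['epoch']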
def __init__(self, config, model, criterion):
    config['trainer']['output_dir'] = os.path.join(
        str(pathlib.Path(os.path.abspath(__name__)).parent), config['trainer']['output_dir'])
    config['name'] = config['name'] + '_' + model.name
    self.save_dir = os.path.join(config['trainer']['output_dir'], config['name'])
    self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint')

    if config['trainer']['resume_checkpoint'] == '' and config['trainer']['finetune_checkpoint'] == '':
        shutil.rmtree(self.save_dir, ignore_errors=True)
    if not os.path.exists(self.checkpoint_dir):
        os.makedirs(self.checkpoint_dir)

    self.global_step = 0
    self.start_epoch = 0
    self.config = config
    self.model = model
    self.criterion = criterion
    # logger and tensorboard
    self.tensorboard_enable = self.config['trainer']['tensorboard']
    self.epochs = self.config['trainer']['epochs']
    self.log_iter = self.config['trainer']['log_iter']

    anyconfig.dump(config, os.path.join(self.save_dir, 'config.yaml'))
    self.logger = setup_logger(os.path.join(self.save_dir, 'train.log'))
    self.logger_info(pformat(self.config))

    # device
    torch.manual_seed(self.config['trainer']['seed'])  # set random seed for CPU
    if torch.cuda.device_count() > 0 and torch.cuda.is_available():
        self.with_cuda = True
        torch.backends.cudnn.benchmark = True
        self.device = torch.device("cuda")
        torch.cuda.manual_seed(self.config['trainer']['seed'])  # set random seed for the current GPU
        torch.cuda.manual_seed_all(self.config['trainer']['seed'])  # set random seed for all GPUs
    else:
        self.with_cuda = False
        self.device = torch.device("cpu")
    self.logger_info('train with device {} and pytorch {}'.format(self.device, torch.__version__))

    # metrics
    self.metrics = {'recall': 0, 'precision': 0, 'hmean': 0, 'train_loss': float('inf')}

    self.lr = config['optimizer']['args']['lr']
    self.optimizer = self._initialize('optimizer', torch.optim, model.parameters())

    # resume or finetune
    if self.config['trainer']['resume_checkpoint'] != '':
        self._laod_checkpoint(self.config['trainer']['resume_checkpoint'], resume=True)
    elif self.config['trainer']['finetune_checkpoint'] != '':
        self._laod_checkpoint(self.config['trainer']['finetune_checkpoint'], resume=False)

    if self.config['lr_scheduler']['type'] != 'WarmupPolyLR':
        self.scheduler = self._initialize('lr_scheduler', torch.optim.lr_scheduler, self.optimizer)

    self.model.to(self.device)

    if self.tensorboard_enable and config['local_rank'] == 0:
        from torch.utils.tensorboard import SummaryWriter
        self.writer = SummaryWriter(self.save_dir)
        try:
            # add graph
            dummy_input = torch.zeros(1, 3, 640, 640).to(self.device)
            self.writer.add_graph(self.model, dummy_input)
            torch.cuda.empty_cache()
        except Exception:
            import traceback
            self.logger.error(traceback.format_exc())
            self.logger.warn('add graph to tensorboard failed')

    # distributed training
    if torch.cuda.device_count() > 1:
        local_rank = config['local_rank']
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[local_rank], output_device=local_rank,
            broadcast_buffers=False, find_unused_parameters=True)

    # make inverse Normalize
    self.UN_Normalize = False
    if 'transforms' in self.config['dataset']['train']['dataset']['args'].keys():
        for t in self.config['dataset']['train']['dataset']['args']['transforms']:
            if t['type'] == 'Normalize':
                self.normalize_mean = t['args']['mean']
                self.normalize_std = t['args']['std']
                self.UN_Normalize = True
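The trainer above builds its optimizer and scheduler with a config-driven _initialize helper whose body is not shown. A plausible sketch, assuming config[name] holds a 'type' string and an 'args' dict, is the usual "look up a class in a module and instantiate it" pattern; the project's actual helper may differ:

# Hypothetical sketch of a config-driven factory such as _initialize('optimizer', torch.optim, params).
def _initialize(self, name, module, *args, **kwargs):
    module_name = self.config[name]['type']      # e.g. 'Adam' or 'MultiStepLR'
    module_args = dict(self.config[name]['args'])
    module_args.update(kwargs)
    return getattr(module, module_name)(*args, **module_args)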
def main():
    config.workspace = os.path.join(config.workspace_dir, config.exp_name)
    if not os.path.exists(config.workspace):
        os.makedirs(config.workspace)

    logger = setup_logger(os.path.join(config.workspace, 'train_log'))
    logger.info(config.pprint())

    torch.manual_seed(config.seed)  # set random seed for CPU
    torch.backends.cudnn.benchmark = True
    logger.info('train with gpu {} and pytorch {}'.format(config.gpu_id, torch.__version__))
    device = torch.device("cuda:0")
    torch.cuda.manual_seed(config.seed)  # set random seed for the current GPU
    torch.cuda.manual_seed_all(config.seed)  # set random seed for all GPUs

    train_data = TotalTextoader(config.train_data_dir, config.train_gt_dir, config.test_data_dir,
                                config.test_gt_dir, split='train', is_transform=True,
                                img_size=config.data_shape, kernel_num=config.kernel_num,
                                min_scale=config.min_scale)
    train_loader = Data.DataLoader(dataset=train_data, batch_size=config.train_batch_size, shuffle=True,
                                   num_workers=int(config.workers))

    model = PSENet(backbone=config.backbone, pretrained=config.pretrained,
                   result_num=config.kernel_num, scale=config.scale)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    num_gpus = torch.cuda.device_count()
    # if num_gpus > 1:
    model = nn.DataParallel(model)
    model = model.to(device)

    criterion = dice_loss
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if config.checkpoint != '' and config.restart_training:
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device, optimizer)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step,
                                                         gamma=config.lr_gamma, last_epoch=start_epoch)
        logger.info('resume from {}, epoch={}'.format(config.checkpoint, start_epoch))
    else:
        start_epoch = 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step, gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} iters in dataloader'.format(train_data.__len__(), all_step))

    for epoch in range(start_epoch, config.epochs + 1):
        start = time.time()
        train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader, device, criterion,
                                     epoch, all_step, logger)
        logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
            epoch, config.epochs, train_loss, time.time() - start, lr))
        if epoch % config.save_interval == 0:
            save_path = '{}/epoch_{}.pth'.format(config.workspace, epoch)
            latest_path = '{}/latest.pth'.format(config.workspace)
            save_checkpoint(save_path, model, optimizer, epoch, logger)
            save_checkpoint(latest_path, model, optimizer, epoch, logger)
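Several of these training scripts call model.apply(weights_init) when no pretrained backbone is used, but weights_init itself is defined elsewhere. As a hedged sketch of a typical implementation (Kaiming initialization for convolutions, constants for batch norm); the actual scheme in each project may differ:

# Hypothetical weights_init for use with model.apply(); not necessarily the projects' version.
import torch.nn as nn

def weights_init(m):
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)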
def main_worker(gpu, ngpus_per_node, args, final_output_dir, tb_log_dir):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if cfg.MULTIPROCESSING_DISTRIBUTED:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            # (computed from the node index and the local GPU index)
            args.rank = args.rank * ngpus_per_node + gpu
        print('Init process group: dist_url: {}, world_size: {}, rank: {}'.format(
            args.dist_url, args.world_size, args.rank))
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )

    update_config(cfg, args)
    # setup logger
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(cfg, is_train=True)

    # copy model file
    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0):
        this_dir = os.path.dirname(__file__)
        shutil.copy2(
            os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
            final_output_dir
        )

    # visualize results with tensorboard
    writer_dict = {
        'writer': SummaryWriter(logdir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu]
            )
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()

    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0):
        dump_input = torch.rand(
            (1, 3, cfg.DATASET.INPUT_SIZE, cfg.DATASET.INPUT_SIZE)
        ).cuda()
        # writer_dict['writer'].add_graph(model, (dump_input, ))
        logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    # define loss function (criterion) and optimizer
    loss_factory = MultiLossFactory(cfg).cuda()

    # Data loading code
    train_loader = make_dataloader(
        cfg, is_train=True, distributed=args.distributed
    )
def main():
    parser = argparse.ArgumentParser(description='AN')
    parser.add_argument('--name', default='second_training_bn', type=str)
    ## data setting
    parser.add_argument('--root', default='/users/czhang/data/', type=str)
    parser.add_argument('--load_folder', default='/users/czhang/data/FAN/', type=str)
    parser.add_argument('--test_dataset', default='ic13', type=str)
    parser.add_argument('--load_width', default=256, type=int)
    parser.add_argument('--load_height', default=32, type=int)
    parser.add_argument('--batch_size', default=1, type=int)
    parser.add_argument('--num_workers', default=32, type=int)
    parser.add_argument("--gpus", dest="gpu", default="1", type=str)
    parser.add_argument('--min_gt_len', default=3, type=int)
    parser.add_argument('--max_len', default=65, type=int)
    parser.add_argument("--cv", dest="context_vector", action='store_true')
    parser.add_argument('--lexicon', default=None, type=str)
    parser.add_argument('--max_ed', default=3, type=int)
    parser.add_argument('--tbx_folder', default='/users/czhang/data/FAN/tbx', type=str)
    ## model setting
    parser.add_argument('--load_epoch', default=0, type=int)
    parser.add_argument('--load_iter', default=0, type=int)
    parser.add_argument('--alphabet', default=' 0123456789abcdefghijklmnopqrstuvwxyz', type=str)
    ## output setting
    parser.add_argument('--out_dir', default='/users/czhang/data/FAN/', type=str)

    args = parser.parse_args()
    args.nClasses = len(args.alphabet)
    args.load_folder = osp.join(args.load_folder, args.name)
    args.out_dir = osp.join(args.out_dir, args.name, 'tests')
    if not osp.exists(args.out_dir):
        os.mkdir(args.out_dir)
    tbx_dir = osp.join(args.tbx_folder, args.name, 'tests')
    if not osp.exists(args.tbx_folder):
        os.mkdir(args.tbx_folder)
    if not osp.exists(tbx_dir):
        os.mkdir(tbx_dir)
    writer = SummaryWriter(tbx_dir)

    log_path = os.path.join(args.out_dir, args.test_dataset + '.txt')
    setup_logger(log_path)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device("cuda:0")

    logging.info('model will be evaluated on %s' % (args.test_dataset))
    testset = SceneLoader(args, args.test_dataset, False)
    logging.info('%d test samples' % (testset.__len__()))
    test_loader = data.DataLoader(testset, args.batch_size, num_workers=args.num_workers,
                                  shuffle=False, pin_memory=True)

    ## model
    net = AN(args)
    net = torch.nn.DataParallel(net).to(device)
    checkpoint = str(args.load_epoch) + '_' + str(args.load_iter) + '.pth'
    load_file = torch.load(osp.join(args.load_folder, checkpoint))
    net.load_state_dict(load_file['model_state_dict'])
    # net.load_state_dict(torch.load(load_file))
    net.eval()

    n_correct = 0
    skip_counter = 0
    converter = strLabelConverter(args.alphabet)
    for index, sample in enumerate(test_loader):
        imgs, gt_strs, lexicon50, lexicon1k, lexiconfull, img_paths = sample
        gt_str = gt_strs[0]
        if args.test_dataset != 'iiit5k':
            if len(gt_str) < args.min_gt_len or not gt_str.isalnum():
                print('skipping: %s' % gt_str)
                skip_counter += 1
                continue
        else:
            if not gt_str.isalnum():
                print('skipping: %s' % gt_str)
                skip_counter += 1
                continue
        imgs = Variable(imgs).cuda()
        gt_ind, _ = converter.encode(gt_str)
        gt_ind = torch.IntTensor((gt_ind + [0] * args.max_len)[:args.max_len])
        preds = net(imgs, gt_ind)

        if args.lexicon is None:
            correct, pred_str, _ = lex_free_acc(preds, gt_ind, converter)
            pred_lex = []
        # lexicon decoding
        if args.lexicon is not None:
            if args.lexicon == '50':
                lexicon = lexicon50
            if args.lexicon == '1k':
                lexicon = lexicon1k
            if args.lexicon == 'full':
                lexicon = full_lexicon
            correct, pred_str = lex_acc(args, lexicon, preds, gt_str, converter)

        ## decode
        if correct == 0:
            writer.add_image('test_im', imgs[0, :, :, :].unsqueeze(0), index)
            writer.add_text('pred', pred_str, index)
            writer.add_text('gt', gt_str, index)
            logging.info('pred: %s gt:%s ' % (pred_str, gt_str))
        n_correct += correct

    acc = n_correct * 1.0 / (testset.__len__() - skip_counter)
    print(testset.__len__() - skip_counter)
    logging.info('accuracy=%f' % (acc))
    if not osp.exists(args.out_dir):
        os.mkdir(args.out_dir)
    # tbx_dir = osp.join(args.tbx_folder, args.name, 'tests')
    tbx_dir = args.tbx_folder
    if not osp.exists(args.tbx_folder):
        os.mkdir(args.tbx_folder)
    if not osp.exists(tbx_dir):
        os.mkdir(tbx_dir)
    writer = SummaryWriter(tbx_dir)

    log_path = os.path.join(args.out_dir, args.test_dataset + '.txt')
    setup_logger(log_path)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    device = torch.device("cuda:0")
    logging.info('model will be evaluated on %s' % (args.test_dataset))

    net = AN(args)
    net = torch.nn.DataParallel(net).to(device)
    checkpoint = '../attention_net/0_480000.pth'
    load_file = torch.load(checkpoint)
    net.load_state_dict(load_file['model_state_dict'])
    # net.load_state_dict(torch.load(load_file))
    net.eval()
def main_worker(gpu, ngpus_per_node, args, final_output_dir, tb_log_dir):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
    if cfg.FP16.ENABLED:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
    if cfg.FP16.STATIC_LOSS_SCALE != 1.0:
        if not cfg.FP16.ENABLED:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")

    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if cfg.MULTIPROCESSING_DISTRIBUTED:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        print('Init process group: dist_url: {}, world_size: {}, rank: {}'.format(
            args.dist_url, args.world_size, args.rank))
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )

    update_config(cfg, args)
    # setup logger
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(cfg, is_train=True)

    # copy model file
    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0):
        this_dir = os.path.dirname(__file__)
        shutil.copy2(
            os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
            final_output_dir
        )

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0):
        dump_input = torch.rand(
            (1, 3, cfg.DATASET.INPUT_SIZE, cfg.DATASET.INPUT_SIZE)
        )
        writer_dict['writer'].add_graph(model, (dump_input, ))
        # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    if cfg.FP16.ENABLED:
        model = network_to_half(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu]
            )
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    loss_factory = MultiLossFactory(cfg).cuda()

    # Data loading code
    train_loader = make_dataloader(
        cfg, is_train=True, distributed=args.distributed
    )
    logger.info(train_loader.dataset)

    best_perf = -1
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)

    if cfg.FP16.ENABLED:
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE,
            dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE
        )

    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth.tar')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    if cfg.FP16.ENABLED:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch
        )
    else:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch
        )

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train one epoch
        do_train(cfg, model, train_loader, loss_factory, optimizer, epoch,
                 final_output_dir, tb_log_dir, writer_dict, fp16=cfg.FP16.ENABLED)

        perf_indicator = epoch
        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        if not cfg.MULTIPROCESSING_DISTRIBUTED or (
                cfg.MULTIPROCESSING_DISTRIBUTED and args.rank == 0):
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(
        final_output_dir, 'final_state{}.pth.tar'.format(gpu)
    )
    logger.info('saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
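These main_worker functions follow the standard PyTorch multiprocessing-distributed pattern, but the entry point that spawns them is not shown here. As a minimal, hypothetical launcher, assuming args carries distributed, world_size, and gpu fields (names chosen for illustration only):

# Hypothetical launcher for a main_worker(gpu, ngpus_per_node, args, ...) entry point.
import torch
import torch.multiprocessing as mp

def launch(args, final_output_dir, tb_log_dir):
    ngpus_per_node = torch.cuda.device_count()
    if args.distributed:
        # one process per GPU; world_size counts processes across all nodes
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(main_worker, nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args, final_output_dir, tb_log_dir))
    else:
        main_worker(args.gpu, ngpus_per_node, args, final_output_dir, tb_log_dir)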
def main_worker(gpus, ngpus_per_node, args, final_output_dir, tb_log_dir):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
    # os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    # if len(gpus) == 1:
    #     gpus = int(gpus)

    update_config(cfg, args)
    # test(cfg, args)

    # logger setting
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # model initialization
    model = {
        "ransac": RANSACTriangulationNet,
        "alg": AlgebraicTriangulationNet,
        "vol": VolumetricTriangulationNet,
        "vol_CPM": VolumetricTriangulationNet_CPM,
        "FTL": FTLMultiviewNet
    }[cfg.MODEL.NAME](cfg)
    discriminator = Discriminator(cfg)

    # load pretrained model before DDP initialization
    if cfg.AUTO_RESUME:
        checkpoint_file = os.path.join(final_output_dir, 'model_best.pth.tar')
        if os.path.exists(checkpoint_file):
            checkpoint = torch.load(checkpoint_file, map_location=torch.device('cpu'))
            state_dict = checkpoint['state_dict']
            D_state_dict = checkpoint['D_state_dict']
            for key in list(state_dict.keys()):
                new_key = key.replace("module.", "")
                state_dict[new_key] = state_dict.pop(key)
            for key in list(D_state_dict.keys()):
                new_key = key.replace("module.", "")
                D_state_dict[new_key] = D_state_dict.pop(key)
            model.load_state_dict(state_dict)
            discriminator.load_state_dict(D_state_dict)
            logger.info("=> Loading checkpoint '{}' (epoch {})".format(
                checkpoint_file, checkpoint['epoch']))
        else:
            print('[Warning] Checkpoint file not found! Wrong path: {}'.format(checkpoint_file))
    elif cfg.MODEL.HRNET_PRETRAINED:
        logger.info("=> loading a pretrained model '{}'".format(cfg.MODEL.PRETRAINED))
        checkpoint = torch.load(cfg.MODEL.HRNET_PRETRAINED)
        state_dict = checkpoint['state_dict']
        for key in list(state_dict.keys()):
            new_key = key.replace("module.", "")
            state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)

    # initialize the optimizer
    # the optimizer must be initialized after model initialization
    if cfg.MODEL.TRIANGULATION_MODEL_NAME == "vol":
        optimizer = torch.optim.Adam(
            [{'params': model.backbone.parameters(), 'initial_lr': cfg.TRAIN.LR},
             {'params': model.process_features.parameters(),
              'initial_lr': cfg.TRAIN.PROCESS_FEATURE_LR if hasattr(cfg.TRAIN, "PROCESS_FEATURE_LR") else cfg.TRAIN.LR},
             {'params': model.volume_net.parameters(),
              'initial_lr': cfg.TRAIN.VOLUME_NET_LR if hasattr(cfg.TRAIN, "VOLUME_NET_LR") else cfg.TRAIN.LR}],
            lr=cfg.TRAIN.LR)
    else:
        optimizer = torch.optim.Adam(
            [{'params': filter(lambda p: p.requires_grad, model.parameters()),
              'initial_lr': cfg.TRAIN.LR}],
            lr=cfg.TRAIN.LR)
    D_optimizer = torch.optim.RMSprop(
        [{'params': filter(lambda p: p.requires_grad, discriminator.parameters()),
          'initial_lr': cfg.TRAIN.LR}],
        lr=cfg.TRAIN.LR)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(os.path.join(this_dir, '../lib/models', 'triangulation.py'), final_output_dir)
    # copy configuration file
    config_dir = args.cfg
    shutil.copy2(os.path.join(args.cfg), final_output_dir)

    # calculate GFLOPS
    # dump_input = torch.rand(
    #     (1, 4, 3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[0])
    # )
    # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    # FP16 SETTING
    if cfg.FP16.ENABLED:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
    if cfg.FP16.STATIC_LOSS_SCALE != 1.0:
        if not cfg.FP16.ENABLED:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")
    if cfg.FP16.ENABLED:
        model = network_to_half(model)

    if cfg.MODEL.SYNC_BN and not cfg.DISTRIBUTED:
        print('Warning: Sync BatchNorm is only supported in distributed training.')

    if cfg.FP16.ENABLED:
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE,
            dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE,
            verbose=False)

    # Distributed Computing
    master = True
    if cfg.DISTRIBUTED:  # This block is not available
        args.local_rank += int(gpus[0])
        print('This process is using GPU', args.local_rank)
        device = args.local_rank
        master = device == int(gpus[0])
        dist.init_process_group(backend='nccl')
        if cfg.MODEL.SYNC_BN:
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if gpus is not None:
            torch.cuda.set_device(device)
            model.cuda(device)
            discriminator.cuda(device)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # workers = int(workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[device], output_device=device, find_unused_parameters=True)
            discriminator = torch.nn.parallel.DistributedDataParallel(
                discriminator, device_ids=[device], output_device=device, find_unused_parameters=True)
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # implement this block
        gpu_ids = eval('[' + gpus + ']')
        device = gpu_ids[0]
        print('This process is using GPU', str(device))
        model = torch.nn.DataParallel(model, gpu_ids).cuda(device)
        discriminator = torch.nn.DataParallel(discriminator, gpu_ids).cuda(device)

    # Prepare loss functions
    criterion = {}
    if cfg.LOSS.WITH_HEATMAP_LOSS:
        criterion['heatmap_loss'] = HeatmapLoss().cuda(device)
    if cfg.LOSS.WITH_POSE2D_LOSS:
        criterion['pose2d_loss'] = JointsMSELoss().cuda(device)
    if cfg.LOSS.WITH_POSE3D_LOSS:
        criterion['pose3d_loss'] = Joints3DMSELoss().cuda(device)
    if cfg.LOSS.WITH_VOLUMETRIC_CE_LOSS:
        criterion['volumetric_ce_loss'] = VolumetricCELoss().cuda(device)
    if cfg.LOSS.WITH_BONE_LOSS:
        criterion['bone_loss'] = BoneLengthLoss().cuda(device)
    if cfg.LOSS.WITH_TIME_CONSISTENCY_LOSS:
        criterion['time_consistency_loss'] = Joints3DMSELoss().cuda(device)
    if cfg.LOSS.WITH_KCS_LOSS:
        criterion['KCS_loss'] = None
    if cfg.LOSS.WITH_JOINTANGLE_LOSS:
        criterion['jointangle_loss'] = JointAngleLoss().cuda(device)

    best_perf = 1e9
    best_model = False
    last_epoch = -1

    # load history
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        begin_epoch = checkpoint['epoch'] + 1
        best_perf = checkpoint['loss']
        optimizer.load_state_dict(checkpoint['optimizer'])
        D_optimizer.load_state_dict(checkpoint['D_optimizer'])
        if 'train_global_steps' in checkpoint.keys() and \
                'valid_global_steps' in checkpoint.keys():
            writer_dict['train_global_steps'] = checkpoint['train_global_steps']
            writer_dict['valid_global_steps'] = checkpoint['valid_global_steps']

    # Floating point 16 mode
    if cfg.FP16.ENABLED:
        logger.info("=> Using FP16 mode")
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=begin_epoch)
    else:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=begin_epoch)

    # Data loading code
    train_loader_dict = make_dataloader(cfg, is_train=True, distributed=cfg.DISTRIBUTED)
    valid_loader_dict = make_dataloader(cfg, is_train=False, distributed=cfg.DISTRIBUTED)
    for i, (dataset_name, train_loader) in enumerate(train_loader_dict.items()):
        logger.info('Training Loader {}/{}:\n'.format(i + 1, len(train_loader_dict)) + str(train_loader.dataset))
    for i, (dataset_name, valid_loader) in enumerate(valid_loader_dict.items()):
        logger.info('Validation Loader {}/{}:\n'.format(i + 1, len(valid_loader_dict)) + str(valid_loader.dataset))
    # writer_dict['writer'].add_graph(model, (dump_input, ))

    """ Start training """
    start_time = time.time()
    with torch.autograd.set_detect_anomaly(True):
        for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
            epoch_start_time = time.time()
            # shuffle datasets with the same random seed
            if cfg.DISTRIBUTED:
                for data_loader in train_loader_dict.values():
                    data_loader.sampler.set_epoch(epoch)

            # train for one epoch
            logger.info('Start training [{}/{}]'.format(epoch, cfg.TRAIN.END_EPOCH - 1))
            train(epoch, cfg, args, master, train_loader_dict, [model, discriminator], criterion,
                  [optimizer, D_optimizer], final_output_dir, tb_log_dir, writer_dict, logger, device,
                  fp16=cfg.FP16.ENABLED)
            # In PyTorch 1.1.0 and later, `lr_scheduler.step()` should be called after `optimizer.step()`.
            lr_scheduler.step()

            # evaluate on validation set
            if not cfg.WITHOUT_EVAL:
                logger.info('Start evaluating [{}/{}]'.format(epoch, cfg.TRAIN.END_EPOCH - 1))
                with torch.no_grad():
                    recorder = validate(cfg, args, master, valid_loader_dict, [model, discriminator],
                                        criterion, final_output_dir, tb_log_dir, writer_dict, logger, device)
                val_total_loss = recorder.avg_total_loss
                if val_total_loss < best_perf:
                    logger.info('This epoch yielded a better model with total loss {:.4f} < {:.4f}.'.format(
                        val_total_loss, best_perf))
                    best_perf = val_total_loss
                    best_model = True
                else:
                    best_model = False
            else:
                val_total_loss = 0
                best_model = True

            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch,
                'model': cfg.EXP_NAME + '.' + cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'D_state_dict': discriminator.state_dict(),
                'loss': val_total_loss,
                'optimizer': optimizer.state_dict(),
                'D_optimizer': D_optimizer.state_dict(),
                'train_global_steps': writer_dict['train_global_steps'],
                'valid_global_steps': writer_dict['valid_global_steps']
            }, best_model, final_output_dir)

            print('\nEpoch {} spent {:.2f} hours\n'.format(epoch, (time.time() - epoch_start_time) / 3600))
            # if epoch == 3: break

    if master:
        final_model_state_file = os.path.join(final_output_dir, 'final_state{}.pth.tar'.format(gpus))
        logger.info('=> saving final model state to {}'.format(final_model_state_file))
        torch.save(model.state_dict(), final_model_state_file)
        writer_dict['writer'].close()
        print('\n[Training Accomplished] {} epochs spent {:.2f} hours\n'.format(
            cfg.TRAIN.END_EPOCH - begin_epoch + 1, (time.time() - start_time) / 3600))
def main():
    if config.output_dir is None:
        config.output_dir = 'output'
    if config.restart_training:
        shutil.rmtree(config.output_dir, ignore_errors=True)
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    logger = setup_logger(os.path.join(config.output_dir, 'train_log'))
    logger.info(config.print())

    torch.manual_seed(config.seed)  # set random seed for CPU
    if config.gpu_id is not None and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logger.info('train with gpu {} and pytorch {}'.format(config.gpu_id, torch.__version__))
        device = torch.device("cuda:0")
        torch.cuda.manual_seed(config.seed)  # set random seed for the current GPU
        torch.cuda.manual_seed_all(config.seed)  # set random seed for all GPUs
    else:
        logger.info('train with cpu and pytorch {}'.format(torch.__version__))
        device = torch.device("cpu")

    train_data = TibetanDataset(config.json_path, data_shape=config.data_shape, n=config.n, m=config.m,
                                transform=transforms.ToTensor(), base_path=config.base_path)
    train_loader = Data.DataLoader(dataset=train_data, batch_size=config.train_batch_size, shuffle=True,
                                   num_workers=int(config.workers))

    writer = SummaryWriter(config.output_dir)
    model = PSENet(backbone=config.backbone, pretrained=config.pretrained, result_num=config.n, scale=config.scale)
    if not config.pretrained and not config.restart_training:
        model.apply(weights_init)

    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    # dummy_input = torch.autograd.Variable(torch.Tensor(1, 3, 600, 800).to(device))
    # writer.add_graph(models=models, input_to_model=dummy_input)

    criterion = PSELoss(Lambda=config.Lambda, ratio=config.OHEM_ratio, reduction='mean')
    # optimizer = torch.optim.SGD(models.parameters(), lr=config.lr, momentum=0.99)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if config.checkpoint != '' and not config.restart_training:
        start_epoch = load_checkpoint(config.checkpoint, model, logger, device, optimizer)
        start_epoch += 1
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step,
                                                         gamma=config.lr_gamma, last_epoch=start_epoch)
    else:
        start_epoch = config.start_epoch
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, config.lr_decay_step, gamma=config.lr_gamma)

    all_step = len(train_loader)
    logger.info('train dataset has {} samples,{} in dataloader'.format(train_data.__len__(), all_step))
    epoch = 0
    best_model = {'recall': 0, 'precision': 0, 'f1': 0, 'models': ''}
    try:
        for epoch in range(start_epoch, config.epochs):
            start = time.time()
            train_loss, lr = train_epoch(model, optimizer, scheduler, train_loader, device, criterion,
                                         epoch, all_step, writer, logger)
            logger.info('[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}'.format(
                epoch, config.epochs, train_loss, time.time() - start, lr))
            # net_save_path = '{}/PSENet_{}_loss{:.6f}.pth'.format(config.output_dir, epoch, train_loss)
            # save_checkpoint(net_save_path, models, optimizer, epoch, logger)
            if (0.3 < train_loss < 0.4 and epoch % 4 == 0) or train_loss < 0.3:
                recall, precision, f1 = merge_eval(model=model,
                                                   save_path=os.path.join(config.output_dir, 'output'),
                                                   test_path=config.testroot, device=device,
                                                   base_path=config.base_path, use_sub=config.use_sub)
                logger.info('test: recall: {:.6f}, precision: {:.6f}, f1: {:.6f}'.format(recall, precision, f1))

                net_save_path = '{}/PSENet_{}_loss{:.6f}_r{:.6f}_p{:.6f}_f1{:.6f}.pth'.format(
                    config.output_dir, epoch, train_loss, recall, precision, f1)
                save_checkpoint(net_save_path, model, optimizer, epoch, logger)
                if f1 > best_model['f1']:
                    best_path = glob.glob(config.output_dir + '/Best_*.pth')
                    for b_path in best_path:
                        if os.path.exists(b_path):
                            os.remove(b_path)
                    best_model['recall'] = recall
                    best_model['precision'] = precision
                    best_model['f1'] = f1
                    best_model['models'] = net_save_path
                    best_save_path = '{}/Best_{}_r{:.6f}_p{:.6f}_f1{:.6f}.pth'.format(
                        config.output_dir, epoch, recall, precision, f1)
                    if os.path.exists(net_save_path):
                        shutil.copyfile(net_save_path, best_save_path)
                    else:
                        save_checkpoint(best_save_path, model, optimizer, epoch, logger)
                    pse_path = glob.glob(config.output_dir + '/PSENet_*.pth')
                    for p_path in pse_path:
                        if os.path.exists(p_path):
                            os.remove(p_path)
                writer.add_scalar(tag='Test/recall', scalar_value=recall, global_step=epoch)
                writer.add_scalar(tag='Test/precision', scalar_value=precision, global_step=epoch)
                writer.add_scalar(tag='Test/f1', scalar_value=f1, global_step=epoch)
        writer.close()
    except KeyboardInterrupt:
        save_checkpoint('{}/final.pth'.format(config.output_dir), model, optimizer, epoch, logger)
    finally:
        if best_model['models']:
            logger.info(best_model)
def main():
    tf.set_random_seed(1234)
    np.random.seed(1234)

    # Load MNIST
    data_path = os.path.join('data', 'mnist.pkl.gz')
    x_train, t_train, x_valid, t_valid, x_test, t_test = \
        dataset.load_mnist_realval(data_path)
    x_train = np.vstack([x_train, x_valid])
    x_test = np.random.binomial(1, x_test, size=x_test.shape)
    n_x = x_train.shape[1]
    n_z = FLAGS.n_z

    n_particles = tf.placeholder(tf.int32, shape=[], name='n_particles')
    x_input = tf.placeholder(tf.float32, shape=[None, n_x], name='x')
    x = tf.to_int32(tf.random_uniform(tf.shape(x_input)) <= x_input)
    n = tf.shape(x)[0]

    qz = q_net(x, n_z, n_particles)
    # log_qz = qz.log_prob(qz)
    model, _ = vae({'x': x, 'z': qz}, n, n_x, n_z, n_particles)
    log_px_qz = model.local_log_prob('x')
    eq_ll = tf.reduce_mean(log_px_qz)
    kl = kl_normal_normal(qz.distribution.mean, qz.distribution.logstd, 0., 0.)
    kl_term = tf.reduce_mean(tf.reduce_sum(kl, -1))
    lower_bound = eq_ll - kl_term
    cost = -lower_bound
    # log_pz = model.local_log_prob('z')
    # kl_term_est = tf.reduce_mean(log_qz - log_pz)
    # cost = kl_term

    learning_rate_ph = tf.placeholder(tf.float32, shape=[], name='lr')
    optimizer = tf.train.AdamOptimizer(learning_rate_ph, beta1=0.5)
    infer_op = optimizer.minimize(cost)

    # Generate images
    n_gen = 100
    _, x_logits = vae({}, n_gen, n_x, n_z, 1)
    x_gen = tf.reshape(tf.sigmoid(x_logits), [-1, 28, 28, 1])

    # Define training parameters
    lb_samples = 1
    learning_rate = 1e-4
    epochs = 3000
    batch_size = 128
    iters = x_train.shape[0] // batch_size
    save_image_freq = 10
    save_model_freq = 100
    test_freq = 10
    test_batch_size = 400
    test_iters = x_test.shape[0] // test_batch_size
    result_path = "results/vae_conv_{}_".format(n_z) + \
        time.strftime("%Y%m%d_%H%M%S")

    saver = tf.train.Saver(max_to_keep=10)
    logger = setup_logger('vae_conv', __file__, result_path)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Restore from the latest checkpoint
        ckpt_file = tf.train.latest_checkpoint(result_path)
        begin_epoch = 1
        if ckpt_file is not None:
            logger.info('Restoring model from {}...'.format(ckpt_file))
            begin_epoch = int(ckpt_file.split('.')[-2]) + 1
            saver.restore(sess, ckpt_file)

        for epoch in range(begin_epoch, epochs + 1):
            time_epoch = -time.time()
            np.random.shuffle(x_train)
            lbs = []
            for t in range(iters):
                x_batch = x_train[t * batch_size:(t + 1) * batch_size]
                _, lb = sess.run(
                    [infer_op, lower_bound],
                    feed_dict={x_input: x_batch,
                               learning_rate_ph: learning_rate,
                               n_particles: lb_samples})
                lbs.append(lb)
            time_epoch += time.time()
            logger.info('Epoch {} ({:.1f}s): Lower bound = {}'.format(epoch, time_epoch, np.mean(lbs)))

            if epoch % test_freq == 0:
                time_test = -time.time()
                test_lbs = []
                for t in range(test_iters):
                    test_x_batch = x_test[t * test_batch_size:(t + 1) * test_batch_size]
                    test_lb = sess.run(lower_bound,
                                       feed_dict={x: test_x_batch,
                                                  n_particles: lb_samples})
                    test_lbs.append(test_lb)
                time_test += time.time()
                logger.info('>>> TEST ({:.1f}s)'.format(time_test))
                logger.info('>> Test lower bound = {}'.format(np.mean(test_lbs)))

            if epoch % save_image_freq == 0:
                logger.info('Saving images...')
                images = sess.run(x_gen)
                name = os.path.join(result_path, "vae.epoch.{}.png".format(epoch))
                save_image_collections(images, name)

            if epoch % save_model_freq == 0:
                logger.info('Saving model...')
                save_path = os.path.join(result_path, "vae.epoch.{}.ckpt".format(epoch))
                if not os.path.exists(os.path.dirname(save_path)):
                    os.makedirs(os.path.dirname(save_path))
                saver.save(sess, save_path)
                logger.info('Done')
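For reference, the analytic KL term computed above by kl_normal_normal with a zero mean and zero log-std second argument is the closed-form KL between q(z|x) = N(mu, sigma^2) and the standard normal prior, KL = -log(sigma) + (sigma^2 + mu^2)/2 - 1/2, summed over latent dimensions. A small numpy sketch of that elementwise formula (assuming kl_normal_normal follows this convention):

# Elementwise KL( N(mean, exp(logstd)^2) || N(0, 1) ); illustration only.
import numpy as np

def kl_normal_standard(mean, logstd):
    std = np.exp(logstd)
    return -logstd + 0.5 * (std ** 2 + mean ** 2) - 0.5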