if h_new > 0.25 * h: rho *= 10 else: break w_est, h = w_new, h_new alpha += rho * h if h <= h_tol or rho >= rho_max: break W_est = _adj(w_est) W_est[np.abs(W_est) < w_threshold] = 0 return W_est if __name__ == '__main__': import utils utils.set_random_seed(1) n, d, s0, graph_type, sem_type = 100, 20, 20, 'ER', 'gauss' B_true = utils.simulate_dag(d, s0, graph_type) W_true = utils.simulate_parameter(B_true) np.savetxt('W_true.csv', W_true, delimiter=',') X = utils.simulate_linear_sem(W_true, n, sem_type) np.savetxt('X.csv', X, delimiter=',') W_est = notears_linear(X, lambda1=0.1, loss_type='l2') assert utils.is_dag(W_est) np.savetxt('W_est.csv', W_est, delimiter=',') acc = utils.count_accuracy(B_true, W_est != 0) print(acc)
def main():
    """Entry point for Predictor/Corrector (SFTMD-style) SR training.

    Parses the -opt YAML, sets seeds, optionally initializes distributed
    training, builds train/val dataloaders, then runs the train loop with
    periodic logging, validation (PSNR) and checkpointing.
    """
    #### setup options of three networks
    parser = argparse.ArgumentParser()
    parser.add_argument("-opt", type=str, help="Path to option YMAL file.")
    parser.add_argument("--launcher", choices=["none", "pytorch"], default="none", help="job launcher")
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    # choose small opt for SFTMD test, fill path of pre-trained model_F
    #### set random seed
    seed = opt["train"]["manual_seed"]
    if seed is None:
        # no manual seed configured: draw one so the run is still seeded
        seed = random.randint(1, 10000)
    util.set_random_seed(seed)

    # load PCA matrix of enough kernel
    print("load PCA matrix")
    pca_matrix = torch.load(opt["pca_matrix_path"], map_location=lambda storage, loc: storage)
    print("PCA matrix shape: {}".format(pca_matrix.shape))

    #### distributed training settings
    if args.launcher == "none":  # disabled distributed training
        opt["dist"] = False
        # NOTE(review): duplicated assignment below — harmless but likely a copy/paste leftover
        opt["dist"] = False
        rank = -1
        print("Disabled distributed training.")
    else:
        opt["dist"] = True
        # NOTE(review): duplicated assignment — same as above
        opt["dist"] = True
        init_dist()
        world_size = (
            torch.distributed.get_world_size()
        )  # Returns the number of processes in the current process group
        rank = torch.distributed.get_rank()  # Returns the rank of current process group
    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    ###### Predictor&Corrector train ######

    #### loading resume state if exists
    if opt["path"].get("resume_state", None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt["path"]["resume_state"],
            map_location=lambda storage, loc: storage.cuda(device_id),
        )
        option.check_resume(opt, resume_state["iter"])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0-7)
        if resume_state is None:
            # Predictor path
            util.mkdir_and_rename(
                opt["path"]["experiments_root"])  # rename experiment folder if exists
            util.mkdirs(
                (path for key, path in opt["path"].items()
                 if not key == "experiments_root" and "pretrain_model" not in key
                 and "resume" not in key))
        # refresh the ./log symlink so it points at the current experiment
        os.system("rm ./log")
        os.symlink(os.path.join(opt["path"]["experiments_root"], ".."), "./log")
        # config loggers. Before it, the log will not work
        util.setup_logger(
            "base",
            opt["path"]["log"],
            "train_" + opt["name"],
            level=logging.INFO,
            screen=False,
            tofile=True,
        )
        util.setup_logger(
            "val",
            opt["path"]["log"],
            "val_" + opt["name"],
            level=logging.INFO,
            screen=False,
            tofile=True,
        )
        logger = logging.getLogger("base")
        logger.info(option.dict2str(opt))
        # tensorboard logger
        if opt["use_tb_logger"] and "debug" not in opt["name"]:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    "You are using PyTorch {}. Tensorboard will use [tensorboardX]"
                    .format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(
                log_dir="log/{}/tb_logger/".format(opt["name"]))
    else:
        util.setup_logger("base", opt["path"]["log"], "train",
                          level=logging.INFO, screen=False)
        logger = logging.getLogger("base")

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    for phase, dataset_opt in opt["datasets"].items():
        if phase == "train":
            train_set = create_dataset(dataset_opt)
            train_size = int(
                math.ceil(len(train_set) / dataset_opt["batch_size"]))
            total_iters = int(opt["train"]["niter"])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt["dist"]:
                train_sampler = DistIterSampler(train_set, world_size, rank,
                                                dataset_ratio)
                # sampler repeats the dataset dataset_ratio times per epoch
                total_epochs = int(
                    math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt,
                                             train_sampler)
            if rank <= 0:
                logger.info("Number of train images: {:,d}, iters: {:,d}".format(
                    len(train_set), train_size))
                logger.info("Total epochs needed: {:d} for iters {:,d}".format(
                    total_epochs, total_iters))
        elif phase == "val":
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info("Number of val images in [{:s}]: {:d}".format(
                    dataset_opt["name"], len(val_set)))
        else:
            raise NotImplementedError(
                "Phase [{:s}] is not recognized.".format(phase))
    assert train_loader is not None
    assert val_loader is not None

    #### create model
    model = create_model(opt)  # load pretrained model of SFTMD

    #### resume training
    if resume_state:
        logger.info("Resuming training from epoch: {}, iter: {}.".format(
            resume_state["epoch"], resume_state["iter"]))
        start_epoch = resume_state["epoch"]
        current_step = resume_state["iter"]
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    # degradation preprocessor: produces LR images + kernel maps from GT batches
    prepro = util.SRMDPreprocessing(scale=opt["scale"],
                                    pca_matrix=pca_matrix,
                                    cuda=True,
                                    **opt["degradation"])

    #### training
    logger.info("Start training from epoch: {:d}, iter: {:d}".format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt["dist"]:
            train_sampler.set_epoch(epoch)
        for _, train_data in enumerate(train_loader):
            current_step += 1
            if current_step > total_iters:
                break
            LR_img, ker_map = prepro(train_data["GT"])
            # quantize to 8-bit values to mimic real stored LR images
            LR_img = (LR_img * 255).round() / 255

            model.feed_data(LR_img, train_data["GT"], ker_map)
            model.optimize_parameters(current_step)
            model.update_learning_rate(current_step,
                                       warmup_iter=opt["train"]["warmup_iter"])
            visuals = model.get_current_visuals()

            if current_step % opt["logger"]["print_freq"] == 0:
                logs = model.get_current_log()
                message = "<epoch:{:3d}, iter:{:8,d}, lr:{:.3e}> ".format(
                    epoch, current_step, model.get_current_learning_rate())
                for k, v in logs.items():
                    message += "{:s}: {:.4e} ".format(k, v)
                    # tensorboard logger
                    if opt["use_tb_logger"] and "debug" not in opt["name"]:
                        if rank <= 0:
                            tb_logger.add_scalar(k, v, current_step)
                if rank == 0:
                    logger.info(message)

            # validation, to produce ker_map_list(fake)
            if current_step % opt["train"]["val_freq"] == 0 and rank <= 0:
                avg_psnr = 0.0
                idx = 0
                for _, val_data in enumerate(val_loader):
                    # LR_img, ker_map = prepro(val_data['GT'])
                    LR_img = val_data["LQ"]
                    lr_img = util.tensor2img(LR_img)  # save LR image for reference

                    # valid Predictor
                    model.feed_data(LR_img, val_data["GT"])
                    model.test()
                    visuals = model.get_current_visuals()

                    # Save images for reference
                    img_name = os.path.splitext(
                        os.path.basename(val_data["LQ_path"][0]))[0]
                    img_dir = os.path.join(opt["path"]["val_images"], img_name)
                    # img_dir = os.path.join(opt['path']['val_images'], str(current_step), '_', str(step))
                    util.mkdir(img_dir)
                    save_lr_path = os.path.join(img_dir,
                                                "{:s}_LR.png".format(img_name))
                    util.save_img(lr_img, save_lr_path)

                    sr_img = util.tensor2img(visuals["SR"])  # uint8
                    gt_img = util.tensor2img(visuals["GT"])  # uint8

                    save_img_path = os.path.join(
                        img_dir, "{:s}_{:d}.png".format(img_name, current_step))
                    util.save_img(sr_img, save_img_path)

                    # calculate PSNR on border-cropped images (scale-sized crop)
                    crop_size = opt["scale"]
                    gt_img = gt_img / 255.0
                    sr_img = sr_img / 255.0
                    cropped_sr_img = sr_img[crop_size:-crop_size,
                                            crop_size:-crop_size, :]
                    cropped_gt_img = gt_img[crop_size:-crop_size,
                                            crop_size:-crop_size, :]
                    avg_psnr += util.calculate_psnr(cropped_sr_img * 255,
                                                    cropped_gt_img * 255)
                    idx += 1
                avg_psnr = avg_psnr / idx

                # log
                logger.info("# Validation # PSNR: {:.6f}".format(avg_psnr))
                logger_val = logging.getLogger("val")  # validation logger
                logger_val.info(
                    "<epoch:{:3d}, iter:{:8,d}, psnr: {:.6f}".format(
                        epoch, current_step, avg_psnr))
                # tensorboard logger
                if opt["use_tb_logger"] and "debug" not in opt["name"]:
                    tb_logger.add_scalar("psnr", avg_psnr, current_step)

            #### save models and training states
            if current_step % opt["logger"]["save_checkpoint_freq"] == 0:
                if rank <= 0:
                    logger.info("Saving models and training states.")
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info("Saving the final model.")
        model.save("latest")
        logger.info("End of Predictor and Corrector training.")
        # NOTE(review): tb_logger is only defined when use_tb_logger is on and
        # the name is not a debug run — this close() would raise otherwise; verify.
        tb_logger.close()
def worker_init_fn(worker_id, num_threads=1):
    """Initialize a DataLoader worker process.

    Seeds the worker's RNGs from its worker id (so each worker produces an
    independent but reproducible stream) and caps its CPU thread count.

    Parameters
    ----------
    worker_id : int
        Id of the worker subprocess, used directly as the random seed.
    num_threads : int, optional
        Number of CPU threads the worker may use (default 1).
    """
    # Each worker gets a distinct, deterministic seed.
    seed = worker_id
    utils.set_random_seed(seed)
    utils.reset_cpu_threads(num_threads)
def main(rank, args):
    """Train a DGMG molecule-generation model in one (sub)process.

    Parameters
    ----------
    rank : int
        Subprocess id
    args : dict
        Configuration
    """
    if rank == 0:
        t1 = time.time()

    set_random_seed(args['seed'])
    # Remove the line below will result in problems for multiprocess
    torch.set_num_threads(1)

    # Setup dataset and data loader; each rank works on its own subset
    dataset = MoleculeDataset(args['dataset'],
                              args['order'], ['train', 'val'],
                              subset_id=rank,
                              n_subsets=args['num_processes'])

    # Note that currently the batch size for the loaders should only be 1.
    train_loader = DataLoader(dataset.train_set,
                              batch_size=args['batch_size'],
                              shuffle=True,
                              collate_fn=dataset.collate)
    val_loader = DataLoader(dataset.val_set,
                            batch_size=args['batch_size'],
                            shuffle=True,
                            collate_fn=dataset.collate)

    if rank == 0:
        # Only the master process logs to tensorboard / stdout printers
        try:
            from tensorboardX import SummaryWriter
            writer = SummaryWriter(args['log_dir'])
        except ImportError:
            print('If you want to use tensorboard, install tensorboardX with pip.')
            writer = None
        train_printer = Printer(args['nepochs'], len(dataset.train_set),
                                args['batch_size'], writer)
        val_printer = Printer(args['nepochs'], len(dataset.val_set),
                              args['batch_size'])
    else:
        val_printer = None

    # Initialize model
    model = DGMG(atom_types=dataset.atom_types,
                 bond_types=dataset.bond_types,
                 node_hidden_size=args['node_hidden_size'],
                 num_prop_rounds=args['num_propagation_rounds'],
                 dropout=args['dropout'])

    # Pick single-process or gradient-averaging multi-process optimizer wrapper
    if args['num_processes'] == 1:
        from utils import Optimizer
        optimizer = Optimizer(args['lr'], Adam(model.parameters(), lr=args['lr']))
    else:
        from utils import MultiProcessOptimizer
        optimizer = MultiProcessOptimizer(
            args['num_processes'], args['lr'],
            Adam(model.parameters(), lr=args['lr']))

    if rank == 0:
        t2 = time.time()
    best_val_prob = 0

    # Training
    for epoch in range(args['nepochs']):
        model.train()
        if rank == 0:
            print('Training')

        for i, data in enumerate(train_loader):
            # Model returns the log-likelihood of the action sequence
            log_prob = model(actions=data, compute_log_prob=True)
            prob = log_prob.detach().exp()

            loss_averaged = -log_prob
            prob_averaged = prob
            optimizer.backward_and_step(loss_averaged)
            if rank == 0:
                train_printer.update(epoch + 1, loss_averaged.item(),
                                     prob_averaged.item())

        synchronize(args['num_processes'])

        # Validation
        val_log_prob = evaluate(epoch, model, val_loader, val_printer)
        if args['num_processes'] > 1:
            # Average the per-rank validation log-likelihoods
            dist.all_reduce(val_log_prob, op=dist.ReduceOp.SUM)
            val_log_prob /= args['num_processes']
        # Strictly speaking, the computation of probability here is different from what is
        # performed on the training set as we first take an average of log likelihood and then
        # take the exponentiation. By Jensen's inequality, the resulting value is then a
        # lower bound of the real probabilities.
        val_prob = (-val_log_prob).exp().item()
        val_log_prob = val_log_prob.item()

        if val_prob >= best_val_prob:
            if rank == 0:
                # Checkpoint on improvement (master process only)
                torch.save({'model_state_dict': model.state_dict()},
                           args['checkpoint_dir'])
                print('Old val prob {:.10f} | new val prob {:.10f} | model saved'
                      .format(best_val_prob, val_prob))
            best_val_prob = val_prob
        elif epoch >= args['warmup_epochs']:
            # No improvement after warmup: decay the learning rate
            optimizer.decay_lr()

        if rank == 0:
            print('Validation')
            if writer is not None:
                writer.add_scalar('validation_log_prob', val_log_prob, epoch)
                writer.add_scalar('validation_prob', val_prob, epoch)
                writer.add_scalar('lr', optimizer.lr, epoch)
            print('Validation log prob {:.4f} | prob {:.10f}'.format(
                val_log_prob, val_prob))
        synchronize(args['num_processes'])

    if rank == 0:
        t3 = time.time()
        print('It took {} to setup.'.format(datetime.timedelta(seconds=t2 - t1)))
        print('It took {} to finish training.'.format(
            datetime.timedelta(seconds=t3 - t2)))
        print(
            '--------------------------------------------------------------------------'
        )
        print('On average, an epoch takes {}.'.format(
            datetime.timedelta(seconds=(t3 - t2) / args['nepochs'])))
def main():
    """Run the active-learning loop: repeatedly query new labels, train, and
    evaluate closed-set accuracy for each budget in the configured schedule."""
    config = get_config()
    if not config.use_random_seed:
        set_random_seed(1)

    # A list of budgets to query, e.g. [100, 200, 300].
    # Then the labeled pool will obtain 100 new samples each round, until 300 budgets are all used.
    budget_list = get_budget_list_from_config(config)

    # It contains all directory/save_paths that will be used
    paths_dict = prepare_active_learning_dir_from_config(config, budget_list)

    dataset_info = prepare_dataset_from_config(
        config,
        paths_dict['data_download_path'],
        paths_dict['data_save_path']
    )

    # NOTE(review): time_stamp is computed but never used below — verify intent
    time_stamp = time.strftime("%Y-%m-%d %H:%M")

    # Save the train set details for later analysis
    if not os.path.exists(paths_dict['trainset_info_path']):
        torch.save(
            dataset_info.trainset_info,
            paths_dict['trainset_info_path']
        )

    # The training configurations including backbone architecture, lr, batch size..
    trainer_config = get_trainer_config(
        config.data,
        config.training_method,
        config.train_mode
    )

    discovered_samples = dataset_info.discovered_samples
    discovered_classes = dataset_info.discovered_classes

    # Trainer is the main class for training and querying
    trainer = Trainer(
        training_method=config.training_method,
        trainer_config=trainer_config,
        dataset_info=dataset_info
    )

    for i, b in enumerate(budget_list):
        # b is the budget for independent mode, need to adjust it for sequential mode
        # (sequential budgets are cumulative, so query only the increment)
        if config.active_query_scheme == 'sequential':
            if i > 0:
                budget = b - budget_list[i-1]
            else:
                budget = b
        else:
            budget = b

        new_discovered_samples, new_discovered_classes = trainer.query(
            discovered_samples,
            discovered_classes,
            budget=budget,
            query_method=config.query_method,
            query_result_path=paths_dict['active_query_results'][b],
            verbose=config.verbose
        )

        if config.active_query_scheme == 'sequential':
            # Carry the grown labeled pool into the next round
            print("Using sequential mode, we updated the discovered samples")
            discovered_samples, discovered_classes = new_discovered_samples, new_discovered_classes
        else:
            print("Using independent mode, we do not update the initial labeled pool.")

        trainer.train(
            new_discovered_samples,
            new_discovered_classes,
            ckpt_path=paths_dict['active_ckpt_results'][b],
            verbose=config.verbose
        )

        # NOTE(review): return value is unbound beyond this statement — result is
        # presumably persisted via result_path; confirm
        closed_set_test_acc = trainer.eval_closed_set(
            new_discovered_classes,
            result_path=paths_dict['active_test_results'][b],
            verbose=config.verbose
        )
def main():
    """Train an MFVQE/STDF video-quality-enhancement model.

    Reads options, optionally initializes distributed training, builds
    train/val dataloaders with CPU prefetchers, optionally pre-evaluates the
    unenhanced input, then runs iteration-based training with periodic
    logging, checkpointing and validation.
    """
    # ==========
    # parameters
    # ==========
    opts_dict = receive_arg()
    rank = opts_dict['train']['rank']
    unit = opts_dict['train']['criterion']['unit']
    num_iter = int(opts_dict['train']['num_iter'])
    interval_print = int(opts_dict['train']['interval_print'])
    interval_val = int(opts_dict['train']['interval_val'])

    # ==========
    # init distributed training
    # ==========
    if opts_dict['train']['is_dist']:
        utils.init_dist(local_rank=rank, backend='nccl')

    # TO-DO: load resume states if exists
    pass

    # ==========
    # create logger
    # ==========
    if rank == 0:
        log_dir = op.join("exp", opts_dict['train']['exp_name'])
        utils.mkdir(log_dir)
        log_fp = open(opts_dict['train']['log_path'], 'w')

        # log all parameters
        msg = (f"{'<' * 10} Hello {'>' * 10}\n"
               f"Timestamp: [{utils.get_timestr()}]\n"
               f"\n{'<' * 10} Options {'>' * 10}\n"
               f"{utils.dict2str(opts_dict)}")
        print(msg)
        log_fp.write(msg + '\n')
        log_fp.flush()

    # ==========
    # TO-DO: init tensorboard
    # ==========
    pass

    # ==========
    # fix random seed
    # ==========
    seed = opts_dict['train']['random_seed']
    # >I don't know why should rs + rank
    # (per-rank offset presumably decorrelates data augmentation across GPUs)
    utils.set_random_seed(seed + rank)

    # ==========
    # Ensure reproducibility or Speed up
    # Ensure reproducibility or Speed up
    # ==========
    #torch.backends.cudnn.benchmark = False  # if reproduce
    #torch.backends.cudnn.deterministic = True  # if reproduce
    torch.backends.cudnn.benchmark = True  # speed up

    # ==========
    # create train and val data prefetchers
    # ==========
    # create datasets
    train_ds_type = opts_dict['dataset']['train']['type']
    val_ds_type = opts_dict['dataset']['val']['type']
    radius = opts_dict['network']['radius']
    assert train_ds_type in dataset.__all__, \
        "Not implemented!"
    assert val_ds_type in dataset.__all__, \
        "Not implemented!"
    train_ds_cls = getattr(dataset, train_ds_type)
    val_ds_cls = getattr(dataset, val_ds_type)
    train_ds = train_ds_cls(opts_dict=opts_dict['dataset']['train'],
                            radius=radius)
    val_ds = val_ds_cls(opts_dict=opts_dict['dataset']['val'],
                        radius=radius)

    # create datasamplers
    train_sampler = utils.DistSampler(
        dataset=train_ds,
        num_replicas=opts_dict['train']['num_gpu'],
        rank=rank,
        ratio=opts_dict['dataset']['train']['enlarge_ratio'])
    val_sampler = None  # no need to sample val data

    # create dataloaders
    train_loader = utils.create_dataloader(
        dataset=train_ds,
        opts_dict=opts_dict,
        sampler=train_sampler,
        phase='train',
        seed=opts_dict['train']['random_seed'])
    val_loader = utils.create_dataloader(dataset=val_ds,
                                         opts_dict=opts_dict,
                                         sampler=val_sampler,
                                         phase='val')
    assert train_loader is not None

    batch_size = opts_dict['dataset']['train']['batch_size_per_gpu'] * \
        opts_dict['train']['num_gpu']  # divided by all GPUs
    num_iter_per_epoch = math.ceil(len(train_ds) * \
        opts_dict['dataset']['train']['enlarge_ratio'] / batch_size)
    num_epoch = math.ceil(num_iter / num_iter_per_epoch)
    val_num = len(val_ds)

    # create dataloader prefetchers
    tra_prefetcher = utils.CPUPrefetcher(train_loader)
    val_prefetcher = utils.CPUPrefetcher(val_loader)

    # ==========
    # create model
    # ==========
    model = MFVQE(opts_dict=opts_dict['network'])
    model = model.to(rank)
    # model.load_state_dict(torch.load('~/STDF-PyTorch/Code/exp/QP27/MFQEv2_R3_enlarge300x/ckp_220000.pt')['state_dict'])
    if opts_dict['train']['is_dist']:
        model = DDP(model, device_ids=[rank])

    """
    # load pre-trained generator
    ckp_path = opts_dict['network']['stdf']['load_path']
    checkpoint = torch.load(ckp_path)
    state_dict = checkpoint['state_dict']
    if ('module.' in list(state_dict.keys())[0]) and (not opts_dict['train']['is_dist']):
        # multi-gpu pre-trained -> single-gpu training
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove module
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        print(f'loaded from {ckp_path}')
    elif ('module.' not in list(state_dict.keys())[0]) and (opts_dict['train']['is_dist']):
        # single-gpu pre-trained -> multi-gpu training
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = 'module.' + k  # add module
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        print(f'loaded from {ckp_path}')
    else:  # the same way of training
        model.load_state_dict(state_dict)
        print(f'loaded from {ckp_path}')
    """

    # ==========
    # define loss func & optimizer & scheduler & scheduler & criterion
    # ==========
    # define loss func
    assert opts_dict['train']['loss'].pop('type') == 'CharbonnierLoss', \
        "Not implemented."
    loss_func = utils.CharbonnierLoss(**opts_dict['train']['loss'])

    # define optimizer
    assert opts_dict['train']['optim'].pop('type') == 'Adam', \
        "Not implemented."
    optimizer = optim.Adam(model.parameters(), **opts_dict['train']['optim'])

    # define scheduler
    if opts_dict['train']['scheduler']['is_on']:
        assert opts_dict['train']['scheduler'].pop('type') == \
            'CosineAnnealingRestartLR', "Not implemented."
        # 'is_on' is removed before **-expansion, then restored afterwards
        del opts_dict['train']['scheduler']['is_on']
        scheduler = utils.CosineAnnealingRestartLR(
            optimizer, **opts_dict['train']['scheduler'])
        opts_dict['train']['scheduler']['is_on'] = True

    # define criterion
    assert opts_dict['train']['criterion'].pop('type') == \
        'PSNR', "Not implemented."
    criterion = utils.PSNR()

    #
    start_iter = 0  # should be restored
    start_epoch = start_iter // num_iter_per_epoch

    # display and log
    if rank == 0:
        msg = (f"\n{'<' * 10} Dataloader {'>' * 10}\n"
               f"total iters: [{num_iter}]\n"
               f"total epochs: [{num_epoch}]\n"
               f"iter per epoch: [{num_iter_per_epoch}]\n"
               f"val sequence: [{val_num}]\n"
               f"start from iter: [{start_iter}]\n"
               f"start from epoch: [{start_epoch}]")
        print(msg)
        log_fp.write(msg + '\n')
        log_fp.flush()

    # ==========
    # evaluate original performance, e.g., PSNR before enhancement
    # ==========
    vid_num = val_ds.get_vid_num()
    if opts_dict['train']['pre-val'] and rank == 0:
        msg = f"\n{'<' * 10} Pre-evaluation {'>' * 10}"
        print(msg)
        log_fp.write(msg + '\n')

        # one running-average counter per validation video
        per_aver_dict = {}
        for i in range(vid_num):
            per_aver_dict[i] = utils.Counter()
        pbar = tqdm(total=val_num, ncols=opts_dict['train']['pbar_len'])

        # fetch the first batch
        val_prefetcher.reset()
        val_data = val_prefetcher.next()
        while val_data is not None:
            # get data
            gt_data = val_data['gt'].to(rank)  # (B [RGB] H W)
            lq_data = val_data['lq'].to(rank)  # (B T [RGB] H W)
            index_vid = val_data['index_vid'].item()
            name_vid = val_data['name_vid'][0]  # bs must be 1!
            b, _, _, _, _ = lq_data.shape

            # eval: PSNR of the raw (unenhanced) center frame vs GT
            batch_perf = np.mean([
                criterion(lq_data[i, radius, ...], gt_data[i]) for i in range(b)
            ])  # bs must be 1!

            # log
            per_aver_dict[index_vid].accum(volume=batch_perf)

            # display
            pbar.set_description("{:s}: [{:.3f}] {:s}".format(
                name_vid, batch_perf, unit))
            pbar.update()

            # fetch next batch
            val_data = val_prefetcher.next()
        pbar.close()

        # log
        ave_performance = np.mean([
            per_aver_dict[index_vid].get_ave() for index_vid in range(vid_num)
        ])
        msg = "> ori performance: [{:.3f}] {:s}".format(ave_performance, unit)
        print(msg)
        log_fp.write(msg + '\n')
        log_fp.flush()

    if opts_dict['train']['is_dist']:
        torch.distributed.barrier()  # all processes wait for ending

    if rank == 0:
        msg = f"\n{'<' * 10} Training {'>' * 10}"
        print(msg)
        log_fp.write(msg + '\n')

    # create timer
    total_timer = utils.Timer()  # total tra + val time of each epoch

    # ==========
    # start training + validation (test)
    # ==========
    model.train()
    num_iter_accum = start_iter
    for current_epoch in range(start_epoch, num_epoch + 1):
        # shuffle distributed subsamplers before each epoch
        if opts_dict['train']['is_dist']:
            train_sampler.set_epoch(current_epoch)

        # fetch the first batch
        tra_prefetcher.reset()
        train_data = tra_prefetcher.next()

        # train this epoch
        while train_data is not None:
            # over sign
            num_iter_accum += 1
            # NOTE(review): leftover debug print — floods stdout every iteration;
            # consider removing or gating behind interval_print
            print(num_iter_accum)
            if num_iter_accum > num_iter:
                break

            # get data
            gt_data = train_data['gt'].to(rank)  # (B [RGB] H W)
            lq_data = train_data['lq'].to(rank)  # (B T [RGB] H W)
            b, _, c, _, _ = lq_data.shape
            # flatten the temporal axis into channels for the network
            input_data = torch.cat([lq_data[:, :, i, ...] for i in range(c)],
                                   dim=1)  # B [R1 ... R7 G1 ... G7 B1 ... B7] H W
            enhanced_data = model(input_data)

            # get loss
            optimizer.zero_grad()  # zero grad
            loss = torch.mean(
                torch.stack([
                    loss_func(enhanced_data[i], gt_data[i]) for i in range(b)
                ]))  # cal loss
            loss.backward()  # cal grad
            optimizer.step()  # update parameters

            # update learning rate
            if opts_dict['train']['scheduler']['is_on']:
                scheduler.step()  # should after optimizer.step()

            if (num_iter_accum % interval_print == 0) and (rank == 0):
                # display & log
                lr = optimizer.param_groups[0]['lr']
                loss_item = loss.item()
                msg = (f"iter: [{num_iter_accum}]/{num_iter}, "
                       f"epoch: [{current_epoch}]/{num_epoch - 1}, "
                       "lr: [{:.3f}]x1e-4, loss: [{:.4f}]".format(
                           lr * 1e4, loss_item))
                print(msg)
                log_fp.write(msg + '\n')

            if ((num_iter_accum % interval_val == 0) or \
                (num_iter_accum == num_iter)) and (rank == 0):
                # save model
                checkpoint_save_path = (
                    f"{opts_dict['train']['checkpoint_save_path_pre']}"
                    f"{num_iter_accum}"
                    ".pt")
                state = {
                    'num_iter_accum': num_iter_accum,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                if opts_dict['train']['scheduler']['is_on']:
                    state['scheduler'] = scheduler.state_dict()
                torch.save(state, checkpoint_save_path)

                # validation
                with torch.no_grad():
                    per_aver_dict = {}
                    for index_vid in range(vid_num):
                        per_aver_dict[index_vid] = utils.Counter()
                    pbar = tqdm(total=val_num,
                                ncols=opts_dict['train']['pbar_len'])

                    # train -> eval
                    model.eval()

                    # fetch the first batch
                    val_prefetcher.reset()
                    val_data = val_prefetcher.next()
                    while val_data is not None:
                        # get data
                        gt_data = val_data['gt'].to(rank)  # (B [RGB] H W)
                        lq_data = val_data['lq'].to(rank)  # (B T [RGB] H W)
                        index_vid = val_data['index_vid'].item()
                        name_vid = val_data['name_vid'][0]  # bs must be 1!
                        b, _, c, _, _ = lq_data.shape
                        input_data = torch.cat(
                            [lq_data[:, :, i, ...] for i in range(c)],
                            dim=1)  # B [R1 ... R7 G1 ... G7 B1 ... B7] H W
                        enhanced_data = model(input_data)  # (B [RGB] H W)

                        # eval
                        batch_perf = np.mean([
                            criterion(enhanced_data[i], gt_data[i])
                            for i in range(b)
                        ])  # bs must be 1!

                        # display
                        pbar.set_description("{:s}: [{:.3f}] {:s}".format(
                            name_vid, batch_perf, unit))
                        pbar.update()

                        # log
                        per_aver_dict[index_vid].accum(volume=batch_perf)

                        # fetch next batch
                        val_data = val_prefetcher.next()
                    # end of val
                    pbar.close()

                    # eval -> train
                    model.train()

                # log
                ave_per = np.mean([
                    per_aver_dict[index_vid].get_ave()
                    for index_vid in range(vid_num)
                ])
                msg = ("> model saved at {:s}\n"
                       "> ave val per: [{:.3f}] {:s}").format(
                           checkpoint_save_path, ave_per, unit)
                print(msg)
                log_fp.write(msg + '\n')
                log_fp.flush()

                if opts_dict['train']['is_dist']:
                    torch.distributed.barrier()  # all processes wait for ending

            # fetch next batch
            train_data = tra_prefetcher.next()
        # end of this epoch (training dataloader exhausted)
    # end of all epochs

    # ==========
    # final log & close logger
    # ==========
    if rank == 0:
        total_time = total_timer.get_interval() / 3600
        msg = "TOTAL TIME: [{:.1f}] h".format(total_time)
        print(msg)
        log_fp.write(msg + '\n')

        msg = (f"\n{'<' * 10} Goodbye {'>' * 10}\n"
               f"Timestamp: [{utils.get_timestr()}]")
        print(msg)
        log_fp.write(msg + '\n')
        log_fp.close()
import sys sys.path.insert(0, os.getcwd()) import numpy as np import argparse import torch from torch import nn import matplotlib.pyplot as plt import time from utils import check_dir, set_random_seed, accuracy, mIoU, get_logger from models.second_segmentation import Segmentator from data.transforms import get_transforms_binary_segmentation from models.pretraining_backbone import ResNet18Backbone from data.segmentation import DataReaderBinarySegmentation set_random_seed(0) global_step = 0 def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('data_folder', type=str, help="folder containing the data") parser.add_argument('weights_init', type=str, default="ImageNet") parser.add_argument('--output-root', type=str, default='results') parser.add_argument('--lr', type=float, default=0.01, help='learning rate') parser.add_argument('--bs', type=int, default=32, help='batch_size') parser.add_argument('--size', type=int, default=256, help='image size') parser.add_argument('--snapshot-freq', type=int,
parser.add_argument('--batch-size', type=int, default=128) parser.add_argument('--num-epoch', type=int, default=100) parser.add_argument('--lr', type=float, default=3e-4) # Model parser.add_argument('--model-type', type=str, default='basic') parser.add_argument('--likelihood-type', type=str, default='bernoulli') parser.add_argument('--hidden-channels', type=int, default=256) parser.add_argument('--latent-dim', type=int, default=10) parser.add_argument('--num-latents', type=int, default=10) parser.add_argument('--beta', type=float, default=1.0) parser.add_argument('--temperature', type=float, default=0.6) args = parser.parse_args() set_random_seed(args.seed) experiment_root = pathlib.Path('experiments') / args.experiment_name args.experiment_root = str(experiment_root) if not experiment_root.exists(): experiment_root.mkdir() with open(experiment_root / 'config.json', 'w') as f: json.dump(vars(args), f, indent=4, sort_keys=True) experiment_log_path = experiment_root / 'logs' args.experiment_log_path = str(experiment_log_path) if not experiment_log_path.exists(): experiment_log_path.mkdir() experiment_model_path = experiment_root / 'models'
train_vocab = datadir.train_dataset.vocab train_sets = datadir.train_dataset.raw_sets dev_vocab = datadir.dev_dataset.vocab dev_sets = datadir.dev_dataset.raw_sets # # for train # train_flat_word_list, train_word_idxes, train_cluster_idxes = get_word_idxes_and_cluster_idxes(train_sets, train_vocab, word2id) # train_word_embeddings = embedding[np.array(train_word_idxes)] run_times = DataConfig['run_times'] seed_list = DataConfig['seed_list'] ari_list, nmi_list, fmi_list = [], [], [] for i in range(run_times): # TODO : model set_random_seed(seed=seed_list[i]) dev_flat_word_list, dev_word_idxes, dev_cluster_idxes = get_word_idxes_and_cluster_idxes(dev_sets, dev_vocab,word2id) dev_word_embeddings = embedding[np.array(dev_word_idxes)] if method_type == 'kmeans': # set cluster number by prior k_cluster = len(dev_cluster_idxes.keys()) model = Kmeans(n_cluster=k_cluster, seed=seed_list[i]) pred_labels = model.predict(dev_word_embeddings) elif method_type == 'gmms': k_component = len(dev_cluster_idxes.keys()) model = GMMs(n_component=k_component, seed=seed_list[i]) pred_labels = model.predict(dev_word_embeddings) elif method_type == 'ac': k_cluster = len(dev_cluster_idxes.keys()) model = AC(n_cluster=k_cluster)
elif inc_option == 'partial+noisy': config['partial_unk_rate'] = float( para_option.split('-')[1].split('+')[0]) config['noisy_diff_rate'] = float( para_option.split('-')[1].split('+')[1]) print('partial_unk_rate', config['partial_unk_rate']) print('noisy_diff_rate', config['noisy_diff_rate']) config['noisy_lambda'] = 1.0 config['para_option'] = para_option + '-1.0' elif inc_option == 'auxiliary': config['auxiliary_option'] = para_option.split('-')[1] config['k-gram'] = 5 config['k-gram-freq-gate'] = 2 config['inc_lambda'] = 1.0 elif inc_option == 'knowledge': config['k-gram'] = int(para_option.split('-')[1]) config['k-gram-freq-gate'] = 2 config['inc_lambda'] = 1.0 elif inc_option == 'constraints': config['constraint_option'] = para_option.split('-')[1] elif inc_option == 'partial+constraints': config['constraint_option'] = para_option.split('-')[1].split( '+')[0] config['partial_unk_rate'] = float( para_option.split('-')[1].split('+')[1]) print('config', config) set_random_seed(config['seed']) print('incidental option', config['inc_option']) run_test_experiments(config)
write_data(small_large_qamr, small_large_qamr_file, 'small_large_qamr') write_data(test_qamr, test_qamr_file, 'test_qamr') write_data(large_qasrl, large_qasrl_file, 'large_qasrl') write_data(small_large_qasrl, small_large_qasrl_file, 'small_large_qasrl') write_data(test_qasrl, test_qasrl_file, 'test_qasrl') write_data(large_qare, large_qare_file, 'large_qare') write_data(small_large_qare, small_large_qare_file, 'small_large_qare') write_data(test_qare, test_qare_file, 'test_qare') write_data(large_newsqa, large_newsqa_file, 'large_newsqa') write_data(small_large_newsqa, small_large_newsqa_file, 'small_large_newsqa') write_data(test_newsqa, test_newsqa_file, 'test_newsqa') write_data(large_triviaqa, large_triviaqa_file, 'large_triviaqa') write_data(small_large_triviaqa, small_large_triviaqa_file, 'small_large_triviaqa') write_data(test_triviaqa, test_triviaqa_file, 'test_triviaqa') if __name__ == '__main__': set_random_seed(666) # dev_file = 'QA-data/TriviaQA/TriviaQA_squad_dev.json' # unique_dev_file = 'QA-data/TriviaQA/TriviaQA_squad_dev.unique.json' # get_unique_answer_data(dev_file, unique_dev_file) # input_file_list = ['QA-data/TriviaQA/TriviaQA_squad_train.json', 'QA-data/TriviaQA/TriviaQA_squad_dev.unique.json'] # output_file = 'QA-data/TriviaQA/triviaqa.all.json' # option = 'triviaqa' # combine_data(input_file_list, output_file, option) generate_qa_data() # input_file = 'QA-data/xdomain-QA/small_large_triviaqa.json' # get_stats(input_file)
def set_seed(self, seed):
    """Set the agent's random seed and propagate it to its components.

    Parameters
    ----------
    seed : int or None
        New seed. A falsy value (None or 0) leaves everything unchanged.
        # NOTE(review): `if seed:` treats seed == 0 as "no seed" — confirm
        # that 0 is never a legitimate seed for callers.
    """
    if seed:
        self.seed = seed
        set_random_seed(self.seed)
        # also seed the wrapped function approximator with the same value
        self.function_approximator.set_seed(seed)
def init_seed(self, seed):
    """Seed the global RNGs if a seed is given, and return the given seed.

    Parameters
    ----------
    seed : int or None
        Seed to apply; falsy values (None or 0) skip seeding.

    Returns
    -------
    int or None
        The `seed` argument, unchanged.
    """
    if seed:
        # NOTE(review): seeds from self.seed rather than the passed `seed`
        # (compare set_seed, which assigns first) — verify self.seed is
        # already set and intentionally used here.
        set_random_seed(self.seed)
    return seed
else: anchor_img[s] = x[0] yield [ groundings, 1 - groundings, seen_mask, np.array(anchor_aud), np.array(anchor_img), np.array(anchor_labels) ], np.zeros(CURR_BATCH_SIZE) j += BATCH_SIZE // 2 if __name__ == '__main__': set_gpu() random_seed = set_random_seed(int(sys.argv[1]) if len(sys.argv) > 1 else 0) BATCH_SIZE = 512 NUM_EPOCHS = 3 MARGIN = 0.8 path = '/home/venkatk/Experiments/New-Experiment/' #Data/Features/' imgnet_path = '/home/venkatk/Experiments/Audio-Visual-Deep-Multimodal-Networks-master/' # Load Data ------------------------------------------------------------------------- imgnet_audio_train, imgnet_audio_val, imgnet_img_train, imgnet_img_val = get_data( imgnet_path + 'Data/', [ 'audio_features_train', 'audio_features_val', 'image_features_train', 'image_features_val' ]) MAX_LEN = imgnet_audio_train[list(imgnet_audio_train)[0]].shape[1]