def main(args):
    """
    Set up the logger and execute the PLT classifier.

    :param args: Arguments from the command line, parsed using the
                 parse_command_line_args function
    :return: None
    """
    # LOGGER PARAMETERS
    LOG_FILE = 'plt_multiclass'  # Root name of the log file. The execution time stamp will be appended
                                 # to the filename, and the file will be created in the ../logs/ directory.
    LOG_LEVEL = 'info'  # Log level: debug / info / warning / error / critical (Preferred: info; use debug while debugging)
    SHOW_LOG = False  # If True, shows the log info in the run window.

    # LOGGER CONFIGURATION
    logger_config = Logger(__name__, log_file=LOG_FILE, log_level=LOG_LEVEL, show_log=SHOW_LOG)
    logger = logger_config.get_logger()

    # EXECUTE PLT MULTI-CLASS
    try:
        logger.info('Executing plt_multiclass with the arguments received')
        plt_multiclass(logger, **vars(args))
    except Exception:  # a bare except would also swallow KeyboardInterrupt/SystemExit
        print(f'\n\n!!! Error Occurred During Execution !!!\nCheck log file for further details\n'
              f'Log file: {logger.handlers[0].baseFilename}')
def main(args):
    logger_config = Logger(__name__, log_file='plt_multiclass', log_level='debug', show_log=False)
    logger = logger_config.get_logger()
    try:
        logger.info('Executing plt_multiclass with the arguments received')
        plt_multiclass(logger, **vars(args))
    except Exception:
        print(f'\n\n!!! Error Occurred During Execution !!!\nCheck log file for further details\n'
              f'Log file: {logger.handlers[0].baseFilename}')
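# The two snippets above assume a small ``Logger`` wrapper around the standard
# ``logging`` module: a constructor taking (name, log_file, log_level, show_log),
# a get_logger() accessor, and a file handler whose baseFilename the callers
# print. The class below is only a minimal sketch of that assumed interface;
# the real wrapper in each project may differ.
import logging
import time


class Logger:
    """Minimal sketch of the assumed logging wrapper (hypothetical)."""

    def __init__(self, name, log_file='run', log_level='info', show_log=False):
        self._logger = logging.getLogger(name)
        self._logger.setLevel(getattr(logging, log_level.upper()))
        # Append an execution time stamp to the root file name, as the
        # comments in main() describe.
        filename = '{}_{}.log'.format(log_file, time.strftime('%Y%m%d_%H%M%S'))
        self._logger.addHandler(logging.FileHandler(filename))
        if show_log:
            self._logger.addHandler(logging.StreamHandler())

    def get_logger(self):
        return self._logger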
def test(opt):
    logger = Logger(opt)
    dataset = VISTDataset(opt)
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.get_story_length()

    dataset.test()
    test_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.workers)
    evaluator = Evaluator(opt, 'test')
    model = models.setup(opt)
    model.cuda()
    predictions, metrics = evaluator.test_story(model, dataset, test_loader, opt)
def prepare_logger(xargs):
    args = copy.deepcopy(xargs)
    from log_utils import Logger
    logger = Logger(args.save_dir, args.rand_seed)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python Version   : {:}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow Version   : {:}".format(PIL.__version__))
    logger.log("PyTorch Version  : {:}".format(torch.__version__))
    logger.log("cuDNN Version    : {:}".format(torch.backends.cudnn.version()))
    logger.log("CUDA available   : {:}".format(torch.cuda.is_available()))
    logger.log("CUDA GPU numbers : {:}".format(torch.cuda.device_count()))
    logger.log("CUDA_VISIBLE_DEVICES : {:}".format(
        os.environ['CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ else 'None'))
    return logger
def prepare_logger(xargs):
    args = copy.deepcopy(xargs)
    logstr = 'seed-{:}-time-{:}'.format(args.rand_seed, time_for_file())
    logger = Logger(args.save_path, logstr)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python Version  : {:}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow Version  : {:}".format(PIL.__version__))
    logger.log("PyTorch Version : {:}".format(torch.__version__))
    logger.log("cuDNN Version   : {:}".format(torch.backends.cudnn.version()))
    logger.log("CUDA available  : {:}".format(torch.cuda.is_available()))
    logger.log("CUDA GPU numbers: {:}".format(torch.cuda.device_count()))
    return logger
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    prepare_seed(args.rand_seed)
    logstr = 'seed-{:}-time-{:}'.format(args.rand_seed, time_for_file())
    logger = Logger(args.save_path, logstr)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python version  : {}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow version  : {}".format(PIL.__version__))
    logger.log("PyTorch version : {}".format(torch.__version__))
    logger.log("cuDNN version   : {}".format(torch.backends.cudnn.version()))

    # General data augmentation
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    assert args.arg_flip == False, 'The flip is : {}, rotate is {}'.format(args.arg_flip, args.rotate_max)
    train_transform = [transforms.PreCrop(args.pre_crop_expand)]
    train_transform += [transforms.TrainScale2WH((args.crop_width, args.crop_height))]
    train_transform += [transforms.AugScale(args.scale_prob, args.scale_min, args.scale_max)]
    # if args.arg_flip:
    #     train_transform += [transforms.AugHorizontalFlip()]
    if args.rotate_max:
        train_transform += [transforms.AugRotate(args.rotate_max)]
    train_transform += [transforms.AugCrop(args.crop_width, args.crop_height, args.crop_perturb_max, mean_fill)]
    train_transform += [transforms.ToTensor(), normalize]
    train_transform = transforms.Compose(train_transform)
    eval_transform = transforms.Compose([
        transforms.PreCrop(args.pre_crop_expand),
        transforms.TrainScale2WH((args.crop_width, args.crop_height)),
        transforms.ToTensor(), normalize])
    assert (args.scale_min + args.scale_max) / 2 == args.scale_eval, \
        'The scale is not ok : {},{} vs {}'.format(args.scale_min, args.scale_max, args.scale_eval)

    # Model configure load
    model_config = load_configure(args.model_config, logger)
    args.sigma = args.sigma * args.scale_eval
    logger.log('Real Sigma : {:}'.format(args.sigma))

    # Training dataset
    train_data = VDataset(train_transform, args.sigma, model_config.downsample,
                          args.heatmap_type, args.data_indicator, args.video_parser)
    train_data.load_list(args.train_lists, args.num_pts, True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers, pin_memory=True)

    # Evaluation dataloaders
    eval_loaders = []
    if args.eval_vlists is not None:
        for eval_vlist in args.eval_vlists:
            eval_vdata = IDataset(eval_transform, args.sigma, model_config.downsample,
                                  args.heatmap_type, args.data_indicator)
            eval_vdata.load_list(eval_vlist, args.num_pts, True)
            eval_vloader = torch.utils.data.DataLoader(eval_vdata, batch_size=args.batch_size,
                                                       shuffle=False, num_workers=args.workers, pin_memory=True)
            eval_loaders.append((eval_vloader, True))
    if args.eval_ilists is not None:
        for eval_ilist in args.eval_ilists:
            eval_idata = IDataset(eval_transform, args.sigma, model_config.downsample,
                                  args.heatmap_type, args.data_indicator)
            eval_idata.load_list(eval_ilist, args.num_pts, True)
            eval_iloader = torch.utils.data.DataLoader(eval_idata, batch_size=args.batch_size,
                                                       shuffle=False, num_workers=args.workers, pin_memory=True)
            eval_loaders.append((eval_iloader, False))

    # Define network
    lk_config = load_configure(args.lk_config, logger)
    logger.log('model configure : {:}'.format(model_config))
    logger.log('LK configure : {:}'.format(lk_config))
    net = obtain_model(model_config, lk_config, args.num_pts + 1)
    assert model_config.downsample == net.downsample, \
        'downsample is not correct : {} vs {}'.format(model_config.downsample, net.downsample)
    logger.log("=> network :\n {}".format(net))

    logger.log('Training-data : {:}'.format(train_data))
    for i, eval_loader in enumerate(eval_loaders):
        eval_loader, is_video = eval_loader
        logger.log('The [{:2d}/{:2d}]-th testing-data [{:}] = {:}'.format(
            i, len(eval_loaders), 'video' if is_video else 'image', eval_loader.dataset))

    logger.log('arguments : {:}'.format(args))
    opt_config = load_configure(args.opt_config, logger)

    if hasattr(net, 'specify_parameter'):
        net_param_dict = net.specify_parameter(opt_config.LR, opt_config.Decay)
    else:
        net_param_dict = net.parameters()

    optimizer, scheduler, criterion = obtain_optimizer(net_param_dict, opt_config, logger)
    logger.log('criterion : {:}'.format(criterion))
    net, criterion = net.cuda(), criterion.cuda()
    net = torch.nn.DataParallel(net)

    last_info = logger.last_info()
    if last_info.exists():
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch'] + 1
        checkpoint = torch.load(last_info['last_checkpoint'])
        assert last_info['epoch'] == checkpoint['epoch'], \
            'Last-Info is not right {:} vs {:}'.format(last_info, checkpoint['epoch'])
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        logger.log("=> load-ok checkpoint '{:}' (epoch {:}) done".format(logger.last_info(), checkpoint['epoch']))
    elif args.init_model is not None:
        init_model = Path(args.init_model)
        assert init_model.exists(), 'init-model {:} does not exist'.format(init_model)
        checkpoint = torch.load(init_model)
        checkpoint = remove_module_dict(checkpoint['state_dict'], True)
        net.module.detector.load_state_dict(checkpoint)
        logger.log("=> initialize the detector : {:}".format(init_model))
        start_epoch = 0
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch = 0

    detector = torch.nn.DataParallel(net.module.detector)
    eval_results = eval_all(args, eval_loaders, detector, criterion, 'start-eval', logger, opt_config)
    if args.eval_once:
        logger.log("=> only evaluate the model once")
        logger.close()
        return

    # Main training and evaluation loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, opt_config.epochs):
        scheduler.step()
        need_time = convert_secs2time(epoch_time.avg * (opt_config.epochs - epoch), True)
        epoch_str = 'epoch-{:03d}-{:03d}'.format(epoch, opt_config.epochs)
        LRs = scheduler.get_lr()
        logger.log('\n==>>{:s} [{:s}], [{:s}], LR : [{:.5f} ~ {:.5f}], Config : {:}'.format(
            time_string(), epoch_str, need_time, min(LRs), max(LRs), opt_config))

        # train for one epoch
        train_loss = train(args, train_loader, net, criterion, optimizer, epoch_str,
                           logger, opt_config, lk_config, epoch >= lk_config.start)
        # log the results
        logger.log('==>>{:s} Train [{:}] Average Loss = {:.6f}'.format(time_string(), epoch_str, train_loss))

        # remember best prec@1 and save checkpoint
        save_path = save_checkpoint({
            'epoch': epoch,
            'args': deepcopy(args),
            'arch': model_config.arch,
            'state_dict': net.state_dict(),
            'detector': detector.state_dict(),
            'scheduler': scheduler.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, logger.path('model') / '{:}-{:}.pth'.format(model_config.arch, epoch_str), logger)
        last_info = save_checkpoint({
            'epoch': epoch,
            'last_checkpoint': save_path,
        }, logger.last_info(), logger)

        eval_results = eval_all(args, eval_loaders, detector, criterion, epoch_str, logger, opt_config)

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.close()
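# The training loops in this collection lean on two small helpers imported
# from the projects' util modules: AverageMeter (for the epoch_time/seed_time
# bookkeeping above and below) and convert_secs2time (for the "Time Left"
# messages). A minimal sketch of the assumed behaviour, for reference only;
# the real implementations may format the string differently.
class AverageMeter:
    """Tracks the latest value, running sum, count, and average."""

    def __init__(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def convert_secs2time(seconds, as_string=False):
    """Converts seconds into (hours, minutes, seconds), optionally as a string."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    return '[{:02d}:{:02d}:{:02d}]'.format(h, m, s) if as_string else (h, m, s)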
def train_single_model(save_dir, workers, datasets, xpaths, splits, use_less,
                       seeds, model_str, arch_config):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = True
    torch.set_num_threads(workers)

    save_dir = Path(save_dir) / 'specifics' / '{:}-{:}-{:}-{:}'.format(
        'LESS' if use_less else 'FULL', model_str, arch_config['channel'], arch_config['num_cells'])
    logger = Logger(str(save_dir), 0, False)
    if model_str in CellArchitectures:
        arch = CellArchitectures[model_str]
        logger.log('The model string is found in pre-defined architecture dict : {:}'.format(model_str))
    else:
        try:
            arch = CellStructure.str2structure(model_str)
        except Exception:
            raise ValueError('Invalid model string : {:}. It can not be found or parsed.'.format(model_str))
    assert arch.check_valid_op(get_search_spaces('cell', 'full')), '{:} has the invalid op.'.format(arch)
    logger.log('Start train-evaluate {:}'.format(arch.tostr()))
    logger.log('arch_config : {:}'.format(arch_config))

    start_time, seed_time = time.time(), AverageMeter()
    for _is, seed in enumerate(seeds):
        logger.log('\nThe {:02d}/{:02d}-th seed is {:} ----------------------<.>----------------------'.format(
            _is, len(seeds), seed))
        to_save_name = save_dir / 'seed-{:04d}.pth'.format(seed)
        if to_save_name.exists():
            logger.log('Find the existing file {:}, directly load!'.format(to_save_name))
            checkpoint = torch.load(to_save_name)
        else:
            logger.log('Does not find the existing file {:}, train and evaluate!'.format(to_save_name))
            checkpoint = evaluate_all_datasets(arch, datasets, xpaths, splits, use_less,
                                               seed, arch_config, workers, logger)
            torch.save(checkpoint, to_save_name)
        # log information
        logger.log('{:}'.format(checkpoint['info']))
        all_dataset_keys = checkpoint['all_dataset_keys']
        for dataset_key in all_dataset_keys:
            logger.log('\n{:} dataset : {:} {:}'.format('-' * 15, dataset_key, '-' * 15))
            dataset_info = checkpoint[dataset_key]
            # logger.log('Network ==>\n{:}'.format(dataset_info['net_string']))
            logger.log('Flops = {:} MB, Params = {:} MB'.format(dataset_info['flop'], dataset_info['param']))
            logger.log('config : {:}'.format(dataset_info['config']))
            logger.log('Training State (finish) = {:}'.format(dataset_info['finish-train']))
            last_epoch = dataset_info['total_epoch'] - 1
            train_acc1es, train_acc5es = dataset_info['train_acc1es'], dataset_info['train_acc5es']
            valid_acc1es, valid_acc5es = dataset_info['valid_acc1es'], dataset_info['valid_acc5es']
            logger.log('Last Info : Train = Acc@1 {:.2f}% Acc@5 {:.2f}% Error@1 {:.2f}%, '
                       'Test = Acc@1 {:.2f}% Acc@5 {:.2f}% Error@1 {:.2f}%'.format(
                           train_acc1es[last_epoch], train_acc5es[last_epoch], 100 - train_acc1es[last_epoch],
                           valid_acc1es[last_epoch], valid_acc5es[last_epoch], 100 - valid_acc1es[last_epoch]))
        # measure elapsed time
        seed_time.update(time.time() - start_time)
        start_time = time.time()
        need_time = 'Time Left: {:}'.format(convert_secs2time(seed_time.avg * (len(seeds) - _is - 1), True))
        logger.log('\n<<<***>>> The {:02d}/{:02d}-th seed is {:} <finish> other procedures need {:}'.format(
            _is, len(seeds), seed, need_time))
    logger.close()
def main(save_dir: Path, workers: int, datasets: List[Text], xpaths: List[Text],
         splits: List[int], seeds: List[int], nets: List[str],
         opt_config: Dict[Text, Any], to_evaluate_indexes: tuple, cover_mode: bool):
    log_dir = save_dir / 'logs'
    log_dir.mkdir(parents=True, exist_ok=True)
    logger = Logger(str(log_dir), os.getpid(), False)

    logger.log('xargs : seeds      = {:}'.format(seeds))
    logger.log('xargs : cover_mode = {:}'.format(cover_mode))
    logger.log('-' * 100)
    logger.log('Start evaluating range =: {:06d} - {:06d}'.format(min(to_evaluate_indexes), max(to_evaluate_indexes))
               + ' ({:} in total) / {:06d} with cover-mode={:}'.format(len(to_evaluate_indexes), len(nets), cover_mode))
    for i, (dataset, xpath, split) in enumerate(zip(datasets, xpaths, splits)):
        logger.log('--->>> Evaluate {:}/{:} : dataset={:9s}, path={:}, split={:}'.format(
            i, len(datasets), dataset, xpath, split))
    logger.log('--->>> optimization config : {:}'.format(opt_config))
    # to_evaluate_indexes = list(range(srange[0], srange[1] + 1))

    start_time, epoch_time = time.time(), AverageMeter()
    for i, index in enumerate(to_evaluate_indexes):
        channelstr = nets[index]
        logger.log('\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th arch [seeds={:}] {:}'.format(
            time_string(), i, len(to_evaluate_indexes), index, len(nets), seeds, '-' * 15))
        logger.log('{:} {:} {:}'.format('-' * 15, channelstr, '-' * 15))
        # test this arch on different datasets with different seeds
        has_continue = False
        for seed in seeds:
            to_save_name = save_dir / 'arch-{:06d}-seed-{:04d}.pth'.format(index, seed)
            if to_save_name.exists():
                if cover_mode:
                    logger.log('Find existing file : {:}, remove it before evaluation'.format(to_save_name))
                    os.remove(str(to_save_name))
                else:
                    logger.log('Find existing file : {:}, skip this evaluation'.format(to_save_name))
                    has_continue = True
                    continue
            results = evaluate_all_datasets(channelstr, datasets, xpaths, splits, opt_config, seed, workers, logger)
            torch.save(results, to_save_name)
            logger.log('\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th arch [seeds={:}] ===>>> {:}'.format(
                time_string(), i, len(to_evaluate_indexes), index, len(nets), seeds, to_save_name))
        # measure elapsed time
        if not has_continue:
            epoch_time.update(time.time() - start_time)
            start_time = time.time()
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.avg * (len(to_evaluate_indexes) - i - 1), True))
        logger.log('This arch costs : {:}'.format(convert_secs2time(epoch_time.val, True)))
        logger.log('{:}'.format('*' * 100))
        logger.log('{:} {:74s} {:}'.format(
            '*' * 10,
            '{:06d}/{:06d} ({:06d}/{:06d})-th done, left {:}'.format(
                i, len(to_evaluate_indexes), index, len(nets), need_time),
            '*' * 10))
        logger.log('{:}'.format('*' * 100))
    logger.close()
def train(opt):
    # utils.setup_seed()
    logger = Logger(opt, save_code=opt.save_code)

    ################### set up dataset and dataloader ########################
    dataset = VISTDataset(opt)
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.get_story_length()

    dataset.set_option(data_type={'whole_story': False, 'split_story': True,
                                  'caption': False, 'prefix_story': True})

    dataset.train()
    train_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=opt.shuffle, num_workers=opt.workers)
    dataset.val()
    val_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.workers)

    ##################### set up model, criterion and optimizer ######
    bad_valid = 0

    # set up evaluator
    evaluator = Evaluator(opt, 'val')

    # set up criterion
    crit = criterion.LanguageModelCriterion()

    # set up model
    model = models.setup(opt)
    model.cuda()

    # set up optimizer
    optimizer = setup_optimizer(opt, model)

    dataset.train()
    model.train()
    initial_lr = opt.learning_rate
    logging.info(model)

    ############################## training ##################################
    for epoch in range(logger.epoch_start, opt.max_epochs):
        # Assign the scheduled sampling prob
        if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
            frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
            opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac, opt.scheduled_sampling_max_prob)
            model.ss_prob = opt.ss_prob

        for iter, batch in enumerate(train_loader):
            start = time.time()
            logger.iteration += 1
            torch.cuda.synchronize()

            feature_fc = batch['feature_fc'].cuda()
            if opt.use_obj:  # note: feature_obj stays undefined when opt.use_obj is off
                feature_obj = batch['feature_obj'].cuda()
            if opt.use_spatial:
                feature_obj_spatial = batch['feature_obj_spatial'].cuda()
            else:
                feature_obj_spatial = None
            if opt.use_classes:
                feature_obj_classes = batch['feature_obj_classes'].cuda()
            else:
                feature_obj_classes = None
            if opt.use_attrs:
                feature_obj_attrs = batch['feature_obj_attrs'].cuda()
            else:
                feature_obj_attrs = None
            target = batch['split_story'].cuda()
            prefix = batch['prefix_story'].cuda()
            history_count = batch['history_counter'].cuda()
            index = batch['index']

            optimizer.zero_grad()

            # cross entropy loss
            output = model(feature_fc, feature_obj, target, history_count,
                           spatial=feature_obj_spatial, clss=feature_obj_classes, attrs=feature_obj_attrs)
            loss = crit(output, target)

            loss.backward()
            train_loss = loss.item()

            nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip, norm_type=2)
            optimizer.step()
            torch.cuda.synchronize()

            if iter % opt.log_step == 0:
                logging.info("Epoch {} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s".format(
                    epoch, iter, len(train_loader), train_loss, time.time() - start))

            # Write the training loss summary
            if logger.iteration % opt.losses_log_every == 0:
                logger.log_training(epoch, iter, train_loss, opt.learning_rate, model.ss_prob)

            if logger.iteration % opt.save_checkpoint_every == 0:
                # Evaluate on validation dataset and save model for every epoch
                val_loss, predictions, metrics = evaluator.eval_story(model, crit, dataset, val_loader, opt)
                if opt.metric == 'XE':
                    score = -val_loss
                else:
                    score = metrics[opt.metric]
                logger.log_checkpoint(epoch, val_loss, metrics, predictions, opt, model, dataset, optimizer)
                # halve the learning rate if not improving for a long time
                if logger.best_val_score > score:
                    bad_valid += 1
                    if bad_valid >= opt.bad_valid_threshold:
                        opt.learning_rate = opt.learning_rate * opt.learning_rate_decay_rate
                        logging.info("halve learning rate to {}".format(opt.learning_rate))
                        checkpoint_path = os.path.join(logger.log_dir, 'model-best.pth')
                        model.load_state_dict(torch.load(checkpoint_path))
                        utils.set_lr(optimizer, opt.learning_rate)  # set the decayed rate
                        bad_valid = 0
                    logging.info("bad valid : {}".format(bad_valid))
                else:
                    opt.learning_rate = initial_lr
                    logging.info("achieving best {} score: {}".format(opt.metric, score))
                    bad_valid = 0
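# utils.set_lr, used above after reloading the best checkpoint, is assumed to
# simply overwrite the learning rate of every parameter group in the
# optimizer. A minimal sketch of that assumed helper:
def set_lr(optimizer, lr):
    """Sets the learning rate of all parameter groups in the optimizer."""
    for group in optimizer.param_groups:
        group['lr'] = lr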
model_to_use = [
    DAE,
    # AE,
    # VAE
]
record_t_e = []
record_t_mle_e = []
for Net in model_to_use:
    print('-----\n{}'.format(Net.__name__))
    # f = open('%s.log' % (Net.__name__), 'a')
    # sys.stdout = f
    # sys.stderr = f
    # sys.stdout = Logger('%s.log' % (Net.__name__), sys.stdout)
    # sys.stderr = Logger('%s.log_file' % (Net.__name__), sys.stderr)
    logger = Logger(log_file_name='%s.txt' % (Net.__name__),
                    log_level=logging.DEBUG,
                    logger_name='%s' % (Net.__name__)).get_log()

    model = Net().to(device)
    for epoch in range(1, args.epochs + 1):
        [t_e, t_mle_e] = train(epoch, train_loader, model, device, args)
        record_t_e.append(t_e)
        record_t_mle_e.append(t_mle_e)
        test(epoch, test_loader, model, device, args)

import matplotlib.pyplot as plt
import numpy as np

x_axis = np.arange(1, args.epochs + 1)
fig_te = plt.figure()
plt.plot(x_axis, record_t_e)
plt.xlabel('epoch')
plt.ylabel('training loss')
plt.show()
def train(opt):
    setup_seed()
    logger = Logger(opt)

    ################### set up dataset and dataloader ########################
    dataset = VISTDataset(opt)
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.get_story_length()

    dataset.set_option(data_type={'whole_story': False, 'split_story': True, 'caption': False})

    dataset.train()
    train_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=opt.shuffle, num_workers=opt.workers)
    dataset.val()
    val_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.workers)

    ##################### set up model, criterion and optimizer ######
    bad_valid = 0

    # set up evaluator
    evaluator = Evaluator(opt, 'val')

    # set up criterion
    crit = criterion.LanguageModelCriterion()
    if opt.start_rl >= 0:
        rl_crit = criterion.ReinforceCriterion(opt, dataset)

    # set up model
    model = models.setup(opt)
    model.cuda()

    # set up optimizer
    optimizer = setup_optimizer(opt, model)

    dataset.train()
    model.train()

    ############################## training ##################################
    # Note: Variable, loss.data[0] and clip_grad_norm below are pre-0.4
    # PyTorch APIs; on newer versions use plain tensors, loss.item() and
    # clip_grad_norm_.
    for epoch in range(logger.epoch_start, opt.max_epochs):
        # Assign the scheduled sampling prob
        if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
            frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
            opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac, opt.scheduled_sampling_max_prob)
            model.ss_prob = opt.ss_prob

        for iter, batch in enumerate(train_loader):
            start = time.time()
            logger.iteration += 1
            torch.cuda.synchronize()

            feature_fc = Variable(batch['feature_fc']).cuda()
            target = Variable(batch['split_story']).cuda()
            index = batch['index']
            semantic = batch['semantic']

            optimizer.zero_grad()

            # cross entropy loss
            output = model(feature_fc, target, semantic)
            loss = crit(output, target)

            if opt.start_rl >= 0 and epoch >= opt.start_rl:  # reinforcement learning
                seq, seq_log_probs, baseline = model.sample(feature_fc, sample_max=False, rl_training=True)
                rl_loss, avg_score = rl_crit(seq, seq_log_probs, baseline, index)
                print(rl_loss.data[0] / loss.data[0])
                loss = opt.rl_weight * rl_loss + (1 - opt.rl_weight) * loss
                logging.info("average {} score: {}".format(opt.reward_type, avg_score))

            loss.backward()
            train_loss = loss.data[0]

            nn.utils.clip_grad_norm(model.parameters(), opt.grad_clip, norm_type=2)
            optimizer.step()
            torch.cuda.synchronize()

            logging.info("Epoch {} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s".format(
                epoch, iter, len(train_loader), train_loss, time.time() - start))

            # Write the training loss summary
            if logger.iteration % opt.losses_log_every == 0:
                logger.log_training(epoch, iter, train_loss, opt.learning_rate, model.ss_prob)

            if logger.iteration % opt.save_checkpoint_every == 0:
                # Evaluate on validation dataset and save model for every epoch
                val_loss, predictions, metrics = evaluator.eval_story(model, crit, dataset, val_loader, opt)
                if opt.metric == 'XE':
                    score = -val_loss
                else:
                    score = metrics[opt.metric]
                logger.log_checkpoint(epoch, val_loss, metrics, predictions, opt, model, dataset, optimizer)
                # halve the learning rate if not improving for a long time
                if logger.best_val_score > score:
                    bad_valid += 1
                    if bad_valid >= 4:
                        opt.learning_rate = opt.learning_rate / 2.0
                        logging.info("halve learning rate to {}".format(opt.learning_rate))
                        checkpoint_path = os.path.join(logger.log_dir, 'model-best.pth')
                        model.load_state_dict(torch.load(checkpoint_path))
                        utils.set_lr(optimizer, opt.learning_rate)  # set the decayed rate
                        bad_valid = 0
                    logging.info("bad valid : {}".format(bad_valid))
                else:
                    logging.info("achieving best {} score: {}".format(opt.metric, score))
                    bad_valid = 0
def prepare_logger(xargs):
    args = copy.deepcopy(xargs)
    from log_utils import Logger
    if xargs.non_tailor:
        logger = Logger(args.save_dir, "{:}-sparse".format(args.rand_seed), sparse_flag=True)
    # elif args.drop_path_prob_max > 0.0:
    #     logger = Logger(args.save_dir, "{:}-droppath".format(args.rand_seed), sparse_flag=True)
    else:
        logger = Logger(args.save_dir, args.rand_seed, sparse_flag=False)
        # logger = Logger(args.save_dir, "{:}-nodroppath".format(args.rand_seed), sparse_flag=True)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python Version   : {:}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow Version   : {:}".format(PIL.__version__))
    logger.log("PyTorch Version  : {:}".format(torch.__version__))
    logger.log("cuDNN Version    : {:}".format(torch.backends.cudnn.version()))
    logger.log("CUDA available   : {:}".format(torch.cuda.is_available()))
    logger.log("CUDA GPU numbers : {:}".format(torch.cuda.device_count()))
    logger.log("CUDA_VISIBLE_DEVICES : {:}".format(
        os.environ['CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ else 'None'))
    return logger
def train(opt):
    """ model training function """
    # custom logging class
    logger = Logger(opt)

    # load the data
    dataset = VISTDataset(opt)
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.get_story_length()
    # print(dataset.get_word2id()['the'])

    dataset.set_option(data_type={
        'whole_story': False,
        'split_story': True,
        'caption': True
    })  # set 'caption' to False if caption data is not used

    dataset.train()
    train_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=opt.shuffle)
    dataset.test()  # should be changed to the valid split
    val_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False)
    # m = dataset.word2id

    # counts how many times the validation loss has gone up
    bad_valid = 0

    # create the Evaluator
    evaluator = Evaluator(opt, 'val')
    # loss
    crit = criterion.LanguageModelCriterion()
    # whether to use reinforcement learning (default: -1, i.e. disabled)
    if opt.start_rl >= 0:
        rl_crit = criterion.ReinforceCriterion(opt, dataset)

    # set up model; the setup function lives in the package __init__ and
    # loads existing model parameters when available
    model = models.setup(opt)
    model.cuda()
    optimizer = setup_optimizer(opt, model)

    dataset.train()
    model.train()

    for epoch in range(logger.epoch_start, opt.max_epochs):  # default: 0-20
        # scheduled_sampling_start is the epoch at which the probability of using
        # the ground truth starts to decay (up to 0.25); it stays 0 for the first 5 epochs
        if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
            frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
            # the divisor defaults to 5; // is floor division
            opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                              opt.scheduled_sampling_max_prob)  # 0.05, 0.25
            model.ss_prob = opt.ss_prob

        # iterate over the data batch by batch
        for iter, batch in enumerate(train_loader):
            start = time.time()
            logger.iteration += 1
            torch.cuda.synchronize()

            # unpack the batch: image features, caption, and target
            features = Variable(batch['feature_fc']).cuda()  # 64*5*2048
            caption = None
            if opt.caption:
                caption = Variable(batch['caption']).cuda()  # 64*5*20
            target = Variable(batch['split_story']).cuda()  # 64*5*30
            index = batch['index']

            optimizer.zero_grad()

            # run the model to get a probability distribution, then compute
            # the cross-entropy loss
            output = model(features, target, caption)
            loss = crit(output, target)

            if opt.start_rl >= 0 and epoch >= opt.start_rl:  # reinforcement learning
                # draw sampled sequences and the baseline
                seq, seq_log_probs, baseline = model.sample(features, caption=caption,
                                                            sample_max=False, rl_training=True)
                rl_loss, avg_score = rl_crit(seq, seq_log_probs, baseline, index)
                print(rl_loss.data[0] / loss.data[0])
                loss = opt.rl_weight * rl_loss + (1 - opt.rl_weight) * loss
                logging.info("average {} score: {}".format(opt.reward_type, avg_score))

            # backpropagation
            loss.backward()
            train_loss = loss.item()

            # gradient clipping; the second argument is the max gradient norm,
            # above which gradients are clipped
            nn.utils.clip_grad_norm(model.parameters(), opt.grad_clip, norm_type=2)
            optimizer.step()
            torch.cuda.synchronize()

            # log the elapsed time and the loss
            logging.info("Epoch {} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s".format(
                epoch, iter, len(train_loader), train_loss, time.time() - start))

            # Write the training loss summary to tensorboard
            if logger.iteration % opt.losses_log_every == 0:
                logger.log_training(epoch, iter, train_loss, opt.learning_rate, model.ss_prob)

            # validation: evaluate once every save_checkpoint_every iterations
            if logger.iteration % opt.save_checkpoint_every == 0:
                val_loss, predictions, metrics = evaluator.eval_story(model, crit, dataset, val_loader, opt)
                if opt.metric == 'XE':
                    score = -val_loss
                else:
                    score = metrics[opt.metric]
                logger.log_checkpoint(epoch, val_loss, metrics, predictions, opt, model, dataset, optimizer)
                # halve the learning rate if not improving for a long time
                if logger.best_val_score > score:
                    bad_valid += 1
                    if bad_valid >= 4:
                        opt.learning_rate = opt.learning_rate / 2.0
                        logging.info("halve learning rate to {}".format(opt.learning_rate))
                        checkpoint_path = os.path.join(logger.log_dir, 'model-best.pth')
                        model.load_state_dict(torch.load(checkpoint_path))
                        utils.set_lr(optimizer, opt.learning_rate)  # set the decayed rate
                        bad_valid = 0
                    logging.info("bad valid : {}".format(bad_valid))
                else:
                    logging.info("achieving best {} score: {}".format(opt.metric, score))
                    bad_valid = 0
def train(opt):
    logger = Logger(opt)
    flag = Flag(D_iters=opt.D_iter, G_iters=opt.G_iter, always=opt.always)

    ################### set up dataset and dataloader ########################
    dataset = VISTDataset(opt)
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.get_story_length()

    dataset.set_option(data_type={'whole_story': False, 'split_story': True, 'caption': False})

    dataset.train()
    train_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=opt.shuffle, num_workers=opt.workers)
    dataset.val()
    val_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.workers)

    ##################### set up model, criterion and optimizer ######
    bad_valid = 0

    # set up evaluator
    evaluator = Evaluator(opt, 'val')

    # set up criterion
    crit = criterion.LanguageModelCriterion()
    rl_crit = criterion.ReinforceCriterion(opt, dataset)

    # set up model
    model = models.setup(opt)
    model.cuda()
    disc_opt = copy.copy(opt)
    disc_opt.model = 'RewardModel'
    disc = models.setup(disc_opt)
    if os.path.exists(os.path.join(logger.log_dir, 'disc-model.pth')):
        logging.info("loading pretrained RewardModel")
        disc.load_state_dict(torch.load(os.path.join(logger.log_dir, 'disc-model.pth')))
    disc.cuda()

    # set up optimizer
    optimizer = setup_optimizer(opt, model)
    disc_optimizer = setup_optimizer(opt, disc)

    dataset.train()
    model.train()
    disc.train()

    ############################## training ##################################
    # Note: .data[0] and clip_grad_norm below are pre-0.4 PyTorch APIs; on
    # newer versions use .item() and clip_grad_norm_.
    for epoch in range(logger.epoch_start, opt.max_epochs):
        # Assign the scheduled sampling prob
        start = time.time()
        for iter, batch in enumerate(train_loader):
            logger.iteration += 1
            torch.cuda.synchronize()

            feature_fc = Variable(batch['feature_fc']).cuda()
            target = Variable(batch['split_story']).cuda()
            index = batch['index']

            optimizer.zero_grad()
            disc_optimizer.zero_grad()

            if flag.flag == "Disc":
                model.eval()
                disc.train()
                if opt.decoding_method_DISC == 'sample':
                    seq, seq_log_probs, baseline = model.sample(feature_fc, sample_max=False,
                                                                rl_training=True, pad=True)
                elif opt.decoding_method_DISC == 'greedy':
                    seq, seq_log_probs, baseline = model.sample(feature_fc, sample_max=True,
                                                                rl_training=True, pad=True)
            else:
                model.train()
                disc.eval()
                seq, seq_log_probs, baseline = model.sample(feature_fc, sample_max=False,
                                                            rl_training=True, pad=True)

            seq = Variable(seq).cuda()
            mask = (seq > 0).float()
            mask = to_contiguous(
                torch.cat([Variable(mask.data.new(mask.size(0), mask.size(1), 1).fill_(1)),
                           mask[:, :, :-1]], 2))
            normed_seq_log_probs = (seq_log_probs * mask).sum(-1) / mask.sum(-1)

            gen_score = disc(seq.view(-1, seq.size(2)), feature_fc.view(-1, feature_fc.size(2)))

            if flag.flag == "Disc":
                gt_score = disc(target.view(-1, target.size(2)), feature_fc.view(-1, feature_fc.size(2)))
                loss = -torch.sum(gt_score) + torch.sum(gen_score)

                avg_pos_score = torch.mean(gt_score)
                avg_neg_score = torch.mean(gen_score)

                if logger.iteration % 5 == 0:
                    logging.info("pos reward {} neg reward {}".format(avg_pos_score.data[0], avg_neg_score.data[0]))
                    print("PREDICTION: ", utils.decode_story(dataset.get_vocab(), seq[:1].data)[0])
                    print("GROUND TRUTH: ", utils.decode_story(dataset.get_vocab(), target[:1].data)[0])
            else:
                rewards = Variable(gen_score.data - 0.001 * normed_seq_log_probs.data)
                # with open("/tmp/reward.txt", "a") as f:
                #     print(" ".join(map(str, rewards.data.cpu().numpy())), file=f)
                loss, avg_score = rl_crit(seq.data, seq_log_probs, baseline, index, rewards)
                # if logger.iteration % opt.losses_log_every == 0:
                avg_pos_score = torch.mean(gen_score)
                logging.info("average reward: {} average IRL score: {}".format(
                    avg_score.data[0], avg_pos_score.data[0]))

            if flag.flag == "Disc":
                loss.backward()
                nn.utils.clip_grad_norm(disc.parameters(), opt.grad_clip, norm_type=2)
                disc_optimizer.step()
            else:
                tf_loss = crit(model(feature_fc, target), target)
                print("rl_loss / tf_loss = ", loss.data[0] / tf_loss.data[0])
                loss = opt.rl_weight * loss + (1 - opt.rl_weight) * tf_loss
                loss.backward()
                nn.utils.clip_grad_norm(model.parameters(), opt.grad_clip, norm_type=2)
                optimizer.step()

            train_loss = loss.data[0]
            torch.cuda.synchronize()

            # Write the training loss summary
            if logger.iteration % opt.losses_log_every == 0:
                logger.log_training(epoch, iter, train_loss, opt.learning_rate, model.ss_prob)
                logging.info("Epoch {} Train {} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s".format(
                    epoch, flag.flag, iter, len(train_loader), train_loss, time.time() - start))
                start = time.time()

            if logger.iteration % opt.save_checkpoint_every == 0:
                if opt.always is None:
                    # Evaluate on validation dataset and save model for every epoch
                    val_loss, predictions, metrics = evaluator.eval_story(model, crit, dataset, val_loader, opt)
                    if opt.metric == 'XE':
                        score = -val_loss
                    else:
                        score = metrics[opt.metric]
                    logger.log_checkpoint(epoch, val_loss, metrics, predictions, opt, model, dataset, optimizer)
                    # halve the learning rate if not improving for a long time
                    if logger.best_val_score > score:
                        bad_valid += 1
                        if bad_valid >= 10:
                            opt.learning_rate = opt.learning_rate / 2.0
                            logging.info("halve learning rate to {}".format(opt.learning_rate))
                            checkpoint_path = os.path.join(logger.log_dir, 'model-best.pth')
                            model.load_state_dict(torch.load(checkpoint_path))
                            utils.set_lr(optimizer, opt.learning_rate)  # set the decayed rate
                            bad_valid = 0
                        logging.info("bad valid : {}".format(bad_valid))
                    else:
                        logging.info("achieving best {} score: {}".format(opt.metric, score))
                        bad_valid = 0
                else:
                    torch.save(disc.state_dict(), os.path.join(logger.log_dir, 'disc-model.pth'))
            flag.inc()
def main(super_path, ckp_path, workers, datasets, xpaths, splits, use_less):
    from config_utils import dict2config
    from models import get_cell_based_tiny_net
    logger = Logger(str(ckp_path), 0, False)

    ckp = torch.load(super_path)
    from collections import OrderedDict
    state_dict = OrderedDict()
    model_name = super_path.split('/')[2][:-8]
    old_state_dict = ckp['shared_cnn'] if model_name == 'ENAS' else ckp['search_model']
    for k, v in old_state_dict.items():
        if 'module' in k:
            name = k[7:]  # remove `module.`
        else:
            name = k
        state_dict[name] = v

    model_config = dict2config({
        'name': model_name,
        'C': 16,
        'N': 5,
        'max_nodes': 4,
        'num_classes': 10,
        'space': ['none', 'skip_connect', 'nor_conv_1x1', 'nor_conv_3x3', 'avg_pool_3x3'],
        'affine': False,
        'track_running_stats': True,
    }, None)
    supernet = get_cell_based_tiny_net(model_config)
    # supernet.load_state_dict(ckp['search_model'])
    supernet.load_state_dict(state_dict)

    ckp_names = os.listdir(ckp_path)
    random.seed()  # seed from system entropy; random.seed(datetime.now()) raises on Python 3.11+
    random.shuffle(ckp_names)

    for ckp_name in ckp_names:
        if not ckp_name.endswith('.tar'):
            continue
        if 'super' in ckp_name:
            continue
        if not os.path.exists(os.path.join(ckp_path, ckp_name)):
            continue
        arch = getvalue(ckp_name, 'arch')
        op_list = get_op_list(int(arch))
        net = supernet.extract_sub({
            '1<-0': op_list[0],
            '2<-0': op_list[1],
            '2<-1': op_list[2],
            '3<-0': op_list[3],
            '3<-1': op_list[4],
            '3<-2': op_list[5],
        })
        network = torch.nn.DataParallel(net).cuda()
        valid_losses, valid_acc1s, valid_acc5s, valid_tms = evaluate_all_datasets(
            network, datasets, xpaths, splits, use_less, workers, logger)
        try:
            old_ckp = torch.load(os.path.join(ckp_path, ckp_name))
        except Exception:
            print(ckp_name)
            continue
        for key in valid_losses:
            old_ckp[key] = valid_losses[key]
        for key in valid_acc1s:
            old_ckp[key] = valid_acc1s[key]
        for key in valid_acc5s:
            old_ckp[key] = valid_acc5s[key]
        for key in valid_tms:
            old_ckp[key] = valid_tms[key]
        old_ckp['super'] = network.module.state_dict()
        cf10_super = valid_acc1s['cf10-otest-acc1']
        new_ckp_name = ckp_name[:-4] + f'_cf10-super_f{cf10_super}' + '.tar'
        torch.save(old_ckp, os.path.join(ckp_path, new_ckp_name))
        os.remove(os.path.join(ckp_path, ckp_name))
def main(args):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    prepare_seed(args.rand_seed)
    logstr = 'seed-{:}-time-{:}'.format(args.rand_seed, time_for_file())
    logger = Logger(args.save_path, logstr)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python version  : {}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow version  : {}".format(PIL.__version__))
    logger.log("PyTorch version : {}".format(torch.__version__))
    logger.log("cuDNN version   : {}".format(torch.backends.cudnn.version()))

    # General data augmentation
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    assert args.arg_flip == False, 'The flip is : {}, rotate is {}'.format(args.arg_flip, args.rotate_max)
    train_transform = [transforms.PreCrop(args.pre_crop_expand)]
    train_transform += [transforms.TrainScale2WH((args.crop_width, args.crop_height))]
    train_transform += [transforms.AugScale(args.scale_prob, args.scale_min, args.scale_max)]
    # if args.arg_flip:
    #     train_transform += [transforms.AugHorizontalFlip()]
    if args.rotate_max:
        train_transform += [transforms.AugRotate(args.rotate_max)]
    train_transform += [transforms.AugCrop(args.crop_width, args.crop_height, args.crop_perturb_max, mean_fill)]
    train_transform += [transforms.ToTensor(), normalize]
    train_transform = transforms.Compose(train_transform)
    eval_transform = transforms.Compose([
        transforms.PreCrop(args.pre_crop_expand),
        transforms.TrainScale2WH((args.crop_width, args.crop_height)),
        transforms.ToTensor(), normalize])
    assert (args.scale_min + args.scale_max) / 2 == args.scale_eval, \
        'The scale is not ok : {},{} vs {}'.format(args.scale_min, args.scale_max, args.scale_eval)

    # Model configure load
    model_config = load_configure(args.model_config, logger)
    args.sigma = args.sigma * args.scale_eval
    logger.log('Real Sigma : {:}'.format(args.sigma))

    # Training dataset
    train_data = Dataset(train_transform, args.sigma, model_config.downsample, args.heatmap_type, args.data_indicator)
    train_data.load_list(args.train_lists, args.num_pts, True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers, pin_memory=True)

    # Evaluation dataloaders
    eval_loaders = []
    if args.eval_vlists is not None:
        for eval_vlist in args.eval_vlists:
            eval_vdata = Dataset(eval_transform, args.sigma, model_config.downsample,
                                 args.heatmap_type, args.data_indicator)
            eval_vdata.load_list(eval_vlist, args.num_pts, True)
            eval_vloader = torch.utils.data.DataLoader(eval_vdata, batch_size=args.batch_size,
                                                       shuffle=False, num_workers=args.workers, pin_memory=True)
            eval_loaders.append((eval_vloader, True))
    if args.eval_ilists is not None:
        for eval_ilist in args.eval_ilists:
            eval_idata = Dataset(eval_transform, args.sigma, model_config.downsample,
                                 args.heatmap_type, args.data_indicator)
            eval_idata.load_list(eval_ilist, args.num_pts, True)
            eval_iloader = torch.utils.data.DataLoader(eval_idata, batch_size=args.batch_size,
                                                       shuffle=False, num_workers=args.workers, pin_memory=True)
            eval_loaders.append((eval_iloader, False))

    # Define network
    logger.log('configure : {:}'.format(model_config))
    net = obtain_model(model_config, args.num_pts + 1)
    assert model_config.downsample == net.downsample, \
        'downsample is not correct : {} vs {}'.format(model_config.downsample, net.downsample)
    logger.log("=> network :\n {}".format(net))

    logger.log('Training-data : {:}'.format(train_data))
    for i, eval_loader in enumerate(eval_loaders):
        eval_loader, is_video = eval_loader
        logger.log('The [{:2d}/{:2d}]-th testing-data [{:}] = {:}'.format(
            i, len(eval_loaders), 'video' if is_video else 'image', eval_loader.dataset))

    logger.log('arguments : {:}'.format(args))
    opt_config = load_configure(args.opt_config, logger)

    if hasattr(net, 'specify_parameter'):
        net_param_dict = net.specify_parameter(opt_config.LR, opt_config.Decay)
    else:
        net_param_dict = net.parameters()

    optimizer, scheduler, criterion = obtain_optimizer(net_param_dict, opt_config, logger)
    logger.log('criterion : {:}'.format(criterion))
    net, criterion = net.cuda(), criterion.cuda()
    net = torch.nn.DataParallel(net)

    last_info = logger.last_info()
    if last_info.exists():
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(last_info)
        start_epoch = last_info['epoch'] + 1
        checkpoint = torch.load(last_info['last_checkpoint'])
        assert last_info['epoch'] == checkpoint['epoch'], \
            'Last-Info is not right {:} vs {:}'.format(last_info, checkpoint['epoch'])
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        logger.log("=> load-ok checkpoint '{:}' (epoch {:}) done".format(logger.last_info(), checkpoint['epoch']))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch = 0

    if args.eval_once:
        logger.log("=> only evaluate the model once")
        eval_results = eval_all(args, eval_loaders, net, criterion, 'eval-once', logger, opt_config)
        logger.close()
        return

    # Main training and evaluation loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, opt_config.epochs):
        scheduler.step()
        need_time = convert_secs2time(epoch_time.avg * (opt_config.epochs - epoch), True)
        epoch_str = 'epoch-{:03d}-{:03d}'.format(epoch, opt_config.epochs)
        LRs = scheduler.get_lr()
        logger.log('\n==>>{:s} [{:s}], [{:s}], LR : [{:.5f} ~ {:.5f}], Config : {:}'.format(
            time_string(), epoch_str, need_time, min(LRs), max(LRs), opt_config))

        # train for one epoch
        train_loss, train_nme = train(args, train_loader, net, criterion, optimizer, epoch_str, logger, opt_config)
        # log the results
        logger.log('==>>{:s} Train [{:}] Average Loss = {:.6f}, NME = {:.2f}'.format(
            time_string(), epoch_str, train_loss, train_nme * 100))

        # remember best prec@1 and save checkpoint
        save_path = save_checkpoint({
            'epoch': epoch,
            'args': deepcopy(args),
            'arch': model_config.arch,
            'state_dict': net.state_dict(),
            'scheduler': scheduler.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, logger.path('model') / '{:}-{:}.pth'.format(model_config.arch, epoch_str), logger)
        last_info = save_checkpoint({
            'epoch': epoch,
            'last_checkpoint': save_path,
        }, logger.last_info(), logger)

        eval_results = eval_all(args, eval_loaders, net, criterion, epoch_str, logger, opt_config)

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.close()
def train(opt):
    logger = Logger(opt)  # create the logger
    flag = Flag(D_iters=opt.D_iter, G_iters=opt.G_iter, always=opt.always)  # initialize the training flag

    dataset = VISTDataset(opt)  # load the data
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.get_story_length()

    dataset.set_option(data_type={'whole_story': False, 'split_story': True, 'caption': False})

    dataset.train()
    train_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=opt.shuffle)
    dataset.val()
    val_loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False)

    bad_valid = 0
    evaluator = Evaluator(opt, 'val')
    crit = criterion.LanguageModelCriterion()
    rl_crit = criterion.ReinforceCriterion(opt, dataset)  # reinforcement-learning loss

    # set up model
    model = models.setup(opt)
    model.cuda()
    disc_opt = copy.copy(opt)
    disc_opt.model = 'RewardModel'  # override the model attribute
    disc = models.setup(disc_opt)  # discriminator model; decides which model class to instantiate
    if os.path.exists(os.path.join('./data/save/', 'disc-model.pth')):  # load parameters if a saved model exists
        logging.info("loading pretrained RewardModel")
        disc.load_state_dict(torch.load(os.path.join(logger.log_dir, 'disc-model.pth')))
    disc.cuda()

    # two optimizers for two fully independent models
    optimizer = setup_optimizer(opt, model)
    disc_optimizer = setup_optimizer(disc_opt, disc)  # fix

    dataset.train()
    model.train()
    disc.train()

    ############################## training ##################################
    for epoch in range(logger.epoch_start, opt.max_epochs):  # max epochs: 50
        start = time.time()
        for iter, batch in enumerate(train_loader):  # start iterating
            logger.iteration += 1  # track the iteration count
            torch.cuda.synchronize()

            # fetch the batch
            feature_fc = Variable(batch['feature_fc']).cuda()
            target = Variable(batch['split_story']).cuda()
            index = batch['index']

            optimizer.zero_grad()
            disc_optimizer.zero_grad()

            if flag.flag == "Disc":
                model.eval()   # freeze the policy model parameters
                disc.train()   # update the discriminator parameters
                if opt.decoding_method_DISC == 'sample':  # True: sample sequences from the probability distribution
                    seq, seq_log_probs, baseline = model.sample(feature_fc, sample_max=False,
                                                                rl_training=True, pad=True)
                elif opt.decoding_method_DISC == 'greedy':
                    seq, seq_log_probs, baseline = model.sample(feature_fc, sample_max=True,
                                                                rl_training=True, pad=True)
            else:
                model.train()  # update the generator
                disc.eval()    # keep the discriminator frozen
                seq, seq_log_probs, baseline = model.sample(feature_fc, sample_max=False,
                                                            rl_training=True, pad=True)

            seq = Variable(seq).cuda()
            mask = (seq > 0).float()  # 64,5,30
            mask = to_contiguous(
                torch.cat([Variable(mask.data.new(mask.size(0), mask.size(1), 1).fill_(1)),
                           mask[:, :, :-1]], 2))
            normed_seq_log_probs = (seq_log_probs * mask).sum(-1) / mask.sum(-1)  # 64,5: whole-sequence probability

            # reward score for the sampled sequences
            gen_score = disc(seq.view(-1, seq.size(2)), feature_fc.view(-1, feature_fc.size(2)))

            if flag.flag == "Disc":
                # train the discriminator first (the generator is pretrained), so that
                # it learns to score ground-truth versus generated data
                gt_score = disc(target.view(-1, target.size(2)),
                                feature_fc.view(-1, feature_fc.size(2)))  # reward of the real sequences
                loss = -torch.sum(gt_score) + torch.sum(gen_score)  # a negative loss is normal here

                # average rewards; training pushes the positive score as high as possible
                avg_pos_score = torch.mean(gt_score)
                avg_neg_score = torch.mean(gen_score)

                if logger.iteration % 5 == 0:
                    logging.info("pos reward {} neg reward {}".format(avg_pos_score.item(), avg_neg_score.item()))
                    # print("PREDICTION: ", utils.decode_story(dataset.get_vocab(), seq[:1].data)[0])
                    # print("GROUND TRUTH: ", utils.decode_story(dataset.get_vocab(), target[:1].data)[0])
            else:
                rewards = Variable(gen_score.data - 0 * normed_seq_log_probs.view(-1).data)
                # with open("/tmp/reward.txt", "a") as f:
                #     print(" ".join(map(str, rewards.data.cpu().numpy())), file=f)
                loss, avg_score = rl_crit(seq.data, seq_log_probs, baseline, index,
                                          rewards.view(-1, seq.size(1)))
                # if logger.iteration % opt.losses_log_every == 0:
                avg_pos_score = torch.mean(gen_score)
                # logging.info("average reward: {} average IRL score: {}".format(
                #     avg_score.item(), avg_pos_score.item()))

            if flag.flag == "Disc":
                loss.backward()
                nn.utils.clip_grad_norm(disc.parameters(), opt.grad_clip, norm_type=2)
                disc_optimizer.step()
            else:
                tf_loss = crit(model(feature_fc, target), target)
                # print("rl_loss / tf_loss = ", loss.item() / tf_loss.item())
                loss = opt.rl_weight * loss + (1 - opt.rl_weight) * tf_loss
                loss.backward()
                nn.utils.clip_grad_norm(model.parameters(), opt.grad_clip, norm_type=2)
                optimizer.step()

            train_loss = loss.item()
            torch.cuda.synchronize()

            # Write the training loss summary
            if logger.iteration % opt.losses_log_every == 0:
                logger.log_training(epoch, iter, train_loss, opt.learning_rate, model.ss_prob)
                logging.info("Epoch {} Train {} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s".format(
                    epoch, flag.flag, iter, len(train_loader), train_loss, time.time() - start))
                start = time.time()

            if logger.iteration % opt.save_checkpoint_every == 0:
                if opt.always is None:
                    # Evaluate on validation dataset and save model for every epoch
                    val_loss, predictions, metrics = evaluator.eval_story(model, crit, dataset, val_loader, opt)
                    if opt.metric == 'XE':
                        score = -val_loss
                    else:
                        score = metrics[opt.metric]
                    logger.log_checkpoint(epoch, val_loss, metrics, predictions, opt, model, dataset, optimizer)
                    # halve the learning rate if not improving for a long time
                    if logger.best_val_score > score:
                        bad_valid += 1
                        if bad_valid >= 10:
                            opt.learning_rate = opt.learning_rate / 2.0
                            logging.info("halve learning rate to {}".format(opt.learning_rate))
                            checkpoint_path = os.path.join(logger.log_dir, 'model-best.pth')
                            model.load_state_dict(torch.load(checkpoint_path))
                            utils.set_lr(optimizer, opt.learning_rate)  # set the decayed rate
                            bad_valid = 0
                        logging.info("bad valid : {}".format(bad_valid))
                    else:
                        logging.info("achieving best {} score: {}".format(opt.metric, score))
                        bad_valid = 0
                else:
                    torch.save(disc.state_dict(), os.path.join(logger.log_dir, 'disc-model.pth'))
            flag.inc()
def main(save_dir, workers, datasets, xpaths, splits, use_less, srange,
         arch_index, seeds, cover_mode, meta_info, arch_config):
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    # torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(workers)

    assert len(srange) == 2 and 0 <= srange[0] <= srange[1], 'invalid srange : {:}'.format(srange)

    if use_less:
        sub_dir = Path(save_dir) / '{:06d}-{:06d}-C{:}-N{:}-LESS'.format(
            srange[0], srange[1], arch_config['channel'], arch_config['num_cells'])
    else:
        sub_dir = Path(save_dir) / '{:06d}-{:06d}-C{:}-N{:}'.format(
            srange[0], srange[1], arch_config['channel'], arch_config['num_cells'])
    logger = Logger(str(sub_dir), 0, False)

    all_archs = meta_info['archs']
    assert srange[1] < meta_info['total'], 'invalid range : {:}-{:} vs. {:}'.format(
        srange[0], srange[1], meta_info['total'])
    assert arch_index == -1 or srange[0] <= arch_index <= srange[1], \
        'invalid range : {:} vs. {:} vs. {:}'.format(srange[0], arch_index, srange[1])
    if arch_index == -1:
        to_evaluate_indexes = list(range(srange[0], srange[1] + 1))
    else:
        to_evaluate_indexes = [arch_index]
    logger.log('xargs : seeds      = {:}'.format(seeds))
    logger.log('xargs : arch_index = {:}'.format(arch_index))
    logger.log('xargs : cover_mode = {:}'.format(cover_mode))
    logger.log('-' * 100)
    logger.log('Start evaluating range =: {:06d} vs. {:06d} vs. {:06d} / {:06d} with cover-mode={:}'.format(
        srange[0], arch_index, srange[1], meta_info['total'], cover_mode))
    for i, (dataset, xpath, split) in enumerate(zip(datasets, xpaths, splits)):
        logger.log('--->>> Evaluate {:}/{:} : dataset={:9s}, path={:}, split={:}'.format(
            i, len(datasets), dataset, xpath, split))
    logger.log('--->>> architecture config : {:}'.format(arch_config))

    start_time, epoch_time = time.time(), AverageMeter()
    for i, index in enumerate(to_evaluate_indexes):
        arch = all_archs[index]
        logger.log('\n{:} evaluate {:06d}/{:06d} ({:06d}/{:06d})-th architecture [seeds={:}] {:}'.format(
            '-' * 15, i, len(to_evaluate_indexes), index, meta_info['total'], seeds, '-' * 15))
        # logger.log('{:} {:} {:}'.format('-' * 15, arch.tostr(), '-' * 15))
        logger.log('{:} {:} {:}'.format('-' * 15, arch, '-' * 15))
        # test this arch on different datasets with different seeds
        has_continue = False
        for seed in seeds:
            to_save_name = sub_dir / 'arch-{:06d}-seed-{:04d}.pth'.format(index, seed)
            if to_save_name.exists():
                if cover_mode:
                    logger.log('Find existing file : {:}, remove it before evaluation'.format(to_save_name))
                    os.remove(str(to_save_name))
                else:
                    logger.log('Find existing file : {:}, skip this evaluation'.format(to_save_name))
                    has_continue = True
                    continue
            results = evaluate_all_datasets(CellStructure.str2structure(arch),
                                            datasets, xpaths, splits, use_less, seed,
                                            arch_config, workers, logger)
            torch.save(results, to_save_name)
            logger.log('{:} --evaluate-- {:06d}/{:06d} ({:06d}/{:06d})-th seed={:} done, save into {:}'.format(
                '-' * 15, i, len(to_evaluate_indexes), index, meta_info['total'], seed, to_save_name))
        # measure elapsed time
        if not has_continue:
            epoch_time.update(time.time() - start_time)
            start_time = time.time()
        need_time = 'Time Left: {:}'.format(
            convert_secs2time(epoch_time.avg * (len(to_evaluate_indexes) - i - 1), True))
        logger.log('This arch costs : {:}'.format(convert_secs2time(epoch_time.val, True)))
        logger.log('{:}'.format('*' * 100))
        logger.log('{:} {:74s} {:}'.format(
            '*' * 10,
            '{:06d}/{:06d} ({:06d}/{:06d})-th done, left {:}'.format(
                i, len(to_evaluate_indexes), index, meta_info['total'], need_time),
            '*' * 10))
        logger.log('{:}'.format('*' * 100))
    logger.close()
class ScraperBase(abc.ABC):
    """
    Scraper abstract base class - to derive from this class:
    (1) implement scrape_worker()
    (2) execute scrape(), which wraps scrape_worker()
    """
    # note: the original declared `__metaclass__ = abc.ABCMeta`, which is the
    # Python 2 idiom and has no effect under Python 3; inheriting from abc.ABC
    # restores the intended abstract-class behaviour.

    def __init__(self, s_log_filename="scraper_base.log", s_log_level="DEBUG",
                 s_url=None, regexp=None, b_global=False):
        """ constructor - must supply the logging parameters """
        self.log = Logger(s_log_filename, s_log_level)
        self._s_url = s_url
        self._regexp = regexp
        self._retry_delay_seconds = RETRY_DELAY_SECONDS
        self._max_retries = MAX_RETRIES
        self._last_scraped_url = None
        self._b_global = b_global

    @abc.abstractmethod
    def scrape_worker(self):
        """
        this is the function that derived scrapers must implement and which
        does all the scraping work; the rest of this class is just for
        structural support
        """
        s_html = None
        if self._s_url is None:
            self.log.warning("this object was initialized without a URL!")
            return None
        else:
            s_html = self.fetch_html(self._s_url)[0]
        if self._regexp is None:
            return s_html
        elif s_html is not None:
            if self._b_global:
                return self._regexp.findall(s_html)
            else:
                match = self._regexp.search(s_html)
                if match is not None:
                    return match.groups()
                else:
                    return None

    def scrape(self):
        """ calls scrape_worker() but with logging and timing infrastructure """
        start_time = time.time()
        response = self.scrape_worker()
        elapsed_time = time.time() - start_time
        self.log.debug("url=%s, duration=%f seconds" % (self._last_scraped_url, elapsed_time))
        return response

    def fetch_rss(self, s_url):
        """ fetches the rss feed at a url given by url string s_url """
        b_noanswer = True
        n_tries = 0
        response = None
        while b_noanswer and n_tries < self._max_retries:
            try:
                response = feedparser.parse(s_url)
                self._last_scraped_url = s_url
                b_noanswer = False
            except RuntimeError as ex:
                self.log.error("Cannot open %s\n%s\nretrying in %2.2f s\n" %
                               (s_url, str(ex), self._retry_delay_seconds))
                time.sleep(self._retry_delay_seconds)  # sleep for the advertised delay (was a hard-coded 1 s)
                b_noanswer = True
                n_tries += 1
        return response

    def fetch_html(self, s_url):
        """ fetches the html at a url given by url string s_url """
        # spoof the user agent; previously used strings (kept for reference):
        #   "User-Agent": "Mozilla/5.0(Windows; U; Windows NT 5.1; en-US) Ap"+...
        #   "User-Agent": "Mozilla/5.0 (compatible; VideoSurf_bot +ht...
        #                  ...tp://www.videosurf.com/bot.html)"
        d_headers = {"User-Agent": "", "Referer": "http://python.org"}
        # create a request object for the URL (urllib.request replaces the old urllib2)
        req = request.Request(s_url, headers=d_headers)
        # create an opener object
        opener = request.build_opener()
        # open a connection and receive the http response headers + contents
        b_noanswer = True
        n_tries = 0
        contents = None
        headers = None
        code = None
        while b_noanswer and n_tries < self._max_retries:
            try:
                response = opener.open(req)
                self._last_scraped_url = s_url
                b_noanswer = False
                # return values
                contents = response.read()
                headers = response.headers
                code = response.code
            except (request.HTTPError, request.URLError) as ex:
                # HTTPError/URLError are re-exported by urllib.request; the
                # original referenced the Python 2 urllib2 names here
                s_message = "Cannot open %s\n%s\nretrying in %2.2f s\n" % \
                    (s_url, str(ex), self._retry_delay_seconds)
                sys.stderr.write(s_message + "\n")
                self.log.error(s_message)
                time.sleep(self._retry_delay_seconds)
                b_noanswer = True
                n_tries += 1
        return contents, headers, code

    @staticmethod
    def get_text_from_html(s_html, s_separator=" "):
        """
        returns the first non-empty text element in html string snippet
        's_html', with its internal whitespace joined by 's_separator'
        """
        re_text = re.compile('>([^<>]+)<')
        s_result = re_text.findall(str(s_html))[0]
        return s_separator.join(s_result.split())

    @staticmethod
    def get_table_from_html(s_html, i_table=0):
        """
        return a list of lists, each containing the text elements of an html
        table extracted from soup 'soup' (specifically, using zero indexing,
        table 'i_table' of all the tables detected in 'soup')
        """
        soup = BeautifulSoup(s_html, features="html.parser")
        tables = soup('table')
        table = tables[i_table]
        l_text_rows = []
        l_html_rows = table.findAll('tr')
        for html_row in l_html_rows:
            l_text_cells = []
            l_html_cells = html_row.findAll('td')
            for html_cell in l_html_cells:
                l_text_cells.append(ScraperBase.get_text_from_html(html_cell))
            l_text_rows.append(l_text_cells)
        return l_text_rows
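# A minimal usage sketch for ScraperBase, following the recipe in its
# docstring: subclass it, implement scrape_worker() (here simply delegating to
# the default regexp-driven logic in the abstract method's body), then call
# scrape(). TitleScraper is hypothetical; the bytes regex matches the bytes
# returned by fetch_html(), and the module-level constants and Logger used by
# the base class are assumed to be available.
class TitleScraper(ScraperBase):
    def scrape_worker(self):
        # reuse the base class's fetch-and-match behaviour
        return super().scrape_worker()


if __name__ == "__main__":
    scraper = TitleScraper(s_url="http://python.org",
                           regexp=re.compile(rb"<title>(.*?)</title>"))
    print(scraper.scrape())  # logs the fetch duration, returns the match groups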