def __init__(self, model, dataset, ctx=-1, batch_size=128, optimizer='sgd',
             lambdas=[0.1, 0.1], print_freq=32):
    self.model = model
    self.dataset = dataset
    self.batch_size = batch_size
    self.optbb = optim.SGD(
        chain(self.model.age_classifier.parameters(),
              self.model.RFM.parameters(),
              self.model.margin_fc.parameters(),
              self.model.backbone.parameters()),
        lr=0.01, momentum=0.9)
    self.optDAL = optim.SGD(self.model.DAL.parameters(), lr=0.01, momentum=0.9)
    self.lambdas = lambdas
    self.print_freq = print_freq
    self.id_recorder = Recorder()
    self.age_recorder = Recorder()
    self.trainingDAL = False
    if ctx < 0:
        self.ctx = torch.device('cpu')
    else:
        self.ctx = torch.device(f'cuda:{ctx}')
def __init__(self, worker_id, num_env, game_name, n_stack, child_conn, args):
    super(worker, self).__init__()
    self.daemon = True
    self.worker_id = worker_id
    self.num_env = num_env
    self.n_stack = n_stack
    self.child_conn = child_conn
    self.args = args
    self.envs = []
    self.index_base = worker_id * num_env
    self.episode_length = [0] * num_env
    for i in range(num_env):
        time.sleep(0.1)
        access_index = self.index_base + i
        env = atari(game_name, n_stack)
        env.reset()
        self.envs.append(env)
    if args.record:
        self.recorder = []
        for i in range(num_env):
            self.recorder.append(Recorder(int(worker_id * num_env + i), game_name))
def main():
    global ssn, folder
    runbool = True
    while runbool:
        mkdir()
        ssn = None
        print("Program started")
        while not ssnreceived():
            print('waiting for ssn')
        rec = Recorder(ssn, folder)
        rec.run()
        stop = None
        while stop is None:
            stop = input('Enter something to stop the recording')
            print(' ')
            if str(stop.strip()) == 'stop':
                runbool = False
            time.sleep(0.2)
        print(stop)
        if rec is not None:
            rec.stop()
        filename = ssn
        if os.path.isfile(folder + filename + '.wav'):
            db = Database(folder, ssn, filename)
            db.upload()
            db.adduser()
            db.addfile()
            db.addfiletouser()
        else:
            print('Not added to database')
def __init__(self, model, dataset, ctx=-1, batch_size=128, optimizer='sgd',
             grad_accu=1, lambdas=[0.05, 0.1], print_freq=32, train_head_only=True):
    self.model = model
    self.dataset = dataset
    self.batch_size = batch_size
    self.finetune_layers = (
        # self.model.backbone.repeat_3[-1:],
        self.model.backbone.last_bn,
        self.model.backbone.last_linear,
        self.model.backbone.block8,
    )
    first_group = [
        {
            "params": chain(
                self.model.age_classifier.parameters(),
                self.model.RFM.parameters(),
                self.model.margin_fc.parameters(),
            ),
            "lr": 5e-4,
        }
    ]
    if not train_head_only:
        # first_group[0]["lr"] = 1e-4
        first_group.append(
            {
                "params": chain(*(x.parameters() for x in self.finetune_layers)),
                "lr": 5e-5,
            }
        )
    self.optbb = RAdam(first_group)
    self.optDAL = RAdam(self.model.DAL.parameters(), lr=5e-4)
    self.lambdas = lambdas
    self.print_freq = print_freq
    self.id_recorder = Recorder()
    self.age_recorder = Recorder()
    self.trainingDAL = False
    if ctx < 0:
        self.ctx = torch.device('cpu')
    else:
        self.ctx = torch.device(f'cuda:{ctx}')
    self.scaler1 = GradScaler()
    self.scaler2 = GradScaler()
    self.grad_accu = grad_accu
    self.train_head_only = train_head_only
def train():
    model, recorder = mdl.Classifier(), Recorder()
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=constants.WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, constants.EPOCHS)
    trainer = Trainer(model, optimizer, scheduler, recorder)
    trainer.fit(constants.EPOCHS)
    trainer.save_model()
    recorder.plot()
def test(args):
    model_path = sorted(glob(os.path.join('ckpt', args.tag, '*.pth')))[-1]
    model = torch.load(model_path, map_location='cpu').eval()
    print('Loaded model: {}'.format(model_path))
    model_name = os.path.basename(os.path.splitext(model_path)[0])

    # initialize video writer
    video_filename = 'output_{}_{}.avi'.format(args.tag, model_name)
    dict_screen_shape = {"flappy": (288, 512), "pixelcopter": (48, 48)}
    out = Recorder(video_filename=video_filename, fps=30,
                   width=dict_screen_shape[args.game][0],
                   height=dict_screen_shape[args.game][1])

    score_list = []
    time_list = []
    game = Game(game=args.game)
    for trials in range(10):
        elapsed_Time = 0
        action = torch.zeros([model.number_of_actions], dtype=torch.float32)
        terminal = game.game_over()
        start = time.time()
        score = 0
        image_data = game.get_torch_image()
        state = torch.cat((image_data, image_data, image_data, image_data)).unsqueeze(0)
        while not terminal:
            output = model(state)[0]
            action = torch.zeros([model.number_of_actions], dtype=torch.float32)
            action_index = torch.argmax(output)
            score += game.act(action_index)
            terminal = game.game_over()
            image_data_1 = game.get_torch_image()
            state = torch.cat((state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)
            out.write(game.get_image())
        game.reset_game()
        score_list.append(score)
        time_list.append(time.time() - start)
        print('Game Ended!')
        print('Score: {} !'.format(score))

    # Add summary
    out.write_score(sum(score_list), sum(time_list))
    out.save()
    print('Total Score: {}'.format(sum(score_list)))
    print('Total Run Time: {:.3f}'.format(sum(time_list)))
    print('Saved video: {}'.format(video_filename))
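# A minimal sketch of the video-writing Recorder assumed by the test() function
# above (constructed with video_filename/fps/width/height, then write(),
# write_score(), save()). The cv2.VideoWriter backend, MJPG codec, and the text
# overlay in write_score() are assumptions, not the original implementation.
import cv2
import numpy as np


class Recorder:
    def __init__(self, video_filename, fps, width, height):
        self.width, self.height = width, height
        fourcc = cv2.VideoWriter_fourcc(*'MJPG')
        self.writer = cv2.VideoWriter(video_filename, fourcc, fps, (width, height))

    def write(self, frame):
        # frame is expected as an HxWx3 uint8 image; resize to the target shape
        self.writer.write(cv2.resize(frame, (self.width, self.height)))

    def write_score(self, total_score, total_time):
        # append a short summary frame with the final score and run time
        summary = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        text = 'score: {} time: {:.1f}s'.format(total_score, total_time)
        cv2.putText(summary, text, (5, self.height // 2),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
        self.writer.write(summary)

    def save(self):
        self.writer.release()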
def __init__(self, arg):
    self.arg = arg
    self.save_arg()
    if self.arg.random_fix:
        self.rng = RandomState(seed=self.arg.random_seed)
    self.device = GpuDataParallel()
    self.recoder = Recorder(self.arg.work_dir, self.arg.print_log)
    self.data_loader = {}
    self.topk = (1, 5)
    self.stat = Stat(self.arg.model_args['num_classes'], self.topk)
    self.model, self.optimizer = self.Loading()
    self.loss = self.criterion()
def train(args):
    Arguments.save_args(args, args.args_path)
    train_loader, val_loader, _ = get_dataloaders(args)
    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    optimizer = get_optimizer(args.optimizer, model)
    lr_scheduler = LRScheduler(args.lr_scheduler, optimizer)
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    model_saver = ModelSaver(args.model_path)
    recorder = Recorder(['train_miou', 'train_acc', 'train_loss',
                         'val_miou', 'val_acc', 'val_loss'])
    for epoch in range(args.n_epochs):
        print(f"{args.experim_name} Epoch {epoch+1}:")
        train_loss, train_acc, train_miou, train_ious = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            criterion=criterion,
            device=args.device,
        )
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")
        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=criterion,
            device=args.device,
        )
        val_miou, val_ious, val_acc = val_scores['mIoU'], val_scores['IoUs'], val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")
        recorder.update([train_miou, train_acc, train_loss, val_miou, val_acc, val_loss])
        recorder.save(args.record_path)
        if args.metric.startswith("IoU"):
            metric = val_ious[int(args.metric.split('_')[1])]
        else:
            metric = val_miou
        model_saver.save_models(metric, epoch + 1, model,
                                ious={'train': train_ious, 'val': val_ious})
    print(f"best model at epoch {model_saver.best_epoch} with miou {model_saver.best_score:.5f}")
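# A minimal sketch of a Recorder compatible with the usage in train() above:
# constructed with a list of metric names, update() appends one value per metric,
# and save() dumps the history to disk. The interface is inferred from the calls
# above; the JSON file format here is an assumption, not the original code.
import json


class Recorder:
    def __init__(self, metric_names):
        # one history list per tracked metric
        self.names = list(metric_names)
        self.record = {name: [] for name in self.names}

    def update(self, values):
        # values are given in the same order as the metric names
        for name, value in zip(self.names, values):
            self.record[name].append(float(value))

    def save(self, path):
        # persist the full history (format is an assumption)
        with open(path, 'w') as f:
            json.dump(self.record, f, indent=2)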
cfg['saveto'] = './model_200/'
cfg['report_interval'] = args.report
print(cfg)
train_data = batchify(corpus.train, cfg['batch_size'])
val_data = batchify(corpus.valid, cfg['batch_size'])
test_data = batchify(corpus.test, cfg['batch_size'])
with open(cfg['init'], 'rb') as f:
    policy = torch.load(f)
print(policy)
reinforce_model = Reinforce(policy=policy, sigma=cfg['sigma'], gamma=cfg['gamma'])
recorder = Recorder(output_path=cfg['output_file'])
valid_loss = []
loss = evaluate(val_data, reinforce_model.policy, cfg)
print('start from valid loss = ', loss)
valid_loss.append(loss)
ntokens = cfg['dict_size']
optimizer = optim.Adam(reinforce_model.parameters(), lr=cfg['lr'])
start_time = time.time()
for epoch in range(cfg['epochs']):
    total_loss = 0.0
    total_LM_loss = 0.0
    for i in range(0, train_data.size(0) - 1, cfg['max_len']):
def run(rank, args):
    base_setting(args)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    gpuid = args.gpuid[rank]
    is_master = rank == 0
    is_mp = len(args.gpuid) > 1
    world_size = len(args.gpuid)
    if is_master:
        recorder = Recorder(args.log)
    tok = BertTokenizer.from_pretrained(args.model_type)
    if args.use_ids:
        collate_fn = partial(collate_mp_ids, pad_token_id=tok.pad_token_id, is_test=False)
        collate_fn_val = partial(collate_mp_ids, pad_token_id=tok.pad_token_id, is_test=True)
        train_set = RefactoringIDsDataset(
            f"./{args.dataset}/{args.datatype}/train", args.model_type,
            maxlen=args.max_len, max_num=args.max_num)
        val_set = RefactoringIDsDataset(
            f"./{args.dataset}/{args.datatype}/val", args.model_type,
            is_test=True, maxlen=512, is_sorted=False)
    else:
        collate_fn = partial(collate_mp, pad_token_id=tok.pad_token_id, is_test=False)
        collate_fn_val = partial(collate_mp, pad_token_id=tok.pad_token_id, is_test=True)
        train_set = RefactoringDataset(
            f"./{args.dataset}/{args.datatype}/train", args.model_type,
            maxlen=args.max_len, maxnum=args.max_num)
        val_set = RefactoringDataset(
            f"./{args.dataset}/{args.datatype}/val", args.model_type,
            is_test=True, maxlen=512, is_sorted=False, maxnum=args.max_num)
    if is_mp:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_set, num_replicas=world_size, rank=rank, shuffle=True)
        dataloader = DataLoader(train_set, batch_size=args.batch_size, shuffle=False,
                                num_workers=4, collate_fn=collate_fn, sampler=train_sampler)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_set, num_replicas=world_size, rank=rank)
        val_dataloader = DataLoader(val_set, batch_size=8, shuffle=False, num_workers=4,
                                    collate_fn=collate_fn_val, sampler=val_sampler)
    else:
        dataloader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True,
                                num_workers=4, collate_fn=collate_fn)
        val_dataloader = DataLoader(val_set, batch_size=8, shuffle=False, num_workers=4,
                                    collate_fn=collate_fn_val)

    # build models
    model_path = args.pretrained if args.pretrained is not None else args.model_type
    model = Refactor(model_path, num_layers=args.num_layers)
    if args.model_pt is not None:
        model.load_state_dict(torch.load(args.model_pt, map_location=f'cuda:{gpuid}'))
    if args.cuda:
        if len(args.gpuid) == 1:
            model = model.cuda()
        else:
            dist.init_process_group("nccl", rank=rank, world_size=world_size)
            model = nn.parallel.DistributedDataParallel(
                model.to(gpuid), [gpuid], find_unused_parameters=True)
    model.train()
    init_lr = args.max_lr / args.warmup_steps
    optimizer = optim.Adam(model.parameters(), lr=init_lr)
    if is_master:
        recorder.write_config(args, [model], __file__)
    minimum_loss = 100
    all_step_cnt = 0

    # start training
    for epoch in range(args.epoch):
        optimizer.zero_grad()
        step_cnt = 0
        steps = 0
        avg_loss = 0
        for (i, batch) in enumerate(dataloader):
            if args.cuda:
                to_cuda(batch, gpuid)
            step_cnt += 1
            output = model(batch["src_input_ids"], batch["candidate_ids"],
                           batch["tgt_input_ids"])
            similarity, gold_similarity = output['score'], output['summary_score']
            loss = args.scale * RankingLoss(similarity, gold_similarity, args.margin,
                                            args.gold_margin, args.gold_weight,
                                            no_gold=args.no_gold)
            loss = loss / args.accumulate_step
            avg_loss += loss.item()
            loss.backward()
            if step_cnt == args.accumulate_step:
                if args.grad_norm > 0:
                    nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm)
                step_cnt = 0
                steps += 1
                all_step_cnt += 1
                lr = args.max_lr * min(all_step_cnt ** (-0.5),
                                       all_step_cnt * (args.warmup_steps ** (-1.5)))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                optimizer.step()
                optimizer.zero_grad()
            if steps % args.report_freq == 0 and step_cnt == 0 and is_master:
                recorder.print("epoch: %d, batch: %d, avg loss: %.6f"
                               % (epoch + 1, steps, avg_loss / args.report_freq))
                recorder.print(f"learning rate: {lr:.6f}")
                recorder.plot("loss", {"loss": avg_loss / args.report_freq}, all_step_cnt)
                recorder.print()
                avg_loss = 0
            del similarity, gold_similarity, loss
            if all_step_cnt % args.test_freq == 0 and all_step_cnt != 0 and step_cnt == 0:
                loss = test(val_dataloader, model, args, gpuid)
                if loss < minimum_loss and is_master:
                    minimum_loss = loss
                    if is_mp:
                        recorder.save(model.module, "model.bin")
                    else:
                        recorder.save(model, "model.bin")
                    recorder.save(optimizer, "optimizer.bin")
                    recorder.print("best - epoch: %d, batch: %d"
                                   % (epoch + 1, i / args.accumulate_step + 1))
                if is_master:
                    if is_mp:
                        recorder.save(model.module, "model_cur.bin")
                    else:
                        recorder.save(model, "model_cur.bin")
                    recorder.save(optimizer, "optimizer_cur.bin")
                    recorder.print("val score: %.6f" % (1 - loss))
def main():
    opts = BaseOptions()
    args = opts.parse()
    logger = Logger(args.save_path)
    opts.print_options(logger)

    mean = np.array([0.485, 0.406, 0.456])
    std = np.array([0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop((224, 224), padding=7),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    test_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mean, std)])

    train_data = Market('data/{}.mat'.format(args.dataset), state='train',
                        transform=train_transform)
    gallery_data = Market('data/{}.mat'.format(args.dataset), state='gallery',
                          transform=test_transform)
    probe_data = Market('data/{}.mat'.format(args.dataset), state='probe',
                        transform=test_transform)
    num_classes = train_data.return_num_class()

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                               shuffle=True, num_workers=2,
                                               pin_memory=True, drop_last=True)
    gallery_loader = torch.utils.data.DataLoader(gallery_data, batch_size=args.batch_size,
                                                 shuffle=False, num_workers=2,
                                                 pin_memory=True)
    probe_loader = torch.utils.data.DataLoader(probe_data, batch_size=args.batch_size,
                                               shuffle=False, num_workers=2,
                                               pin_memory=True)

    net = resnet.resnet50(pretrained=False, num_classes=num_classes).cuda()
    checkpoint = torch.load(args.pretrain_path)
    fixed_layers = ('fc', )
    state_dict = reset_state_dict(checkpoint, net, *fixed_layers)
    net.load_state_dict(state_dict)
    logger.print_log('loaded pre-trained feature net')

    criterion_CE = nn.CrossEntropyLoss().cuda()
    bn_params, conv_params = partition_params(net, 'bn')
    optimizer = torch.optim.SGD(
        [{'params': bn_params, 'weight_decay': 0}, {'params': conv_params}],
        lr=args.lr, momentum=0.9, weight_decay=args.wd)

    train_stats = ('acc', 'loss')
    val_stats = ('acc', )
    recorder = Recorder(args.epochs, val_stats[0], train_stats, val_stats)
    logger.print_log('observing training stats: {} \nvalidation stats: {}'.format(
        train_stats, val_stats))

    start_epoch = 0
    if args.resume:
        if os.path.isfile(args.resume):
            logger.print_log("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            recorder = checkpoint['recorder']
            start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.print_log("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            logger.print_log("=> no checkpoint found at '{}'".format(args.resume))

    # Main loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, args.epochs):
        need_hour, need_mins, need_secs = convert_secs2time(
            epoch_time.avg * (args.epochs - epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)
        logger.print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s}'.format(
            time_string(), epoch, args.epochs, need_time))
        lr, _ = adjust_learning_rate(optimizer, (args.lr, args.lr), epoch,
                                     args.epochs, args.lr_strategy)
        print(" lr:{}".format(lr))

        train(train_loader, net, criterion_CE, optimizer, epoch, recorder, logger, args)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': net.state_dict(),
                'recorder': recorder,
                'optimizer': optimizer.state_dict(),
            }, False, args.save_path, 'checkpoint.pth.tar')
        recorder.plot_curve(os.path.join(args.save_path, 'curve.png'))

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    evaluate(gallery_loader, probe_loader, net, args.epochs - 1, recorder, logger)
def main(config, resume):
    set_seed(config['seed'])
    train_recorder = Recorder()

    # setup data_loader instances
    train_data = getattr(module_data, config['dataloader']['type'])(
        data_path=config['dataloader']['args']['train_data'],
        data_quota=config['dataloader']['args']['data_quota'])
    logging.info('using %d examples to train. ' % len(train_data))
    data_loader = DataLoader(dataset=train_data,
                             batch_size=config['dataloader']['args']['batch_size'])

    # val_data = getattr(module_data, config['dataloader']['type'])(
    #     data_path=config['dataloader']['args']['val_data'],
    #     data_quota=config['dataloader']['args']['data_quota'])
    # logging.info('using %d examples to val. ' % len(val_data))
    # valid_data_loader = DataLoader(dataset=val_data,
    #                                batch_size=config['data_loader']['batch_size'])

    # build model architecture
    model = getattr(models, config['model']['type'])(config['model']['args'],
                                                     device=config['device'])
    logging.info(['my PID is: ', os.getpid()])

    # get function handles of loss and metrics
    loss = getattr(module_loss, config['loss'])()
    # metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer and learning rate scheduler; delete every line containing
    # lr_scheduler to disable the scheduler
    g_trainable_params = filter(lambda p: p.requires_grad, model.G.parameters())
    g_optimizer = getattr(torch.optim, config['optimizer']['generator']['type'])(
        g_trainable_params, **config['optimizer']['generator']['args'])
    d_trainable_params = filter(lambda p: p.requires_grad, model.D.parameters())
    d_optimizer = getattr(torch.optim, config['optimizer']['discriminator']['type'])(
        d_trainable_params, **config['optimizer']['discriminator']['args'])

    trainer = Trainer(model, loss, g_optimizer, d_optimizer,
                      resume=resume,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=None,
                      metrics=None,
                      lr_scheduler=None,
                      train_recorder=train_recorder)
    logging.info('begin training. ')
    trainer.train()
def main(args):
    if args.seed:
        np.random.seed(int(args.seed))
        torch.backends.cudnn.deterministic = True
        torch.manual_seed(0)
    config = get_config(args.dataset, args.version)
    method = config['model']
    criterion = nn.CrossEntropyLoss().cuda()
    try:
        model = model_mappings[method](K=config['n_class']).cuda()
    except KeyError:
        print('%s model does not exist' % method)
        sys.exit(1)
    model_dir = './saved/%s_%s.pth' % (config['name'], method)

    if args.mode == 'train':
        log_dir = './log/%s_%s.log' % (config['name'], method)
        train_loader, validation_loader = get_dataloader(config)
        if config['optimizer'] == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=config['lr'], weight_decay=5e-4)
        elif config['optimizer'] == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                                  momentum=0.9, weight_decay=5e-4)
        else:
            print('cannot find %s optimizer' % config['optimizer'])
            sys.exit(1)
        scheduler = ReduceLROnPlateau(optimizer, patience=3)
        recorder = Recorder(('loss_train', 'acc_train', 'loss_val', 'acc_val'))
        iou_val_max = 0
        for epoch in range(1, config['epoch'] + 1):
            print('Epoch %s:' % epoch)
            loss_train, acc_train = train(config, model, criterion, optimizer,
                                          train_loader, method=method)
            loss_val, acc_val, iou_val = evaluate(config, model, criterion,
                                                  validation_loader, method=method)
            scheduler.step(loss_train)
            # update loss and accuracy per epoch
            recorder.update((loss_train, acc_train, loss_val, acc_val))
            # save model with higher iou
            if iou_val > iou_val_max and args.save:
                torch.save(recorder.record, log_dir)
                torch.save(
                    {
                        'epoch': epoch,
                        'version': args.version,
                        'model_state_dict': model.state_dict(),
                    }, model_dir)
                print('validation iou improved from %.5f to %.5f. Model Saved.'
                      % (iou_val_max, iou_val))
                iou_val_max = iou_val
    elif args.mode == 'evaluate':
        test_dir = '%s/%s' % (config['root'], args.test_folder)
        test_set = Dataset(test_dir, config['size'],
                           *get_transform(config, is_train=False))
        test_loader = DataLoader(test_set, batch_size=1, shuffle=False,
                                 num_workers=0, drop_last=False)
        model.load_state_dict(torch.load(model_dir)['model_state_dict'])
        # save prediction results, make directories if they do not exist
        save_dir = '%s/predictions/%s_%s' % (test_dir, args.version, method)
        if not os.path.isdir('%s/predictions' % test_dir):
            os.mkdir('%s/predictions' % test_dir)
        if not os.path.isdir(save_dir):
            os.mkdir(save_dir)
        evaluate(config, model, criterion, test_loader, method=method,
                 test_flag=True, save_dir=save_dir)
    else:
        print('%s mode does not exist' % args.mode)
def main():
    # fetch arguments
    args = parse_args()
    # initialize logger and recorder
    logger = SysLogger(LOGFILE)
    recorder = Recorder(RECORDER_FILE)
    logger.info('starting...')
    rl_knobs = knobs.get_rl_knobs(args.scenario)
    pso_knobs = knobs.get_pso_knobs(args.scenario)
    bm = benchmark.get_benchmark_instance(args.benchmark)
    env = DB_Env(db_info=args.db_info, benchmark=bm, recorder=recorder)
    if len(rl_knobs) == 0 and args.is_train:
        print(SysLogger)
        logger.print('current mode is training, so you must set reinforcement learning knobs.',
                     fd=SysLogger.stderr)
        return -1

    # reinforcement learning
    if len(rl_knobs) > 0:
        env.set_tuning_knobs(rl_knobs)
        # lazy import: loading tensorflow is expensive
        from algorithms.rl_agent import RLAgent

        rl = RLAgent(env, agent='ddpg')
        if args.is_train:
            rl.fit(STEPS, nb_max_episode_steps=NB_MAX_EPISODE_STEPS)
            rl.save(args.model_path)
            logger.print('saved model at %s' % args.model_path)
            return 0  # training mode stops here

        if not args.model_path:
            from sys import stderr
            print('have no model path, you can use --model-path argument.',
                  file=stderr, flush=True)
            exit(-1)
        rl.load(args.model_path)
        rl.test(TEST_EPISODES, nb_max_episode_steps=NB_MAX_EPISODE_STEPS)
        recorder.write_best_val('reward')

    # heuristic algorithm
    if len(pso_knobs) > 0:
        env.set_tuning_knobs(pso_knobs)

        def heuristic_callback(v):
            s, r, d, _ = env.step(v, False)
            return -r  # negated reward

        pso = Pso(func=heuristic_callback, dim=len(pso_knobs), particle_nums=3,
                  max_iteration=100, x_min=0, x_max=1, max_vel=0.5)
        pso.update()
        # if you have other approaches, you can code here.
        recorder.write_best_val('reward')

    logger.print('please see result at logfile: %s.' % RECORDER_FILE)
def test(self, data_fetcher, num_samples, if_baseline=False, if_return_each=False,
         img_save_folder=None, if_train=True):
    """
    val (in training): idx_out=0/1/2/3/4
    test: idx_out=-2, record time wo. iqa
    """
    if if_baseline or if_train:
        assert self.crit_lst is not None, 'NO METRICS!'

    if self.crit_lst is not None:
        if_tar_only = False
        msg = 'dst vs. src | ' if if_baseline else 'tar vs. src | '
    else:
        if_tar_only = True
        msg = 'only get dst | '

    report_dict = None

    recorder_dict = dict()
    for crit_name in self.crit_lst:
        recorder_dict[crit_name] = Recorder()

    write_dict_lst = []
    timer = CUDATimer()

    # validation baseline: no iqa, no parse name
    # validation, not baseline: no iqa, parse name
    # test baseline: no iqa, no parse name
    # test, no baseline: iqa, no parse name
    if_iqa = True if (not if_train) and (not if_baseline) else False
    if if_iqa:
        timer_wo_iqam = Recorder()
        idx_out = -2  # testing; judge by IQAM
    if_parse_name = True if if_train and (not if_baseline) else False

    self.set_eval_mode()
    data_fetcher.reset()
    test_data = data_fetcher.next()
    assert len(test_data['name']) == 1, 'ONLY SUPPORT bs==1!'

    pbar = tqdm(total=num_samples, ncols=100)
    while test_data is not None:
        im_lq = test_data['lq'].cuda(non_blocking=True)  # assume bs=1
        im_name = test_data['name'][0]  # assume bs=1

        if if_parse_name:
            im_type = im_name.split('_')[-1].split('.')[0]
            if im_type in ['qf50', 'qp22']:
                idx_out = 0
            elif im_type in ['qf40', 'qp27']:
                idx_out = 1
            elif im_type in ['qf30', 'qp32']:
                idx_out = 2
            elif im_type in ['qf20', 'qp37']:
                idx_out = 3
            elif im_type in ['qf10', 'qp42']:
                idx_out = 4
            else:
                raise Exception(f"im_type IS {im_type}, NO MATCHING TYPE!")

        timer.start_record()
        if if_tar_only:
            if if_iqa:
                time_wo_iqa, im_out = self.model.net[self.model.infer_subnet](
                    inp_t=im_lq, idx_out=idx_out).clamp_(0., 1.)
            else:
                im_out = self.model.net[self.model.infer_subnet](
                    inp_t=im_lq, idx_out=idx_out).clamp_(0., 1.)
            timer.record_inter()
        else:
            im_gt = test_data['gt'].cuda(non_blocking=True)  # assume bs=1
            if if_baseline:
                im_out = im_lq
            else:
                if if_iqa:
                    time_wo_iqa, im_out = self.model.net[self.model.infer_subnet](
                        inp_t=im_lq, idx_out=idx_out)
                    im_out = im_out.clamp_(0., 1.)
                else:
                    im_out = self.model.net[self.model.infer_subnet](
                        inp_t=im_lq, idx_out=idx_out).clamp_(0., 1.)
            timer.record_inter()

            _msg = f'{im_name} | '
            for crit_name in self.crit_lst:
                crit_fn = self.crit_lst[crit_name]['fn']
                crit_unit = self.crit_lst[crit_name]['unit']
                perfm = crit_fn(torch.squeeze(im_out, 0), torch.squeeze(im_gt, 0))
                recorder_dict[crit_name].record(perfm)
                _msg += f'[{perfm:.3e}] {crit_unit:s} | '
            _msg = _msg[:-3]
            if if_return_each:
                msg += _msg + '\n'
            pbar.set_description(_msg)

        if if_iqa:
            timer_wo_iqam.record(time_wo_iqa)

        if img_save_folder is not None:  # save im
            im = tensor2im(torch.squeeze(im_out, 0))
            save_path = img_save_folder / (str(im_name) + '.png')
            cv2.imwrite(str(save_path), im)

        pbar.update()
        test_data = data_fetcher.next()
    pbar.close()

    if not if_tar_only:
        for crit_name in self.crit_lst:
            crit_unit = self.crit_lst[crit_name]['unit']
            crit_if_focus = self.crit_lst[crit_name]['if_focus']
            ave_perfm = recorder_dict[crit_name].get_ave()
            msg += f'{crit_name} | [{ave_perfm:.3e}] {crit_unit} | '
            write_dict_lst.append(dict(tag=f'{crit_name} (val)', scalar=ave_perfm))
            if crit_if_focus:
                report_dict = dict(ave_perfm=ave_perfm,
                                   lsb=self.crit_lst[crit_name]['fn'].lsb)

    ave_fps = 1. / timer.get_ave_inter()
    msg += f'ave. fps | [{ave_fps:.1f}]'

    if if_iqa:
        ave_time_wo_iqam = timer_wo_iqam.get_ave()
        fps_wo_iqam = 1. / ave_time_wo_iqam
        msg += f' | ave. fps wo. IQAM | [{fps_wo_iqam:.1f}]'

    if if_train:
        assert report_dict is not None
        return msg.rstrip(), write_dict_lst, report_dict
    else:
        return msg.rstrip()
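# A minimal sketch of the running-average Recorder used in test() above
# (no-argument construction, record() to add one value, get_ave() for the mean).
# The interface is inferred from the calls above; this is an assumed
# implementation, not the original one.
class Recorder:
    def __init__(self):
        self.values = []

    def record(self, value):
        # store one measurement (e.g. a per-image metric or a timing)
        self.values.append(float(value))

    def get_ave(self):
        # average over everything recorded so far
        return sum(self.values) / len(self.values) if self.values else 0.0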
model = getattr(model_def, args.arch)()
model.cuda()
train_loader, test_loader = data_loader(batch_size=args.batch_size,
                                        n_workers=args.workers,
                                        dataset=args.dataset)
optimizer = torch.optim.SGD(model.parameters(), args.lr,
                            momentum=args.momentum, nesterov=True)
prev_state = None
if args.resume:
    prev_state = torch.load('references/{}_checkpoint.th'.format(args.arch))
epoch_time = AverageMeter()
rec = Recorder()
all_start_time = time.time()
start_epoch = 0
if prev_state:
    print()
    model.load_state_dict(prev_state['model_state'])
    optimizer.load_state_dict(prev_state['optimizer_state'])
    epoch_time = prev_state['epoch_time']
    rec = prev_state['records']
    all_start_time -= prev_state['training_time']
    print('Overriding provided args with prev_state args: ', prev_state['args'])
    args = prev_state['args']
    start_epoch = prev_state['epoch']
scheduler = None
if args.scheduler == 'exponential':
def main(args):
    # Defines configuration dictionary and network architecture to use
    config = get_config(args.dataset, args.version)
    method = config['model']

    # Defines the loss function. A weight tensor enables class balancing and can be
    # obtained from the balance script.
    if config['balance'] and args.gpu and torch.cuda.is_available():
        criterion = nn.CrossEntropyLoss(weight=balance(config)).cuda()
    elif config['balance']:
        criterion = nn.CrossEntropyLoss(weight=balance(config))
    elif args.gpu and torch.cuda.is_available():
        criterion = nn.CrossEntropyLoss().cuda()
    else:
        criterion = nn.CrossEntropyLoss()

    # Maps configuration method to the network class defined in models.py
    try:
        if args.gpu and torch.cuda.is_available():
            model = model_mappings[method](K=config['n_class']).cuda()
        else:
            model = model_mappings[method](K=config['n_class'])
    except KeyError:
        print('%s model does not exist' % method)
        sys.exit(1)

    if args.mode == 'train':
        # Record training start time; total time is computed at the end of this branch
        start = time.time()

        # Directories for the trained network, training log, and training plot
        # respectively; create these directories in MatSeg if not already present.
        model_dir = './saved/%s_%s.pth' % (config['name'], method)
        log_dir = './log/%s_%s.log' % (config['name'], method)
        plot_dir = './plots/%s_%s.png' % (config['name'], method)

        # Obtains iterable datasets
        train_loader, validation_loader = get_dataloader(config)

        # Choice of optimizer; includes hard-coded hyperparameters
        if config['optimizer'] == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=config['lr'], weight_decay=5e-4)
        elif config['optimizer'] == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                                  momentum=0.9, weight_decay=5e-4)
        else:
            print('cannot find %s optimizer' % config['optimizer'])
            sys.exit(1)

        # Dynamic learning-rate reduction. Patience is the number of epochs without a
        # decrease in training loss after which the LR is reduced.
        scheduler = ReduceLROnPlateau(optimizer, patience=config['patience'])

        # Metrics tracked by the Recorder object; values come from the evaluate function
        recorder = Recorder(('loss_train', 'acc_train', 'loss_val', 'acc_val',
                             'mean_iou', 'class_precision', 'class_iou'))
        iou_val_max = 0

        # Iterate through the number of epochs
        for epoch in range(1, config['epoch'] + 1):
            gc.collect()
            print('Epoch %s:' % epoch)
            loss_train, acc_train = train(config, model, criterion, optimizer,
                                          train_loader, method=method, gpu=args.gpu)
            loss_val, acc_val, iou_val, class_precision, class_iou = evaluate(
                config, model, criterion, validation_loader, gpu=args.gpu, method=method)

            # Update learning rate scheduler based on training loss
            scheduler.step(loss_train)

            # Update metrics in the Recorder object for each epoch
            recorder.update((loss_train, acc_train, loss_val, acc_val,
                             iou_val, class_precision, class_iou))

            # Save model with higher mean IoU
            if iou_val > iou_val_max and args.save:
                torch.save(recorder.record, log_dir)
                torch.save(
                    {
                        'epoch': epoch,
                        'version': args.version,
                        'model_state_dict': model.state_dict(),
                    }, model_dir)
                print('validation iou improved from %.5f to %.5f. Model Saved.'
                      % (iou_val_max, iou_val))
                iou_val_max = iou_val

            # Stop training if the learning rate has been reduced three times, or
            # (commented out) if validation loss has not decreased for 20 epochs.
            # Otherwise, continue training.
            if (optimizer.param_groups[0]['lr'] / config['lr']) <= 1e-3:
                print('Learning Rate Reduced to 1e-3 of Original Value',
                      'Training Stopped', sep='\n')
                epochs = epoch
                break
            # elif all(recorder['loss_val'][-20:][i] <= recorder['loss_val'][-20:][i + 1]
            #          for i in range(19)):
            #     print('Loss has not decreased for previous 20 epochs',
            #           'Training Stopped', sep='\n')
            #     epochs = epoch
            #     break
            else:
                epochs = epoch
                continue

        # Obtain time after all epochs, compute total training time, print and plot results
        end = time.time()
        time_taken = end - start
        print(recorder.record)
        plotting(recorder.record, config, start, time_taken, plot_dir, epochs)

    elif args.mode == 'evaluate':
        # Load test data into an iterable dataset with no augmentation and verbose metrics
        test_dir = '%s/%s' % (config['root'], args.test_folder)
        test_set = Dataset(test_dir, config['size'],
                           *get_transform(config, is_train=False))
        test_loader = DataLoader(test_set, batch_size=1, shuffle=False,
                                 num_workers=0, drop_last=False)

        # Load the desired trained network from the saved directory
        model_dir = './saved/%s_%s.pth' % (config['name'], method)
        model.load_state_dict(torch.load(model_dir)['model_state_dict'])

        # Directories for predictions and overlays respectively; create them if necessary
        save_dir = '%s/predictions/%s_%s' % (test_dir, args.version, method)
        overlay_dir = '%s/overlays/%s_%s' % (test_dir, args.version, method)
        labels_dir = os.path.join(test_dir, 'labels_npy')
        if not os.path.isdir('%s/predictions' % test_dir):
            os.mkdir('%s/predictions' % test_dir)
        if not os.path.isdir(save_dir):
            os.mkdir(save_dir)

        evaluate(config, model, criterion, test_loader, gpu=args.gpu, method=method,
                 test_flag=True, save_dir=save_dir)

        # Creates overlays if specified on the command line
        if os.path.isdir(labels_dir) and args.overlay:
            if not os.path.isdir(overlay_dir):
                os.makedirs(overlay_dir)
            overlay(labels_dir, save_dir, overlay_dir, config['n_class'])
    else:
        print('%s mode does not exist' % args.mode)
def train_eval_save(car_id_list, dest_term, model_id, n_save_viz=0):
    """TRAIN and EVAL for the given cars and experimental settings."""
    # Load datasets
    path_trn, meta_trn, dest_trn, dt_trn, full_path_trn, \
        path_tst, meta_tst, dest_tst, dt_tst, full_path_tst = \
        unified_latest_seqdata(car_id_list, proportion_list, dest_term,
                               train_ratio=0.8, seq_len=FLAGS.seq_len,
                               data_dir=DATA_DIR)
    print('trn_data:', path_trn.shape, dest_trn.shape)
    print('tst_data:', path_tst.shape, dest_tst.shape)

    # Define model dir
    model_dir = os.path.join(MODEL_DIR, 'dest_type_%d' % dest_term, 'minibatch', model_id)
    model = Model(model_dir)
    FLAGS.train = FLAGS.train or model.latest_checkpoint is None

    # Build graph and initialize all variables
    model.build_graph()
    model.init_or_restore_all_variables(restart=FLAGS.restart)

    # TRAIN PART
    if FLAGS.train:
        # model.print_all_trainable_variables()
        model.train(path_trn, meta_trn, dest_trn)

    # TEST EVALUATION PART
    # FOR TARGETING CARS
    for car_id in car_id_list:
        # LOAD DATA
        path_trn, meta_trn, dest_trn, dt_trn, full_path_trn, \
            path_tst, meta_tst, dest_tst, dt_tst, full_path_tst = \
            unified_latest_seqdata([car_id], proportion_list, dest_term,
                                   train_ratio=0.8, seq_len=FLAGS.seq_len,
                                   data_dir=DATA_DIR)

        # dist_tst = model.eval_dist(path_tst, meta_tst, dest_tst)
        # recorder = Recorder('PATHWISE_' + RECORD_FNAME)
        # for i in tqdm(range(len(dist_tst))):
        #     recorder.append_values(
        #         ['car{:03}'.format(car_id) if isinstance(car_id, int) else 'car' + car_id,
        #          dt_tst[i], *meta_tst[i], dist_tst[i]])
        #     recorder.next_line()

        if FLAGS.record:
            log.info('save the results to %s', RECORD_FNAME)
            global_step = model.latest_step
            loss_trn = model.eval_mean_distance(path_trn, meta_trn, dest_trn)
            loss_tst = model.eval_mean_distance(path_tst, meta_tst, dest_tst)
            print('car_id:', car_id, 'trn_data:', path_trn.shape, dest_trn.shape, end='--')
            print(loss_trn, loss_tst)

            # SAVE THE RESULT INTO CSV
            recorder = Recorder(RECORD_FNAME)
            recorder.append_values(
                ['car{:03}'.format(car_id) if isinstance(car_id, int) else 'car' + car_id,
                 model_id, len(path_trn), len(path_tst), global_step, loss_trn, loss_tst])
            recorder.next_line()

        if n_save_viz > 0:
            # DEFINE PLOT AND GET PRED POINTS
            pred_tst = model.predict(path_tst, meta_tst)

            myplot = ResultPlot()
            myplot.add_point(path_trn, label=None, color='lightgray', marker='.',
                             s=10, alpha=1, must_contain=False)
            myplot.add_point(dest_trn, label=None, color='gray', marker='.',
                             s=10, alpha=1, must_contain=False)

            # PLOT ALL TEST ERRORS
            for i in range(pred_tst.shape[0]):
                difference = np.stack([dest_tst[i], pred_tst[i]], axis=0)
                myplot.add_tmp_path(difference, label=None, color='lightblue',
                                    marker=None, must_contain=True)
                myplot.add_tmp_point(dest_tst[i], label=None, color='mediumblue',
                                     marker='*', s=100, alpha=1, must_contain=True)
                myplot.add_tmp_point(pred_tst[i], label=None, color='crimson',
                                     marker='*', s=100, alpha=1, must_contain=True)
            dist_km = dist(dest_tst, pred_tst, to_km=True)

            # Define details to save plot
            save_dir = os.path.join(VIZ_DIR, 'path_and_prediction',
                                    'dest_term_%d' % dest_term, 'car_%03d' % car_id)
            fname = model_id + '.png'
            title = '{fname}\ndist={dist_km}km'
            title = title.format(fname=fname,
                                 dist_km='N/A' if dist_km is None else '%.1f' % dist_km)
            myplot.draw_and_save(title, save_dir, fname)

            # FOR EACH TRIP
            for i in range(n_save_viz):
                myplot.add_tmp_path(full_path_tst[i], label=None, color='lightblue',
                                    marker='.', must_contain=True)
                myplot.add_tmp_path(path_tst[i], label='input_path', color='mediumblue',
                                    marker='.', must_contain=True)
                dest_true, dest_pred = dest_tst[i], pred_tst[i]
                myplot.add_tmp_point(dest_true, label='true_destination', color='mediumblue',
                                     marker='*', s=100, alpha=1, must_contain=True)
                myplot.add_tmp_point(dest_pred, label='pred_destination', color='crimson',
                                     marker='*', s=100, alpha=1, must_contain=True)

                start_time = convert_time_for_fname(dt_tst[i])
                dist_km = dist(dest_pred, dest_true, to_km=True)

                # Define details to save plot
                save_dir = os.path.join(VIZ_DIR, 'path_and_prediction',
                                        'dest_term_%d' % dest_term, 'car_%03d' % car_id,
                                        'start_%s' % start_time)
                fname = model_id + '.png'
                title = '{datetime}\n{fname}\ndist={dist_km}km'
                title = title.format(fname=fname, datetime=start_time,
                                     dist_km='N/A' if dist_km is None else '%.1f' % dist_km)
                myplot.draw_and_save(title, save_dir, fname)

    # Close tf session to release GPU memory
    model.close_session()