def test(args):
    """Evaluate the newest checkpoint for ``args.tag`` over 10 game episodes.

    Loads the lexicographically-last ``.pth`` file under ``ckpt/<tag>/``,
    plays 10 episodes of ``args.game`` greedily (argmax over the network
    output), records every frame to an AVI file, and prints per-episode and
    total scores.

    Args:
        args: namespace with at least ``tag`` (checkpoint directory name)
            and ``game`` (one of ``"flappy"`` / ``"pixelcopter"``).
    """
    # Checkpoints sort lexicographically; the last one is the newest.
    model_path = sorted(glob(os.path.join('ckpt', args.tag, '*.pth')))[-1]
    model = torch.load(model_path, map_location='cpu').eval()
    print('Loaded model: {}'.format(model_path))
    model_name = os.path.basename(os.path.splitext(model_path)[0])

    # Initialize video writer sized to the selected game's screen.
    video_filename = 'output_{}_{}.avi'.format(args.tag, model_name)
    dict_screen_shape = {"flappy": (288, 512), "pixelcopter": (48, 48)}
    out = Recorder(video_filename=video_filename,
                   fps=30,
                   width=dict_screen_shape[args.game][0],
                   height=dict_screen_shape[args.game][1])

    score_list = []
    time_list = []
    game = Game(game=args.game)
    for _ in range(10):
        terminal = game.game_over()
        start = time.time()
        score = 0
        image_data = game.get_torch_image()
        # The state is a stack of the 4 most recent frames; bootstrap it by
        # repeating the first frame.
        state = torch.cat(
            (image_data, image_data, image_data, image_data)).unsqueeze(0)
        # Inference only — disable autograd bookkeeping.
        with torch.no_grad():
            while not terminal:
                output = model(state)[0]
                # Greedy policy: always take the highest-scoring action.
                action_index = torch.argmax(output)
                score += game.act(action_index)
                terminal = game.game_over()
                image_data_1 = game.get_torch_image()
                # Slide the frame window: drop oldest, append newest.
                state = torch.cat(
                    (state.squeeze(0)[1:, :, :], image_data_1)).unsqueeze(0)
                out.write(game.get_image())
        game.reset_game()
        score_list.append(score)
        time_list.append(time.time() - start)
        print('Game Ended!')
        print('Score: {} !'.format(score))

    # Add summary
    out.write_score(sum(score_list), sum(time_list))
    out.save()
    print('Total Score: {}'.format(sum(score_list)))
    print('Total Run Time: {:.3f}'.format(sum(time_list)))
    print('Saved video: {}'.format(video_filename))
def train(args):
    """Train a UNet-VGG16 segmentation model and keep the best checkpoint.

    Runs ``args.n_epochs`` epochs of train/validation, logs per-epoch
    metrics through a ``Recorder``, and hands the selection metric (mean
    IoU, or a single class IoU when ``args.metric`` is ``"IoU_<k>"``) to a
    ``ModelSaver`` that tracks the best epoch.
    """
    Arguments.save_args(args, args.args_path)
    train_loader, val_loader, _ = get_dataloaders(args)

    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    opt = get_optimizer(args.optimizer, model)
    scheduler = LRScheduler(args.lr_scheduler, opt)
    loss_fn = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)

    model_saver = ModelSaver(args.model_path)
    history = Recorder(['train_miou', 'train_acc', 'train_loss',
                        'val_miou', 'val_acc', 'val_loss'])

    for epoch in range(args.n_epochs):
        print(f"{args.experim_name} Epoch {epoch+1}:")

        # One full pass over the training set.
        train_loss, train_acc, train_miou, train_ious = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=opt,
            lr_scheduler=scheduler,
            criterion=loss_fn,
            device=args.device,
        )
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")

        # Validation pass.
        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=loss_fn,
            device=args.device,
        )
        val_miou = val_scores['mIoU']
        val_ious = val_scores['IoUs']
        val_acc = val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")

        # Append this epoch's metrics and persist the running log.
        history.update([train_miou, train_acc, train_loss,
                        val_miou, val_acc, val_loss])
        history.save(args.record_path)

        # Selection metric: per-class IoU for "IoU_<k>", otherwise mean IoU.
        if args.metric.startswith("IoU"):
            class_index = int(args.metric.split('_')[1])
            metric = val_ious[class_index]
        else:
            metric = val_miou
        model_saver.save_models(metric, epoch+1, model,
                                ious={'train': train_ious, 'val': val_ious})

    print(f"best model at epoch {model_saver.best_epoch} with miou {model_saver.best_score:.5f}")
def run(rank, args):
    """Training worker for one process (one GPU) of a ranking model.

    Seeds all RNGs, builds the dataset/dataloader pair (optionally with
    DistributedSampler when more than one GPU id is configured), wraps the
    model in DistributedDataParallel for multi-GPU runs, and trains with
    gradient accumulation, an inverse-sqrt warmup LR schedule, periodic
    loss reporting, and periodic validation + checkpointing. Only rank 0
    (the master) owns the Recorder and writes logs/checkpoints.

    Args:
        rank: index of this worker within ``args.gpuid``.
        args: parsed command-line namespace (see fields referenced below).
    """
    base_setting(args)
    # Seed every RNG source so runs are reproducible across workers.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    gpuid = args.gpuid[rank]
    is_master = rank == 0
    is_mp = len(args.gpuid) > 1
    world_size = len(args.gpuid)
    # Only the master process logs and saves checkpoints.
    if is_master:
        recorder = Recorder(args.log)
    tok = BertTokenizer.from_pretrained(args.model_type)
    # Two dataset flavors share the same downstream interface; the
    # "_ids" variants presumably consume pre-tokenized ids — confirm.
    if args.use_ids:
        collate_fn = partial(collate_mp_ids,
                             pad_token_id=tok.pad_token_id,
                             is_test=False)
        collate_fn_val = partial(collate_mp_ids,
                                 pad_token_id=tok.pad_token_id,
                                 is_test=True)
        train_set = RefactoringIDsDataset(
            f"./{args.dataset}/{args.datatype}/train", args.model_type,
            maxlen=args.max_len, max_num=args.max_num)
        val_set = RefactoringIDsDataset(
            f"./{args.dataset}/{args.datatype}/val", args.model_type,
            is_test=True, maxlen=512, is_sorted=False)
    else:
        collate_fn = partial(collate_mp,
                             pad_token_id=tok.pad_token_id,
                             is_test=False)
        collate_fn_val = partial(collate_mp,
                                 pad_token_id=tok.pad_token_id,
                                 is_test=True)
        train_set = RefactoringDataset(
            f"./{args.dataset}/{args.datatype}/train", args.model_type,
            maxlen=args.max_len, maxnum=args.max_num)
        val_set = RefactoringDataset(
            f"./{args.dataset}/{args.datatype}/val", args.model_type,
            is_test=True, maxlen=512, is_sorted=False,
            maxnum=args.max_num)
    if is_mp:
        # Distributed: each rank sees its own shard; the sampler shuffles,
        # so the DataLoader itself must not (shuffle=False).
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_set, num_replicas=world_size, rank=rank, shuffle=True)
        dataloader = DataLoader(train_set, batch_size=args.batch_size,
                                shuffle=False, num_workers=4,
                                collate_fn=collate_fn, sampler=train_sampler)
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_set, num_replicas=world_size, rank=rank)
        val_dataloader = DataLoader(val_set, batch_size=8, shuffle=False,
                                    num_workers=4,
                                    collate_fn=collate_fn_val,
                                    sampler=val_sampler)
    else:
        dataloader = DataLoader(train_set, batch_size=args.batch_size,
                                shuffle=True, num_workers=4,
                                collate_fn=collate_fn)
        val_dataloader = DataLoader(val_set, batch_size=8, shuffle=False,
                                    num_workers=4,
                                    collate_fn=collate_fn_val)
    # build models
    model_path = args.pretrained if args.pretrained is not None else args.model_type
    model = Refactor(model_path, num_layers=args.num_layers)
    # Optionally resume weights from a previous run, mapped onto this GPU.
    if args.model_pt is not None:
        model.load_state_dict(
            torch.load(args.model_pt, map_location=f'cuda:{gpuid}'))
    if args.cuda:
        if len(args.gpuid) == 1:
            model = model.cuda()
        else:
            dist.init_process_group("nccl", rank=rank, world_size=world_size)
            model = nn.parallel.DistributedDataParallel(
                model.to(gpuid), [gpuid], find_unused_parameters=True)
    model.train()
    # LR starts at max_lr / warmup_steps and is rescheduled every
    # optimizer step (inverse-sqrt schedule with linear warmup, below).
    init_lr = args.max_lr / args.warmup_steps
    optimizer = optim.Adam(model.parameters(), lr=init_lr)
    if is_master:
        recorder.write_config(args, [model], __file__)
    # Best validation loss seen so far; 100 acts as "infinity" sentinel.
    minimum_loss = 100
    all_step_cnt = 0  # optimizer steps taken across all epochs
    # start training
    for epoch in range(args.epoch):
        optimizer.zero_grad()
        step_cnt = 0   # micro-batches accumulated toward the next step
        steps = 0      # optimizer steps taken this epoch
        avg_loss = 0
        for (i, batch) in enumerate(dataloader):
            if args.cuda:
                to_cuda(batch, gpuid)
            step_cnt += 1
            output = model(batch["src_input_ids"], batch["candidate_ids"],
                           batch["tgt_input_ids"])
            similarity, gold_similarity = output['score'], output[
                'summary_score']
            loss = args.scale * RankingLoss(similarity, gold_similarity,
                                            args.margin, args.gold_margin,
                                            args.gold_weight,
                                            no_gold=args.no_gold)
            # Scale down so accumulated gradients average over the
            # accumulation window.
            loss = loss / args.accumulate_step
            avg_loss += loss.item()
            loss.backward()
            if step_cnt == args.accumulate_step:
                # One real optimizer step after accumulate_step micro-batches.
                if args.grad_norm > 0:
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.grad_norm)
                step_cnt = 0
                steps += 1
                all_step_cnt += 1
                # Inverse-sqrt decay with warmup: linear ramp for the first
                # warmup_steps steps, then ~ max_lr / sqrt(step).
                lr = args.max_lr * min(
                    all_step_cnt**(-0.5),
                    all_step_cnt * (args.warmup_steps**(-1.5)))
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                optimizer.step()
                optimizer.zero_grad()
            # Report only right after an optimizer step (step_cnt == 0) and
            # only on the master rank.
            if steps % args.report_freq == 0 and step_cnt == 0 and is_master:
                recorder.print("epoch: %d, batch: %d, avg loss: %.6f" %
                               (epoch + 1, steps,
                                avg_loss / args.report_freq))
                recorder.print(f"learning rate: {lr:.6f}")
                recorder.plot("loss",
                              {"loss": avg_loss / args.report_freq},
                              all_step_cnt)
                recorder.print()
                avg_loss = 0
            # Free the graph-holding tensors before the next iteration.
            del similarity, gold_similarity, loss
            # Periodic validation; again only on optimizer-step boundaries.
            if all_step_cnt % args.test_freq == 0 and all_step_cnt != 0 and step_cnt == 0:
                loss = test(val_dataloader, model, args, gpuid)
                if loss < minimum_loss and is_master:
                    minimum_loss = loss
                    # Unwrap the DDP container before saving in multi-GPU mode.
                    if is_mp:
                        recorder.save(model.module, "model.bin")
                    else:
                        recorder.save(model, "model.bin")
                    recorder.save(optimizer, "optimizer.bin")
                    recorder.print("best - epoch: %d, batch: %d" %
                                   (epoch + 1,
                                    i / args.accumulate_step + 1))
                if is_master:
                    # Always keep a "current" checkpoint alongside the best.
                    if is_mp:
                        recorder.save(model.module, "model_cur.bin")
                    else:
                        recorder.save(model, "model_cur.bin")
                    recorder.save(optimizer, "optimizer_cur.bin")
                    recorder.print("val score: %.6f" % (1 - loss))