def main():
    args = parse_args()
    make_output_dir(args)
    config_for_multi_gpu(args)
    set_seed(args)
    with Timer('load input'):
        train_data_loader, dev_data_loader, test_data_loader = load_data_for_nlu_task(
            args, train=True, dev=True, test=False)
    print(f'train batch size: {args.train_batch_size}')
    print(f'train data batch num: {len(train_data_loader)}')
    # Run dev evaluation twice per epoch:
    args.eval_interval = len(train_data_loader) // 2
    print(f'eval interval: {args.eval_interval}')
    # Note: this parameter affects the learning-rate warm-up
    args.max_train_steps = len(train_data_loader) * args.max_train_epochs
    print(f'max steps: {args.max_train_steps}')
    if not args.early_stop:
        print(
            f'do not use early stop, training will last {args.max_train_epochs} epochs'
        )

    with Timer('load trainer'):
        trainer = load_trainer(args)
    with Timer('Train'):
        trainer.train(train_data_loader, dev_data_loader)
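The `Timer` context manager used above is project-specific and not shown; a minimal sketch of what it is assumed to do (time a named block and print the elapsed wall-clock seconds):

import time

class Timer:
    """Hypothetical sketch of the Timer helper assumed by the snippet."""

    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Report elapsed time for the named block.
        print(f'{self.name} took {time.time() - self.start:.2f}s')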
Example #2
def main():
    """None->None
    
    Main entrance of the script.
    
    Init the script,
    Parse arguments,
    Call the right action.
    """
    
    ## parse arguments
    args = parse_args()
    
    # Placeholder client objects: do_init / load_session are expected
    # to build the real authenticated client from these.
    http_provider = onedrivesdk.HttpProvider()
    auth_provider = onedrivesdk.AuthProvider
    
    client = onedrivesdk.OneDriveClient

    ## Call action
    # Init
    if args.mode == 'init':
        client = do_init(client, args)

        # We assume that the init is successful
        print('Logged in, saving information...')

        save_session(client, path = args.conf)
        return

    ## Load session
    # If the mode is not init, there has to be a working session
    # located at the conf path. 
    client = load_session(client, path = args.conf)
    
    # get
    if args.mode == 'get':
        do_get(client, args)

    elif args.mode == 'list':
        do_list(client, args)

    elif args.mode == 'put':
        do_put(client, args)

    elif args.mode == 'delete':
        do_delete(client, args)

    elif args.mode == 'mkdir':
        do_mkdir(client, args)

    elif args.mode == 'move':
        do_move(client, args)

    elif args.mode == 'remote':
        do_remote(client, args)

    elif args.mode == 'quota':
        do_quota(client, args)
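The if/elif chain above can also be written as a dispatch table; a sketch using the same do_* handlers from the snippet (assuming each takes (client, args)):

# Equivalent lookup-table dispatch for the non-init modes (sketch).
actions = {
    'get': do_get,
    'list': do_list,
    'put': do_put,
    'delete': do_delete,
    'mkdir': do_mkdir,
    'move': do_move,
    'remote': do_remote,
    'quota': do_quota,
}
action = actions.get(args.mode)
if action is not None:
    action(client, args)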
Example #3
def main():
    with Timer('parse args'):
        args = parse_args()
    # Add configuration parameters for multi-GPU runs.
    # BERT training should run on multiple GPUs; a single GPU is very slow.
    config_for_multi_gpu(args)
    # set_seed must be called after n_gpu has been set.
    set_seed(args)

    if args.run_mode == 'train':
        train(args)
    elif args.run_mode == 'dev':
        dev(args)
    elif args.run_mode == 'inference':
        inference(args)
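set_seed is not shown in these snippets; the ordering comment ("after n_gpu is set") matches the common multi-GPU seeding pattern, sketched here under the assumption that args carries seed and n_gpu:

import random

import numpy as np
import torch

def set_seed(args):
    # Seed Python, NumPy, and PyTorch; seed all GPUs when n_gpu > 0,
    # which is why n_gpu must be configured first.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)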
Example #4
def main():
    # parse args
    args = parse_args()

    # build data_loader
    file_path = args.file_path
    data_loader = build_train_loader(file_path)

    device = torch.device("cpu")
    model = build_model().to(device)

    optimizer = build_optimizer(model, lr=args.lr)
    lr_milestones = [len(data_loader) * m for m in args.lr_milestones]
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=lr_milestones, gamma=args.lr_gamma)

    def save_model_checkpoint():
        if args.output_dir:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args
            }
            torch.save(
                checkpoint,
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
            torch.save(checkpoint,
                       os.path.join(args.output_dir, 'checkpoint.pth'))

    print("Start training")
    start_time = time.time()
    for epoch in range(args.epochs):
        train_one_epoch(model,
                        optimizer,
                        lr_scheduler,
                        data_loader,
                        epoch,
                        args.print_freq,
                        checkpoint_fn=save_model_checkpoint)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
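train_one_epoch is not included; since the MultiStepLR milestones are scaled by len(data_loader), the scheduler is evidently stepped once per iteration. A minimal sketch consistent with the call site (the loss computation is a placeholder assumption):

def train_one_epoch(model, optimizer, lr_scheduler, data_loader, epoch,
                    print_freq, checkpoint_fn=None):
    """Hypothetical sketch matching the call above."""
    model.train()
    for step, (inputs, targets) in enumerate(data_loader):
        loss = model(inputs, targets)  # assumes the model returns its loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # per-iteration step: milestones are in iterations
        if step % print_freq == 0:
            print(f'epoch {epoch} step {step} loss {loss.item():.4f}')
    if checkpoint_fn is not None:
        checkpoint_fn()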
Example #5
def main():
    with Timer('parse args'):
        args = parse_args()
    # Add configuration parameters for multi-GPU runs.
    # Setup CUDA, GPU & distributed training
    config_for_multi_gpu(args)
    # set_seed must be called after n_gpu has been set.
    set_seed(args)
    # Create the output directory to store run results, config files, and model weights.
    if args.run_mode == 'train' and args.local_rank in [-1, 0]:
        make_output_dir(args)

    if args.run_mode == 'train':
        train(args)
    elif args.run_mode == 'dev':
        dev(args)
    elif args.run_mode == 'inference':
        inference(args)
Example #6
def main():
    with Timer('Parse args'):
        # Load argument settings.
        configs = parse_args()
    # Add configuration parameters for multi-GPU runs; set up CUDA, GPU & distributed training.
    setup_for_multi_gpu(configs)
    # set_seed must be called after n_gpu has been set.
    set_seed(configs)
    # In train mode, create the output directory to store run results, config files, model weights, etc.
    if configs.command == 'train' and configs.local_rank in [-1, 0]:
        setup_output_dir(configs)

    if configs.command == 'train':
        train(configs)
    elif configs.command == 'dev':
        dev(configs)
    # Supports testing on the test set immediately after training finishes.
    elif configs.command == 'inference':
        inference(configs)
Example #7
from utils.arguments import parse_args
from utils.data import load_dna_data_gan
from utils.gan_model import load_deep_signal_gan_model, load_basic_gan_model, load_dc_gan_model
from utils.train import pre_train, train

import numpy as np
import tensorflow as tf

args = parse_args()

np.random.seed(args.seed)
tf.compat.v1.set_random_seed(args.seed)

x_train, x_test, y_test, x_val, y_val = load_dna_data_gan(args)
generator, discriminator, GAN = load_dc_gan_model(args)
pre_train(args, generator, discriminator, x_train)
results = train(args, generator, discriminator, GAN, x_train, x_test, y_test,
                x_val, y_val)
Example #8
def main():

    args = arguments.parse_args()
    LOGGER = ConsoleLogger('Finetune', 'train')
    logdir = LOGGER.getLogFolder()
    LOGGER.info(args)
    LOGGER.info(config)

    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # ------------------- Data loader -------------------

    data_transform = transforms.Compose([
        trsf.ImageTrsf(),  # normalize
        trsf.Joints3DTrsf(),  # centerize
        trsf.ToTensor()
    ])  # to tensor

    train_data = Mocap(config.dataset.train,
                       SetType.TRAIN,
                       transform=data_transform)
    train_data_loader = DataLoader(train_data,
                                   batch_size=args.batch_size,
                                   shuffle=config.data_loader.shuffle,
                                   num_workers=8)

    # val_data = Mocap(
    #     config.dataset.val,
    #     SetType.VAL,
    #     transform=data_transform)
    # val_data_loader = DataLoader(
    #     val_data,
    #     batch_size=2,
    #     shuffle=config.data_loader.shuffle,
    #     num_workers=8)

    test_data = Mocap(config.dataset.test,
                      SetType.TEST,
                      transform=data_transform)
    test_data_loader = DataLoader(test_data,
                                  batch_size=2,
                                  shuffle=config.data_loader.shuffle,
                                  num_workers=8)

    # ------------------- Model -------------------
    with open('model/model.yaml') as fin:
        model_cfg = edict(yaml.safe_load(fin))
    resnet = pose_resnet.get_pose_net(model_cfg, True)
    Loss2D = HeatmapLoss()  # same as MSELoss()
    autoencoder = encoder_decoder.AutoEncoder(args.batch_norm,
                                              args.denis_activation)
    # LossHeatmapRecon = HeatmapLoss()
    LossHeatmapRecon = HeatmapLossSquare()
    # Loss3D = nn.MSELoss()
    Loss3D = PoseLoss()
    LossLimb = LimbLoss()

    if torch.cuda.is_available():
        device = torch.device(f"cuda:{args.gpu}")
        resnet = resnet.cuda(device)
        Loss2D = Loss2D.cuda(device)
        autoencoder = autoencoder.cuda(device)
        LossHeatmapRecon.cuda(device)
        Loss3D.cuda(device)
        LossLimb.cuda(device)

    # ------------------- optimizer -------------------
    if args.freeze_2d_model:
        optimizer = optim.Adam(autoencoder.parameters(), lr=args.learning_rate)
    else:
        optimizer = optim.Adam(itertools.chain(resnet.parameters(),
                                               autoencoder.parameters()),
                               lr=args.learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=args.step_size,
                                          gamma=0.1)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    # ------------------- load model -------------------
    if args.load_model:
        if not os.path.isfile(args.load_model):
            raise ValueError(f"No checkpoint found at {args.load_model}")
        checkpoint = torch.load(args.load_model)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        resnet.load_state_dict(checkpoint['resnet_state_dict'])
        autoencoder.load_state_dict(checkpoint['autoencoder_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    if args.load_2d_model:
        if not os.path.isfile(args.load_2d_model):
            raise ValueError(f"No checkpoint found at {args.load_2d_model}")
        checkpoint = torch.load(args.load_2d_model, map_location=device)
        resnet.load_state_dict(checkpoint['resnet_state_dict'])

    if args.load_3d_model:
        if not os.path.isfile(args.load_3d_model):
            raise ValueError(f"No checkpoint found at {args.load_3d_model}")
        checkpoint = torch.load(args.load_3d_model, map_location=device)
        autoencoder.load_state_dict(checkpoint['autoencoder_state_dict'])

    # ------------------- tensorboard -------------------
    train_global_steps = 0
    writer_dict = {
        'writer': SummaryWriter(log_dir=logdir),
        'train_global_steps': train_global_steps
    }

    best_perf = float('inf')
    best_model = False
    # ------------------- run the model -------------------
    for epoch in range(args.epochs):
        with torch.autograd.set_detect_anomaly(True):
            LOGGER.info(f'====Training epoch {epoch}====')
            losses = AverageMeter()
            batch_time = AverageMeter()

            # ------------------- Evaluation -------------------
            eval_body = evaluate.EvalBody()
            eval_upper = evaluate.EvalUpperBody()
            eval_lower = evaluate.EvalLowerBody()

            resnet.train()
            autoencoder.train()

            end = time.time()
            for it, (img, p2d, p3d, heatmap,
                     action) in enumerate(train_data_loader, 0):

                img = img.to(device)
                p3d = p3d.to(device)
                heatmap = heatmap.to(device)

                heatmap2d_hat = resnet(img)  # torch.Size([16, 15, 48, 48])
                p3d_hat, heatmap2d_recon = autoencoder(heatmap2d_hat)

                loss2d = Loss2D(heatmap2d_hat, heatmap).mean()
                loss_recon = LossHeatmapRecon(heatmap2d_recon,
                                              heatmap2d_hat).mean()
                loss_3d = Loss3D(p3d_hat, p3d).mean()
                loss_cos, loss_len = LossLimb(p3d_hat, p3d)
                loss_cos = loss_cos.mean()
                loss_len = loss_len.mean()

                loss = (args.lambda_2d * loss2d
                        + args.lambda_recon * loss_recon
                        + args.lambda_3d * loss_3d
                        - args.lambda_cos * loss_cos
                        + args.lambda_len * loss_len)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_time.update(time.time() - end)
                losses.update(loss.item(), img.size(0))

                if it % config.train.PRINT_FREQ == 0:
                    # logging messages
                    msg = 'Epoch: [{0}][{1}/{2}]\t' \
                          'Batch Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
                          'Speed {speed:.1f} samples/s\t' \
                          'Loss {loss.val:.5f} ({loss.avg:.5f})\t'.format(
                        epoch, it, len(train_data_loader), batch_time=batch_time,
                        speed=img.size(0) / batch_time.val,  # averaged within batch
                        loss=losses)
                    LOGGER.info(msg)

                    writer = writer_dict['writer']
                    global_steps = writer_dict['train_global_steps']
                    # add_scalar expects a scalar, so log the first
                    # parameter group's learning rate.
                    lr = scheduler.optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', lr, global_steps)
                    writer.add_scalar('train_loss', losses.val, global_steps)
                    writer.add_scalar('batch_time', batch_time.val,
                                      global_steps)
                    writer.add_scalar('losses/loss_2d', loss2d, global_steps)
                    writer.add_scalar('losses/loss_recon', loss_recon,
                                      global_steps)
                    writer.add_scalar('losses/loss_3d', loss_3d, global_steps)
                    writer.add_scalar('losses/loss_cos', loss_cos,
                                      global_steps)
                    writer.add_scalar('losses/loss_len', loss_len,
                                      global_steps)
                    image_grid = draw2Dpred_and_gt(img, heatmap2d_hat,
                                                   (368, 368))
                    writer.add_image('predicted_heatmaps', image_grid,
                                     global_steps)
                    image_grid_recon = draw2Dpred_and_gt(
                        img, heatmap2d_recon, (368, 368))
                    writer.add_image('reconstructed_heatmaps',
                                     image_grid_recon, global_steps)
                    writer_dict['train_global_steps'] = global_steps + 1

                    # ------------------- evaluation on training data -------------------

                    # Evaluate results using different evaluation metrices
                    y_output = p3d_hat.data.cpu().numpy()
                    y_target = p3d.data.cpu().numpy()

                    eval_body.eval(y_output, y_target, action)
                    eval_upper.eval(y_output, y_target, action)
                    eval_lower.eval(y_output, y_target, action)

                end = time.time()

            # ------------------- Save results -------------------
            checkpoint_dir = os.path.join(logdir, 'checkpoints')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            LOGGER.info('=> saving checkpoint to {}'.format(checkpoint_dir))
            states = dict()
            states['resnet_state_dict'] = resnet.state_dict()
            states['autoencoder_state_dict'] = autoencoder.state_dict()
            states['optimizer_state_dict'] = optimizer.state_dict()
            states['scheduler'] = scheduler.state_dict()

            torch.save(states,
                       os.path.join(checkpoint_dir, f'checkpoint_{epoch}.tar'))

            res = {
                'FullBody': eval_body.get_results(),
                'UpperBody': eval_upper.get_results(),
                'LowerBody': eval_lower.get_results()
            }

            LOGGER.info('===========Evaluation on Train data==========')
            LOGGER.info(pprint.pformat(res))

            # utils_io.write_json(config.eval.output_file, res)

            # ------------------- validation -------------------
            resnet.eval()
            autoencoder.eval()
            val_loss = validate(LOGGER, test_data_loader, resnet, autoencoder,
                                device, epoch)
            if val_loss < best_perf:
                best_perf = val_loss
                best_model = True

            if best_model:
                shutil.copyfile(
                    os.path.join(checkpoint_dir, f'checkpoint_{epoch}.tar'),
                    os.path.join(checkpoint_dir, 'model_best.tar'))
                best_model = False

            # scheduler.step(val_loss)
            scheduler.step()
    LOGGER.info('Done.')
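AverageMeter (used here for losses and batch_time) follows a widespread pattern; a sketch of what the snippet assumes:

class AverageMeter:
    """Sketch: tracks the latest value and a running weighted average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # n is the batch size, so avg is a per-sample average.
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count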
Example #9
def main():
    """Main"""

    args = arguments.parse_args()
    LOGGER = ConsoleLogger(args.training_type, 'train')
    logdir = LOGGER.getLogFolder()
    LOGGER.info(args)
    LOGGER.info(config)


    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # ------------------- Data loader -------------------

    data_transform = transforms.Compose([
        trsf.ImageTrsf(),  # normalize
        trsf.Joints3DTrsf(),  # centerize
        trsf.ToTensor()])  # to tensor

    # training data
    train_data = Mocap(
        config.dataset.train,
        SetType.TRAIN,
        transform=data_transform)
    train_data_loader = DataLoader(
        train_data,
        batch_size=args.batch_size,
        shuffle=config.data_loader.shuffle,
        num_workers=8)

    val_data = Mocap(
        config.dataset.val,
        SetType.VAL,
        transform=data_transform)
    val_data_loader = DataLoader(
        val_data,
        batch_size=2,
        shuffle=config.data_loader.shuffle,
        num_workers=8)

    # ------------------- Model -------------------
    if args.training_type != 'Train3d':
        with open('model/model.yaml') as fin:
            model_cfg = edict(yaml.safe_load(fin))
        resnet = pose_resnet.get_pose_net(model_cfg, True)
        Loss2D = HeatmapLoss()  # same as MSELoss()
        # LossMSE = nn.MSELoss()
    if args.training_type != 'Train2d':
        autoencoder = encoder_decoder.AutoEncoder()

    if torch.cuda.is_available():
        device = torch.device(f"cuda:{args.gpu}")
        if args.training_type != 'Train3d':
            resnet = resnet.cuda(device)
            Loss2D = Loss2D.cuda(device)
        if args.training_type != 'Train2d':
            autoencoder = autoencoder.cuda(device)

    # ------------------- optimizer -------------------
    if args.training_type == 'Train2d':
        optimizer = optim.Adam(resnet.parameters(), lr=args.learning_rate)
    if args.training_type == 'Train3d':
        optimizer = optim.Adam(autoencoder.parameters(),
                               lr=config.train.learning_rate)
    # Joint optimizer over both models when fine-tuning.
    if args.training_type == 'Finetune':
        optimizer = optim.Adam(itertools.chain(resnet.parameters(),
                                               autoencoder.parameters()),
                               lr=config.train.learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=args.step_size,
                                          gamma=0.1)

    # ------------------- load model -------------------
    if args.load_model:
        if not os.path.isfile(args.load_model):
            raise ValueError(f"No checkpoint found at {args.load_model}")
        checkpoint = torch.load(args.load_model)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if args.training_type != 'Train3d':
            resnet.load_state_dict(checkpoint['resnet_state_dict'])
        if args.training_type != 'Train2d':
            autoencoder.load_state_dict(checkpoint['autoencoder_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler'])


    # ------------------- tensorboard -------------------
    train_global_steps = 0
    writer_dict = {
        'writer': SummaryWriter(log_dir=logdir),
        'train_global_steps': train_global_steps
    }

    # ------------------- Evaluation -------------------
    if args.training_type != 'Train2d':
        eval_body = evaluate.EvalBody()
        eval_upper = evaluate.EvalUpperBody()
        eval_lower = evaluate.EvalLowerBody()


    best_perf = float('inf')
    best_model = False
    # ------------------- run the model -------------------
    for epoch in range(args.epochs):
        with torch.autograd.set_detect_anomaly(True):
            LOGGER.info(f'====Training epoch {epoch}====')
            losses = AverageMeter()
            batch_time = AverageMeter()

            if args.training_type != 'Train3d':
                resnet.train()
            if args.training_type != 'Train2d':
                autoencoder.train()

            end = time.time()
            for it, (img, p2d, p3d, heatmap, action) in enumerate(train_data_loader, 0):

                img = img.to(device)
                p2d = p2d.to(device)
                p3d = p3d.to(device)
                heatmap = heatmap.to(device)

                if args.training_type != 'Train3d':
                    heatmap2d_hat = resnet(img)  # torch.Size([16, 15, 48, 48])
                else:
                    heatmap2d_hat = heatmap
                if args.training_type != 'Train2d':
                    p3d_hat, heatmap2d_recon = autoencoder(heatmap2d_hat)

                if args.training_type != 'Train3d':
                    loss2d = Loss2D(heatmap, heatmap2d_hat).mean()
                    # loss2d = LossMSE(heatmap, heatmap2d_hat)

                if args.training_type == 'Train2d':
                    loss = loss2d
                elif args.training_type == 'Train3d':
                    # No 3D loss is constructed in this snippet; 'Train3d'
                    # would need its own loss assigned here.
                    pass

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_time.update(time.time() - end)
                losses.update(loss.item(), img.size(0))

                if it % config.train.PRINT_FREQ == 0:
                    # logging messages
                    msg = 'Epoch: [{0}][{1}/{2}]\t' \
                          'Batch Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
                          'Speed {speed:.1f} samples/s\t' \
                          'Loss {loss.val:.5f} ({loss.avg:.5f})\t'.format(
                        epoch, it, len(train_data_loader), batch_time=batch_time,
                        speed=img.size(0) / batch_time.val,  # averaged within batch
                        loss=losses)
                    LOGGER.info(msg)
                end = time.time()
            scheduler.step()

            # ------------------- validation -------------------

            if args.training_type != 'Train3d':
                resnet.eval()
            if args.training_type != 'Train2d':
                autoencoder.eval()

            if args.training_type != 'Train2d':
                # Evaluate results using different evaluation metrices
                y_output = p3d_hat.data.cpu().numpy()
                y_target = p3d.data.cpu().numpy()

                eval_body.eval(y_output, y_target, action)
                eval_upper.eval(y_output, y_target, action)
                eval_lower.eval(y_output, y_target, action)


            # ------------------- Save results -------------------
            checkpoint_dir = os.path.join(logdir, 'checkpoints')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            LOGGER.info('=> saving checkpoint to {}'.format(checkpoint_dir))
            states = dict()
            if args.training_type != 'Train3d':
                states['resnet_state_dict'] = resnet.state_dict()
            if args.training_type != 'Train2d':
                states['autoencoder_state_dict'] = autoencoder.state_dict()
            states['optimizer_state_dict'] = optimizer.state_dict()

            torch.save(states,
                       os.path.join(checkpoint_dir, f'checkpoint_{epoch}.tar'))

            if args.training_type != 'Train2d':
                res = {'FullBody': eval_body.get_results(),
                       'UpperBody': eval_upper.get_results(),
                       'LowerBody': eval_lower.get_results()}

                utils_io.write_json(config.eval.output_file, res)

            LOGGER.info('Done.')
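HeatmapLoss is annotated above as "same as MSELoss()", yet it is called with .mean() afterwards, so it presumably reduces to one value per sample rather than a single scalar; a sketch under that assumption:

import torch.nn as nn

class HeatmapLoss(nn.Module):
    """Hypothetical sketch: per-sample mean squared error on heatmaps."""

    def forward(self, pred, gt):
        # (N, K, H, W) -> (N,), so the caller can .mean() over the batch.
        return ((pred - gt) ** 2).mean(dim=(1, 2, 3))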
Example #10
def main():

    args = arguments.parse_args()
    LOGGER = ConsoleLogger('Train2d', 'train')
    logdir = LOGGER.getLogFolder()
    LOGGER.info(args)
    LOGGER.info(config)

    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # ------------------- Data loader -------------------

    data_transform = transforms.Compose([
        trsf.ImageTrsf(),  # normalize
        trsf.Joints3DTrsf(),  # centerize
        trsf.ToTensor()
    ])  # to tensor

    train_data = Mocap(config.dataset.train,
                       SetType.TRAIN,
                       transform=data_transform)
    train_data_loader = DataLoader(train_data,
                                   batch_size=args.batch_size,
                                   shuffle=config.data_loader.shuffle,
                                   num_workers=8)

    test_data = Mocap(config.dataset.test,
                      SetType.TEST,
                      transform=data_transform)
    test_data_loader = DataLoader(test_data,
                                  batch_size=2,
                                  shuffle=config.data_loader.shuffle,
                                  num_workers=8)

    # ------------------- Model -------------------
    with open('model/model.yaml') as fin:
        model_cfg = edict(yaml.safe_load(fin))
    resnet = pose_resnet.get_pose_net(model_cfg, True)
    Loss2D = HeatmapLoss()  # same as MSELoss()
    # LossMSE = nn.MSELoss()

    if torch.cuda.is_available():
        device = torch.device(f"cuda:{args.gpu}")
        resnet = resnet.cuda(device)
        Loss2D = Loss2D.cuda(device)

    # ------------------- optimizer -------------------
    optimizer = optim.Adam(resnet.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=args.step_size,
                                          gamma=0.1)

    # ------------------- load model -------------------
    if args.load_model:
        if not os.path.isfile(args.load_model):
            raise FileNotFoundError(
                f"No checkpoint found at {args.load_model}")
        checkpoint = torch.load(args.load_model)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        resnet.load_state_dict(checkpoint['resnet_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    # ------------------- tensorboard -------------------
    train_global_steps = 0
    writer_dict = {
        'writer': SummaryWriter(log_dir=logdir),
        'train_global_steps': train_global_steps
    }

    best_model = False
    best_perf = float('inf')
    # ------------------- run the model -------------------
    for epoch in range(args.epochs):
        with torch.autograd.set_detect_anomaly(True):
            LOGGER.info(f'====Training epoch {epoch}====')
            losses = AverageMeter()
            batch_time = AverageMeter()

            resnet.train()

            end = time.time()
            for it, (img, p2d, p3d, heatmap,
                     action) in enumerate(train_data_loader, 0):

                img = img.to(device)
                p2d = p2d.to(device)
                p3d = p3d.to(device)
                heatmap = heatmap.to(device)

                heatmap2d_hat = resnet(img)  # torch.Size([16, 15, 48, 48])

                loss2d = Loss2D(heatmap2d_hat, heatmap).mean()
                # loss2d = LossMSE(heatmap, heatmap2d_hat)

                loss = loss2d * args.lambda_2d

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_time.update(time.time() - end)
                losses.update(loss.item() / args.lambda_2d, img.size(0))

                if it % config.train.PRINT_FREQ == 0:
                    # logging messages
                    msg = 'Epoch: [{0}][{1}/{2}]\t' \
                          'Batch Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
                          'Speed {speed:.1f} samples/s\t' \
                          'Loss {loss.val:.5f} ({loss.avg:.5f})\t'.format(
                        epoch, it, len(train_data_loader), batch_time=batch_time,
                        speed=img.size(0) / batch_time.val,  # averaged within batch
                        loss=losses)
                    LOGGER.info(msg)

                    writer = writer_dict['writer']
                    global_steps = writer_dict['train_global_steps']
                    # add_scalar expects a scalar, so log the first
                    # parameter group's learning rate.
                    lr = scheduler.optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', lr, global_steps)
                    writer.add_scalar('train_loss', losses.val, global_steps)
                    writer.add_scalar('batch_time', batch_time.val,
                                      global_steps)
                    image_grid = draw2Dpred_and_gt(img, heatmap2d_hat)
                    writer.add_image('predicted_heatmaps', image_grid,
                                     global_steps)
                    writer_dict['train_global_steps'] = global_steps + 1

                end = time.time()
            scheduler.step()
            # ------------------- Save results -------------------
            checkpoint_dir = os.path.join(logdir, 'checkpoints')
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            LOGGER.info('=> saving checkpoint to {}'.format(checkpoint_dir))
            states = dict()
            states['resnet_state_dict'] = resnet.state_dict()
            states['optimizer_state_dict'] = optimizer.state_dict()
            states['scheduler'] = scheduler.state_dict()
            torch.save(states,
                       os.path.join(checkpoint_dir, f'checkpoint_{epoch}.tar'))

            # ------------------- validation -------------------
            resnet.eval()
            val_loss = validate(LOGGER, test_data_loader, resnet, device,
                                epoch)
            if val_loss < best_perf:
                best_perf = val_loss
                best_model = True

            if best_model:
                shutil.copyfile(
                    os.path.join(checkpoint_dir, f'checkpoint_{epoch}.tar'),
                    os.path.join(checkpoint_dir, 'model_best.tar'))
                best_model = False

    LOGGER.info('Done.')
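The validate() called above is not part of the snippet; a minimal sketch consistent with its call signature, assuming it returns an average 2D heatmap loss used for model selection:

import torch
import torch.nn.functional as F

def validate(logger, data_loader, resnet, device, epoch):
    """Hypothetical sketch matching validate(LOGGER, loader, resnet, device, epoch)."""
    total, count = 0.0, 0
    with torch.no_grad():
        for img, p2d, p3d, heatmap, action in data_loader:
            img, heatmap = img.to(device), heatmap.to(device)
            loss = F.mse_loss(resnet(img), heatmap)
            total += loss.item() * img.size(0)
            count += img.size(0)
    val_loss = total / max(count, 1)
    logger.info(f'Epoch {epoch} validation loss: {val_loss:.5f}')
    return val_loss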
Example #11
def main():
    ''' Main func '''

    # Args
    args = arguments.parse_args()

    # Args - Configfile path
    filepath = args.configfile

    # filename provided on cli.
    filename = os.path.basename(filepath)

    # Tmux.ini mode.
    if filename == 'tmux.ini':
        r.banner('Tmux.conf')
        # Config filepath.
        tmux_ini = './configs/tmux.ini'
        # tmux.conf and tmux.conf.bak filepaths.
        tmux_conf = os.path.join(os.path.expanduser('~'), '.tmux.conf')
        tmux_bak = tmux_conf + '.bak'
        if os.path.isfile(tmux_conf):
            # Create 'tmux.conf.bak' backup.
            r.logging.warning(f'File already exists: {tmux_conf}')
            filepath = copy_file(tmux_conf, tmux_bak)
            r.logging.info(f'Created backup: {filepath}')
        # Copy 'tmux.ini' to 'tmux.conf'.
        filepath = copy_file(tmux_ini, tmux_conf)
        r.logging.info(f'Copied: {tmux_ini} to {filepath}')

    # Alias mode - set delimiter '*' for 'aliases.ini' configfile.
    elif filename == 'aliases.ini':
        config = ConfigParser(allow_no_value=True, delimiters='*')
        config.optionxform = str
        config.read(filepath)
        # Config file values.
        ALIASES = [k for k in config['aliases']]
        EXPORTS = [k for k in config['bashrc']]
        # Check if aliases are in configfile.
        r.banner('Aliases')
        if ALIASES:
            filepath = os.path.join(os.path.expanduser('~'), '.bash_aliases')
            # Append list to file.
            results = append_lst(ALIASES, filepath)
            # Print results.
            for result in results:
                logging.info(f'Appended {filepath}: {result}')
        # Check if exports are in configfile.
        r.banner('Exports')
        if EXPORTS:
            filepath = os.path.join(os.path.expanduser('~'), '.bashrc')
            # Append list to file.
            results = append_lst(EXPORTS, filepath)
            # Print results.
            for result in results:
                logging.info(f'Appended {filepath}: {result}')

    # Tool mode - set delimiter '=' for all other configfiles.
    else:
        config = ConfigParser(allow_no_value=True, delimiters='=')
        config.optionxform = str
        config.read(filepath)
        # Config file values.
        DEST_DIR = list(config['tools_dir'])[0]
        GITHUB_URLS = [k for k in config['github_urls']]
        BINARY_URLS = [k for k in config['binary_urls']]
        PIP_PACKAGES = [k for k in config['pip_packages']]
        APT_PACKAGES = [k for k in config['apt_packages']]

        # Pause, mainly used for testing.
        r.ctrl_c()

        # Downloader init.
        dl = download.Downloader(DEST_DIR)
        # Github-Downloads.
        r.banner('Github Downloads')
        with r.console.status(status='[status.text]Downloading') as status:
            make_threaded(dl.get_gitrepo, GITHUB_URLS)
        # URL-Downloads.
        r.banner('URL Downloads')
        with r.console.status(status='[status.text]Downloading') as status:
            make_threaded(dl.get_binary, BINARY_URLS)

        # Installer init.
        installer = install.Installer()
        # Pip Download/Install.
        r.banner('Pip Downloads/Installs')
        with r.console.status(status='[status.text]Processing...') as status:
            results = list(map(installer.pip_install, PIP_PACKAGES))
        # APT Download/Install.
        r.banner('APT Downloads/Installs')
        with r.console.status(status='[status.text]Processing...') as status:
            results = list(map(installer.apt_install, APT_PACKAGES))
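make_threaded is a project helper used above to fan the download calls out over the URL lists; a sketch of a plausible implementation (the name and signature come from the snippet, the body is an assumption):

from concurrent.futures import ThreadPoolExecutor

def make_threaded(fn, items, max_workers=8):
    # Run fn over items concurrently and collect the results in order.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(fn, items))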