Example #1
def train(args, dataloader, model):
    epoch = 1
    optimizer = optim.Adam(list(model.parameters()), lr=args.lr)
    scheduler = MultiStepLR(optimizer, milestones=LR_milestones, gamma=args.lr)

    model.train()
    for epoch in range(5000):
        for batch_idx, data in enumerate(dataloader):
            model.zero_grad()
            features = data['features'].float()
            adj_input = data['adj'].float()

            features = Variable(features).cuda()
            adj_input = Variable(adj_input).cuda()
            
            loss = model(features, adj_input)
            print('Epoch: ', epoch, ', Iter: ', batch_idx, ', Loss: ', loss)
            loss.backward()

            optimizer.step()
            scheduler.step()
            break
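A note on the snippet above: scheduler.step() is called after every batch and the inner loop breaks after its first batch, so each "epoch" effectively sees one batch; LR_milestones is assumed to be defined elsewhere in that script, and gamma=args.lr looks as if a decay factor (e.g. an args.lr_decay) was intended. A minimal sketch of the more conventional epoch-level pattern, with the model, dataloader and milestone values as hypothetical stand-ins:

import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

# Hypothetical stand-ins: `model` returns a loss, `dataloader` yields dict batches as above.
optimizer = optim.Adam(model.parameters(), lr=args.lr)
scheduler = MultiStepLR(optimizer, milestones=[100, 200], gamma=0.3)  # decay at epochs 100 and 200

model.train()
for epoch in range(5000):
    for data in dataloader:
        optimizer.zero_grad()
        loss = model(data['features'].float().cuda(), data['adj'].float().cuda())
        loss.backward()
        optimizer.step()      # parameter update every batch
    scheduler.step()          # learning-rate decay once per epoch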
Example #2
def main():
    parser = argparse.ArgumentParser(
        description=
        'Large-scale Point Cloud Semantic Segmentation with Superpoint Graphs')

    # Optimization arguments
    parser.add_argument('--wd', default=0, type=float, help='Weight decay')
    parser.add_argument('--lr',
                        default=1e-2,
                        type=float,
                        help='Initial learning rate')
    parser.add_argument(
        '--lr_decay',
        default=0.7,
        type=float,
        help='Multiplicative factor used on learning rate at `lr_steps`')
    parser.add_argument(
        '--lr_steps',
        default='[]',
        help='List of epochs where the learning rate is decreased by `lr_decay`'
    )
    parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
    parser.add_argument(
        '--epochs',
        default=10,
        type=int,
        help='Number of epochs to train. If <=0, only testing will be done.')
    parser.add_argument('--batch_size', default=2, type=int, help='Batch size')
    parser.add_argument('--optim', default='adam', help='Optimizer: sgd|adam')
    parser.add_argument(
        '--grad_clip',
        default=1,
        type=float,
        help='Element-wise clipping of gradient. If 0, does not clip')
    parser.add_argument(
        '--loss_weights',
        default='none',
        help='[none, proportional, sqrt] how to weight the loss function')

    # Learning process arguments
    parser.add_argument('--cuda', default=1, type=int, help='Bool, use cuda')
    parser.add_argument(
        '--nworkers',
        default=0,
        type=int,
        help=
        'Num subprocesses to use for data loading. 0 means that the data will be loaded in the main process'
    )
    parser.add_argument('--test_nth_epoch',
                        default=1,
                        type=int,
                        help='Test each n-th epoch during training')
    parser.add_argument('--save_nth_epoch',
                        default=1,
                        type=int,
                        help='Save model each n-th epoch during training')
    parser.add_argument(
        '--test_multisamp_n',
        default=10,
        type=int,
        help='Average logits obtained over runs with different seeds')

    # Dataset
    parser.add_argument('--dataset',
                        default='sema3d',
                        help='Dataset name: sema3d|s3dis')
    parser.add_argument(
        '--cvfold',
        default=0,
        type=int,
        help='Fold left-out for testing in leave-one-out setting (S3DIS)')
    parser.add_argument('--odir',
                        default='results',
                        help='Directory to store results')
    parser.add_argument('--resume',
                        default='',
                        help='Loads a previously saved model.')
    parser.add_argument('--db_train_name', default='train')
    parser.add_argument('--db_test_name', default='test')
    parser.add_argument('--use_val_set', type=int, default=0)
    parser.add_argument('--SEMA3D_PATH', default='datasets/semantic3d')
    parser.add_argument('--S3DIS_PATH', default='datasets/s3dis')
    parser.add_argument('--VKITTI_PATH', default='datasets/vkitti')
    parser.add_argument('--CUSTOM_SET_PATH', default='datasets/custom_set')
    parser.add_argument(
        '--use_pyg',
        default=0,
        type=int,
        help='Whether to use PyTorch Geometric for graph convolutions')

    # Model
    parser.add_argument(
        '--model_config',
        default='gru_10,f_8',
        help=
        'Defines the model as a sequence of layers, see graphnet.py for definitions of respective layers and acceptable arguments. In short: rectype_repeats_mv_layernorm_ingate_concat, with rectype the type of recurrent unit [gru/crf/lstm], repeats the number of message passing iterations, mv (default True) the use of matrix-vector (mv) instead of vector-vector (vv) edge filters, layernorm (default True) the use of layernorms in the recurrent units, ingate (default True) the use of input gating, concat (default True) the use of state concatenation'
    )
    parser.add_argument('--seed',
                        default=1,
                        type=int,
                        help='Seed for random initialisation')
    parser.add_argument(
        '--edge_attribs',
        default=
        'delta_avg,delta_std,nlength/ld,surface/ld,volume/ld,size/ld,xyz/d',
        help=
        'Edge attribute definition, see spg_edge_features() in spg.py for definitions.'
    )

    # Point cloud processing
    parser.add_argument(
        '--pc_attribs',
        default='xyzrgbelpsvXYZ',
        help=
        'Point attributes fed to PointNets, if empty then all possible. xyz = coordinates, rgb = color, e = elevation, lpsv = geometric feature, d = distance to center'
    )
    parser.add_argument(
        '--pc_augm_scale',
        default=0,
        type=float,
        help=
        'Training augmentation: Uniformly random scaling in [1/scale, scale]')
    parser.add_argument(
        '--pc_augm_rot',
        default=1,
        type=int,
        help='Training augmentation: Bool, random rotation around z-axis')
    parser.add_argument(
        '--pc_augm_mirror_prob',
        default=0,
        type=float,
        help='Training augmentation: Probability of mirroring about x or y axes'
    )
    parser.add_argument(
        '--pc_augm_jitter',
        default=1,
        type=int,
        help='Training augmentation: Bool, Gaussian jittering of all attributes'
    )
    parser.add_argument(
        '--pc_xyznormalize',
        default=1,
        type=int,
        help='Bool, normalize xyz into unit ball, i.e. in [-0.5,0.5]')

    # Filter generating network
    parser.add_argument(
        '--fnet_widths',
        default='[32,128,64]',
        help=
        'List of widths of hidden filter gen net layers (excluding the input and output ones, they are automatic)'
    )
    parser.add_argument(
        '--fnet_llbias',
        default=0,
        type=int,
        help='Bool, use bias in the last layer in filter gen net')
    parser.add_argument(
        '--fnet_orthoinit',
        default=1,
        type=int,
        help='Bool, use orthogonal weight initialization for filter gen net.')
    parser.add_argument(
        '--fnet_bnidx',
        default=2,
        type=int,
        help='Layer index to insert batchnorm to. -1=do not insert.')
    parser.add_argument(
        '--edge_mem_limit',
        default=30000,
        type=int,
        help=
        'Number of edges to process in parallel during computation, a low number can reduce memory peaks.'
    )

    # Superpoint graph
    parser.add_argument(
        '--spg_attribs01',
        default=1,
        type=int,
        help='Bool, normalize edge features to 0 mean 1 deviation')
    parser.add_argument('--spg_augm_nneigh',
                        default=100,
                        type=int,
                        help='Number of neighborhoods to sample in SPG')
    parser.add_argument('--spg_augm_order',
                        default=3,
                        type=int,
                        help='Order of neighborhoods to sample in SPG')
    parser.add_argument(
        '--spg_augm_hardcutoff',
        default=512,
        type=int,
        help=
        'Maximum number of superpoints larger than args.ptn_minpts to sample in SPG'
    )
    parser.add_argument(
        '--spg_superedge_cutoff',
        default=-1,
        type=float,
        help=
        'Artificially constrained maximum length of superedge, -1=do not constrain'
    )

    # Point net
    parser.add_argument(
        '--ptn_minpts',
        default=40,
        type=int,
        help=
        'Minimum number of points in a superpoint for computing its embedding.'
    )
    parser.add_argument('--ptn_npts',
                        default=128,
                        type=int,
                        help='Number of input points for PointNet.')
    parser.add_argument('--ptn_widths',
                        default='[[64,64,128,128,256], [256,64,32]]',
                        help='PointNet widths')
    parser.add_argument('--ptn_widths_stn',
                        default='[[64,64,128], [128,64]]',
                        help='PointNet\'s Transformer widths')
    parser.add_argument(
        '--ptn_nfeat_stn',
        default=11,
        type=int,
        help='PointNet\'s Transformer number of input features')
    parser.add_argument('--ptn_prelast_do', default=0, type=float)
    parser.add_argument(
        '--ptn_mem_monger',
        default=1,
        type=int,
        help=
        'Bool, save GPU memory by recomputing PointNets in back propagation.')

    # Decoder
    parser.add_argument(
        '--sp_decoder_config',
        default="[]",
        type=str,
        help=
        'Size of the decoder : sp_embedding -> sp_class. First layer of size sp_embed (* (1+n_ecc_iteration) if concatenation) and last layer is n_classes'
    )

    args = parser.parse_args()
    args.start_epoch = 0
    args.lr_steps = ast.literal_eval(args.lr_steps)
    args.fnet_widths = ast.literal_eval(args.fnet_widths)
    args.ptn_widths = ast.literal_eval(args.ptn_widths)
    args.sp_decoder_config = ast.literal_eval(args.sp_decoder_config)
    args.ptn_widths_stn = ast.literal_eval(args.ptn_widths_stn)

    print('Will save to ' + args.odir)
    if not os.path.exists(args.odir):
        os.makedirs(args.odir)
    with open(os.path.join(args.odir, 'cmdline.txt'), 'w') as f:
        f.write(" ".join([
            "'" + a + "'" if (len(a) == 0 or a[0] != '-') else a
            for a in sys.argv
        ]))

    set_seed(args.seed, args.cuda)
    logging.getLogger().setLevel(
        logging.INFO)  #set to logging.DEBUG to allow for more prints
    if (args.dataset == 'sema3d' and args.db_test_name.startswith('test')) or (
            args.dataset.startswith('s3dis_02') and args.cvfold == 2):
        # needed in pytorch 0.2 for super-large graphs with batchnorm in fnet  (https://github.com/pytorch/pytorch/pull/2919)
        torch.backends.cudnn.enabled = False

    if args.use_pyg:
        torch.backends.cudnn.enabled = False

    # Decide on the dataset
    if args.dataset == 'sema3d':
        import sema3d_dataset
        dbinfo = sema3d_dataset.get_info(args)
        create_dataset = sema3d_dataset.get_datasets
    elif args.dataset == 's3dis':
        import s3dis_dataset
        dbinfo = s3dis_dataset.get_info(args)
        create_dataset = s3dis_dataset.get_datasets
    elif args.dataset == 'vkitti':
        import vkitti_dataset
        dbinfo = vkitti_dataset.get_info(args)
        create_dataset = vkitti_dataset.get_datasets
    elif args.dataset == 'custom_dataset':
        import custom_dataset  #<- to write!
        dbinfo = custom_dataset.get_info(args)
        create_dataset = custom_dataset.get_datasets
    else:
        raise NotImplementedError('Unknown dataset ' + args.dataset)

    # Create model and optimizer
    if args.resume != '':
        if args.resume == 'RESUME': args.resume = args.odir + '/model.pth.tar'
        model, optimizer, stats = resume(args, dbinfo)
    else:
        model = create_model(args, dbinfo)
        optimizer = create_optimizer(args, model)
        stats = []

    train_dataset, test_dataset, valid_dataset, scaler = create_dataset(args)

    print(
        'Train dataset: %i elements - Test dataset: %i elements - Validation dataset: %i elements'
        % (len(train_dataset), len(test_dataset), len(valid_dataset)))
    ptnCloudEmbedder = pointnet.CloudEmbedder(args)
    scheduler = MultiStepLR(optimizer,
                            milestones=args.lr_steps,
                            gamma=args.lr_decay,
                            last_epoch=args.start_epoch - 1)

    ############
    def train():
        """ Trains for one epoch """
        model.train()

        loader = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=args.batch_size,
                                             collate_fn=spg.eccpc_collate,
                                             num_workers=args.nworkers,
                                             shuffle=True,
                                             drop_last=True)
        if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
            loader = tqdm(loader, ncols=65)

        loss_meter = tnt.meter.AverageValueMeter()
        acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)
        confusion_matrix = metrics.ConfusionMatrix(dbinfo['classes'])
        t0 = time.time()

        # iterate over dataset in batches
        for bidx, (targets, GIs, clouds_data) in enumerate(loader):
            t_loader = 1000 * (time.time() - t0)

            model.ecc.set_info(GIs, args.cuda)
            label_mode_cpu, label_vec_cpu, segm_size_cpu = \
                targets[:, 0], targets[:, 2:], targets[:, 1:].sum(1)
            if args.cuda:
                label_mode, label_vec, segm_size = \
                    label_mode_cpu.cuda(), label_vec_cpu.float().cuda(), segm_size_cpu.float().cuda()
            else:
                label_mode, label_vec, segm_size = \
                    label_mode_cpu, label_vec_cpu.float(), segm_size_cpu.float()

            optimizer.zero_grad()
            t0 = time.time()

            embeddings = ptnCloudEmbedder.run(model, *clouds_data)
            outputs = model.ecc(embeddings)

            loss = nn.functional.cross_entropy(outputs,
                                               Variable(label_mode),
                                               weight=dbinfo["class_weights"])

            loss.backward()
            ptnCloudEmbedder.bw_hook()

            if args.grad_clip > 0:
                for p in model.parameters():
                    p.grad.data.clamp_(-args.grad_clip, args.grad_clip)
            optimizer.step()

            t_trainer = 1000 * (time.time() - t0)
            #loss_meter.add(loss.data[0]) # pytorch 0.3
            loss_meter.add(loss.item())  # pytorch 0.4

            o_cpu, t_cpu, tvec_cpu = filter_valid(outputs.data.cpu().numpy(),
                                                  label_mode_cpu.numpy(),
                                                  label_vec_cpu.numpy())
            acc_meter.add(o_cpu, t_cpu)
            confusion_matrix.count_predicted_batch(tvec_cpu,
                                                   np.argmax(o_cpu, 1))

            logging.debug(
                'Batch loss %f, Loader time %f ms, Trainer time %f ms.',
                loss.data.item(), t_loader, t_trainer)
            t0 = time.time()

        return (acc_meter.value()[0], loss_meter.value()[0],
                confusion_matrix.get_overall_accuracy(),
                confusion_matrix.get_average_intersection_union())

    ############
    def eval(is_valid=False):
        """ Evaluated model on test set """
        model.eval()

        if is_valid:  #validation
            loader = torch.utils.data.DataLoader(valid_dataset,
                                                 batch_size=1,
                                                 collate_fn=spg.eccpc_collate,
                                                 num_workers=args.nworkers)
        else:  #evaluation
            loader = torch.utils.data.DataLoader(test_dataset,
                                                 batch_size=1,
                                                 collate_fn=spg.eccpc_collate,
                                                 num_workers=args.nworkers)

        if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
            loader = tqdm(loader, ncols=65)

        acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)
        loss_meter = tnt.meter.AverageValueMeter()
        confusion_matrix = metrics.ConfusionMatrix(dbinfo['classes'])

        # iterate over dataset in batches
        for bidx, (targets, GIs, clouds_data) in enumerate(loader):
            model.ecc.set_info(GIs, args.cuda)
            label_mode_cpu, label_vec_cpu, segm_size_cpu = \
                targets[:, 0], targets[:, 2:], targets[:, 1:].sum(1).float()
            if args.cuda:
                label_mode, label_vec, segm_size = \
                    label_mode_cpu.cuda(), label_vec_cpu.float().cuda(), segm_size_cpu.float().cuda()
            else:
                label_mode, label_vec, segm_size = \
                    label_mode_cpu, label_vec_cpu.float(), segm_size_cpu.float()

            embeddings = ptnCloudEmbedder.run(model, *clouds_data)
            outputs = model.ecc(embeddings)

            loss = nn.functional.cross_entropy(outputs,
                                               Variable(label_mode),
                                               weight=dbinfo["class_weights"])
            loss_meter.add(loss.item())

            o_cpu, t_cpu, tvec_cpu = filter_valid(outputs.data.cpu().numpy(),
                                                  label_mode_cpu.numpy(),
                                                  label_vec_cpu.numpy())
            if t_cpu.size > 0:
                acc_meter.add(o_cpu, t_cpu)
                confusion_matrix.count_predicted_batch(tvec_cpu,
                                                       np.argmax(o_cpu, 1))

        return (meter_value(acc_meter), loss_meter.value()[0],
                confusion_matrix.get_overall_accuracy(),
                confusion_matrix.get_average_intersection_union(),
                confusion_matrix.get_mean_class_accuracy())

    ############
    def eval_final():
        """ Evaluated model on test set in an extended way: computes estimates over multiple samples of point clouds and stores predictions """
        model.eval()

        acc_meter = tnt.meter.ClassErrorMeter(accuracy=True)
        confusion_matrix = metrics.ConfusionMatrix(dbinfo['classes'])
        collected, predictions = defaultdict(list), {}

        # collect predictions over multiple sampling seeds
        for ss in range(args.test_multisamp_n):
            test_dataset_ss = create_dataset(args, ss)[1]
            loader = torch.utils.data.DataLoader(test_dataset_ss,
                                                 batch_size=1,
                                                 collate_fn=spg.eccpc_collate,
                                                 num_workers=args.nworkers)
            if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
                loader = tqdm(loader, ncols=65)

            # iterate over dataset in batches
            for bidx, (targets, GIs, clouds_data) in enumerate(loader):
                model.ecc.set_info(GIs, args.cuda)
                label_mode_cpu, label_vec_cpu, segm_size_cpu = \
                    targets[:, 0], targets[:, 2:], targets[:, 1:].sum(1).float()

                embeddings = ptnCloudEmbedder.run(model, *clouds_data)
                outputs = model.ecc(embeddings)

                fname = clouds_data[0][0][:clouds_data[0][0].rfind('.')]
                collected[fname].append(
                    (outputs.data.cpu().numpy(), label_mode_cpu.numpy(),
                     label_vec_cpu.numpy()))

        # aggregate predictions (mean)
        for fname, lst in collected.items():
            o_cpu, t_cpu, tvec_cpu = list(zip(*lst))
            if args.test_multisamp_n > 1:
                o_cpu = np.mean(np.stack(o_cpu, 0), 0)
            else:
                o_cpu = o_cpu[0]
            t_cpu, tvec_cpu = t_cpu[0], tvec_cpu[0]
            predictions[fname] = np.argmax(o_cpu, 1)
            o_cpu, t_cpu, tvec_cpu = filter_valid(o_cpu, t_cpu, tvec_cpu)
            if t_cpu.size > 0:
                acc_meter.add(o_cpu, t_cpu)
                confusion_matrix.count_predicted_batch(tvec_cpu,
                                                       np.argmax(o_cpu, 1))

        per_class_iou = {}
        perclsiou = confusion_matrix.get_intersection_union_per_class()
        for c, name in dbinfo['inv_class_map'].items():
            per_class_iou[name] = perclsiou[c]

        return (meter_value(acc_meter), confusion_matrix.get_overall_accuracy(),
                confusion_matrix.get_average_intersection_union(),
                per_class_iou, predictions,
                confusion_matrix.get_mean_class_accuracy(),
                confusion_matrix.confusion_matrix)

    ############
    # Training loop
    try:
        best_iou = stats[-1]['best_iou']
    except:
        best_iou = 0
    TRAIN_COLOR = '\033[0m'
    VAL_COLOR = '\033[0;94m'
    TEST_COLOR = '\033[0;93m'
    BEST_COLOR = '\033[0;92m'
    epoch = args.start_epoch

    for epoch in range(args.start_epoch, args.epochs):
        print('Epoch {}/{} ({}):'.format(epoch, args.epochs, args.odir))
        scheduler.step()

        acc, loss, oacc, avg_iou = train()

        print(TRAIN_COLOR + '-> Train Loss: %1.4f   Train accuracy: %3.2f%%' %
              (loss, acc))

        new_best_model = False
        if args.use_val_set:
            acc_val, loss_val, oacc_val, avg_iou_val, avg_acc_val = eval(True)
            print(VAL_COLOR + '-> Val Loss: %1.4f  Val accuracy: %3.2f%%  Val oAcc: %3.2f%%  Val IoU: %3.2f%%  best ioU: %3.2f%%' % \
                 (loss_val, acc_val, 100*oacc_val, 100*avg_iou_val,100*max(best_iou,avg_iou_val)) + TRAIN_COLOR)
            if avg_iou_val > best_iou:  #best score yet on the validation set
                print(BEST_COLOR + '-> New best model achieved!' + TRAIN_COLOR)
                best_iou = avg_iou_val
                new_best_model = True
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'args': args,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scaler': scaler
                    }, os.path.join(args.odir, 'model.pth.tar'))
        elif epoch % args.save_nth_epoch == 0 or epoch == args.epochs - 1:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'args': args,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scaler': scaler
                }, os.path.join(args.odir, 'model.pth.tar'))
        #test every test_nth_epoch
        #or test after each new model (but skip the first 5 for efficiency)
        if (not(args.use_val_set) and (epoch+1) % args.test_nth_epoch == 0)  \
           or (args.use_val_set and new_best_model and epoch > 5):
            acc_test, loss_test, oacc_test, avg_iou_test, avg_acc_test = eval(
                False)
            print(TEST_COLOR + '-> Test Loss: %1.4f  Test accuracy: %3.2f%%  Test oAcc: %3.2f%%  Test avgIoU: %3.2f%%' % \
                 (loss_test, acc_test, 100*oacc_test, 100*avg_iou_test) + TRAIN_COLOR)
        else:
            acc_test, loss_test, oacc_test, avg_iou_test, avg_acc_test = 0, 0, 0, 0, 0

        stats.append({
            'epoch': epoch,
            'acc': acc,
            'loss': loss,
            'oacc': oacc,
            'avg_iou': avg_iou,
            'acc_test': acc_test,
            'oacc_test': oacc_test,
            'avg_iou_test': avg_iou_test,
            'avg_acc_test': avg_acc_test,
            'best_iou': best_iou
        })
        """
        if epoch % args.save_nth_epoch == 0 or epoch==args.epochs-1:
            with open(os.path.join(args.odir, 'trainlog.json'), 'w') as outfile:
                json.dump(stats, outfile,indent=4)
            torch.save({'epoch': epoch + 1, 'args': args, 'state_dict': model.state_dict(), 'optimizer' : optimizer.state_dict(), 'scaler': scaler},
                       os.path.join(args.odir, 'model.pth.tar'))
        """

        if math.isnan(loss): break

        if len(stats) > 0:
            with open(os.path.join(args.odir, 'trainlog.json'),
                      'w') as outfile:
                json.dump(stats, outfile, indent=4)

    if args.use_val_set:
        args.resume = args.odir + '/model.pth.tar'
        model, optimizer, stats = resume(args, dbinfo)
        torch.save(
            {
                'epoch': epoch + 1,
                'args': args,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }, os.path.join(args.odir, 'model.pth.tar'))

    # Final evaluation
    if args.test_multisamp_n > 0 and 'test' in args.db_test_name:
        acc_test, oacc_test, avg_iou_test, per_class_iou_test, predictions_test, avg_acc_test, confusion_matrix = eval_final()
        print(
            '-> Multisample {}: Test accuracy: {}, \tTest oAcc: {}, \tTest avgIoU: {}, \tTest mAcc: {}'
            .format(args.test_multisamp_n, acc_test, oacc_test, avg_iou_test,
                    avg_acc_test))
        with h5py.File(
                os.path.join(args.odir,
                             'predictions_' + args.db_test_name + '.h5'),
                'w') as hf:
            for fname, o_cpu in predictions_test.items():
                hf.create_dataset(name=fname, data=o_cpu)  #(0-based classes)
        with open(
                os.path.join(args.odir,
                             'scores_' + args.db_test_name + '.json'),
                'w') as outfile:
            json.dump([{
                'epoch': args.start_epoch,
                'acc_test': acc_test,
                'oacc_test': oacc_test,
                'avg_iou_test': avg_iou_test,
                'per_class_iou_test': per_class_iou_test,
                'avg_acc_test': avg_acc_test
            }], outfile)
        np.save(os.path.join(args.odir, 'pointwise_cm.npy'), confusion_matrix)
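Example #2 rebuilds the scheduler with last_epoch=args.start_epoch - 1 so that a resumed run lands on the same decay schedule. A minimal sketch of that resume pattern, with a hypothetical checkpoint layout similar to the one saved above and illustrative milestones:

import torch
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR

checkpoint = torch.load('results/model.pth.tar')     # hypothetical path
start_epoch = checkpoint['epoch']

model.load_state_dict(checkpoint['state_dict'])      # `model` assumed to exist
optimizer = optim.Adam(model.parameters(), lr=1e-2)
optimizer.load_state_dict(checkpoint['optimizer'])

# Recreate the schedule as if `start_epoch` epochs had already elapsed.
# Note: with last_epoch != -1 each param group needs an 'initial_lr' entry,
# which load_state_dict only restores if a scheduler existed when the optimizer
# state was saved; otherwise use last_epoch=-1 and step the scheduler start_epoch times.
scheduler = MultiStepLR(optimizer, milestones=[50, 75],
                        gamma=0.7, last_epoch=start_epoch - 1)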
Example #3
def training_benchmark(arg, milestones):
    logging.basicConfig(filename=arg.log_path, level=logging.INFO)  # log file
    logging.info('Started')
    if not os.path.exists(arg.model_path):
        os.makedirs(arg.model_path)
    model = VDN_NET(in_channels=arg.channels, depth_snet=arg.snet)
    model = model.float()
    clipping = bool(arguments.clipping)

    # Load training data
    obj_data = gd.TrainBenchmark(h5_file_=arg.train_data,
                                 patch_size=arg.patch,
                                 window=11,
                                 radius=5)
    if torch.cuda.is_available():
        model.cuda()
        torch.backends.cudnn.benchmark = True
        data = DataLoader(obj_data,
                          batch_size=arg.batch,
                          shuffle=True,
                          num_workers=arg.workers,
                          pin_memory=True)
    else:
        data = DataLoader(obj_data, batch_size=arg.batch, shuffle=True)

    # network parameters
    epsilon = np.sqrt(1.0e-6)
    p_window = 7
    if clipping:
        gradient_clip_Dnet = 1000.0
        gradient_clip_Snet = 50.0
        Dnet_parameters = [
            x for name, x in model.named_parameters()
            if 'dnet' in name.lower()
        ]
        Snet_parameters = [
            x for name, x in model.named_parameters()
            if 'snet' in name.lower()
        ]
    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=arg.gamma)

    print("Training model Benchmark now!")

    for epoch in range(arg.epochs):
        tic = time.time()
        if clipping:
            grad_D = 0.0
            grad_S = 0.0
        epoch_avg_loss = 0.0
        mse_avg = 0.0
        psnr_avg = 0.0
        ssim_avg = 0.0
        lr = optimizer.param_groups[0]['lr']
        if lr < arg.learning:
            print("reach min learning rate at epoch" + str(epoch))
        model.train()
        for i, batch_data in enumerate(data):
            if torch.cuda.is_available():
                y_batch, x_batch, sigma_arr = (Variable(batch_data[0]).cuda(),
                                               Variable(batch_data[1]).cuda(),
                                               Variable(batch_data[2]).cuda())
            else:
                y_batch, x_batch, sigma_arr = batch_data[0], batch_data[1], batch_data[2]
            optimizer.zero_grad()
            out_D, out_s = model(y_batch)
            loss, loglikelihood, kl_z, kl_sigma = loss_func.get_loss(
                x_batch, y_batch, sigma_arr, p_window,
                out_D[:, :arg.channels, :, :], out_D[:, arg.channels:, :, :],
                out_s[:, :arg.channels, :, :], out_s[:, arg.channels:, :, :],
                epsilon)
            loss.backward()
            if clipping:
                full_grad_D = nn.utils.clip_grad_norm_(Dnet_parameters, gradient_clip_Dnet)
                full_grad_S = nn.utils.clip_grad_norm_(Snet_parameters, gradient_clip_Snet)
                grad_D = (grad_D * (i / (i + 1)) + full_grad_D / (i + 1))
                grad_S = (grad_S * (i / (i + 1)) + full_grad_S / (i + 1))
            optimizer.step()
            epoch_avg_loss += loss.detach().item()
            predicted_image = y_batch - out_D[:, :arg.channels, :, :].detach().data
            predicted_image = predicted_image.clamp(0, 1)
            mse = calc_MSE(predicted_image, x_batch)
            mse_avg += mse
            psnr_avg += psnr(predicted_image * 255, x_batch * 255)
            ssim_avg += calculate_ssim(img_as_ubyte(
                predicted_image.permute(2, 3, 1, 0).cpu().numpy()),
                                       img_as_ubyte(
                                           x_batch.permute(2, 3, 1,
                                                           0).cpu().numpy()),
                                       multichannel=True)
            if i == 0:
                print("First ForwardPAss\n Loss: {}, MSE: {}".format(
                    loss.detach().item(), mse))
            if (i + 1) % 100 == 0:
                print("{} - Loss: {}, MSE:{}, epoch:{}".format(
                    i + 1, loss.item(), mse, epoch + 1))
            if i >= 5000:
                break
        if clipping:
            gradient_clip_Dnet = min(gradient_clip_Dnet, grad_D)
            gradient_clip_Snet = min(gradient_clip_Snet, grad_S)
        print("----------------------------------------------------------")
        print(
            "Epoch: {},  Avg MSE:{},  Avg Epoch Loss:{},  Avg PSNR:{}, Avg SSIM : {}, LR:{}"
            .format(epoch + 1, mse_avg / (i + 1), epoch_avg_loss / (i + 1),
                    psnr_avg / (i + 1), ssim_avg / (i + 1), lr))
        logging.info("av loss: {}, epoch: {}".format(epoch_avg_loss / (i + 1),
                                                     epoch + 1))
        # --------------- here comes the validation!  ---------------
        model.eval()
        avg_psnr_validation = 0.0
        avg_ssim_validation = 0.0
        obj_data = gd.ValidationBenchmark(h5_file_=arg.val_data)
        if torch.cuda.is_available():
            model.cuda()
            torch.backends.cudnn.benchmark = True
        for idx in range(obj_data.__len__()):
            noisy, image = obj_data.__getitem__(idx)
            ch, ht, wt = noisy.shape
            noisy = noisy.view(1, ch, ht, wt).cuda()
            image = image.cuda()
            model_out, _ = model(noisy)
            noise = noisy - model_out[:, :ch, ].detach().data
            clean_img_pred = noise.view(ch, ht, wt).permute(1, 2,
                                                            0).clamp(0, 1)
            image = image.view(ch, ht, wt).permute(1, 2, 0)
            avg_psnr_validation += psnr(image * 255, clean_img_pred * 255)
            avg_ssim_validation += compare_ssim(
                img_as_ubyte(image.cpu().numpy()),
                img_as_ubyte(clean_img_pred.cpu().numpy()),
                win_size=11,
                data_range=255,
                multichannel=True,
                gaussian_weights=True)
        print("average validation PSNR = ",
              avg_psnr_validation / obj_data.__len__())
        print("average validation SSIM = ",
              avg_ssim_validation / obj_data.__len__())

        # -------------- finish validation ---------------------------------
        scheduler.step()
        toc = time.time()
        print('Time for this epoch: {:.2f}'.format(toc - tic))
        if epoch % arguments.epoch_save == 0:
            torch.save(
                model.state_dict(),
                os.path.join(arg.model_path,
                             "model_" + str(epoch) + "_epochs.pth"))
            print("saved model as" + arg.model_path)
    print("Finished Training...\n Saving model now.....\n")
    torch.save(model.state_dict(),
               os.path.join(arg.model_path, "final_model.pth"))
    print("saved model as" + os.path.join(arg.model_path, "final_model.pth"))
Example #4
#    model = models.__dict__[opt.MODEL.ARCH]()

if opt.MODEL.ARCH.startswith('densenet'):
    assert (opt.MODEL.INPUT_SIZE % 32 == 0)
    model.avgpool = nn.AvgPool2d(opt.MODEL.INPUT_SIZE // 32, stride=1)
    #model.avgpool = nn.AdaptiveAvgPool2d(1)
    model.classifier = nn.Linear(model.classifier.in_features,
                                 DATA_INFO.NUM_CLASSES)
    model = torch.nn.DataParallel(model).cuda()
else:
    raise NotImplementedError
    model = torch.nn.DataParallel(model).cuda()

optimizer = optim.Adam(model.module.parameters(), opt.TRAIN.LEARNING_RATE)
lr_scheduler = MultiStepLR(optimizer,
                           opt.TRAIN.LR_MILESTONES,
                           gamma=opt.TRAIN.LR_GAMMA,
                           last_epoch=-1)

if opt.TRAIN.RESUME is None:
    last_epoch = 0
    logger.info("Training will start from Epoch {}".format(last_epoch + 1))

else:
    last_checkpoint = torch.load(opt.TRAIN.RESUME)
    assert (last_checkpoint['arch'] == opt.MODEL.ARCH)
    model.module.load_state_dict(last_checkpoint['state_dict'])
    optimizer.load_state_dict(last_checkpoint['optimizer'])
    logger.info("Checkpoint '{}' was loaded.".format(opt.TRAIN.RESUME))

    last_epoch = last_checkpoint['epoch']
    logger.info("Training will be resumed from Epoch {}".format(
Example #5
            'params': model.features.parameters(),
            'lr': 1e-4 * 10
        }, {
            'params': model.classifier.parameters(),
            'lr': 1e-4
        }]
    else:
        for param in model.embedding.parameters():
            param.requires_grad = False
        for param in model.features.parameters():
            param.requires_grad = False
        optim_configs = [{'params': model.classifier.parameters(), 'lr': 1e-4}]
    optimizer = Adam(optim_configs, lr=1e-4)
    lr_scheduler = MultiStepLR(
        optimizer,
        milestones=[int(NUM_EPOCHS * 0.5),
                    int(NUM_EPOCHS * 0.7)],
        gamma=0.1)

    print(
        "# trainable parameters:",
        sum(param.numel() if param.requires_grad else 0
            for param in model.parameters()))
    # record statistics
    results = {
        'train_loss': [],
        'train_accuracy': [],
        'test_loss': [],
        'test_accuracy': []
    }
    # record current best test accuracy
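Example #5 builds the optimizer from parameter groups with different learning rates (a larger one for the features, a smaller one for the classifier); at each milestone MultiStepLR multiplies every group's rate by the same gamma. A small self-contained sketch of that behaviour with a hypothetical two-part model:

import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR

model = nn.ModuleDict({'features': nn.Linear(4, 4), 'classifier': nn.Linear(4, 2)})
optimizer = Adam([
    {'params': model['features'].parameters(), 'lr': 1e-3},
    {'params': model['classifier'].parameters(), 'lr': 1e-4},
])
scheduler = MultiStepLR(optimizer, milestones=[5, 7], gamma=0.1)

for epoch in range(10):
    optimizer.step()      # placeholder for a real training epoch
    scheduler.step()
    print(epoch, [group['lr'] for group in optimizer.param_groups])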
Example #6
net = ResNet50()

if USE_CUDA:
    net.cuda()

    devices = []
    for i in range(args.devices):
        devices.append(i)

    if len(devices)>1:
        net = torch.nn.DataParallel(net, device_ids=devices)
        cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[args.epochs*.25, args.epochs*.5,args.epochs*.75], gamma=0.1)

def train(epoch):
    print('\nEpoch: %d' % epoch)
    global iter_count
    
    epoch_start_time = time.time()
    scheduler.step()

    net.train()
    train_loss = 0
    correct = 0
    total = 0

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        iter_count += 1
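One caveat about Example #6: the milestones are computed as args.epochs*.25 and so on, which yields floats. Depending on the PyTorch version, a fractional milestone (e.g. 22.5 for 90 epochs) is either silently skipped or applied at the next integer epoch, so rounding to integers is the safer variant; a hedged sketch with the same names:

from torch.optim.lr_scheduler import MultiStepLR

# `optimizer` and `args` as in the example above.
milestones = [int(args.epochs * frac) for frac in (0.25, 0.5, 0.75)]
scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1)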
Example #7
def train(y_train,
          X_train,
          y_val,
          X_val,
          ld,
          frq,
          beta,
          alpha=1.0,
          rho=0.9,
          loss_name='L0'):
    LOSS = {
        'L0': LossFunc(False, False),
        'L1': LossFunc(True, False),  # Penalty 2
        'L2': LossFunc(False, True),  # Penalty 1
        'L3': LossFunc(True, True)  # Penalty 1 + 2
    }
    Loss = LOSS[loss_name]
    Weight = Variable(torch.FloatTensor(0.5 * np.ones(beta.shape)),
                      requires_grad=True)
    frq = Variable(torch.Tensor(frq))
    ld = Variable(torch.Tensor(ld))

    batch_size = 50
    train_dataset = AssocDataSet(X=X_train, y=y_train)
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=1)

    val_X = Variable(torch.Tensor(X_val), requires_grad=False)
    val_y = Variable(torch.Tensor(y_val), requires_grad=False)

    opt = torch.optim.Adam([Weight], lr=0.02)
    scheduler = MultiStepLR(opt,
                            milestones=([x * 5 for x in range(1, 25)] +
                                        [200, 300, 400]),
                            gamma=0.83)

    epoch_iterator = tqdm(range(101))
    for epoch in epoch_iterator:
        epoch_losses = []
        for cur_X, cur_y in train_loader:
            opt.zero_grad()
            cur_X = Variable(cur_X, requires_grad=False)
            cur_y = Variable(cur_y, requires_grad=False)
            loss = Loss(cur_X,
                        cur_y,
                        Weight,
                        alpha=alpha,
                        rho=rho,
                        gamma=frq,
                        tau=ld)
            epoch_losses.append(loss.item())  # was loss.data[0] (pytorch 0.3)
            loss.backward()
            opt.step()

        scheduler.step()

        val_loss = Loss(val_X,
                        val_y,
                        Weight,
                        alpha=alpha,
                        rho=rho,
                        gamma=frq,
                        tau=ld).item()  # was .data[0] (pytorch 0.3)
        status = 'Epoch[{}]: loss: {}, val: {}; rho: {}; alpha: {}'.format(
            epoch, np.mean(epoch_losses), val_loss, rho, alpha)
        epoch_iterator.set_description(status)

        weight_name = '{}_rho_{}_alpha_{}.npy'.format(loss_name,
                                                      str(rho)[:3],
                                                      str(alpha)[:3])
        weight_dir = os.path.join('weight', weight_name)
        weight_file = os.path.abspath(os.path.expanduser(weight_dir))
        weight = Weight.data.numpy()
        np.save(weight_file, weight)

    return val_loss
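Example #7 optimizes a single weight tensor directly rather than an nn.Module, and mixes a dense block of early milestones with a few late ones. A compact, runnable sketch of that setup with illustrative data (not the LossFunc used above):

import torch
from torch.optim.lr_scheduler import MultiStepLR

target = torch.tensor([1.0, -2.0, 3.0])
weight = torch.zeros(3, requires_grad=True)        # a bare tensor, as in Example #7

opt = torch.optim.Adam([weight], lr=0.02)
scheduler = MultiStepLR(opt, milestones=[x * 5 for x in range(1, 25)] + [200, 300, 400],
                        gamma=0.83)

for epoch in range(101):
    opt.zero_grad()
    loss = ((weight - target) ** 2).mean()         # stand-in for the association loss
    loss.backward()
    opt.step()
    scheduler.step()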
Example #8
def _main(
    meta_dir: str,
    save_prefix: str = '',
    model_name: str = 'refine_unet_base',  # or refine_spectrogram_unet
    save_dir: str = 'savedir',
    batch_size: int = 128,
    num_workers: int = 16,
    fix_len: float = 2.,
    lr: float = 5e-4,
    beta1: float = 0.5,
    beta2: float = 0.9,
    weight_decay: float = 0.0,
    max_step: int = 100000,
    valid_max_step: int = 30,
    save_interval: int = 1000,
    log_interval: int = 100,
    grad_clip: float = 0.0,
    grad_norm: float = 30.0,
    milestones: Tuple[int] = None,
    gamma: float = 0.2,
    is_augment: bool = True,
    is_dsd: bool = False,
    # model args
    hidden_dim: int = 768,
    filter_len: int = 512,
    hop_len: int = 64,
    block_layers: int = 4,
    layers: int = 4,
    kernel_size: int = 3,
    norm: str = 'ins',
    act: str = 'comp',
    refine_layers: int = 1,
):
    betas = beta1, beta2

    # setup model args
    model_args = {
        'hidden_dim': hidden_dim,
        'filter_len': filter_len,
        'hop_len': hop_len,
        'spec_dim': filter_len // 2 + 1,
        'block_layers': block_layers,
        'layers': layers,
        'kernel_size': kernel_size,
        'norm': norm,
        'refine_layers': refine_layers,
        'act': act
    }

    # create model
    model = build_model(model_name, extra_kwargs=model_args).cuda()

    # multi-gpu
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # create optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 betas=betas,
                                 weight_decay=weight_decay)

    if milestones:
        milestones = [int(x) for x in list(milestones)]
        scheduler = MultiStepLR(optimizer, milestones, gamma=gamma)
    else:
        scheduler = None

    # adopt dsd100 case
    if is_dsd:
        sr = 44100
        if is_augment:
            dataset_func = get_datasets
            meta_cls = DSD100Meta
        else:
            dataset_func = dsd100.get_datasets
    else:
        sr = 22050
        # load dataset
        if is_augment:
            dataset_func = get_datasets
            meta_cls = VoiceBankMeta
        else:
            dataset_func = voice_bank.get_datasets

    train_loader, valid_loader = dataset_func(meta_dir,
                                              batch_size=batch_size,
                                              num_workers=num_workers,
                                              meta_cls=meta_cls,
                                              fix_len=int(fix_len * sr),
                                              audio_mask=True)

    # train
    loss = Wave2WaveTrainer(model,
                            optimizer,
                            train_loader,
                            valid_loader,
                            max_step=max_step,
                            valid_max_step=min(valid_max_step,
                                               len(valid_loader)),
                            save_interval=save_interval,
                            log_interval=log_interval,
                            save_dir=save_dir,
                            save_prefix=save_prefix,
                            grad_clip=grad_clip,
                            grad_norm=grad_norm,
                            pretrained_path='',
                            scheduler=scheduler,
                            sr=sr).run()

    return {
        'loss': loss,
        'status': 'ok',
    }
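Example #8 only creates a scheduler when milestones are given and otherwise hands None to the trainer, so whatever loop consumes it has to guard the step call. A minimal sketch of that guard (hypothetical model and step loop, not the Wave2WaveTrainer internals):

import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR

model = nn.Linear(4, 1)                            # hypothetical model
optimizer = Adam(model.parameters(), lr=5e-4)
milestones = (60000, 80000)                        # may also be None or empty

scheduler = (MultiStepLR(optimizer, [int(m) for m in milestones], gamma=0.2)
             if milestones else None)

for step in range(100000):
    # ... forward, backward, optimizer.step() ...
    if scheduler is not None:
        scheduler.step()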
Example #9
                    batch_size=batch_size,
                    collate_fn=training.collate_pil)
'''
for i,(x,y) in enumerate(loader):
    mtcnn(x, save_path=y)
    print('\rBatch {} of {}'.format(i+1, len(loader)), end='')

del mtcnn
'''

resnet = InceptionResnetV1(classify=True,
                           pretrained='vggface2',
                           num_classes=len(dataset.class_to_idx)).to(device)

optimizer = optim.Adam(resnet.parameters(), lr=0.001)
scheduler = MultiStepLR(optimizer, [5, 10])

trans = transforms.Compose(
    [np.float32,
     transforms.ToTensor(), fixed_image_standardization])
dataset = datasets.ImageFolder(data_dir + 'cropped', transform=trans)
img_inds = np.arange(len(dataset))
np.random.shuffle(img_inds)
train_inds = img_inds[:int(0.8 * len(img_inds))]
val_inds = img_inds[int(0.8 * len(img_inds)):]

train_loader = DataLoader(dataset,
                          num_workers=workers,
                          batch_size=batch_size,
                          sampler=SubsetRandomSampler(train_inds))
val_loader = DataLoader(dataset,
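In the fragment above, MultiStepLR(optimizer, [5, 10]) is created without an explicit gamma, so the default gamma=0.1 applies: with the initial rate of 0.001 and one scheduler.step() per epoch, the learning rate is 1e-3 for epochs 0-4, 1e-4 for epochs 5-9 and 1e-5 from epoch 10 on. A tiny check of that schedule with a hypothetical one-layer model:

import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR

opt = Adam(nn.Linear(1, 1).parameters(), lr=0.001)
sched = MultiStepLR(opt, milestones=[5, 10])       # gamma defaults to 0.1

lrs = []
for epoch in range(12):
    lrs.append(opt.param_groups[0]['lr'])
    opt.step()
    sched.step()
print(lrs)   # ~1e-3 for epochs 0-4, ~1e-4 for 5-9, ~1e-5 afterwards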
Example #10
def main_worker(gpu, ngpus_per_node, cfg):
    cfg['GPU'] = gpu
    if gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass
    cfg['RANK'] = cfg['RANK'] * ngpus_per_node + gpu
    dist.init_process_group(backend=cfg['DIST_BACKEND'],
                            init_method=cfg["DIST_URL"],
                            world_size=cfg['WORLD_SIZE'],
                            rank=cfg['RANK'])

    # Data loading code
    batch_size = int(cfg['BATCH_SIZE'])
    per_batch_size = int(batch_size / ngpus_per_node)
    #workers = int((cfg['NUM_WORKERS'] + ngpus_per_node - 1) / ngpus_per_node) # dataload threads
    workers = int(cfg['NUM_WORKERS'])
    DATA_ROOT = cfg[
        'DATA_ROOT']  # the parent root where your train/val/test data are stored
    VAL_DATA_ROOT = cfg['VAL_DATA_ROOT']
    RECORD_DIR = cfg['RECORD_DIR']
    RGB_MEAN = cfg['RGB_MEAN']  # for normalize inputs
    RGB_STD = cfg['RGB_STD']
    DROP_LAST = cfg['DROP_LAST']
    LR_SCHEDULER = cfg['LR_SCHEDULER']
    LR_STEP_SIZE = cfg['LR_STEP_SIZE']
    LR_DECAY_EPOCH = cfg['LR_DECAY_EPOCH']
    LR_DECAT_GAMMA = cfg['LR_DECAT_GAMMA']
    LR_END = cfg['LR_END']
    WARMUP_EPOCH = cfg['WARMUP_EPOCH']
    WARMUP_LR = cfg['WARMUP_LR']
    NUM_EPOCH = cfg['NUM_EPOCH']
    USE_APEX = cfg['USE_APEX']
    EVAL_FREQ = cfg['EVAL_FREQ']
    SYNC_BN = cfg['SYNC_BN']
    print("=" * 60)
    print("Overall Configurations:")
    print(cfg)
    print("=" * 60)
    transform_list = [
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ]
    if cfg['RANDOM_ERASING']:
        transform_list.append(RandomErasing())
    if cfg['CUTOUT']:
        transform_list.append(Cutout())
    train_transform = transforms.Compose(transform_list)
    if cfg['RANDAUGMENT']:
        train_transform.transforms.insert(
            0, RandAugment(n=cfg['RANDAUGMENT_N'], m=cfg['RANDAUGMENT_M']))
    dataset_train = FaceDataset(DATA_ROOT, RECORD_DIR, train_transform)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset_train)
    train_loader = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=per_batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=DROP_LAST)
    SAMPLE_NUMS = dataset_train.get_sample_num_of_each_class()
    NUM_CLASS = len(train_loader.dataset.classes)
    print("Number of Training Classes: {}".format(NUM_CLASS))

    lfw, cfp_fp, agedb_30, vgg2_fp, lfw_issame, cfp_fp_issame, agedb_30_issame, vgg2_fp_issame = get_val_data(
        VAL_DATA_ROOT)

    #======= model & loss & optimizer =======#
    BACKBONE_DICT = {
        'MobileFaceNet': MobileFaceNet,
        'ResNet_50': ResNet_50,
        'ResNet_101': ResNet_101,
        'ResNet_152': ResNet_152,
        'IR_50': IR_50,
        'IR_100': IR_100,
        'IR_101': IR_101,
        'IR_152': IR_152,
        'IR_185': IR_185,
        'IR_200': IR_200,
        'IR_SE_50': IR_SE_50,
        'IR_SE_100': IR_SE_100,
        'IR_SE_101': IR_SE_101,
        'IR_SE_152': IR_SE_152,
        'IR_SE_185': IR_SE_185,
        'IR_SE_200': IR_SE_200,
        'AttentionNet_IR_56': AttentionNet_IR_56,
        'AttentionNet_IRSE_56': AttentionNet_IRSE_56,
        'AttentionNet_IR_92': AttentionNet_IR_92,
        'AttentionNet_IRSE_92': AttentionNet_IRSE_92,
        'PolyNet': PolyNet,
        'PolyFace': PolyFace,
        'EfficientPolyFace': EfficientPolyFace,
        'ResNeSt_50': resnest50,
        'ResNeSt_101': resnest101,
        'ResNeSt_100': resnest100,
        'GhostNet': GhostNet,
        'MobileNetV3': MobileNetV3,
        'ProxylessNAS': proxylessnas
    }  #'HRNet_W30': HRNet_W30, 'HRNet_W32': HRNet_W32, 'HRNet_W40': HRNet_W40, 'HRNet_W44': HRNet_W44, 'HRNet_W48': HRNet_W48, 'HRNet_W64': HRNet_W64

    BACKBONE_NAME = cfg['BACKBONE_NAME']
    INPUT_SIZE = cfg['INPUT_SIZE']
    assert INPUT_SIZE == [112, 112]
    backbone = BACKBONE_DICT[BACKBONE_NAME](INPUT_SIZE)
    print("=" * 60)
    print(backbone)
    print("{} Backbone Generated".format(BACKBONE_NAME))
    print("=" * 60)
    HEAD_DICT = {
        'Softmax': Softmax,
        'ArcFace': ArcFace,
        'Combined': Combined,
        'CosFace': CosFace,
        'SphereFace': SphereFace,
        'Am_softmax': Am_softmax,
        'CurricularFace': CurricularFace,
        'ArcNegFace': ArcNegFace,
        'SVX': SVXSoftmax,
        'AirFace': AirFace,
        'QAMFace': QAMFace,
        'CircleLoss': CircleLoss
    }
    HEAD_NAME = cfg['HEAD_NAME']
    EMBEDDING_SIZE = cfg['EMBEDDING_SIZE']  # feature dimension
    head = HEAD_DICT[HEAD_NAME](in_features=EMBEDDING_SIZE,
                                out_features=NUM_CLASS)
    print("Params: ", count_model_params(backbone))
    print("Flops:", count_model_flops(backbone))
    #backbone = backbone.eval()
    #print("Flops: ", flops_to_string(2*float(profile_macs(backbone.eval(), torch.randn(1, 3, 112, 112)))))
    #backbone = backbone.train()
    print("=" * 60)
    print(head)
    print("{} Head Generated".format(HEAD_NAME))
    print("=" * 60)

    #--------------------optimizer-----------------------------
    if BACKBONE_NAME.find("IR") >= 0:
        backbone_paras_only_bn, backbone_paras_wo_bn = separate_irse_bn_paras(
            backbone
        )  # separate batch_norm parameters from others; do not do weight decay for batch_norm parameters to improve the generalizability
    else:
        backbone_paras_only_bn, backbone_paras_wo_bn = separate_resnet_bn_paras(
            backbone
        )  # separate batch_norm parameters from others; do not do weight decay for batch_norm parameters to improve the generalizability

    LR = cfg['LR']  # initial LR
    WEIGHT_DECAY = cfg['WEIGHT_DECAY']
    MOMENTUM = cfg['MOMENTUM']
    optimizer = optim.SGD(
        [{
            'params': backbone_paras_wo_bn + list(head.parameters()),
            'weight_decay': WEIGHT_DECAY
        }, {
            'params': backbone_paras_only_bn
        }],
        lr=LR,
        momentum=MOMENTUM)
    if LR_SCHEDULER == 'step':
        scheduler = StepLR(optimizer,
                           step_size=LR_STEP_SIZE,
                           gamma=LR_DECAT_GAMMA)
    elif LR_SCHEDULER == 'multi_step':
        scheduler = MultiStepLR(optimizer,
                                milestones=LR_DECAY_EPOCH,
                                gamma=LR_DECAT_GAMMA)
    elif LR_SCHEDULER == 'cosine':
        scheduler = CosineWarmupLR(optimizer,
                                   batches=len(train_loader),
                                   epochs=NUM_EPOCH,
                                   base_lr=LR,
                                   target_lr=LR_END,
                                   warmup_epochs=WARMUP_EPOCH,
                                   warmup_lr=WARMUP_LR)

    print("=" * 60)
    print(optimizer)
    print("Optimizer Generated")
    print("=" * 60)

    # loss
    LOSS_NAME = cfg['LOSS_NAME']
    LOSS_DICT = {
        'Softmax': nn.CrossEntropyLoss(),
        'LabelSmooth': LabelSmoothCrossEntropyLoss(classes=NUM_CLASS),
        'Focal': FocalLoss(),
        'HM': HardMining(),
        'Softplus': nn.Softplus()
    }
    loss = LOSS_DICT[LOSS_NAME].cuda(gpu)
    print("=" * 60)
    print(loss)
    print("{} Loss Generated".format(loss))
    print("=" * 60)

    torch.cuda.set_device(cfg['GPU'])
    backbone.cuda(cfg['GPU'])
    head.cuda(cfg['GPU'])

    #optionally resume from a checkpoint
    BACKBONE_RESUME_ROOT = cfg[
        'BACKBONE_RESUME_ROOT']  # the root to resume training from a saved checkpoint
    HEAD_RESUME_ROOT = cfg[
        'HEAD_RESUME_ROOT']  # the root to resume training from a saved checkpoint
    IS_RESUME = cfg['IS_RESUME']
    if IS_RESUME:
        print("=" * 60)
        if os.path.isfile(BACKBONE_RESUME_ROOT):
            print("Loading Backbone Checkpoint '{}'".format(
                BACKBONE_RESUME_ROOT))
            loc = 'cuda:{}'.format(cfg['GPU'])
            backbone.load_state_dict(
                torch.load(BACKBONE_RESUME_ROOT, map_location=loc))
            if os.path.isfile(HEAD_RESUME_ROOT):
                print("Loading Head Checkpoint '{}'".format(HEAD_RESUME_ROOT))
                checkpoint = torch.load(HEAD_RESUME_ROOT, map_location=loc)
                cfg['START_EPOCH'] = checkpoint['EPOCH']
                head.load_state_dict(checkpoint['HEAD'])
                optimizer.load_state_dict(checkpoint['OPTIMIZER'])
                del (checkpoint)
        else:
            print(
                "No Checkpoint Found at '{}' and '{}'. Please Have a Check or Continue to Train from Scratch"
                .format(BACKBONE_RESUME_ROOT, HEAD_RESUME_ROOT))
        print("=" * 60)
    ori_backbone = copy.deepcopy(backbone)
    if SYNC_BN:
        backbone = apex.parallel.convert_syncbn_model(backbone)
    if USE_APEX:
        [backbone, head], optimizer = amp.initialize([backbone, head],
                                                     optimizer,
                                                     opt_level='O2')
        backbone = DDP(backbone)
        head = DDP(head)
    else:
        backbone = torch.nn.parallel.DistributedDataParallel(
            backbone, device_ids=[cfg['GPU']])
        head = torch.nn.parallel.DistributedDataParallel(
            head, device_ids=[cfg['GPU']])

    # checkpoint and tensorboard dir
    MODEL_ROOT = cfg['MODEL_ROOT']  # the root to buffer your checkpoints
    LOG_ROOT = cfg['LOG_ROOT']  # the root to log your train/val status

    os.makedirs(MODEL_ROOT, exist_ok=True)
    os.makedirs(LOG_ROOT, exist_ok=True)

    writer = SummaryWriter(
        LOG_ROOT)  # writer for buffering intermediate results
    # train
    for epoch in range(cfg['START_EPOCH'], cfg['NUM_EPOCH']):
        train_sampler.set_epoch(epoch)
        if LR_SCHEDULER != 'cosine':
            scheduler.step()
        #train for one epoch
        DISP_FREQ = 100  # display every 100 batches
        batch = 0  # batch index
        backbone.train()  # set to training mode
        head.train()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        for inputs, labels in tqdm(iter(train_loader)):
            if LR_SCHEDULER == 'cosine':
                scheduler.step()
            # compute output
            start_time = time.time()
            inputs = inputs.cuda(cfg['GPU'], non_blocking=True)
            labels = labels.cuda(cfg['GPU'], non_blocking=True)

            if cfg['MIXUP']:
                inputs, labels_a, labels_b, lam = mixup_data(
                    inputs, labels, cfg['GPU'], cfg['MIXUP_PROB'],
                    cfg['MIXUP_ALPHA'])
                inputs, labels_a, labels_b = map(Variable,
                                                 (inputs, labels_a, labels_b))
            elif cfg['CUTMIX']:
                inputs, labels_a, labels_b, lam = cutmix_data(
                    inputs, labels, cfg['GPU'], cfg['CUTMIX_PROB'],
                    cfg['MIXUP_ALPHA'])
                inputs, labels_a, labels_b = map(Variable,
                                                 (inputs, labels_a, labels_b))
            features = backbone(inputs)
            outputs = head(features, labels)

            if cfg['MIXUP'] or cfg['CUTMIX']:
                lossx = mixup_criterion(loss, outputs, labels_a, labels_b, lam)
            else:
                lossx = loss(outputs,
                             labels) if HEAD_NAME != 'CircleLoss' else loss(
                                 outputs).mean()
            end_time = time.time()
            duration = end_time - start_time
            if ((batch + 1) % DISP_FREQ == 0) and batch != 0:
                print("batch inference time", duration)

            # compute gradient and do SGD step
            optimizer.zero_grad()
            if USE_APEX:
                with amp.scale_loss(lossx, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                lossx.backward()
            optimizer.step()

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, labels, topk=(
                1, 5)) if HEAD_NAME != 'CircleLoss' else accuracy(
                    features.data, labels, topk=(1, 5))
            losses.update(lossx.data.item(), inputs.size(0))
            top1.update(prec1.data.item(), inputs.size(0))
            top5.update(prec5.data.item(), inputs.size(0))
            # display training loss & acc every DISP_FREQ
            if ((batch + 1) % DISP_FREQ == 0) or batch == 0:
                print("=" * 60)
                print('Epoch {}/{} Batch {}/{}\t'
                      'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch + 1,
                          cfg['NUM_EPOCH'],
                          batch + 1,
                          len(train_loader),
                          loss=losses,
                          top1=top1,
                          top5=top5))
                print("=" * 60)

            # perform validation & save checkpoints per epoch
            # validation statistics per epoch (buffer for visualization)
            if (batch + 1) % EVAL_FREQ == 0:
                #lr = scheduler.get_last_lr()
                lr = optimizer.param_groups[0]['lr']
                print("Current lr", lr)
                print("=" * 60)
                print(
                    "Perform Evaluation on LFW, CFP_FP, AgeD and VGG2_FP, and Save Checkpoints..."
                )
                accuracy_lfw, best_threshold_lfw, roc_curve_lfw = perform_val(
                    EMBEDDING_SIZE, per_batch_size, backbone, lfw, lfw_issame)
                buffer_val(writer, "LFW", accuracy_lfw, best_threshold_lfw,
                           roc_curve_lfw, epoch + 1)
                accuracy_cfp_fp, best_threshold_cfp_fp, roc_curve_cfp_fp = perform_val(
                    EMBEDDING_SIZE, per_batch_size, backbone, cfp_fp,
                    cfp_fp_issame)
                buffer_val(writer, "CFP_FP", accuracy_cfp_fp,
                           best_threshold_cfp_fp, roc_curve_cfp_fp, epoch + 1)
                accuracy_agedb_30, best_threshold_agedb_30, roc_curve_agedb_30 = perform_val(
                    EMBEDDING_SIZE, per_batch_size, backbone, agedb_30,
                    agedb_30_issame)
                buffer_val(writer, "AgeDB", accuracy_agedb_30,
                           best_threshold_agedb_30, roc_curve_agedb_30,
                           epoch + 1)
                accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp = perform_val(
                    EMBEDDING_SIZE, per_batch_size, backbone, vgg2_fp,
                    vgg2_fp_issame)
                buffer_val(writer, "VGGFace2_FP", accuracy_vgg2_fp,
                           best_threshold_vgg2_fp, roc_curve_vgg2_fp,
                           epoch + 1)
                print(
                    "Epoch {}/{}, Evaluation: LFW Acc: {}, CFP_FP Acc: {}, AgeDB Acc: {}, VGG2_FP Acc: {}"
                    .format(epoch + 1, cfg['NUM_EPOCH'], accuracy_lfw,
                            accuracy_cfp_fp, accuracy_agedb_30,
                            accuracy_vgg2_fp))
                print("=" * 60)

                print("=" * 60)
                print("Save Checkpoint...")
                if cfg['RANK'] % ngpus_per_node == 0:
                    #torch.save(backbone.module.state_dict(), os.path.join(MODEL_ROOT, "Backbone_{}_Epoch_{}_Time_{}_checkpoint.pth".format(BACKBONE_NAME, epoch + 1, get_time())))
                    #save_dict = {'EPOCH': epoch+1,
                    #            'HEAD': head.module.state_dict(),
                    #            'OPTIMIZER': optimizer.state_dict()}
                    #torch.save(save_dict, os.path.join(MODEL_ROOT, "Head_{}_Epoch_{}_Time_{}_checkpoint.pth".format(HEAD_NAME, epoch + 1, get_time())))
                    ori_backbone.load_state_dict(backbone.module.state_dict())
                    ori_backbone.eval()
                    x = torch.randn(1, 3, 112, 112).cuda()
                    traced_cell = torch.jit.trace(ori_backbone, (x))
                    #torch.save(ori_backbone, os.path.join(MODEL_ROOT, "model.pth"))
                    torch.jit.save(
                        traced_cell,
                        os.path.join(
                            MODEL_ROOT,
                            "Epoch_{}_Time_{}_checkpoint.pth".format(
                                epoch + 1, get_time())))
            sys.stdout.flush()
            batch += 1  # batch index
        epoch_loss = losses.avg
        epoch_acc = top1.avg
        print("=" * 60)
        print('Epoch: {}/{}\t'
              'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
              'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
              'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                  epoch + 1,
                  cfg['NUM_EPOCH'],
                  loss=losses,
                  top1=top1,
                  top5=top5))
        sys.stdout.flush()
        print("=" * 60)
        if cfg['RANK'] % ngpus_per_node == 0:
            writer.add_scalar("Training_Loss", epoch_loss, epoch + 1)
            writer.add_scalar("Training_Accuracy", epoch_acc, epoch + 1)
            writer.add_scalar("Top1", top1.avg, epoch + 1)
            writer.add_scalar("Top5", top5.avg, epoch + 1)
Example #11
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--eid', type=int, default=-1)
    parser.add_argument('--gpu_id', type=int, nargs='+', default=0)
    parser.add_argument('--yaml_file',
                        type=str,
                        default='configs/demo/mini/20way_1shot.yaml')
    outside_opts = parser.parse_args()
    if isinstance(outside_opts.gpu_id, int):
        outside_opts.gpu_id = [outside_opts.gpu_id]  # int -> list

    config = {}
    config['options'] = {
        'ctrl.yaml_file': outside_opts.yaml_file,
        'ctrl.gpu_id': outside_opts.gpu_id
    }
    opts = Config(config['options']['ctrl.yaml_file'], config['options'])
    opts.setup()

    # DATA
    meta_test = None
    train_db_list, val_db_list, _, _ = data_loader(opts)

    # MODEL
    # NOTE: we use cpu mode for demo; change to gpu for experiments
    net = CTMNet(opts).to(opts.ctrl.device)

    net_summary, param_num = model_summarize(net)
    opts.logger('Model size: param num # {:f} Mb'.format(param_num))
    opts.model.param_size = param_num

    resume_model(net, opts)
    if opts.ctrl.multi_gpu:
        opts.logger('Wrapping network into multi-gpu mode ...')
        net = torch.nn.DataParallel(net)

    # OPTIM AND LR SCHEDULE
    if opts.train.optim == 'adam':
        optimizer = optim.Adam(net.parameters(),
                               lr=opts.train.lr,
                               weight_decay=opts.train.weight_decay)
    elif opts.train.optim == 'sgd':
        optimizer = optim.SGD(net.parameters(),
                              lr=opts.train.lr,
                              weight_decay=opts.train.weight_decay,
                              momentum=opts.train.momentum)
    elif opts.train.optim == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(),
                                  lr=opts.train.lr,
                                  weight_decay=opts.train.weight_decay,
                                  momentum=opts.train.momentum,
                                  alpha=0.9,
                                  centered=True)
    if opts.train.lr_policy == 'multi_step':
        scheduler = MultiStepLR(optimizer,
                                milestones=opts.train.lr_scheduler,
                                gamma=opts.train.lr_gamma)
    elif opts.train.lr_policy == 'exp':
        scheduler = ExponentialLR(optimizer, gamma=opts.train.lr_gamma)
    if opts.model.structure == 'original':
        # ignore previous setting
        optimizer = optim.Adam(net.parameters(), lr=0.001)
        scheduler = StepLR(optimizer, step_size=100, gamma=0.5)
        opts.train.lr_policy = 'step'
        opts.train.step_size = 100 if not opts.data.use_ori_relation else 3
        opts.train.lr_scheduler = [-1]
        opts.train.lr = 0.001
        opts.train.lr_gamma = 0.5
        opts.train.weight_decay = .0

    # VISUALIZE
    if opts.misc.vis.use:
        if opts.misc.vis.method == 'tensorboard':
            raise NotImplementedError()
        elif opts.misc.vis.method == 'visdom':
            if opts.io.resume:
                try:
                    vis = Visualizer(opts, net.previous_loss_data)
                except AttributeError:  # net may be wrapped in DataParallel
                    vis = Visualizer(opts, net.module.previous_loss_data)
            else:
                vis = Visualizer(opts)

    if not opts.ctrl.eager:
        opts.print_args()
        opts.logger(net)
    else:
        opts.logger('config file is {:s}'.format(opts.ctrl.yaml_file))
        opts.logger('configs not shown here in eager mode ...')
        opts.logger(net)

    # ###############################################
    # ################## PIPELINE ###################
    best_accuracy = opts.io.previous_acc
    RESET_BEST_ACC = True  # for evolutionary train
    last_epoch, last_iter = opts.io.saved_epoch, opts.io.saved_iter
    opts.logger('CTM Pipeline starts now !!! (cpu demo purpose)')
    show_str = '[TRAIN FROM SCRATCH] LOG' if not opts.io.resume else '[RESUME] LOG'
    opts.logger('{}\n'.format(show_str))

    total_ep = opts.train.nep
    if opts.ctrl.start_epoch > 0 or opts.ctrl.start_iter > 0:
        assert opts.io.resume
        RESUME = True
    else:
        RESUME = False

    for epoch in range(opts.ctrl.start_epoch, total_ep):

        if epoch > opts.ctrl.start_epoch and opts.data.change_on_every_ep:
            opts.logger('')
            opts.logger('Changing a new set of data at new epoch ...')
            train_db_list, val_db_list, _, _ = data_loader(opts)

        # adjust learning rate
        old_lr = optimizer.param_groups[0]['lr']
        scheduler.step(epoch)
        new_lr = optimizer.param_groups[0]['lr']
        if epoch == opts.ctrl.start_epoch:
            opts.logger('Start lr is {:.8f}, at epoch {}\n'.format(
                old_lr, epoch))
        if new_lr != old_lr:
            opts.logger(
                'LR changes from {:.8f} to {:.8f} at epoch {:d}\n'.format(
                    old_lr, new_lr, epoch))

        # select proper train_db (legacy reason)
        which_ind = 0
        curr_shot = opts.fsl.k_shot[0]
        curr_query = opts.fsl.k_query[
            0]  # only for display (for evolutionary train)
        train_db = train_db_list[0]
        val_db = val_db_list[0]
        total_iter = opts.ctrl.total_iter_train[0]
        eval_length = opts.ctrl.total_iter_val[0]

        for step, batch in enumerate(train_db):

            step_t = time.time()
            if RESUME:
                if step < opts.ctrl.start_iter:
                    continue
                else:
                    RESUME = False

            if step >= total_iter:
                break

            support_x, support_y, query_x, query_y = process_input(
                batch, opts, mode='train')
            loss, _ = net.forward_CTM(support_x, support_y, query_x, query_y,
                                      True)
            loss = loss.mean(0)
            vis_loss = loss.data.cpu().numpy()

            vis_loss *= opts.train.total_loss_fac
            loss *= opts.train.total_loss_fac

            if len(loss) > 1:
                total_loss = loss[0]
            else:
                total_loss = loss

            optimizer.zero_grad()
            total_loss.backward()
            if opts.train.clip_grad:
                # doesn't affect that much
                torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)
            optimizer.step()

            iter_time = (time.time() - step_t)
            left_time = compute_left_time(iter_time, epoch, total_ep, step,
                                          total_iter)

            # SHOW TRAIN LOSS
            if step % opts.io.iter_vis_loss == 0 or step == total_iter - 1:
                opts.logger(
                    opts.io.loss_vis_str.format(epoch, total_ep, step,
                                                total_iter, total_loss.item()))
                # time
                if step % (1000 * opts.io.iter_vis_loss) == 0 or step == total_iter - 1:
                    opts.logger(
                        opts.io.time_vis_str.format(left_time[0], left_time[1],
                                                    left_time[2]))

            # VALIDATION and SAVE BEST MODEL
            if epoch > opts.test.do_after_ep and \
                    ((step % opts.io.iter_do_val == 0 and step > 0) or step == total_iter - 1):

                # execute once only
                if RESET_BEST_ACC and opts.fsl.evolution and epoch >= opts.fsl.epoch_schedule[
                        -1]:
                    best_accuracy, last_epoch, last_iter = -1.0, -1, -1
                    RESET_BEST_ACC = False

                arguments = {
                    'step': step,
                    'epoch': epoch,
                    'eval_length': eval_length,
                    'which_ind': which_ind,
                    'curr_shot': curr_shot,
                    'curr_query': curr_query,
                    'best_accuracy': best_accuracy,
                    'last_epoch': last_epoch,
                    'last_iter': last_iter,
                    'new_lr': new_lr,
                    'train_db': train_db,
                    'total_iter': total_iter,
                    'optimizer': optimizer,
                    'meta_test': meta_test
                }
                try:
                    stats = run_test(opts, val_db, net, vis, **arguments)
                except RuntimeError:
                    vis.show_dynamic_info(phase='error')
                    stats = [-1]  # skip the best-model update when validation fails
                if sum(stats) != -1:
                    best_accuracy, last_epoch, last_iter = stats[0], stats[
                        1], stats[2]
            # DONE with validation process

    opts.logger('')
    opts.logger('Training done! check your work using:')
    if opts.misc.vis.use and opts.misc.vis.method == 'visdom':
        vis.show_dynamic_info(phase='train_finish')
        if not opts.ctrl.eager:
            opts.logger('visdom state saved!')
            vis.save()
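
The loop above logs an ETA via `compute_left_time`, which is not shown in this snippet. A plausible sketch, inferred only from the call site and the three-field time string it feeds, is:

def compute_left_time(iter_time, epoch, total_ep, step, total_iter):
    # Estimate remaining wall-clock time from the measured per-iteration cost:
    # iterations left in this epoch plus all iterations of the remaining epochs.
    iters_left = (total_iter - step - 1) + (total_ep - epoch - 1) * total_iter
    seconds = int(iter_time * iters_left)
    hours, rest = divmod(seconds, 3600)
    minutes, seconds = divmod(rest, 60)
    return hours, minutes, seconds
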
def main(args: argparse.Namespace):
    logger = CompleteLogger(args.log, args.phase)
    print(args)

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    cudnn.benchmark = True

    # Data loading code
    normalize = T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    train_transform = T.Compose([
        T.RandomRotation(args.rotation),
        T.RandomResizedCrop(size=args.image_size, scale=args.resize_scale),
        T.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25),
        T.GaussianBlur(),
        T.ToTensor(), normalize
    ])
    val_transform = T.Compose(
        [T.Resize(args.image_size),
         T.ToTensor(), normalize])
    image_size = (args.image_size, args.image_size)
    heatmap_size = (args.heatmap_size, args.heatmap_size)
    source_dataset = datasets.__dict__[args.source]
    train_source_dataset = source_dataset(root=args.source_root,
                                          transforms=train_transform,
                                          image_size=image_size,
                                          heatmap_size=heatmap_size)
    train_source_loader = DataLoader(train_source_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     pin_memory=True,
                                     drop_last=True)
    val_source_dataset = source_dataset(root=args.source_root,
                                        split='test',
                                        transforms=val_transform,
                                        image_size=image_size,
                                        heatmap_size=heatmap_size)
    val_source_loader = DataLoader(val_source_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   pin_memory=True)

    target_dataset = datasets.__dict__[args.target]
    train_target_dataset = target_dataset(root=args.target_root,
                                          transforms=train_transform,
                                          image_size=image_size,
                                          heatmap_size=heatmap_size)
    train_target_loader = DataLoader(train_target_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     pin_memory=True,
                                     drop_last=True)
    val_target_dataset = target_dataset(root=args.target_root,
                                        split='test',
                                        transforms=val_transform,
                                        image_size=image_size,
                                        heatmap_size=heatmap_size)
    val_target_loader = DataLoader(val_target_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   pin_memory=True)

    print("Source train:", len(train_source_loader))
    print("Target train:", len(train_target_loader))
    print("Source test:", len(val_source_loader))
    print("Target test:", len(val_target_loader))

    train_source_iter = ForeverDataIterator(train_source_loader)
    train_target_iter = ForeverDataIterator(train_target_loader)

    # create model
    model = models.__dict__[args.arch](
        num_keypoints=train_source_dataset.num_keypoints).to(device)
    criterion = JointsMSELoss()

    # define optimizer and lr scheduler
    optimizer = Adam(model.get_parameters(lr=args.lr))
    lr_scheduler = MultiStepLR(optimizer, args.lr_step, args.lr_factor)

    # optionally resume from a checkpoint
    start_epoch = 0
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        start_epoch = checkpoint['epoch'] + 1

    # define visualization function
    tensor_to_image = Compose([
        Denormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ToPILImage()
    ])

    def visualize(image, keypoint2d, name):
        """
        Args:
            image (tensor): image in shape 3 x H x W
            keypoint2d (tensor): keypoints in shape K x 2
            name: name of the saving image
        """
        train_source_dataset.visualize(
            tensor_to_image(image), keypoint2d,
            logger.get_image_path("{}.jpg".format(name)))

    if args.phase == 'test':
        # evaluate on validation set
        source_val_acc = validate(val_source_loader, model, criterion, None,
                                  args)
        target_val_acc = validate(val_target_loader, model, criterion,
                                  visualize, args)
        print("Source: {:4.3f} Target: {:4.3f}".format(source_val_acc['all'],
                                                       target_val_acc['all']))
        for name, acc in target_val_acc.items():
            print("{}: {:4.3f}".format(name, acc))
        return

    # start training
    best_acc = 0
    for epoch in range(start_epoch, args.epochs):
        logger.set_epoch(epoch)
        lr_scheduler.step()

        # train for one epoch
        train(train_source_iter, train_target_iter, model, criterion,
              optimizer, epoch, visualize if args.debug else None, args)

        # evaluate on validation set
        source_val_acc = validate(val_source_loader, model, criterion, None,
                                  args)
        target_val_acc = validate(val_target_loader, model, criterion,
                                  visualize if args.debug else None, args)

        # remember best acc and save checkpoint
        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args
            }, logger.get_checkpoint_path(epoch))
        if target_val_acc['all'] > best_acc:
            shutil.copy(logger.get_checkpoint_path(epoch),
                        logger.get_checkpoint_path('best'))
            best_acc = target_val_acc['all']
        print("Source: {:4.3f} Target: {:4.3f} Target(best): {:4.3f}".format(
            source_val_acc['all'], target_val_acc['all'], best_acc))
        for name, acc in target_val_acc.items():
            print("{}: {:4.3f}".format(name, acc))

    logger.close()
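
`ForeverDataIterator` lets the training loop above draw batches from the source and target loaders without tracking epoch boundaries. The class ships with the library this example builds on; a minimal sketch of such a wrapper, as an illustration only, is:

class ForeverDataIterator:
    """Wraps a DataLoader so that next() never raises StopIteration;
    the underlying loader is simply restarted when exhausted."""

    def __init__(self, data_loader):
        self.data_loader = data_loader
        self.iter = iter(data_loader)

    def __next__(self):
        try:
            return next(self.iter)
        except StopIteration:
            self.iter = iter(self.data_loader)
            return next(self.iter)

    def __len__(self):
        return len(self.data_loader)
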
Example #13
0
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

vgg16_cifar100 = models.vgg16_bn(pretrained=False, **{'num_classes': 100})
vgg16_cifar100 = vgg16_cifar100.cuda()

criterion = nn.CrossEntropyLoss()

optimizer = optim.SGD(vgg16_cifar100.parameters(),
                      lr=0.01,
                      weight_decay=0.0005,
                      momentum=0.9)

sched = MultiStepLR(optimizer, milestones=[20, 30], gamma=0.1)

train_log, val_log, vgg16_cifar100_1_acc, vgg16_cifar100 = train(
    vgg16_cifar100, optimizer, criterion, dataset_loader_train,
    dataset_loader_test, 40, sched, 31, 1, 100)

torch.save(vgg16_cifar100.state_dict(), 'vgg16_cifar100_40ep_1.pt')

# Second model:
random.seed(8)
np.random.seed(8)
torch.manual_seed(8)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(8)

vgg16_cifar100 = models.vgg16_bn(pretrained=False, **{'num_classes': 100})
Example #14
0
def train(args, dataset_train, rnn, output, node_f_gen=None, edge_f_gen=None):
    # check if load existing model
    if args.load:
        fname = args.model_save_path + args.fname + 'lstm_' + str(
            args.load_epoch) + '.dat'
        rnn.load_state_dict(torch.load(fname))
        fname = args.model_save_path + args.fname + 'output_' + str(
            args.load_epoch) + '.dat'
        output.load_state_dict(torch.load(fname))

        args.lr = 0.00001
        epoch = args.load_epoch
        print('model loaded! lr: {}'.format(args.lr))
    else:
        epoch = 1

    # initialize optimizer
    optimizer_rnn = optim.Adam(list(rnn.parameters()), lr=args.lr)
    optimizer_output = optim.Adam(list(output.parameters()), lr=args.lr)

    scheduler_rnn = MultiStepLR(optimizer_rnn,
                                milestones=args.milestones,
                                gamma=args.lr_rate)
    scheduler_output = MultiStepLR(optimizer_output,
                                   milestones=args.milestones,
                                   gamma=args.lr_rate)

    # start main loop
    time_all = np.zeros(args.epochs)
    while epoch <= args.epochs:
        time_start = tm.time()
        # train
        if 'GraphRNN_VAE' in args.note:
            train_vae_epoch(epoch, args, rnn, output, dataset_train,
                            optimizer_rnn, optimizer_output, scheduler_rnn,
                            scheduler_output)
        elif 'GraphRNN_MLP' in args.note:
            train_mlp_epoch(epoch, args, rnn, output, dataset_train,
                            optimizer_rnn, optimizer_output, scheduler_rnn,
                            scheduler_output)
        elif 'GraphRNN_RNN' in args.note:
            train_rnn_epoch(epoch, args, rnn, output, dataset_train,
                            optimizer_rnn, optimizer_output, scheduler_rnn,
                            scheduler_output, node_f_gen, edge_f_gen)
        time_end = tm.time()
        time_all[epoch - 1] = time_end - time_start
        # test
        if epoch % args.epochs_test == 0 and epoch >= args.epochs_test_start:
            for sample_time in range(1, 4):
                G_pred = []
                while len(G_pred) < args.test_total_size:
                    if 'GraphRNN_VAE' in args.note:
                        G_pred_step = test_vae_epoch(
                            epoch,
                            args,
                            rnn,
                            output,
                            test_batch_size=args.test_batch_size,
                            sample_time=sample_time)
                    elif 'GraphRNN_MLP' in args.note:
                        G_pred_step = test_mlp_epoch(
                            epoch,
                            args,
                            rnn,
                            output,
                            test_batch_size=args.test_batch_size,
                            sample_time=sample_time)
                    elif 'GraphRNN_RNN' in args.note:
                        G_pred_step = test_rnn_epoch(
                            epoch,
                            args,
                            rnn,
                            output,
                            node_f_gen,
                            test_batch_size=args.test_batch_size)
                    G_pred.extend(G_pred_step)
                # save graphs
                fname = args.graph_save_path + args.fname_pred + str(
                    epoch) + '_' + str(sample_time) + '.dat'
                save_graph_list(G_pred, fname)
                if 'GraphRNN_RNN' in args.note:
                    break
            print('test done, graphs saved')

        # save model checkpoint
        if args.save:
            if epoch % args.epochs_save == 0:
                fname = args.model_save_path + args.fname + 'lstm_' + str(
                    epoch) + '.dat'
                torch.save(rnn.state_dict(), fname)
                fname = args.model_save_path + args.fname + 'output_' + str(
                    epoch) + '.dat'
                torch.save(output.state_dict(), fname)
        epoch += 1
    np.save(args.timing_save_path + args.fname, time_all)
    iterator.index_with(vocab)

    val_iterator = AdvancedBucketIterator(
        batch_size=2,
        sorting_keys=[("sentence", "num_tokens")],
    )
    val_iterator.index_with(vocab)

    USE_CUDA = True
    if USE_CUDA:
        model = model.cuda()

    num_epochs = 30

    learning_rate_scheduler = LearningRateWithoutMetricsWrapper(
        MultiStepLR(optimizer, [10, 20, 40], gamma=0.25, last_epoch=-1))

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        validation_iterator=val_iterator,
        #train_dataset=dataset_train_no_punct + datasets['train'],
        train_dataset=dataset_all_no_punct + datasets['all'],
        #                   validation_dataset=datasets['val'],
        patience=10,
        num_epochs=num_epochs,
        learning_rate_scheduler=learning_rate_scheduler,
        model_save_interval=10,
        cuda_device=0)
    trainer.train()
        print('Train epoch: {:.0f}, it: {:.0f}, loss: {:.4f}, loss_hr: {:.4f}, loss_img: {:.4f}, loss_cross: {:.4f}, loss_snr: {:.4f}'.format(
            epoch, batch_idx, loss, loss_hr, loss_img, loss_cross, loss_SNR));

def test():
    net.eval()
    test_loss = 0;

    for (data, hr, fps, bvp, idx) in test_loader:

        data = Variable(data);
        hr = Variable(hr.view(-1,1));
        data, hr = data.cuda(), hr.cuda();

        feat_hr, feat_n, output, img_out, feat_hrf1, feat_nf1, hrf1, idx1, feat_hrf2, feat_nf2, hrf2, idx2, ecg, ecg1, ecg2 = net(data, epoch);
        loss = lossfunc_HR(output, hr);

        test_loss += loss.item();

begin_epoch = 1;
scheduler = MultiStepLR(optimizer, milestones=[30,80], gamma=0.5)
for epoch in range(begin_epoch, epoch_num + 1):
    if epoch > 20:
        train_dataset.transform = transforms.Compose([resize, toTensor]);
        train_dataset.VerticalFlip = False;

        train_loader = DataLoader(train_dataset, batch_size=batch_size_num,
                                  shuffle=True, num_workers=4);

    train();
    test();
Example #17
0
            lr=args.learning_rate,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    else:
        optimizer = optim.Adam(params,
                               lr=args.learning_rate,
                               weight_decay=args.weight_decay)

    if args.scheduler == "CosineAnnealing":
        scheduler = CosineAnnealingLR(optimizer=optimizer,
                                      T_max=args.max_epochs,
                                      eta_min=0)
    else:
        scheduler = MultiStepLR(optimizer,
                                milestones=args.milestones,
                                gamma=0.2)

    #############################################RESTART/RESTORE/RESUME#################

    restore_fields = {
        "model":
        model if not isinstance(model, nn.DataParallel) else model.module,
        "optimizer": optimizer,
        "scheduler": scheduler,
    }

    start_epoch = 0
    if args.resume:
        resume_epoch = restore_model(
            restore_fields,
Example #18
0
def tc_trans2():
    global args, best_mae_error

    # load data
    dataset = CIFData(*args.data_options)
    collate_fn = collate_pool

    # obtain target value normalizer
    if args.task == 'classification':
        normalizer = Normalizer(torch.zeros(2))
        normalizer.load_state_dict({'mean': 0., 'std': 1.})
    else:
        if len(dataset) < 500:
            warnings.warn('Dataset has less than 500 data points. '
                          'Lower accuracy is expected. ')
            sample_data_list = [dataset[i] for i in range(len(dataset))]
        else:
            sample_data_list = [
                dataset[i] for i in sample(range(len(dataset)), 500)
            ]
        _, sample_target, _ = collate_pool(sample_data_list)
        normalizer = Normalizer(sample_target)

    # build model
    structures, _, _ = dataset[0]
    orig_atom_fea_len = structures[0].shape[-1]
    nbr_fea_len = structures[1].shape[-1]
    model_a = CrystalGraphConvNet(
        orig_atom_fea_len,
        nbr_fea_len,
        atom_fea_len=args.atom_fea_len,
        n_conv=args.n_conv,
        h_fea_len=args.h_fea_len,
        n_h=args.n_h,
        classification=True if args.task == 'classification' else False)
    model_b = CrystalGraphConvNet(
        orig_atom_fea_len,
        nbr_fea_len,
        atom_fea_len=args.atom_fea_len,
        n_conv=args.n_conv,
        h_fea_len=args.h_fea_len,
        n_h=args.n_h,
        classification=True if args.task == 'classification' else False)
    model = SimpleNN(in_feature=256, out_feature=1)

    # pretrained model path
    model_a_path = '../pre-trained/research-model/bulk_moduli-model_best.pth.tar'
    model_b_path = '../pre-trained/research-model/sps-model_best.pth.tar'

    # load latest model state
    ckpt_a = torch.load(model_a_path)
    ckpt_b = torch.load(model_b_path)

    # load model
    model_a.load_state_dict(ckpt_a['state_dict'])
    model_b.load_state_dict(ckpt_b['state_dict'])

    def get_activation_a(name, activation_a):
        def hook(model, input, output):
            activation_a[name] = output.detach()

        return hook

    def get_activation_b(name, activation_b):
        def hook(model, input, output):
            activation_b[name] = output.detach()

        return hook

    if args.cuda:
        model_a.cuda()
        model_b.cuda()
        model.cuda()

    activation_a = {}
    activation_b = {}

    # hook the activation function
    model_a.conv_to_fc.register_forward_hook(
        get_activation_a('conv_to_fc', activation_a))
    model_b.conv_to_fc.register_forward_hook(
        get_activation_b('conv_to_fc', activation_b))

    # define loss func and optimizer
    if args.task == 'classification':
        criterion = nn.NLLLoss()
    else:
        criterion = nn.MSELoss()
    if args.optim == 'SGD':
        optimizer = optim.SGD(model.parameters(),
                              args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(),
                               args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NameError('Only SGD or Adam is allowed as --optim')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_mae_error = checkpoint['best_mae_error']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            normalizer.load_state_dict(checkpoint['normalizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    scheduler = MultiStepLR(optimizer,
                            milestones=args.lr_milestones,
                            gamma=0.1)
    X = torch.Tensor()
    T = torch.Tensor()
    for i in range(5):

        total_size = len(dataset)
        indices = list(range(total_size))
        batch_size = args.batch_size
        num_workers = args.workers
        pin_memory = args.cuda

        if i == 0:
            train_sampler = SubsetRandomSampler(indices[:61])
            test_sampler = SubsetRandomSampler(indices[-16:])
        if i == 1:
            # list.extend returns None, so build the combined index list directly
            x = indices[:45] + indices[-16:]
            train_sampler = SubsetRandomSampler(x)
            test_sampler = SubsetRandomSampler(indices[45:-16])
        if i == 2:
            x = indices[:29] + indices[-32:]
            train_sampler = SubsetRandomSampler(x)
            test_sampler = SubsetRandomSampler(indices[29:-32])
        if i == 3:
            x = indices[:13] + indices[-48:]
            train_sampler = SubsetRandomSampler(x)
            test_sampler = SubsetRandomSampler(indices[13:-48])
        if i == 4:
            train_sampler = SubsetRandomSampler(indices[-64:])
            test_sampler = SubsetRandomSampler(indices[:-64])

        train_loader = DataLoader(dataset,
                                  batch_size=batch_size,
                                  sampler=train_sampler,
                                  num_workers=num_workers,
                                  collate_fn=collate_fn,
                                  pin_memory=pin_memory)

        test_loader = DataLoader(dataset,
                                 batch_size=batch_size,
                                 sampler=test_sampler,
                                 num_workers=num_workers,
                                 collate_fn=collate_fn,
                                 pin_memory=pin_memory)
        print(test_sampler)
        for epoch in range(args.start_epoch, args.epochs):
            # train for one epoch
            train(args, train_loader, model_a, model_b, model, activation_a,
                  activation_b, criterion, optimizer, epoch, normalizer)

            # evaluate on validation set
            mae_error = validate(args, train_loader, model_a, model_b, model,
                                 activation_a, activation_b, criterion,
                                 normalizer)

            if mae_error != mae_error:  # NaN check: NaN != NaN
                print('Exit due to NaN')
                sys.exit(1)

            scheduler.step()

            # remember the best mae_error and save checkpoint
            if args.task == 'regression':
                is_best = mae_error < best_mae_error
                best_mae_error = min(mae_error, best_mae_error)
            else:
                is_best = mae_error > best_mae_error
                best_mae_error = max(mae_error, best_mae_error)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_mae_error': best_mae_error,
                    'optimizer': optimizer.state_dict(),
                    'normalizer': normalizer.state_dict(),
                    'args': vars(args)
                },
                is_best,
                prop=args.property)

        # test best model
        print('---------Evaluate Model on Test Set---------------')
        best_checkpoint = torch.load('../result/' + args.property +
                                     '-model_best.pth.tar')
        model.load_state_dict(best_checkpoint['state_dict'])
        x, t = validate(args,
                        test_loader,
                        model_a,
                        model_b,
                        model,
                        activation_a,
                        activation_b,
                        criterion,
                        normalizer,
                        test=True,
                        tc=True)
        X = torch.cat((X, x), dim=0)
        T = torch.cat((T, t), dim=0)
        x, t = X.numpy(), T.numpy()
        n_max = max(np.max(x), np.max(t))
        n_min = min(np.min(x), np.min(t))
        a = np.linspace(n_min - abs(n_max), n_max + abs(n_max))
        b = a
        plt.rcParams["font.family"] = "Times New Roman"
        plt.plot(a, b, color='blue')
        plt.scatter(t, x, marker=".", color='red', edgecolors='black')
        plt.xlim(n_min - abs(n_min), n_max + abs(n_min))
        plt.ylim(n_min - abs(n_min), n_max + abs(n_min))
        plt.title(
            "Thermal Conductivity Prediction by CGCNN with Combined Model Transfer Learning"
        )
        plt.xlabel("observation")
        plt.ylabel("prediction")
    plt.show()
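
The transfer-learning setup above pulls intermediate features out of the two frozen CGCNN models through `register_forward_hook`. A self-contained illustration of that hook pattern on a toy network (not the models above):

import torch
import torch.nn as nn

activation = {}

def get_activation(name, store):
    # Return a hook that stashes the layer's output in `store` on every forward pass.
    def hook(module, inputs, output):
        store[name] = output.detach()
    return hook

net = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
net[0].register_forward_hook(get_activation('fc1', activation))

_ = net(torch.randn(2, 8))
print(activation['fc1'].shape)  # torch.Size([2, 16])
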
Example #19
0
def train(args, model, device, train_loader_creator, test_loader_creator, logger):   

    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=0.9, weight_decay=args.weight_decay)

    for task_idx, train_loader in enumerate(train_loader_creator.data_loaders):

        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr
        scheduler = MultiStepLR(optimizer, milestones=args.milestones, gamma=args.gamma)

        for epoch in range(1,args.epochs+1):
            
            model.train()
            losses = AverageMeter()
            acc = AverageMeter()
            batch_time = AverageMeter()
            data_time = AverageMeter()

            end = time.time()
            for batch_idx, (data, target) in enumerate(train_loader):
                data_time.update(time.time() - end)

                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()

                _, output = model(data)

                loss = criterion(output, target)

                loss.backward()                
                optimizer.step()

                it_acc = accuracy(output.data, target)[0]
                losses.update(loss.item(), data.size(0))
                acc.update(it_acc.item(), data.size(0))

                batch_time.update(time.time() - end)
                end = time.time()

                if batch_idx % args.log_interval == 0:
                    logger.info('Train Task: {0} Epoch: [{1:3d}][{2:3d}/{3:3d}]\t'
                        'DTime {data_time.avg:.3f}\t'
                        'BTime {batch_time.avg:.3f}\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                            task_idx+1, epoch, batch_idx, len(train_loader),
                            batch_time=batch_time, data_time=data_time, loss=losses, acc=acc))

            scheduler.step()
            if epoch % args.test_interval == 0:
                test(args, model, device, test_loader_creator, logger)

        # plot_embedding_tsne(args, task_idx, test_loader_creator, model, device)
        if args.save_model:
            model_path = args.vis_base_dir.split('/')[-2] + 'T' + str(task_idx+1) + '.pt'
            if isinstance(model, torch.nn.DataParallel):
                torch.save(model.module.state_dict(), model_path)
            else:
                torch.save(model.state_dict(), model_path)
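
Several examples above use an `AverageMeter` helper that is not shown. A minimal sketch consistent with how it is called here (`update(value, n)`, then reading `.val` and `.avg`) would be:

class AverageMeter:
    """Tracks the latest value and the running average of a scalar."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
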
Example #20
0
                                                              train_indices, valid_indices, test_indices, args)

    # margin and equilibrium
    margin = 0.35
    equilibrium = 0.68

    # OPTIM-LOSS
    optimizer_encoder = optim.RMSprop(params=net.encoder.parameters(), lr=lr,
                                      alpha=0.9, eps=1e-8, weight_decay=0, momentum=0, centered=False)
    optimizer_decoder = optim.RMSprop(params=net.decoder.parameters(), lr=lr,
                                      alpha=0.9, eps=1e-8, weight_decay=0, momentum=0, centered=False)
    optimizer_discriminator = optim.RMSprop(params=net.discriminator.parameters(), lr=lr,
                                            alpha=0.9, eps=1e-8, weight_decay=0, momentum=0, centered=False)

    Steps = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
    lr_encoder = MultiStepLR(optimizer_encoder, milestones=Steps, gamma=decay_lr)
    lr_decoder = MultiStepLR(optimizer_decoder, milestones=Steps, gamma=decay_lr)
    lr_discriminator = MultiStepLR(optimizer_discriminator, milestones=Steps, gamma=decay_lr)

    count_update_step = 0
    for i in range(n_epochs):

        for j, sample_batched in enumerate(dataloader):

            net.train()

            # target and input are the same images
            data = sample_batched['image']
            batch_x = data.cuda()

            # get output
Example #21
0
def main():
    """Perform training, validation, and testing, with checkpoint loading and saving"""

    # Build Model
    print("==> Building model..")
    base_model = ResNet34()
    if args.compress:
        model = FeatherNet(
            base_model,
            compress=args.compress,
        )
    else:
        if args.lr != 0.1:
            print("Warning: Suggest setting base-model learning rate to 0.1")
        model = base_model

    # Enable GPU support
    print("==> Setting up device..")
    if torch.cuda.is_available():
        print("Utilizing", torch.cuda.device_count(), "GPU(s)!")
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        DEV = torch.device("cuda:0")
        cuda_kwargs = {"num_workers": args.num_workers, "pin_memory": True}
        cudnn.benchmark = True
    else:
        print("Utilizing CPU!")
        DEV = torch.device("cpu")
        cuda_kwargs = {}
    model.to(DEV)

    # Create dataloaders
    print("==> Preparing data..")
    train_loader, valid_loader = get_train_valid_loader(
        data_dir=args.data_dir,
        batch_size=args.batch_size,
        valid_size=args.valid_size,
        **cuda_kwargs
    )
    test_loader = get_test_loader(data_dir=args.data_dir, **cuda_kwargs)

    best_acc = 0  # best validation accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    save_display = False

    # Load checkpoint
    if args.resume:
        print("==> Resuming from checkpoint..")
        assert os.path.isdir("checkpoint"), "Error: no checkpoint directory found!"
        checkpoint = torch.load("./checkpoint/" + args.ckpt_name)
        model.load_state_dict(checkpoint["model"])
        best_acc = checkpoint["acc"]
        start_epoch = checkpoint["epoch"]

    # Initialize optimizers and loss fn
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4
    )
    scheduler = MultiStepLR(optimizer, milestones=[100, 200], gamma=0.1)

    def train(epoch: int) -> None:
        """Train on CIFAR10 per epoch"""
        # maintain backward compatibility; get_last_lr requires PyTorch >= 1.4
        last_lr = (
            scheduler.get_last_lr()[0]
            if version.parse(torch.__version__) >= version.parse("1.4")
            else scheduler.get_lr()[0]
        )
        print(
            "\nEpoch: {}  |  Compression: {:.2f}  |  lr: {:<6}".format(
                epoch, args.compress, last_lr
            )
        )
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(DEV), targets.to(DEV)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            progress_bar(
                batch_idx,
                len(train_loader),
                "Loss: {:.3f} | Acc: {:.3f}% ({}/{})".format(
                    train_loss / (batch_idx + 1),
                    100.0 * correct / total,
                    correct,
                    total,
                ),
            )

    # Validation
    def validate(epoch: int) -> None:
        """Validate on CIFAR10 per epoch. Save best accuracy for checkpoint storing"""
        nonlocal best_acc
        nonlocal save_display
        model.eval()
        valid_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(valid_loader):
                inputs, targets = inputs.to(DEV), targets.to(DEV)
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                valid_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                progress_bar(
                    batch_idx,
                    len(valid_loader),
                    "Loss: {:.3f} | Acc: {:.3f}% ({}/{})".format(
                        valid_loss / (batch_idx + 1),
                        100.0 * correct / total,
                        correct,
                        total,
                    ),
                )

        # Save checkpoint.
        acc = 100.0 * correct / total
        save_display = acc > best_acc
        if acc > best_acc:
            state = {
                "model": model.state_dict(),
                "acc": acc,
                "epoch": epoch,
            }
            if not os.path.isdir("checkpoint"):
                os.mkdir("checkpoint")
            torch.save(state, "./checkpoint/" + args.ckpt_name)
            best_acc = acc

    # Testing
    def test(epoch: int) -> None:
        """Test on CIFAR10 per epoch."""
        model.eval()
        test_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(test_loader):
                inputs, targets = inputs.to(DEV), targets.to(DEV)
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                progress_bar(
                    batch_idx,
                    len(test_loader),
                    "Loss: {:.3f} | Acc: {:.3f}% ({}/{})".format(
                        test_loss / (batch_idx + 1),
                        100.0 * correct / total,
                        correct,
                        total,
                    ),
                )

    # Train up to 300 epochs
    # *Displays* concurrent performance on the validation and test sets while training,
    # but strictly uses validation set to determine early stopping
    print("==> Initiate Training..")
    for epoch in range(start_epoch, 300):
        train(epoch)
        validate(epoch)
        test(epoch)
        if save_display:
            print("Saving..")
        scheduler.step()
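
As a quick sanity check of the schedule used above, the sketch below shows how `MultiStepLR(milestones=[100, 200], gamma=0.1)` decays the learning rate when stepped once per epoch (toy optimizer, no real training):

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = SGD(params, lr=0.1)
scheduler = MultiStepLR(optimizer, milestones=[100, 200], gamma=0.1)

prev_lr = optimizer.param_groups[0]['lr']
for epoch in range(300):
    optimizer.step()      # the epoch's weight updates would happen here
    scheduler.step()
    cur_lr = optimizer.param_groups[0]['lr']
    if cur_lr != prev_lr:
        print('lr changed to {} after epoch {}'.format(cur_lr, epoch))
        prev_lr = cur_lr
# lr: 0.1 until the first milestone, then 0.01, then 0.001
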
Example #22
0
    def __init__(self, config, model, trn_data, val_data=None):
        self.config = config
        self.model = model.cuda()
        self.trn_data = DataFetcher(trn_data)
        self.val_data = val_data

        #create the optimizer
        if config['optim'] == 'SGD':
            self.optimizer = SGD(model.parameters(),
                                 lr=config['lr'],
                                 momentum=config['momentum'],
                                 weight_decay=config['wd'])
        elif config['optim'] == 'AdamW':
            self.optimizer = AdamW(
                model.parameters(), lr=config['lr'],
                weight_decay=config['wd'])  #momentum is default
        else:
            optim = config['optim']
            raise Exception(
                f'Optimizer {optim} is not supported! Must be SGD or AdamW')

        #create the learning rate scheduler
        schedule = config['lr_policy']
        if schedule == 'OneCycle':
            self.scheduler = OneCycleLR(self.optimizer,
                                        config['lr'],
                                        total_steps=config['iters'])
        elif schedule == 'MultiStep':
            self.scheduler = MultiStepLR(self.optimizer,
                                         milestones=config['lr_decay_epochs'])
        elif schedule == 'Poly':
            func = lambda iteration: (1 - iteration / config['iters'])**config['power']
            self.scheduler = LambdaLR(self.optimizer, func)
        else:
            lr_policy = config['lr_policy']
            raise Exception(
                f'Policy {lr_policy} is not supported! Must be OneCycle, MultiStep or Poly'
            )

        #create the loss criterion
        if config['num_classes'] > 1:
            #load class weights if they were given in the config file
            if 'class_weights' in config:
                weight = torch.Tensor(config['class_weights']).float().cuda()
            else:
                weight = None

            self.criterion = nn.CrossEntropyLoss(weight=weight).cuda()
        else:
            self.criterion = nn.BCEWithLogitsLoss().cuda()

        #define train and validation metrics and class names
        class_names = config['class_names']

        #make training metrics using the EMAMeter. this meter gives extra
        #weight to the most recent metric values calculated during training
        #this gives a better reflection of how well the model is performing
        #when the metrics are printed
        trn_md = {
            name: metric_lookup[name](EMAMeter())
            for name in config['metrics']
        }
        self.trn_metrics = ComposeMetrics(trn_md, class_names)
        self.trn_loss_meter = EMAMeter()

        #the only difference between train and validation metrics
        #is that we use the AverageMeter. this is because there are
        #no weight updates during evaluation, so all batches should
        #count equally
        val_md = {
            name: metric_lookup[name](AverageMeter())
            for name in config['metrics']
        }
        self.val_metrics = ComposeMetrics(val_md, class_names)
        self.val_loss_meter = AverageMeter()

        self.logging = config['logging']

        #now, if we're resuming from a previous run we need to load
        #the state for the model, optimizer, and schedule and resume
        #the mlflow run (if there is one and we're using logging)
        if config['resume']:
            self.resume(config['resume'])
        elif self.logging:
            #if we're not resuming, but are logging, then we
            #need to setup mlflow with a new experiment
            #every time that Trainer is instantiated we want to
            #end the current active run and let a new one begin
            mlflow.end_run()

            #extract the experiment name from config so that
            #we know where to save our files, if experiment name
            #already exists, we'll use it, otherwise we create a
            #new experiment
            mlflow.set_experiment(self.config['experiment_name'])

            #add the config file as an artifact
            mlflow.log_artifact(config['config_file'])

            #we don't want to add everything in the config
            #to mlflow parameters, we'll just add the most
            #likely to change parameters
            mlflow.log_param('lr_policy', config['lr_policy'])
            mlflow.log_param('optim', config['optim'])
            mlflow.log_param('lr', config['lr'])
            mlflow.log_param('wd', config['wd'])
            mlflow.log_param('bsz', config['bsz'])
            mlflow.log_param('momentum', config['momentum'])
            mlflow.log_param('iters', config['iters'])
            mlflow.log_param('epochs', config['epochs'])
            mlflow.log_param('encoder', config['encoder'])
            mlflow.log_param('finetune_layer', config['finetune_layer'])
            mlflow.log_param('pretraining', config['pretraining'])
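
The class above is driven entirely by a config dictionary. A hypothetical instantiation, with keys inferred from the attribute accesses in `__init__` and purely illustrative values (the metric names must exist in `metric_lookup`), might look like:

config = {
    'optim': 'AdamW', 'lr': 3e-4, 'momentum': 0.9, 'wd': 1e-4, 'bsz': 16,
    'lr_policy': 'OneCycle', 'iters': 10000, 'epochs': 50,
    'num_classes': 2, 'class_names': ['background', 'foreground'],
    'metrics': ['IoU'],          # must be keys of metric_lookup
    'logging': False, 'resume': None,
    'encoder': 'resnet50', 'finetune_layer': 'all', 'pretraining': 'imagenet',
    'config_file': 'config.yaml', 'experiment_name': 'demo',
}
trainer = Trainer(config, model, trn_data, val_data)
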
Example #23
0
def main():
    # Views the training images and displays the distance on anchor-negative and anchor-positive
    # test_display_triplet_distance = False
    # print the experiment configuration
    print('\nCurrent time is \33[91m{}\33[0m.'.format(str(time.asctime())))
    print('Parsed options: {}'.format(vars(args)))
    print('Number of Speakers: {}.\n'.format(train_dir.num_spks))

    model_kwargs = {
        'embedding_size': args.embedding_size,
        'num_classes': train_dir.num_spks,
        'input_dim': args.feat_dim,
        'dropout_p': args.dropout_p
    }

    print('Model options: {}'.format(model_kwargs))
    model = create_model(args.model, **model_kwargs)

    # model = ASTDNN(num_classes=train_dir.num_spks, input_dim=args.feat_dim,
    #                embedding_size=args.embedding_size,
    #                dropout_p=args.dropout_p)

    start_epoch = 0
    if args.save_init:
        check_path = '{}/checkpoint_{}.pth'.format(args.check_path,
                                                   start_epoch)
        torch.save(model, check_path)

    if args.resume:
        if os.path.isfile(args.resume):
            print('=> loading checkpoint {}'.format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']

            filtered = {
                k: v
                for k, v in checkpoint['state_dict'].items()
                if 'num_batches_tracked' not in k
            }
            model_dict = model.state_dict()
            model_dict.update(filtered)

            model.load_state_dict(model_dict)
            #
            try:
                model.dropout.p = args.dropout_p
            except:
                pass
        else:
            print('=> no checkpoint found at {}'.format(args.resume))

    ce_criterion = nn.CrossEntropyLoss()
    if args.loss_type == 'soft':
        xe_criterion = None
    elif args.loss_type == 'asoft':
        ce_criterion = None
        model.classifier = AngleLinear(in_features=args.embedding_size,
                                       out_features=train_dir.num_spks,
                                       m=args.m)
        xe_criterion = AngleSoftmaxLoss(lambda_min=args.lambda_min,
                                        lambda_max=args.lambda_max)
    elif args.loss_type == 'center':
        xe_criterion = CenterLoss(num_classes=train_dir.num_spks,
                                  feat_dim=args.embedding_size)
    elif args.loss_type == 'amsoft':
        ce_criterion = None
        model.classifier = AdditiveMarginLinear(feat_dim=args.embedding_size,
                                                n_classes=train_dir.num_spks)
        xe_criterion = AMSoftmaxLoss(margin=args.margin, s=args.s)

    optimizer = create_optimizer(model.parameters(), args.optimizer,
                                 **opt_kwargs)
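    # for center loss, rebuild the optimizer so the loss's learnable class centers
    # are trained with a 5x higher learning rate than the rest of the model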
    if args.loss_type == 'center':
        optimizer = torch.optim.SGD([{
            'params': xe_criterion.parameters(),
            'lr': args.lr * 5
        }, {
            'params': model.parameters()
        }],
                                    lr=args.lr,
                                    weight_decay=args.weight_decay,
                                    momentum=args.momentum)
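    # when fine-tuning, give the margin-based classifier a 5x higher learning rate
    # than the remaining (pretrained) parameters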
    if args.finetune:
        if args.loss_type == 'asoft' or args.loss_type == 'amsoft':
            classifier_params = list(map(id, model.classifier.parameters()))
            rest_params = filter(lambda p: id(p) not in classifier_params,
                                 model.parameters())
            optimizer = torch.optim.SGD(
                [{
                    'params': model.classifier.parameters(),
                    'lr': args.lr * 5
                }, {
                    'params': rest_params
                }],
                lr=args.lr,
                weight_decay=args.weight_decay,
                momentum=args.momentum)

    if args.scheduler == 'exp':
        scheduler = ExponentialLR(optimizer, gamma=args.gamma)
    else:
        milestones = args.milestones.split(',')
        milestones = [int(x) for x in milestones]
        milestones.sort()
        scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

    ce = [ce_criterion, xe_criterion]

    start = args.start_epoch + start_epoch
    print('Start epoch is : ' + str(start))
    # start = 0
    end = start + args.epochs

    train_loader = torch.utils.data.DataLoader(train_dir,
                                               batch_size=args.batch_size,
                                               collate_fn=PadCollate(
                                                   dim=2,
                                                   fix_len=False,
                                                   min_chunk_size=250,
                                                   max_chunk_size=450),
                                               shuffle=True,
                                               **kwargs)
    valid_loader = torch.utils.data.DataLoader(
        valid_dir,
        batch_size=int(args.batch_size / 2),
        collate_fn=PadCollate(dim=2,
                              fix_len=False,
                              min_chunk_size=250,
                              max_chunk_size=450),
        shuffle=False,
        **kwargs)
    test_loader = torch.utils.data.DataLoader(test_dir,
                                              batch_size=args.test_batch_size,
                                              shuffle=False,
                                              **kwargs)
    # sitw_test_loader = torch.utils.data.DataLoader(sitw_test_dir, batch_size=args.test_batch_size,
    #                                                shuffle=False, **kwargs)
    # sitw_dev_loader = torch.utils.data.DataLoader(sitw_dev_part, batch_size=args.test_batch_size, shuffle=False,
    #                                               **kwargs)

    if args.cuda:
        model = model.cuda()
        for i in range(len(ce)):
            if ce[i] is not None:
                ce[i] = ce[i].cuda()

    for epoch in range(start, end):
        # pdb.set_trace()
        print('\n\33[1;34m Current \'{}\' learning rate is '.format(
            args.optimizer),
              end='')
        for param_group in optimizer.param_groups:
            print('{:.5f} '.format(param_group['lr']), end='')
        print(' \33[0m')

        train(train_loader, model, ce, optimizer, epoch)
        test(test_loader, valid_loader, model, epoch)
        # sitw_test(sitw_test_loader, model, epoch)
        # sitw_test(sitw_dev_loader, model, epoch)
        scheduler.step()
        # exit(1)

    writer.close()
Beispiel #24
0
    k, batch_size, epochs = args.k, args.batch_size, args.epochs

    # data prepare
    train_data = utils.CIFAR10Instance(root='data', train=True, transform=utils.train_transform, download=True)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=8)
    memory_data = utils.CIFAR10Instance(root='data', train=True, transform=utils.test_transform, download=True)
    memory_loader = DataLoader(memory_data, batch_size=batch_size, shuffle=False, num_workers=8)
    test_data = utils.CIFAR10Instance(root='data', train=False, transform=utils.test_transform, download=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=8)

    # model setup and optimizer config
    model = Model(feature_dim).to('cuda')
    optimizer = optim.SGD(model.parameters(), lr=0.03, momentum=0.9, weight_decay=5e-4)
    print("# trainable model parameters:", sum(param.numel() if param.requires_grad else 0
                                               for param in model.parameters()))
    lr_scheduler = MultiStepLR(optimizer, milestones=[int(epochs * 0.6), int(epochs * 0.8)], gamma=0.1)

    # z as normalizer, init with None, c as num of train class, n as num of train data
    z, c, n = None, len(memory_data.classes), len(train_data)
    # init memory bank as unit random vector ---> [N, D]
    memory_bank = F.normalize(torch.randn(n, feature_dim), dim=-1)

    # training loop
    results = {'train_loss': [], 'test_acc@1': [], 'test_acc@5': []}
    best_acc = 0.0
    for epoch in range(1, epochs + 1):
        train_loss = train(model, train_loader, optimizer)
        results['train_loss'].append(train_loss)
        test_acc_1, test_acc_5 = test(model, memory_loader, test_loader)
        results['test_acc@1'].append(test_acc_1)
        results['test_acc@5'].append(test_acc_5)
Beispiel #25
0
    # net = DenseNet121()
    # net = ResNeXt29_2x64d()
    # net = MobileNet()
    net = MobileNetV2()
    # net = DPN92()
    # net = ShuffleNetG2()
    # net = SENet18()

if use_cuda:
    net.cuda()
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
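# decay the learning rate by 10x at epochs 150 and 250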
scheduler = MultiStepLR(optimizer, milestones=[150,250], gamma=0.1)

# Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    epoch_steps = stream.size() // BATCH
    for batch_idx in range(epoch_steps):
        meta, inputs = stream.next()
        targets = meta.labels.astype(np.int64)
        inputs = torch.from_numpy(inputs)
        targets = torch.from_numpy(targets)
def main():
    dataset_train = Dataset(data_path=opt.data_path, augment=True)
    loader_train = DataLoader(dataset=dataset_train,
                              num_workers=4,
                              batch_size=opt.batch_size,
                              shuffle=True)
    print("# of training samples: %d\n" % int(len(dataset_train)))

    # Build model
    model = fusion_can_multiscale(recurrent_iter=opt.recurrent_iter,
                                  use_GPU=opt.use_gpu)
    print_network(model)

    # loss function
    criterion1 = SSIM()
    if opt.use_gpu:
        model = model.cuda()
        criterion1.cuda()

    # Optimizer for PreNet
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    scheduler = MultiStepLR(optimizer, milestones=opt.milestone, gamma=0.1)
    # record training
    writer = SummaryWriter(opt.save_path)

    # load the latest model
    initial_epoch = findLastCheckpoint(save_dir=opt.save_path)
    if initial_epoch > 0:
        print('resuming by loading epoch %d' % initial_epoch)
        model.load_state_dict(
            torch.load(
                os.path.join(opt.save_path,
                             'net_epoch%d.pth' % initial_epoch)))

    # start training
    step = 0
    for epoch in range(initial_epoch, opt.epochs):
        scheduler.step(epoch)  #update learning rate
        for param_group in optimizer.param_groups:
            print('learning rate %f' % param_group["lr"])

        ## epoch training start
        for i, (input_train, target_train) in enumerate(loader_train, 0):
            model.train()  #training mode of model
            model.zero_grad()
            optimizer.zero_grad()

            input_train, target_train = Variable(input_train), Variable(
                target_train)

            if opt.use_gpu:
                input_train, target_train = input_train.cuda(
                ), target_train.cuda()

            out_train = model(input_train)
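            # SSIM is a similarity measure, so we maximize it by minimizing its negative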
            pixel_metric = criterion1(target_train, out_train)
            loss1 = -pixel_metric
            loss1.backward()
            optimizer.step()

            # training curve
            model.eval()  #evaluation mode of model
            out_train = model(input_train)
            out_train = torch.clamp(out_train, 0., 1.)
            psnr_train = batch_PSNR(out_train, target_train, 1.)
            print(
                "[epoch %d][%d/%d] ssim_loss: %.4f, pixel_metric: %.4f, PSNR: %.4f"
                % (epoch + 1, i + 1, len(loader_train), loss1.item(),
                   pixel_metric.item(), psnr_train))

            if step % 10 == 0:
                # Log the scalar values
                writer.add_scalar('SSIM_loss', loss1.item(), step)
                #writer.add_scalar('learning_rate', loss2.item(), step)
                writer.add_scalar('PSNR on training data', psnr_train, step)
            step += 1
        ## epoch training end

        # log the images
        model.eval()
        out_train = model(input_train)
        out_train = torch.clamp(out_train, 0., 1.)

        im_target = utils.make_grid(target_train.data,
                                    nrow=8,
                                    normalize=True,
                                    scale_each=True)
        im_input = utils.make_grid(input_train.data,
                                   nrow=8,
                                   normalize=True,
                                   scale_each=True)
        im_derain = utils.make_grid(out_train.data,
                                    nrow=8,
                                    normalize=True,
                                    scale_each=True)

        writer.add_image('Clean image', im_target, epoch + 1)
        writer.add_image('Rainy image', im_input, epoch + 1)
        writer.add_image('Derained image', im_derain, epoch + 1)

        # save model
        torch.save(model.state_dict(),
                   os.path.join(opt.save_path, 'net_latest.pth'))
        if epoch % opt.save_freq == 0:
            torch.save(
                model.state_dict(),
                os.path.join(opt.save_path, 'net_epoch%d.pth' % (epoch + 1)))
    def train(self, epoch, trainloader):
        self.model.train()

        train_loss = AverageMeter()
        prec = AverageMeter()

        # Declare optimizer.
        params = self.master_params if self.fp16_mode else self.model.parameters()
        optimizer = optim.SGD(params,
                              self.args.lr,
                              momentum=self.args.momentum,
                              weight_decay=self.args.weight_decay)

        # learning rate scheduler
        scheduler = MultiStepLR(optimizer,
                                milestones=[80, 120, 160, 180],
                                gamma=0.1)

        # If the epoch is less than 5 and warmup is enabled, use warmup; otherwise use the scheduler.
        if epoch < 5 and self.args.warm_up:
            lr = self.warmup_learning_rate(self.args.lr, self.args.epochs,
                                           epoch, len(trainloader))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        else:
            scheduler.step(epoch=epoch)

        # Loss criterion is in FP32.
        criterion = nn.CrossEntropyLoss()

        with trange(len(trainloader)) as t:
            for idx, (inputs, targets) in enumerate(trainloader):
                if self.train_on_gpu:
                    inputs, targets = inputs.cuda(), targets.cuda()
                self.model.zero_grad()
                outputs = self.model(inputs)
                # We calculate the loss in FP32 since reduction ops can
                # be wrong when represented in FP16.
                loss = criterion(outputs, targets)

                # Sometimes the loss may become too small to be represented in FP16,
                # so we scale the losses by a large power of 2 (2**7 here).
                if self.loss_scaling: loss = loss * self._LOSS_SCALE
                # Calculate the gradients
                loss.backward()
                if self.fp16_mode:
                    # Move the calculated gradients to the master params
                    # so that we can apply the gradient update in FP32.
                    self.model_grads_to_master_grads(self.model_params,
                                                     self.master_params)
                    if self.loss_scaling:
                        # If we scaled our losses, now is a good time to scale them
                        # back, since our gradients are in FP32.
                        for params in self.master_params:
                            params.grad.data = params.grad.data / self._LOSS_SCALE
                    # Apply weight update in FP32.
                    optimizer.step()
                    # Copy the updated weights back to the FP16 model weights.
                    self.master_params_to_model_params(self.model_params,
                                                       self.master_params)
                else:
                    optimizer.step()

                # un-scale the logged loss only if loss scaling was applied
                scale = self._LOSS_SCALE if self.loss_scaling else 1
                train_loss.update(loss.item() / scale, inputs.size(0))
                top1 = accuracy(outputs, targets)[0]
                prec.update(top1.item(), inputs.size(0))

                metrics = {
                    'Epoch': f'{epoch + 1}',
                    'Loss': '%.2f' % train_loss.avg,
                    'Acc': '%.1f' % prec.avg,
                    'LR': '%.4f' % get_optim_lr(optimizer)
                }
                t.set_postfix(metrics)
                t.update()
            t.close()

        self.history['loss'].append(train_loss.avg)
        self.history['acc'].append(prec.avg)
Beispiel #28
0
def main(factor):
    global meter_loss
    global meter_psnr
    global scheduler
    global engine
    global epoch_num 
    global psnr_value 
    global loss_value 
    global train_loader
    global val_loader
    global model
    global criterion
    global UPSCALE_FACTOR
    
    parser = argparse.ArgumentParser(description='Super Resolution Training')
    parser.add_argument('--upscale_factor', default=3, type=int, help='super resolution upscale factor')
    parser.add_argument('--num_epochs', default=100, type=int, help='super resolution epochs number')
    opt = parser.parse_args()

    UPSCALE_FACTOR = opt.upscale_factor
    NUM_EPOCHS = opt.num_epochs
    if factor != 3:
        UPSCALE_FACTOR = factor

    train_set = DatasetFromFolder('data/train', upscale_factor=UPSCALE_FACTOR, input_transform=transforms.ToTensor(),
                                  target_transform=transforms.ToTensor())
    val_set = DatasetFromFolder('data/val', upscale_factor=UPSCALE_FACTOR, input_transform=transforms.ToTensor(),
                                target_transform=transforms.ToTensor())
    train_loader = DataLoader(dataset=train_set, num_workers=0, batch_size=64, shuffle=True)
    val_loader = DataLoader(dataset=val_set, num_workers=0, batch_size=64, shuffle=False)

    model = SPCNNet(upscale_factor=UPSCALE_FACTOR)
    criterion = nn.MSELoss()
    
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        
    print('# upscale factor:', UPSCALE_FACTOR)
    print('# parameters:', sum(param.numel() for param in model.parameters()))

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1)

    engine = Engine()
    meter_loss = tnt.meter.AverageValueMeter()
    meter_psnr = PSNRMeter()
    epoch_num = []
    psnr_value = []
    loss_value = []

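    # torchnet Engine hooks: invoked for each sample, after each forward pass,
    # and at the start and end of every epoch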
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch

    engine.train(processor, train_loader, maxepoch=NUM_EPOCHS, optimizer=optimizer)
    
    plt.plot(epoch_num, psnr_value, lw=2, ls='-', label="PSNR--x"+str(UPSCALE_FACTOR), color="r", marker="+")
    plt.xlabel("epoch time(s)", fontsize=16, horizontalalignment="right")
    plt.ylabel("PSNR value", fontsize=16, horizontalalignment="right")
    
    plt.legend()
    plt.savefig(r'D:\大三上\数字图像处理\SR_Project\plots\PSNRx' + str(UPSCALE_FACTOR) + '.png')
    plt.show()
    
    plt.plot(epoch_num, loss_value, lw=2, ls='-', label="Loss--x"+str(UPSCALE_FACTOR), color="r", marker="+")
    plt.xlabel("epoch time(s)", fontsize=16, horizontalalignment="right")
    plt.ylabel("Loss value", fontsize=16, horizontalalignment="right")
    
    plt.legend()
    plt.savefig(r'D:\大三上\数字图像处理\SR_Project\plots\LOSSx' + str(UPSCALE_FACTOR) + '.png')
    plt.show()
Beispiel #29
0
def train(ds,
          fold,
          train_idx,
          val_idx,
          conf,
          val_ds=None,
          transforms=None,
          val_transforms=None):
    if conf.model_fqn.endswith('SeResnext50_32d4d_upsample'):
        model = dynamic_load(conf.model_fqn)(
            num_classes=conf.num_classes,
            num_channels=conf.num_channels,
            pretrained_file=(conf.pretrained_model
                             if 'pretrained_model' in conf else None),
        )
    else:
        model = dynamic_load(conf.model_fqn)(
            num_classes=conf.num_classes,
            num_channels=conf.num_channels,
        )
    # save_path = u.prefix_path() + f'/working/sp5r2/models/weights/{conf.modelname}/fold{fold}'
    save_path = f'/wdata/working/sp5r2/models/weights/{conf.modelname}/fold{fold}'
    Path(save_path).mkdir(parents=True, exist_ok=True)

    # tfb_path = u.prefix_path() + f'/working/sp5r2/models/logs/{conf.modelname}/fold{fold}'
    tfb_path = f'/wdata/working/sp5r2/models/logs/{conf.modelname}/fold{fold}'
    Path(tfb_path).mkdir(parents=True, exist_ok=True)

    optimizer = dynamic_load(conf.optimizer_fqn)
    estimator = Estimator(model, optimizer, save_path, config=conf)
    estimator.lr_scheduler = MultiStepLR(estimator.optimizer,
                                         conf.lr_steps,
                                         gamma=conf.lr_gamma)
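    # an explicit 'scheduler' entry in the config overrides the default MultiStepLR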
    if 'scheduler' in conf:
        scheduler_class = dynamic_load(conf.scheduler)
        if conf.scheduler.endswith('CosineAnnealingLR'):
            conf.scheduler_params['optimizer'] = estimator.optimizer
        estimator.lr_scheduler = scheduler_class(**conf.scheduler_params)

    callbacks = [
        ModelSaver(1, ("fold" + str(fold) + "_best.pth"), best_only=True),
        ModelSaver(1, ("fold" + str(fold) + "_last.pth"), best_only=False),
        CheckpointSaver(1, ("fold" + str(fold) + "_checkpoint.pth")),
        CheckpointSaver(
            1, ("fold" + str(fold) + "_ep{epoch}_{loss}_checkpoint.pth")),
        TensorBoard(tfb_path),
    ]
    if 'early_stopper_patience' in conf:
        callbacks.append(EarlyStopper(conf.early_stopper_patience))

    trainer = PytorchTrain(estimator,
                           conf=conf,
                           fold=fold,
                           callbacks=callbacks,
                           no_eval_period=conf.get('no_eval_period', 0))

    train_dataset = TrainDataset(ds,
                                 train_idx,
                                 conf,
                                 transforms=transforms,
                                 verbose=False)
    train_loader = PytorchDataLoader(train_dataset,
                                     batch_size=conf.batch_size,
                                     shuffle=True,
                                     drop_last=True,
                                     num_workers=conf.num_workers,
                                     pin_memory=True)

    val_dataset = ValDataset(val_ds if val_ds is not None else ds,
                             val_idx,
                             conf,
                             transforms=val_transforms)
    val_loader = PytorchDataLoader(
        val_dataset,
        batch_size=conf.batch_size if not conf.ignore_target_size else 1,
        shuffle=False,
        drop_last=False,
        num_workers=conf.num_workers,
        pin_memory=True)

    trainer.fit(train_loader, val_loader, conf.nb_epoch)
Beispiel #30
0
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE)
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    logging.info(
        f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, " +
        f"Extra Layers learning rate: {extra_layers_lr}.")

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(optimizer,
                                milestones=milestones,
                                gamma=0.1,
                                last_epoch=last_epoch)
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer,
                                      args.t_max,
                                      last_epoch=last_epoch)
    else:
        logging.fatal(f"Unsupported Scheduler: {args.scheduler}.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, args.num_epochs):
        scheduler.step()
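        # note: PyTorch >= 1.1 expects scheduler.step() after optimizer.step();
        # stepping at the start of the epoch follows the older pre-1.1 convention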
        train(train_loader,
def train_model_residual_lowlight_rdn():

    device = DEVICE
    # prepare the data
    train_set = HsiCubicTrainDataset('./data/train_lowlik04/')
    #print('trainset32 training example:', len(train_set32))
    #train_set = HsiCubicTrainDataset('./data/train_lowlight/')

    #train_set_64 = HsiCubicTrainDataset('./data/train_lowlight_patchsize64/')

    #train_set_list = [train_set32, train_set_64]
    #train_set = ConcatDataset(train_set_list)  # all samples must have the same size, otherwise concatenation fails
    print('total training examples:', len(train_set))

    train_loader = DataLoader(dataset=train_set,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

    # load the test label data
    mat_src_path = './data/test_lowlight/origin/soup_bigcorn_orange_1ms.mat'
    test_label_hsi = scio.loadmat(mat_src_path)['label']

    # load the test data
    batch_size = 1
    #test_data_dir = './data/test_lowlight/cuk12/'
    test_data_dir = './data/test_lowlight/cuk04/'

    test_set = HsiCubicLowlightTestDataset(test_data_dir)
    test_dataloader = DataLoader(dataset=test_set,
                                 batch_size=batch_size,
                                 shuffle=False)

    batch_size, channel, width, height = next(iter(test_dataloader))[0].shape

    band_num = len(test_dataloader)
    denoised_hsi = np.zeros((width, height, band_num))

    save_model_path = './checkpoints/hsirnd_k04'
    if not os.path.exists(save_model_path):
        os.mkdir(save_model_path)

    # create the model
    net = HSIRDNECA(K)
    init_params(net)
    net = nn.DataParallel(net).to(device)
    #net = net.to(device)

    # create the optimizer
    #hsid_optimizer = optim.Adam(net.parameters(), lr=INIT_LEARNING_RATE, betas=(0.9, 0.999))
    hsid_optimizer = optim.Adam(net.parameters(), lr=INIT_LEARNING_RATE)
    scheduler = MultiStepLR(hsid_optimizer, milestones=[200, 400], gamma=0.5)

    # define the loss function
    #criterion = nn.MSELoss()
    best_psnr = 0

    is_resume = RESUME
    # resume training if requested
    if is_resume:
        path_chk_rest = dir_utils.get_last_path(save_model_path,
                                                'model_latest.pth')
        model_utils.load_checkpoint(net, path_chk_rest)
        start_epoch = model_utils.load_start_epoch(path_chk_rest) + 1
        model_utils.load_optim(hsid_optimizer, path_chk_rest)
        best_psnr = model_utils.load_best_psnr(path_chk_rest)

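        # replay the scheduler so the learning rate matches the resumed epoch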
        for i in range(1, start_epoch):
            scheduler.step()
        new_lr = scheduler.get_lr()[0]
        print(
            '------------------------------------------------------------------------------'
        )
        print("==> Resuming Training with learning rate:", new_lr)
        print(
            '------------------------------------------------------------------------------'
        )

    global tb_writer
    tb_writer = get_summary_writer(log_dir='logs')

    gen_epoch_loss_list = []

    cur_step = 0

    first_batch = next(iter(train_loader))

    best_epoch = 0
    best_iter = 0
    if not is_resume:
        start_epoch = 1
    num_epoch = 600

    mpsnr_list = []
    for epoch in range(start_epoch, num_epoch + 1):
        epoch_start_time = time.time()
        scheduler.step()
        print('epoch = ', epoch, 'lr={:.6f}'.format(scheduler.get_lr()[0]))
        print(scheduler.get_lr())

        gen_epoch_loss = 0

        net.train()
        #for batch_idx, (noisy, label) in enumerate([first_batch] * 300):
        for batch_idx, (noisy, cubic, label) in enumerate(train_loader):
            #print('batch_idx=', batch_idx)
            noisy = noisy.to(device)
            label = label.to(device)
            cubic = cubic.to(device)

            hsid_optimizer.zero_grad()
            #denoised_img = net(noisy, cubic)
            #loss = loss_fuction(denoised_img, label)

            residual = net(noisy, cubic)
            alpha = 0.8
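            # the network predicts the residual, so the regression target is (label - noisy)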
            loss = recon_criterion(residual, label - noisy)
            #loss = alpha*recon_criterion(residual, label-noisy) + (1-alpha)*loss_function_mse(residual, label-noisy)
            #loss = recon_criterion(residual, label-noisy)
            loss.backward()  # calcu gradient
            hsid_optimizer.step()  # update parameter

            gen_epoch_loss += loss.item()

            if cur_step % display_step == 0:
                if cur_step > 0:
                    print(
                        f"Epoch {epoch}: Step {cur_step}: Batch_idx {batch_idx}: MSE loss: {loss.item()}"
                    )
                else:
                    print("Pretrained initial state")

            tb_writer.add_scalar("MSE loss", loss.item(), cur_step)

            # step++: each loop iteration, i.e. one processed batch, counts as one step
            cur_step += 1

        gen_epoch_loss_list.append(gen_epoch_loss)
        tb_writer.add_scalar("mse epoch loss", gen_epoch_loss, epoch)

        #scheduler.step()
        #print("Decaying learning rate to %g" % scheduler.get_last_lr()[0])

        torch.save(
            {
                'gen': net.state_dict(),
                'gen_opt': hsid_optimizer.state_dict(),
            },
            f"{save_model_path}/hsid_rdn_eca_l1_loss_600epoch_patchsize32_{epoch}.pth"
        )

        # evaluation on the test set
        net.eval()
        psnr_list = []

        for batch_idx, (noisy_test, cubic_test,
                        label_test) in enumerate(test_dataloader):
            noisy_test = noisy_test.type(torch.FloatTensor)
            label_test = label_test.type(torch.FloatTensor)
            cubic_test = cubic_test.type(torch.FloatTensor)

            noisy_test = noisy_test.to(DEVICE)
            label_test = label_test.to(DEVICE)
            cubic_test = cubic_test.to(DEVICE)

            with torch.no_grad():

                residual = net(noisy_test, cubic_test)
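                # add the predicted residual back to the noisy input to reconstruct the band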
                denoised_band = noisy_test + residual

                denoised_band_numpy = denoised_band.cpu().numpy().astype(
                    np.float32)
                denoised_band_numpy = np.squeeze(denoised_band_numpy)

                denoised_hsi[:, :, batch_idx] = denoised_band_numpy

                if batch_idx == 49:
                    residual_squeezed = torch.squeeze(residual, dim=0)
                    denoised_band_squeezed = torch.squeeze(denoised_band,
                                                           dim=0)
                    label_test_squeezed = torch.squeeze(label_test, dim=0)
                    noisy_test_squeezed = torch.squeeze(noisy_test, dim=0)
                    tb_writer.add_image(f"images/{epoch}_restored",
                                        denoised_band_squeezed,
                                        1,
                                        dataformats='CHW')
                    tb_writer.add_image(f"images/{epoch}_residual",
                                        residual_squeezed,
                                        1,
                                        dataformats='CHW')
                    tb_writer.add_image(f"images/{epoch}_label",
                                        label_test_squeezed,
                                        1,
                                        dataformats='CHW')
                    tb_writer.add_image(f"images/{epoch}_noisy",
                                        noisy_test_squeezed,
                                        1,
                                        dataformats='CHW')

            test_label_current_band = test_label_hsi[:, :, batch_idx]

            psnr = PSNR(denoised_band_numpy, test_label_current_band)
            psnr_list.append(psnr)

        mpsnr = np.mean(psnr_list)
        mpsnr_list.append(mpsnr)

        denoised_hsi_trans = denoised_hsi.transpose(2, 0, 1)
        test_label_hsi_trans = test_label_hsi.transpose(2, 0, 1)
        mssim = SSIM(denoised_hsi_trans, test_label_hsi_trans)
        sam = SAM(denoised_hsi_trans, test_label_hsi_trans)

        # compute PSNR and SSIM
        print("=====averPSNR:{:.3f}=====averSSIM:{:.4f}=====averSAM:{:.3f}".
              format(mpsnr, mssim, sam))
        tb_writer.add_scalars("validation metrics", {
            'average PSNR': mpsnr,
            'average SSIM': mssim,
            'average SAM': sam
        }, epoch)  # this makes it easy to see which epoch performs best

        # save the best model
        if mpsnr > best_psnr:
            best_psnr = mpsnr
            best_epoch = epoch
            best_iter = cur_step
            torch.save(
                {
                    'epoch': epoch,
                    'gen': net.state_dict(),
                    'gen_opt': hsid_optimizer.state_dict(),
                },
                f"{save_model_path}/hsid_rdn_eca_l1_loss_600epoch_patchsize32_best.pth"
            )

        print(
            "[epoch %d it %d PSNR: %.4f --- best_epoch %d best_iter %d Best_PSNR %.4f]"
            % (epoch, cur_step, mpsnr, best_epoch, best_iter, best_psnr))

        print(
            "------------------------------------------------------------------"
        )
        print("Epoch: {}\tTime: {:.4f}\tLoss: {:.4f}\tLearningRate {:.6f}".
              format(epoch,
                     time.time() - epoch_start_time, gen_epoch_loss,
                     INIT_LEARNING_RATE))
        print(
            "------------------------------------------------------------------"
        )

        # save the current model
        torch.save(
            {
                'epoch': epoch,
                'gen': net.state_dict(),
                'gen_opt': hsid_optimizer.state_dict(),
                'best_psnr': best_psnr,
            }, os.path.join(save_model_path, "model_latest.pth"))
    mpsnr_list_numpy = np.array(mpsnr_list)
    np.save(os.path.join(save_model_path, "mpsnr_per_epoch.npy"),
            mpsnr_list_numpy)
    tb_writer.close()
def main(args):
    model = load_config(args.model)
    dataset = load_config(args.dataset)

    device = torch.device('cuda' if model['common']['cuda'] else 'cpu')

    if model['common']['cuda'] and not torch.cuda.is_available():
        sys.exit('Error: CUDA requested but not available')

    # if args.batch_size < 2:
    #     sys.exit('Error: PSPNet requires more than one image for BatchNorm in Pyramid Pooling')

    os.makedirs(model['common']['checkpoint'], exist_ok=True)

    num_classes = len(dataset['common']['classes'])
    net = UNet(num_classes).to(device)

    if args.resume:
        path = os.path.join(model['common']['checkpoint'], args.resume)

        cuda = model['common']['cuda']

        def map_location(storage, _):
            return storage.cuda() if cuda else storage.cpu()

        chkpt = torch.load(path, map_location=map_location)
        net.load_state_dict(chkpt)
        resume_at_epoch = int(args.resume[11:16])
    else:
        resume_at_epoch = 0

    if model['common']['cuda']:
        torch.backends.cudnn.benchmark = True
        net = DataParallel(net)

    optimizer = SGD(net.parameters(), lr=model['opt']['lr'], momentum=model['opt']['momentum'])

    scheduler = MultiStepLR(optimizer, milestones=model['opt']['milestones'], gamma=model['opt']['gamma'])

    weight = torch.Tensor(dataset['weights']['values'])

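    # replay the scheduler to restore the learning rate for the resumed epoch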
    for i in range(resume_at_epoch):
        scheduler.step()

    criterion = CrossEntropyLoss2d(weight=weight).to(device)
    # criterion = FocalLoss2d(weight=weight).to(device)

    train_loader, val_loader = get_dataset_loaders(model, dataset)

    num_epochs = model['opt']['epochs']

    history = collections.defaultdict(list)

    for epoch in range(resume_at_epoch, num_epochs):
        print('Epoch: {}/{}'.format(epoch + 1, num_epochs))

        train_hist = train(train_loader, num_classes, device, net, optimizer, scheduler, criterion)
        print('Train loss: {:.4f}, mean IoU: {:.4f}'.format(train_hist['loss'], train_hist['iou']))

        for k, v in train_hist.items():
            history['train ' + k].append(v)

        val_hist = validate(val_loader, num_classes, device, net, criterion)
        print('Validate loss: {:.4f}, mean IoU: {:.4f}'.format(val_hist['loss'], val_hist['iou']))

        for k, v in val_hist.items():
            history['val ' + k].append(v)

        visual = 'history-{:05d}-of-{:05d}.png'.format(epoch + 1, num_epochs)
        plot(os.path.join(model['common']['checkpoint'], visual), history)

        checkpoint = 'checkpoint-{:05d}-of-{:05d}.pth'.format(epoch + 1, num_epochs)
        torch.save(net.state_dict(), os.path.join(model['common']['checkpoint'], checkpoint))