Example #1
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        model = custom_models.__dict__[args.arch]([2, 2, 2, 2],
                                                  pooling_type='max',
                                                  in_chns=1,
                                                  num_classes=12,
                                                  inplanes=64)
        # print("=> creating model '{}'".format(args.arch))
        # model = models.__dict__[args.arch]()

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    args.mean = [0.5, 0.5, 0.5]
    args.std = [0.5, 0.5, 0.5]

    trans_funcs = []

    val_loader = torch.utils.data.DataLoader(get_val_dataset(
        args.data_dir + '/img/', args.data_dir + '/gt/',
        args.data_dir + '/all_imgs.txt', args.test_inds, trans_funcs,
        args.mean, args.std, args.target_size),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    train_dataset = get_train_dataset(args.data_dir + '/img/',
                                      args.data_dir + '/gt/',
                                      args.data_dir + '/all_imgs.txt',
                                      args.test_inds, trans_funcs, args.mean,
                                      args.std, args.target_size)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
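
The launcher that spawns main_worker is not part of this snippet; a minimal sketch of how such a worker is typically driven with torch.multiprocessing (the parser and remaining flags are assumed to be defined elsewhere, as in the stock ImageNet example) could look like this:

import torch
import torch.multiprocessing as mp


def launch(args):
    # hypothetical driver: one worker process per GPU when multiprocessing is requested
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # world_size becomes the total number of processes across all nodes
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(main_worker, nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # single-process path; main_worker handles args.gpu being None
        main_worker(args.gpu, ngpus_per_node, args)
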
Example #2
def main():
  if not torch.cuda.is_available():
    logging.info('no gpu device available')
    sys.exit(1)

  np.random.seed(args.seed)
  torch.cuda.set_device(args.gpu)
  cudnn.benchmark = True
  torch.manual_seed(args.seed)
  cudnn.enabled = True
  torch.cuda.manual_seed(args.seed)
  logging.info('gpu device = %d' % args.gpu)
  logging.info("args = %s", args)

  criterion = nn.CrossEntropyLoss()
  criterion = criterion.cuda()
  model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion)
  model = model.cuda()
  logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

  optimizer = torch.optim.SGD(
      model.parameters(),
      args.learning_rate,
      momentum=args.momentum,
      weight_decay=args.weight_decay)

  train_data = get_train_dataset(args.set, args)

  num_train = len(train_data) // 1
  indices = list(range(num_train))
  split = int(np.floor(args.train_portion * num_train))

  train_queue = torch.utils.data.DataLoader(
      train_data, batch_size=args.batch_size,
      sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
      pin_memory=True, num_workers=8)

  valid_queue = torch.utils.data.DataLoader(
      train_data, batch_size=args.batch_size,
      sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
      pin_memory=True, num_workers=8)

  scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

  architect = Architect(model, args)

  for epoch in range(args.epochs):
    lr = scheduler.get_lr()[0]
    logging.info('epoch %d lr %e', epoch, lr)

    genotype = model.genotype()
    logging.info('genotype = %s', genotype)

    #print(F.softmax(model.alphas_normal, dim=-1))
    #print(F.softmax(model.alphas_reduce, dim=-1))

    # training
    train_acc, train_obj = train(train_queue, valid_queue, model, architect, criterion, optimizer, lr, epoch)
    logging.info('train_acc %f', train_acc)

    # validation
    if args.epochs - epoch <= 1:
      valid_acc, valid_obj = infer(valid_queue, model, criterion)
      logging.info('valid_acc %f', valid_acc)

    utils.save(model, os.path.join(args.save, 'weights.pt'))
    scheduler.step()
  print('Experiment Dir:', args.save)
Example #3
        variable_parameters *= dim.value
    print(variable_parameters)
    total_parameters += variable_parameters
print('total params: ')
print(total_parameters)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
lin_net.crf_feature_net.overwrite_init(sess)

summary = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(
    os.path.join(ARGS.logdir_path, 'summary'),
    sess.graph,
)
dataset_reader = RandDatasetReader(get_train_dataset(ARGS.hdr_prefix), b)

for it in range(ARGS.it_num):
    print(it)
    if it == 0 or it % 10000 == 9999:
        print('start save')
        checkpoint_path = os.path.join(ARGS.logdir_path, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=it)
        print('finish save')
    hdr_val, crf_val, invcrf_val, t_val = dataset_reader.read_batch_data()
    _, summary_val = sess.run(
        [train_op, summary], {
            hdr: hdr_val,
            crf: crf_val,
            invcrf: invcrf_val,
            t: t_val,
Example #4
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, config.VAL.IMG_PER_GPU, drop_last=False)

    loader = torch.utils.data.DataLoader(subset,
                                         num_workers=config.VAL.NUM_WORKERS,
                                         batch_sampler=batch_sampler)

    return loader


if __name__ == '__main__':
    from easydict import EasyDict as edict
    from exps.baseline.config import config

    from dataset import get_train_dataset, get_val_dataset
    dataset = get_train_dataset(config)
    # dataset = get_val_dataset(config)

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    torch.distributed.init_process_group(backend="nccl", init_method='env://')

    # loader = train_loader(dataset, config)
    loader = val_loader(dataset, config, 0, 2)

    iter_loader = iter(loader)
    if args.local_rank == 0:
        lr, hr = next(iter_loader)
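
The subset and sampler used by this loader are built earlier in the original file and are not shown. A minimal sketch of a comparable per-rank validation loader (the round-robin sharding below is an assumption for illustration, not the original implementation) might be:

from torch.utils.data import DataLoader, Subset, SequentialSampler
from torch.utils.data.sampler import BatchSampler


def make_val_loader(dataset, config, rank, world_size):
    # hypothetical reconstruction: give each rank every world_size-th sample
    subset = Subset(dataset, list(range(rank, len(dataset), world_size)))
    sampler = SequentialSampler(subset)
    batch_sampler = BatchSampler(sampler, config.VAL.IMG_PER_GPU,
                                 drop_last=False)
    return DataLoader(subset,
                      num_workers=config.VAL.NUM_WORKERS,
                      batch_sampler=batch_sampler)
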
Example #5
               len(self.text_map), len(
                   self.code_map), self.stats['total_same_input_tests'],
               self.stats['total_same_other_tests']))
        if self.args.show_tags:
            for name, value in self.task_types_stats.iteritems():
                print("%s: %d" % (name, value))
            for name, value in self.tags_stats.iteritems():
                print("%s: %d" % (name, value))


def report_stats(args, dataset):
    ds = DatasetStats(args)
    for example in dataset.data:
        ds.update(example)
    ds.display()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Data Statistics')
    parser.add_argument('--dataset', type=str, default='karel')
    parser.add_argument('--dataset_max_size', type=int, default=0)
    parser.add_argument('--dataset_max_code_length', type=int, default=0)
    parser.add_argument('--show-tags', action='store_true', default=False)
    parser.add_argument('--vocab_min_freq', type=int, default=50)
    args, _ = parser.parse_known_args(sys.argv)

    import dataset
    args.batch_size = 1
    train_dataset = dataset.get_train_dataset(args)
    report_stats(args, train_dataset)
Example #6
def main(args):
    parser = argparse.ArgumentParser(description='Variational AutoEncoders')
    parser.add_argument('data_dir', help='path to training data')
    parser.add_argument('--test_inds',
                        type=int,
                        nargs='+',
                        help='inds test participants')
    parser.add_argument('--test_file',
                        type=str,
                        help='path to a file containing test inds')
    parser.add_argument('--target-size', default=260, type=int)
    parser.add_argument('-j',
                        '--workers',
                        default=4,
                        type=int,
                        help='number of data loading workers (default: 4)')
    parser.add_argument('--pred',
                        type=str,
                        default=None,
                        help='Only prediction')
    model_parser = parser.add_argument_group('Model Parameters')
    model_parser.add_argument('--model',
                              default='vqvae',
                              choices=['vae', 'vqvae', 'resnet'],
                              help='autoencoder variant to use: vae | vqvae')
    model_parser.add_argument(
        '--batch-size',
        type=int,
        default=4,
        metavar='N',
        help='input batch size for training (default: 4)')
    model_parser.add_argument('--hidden',
                              type=int,
                              metavar='N',
                              help='number of hidden channels')
    model_parser.add_argument('-k',
                              '--dict-size',
                              type=int,
                              dest='k',
                              metavar='K',
                              help='number of atoms in dictionary')
    model_parser.add_argument('-kl',
                              '--kl',
                              type=int,
                              dest='kl',
                              default=None,
                              help='length of vectors in embedded space')
    model_parser.add_argument('--lr',
                              type=float,
                              default=None,
                              help='learning rate')
    model_parser.add_argument('--vq_coef',
                              type=float,
                              default=None,
                              help='vq coefficient in loss')
    model_parser.add_argument('--commit_coef',
                              type=float,
                              default=None,
                              help='commitment coefficient in loss')
    model_parser.add_argument('--kl_coef',
                              type=float,
                              default=None,
                              help='kl-divergence coefficient in loss')
    model_parser.add_argument('--gabor_layer',
                              action='store_true',
                              default=False,
                              help='using gabor like layer')
    parser.add_argument('--resume',
                        type=str,
                        default=None,
                        help='The path to resume.')

    training_parser = parser.add_argument_group('Training Parameters')
    training_parser.add_argument(
        '--dataset',
        default='custom',
        choices=['mnist', 'cifar10', 'imagenet', 'coco', 'custom'],
        help='dataset to use: mnist | cifar10 | imagenet | coco | custom')
    training_parser.add_argument(
        '--dataset_dir_name',
        default='',
        help='name of the dir containing the dataset if dataset == custom')
    training_parser.add_argument('--data-dir',
                                 default='/media/ssd/Datasets',
                                 help='directory containing the dataset')
    training_parser.add_argument(
        '--epochs',
        type=int,
        default=20,
        metavar='N',
        help='number of epochs to train (default: 20)')
    training_parser.add_argument('--max-epoch-samples',
                                 type=int,
                                 default=50000,
                                 help='max num of samples per epoch')
    training_parser.add_argument('--no-cuda',
                                 action='store_true',
                                 default=False,
                                 help='enables CUDA training')
    training_parser.add_argument('--seed',
                                 type=int,
                                 default=1,
                                 metavar='S',
                                 help='random seed (default: 1)')
    training_parser.add_argument('--gpus',
                                 default='0',
                                 help='gpus used for training - e.g 0,1,3')

    logging_parser = parser.add_argument_group('Logging Parameters')
    logging_parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    logging_parser.add_argument('--results-dir',
                                metavar='RESULTS_DIR',
                                default='./results',
                                help='results dir')
    logging_parser.add_argument('--save-name', default='', help='saved folder')
    logging_parser.add_argument('--data-format',
                                default='json',
                                help='in which format to save the data')
    model_parser.add_argument('--backbone',
                              type=str,
                              default=None,
                              nargs='+',
                              help='details of backbone')

    args = parser.parse_args(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    lr = args.lr or default_hyperparams[args.dataset]['lr']
    k = args.k or default_hyperparams[args.dataset]['k']
    hidden = args.hidden or default_hyperparams[args.dataset]['hidden']
    num_channels = dataset_n_channels[args.dataset]

    save_path = ex_util.setup_logging_from_args(args)
    writer = SummaryWriter(save_path)

    # if test file is specified use it for selecting test train sets
    if args.test_file is not None:
        args.test_inds = args.test_file

    args.inv_func = None

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed_all(args.seed)
        args.gpus = [int(i) for i in args.gpus.split(',')]
        torch.cuda.set_device(args.gpus[0])
        cudnn.benchmark = True
        torch.cuda.manual_seed(args.seed)

    if args.model == 'resnet':
        backbone = {
            'arch_name': args.backbone[0],
            'layer_name': args.backbone[1]
        }
        if len(args.backbone) > 2:
            backbone['weights_path'] = args.backbone[2]
        model = models[args.dataset][args.model](hidden,
                                                 k=k,
                                                 kl=args.kl,
                                                 num_channels=num_channels,
                                                 gabor_layer=args.gabor_layer,
                                                 backbone=backbone)
    else:
        model = models[args.dataset][args.model](hidden,
                                                 k=k,
                                                 kl=args.kl,
                                                 num_channels=num_channels,
                                                 gabor_layer=args.gabor_layer)
    if args.resume is not None:
        weights = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(weights)
    if args.cuda:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, int(args.epochs / 3), 0.5)

    # NOTE: right now there's no additional transformation function
    trans_funcs = []
    # normalisation
    args.mean = [0.5, 0.5, 0.5]
    args.std = [0.5, 0.5, 0.5]

    in_chns = 1
    if args.model == 'resnet':
        in_chns = 3

    val_dataset = get_val_dataset(args.data_dir + '/img/',
                                  args.data_dir + '/gt/',
                                  args.data_dir + '/all_imgs.txt',
                                  args.test_inds,
                                  trans_funcs,
                                  args.mean,
                                  args.std,
                                  args.target_size,
                                  chns=in_chns)

    # NOTE: shuffle is False
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=None)

    if args.pred is not None:
        weights = torch.load(args.pred, map_location='cpu')
        model.load_state_dict(weights)
        model.cuda()
        predict_net(model, val_loader, save_path, args)
        return

    train_dataset = get_train_dataset(args.data_dir + '/img/',
                                      args.data_dir + '/gt/',
                                      args.data_dir + '/all_imgs.txt',
                                      args.test_inds,
                                      trans_funcs,
                                      args.mean,
                                      args.std,
                                      args.target_size,
                                      chns=in_chns)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=None)

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)
    for epoch in range(1, args.epochs + 1):
        train_losses = train(epoch, model, train_loader, optimizer, args.cuda,
                             args.log_interval, save_path, args, writer)
        test_losses = test_net(epoch, model, val_loader, args.cuda, save_path,
                               args, writer)
        ex_util.save_checkpoint(model, epoch, save_path)

        for k in train_losses.keys():
            name = k.replace('_train', '')
            train_name = k
            test_name = k.replace('train', 'test')
            writer.add_scalars(
                name, {
                    'train': train_losses[train_name],
                    'test': test_losses[test_name],
                }, epoch)
        scheduler.step()
Example #7
def main(opts):
    """Main function for the training pipeline
    :opts: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)
    os.makedirs(os.path.join(log_dir, opts.run_name))

    pprint(vars(opts))
    with open(os.path.join(log_dir, opts.run_name, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)
    random.seed(opts.seed)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation  #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    if opts.train_mode == 'combined':
        train_dataset = get_train_dataset(opts.data_root, opts, opts.folder1,
                                          opts.folder2, opts.folder3)
    elif opts.train_mode == 'oversampling':
        train_dataset = get_train_dataset_by_oversampling(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
    elif opts.train_mode == 'pretrain_and_finetune':
        train_dataset, finetune_dataset = get_pretrain_and_finetune_datast(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
        finetune_loader = torch.utils.data.DataLoader(
            finetune_dataset,
            batch_size=opts.batch_size,
            num_workers=opts.num_workers,
            drop_last=False,
            shuffle=True)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=opts.num_workers,
                                               drop_last=False,
                                               shuffle=True)

    val_dataset = get_val_dataset(os.path.join('data', 'val'), opts)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.eval_batch_size,
                                             shuffle=False,
                                             num_workers=opts.num_workers,
                                             drop_last=False)

    test_dataset = get_test_dataset(os.path.join('data', 'test'), opts)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=opts.eval_batch_size,
                                              shuffle=False,
                                              num_workers=opts.num_workers,
                                              drop_last=False)

    assert train_dataset.class_to_idx == val_dataset.class_to_idx == test_dataset.class_to_idx, "Mapping not correct"

    model = get_model(opts)

    opts.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if torch.cuda.device_count() > 1 and not opts.no_data_parallel:
        model = nn.DataParallel(model)

    model = model.to(opts.device)

    optimizer = optim.RMSprop(model.parameters(),
                              lr=opts.lr,
                              alpha=0.9,
                              weight_decay=1e-5,
                              momentum=0.9)
    scheduler = get_lr_scheduler(optimizer, opts)

    best_val_loss = float('inf')
    best_val_accu = float(0)
    best_val_rec = float(0)
    best_val_prec = float(0)
    best_val_f1 = float(0)
    best_val_auc = float(0)

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, optimizer,
                                               opts)

        if epoch == opts.finetune_epoch and opts.train_mode == 'pretrain_and_finetune':
            train_loader = finetune_loader
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=opts.lr,
                                      alpha=0.9,
                                      weight_decay=1e-5,
                                      momentum=0.9)
            scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer,
                step_size=opts.step_size_finetuning,
                gamma=opts.gamma)

        # Run the validation set
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, opts)

        ##############################
        #  Write to summary writer   #
        ##############################

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        train_rec, val_rec = train_metric['recalls'], val_metric['recalls']
        train_prec, val_prec = train_metric['precisions'], val_metric[
            'precisions']
        train_f1, val_f1 = train_metric['f1'], val_metric['f1']
        train_auc, val_auc = train_metric['auc'], val_metric['auc']

        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_acc, epoch)
        writer.add_scalar('Precision/Train', train_prec, epoch)
        writer.add_scalar('Recall/Train', train_rec, epoch)
        writer.add_scalar('F1/Train', train_f1, epoch)
        writer.add_scalar('AUC/Train', train_auc, epoch)

        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_acc, epoch)
        writer.add_scalar('Precision/Val', val_prec, epoch)
        writer.add_scalar('Recall/Val', val_rec, epoch)
        writer.add_scalar('F1/Val', val_f1, epoch)
        writer.add_scalar('AUC/Val', val_auc, epoch)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler in ['step', 'cosine']:
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(epoch, opts.epochs, train_loss, val_loss, delta,
                             train_metric, val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(opts.save_model):
                torch.save(
                    model.state_dict(),
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if val_rec > best_val_rec:
            best_val_rec = val_rec

        if val_prec > best_val_prec:
            best_val_prec = val_prec

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            print(f'The best validation F1-score is now {best_val_f1}')
            print(
                f'The validation accuracy and AUC are now {val_acc} and {val_auc}'
            )

        if val_auc > best_val_auc:
            best_val_auc = val_auc

        if iteration_change_loss == opts.patience and opts.early_stopping:
            print(
                ('Early stopping after {0} iterations without the decrease ' +
                 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print(f'training took {t_end_training - t_start_training}s')
    print(f'Best validation accuracy: {best_val_accu}')
    print(f'Best validation loss: {best_val_loss}')
    print(f'Best validation precision: {best_val_prec}')
    print(f'Best validation recall: {best_val_rec}')
    print(f'Best validation f1: {best_val_f1}')
    print(f'Best validation AUC: {best_val_auc}')

    with torch.no_grad():
        if opts.train_mode in ['combined', 'oversampling']:
            model.load_state_dict(
                torch.load(
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth')))
        test_loss, test_metric = evaluate_model(model, test_loader, opts)

    print(f'The best test F1: {test_metric["f1"]}')
    print(f'The best test auc: {test_metric["auc"]}')
    print(f'The best test accuracy: {test_metric["accuracy"]}')
Example #8
def main(opts):
    """Main function for the training pipeline
    :opts: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                             Basic settings                             #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation  #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    train_dataset = get_train_dataset(root=os.path.join('data', 'train'))
    weights = make_weights_for_balanced_classes(train_dataset.imgs,
                                                len(train_dataset.classes))
    weights = torch.DoubleTensor(weights)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, len(weights))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=6,
                                               drop_last=False,
                                               sampler=sampler)

    val_dataset = get_val_dataset(root=os.path.join('data', 'val'))
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.batch_size,
                                             shuffle=False,
                                             num_workers=6,
                                             drop_last=False)

    assert train_dataset.class_to_idx == val_dataset.class_to_idx, "Mapping not correct"

    model = load_baseline(n_classes=2)

    if torch.cuda.is_available():
        model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=opts.lr, weight_decay=0.1)

    if opts.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               patience=3,
                                                               factor=.3,
                                                               threshold=1e-4,
                                                               verbose=True)
    elif opts.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=opts.gamma)

    best_val_loss = float('inf')
    best_val_accu = float(0)

    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                           Main training loop                           #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, epoch,
                                               opts.epochs, optimizer, writer,
                                               current_lr, opts.log_every)

        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, epoch,
                                                  opts.epochs, writer,
                                                  current_lr)

        ##############################
        #  Write to summary writer   #
        ##############################

        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Train', train_metric['precisions'], epoch)
        writer.add_scalar('Recall/Train', train_metric['recalls'], epoch)
        writer.add_scalar('F1/Train', train_metric['f1'], epoch)

        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Val', val_metric['precisions'], epoch)
        writer.add_scalar('Recall/Val', val_metric['recalls'], epoch)
        writer.add_scalar('F1/Val', val_metric['f1'], epoch)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(train_loss, val_loss, delta, train_metric,
                             val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        # file_name = ('val_acc_{}_train_acc_{}_epoch_{}.pth'.
        #              format(train_acc, val_acc, epoch))
        # torch.save(model, os.path.join(model_dir, opts.run_name, file_name))

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(opts.save_model):
                torch.save(model,
                           os.path.join(model_dir, opts.run_name, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if iteration_change_loss == opts.patience and opts.early_stopping:
            print(
                ('Early stopping after {0} iterations without the decrease ' +
                 'of the val loss').format(iteration_change_loss))
            break
    t_end_training = time.time()
    print('training took {}s'.format(t_end_training - t_start_training))
Example #9
weight_decay = 1e-3
norm = None
num_epochs = 350
cuda_if = True

## Tensorboard
writer = None
#writer = SummaryWriter(log_dir='board_view/simple_endecoder/whole_data')

## Stage 2 Letter Data Set
persons = read_rawdata_to_person(data_path,
                                 all_data,
                                 millisec=1000,
                                 resample=resample)
dataset = data_split(persons, alpha=alpha)
trainset = get_train_dataset(dataset)
testset = get_test_dataset(dataset)

### write own dataloader
trainloader = t.utils.data.DataLoader(trainset,
                                      batch_size=batch_size,
                                      shuffle=True)
testloader = t.utils.data.DataLoader(testset,
                                     batch_size=batch_size,
                                     shuffle=True)
## Model Setting
model = Cnn_LstmNet(10, 32)
# model = ResNet_lstm(BasicBlock, [2, 2, 1, 1])

## Loss function and Optimizer
criterion = t.nn.CrossEntropyLoss()
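
The snippet stops right after the loss definition; a minimal training loop over the objects defined above (the Adam optimizer, the learning rate, and the (input, target) batch unpacking are assumptions made for illustration) would be along these lines:

optimizer = t.optim.Adam(model.parameters(), lr=1e-3,
                         weight_decay=weight_decay)

for epoch in range(num_epochs):
    for seqs, labels in trainloader:  # assumed (input, target) batches
        optimizer.zero_grad()
        outputs = model(seqs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
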
Example #10
def main():
    if device != 'cuda':
        logging.info('no gpu device available')
        sys.exit(1)
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    # setup criterion, model
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    label_criterion = nn.NLLLoss().to(device)
    domain_criterion = nn.NLLLoss().to(device)
    #model = NetworkFE(args.init_channels, NUM_CLASSES, args.layers, criterion)
    model = DANN(args.init_channels, NUM_CLASSES, args.layers, label_criterion,
                 domain_criterion)
    model = model.to(device)
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    # TODO: setup right optimizer
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    #optimizer = torch.optim.Adam(
    #        model.parameters(),
    #        args.learning_rate )

    src_train_data = get_train_dataset(args.src_set, args)
    tgt_train_data = get_train_dataset(args.tgt_set, args)

    num_train = min(len(src_train_data), len(tgt_train_data)) // 1
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))

    # DataLoader for src,tgt training data
    src_train_queue = torch.utils.data.DataLoader(
        src_train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=8)
    tgt_train_queue = torch.utils.data.DataLoader(
        tgt_train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=8)

    # DataLoader for src,tgt validation data
    src_valid_queue = torch.utils.data.DataLoader(
        src_train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:num_train]),
        pin_memory=True,
        num_workers=8)
    tgt_valid_queue = torch.utils.data.DataLoader(
        tgt_train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(
            indices[split:num_train]),
        pin_memory=True,
        num_workers=8)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    # Use architect for domain adaptation
    architect = ArchitectDA(model, args)

    # import pdb; pdb.set_trace()
    # main loop
    for epoch in range(args.epochs):
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)
        genotype = model.genotype()
        logging.info('genotype = %s', genotype)

        # train step
        train_acc, train_obj = train(src_train_queue, tgt_train_queue,
                                     src_valid_queue, tgt_valid_queue, model,
                                     architect, label_criterion,
                                     domain_criterion, optimizer, lr, epoch)
        logging.info('train_acc %f', train_acc)

        # validation only on last epoch
        # if args.epochs-epoch<=1:
        src_val_acc, tgt_val_acc, valid_obj = infer(src_valid_queue,
                                                    tgt_valid_queue, model,
                                                    label_criterion,
                                                    domain_criterion)
        logging.info('src_val_acc %f tgt_val_acc %f', src_val_acc, tgt_val_acc)

        # save model
        utils.save(model, os.path.join(args.save, 'weights.pt'))
        scheduler.step()

    print('Experiment Dir:', args.save)