Esempio n. 1
0
  def __init__(self, opt):
    """Build the detector: pick a device, load the network, cache options.

    Args:
      opt: parsed options object; read for device/arch/weights/normalization.
    """
    # A non-negative first gpu id means CUDA is requested.
    device_name = 'cuda' if opt.gpus[0] >= 0 else 'cpu'
    opt.device = torch.device(device_name)

    print('Creating model...')
    net = create_model(opt.arch, opt.heads, opt.head_conv)
    net = load_model(net, opt.load_model)
    self.model = net.to(opt.device)
    self.model.eval()  # detector is inference-only

    # Per-channel normalization constants, broadcastable over HxWx3 images.
    self.mean = np.array(opt.mean, dtype=np.float32).reshape(1, 1, 3)
    self.std = np.array(opt.std, dtype=np.float32).reshape(1, 1, 3)
    self.max_per_image = 100  # cap on detections kept per image
    self.num_classes = opt.num_classes
    self.scales = opt.test_scales
    self.opt = opt
    self.pause = True
Esempio n. 2
0
def main(argv):
    """Run the training script with command line arguments @argv."""
    args = parse_args(argv)

    # Encode the run configuration in the checkpoint directory name:
    # '_cos' -> cosine-linear classifier head, '_rfn' -> label refinery used.
    suffix = '_cos' if args.coslinear else ''
    if args.label_refinery_model is not None:
        suffix += '_rfn'
    save_dir = os.path.join(args.save, args.model + suffix + '_' + name_time)
    # makedirs also creates missing parents (os.mkdir would fail if args.save
    # did not exist yet); exist_ok tolerates reruns into the same directory.
    os.makedirs(save_dir, exist_ok=True)
    utils.general_setup(save_dir, args.gpus)

    logging.info("Arguments parsed.\n{}".format(pprint.pformat(vars(args))))

    # Train and validation data loaders over CIFAR-100.
    train_loader = mul_cifar100.mul_CIFAR100DataLoader(
        root=args.data_dir, image_size=32, train=True,
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    # NOTE(review): shuffle=True on the validation loader mirrors the original
    # code; validation metrics do not depend on sample order.
    val_loader = mul_cifar100.mul_CIFAR100DataLoader(
        root=args.data_dir, image_size=32, train=False,
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

    # Create model with optional label refinery.
    model, loss = model_factory.create_model(
        args.model, args.model_state_file, args.gpus, args.label_refinery_model,
        args.label_refinery_state_file, args.coslinear, args.s)

    # Fall back to the model's built-in LR schedule when none is supplied.
    if args.lr_regime is None:
        lr_regime = model.LR_REGIME
    else:
        lr_regime = args.lr_regime
    regime = LearningRateRegime(lr_regime)
    # Train and test for needed number of epochs.
    optimizer = create_optimizer(model, args.momentum, args.weight_decay)
    for epoch in range(1, regime.num_epochs + 1):
        _set_learning_rate(optimizer, regime.get_lr(epoch))
        train_for_one_epoch(model, loss, train_loader, optimizer, epoch)
        test.test_for_one_epoch(model, loss, val_loader, epoch)
        save_checkpoint(save_dir, model, optimizer, epoch)
def main(argv):
    """Run the training script with command line arguments @argv."""
    args = parse_args(argv)
    utils.general_setup(args.save, args.gpus)

    logging.info("Arguments parsed.\n{}".format(pprint.pformat(vars(args))))

    # Data loaders for the training and validation splits of ImageNet.
    train_loader = imagenet.get_train_loader(
        args.imagenet, args.batch_size, args.num_workers, args.image_size)
    val_loader = imagenet.get_val_loader(
        args.imagenet, args.batch_size, args.num_workers, args.image_size)

    # Student model plus its training loss; teacher models are optional.
    model, loss = model_factory.create_model(
        args.model, args.student_state_file, args.gpus,
        args.teacher_model, args.teacher_state_file)
    logging.info("Model:\n{}".format(model))

    discriminator_loss, update_parameters = create_discriminator_criterion(
        args)

    # Use the CLI-provided schedule when given, otherwise the model's default.
    lr_regime = model.LR_REGIME if args.lr_regime is None else args.lr_regime
    regime = LearningRateRegime(lr_regime)

    # Train and evaluate for the requested epoch range, checkpointing each one.
    optimizer = create_optimizer(model, update_parameters, args.momentum,
                                 args.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        _set_learning_rate(optimizer, regime.get_lr(epoch))
        train_for_one_epoch(model, loss, discriminator_loss, train_loader,
                            optimizer, epoch)
        test.test_for_one_epoch(model, loss, val_loader, epoch)
        save_checkpoint(args.save, model, optimizer, epoch)
Esempio n. 4
0
def main(opt):
  """CenterNet-style training entry point.

  Builds dataset/model/trainer from *opt*, then either runs a single
  validation pass (opt.test), reports model FLOPs/params (opt.export_onnx),
  or runs the full training loop with periodic validation, checkpointing
  and stepwise learning-rate decay.
  """
  torch.manual_seed(opt.seed)
  # cudnn autotuning only pays off for fixed-size inputs during training.
  torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
  Dataset = get_dataset(opt.dataset, opt.task)
  # Fold dataset metadata (heads, class counts, ...) back into opt.
  opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
  print(opt)

  logger = Logger(opt)

  os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
  opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')
  
  print('Creating model...')
  # Model creation -> pick backbone, heads, and head convolution 
  model = create_model(opt.arch, opt.heads, opt.head_conv)
  optimizer = torch.optim.Adam(model.parameters(), opt.lr)
  start_epoch = 0
  if opt.load_model != '':
    # Resuming also restores optimizer state and the starting epoch.
    model, optimizer, start_epoch = load_model(
      model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

  Trainer = train_factory[opt.task]
  trainer = Trainer(opt, model, optimizer)
  trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

  print('Setting up data...')
  # Validation always runs with batch size 1, single worker, no shuffling.
  val_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'val'), 
      batch_size=1, 
      shuffle=False,
      num_workers=1,
      pin_memory=True
  )

  if opt.test:
    # Evaluation-only mode: one val pass, then dataset-specific metric eval.
    _, preds = trainer.val(0, val_loader)
    val_loader.dataset.run_eval(preds, opt.save_dir)
    return

  if opt.export_onnx: 
    print('Exporting onnx model')

    # TODO: adapt the input size to the onnx 
    width   = opt.input_res
    height  = opt.input_res

    # create a dummy input that would be used to export the model
    #dummy_input = torch.randn(10, 3, width, height, device='cuda')

    # this method does not support variable input sizes 
    #torch.onnx.export(model, dummy_input, 
    #                  os.path.join(opt.save_dir, 'model.onnx'), 
    #                  verbose=True)

    # NOTE(review): despite the messages, only FLOPs/params are profiled
    # here — the actual ONNX export above is commented out.
    flops, params = profile(model, input_size=(1,3,width, height), device='cuda')
    print(width, height, flops, params)
    print('Model exported. Done!')
    return


  train_loader = torch.utils.data.DataLoader(
      Dataset(opt, 'train'), 
      batch_size=opt.batch_size, 
      shuffle=True,
      num_workers=opt.num_workers,
      pin_memory=True,
      drop_last=True
  )

  print('Starting training...')
  best = 1e10
  for epoch in range(start_epoch + 1, opt.num_epochs + 1):
    # 'last' overwrites a single rolling checkpoint unless save_all is set.
    mark = epoch if opt.save_all else 'last'
    log_dict_train, _ = trainer.train(epoch, train_loader)
    logger.write('epoch: {} |'.format(epoch))
    for k, v in log_dict_train.items():
      logger.scalar_summary('train_{}'.format(k), v, epoch)
      logger.write('{} {:8f} | '.format(k, v))
    if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)), 
                 epoch, model, optimizer)
      with torch.no_grad():
        log_dict_val, preds = trainer.val(epoch, val_loader)
      for k, v in log_dict_val.items():
        logger.scalar_summary('val_{}'.format(k), v, epoch)
        logger.write('{} {:8f} | '.format(k, v))
      # Lower metric is better; the 'best' checkpoint omits optimizer state.
      if log_dict_val[opt.metric] < best:
        best = log_dict_val[opt.metric]
        save_model(os.path.join(opt.save_dir, 'model_best.pth'), 
                   epoch, model)
    else:
      save_model(os.path.join(opt.save_dir, 'model_last.pth'), 
                 epoch, model, optimizer)
    logger.write('\n')
    if epoch in opt.lr_step:
      # Snapshot at each LR-drop epoch, then decay LR by 10x per step taken.
      save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)), 
                 epoch, model, optimizer)
      lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
      print('Drop LR to', lr)
      for param_group in optimizer.param_groups:
          param_group['lr'] = lr
  logger.close()
Esempio n. 5
0
def train(params):
    """Train the trajectory/intention model described by *params*.

    Builds loaders, model, losses, optimizer and LR schedule, trains for the
    configured number of epochs, checkpoints after every epoch, and tracks
    the best validation/test results. Returns the trained model.
    """
    train_params = params.train_param()

    train_loader, valid_loader, test_loader, train_params = get_data_loader(
        train_params, mode='train')
    # Persist loader-derived parameter updates back into the config object.
    params._save_overwrite_parameters(params_key='train_param',
                                      params_value=train_params)

    # Dataset speed statistics, used downstream for input normalization.
    train_params['data_mean'] = torch.tensor(
        train_params['data_stats']['speed_mean'],
        dtype=torch.float).unsqueeze(0).to(device)
    train_params['data_std'] = torch.tensor(
        train_params['data_stats']['speed_std'],
        dtype=torch.float).unsqueeze(0).to(device)

    model = create_model(params)
    model = model.to(device)

    # Trajectory regression loss + label-smoothed intention classification.
    criterion_traj = torch.nn.MSELoss(reduction='mean').to(device)
    criterion_intend = CrossEntropyLoss(
        class_num=train_params['class_num'],
        label_smooth=train_params['label_smooth']).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=train_params['lr'])

    scheduler = get_lr_schedule(train_params['lr_schedule'], train_params,
                                optimizer)

    best_result = {
        'valid_acc': 0,
        'valid_mse': 99999,
        'test_acc': 0,
        'test_mse': 99999,
        'epoch': 0
    }
    print('begin to train')
    for epoch in range(1, train_params['epochs'] + 1):
        for i, data in enumerate(train_loader, 0):
            # Fixed idiom: `True if cond else False` -> the boolean itself.
            print_result = i % train_params['print_step'] == 0
            train_on_batch(data,
                           model,
                           optimizer,
                           criterion_traj,
                           criterion_intend,
                           params=train_params,
                           print_result=print_result,
                           epoch=epoch,
                           iter=i)

        # NOTE(review): torch.save on the whole model pickles the class;
        # prefer state_dict checkpoints if cross-version loading matters.
        save_model_path = os.path.join(train_params['save_dir'],
                                       'model_%d.pkl' % (epoch))
        torch.save(model, save_model_path)
        print('save model to', save_model_path)

        # Evaluate in eval mode on both splits, then restore train mode.
        model.eval()
        valid_acc, valid_mse = evaluate(model,
                                        valid_loader,
                                        criterion_traj,
                                        criterion_intend,
                                        params=train_params,
                                        epoch=epoch,
                                        mark='valid')
        test_acc, test_mse = evaluate(model,
                                      test_loader,
                                      criterion_traj,
                                      criterion_intend,
                                      params=train_params,
                                      epoch=epoch,
                                      mark='test')
        model.train()
        # A new best on either validation metric refreshes all tracked stats.
        if valid_mse < best_result['valid_mse'] or valid_acc > best_result[
                'valid_acc']:
            best_result['valid_mse'] = valid_mse
            best_result['valid_acc'] = valid_acc
            best_result['test_mse'] = test_mse
            best_result['test_acc'] = test_acc
            best_result['epoch'] = epoch

        if scheduler is not None:
            scheduler.step(epoch)

    print('Best Results (epoch %d):' % best_result['epoch'])
    print(
        'validation_acc = %f, validation_mse = %f, test_acc = %f, test_mse = %f'
        % (best_result['valid_acc'], best_result['valid_mse'],
           best_result['test_acc'], best_result['test_mse']))
    return model
Esempio n. 6
0
def setup(gpu_idx, configs):
    """Prepare one (possibly distributed) RTM3D training process.

    Selects the device, creates output directories, initializes the process
    group when distributed, builds model/checkpointer/solver/loss and the
    dataloaders. Returns (model, checkpointer,
    (train_dataloader, test_dataloader), solver, rtm3d_loss, configs,
    tb_writer).
    """
    configs.gpu_idx = gpu_idx
    # gpu_idx == -1 selects CPU; otherwise bind to that CUDA device index.
    device = torch.device('cpu' if configs.gpu_idx == -1 else 'cuda:{}'.format(configs.gpu_idx))
    configs.update({'DEVICE': device})
    # NOTE(review): os.mkdir fails if the parent directory is missing;
    # os.makedirs would be more robust here — confirm parents always exist.
    save_dir = os.path.join(configs.TRAINING.WEIGHTS, configs.MODEL.BACKBONE)
    logs_dir = os.path.join(configs.TRAINING.LOGDIR, configs.MODEL.BACKBONE)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not os.path.exists(logs_dir):
        os.mkdir(logs_dir)

    if configs.distributed:
        # With env:// rendezvous the rank comes from the environment.
        if configs.dist_url == "env://" and configs.rank == -1:
            configs.rank = int(os.environ["RANK"])
        if configs.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            configs.rank = configs.rank * configs.ngpus_per_node + gpu_idx

        dist.init_process_group(backend=configs.dist_backend, init_method=configs.dist_url,
                                world_size=configs.world_size, rank=configs.rank)
        # An effective batch of 64 is split into per-step subdivisions.
        configs.subdivisions = int(64 / configs.BATCH_SIZE / configs.ngpus_per_node)
    else:
        configs.subdivisions = int(64 / configs.BATCH_SIZE)

    # Only the master node logs and writes tensorboard summaries.
    configs.is_master_node = (not configs.distributed) or (
            configs.distributed and (configs.rank % configs.ngpus_per_node == 0))

    if configs.is_master_node:
        # NOTE(review): logging.Logger is instantiated directly, so no
        # handlers/formatters are attached; logging.getLogger('RTM3D') is
        # the conventional API — confirm output actually appears.
        logger = logging.Logger('RTM3D')
        logger.info('>>> Created a new logger')
        logger.info('>>> configs: {}'.format(configs))
        tb_writer = SummaryWriter(log_dir=os.path.join(logs_dir, 'tensorboard'))
    else:
        logger = None
        tb_writer = None
    model = model_factory.create_model(configs)
    model.to(configs.DEVICE)
    checkpointer = check_point.CheckPointer(model,
                                            save_dir=save_dir,
                                            save_to_disk=True,
                                            mode='state-dict',
                                            device=configs.DEVICE)
    configs.start_epoch = 0
    configs.min_loss = 10000

    ckpt = {}
    if len(configs.TRAINING.CHECKPOINT_FILE) > 0:
        # 'resume' restores epoch counter and best loss; 'pretrained' loads
        # weights only from the exact file given.
        ckpt = checkpointer.load(configs.TRAINING.CHECKPOINT_FILE,
                                 use_latest=(configs.TRAINING.CHECKPOINT_MODE != 'pretrained'),
                                 load_solver=configs.SOLVER.LOAD_SOLVER)
        if 'epoch' in ckpt and configs.TRAINING.CHECKPOINT_MODE == 'resume':
            configs.start_epoch = ckpt['epoch'] + 1
        if 'min_loss' in ckpt and configs.TRAINING.CHECKPOINT_MODE == 'resume':
            configs.min_loss = ckpt['min_loss']
    # Data Parallel
    model = model_factory.make_data_parallel(model, configs)
    solver = Solver(model, configs)
    checkpointer.set_solver(solver)
    checkpointer.load_solver(ckpt)
    del ckpt
    rtm3d_loss = RTM3DLoss(configs)

    if configs.is_master_node:
        num_parameters = model_factory.get_num_parameters(model)
        logger.info('number of trained parameters of the model: {}'.format(num_parameters))

    if logger is not None:
        logger.info(">>> Loading dataset & getting dataloader...")
    # Create dataloader
    # NOTE(review): the [:2] slice keeps the first two items of whatever
    # create_dataloader returns, so train_dataloader is a 2-tuple here —
    # verify the len() logged below reports what the message implies.
    train_dataloader = create_dataloader(configs.DATASET.PATH, configs,
                                         TrainAugmentation(configs.INPUT_SIZE[0], mean=configs.DATASET.MEAN),
                                         is_training=True)[:2]

    test_dataloader = create_dataloader(configs.DATASET.PATH, configs,
                                        TestTransform(configs.INPUT_SIZE[0],
                                                      mean=configs.DATASET.MEAN),
                                        is_training=True,
                                        split='test')[0]
    if logger is not None:
        logger.info('number of batches in training set: {}'.format(len(train_dataloader)))

    return model, checkpointer, (train_dataloader, test_dataloader), solver, rtm3d_loss, configs, tb_writer
Esempio n. 7
0
def main():
    """Train a speech-commands classifier.

    Builds data loaders, model and optimizer from CLI args, optionally
    resumes from a checkpoint and/or fine-tunes only the classifier head,
    then runs the main train/validate loop with recovery and best-metric
    checkpointing plus a CSV summary per epoch.
    """
    args = parser.parse_args()

    # Output directory: <base>/train/<timestamp>-<model>-<pool>-f<fold>.
    if args.output:
        output_base = args.output
    else:
        output_base = './output'
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"),
        args.model,
        args.gp,
        'f'+str(args.fold)])
    output_dir = get_outdir(output_base, 'train', exp_name)

    train_input_root = os.path.join(args.data)
    batch_size = args.batch_size
    num_epochs = args.epochs
    wav_size = (16000,)
    num_classes = len(dataset.get_labels())

    torch.manual_seed(args.seed)

    model = model_factory.create_model(
        args.model,
        in_chs=1,
        pretrained=args.pretrained,
        num_classes=num_classes,
        drop_rate=args.drop,
        global_pool=args.gp,
        checkpoint_path=args.initial_checkpoint)
    #model.reset_classifier(num_classes=num_classes)

    dataset_train = dataset.CommandsDataset(
        root=train_input_root,
        mode='train',
        fold=args.fold,
        wav_size=wav_size,
        format='spectrogram',
    )

    loader_train = data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=True,
        num_workers=args.workers
    )

    dataset_eval = dataset.CommandsDataset(
        root=train_input_root,
        mode='validate',
        fold=args.fold,
        wav_size=wav_size,
        format='spectrogram',
    )

    loader_eval = data.DataLoader(
        dataset_eval,
        batch_size=args.batch_size,
        pin_memory=True,
        shuffle=False,
        num_workers=args.workers
    )

    # Same criterion object serves train and validation (it is stateless
    # until a checkpointed loss state_dict is loaded below).
    train_loss_fn = validate_loss_fn = torch.nn.CrossEntropyLoss()
    train_loss_fn = train_loss_fn.cuda()
    validate_loss_fn = validate_loss_fn.cuda()

    # Select the optimizer; all branches share lr/weight decay from args.
    opt_params = list(model.parameters())
    if args.opt.lower() == 'sgd':
        optimizer = optim.SGD(
            opt_params, lr=args.lr,
            momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True)
    elif args.opt.lower() == 'adam':
        optimizer = optim.Adam(
            opt_params, lr=args.lr, weight_decay=args.weight_decay, eps=args.opt_eps)
    elif args.opt.lower() == 'nadam':
        optimizer = nadam.Nadam(
            opt_params, lr=args.lr, weight_decay=args.weight_decay, eps=args.opt_eps)
    elif args.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(
            opt_params, lr=args.lr, weight_decay=args.weight_decay, eps=args.opt_eps)
    elif args.opt.lower() == 'rmsprop':
        optimizer = optim.RMSprop(
            opt_params, lr=args.lr, alpha=0.9, eps=args.opt_eps,
            momentum=args.momentum, weight_decay=args.weight_decay)
    else:
        # Fixed: was `assert False and "Invalid optimizer"`, a message-less
        # AssertionError that also disappears under `python -O`.
        raise ValueError('Invalid optimizer: {}'.format(args.opt))
    del opt_params

    if not args.decay_epochs:
        print('No decay epoch set, using plateau scheduler.')
        lr_scheduler = ReduceLROnPlateau(optimizer, patience=10)
    else:
        lr_scheduler = None

    # optionally resume from a checkpoint
    start_epoch = 0 if args.start_epoch is None else args.start_epoch
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
                if 'args' in checkpoint:
                    print(checkpoint['args'])
                # Strip a DataParallel 'module.' prefix so the state dict
                # loads into a bare (non-wrapped) model.
                new_state_dict = OrderedDict()
                for k, v in checkpoint['state_dict'].items():
                    if k.startswith('module'):
                        name = k[7:] # remove `module.`
                    else:
                        name = k
                    new_state_dict[name] = v
                model.load_state_dict(new_state_dict)
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                if 'loss' in checkpoint:
                    train_loss_fn.load_state_dict(checkpoint['loss'])
                print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
                start_epoch = checkpoint['epoch'] if args.start_epoch is None else args.start_epoch
            else:
                # Plain state-dict file (no wrapping metadata dict).
                model.load_state_dict(checkpoint)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            exit(1)

    saver = CheckpointSaver(checkpoint_dir=output_dir)

    if args.num_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    # Optional fine-tune of only the final classifier weights for specified number of epochs (or part of)
    if not args.resume and args.ft_epochs > 0.:
        if isinstance(model, torch.nn.DataParallel):
            classifier_params = model.module.get_classifier().parameters()
        else:
            classifier_params = model.get_classifier().parameters()
        if args.opt.lower() == 'adam':
            finetune_optimizer = optim.Adam(
                classifier_params,
                lr=args.ft_lr, weight_decay=args.weight_decay)
        else:
            finetune_optimizer = optim.SGD(
                classifier_params,
                lr=args.ft_lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True)

        # Fractional ft_epochs: train the last fine-tune epoch for only a
        # proportional number of batches.
        finetune_epochs_int = int(np.ceil(args.ft_epochs))
        finetune_final_batches = int(np.ceil((1 - (finetune_epochs_int - args.ft_epochs)) * len(loader_train)))
        print(finetune_epochs_int, finetune_final_batches)
        for fepoch in range(0, finetune_epochs_int):
            if fepoch == finetune_epochs_int - 1 and finetune_final_batches:
                batch_limit = finetune_final_batches
            else:
                batch_limit = 0
            train_epoch(
                fepoch, model, loader_train, finetune_optimizer, train_loss_fn, args,
                output_dir=output_dir, batch_limit=batch_limit)

    best_loss = None
    try:
        for epoch in range(start_epoch, num_epochs):
            if args.decay_epochs:
                adjust_learning_rate(
                    optimizer, epoch, initial_lr=args.lr,
                    decay_rate=args.decay_rate, decay_epochs=args.decay_epochs)

            train_metrics = train_epoch(
                epoch, model, loader_train, optimizer, train_loss_fn, args,
                saver=saver, output_dir=output_dir)

            # save a recovery in case validation blows up
            saver.save_recovery({
                'epoch': epoch + 1,
                'arch': args.model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'loss': train_loss_fn.state_dict(),
                'args': args,
                'gp': args.gp,
                },
                epoch=epoch + 1,
                batch_idx=0)

            step = epoch * len(loader_train)
            eval_metrics = validate(
                step, model, loader_eval, validate_loss_fn, args,
                output_dir=output_dir)

            if lr_scheduler is not None:
                lr_scheduler.step(eval_metrics['eval_loss'])

            # Append one row of train+eval metrics to the run summary CSV.
            rowd = OrderedDict(epoch=epoch)
            rowd.update(train_metrics)
            rowd.update(eval_metrics)
            with open(os.path.join(output_dir, 'summary.csv'), mode='a') as cf:
                dw = csv.DictWriter(cf, fieldnames=rowd.keys())
                if best_loss is None:  # first iteration (epoch == 1 can't be used)
                    dw.writeheader()
                dw.writerow(rowd)

            # save proper checkpoint with eval metric
            best_loss = saver.save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'args': args,
                'gp': args.gp,
                },
                epoch=epoch + 1,
                metric=eval_metrics['eval_loss'])

    except KeyboardInterrupt:
        pass
    # Fixed: best_loss stays None when interrupted before the first
    # checkpoint (or when no epochs run); guard instead of crashing on
    # `None[1]` in the summary print.
    if best_loss is not None:
        print('*** Best loss: {0} (epoch {1})'.format(best_loss[1], best_loss[0]))
Esempio n. 8
0
def main(_):
    """Parse CLI options, build the configured model, and run the chosen mode."""
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--mode', type=str, required=True,
                     choices=('train', 'interactive', 'test'),
                     help='work mode')
    cli.add_argument('--model_dir', type=str, required=True,
                     help='model directory')

    cli.add_argument('--config', type=str,
                     help='config file containing parameters to configure the model')

    # Dataset locations for the different phases.
    cli.add_argument('--pretrain_data', type=str, help='pretrain dataset')
    cli.add_argument('--train_data', type=str, help='training dataset')
    cli.add_argument('--dev_data', type=str, help='development dataset')
    cli.add_argument('--test_data', type=str, help='test dataset')

    cli.add_argument('--embed_conf', type=str,
                     default="conf/word_embeddings.yml",
                     help='embedding config file')
    cli.add_argument('--enable_epoch_evals', action='store_true',
                     help='enable evals after finishing an apoch during training')
    cli.add_argument('--enable_final_eval', action='store_true',
                     help='enable the last eval once training finished')
    cli.add_argument('--disable_encoder_var_sharing', action='store_true',
                     help='disable encoders sharing variables to support testing old models'
                     )

    # Decoding / hardware knobs; unset values fall back to the config file.
    cli.add_argument('--num_gpus', type=int, default=4,
                     help='number of GPUs to use')
    cli.add_argument('--n_responses', type=int, default=1,
                     help='number of generated responses')
    cli.add_argument('--beam_width', type=int,
                     help='beam width to override the value in config file')
    cli.add_argument('--length_penalty_weight', type=float,
                     help='length penalty to override the value in config file')
    cli.add_argument('--sampling_temperature', type=float,
                     help='sampling temperature to override the value in config file')
    cli.add_argument('--lda_model_dir', type=str,
                     help=
                     'required only for testing with topical models (THRED and TA-Seq2Seq)')

    # All options are forwarded to the Config as keyword arguments.
    config = Config(**vars(cli.parse_args()))

    model = model_factory.create_model(config)

    mode = config.mode
    if mode == 'train':
        model.train()
    elif mode == 'interactive':
        model.interactive()
    elif mode == 'test':
        model.test()
Esempio n. 9
0
def setup_student(s_name, params):
    """Instantiate and return the student network named *s_name*.

    *params* supplies the class count and the target device.
    """
    return create_model(s_name, params["num_classes"], params["device"])
Esempio n. 10
0
def main():
    """Run inference over the test split and write two CSVs: per-class
    log-probabilities (results.csv) and top-1 predictions (submission.csv)."""
    args = parser.parse_args()

    num_classes = len(get_labels())
    test_time_pool = 0  #5 if 'dpn' in args.model else 0

    model = model_factory.create_model(args.model,
                                       in_chs=1,
                                       num_classes=num_classes,
                                       global_pool=args.gp,
                                       test_time_pool=test_time_pool)
    #model.reset_classifier(num_classes=num_classes)

    if args.num_gpu > 1:
        model = torch.nn.DataParallel(model,
                                      device_ids=list(range(
                                          args.num_gpu))).cuda()
    else:
        model.cuda()

    if not os.path.exists(args.checkpoint):
        print("=> no checkpoint found at '{}'".format(args.checkpoint))
        exit(1)
    print("=> loading checkpoint '{}'".format(args.checkpoint))
    checkpoint = torch.load(args.checkpoint)
    if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.checkpoint, checkpoint['epoch']))
    else:
        # Plain state-dict file (no wrapping metadata dict).
        model.load_state_dict(checkpoint)

    # Experiment name derived from the checkpoint path: <parent-dir>-<stem>.
    csplit = os.path.normpath(args.checkpoint).split(sep=os.path.sep)
    if len(csplit) > 1:
        exp_name = csplit[-2] + '-' + csplit[-1].split('.')[0]
    else:
        exp_name = ''

    if args.output:
        output_base = args.output
    else:
        output_base = './output'

    output_dir = get_outdir(output_base, 'predictions', exp_name)

    dataset = CommandsDataset(root=args.data,
                              mode='test',
                              format='spectrogram')

    loader = data.DataLoader(dataset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             shuffle=False,
                             num_workers=args.workers)

    model.eval()
    batch_time_m = AverageMeter()
    data_time_m = AverageMeter()
    try:
        # Fixed: the original rebound a single handle ('cf') to the second
        # CSV, leaking the first file object and never closing either, so
        # buffered rows could be lost. Distinct handles inside 'with'
        # guarantee both files are flushed and closed.
        with open(os.path.join(output_dir, 'results.csv'), mode='w') as res_file, \
                open(os.path.join(output_dir, 'submission.csv'), mode='w') as sub_file:
            res_writer = csv.writer(res_file)
            res_writer.writerow(['fname'] + dataset.id_to_label)
            sub_writer = csv.writer(sub_file)
            sub_writer.writerow(['fname', 'label', 'prob'])

            end = time.time()
            batch_sample_idx = 0
            # 'batch_input' avoids shadowing the builtin 'input'.
            for batch_idx, (batch_input, target) in enumerate(loader):
                data_time_m.update(time.time() - end)
                batch_input = batch_input.cuda()
                output = model(batch_input)

                # augmentation reduction
                #reduce_factor = loader.dataset.get_aug_factor()
                #if reduce_factor > 1:
                #    output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2).squeeze(dim=2)
                #    index = index[0:index.size(0):reduce_factor]

                # move data to CPU and collect
                output_logprob = F.log_softmax(output, dim=1).cpu().numpy()
                output = F.softmax(output, dim=1)
                output_prob, output_idx = output.max(1)
                output_prob = output_prob.cpu().numpy()
                output_idx = output_idx.cpu().numpy()
                for i in range(output_logprob.shape[0]):
                    index = batch_sample_idx + i
                    pred_label = dataset.id_to_label[output_idx[i]]
                    pred_prob = output_prob[i]
                    filename = dataset.filename(index)
                    res_writer.writerow([filename] + list(output_logprob[i]))
                    sub_writer.writerow([filename] + [pred_label, pred_prob])

                batch_sample_idx += batch_input.size(0)
                batch_time_m.update(time.time() - end)
                if batch_idx % args.print_freq == 0:
                    print('Inference: [{}/{} ({:.0f}%)]  '
                          'Time: {batch_time.val:.3f}s, {rate:.3f}/s  '
                          '({batch_time.avg:.3f}s, {rate_avg:.3f}/s)  '
                          'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
                              batch_sample_idx,
                              len(loader.sampler),
                              100. * batch_idx / len(loader),
                              batch_time=batch_time_m,
                              rate=batch_input.size(0) / batch_time_m.val,
                              rate_avg=batch_input.size(0) / batch_time_m.avg,
                              data_time=data_time_m))

                end = time.time()
                # end iterating through dataset

    except KeyboardInterrupt:
        pass
    except Exception as e:
        # Best-effort script: report the failure rather than crash.
        print(str(e))
Esempio n. 11
0
def main(argv):
    """Run the training script with command line arguments @argv.

    Parses the arguments, builds a run-specific checkpoint directory,
    constructs the CIFAR-100 data loaders and the model (with optional
    label refinery), then trains/tests for the number of epochs dictated
    by the learning-rate regime.
    """
    args = parse_args(argv)

    # Compose the checkpoint directory name: model name plus optional
    # suffixes for the cosine-linear head ('cos') and label refinery
    # ('rfn').  This replaces four near-identical branches.
    suffix = '_cos' if args.coslinear else ''
    if args.label_refinery_model is not None:
        suffix += '_rfn'
    save_dir = args.save + '/' + args.model + suffix + '_' + name_time
    # makedirs(..., exist_ok=True) also creates missing parent directories
    # and avoids the isdir/mkdir race of a check-then-create sequence.
    os.makedirs(save_dir, exist_ok=True)
    utils.general_setup(save_dir, args.gpus)

    logging.info("Arguments parsed.\n{}".format(pprint.pformat(vars(args))))

    # Create the train and the validation data loaders.
    train_loader = mul_cifar100.mul_CIFAR100DataLoader(
        root=args.data_dir,
        image_size=32,
        train=True,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers)
    val_loader = mul_cifar100.mul_CIFAR100DataLoader(
        root=args.data_dir,
        image_size=32,
        train=False,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers)

    # Create model with optional label refinery.
    model, loss = model_factory.create_model(args.model, args.model_state_file,
                                             args.gpus,
                                             args.label_refinery_model,
                                             args.label_refinery_state_file,
                                             args.coslinear, args.s)

    # Fall back to the model's built-in LR regime unless one was supplied
    # on the command line.
    lr_regime = model.LR_REGIME if args.lr_regime is None else args.lr_regime
    regime = LearningRateRegime(lr_regime)
    optimizer = create_optimizer(model, args.momentum, args.weight_decay)

    # Train and test for the needed number of epochs.
    for epoch in range(1, int(regime.num_epochs) + 1):
        lr = regime.get_lr(epoch)
        _set_learning_rate(optimizer, lr)
        train_for_one_epoch(model, loss, train_loader, optimizer, epoch)
        test.test_for_one_epoch(model, loss, val_loader, epoch)
        save_checkpoint(save_dir, model, optimizer, epoch)
Esempio n. 12
0
def evaluate_model(cache_dict, layer_id, finally_pruned_layers, config,
                   model_args, data_args, training_args, train_dataset,
                   eval_dataset, compute_metrics, tokenizer, data_collator,
                   datasets):
    """Train and evaluate the model with `layer_id` additionally pruned.

    `finally_pruned_layers` are always pruned; `layer_id` (when an int) is
    pruned on top of them.  Results are memoized in `cache_dict`, keyed by
    `layer_id`.

    Returns:
        float: the task's primary evaluation metric (f1 / accuracy /
        spearmanr / matthews_correlation), rounded to three decimals.

    Raises:
        Exception: if no known performance metric is present in the
        evaluation results.
    """
    # Set seed before initializing model.
    set_seed(training_args.seed)

    if layer_id in cache_dict:
        print("Layer %d from cache: %.4f" % (layer_id, cache_dict[layer_id]))
        return cache_dict[layer_id]

    print(f"Calculate layer {str(layer_id)}")
    model = create_model(config, model_args)

    model.prune_layers(finally_pruned_layers)
    if isinstance(layer_id, int):
        model.prune_layers([layer_id])

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.add_callback(DisableCheckpointCallbackHandler())

    # Training
    trainer.train(resume_from_checkpoint=None)

    # Evaluation
    eval_results = {}
    logger.info("*** Evaluate ***")

    # Loop to handle MNLI double evaluation (matched, mis-matched)
    tasks = [data_args.task_name]
    eval_datasets = [eval_dataset]
    if data_args.task_name == "mnli":
        tasks.append("mnli-mm")
        eval_datasets.append(datasets["validation_mismatched"])

    for eval_dataset, task in zip(eval_datasets, tasks):
        eval_results.update(trainer.evaluate(eval_dataset=eval_dataset))

    # Pick the first metric the task reports.  Explicit `is not None`
    # checks (rather than `or`-chaining) so a legitimate score of 0.0 is
    # not skipped as "missing".
    res = None
    for metric_key in ("eval_f1", "eval_accuracy", "eval_spearmanr",
                       "eval_matthews_correlation"):
        if eval_results.get(metric_key) is not None:
            res = eval_results[metric_key]
            break

    # Validate before rounding: round(None, 3) would raise a confusing
    # TypeError and mask the real problem (the original checked after).
    if res is None:
        raise Exception("No performance metric found!")

    res = round(res, 3)

    cache_dict[layer_id] = res
    # Return the metric itself, matching the cache-hit path above (the
    # original returned the whole cache dict here, inconsistently).
    return res
Esempio n. 13
0
def main():
    """Fine-tune and evaluate an (optionally layer-pruned) transformer on GLUE.

    Parses model/data/training arguments (from a JSON file or the command
    line), loads the dataset and tokenizer, optionally prunes layers before
    training, then runs train / eval / predict according to the training
    arguments.

    Returns:
        dict: the accumulated evaluation metrics (empty when --do_eval is
        not set or evaluation is cut short).
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    #training_args.output_dir = f"{training_args.output_dir}/{data_args.task_name}/{model_args.model_name_or_path}/{model_args.prune_method}/{str(model_args.prune_n_layers)}/{str(training_args.seed)}"
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
    # label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset("glue", data_args.task_name)
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {
            "train": data_args.train_file,
            "validation": data_args.validation_file
        }

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError(
                    "Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            datasets = load_dataset("csv", data_files=data_files)
        else:
            # Loading a dataset from local json files
            datasets = load_dataset("json", data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = create_model(config, model_args)

    #
    # Prune model before training starts
    #
    if (model_args.prune_method == "prune-greedy"):
        # Layer indices were precomputed by the greedy search and stored one
        # index per line; take the first prune_n_layers of them.
        current_path = pathlib.Path(__file__).parent.absolute()
        layer_file_path = os.path.join(
            current_path, "layer_files/",
            f"{model.name_or_path}_{data_args.task_name}_greedy.txt")
        with open(layer_file_path, 'r') as f:
            layers_to_prune = f.readlines()
        layers_to_prune = layers_to_prune[:model_args.prune_n_layers]
        layers_to_prune = [int(l.replace("\n", "")) for l in layers_to_prune]
        print(f"Pruned {str(layers_to_prune)}")
        model.prune_layers(layers_to_prune)

    elif (model_args.prune_method == "top-layers"):
        print(
            f"# Prune {model_args.prune_n_layers} layers with {model_args.prune_method}"
        )
        first_layer_to_prune = config.num_hidden_layers - model_args.prune_n_layers
        model.prune_layers(
            [i for i in range(first_layer_to_prune, config.num_hidden_layers)])

        # # Measure number of parameters
        # It really depends how pruning is implemented - if its deleted from the layers
        # module list or if the layer is simply skipped (then torch still measures those values)
        # if hasattr(model.base_model, "encoder"):
        #     base_class = model.base_model.encoder
        # else:
        #     base_class = model.base_model

        # layers = base_class.layer
        # layers = [l for (i, l) in enumerate(layers) if i not in model.get_pruned_layers()]
        # layers = nn.ModuleList(layers)
        # setattr(base_class, "layer", layers)

    # Print number of parameters
    num_params = sum(p.numel() for p in model.parameters())
    print("NUM Paramerers: %d" % num_params)

    # Preprocessing the datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and data_args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: label_name_to_id[label_list[i]]
                for i in range(num_labels)
            }
        else:
            # Single message string: the original passed two positional
            # strings to logger.warn, which logging interpreted as %-format
            # arguments and mangled.  `warn` is also a deprecated alias of
            # `warning`.
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.")
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [label_to_id[l] for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    train_dataset = datasets["train"]
    eval_dataset = datasets["validation_matched" if data_args.task_name ==
                            "mnli" else "validation"]
    if data_args.task_name is not None or data_args.test_file is not None:
        test_dataset = datasets["test_matched" if data_args.task_name ==
                                "mnli" else "test"]

    # Get the metric function
    if data_args.task_name is not None:
        metric = load_metric("glue", data_args.task_name)
    # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from
    # compute_metrics

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(
                    result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids)**2).mean().item()}
        else:
            return {
                "accuracy":
                (preds == p.label_ids).astype(np.float32).mean().item()
            }

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # trainer.add_callback(
    #     PruneCallbackHandler(
    #         model_args.prune_method,
    #         model_args.prune_n_layers,
    #         data_args.task_name)
    # )
    trainer.add_callback(DisableCheckpointCallbackHandler())

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics

        #trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key}: {value}")
                    writer.write(f"{key}: {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):

            # eval_result = trainer.evaluate(eval_dataset=eval_dataset)
            start = time.time()
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)
            print(f"TIMING: {(time.time() - start)} ")
            # NOTE(review): this exit() aborts after timing the first eval
            # and skips writing eval/test results — looks like benchmarking
            # debug code; confirm intent before removing.
            exit()
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"eval_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_eval_file, "w") as writer:
                    logger.info(f"***** Eval results {task} *****")
                    for key, value in sorted(eval_result.items()):
                        logger.info(f"  {key}: {value}")
                        writer.write(f"{key}: {value}\n")

            eval_results.update(eval_result)

    if training_args.do_predict:
        logger.info("*** Test ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(datasets["test_mismatched"])

        for test_dataset, task in zip(test_datasets, tasks):
            # Removing the `label` columns because it contains -1 and Trainer won't like that.
            test_dataset.remove_columns_("label")
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            predictions = np.squeeze(
                predictions) if is_regression else np.argmax(predictions,
                                                             axis=1)

            output_test_file = os.path.join(training_args.output_dir,
                                            f"test_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info(f"***** Test results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")
    return eval_results
Esempio n. 14
0
    return label2topn


print('init config params')
# NOTE(review): vars() with no argument returns the current namespace as a
# dict, so this captures every module-level name into `args` — presumably
# Config(**args) tolerates or filters the extra keys; confirm against the
# Config constructor.
args = vars()
args['mode'] = 'infer'
args['model_dir'] = 'service_model'
args['beam_width'] = 5
args['length_penalty_weight'] = 0.8
config = Config(**args)

# Inference-time resources, loaded once at module import.
stop_words = load_stop_words(config.model_dir)
word2label = load_keyword_label(config.model_dir)
label2topn = load_top_words(config.model_dir)

model = model_factory.create_model(config)

# Build the inference graph and restore the latest checkpoint into an
# interactive TF session that stays open for the request handlers below.
infer_model = model.create_infer_model_graph()
config_proto = model_helper.get_config_proto(config.log_device)

sess = tf.InteractiveSession(graph=infer_model.graph, config=config_proto)

ckpt = tf.train.latest_checkpoint(config.model_dir)
loaded_infer_model = model_helper.load_model(infer_model.model, ckpt, sess, "infer")


def extract_keyword(query):
    tags = jieba.analyse.extract_tags(query, topK=10)
    keyword = []
    for w in tags:
        if re.search('^\d+$', w):
Esempio n. 15
0
def main():
    """Train a model on the Amazon-from-space dataset.

    Builds train/eval datasets and loaders, creates the model, optimizer,
    LR schedule and loss, optionally resumes from a checkpoint, optionally
    fine-tunes only the classifier head first, then runs the main
    train/validate loop, writing per-epoch summaries and checkpoints.
    """
    args = parser.parse_args()

    train_input_root = os.path.join(args.data)
    train_labels_file = './data/labels.csv'

    if args.output:
        output_base = args.output
    else:
        output_base = './output'

    # Experiment name encodes the run configuration for easy identification.
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"),
        args.model,
        str(args.img_size),
        'f'+str(args.fold),
        'tif' if args.tif else 'jpg'])
    output_dir = get_outdir(output_base, 'train', exp_name)

    batch_size = args.batch_size
    num_epochs = args.epochs
    img_type = '.tif' if args.tif else '.jpg'
    img_size = (args.img_size, args.img_size)
    num_classes = get_tags_size(args.labels)

    torch.manual_seed(args.seed)

    dataset_train = AmazonDataset(
        train_input_root,
        train_labels_file,
        train=True,
        tags_type=args.labels,
        multi_label=args.multi_label,
        img_type=img_type,
        img_size=img_size,
        fold=args.fold,
    )

    #sampler = WeightedRandomOverSampler(dataset_train.get_sample_weights())

    loader_train = data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        shuffle=True,
        #sampler=sampler,
        num_workers=args.num_processes
    )

    dataset_eval = AmazonDataset(
        train_input_root,
        train_labels_file,
        train=False,
        tags_type=args.labels,
        multi_label=args.multi_label,
        img_type=img_type,
        img_size=img_size,
        test_aug=args.tta,
        fold=args.fold,
    )

    loader_eval = data.DataLoader(
        dataset_eval,
        batch_size=batch_size,
        shuffle=False,
        num_workers=args.num_processes
    )

    model = model_factory.create_model(
        args.model,
        pretrained=args.pretrained,
        num_classes=num_classes,
        drop_rate=args.drop,
        global_pool=args.gp)

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
        else:
            model.cuda()

    if args.opt.lower() == 'sgd':
        optimizer = optim.SGD(
            model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adam':
        optimizer = optim.Adam(
            model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(
            model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.opt.lower() == 'rmsprop':
        optimizer = optim.RMSprop(
            model.parameters(), lr=args.lr, alpha=0.9, momentum=args.momentum, weight_decay=args.weight_decay)
    elif args.opt.lower() == 'yellowfin':
        optimizer = YFOptimizer(
            model.parameters(), lr=args.lr, weight_decay=args.weight_decay, clip_thresh=2)
    else:
        # The original used `assert False and "..."`: the message never
        # showed and the check vanished under `python -O`.
        raise ValueError('Invalid optimizer: %s' % args.opt)

    if not args.decay_epochs:
        lr_scheduler = ReduceLROnPlateau(optimizer, patience=8)
    else:
        lr_scheduler = None

    if args.class_weights:
        class_weights = torch.from_numpy(dataset_train.get_class_weights()).float()
        class_weights_norm = class_weights / class_weights.sum()
        if not args.no_cuda:
            class_weights = class_weights.cuda()
            class_weights_norm = class_weights_norm.cuda()
    else:
        class_weights = None
        class_weights_norm = None

    if args.loss.lower() == 'nll':
        #assert not args.multi_label and 'Cannot use crossentropy with multi-label target.'
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
    elif args.loss.lower() == 'mlsm':
        assert args.multi_label
        loss_fn = torch.nn.MultiLabelSoftMarginLoss(weight=class_weights)
    else:
        raise ValueError('Invalid loss function: %s' % args.loss)

    if not args.no_cuda:
        loss_fn = loss_fn.cuda()

    # optionally resume from a checkpoint
    start_epoch = 1
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            sparse_checkpoint = True if 'sparse' in checkpoint and checkpoint['sparse'] else False
            if sparse_checkpoint:
                print("Loading sparse model")
                dense_sparse_dense.sparsify(model, sparsity=0.)  # ensure sparsity_masks exist in model definition
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
            start_epoch = checkpoint['epoch']
            if args.sparse and not sparse_checkpoint:
                print("Sparsifying loaded model")
                dense_sparse_dense.sparsify(model, sparsity=0.5)
            elif sparse_checkpoint and not args.sparse:
                print("Densifying loaded model")
                dense_sparse_dense.densify(model)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            exit(-1)
    else:
        if args.sparse:
            dense_sparse_dense.sparsify(model, sparsity=0.5)

    use_tensorboard = not args.no_tb and CrayonClient is not None
    if use_tensorboard:
        hostname = '127.0.0.1'
        port = 8889
        host_port = args.tbh.split(':')[:2]
        if len(host_port) == 1:
            hostname = host_port[0]
        elif len(host_port) >= 2:
            # NOTE(review): port stays a string here (split result) while the
            # default is an int — confirm CrayonClient accepts both.
            hostname, port = host_port[:2]
        try:
            cc = CrayonClient(hostname=hostname, port=port)
            try:
                cc.remove_experiment(exp_name)
            except ValueError:
                pass
            exp = cc.create_experiment(exp_name)
        except Exception as e:
            exp = None
            print("Error (%s) connecting to Tensorboard/Crayon server. Giving up..." % str(e))
    else:
        exp = None

    # Optional fine-tune of only the final classifier weights for specified number of epochs (or part of)
    if not args.resume and args.ft_epochs > 0.:
        if args.opt.lower() == 'adam':
            finetune_optimizer = optim.Adam(
                model.get_fc().parameters(), lr=args.ft_lr, weight_decay=args.weight_decay)
        else:
            finetune_optimizer = optim.SGD(
                model.get_fc().parameters(), lr=args.ft_lr, momentum=args.momentum, weight_decay=args.weight_decay)

        # ft_epochs may be fractional: the last fine-tune epoch runs only a
        # proportional number of batches.
        finetune_epochs_int = int(np.ceil(args.ft_epochs))
        finetune_final_batches = int(np.ceil((1 - (finetune_epochs_int - args.ft_epochs)) * len(loader_train)))
        print(finetune_epochs_int, finetune_final_batches)
        for fepoch in range(1, finetune_epochs_int + 1):
            if fepoch == finetune_epochs_int and finetune_final_batches:
                batch_limit = finetune_final_batches
            else:
                batch_limit = 0
            train_epoch(
                fepoch, model, loader_train, finetune_optimizer, loss_fn, args,
                class_weights_norm, output_dir, batch_limit=batch_limit)
            step = fepoch * len(loader_train)
            score, _ = validate(step, model, loader_eval, loss_fn, args, 0.3, output_dir)

    score_metric = 'f2'
    best_loss = None
    best_f2 = None
    threshold = 0.3
    try:
        for epoch in range(start_epoch, num_epochs + 1):
            if args.decay_epochs:
                adjust_learning_rate(optimizer, epoch, initial_lr=args.lr, decay_epochs=args.decay_epochs)

            train_metrics = train_epoch(
                epoch, model, loader_train, optimizer, loss_fn, args, class_weights_norm, output_dir, exp=exp)

            step = epoch * len(loader_train)
            eval_metrics, latest_threshold = validate(
                step, model, loader_eval, loss_fn, args, threshold, output_dir, exp=exp)

            if lr_scheduler is not None:
                lr_scheduler.step(eval_metrics['eval_loss'])

            rowd = OrderedDict(epoch=epoch)
            rowd.update(train_metrics)
            rowd.update(eval_metrics)
            with open(os.path.join(output_dir, 'summary.csv'), mode='a') as cf:
                dw = csv.DictWriter(cf, fieldnames=rowd.keys())
                if best_loss is None:  # first iteration (epoch == 1 can't be used)
                    dw.writeheader()
                dw.writerow(rowd)

            best = False
            if best_loss is None or eval_metrics['eval_loss'] < best_loss[1]:
                best_loss = (epoch, eval_metrics['eval_loss'])
                if score_metric == 'loss':
                    best = True
            if best_f2 is None or eval_metrics['eval_f2'] > best_f2[1]:
                best_f2 = (epoch, eval_metrics['eval_f2'])
                if score_metric == 'f2':
                    best = True

            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.model,
                'sparse': args.sparse,
                'state_dict':  model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'threshold': latest_threshold,
                'args': args,
                'gp': args.gp,
                },
                is_best=best,
                filename='checkpoint-%d.pth.tar' % epoch,
                output_dir=output_dir)

    except KeyboardInterrupt:
        pass
    # Guard against an interrupt before the first epoch completes, in which
    # case best_loss/best_f2 are still None and subscripting would raise.
    if best_loss is not None:
        print('*** Best loss: {0} (epoch {1})'.format(best_loss[1], best_loss[0]))
    if best_f2 is not None:
        print('*** Best f2: {0} (epoch {1})'.format(best_f2[1], best_f2[0]))
def convert(mxnet_name, torch_name):
    """Convert a pretrained GluonCV (MXNet) model's weights to a PyTorch checkpoint.

    Downloads the pretrained MXNet model, creates the corresponding torch
    model via ``create_model``, copies parameters and batch-norm running
    stats over positionally, then saves the state dict to
    ``./<torch_name>-<sha256[:8]>.pth`` in the current directory.

    Args:
        mxnet_name: GluonCV model-zoo name of the pretrained source model.
        torch_name: Name of the corresponding torch architecture; also used
            as the base of the output filename.

    Raises:
        AssertionError: if parameter/buffer counts, BN param ordering, or
            tensor shapes do not line up between the two models.
    """
    # download and load the pre-trained model
    net = gluoncv.model_zoo.get_model(mxnet_name, pretrained=True)

    # create corresponding torch model
    torch_net = create_model(torch_name)

    # learnable parameters only; running stats are handled separately below
    mxp = [(k, v) for k, v in net.collect_params().items()
           if 'running' not in k]
    torchp = list(torch_net.named_parameters())
    torch_params = {}

    # zip() would silently drop trailing entries on a count mismatch,
    # defeating the per-item asserts below -- check counts explicitly.
    assert len(torchp) == len(mxp), \
        'parameter count mismatch: torch %d vs mxnet %d' % (len(torchp), len(mxp))

    # convert parameters
    # NOTE: we are relying on the fact that the order of parameters
    # are usually exactly the same between these models, thus no key name mapping
    # is necessary. Asserts will trip if this is not the case.
    for (tn, tv), (mn, mv) in zip(torchp, mxp):
        m_split = mn.split('_')
        t_split = tn.split('.')
        print(t_split, m_split)
        print(tv.shape, mv.shape)

        # ensure ordering of BN params match since their sizes are not specific
        if m_split[-1] == 'gamma':
            assert t_split[-1] == 'weight'
        if m_split[-1] == 'beta':
            assert t_split[-1] == 'bias'

        # ensure shapes match
        assert all(t == m for t, m in zip(tv.shape, mv.shape))

        torch_tensor = torch.from_numpy(mv.data().asnumpy())
        torch_params[tn] = torch_tensor

    # convert buffers (batch norm running stats)
    mxb = [(k, v) for k, v in net.collect_params().items()
           if any(x in k for x in ['running_mean', 'running_var'])]
    torchb = [(k, v) for k, v in torch_net.named_buffers()
              if 'num_batches' not in k]

    # same truncation hazard as above -- verify buffer counts match too
    assert len(torchb) == len(mxb), \
        'buffer count mismatch: torch %d vs mxnet %d' % (len(torchb), len(mxb))

    for (tn, tv), (mn, mv) in zip(torchb, mxb):
        print(tn, mn)
        print(tv.shape, mv.shape)

        # ensure ordering of BN params match since their sizes are not specific
        if 'running_var' in tn:
            assert 'running_var' in mn
        if 'running_mean' in tn:
            assert 'running_mean' in mn

        torch_tensor = torch.from_numpy(mv.data().asnumpy())
        torch_params[tn] = torch_tensor

    torch_net.load_state_dict(torch_params)
    torch_filename = './%s.pth' % torch_name
    torch.save(torch_net.state_dict(), torch_filename)
    # embed a short content hash in the filename (timm-style checkpoint naming)
    with open(torch_filename, 'rb') as f:
        sha_hash = hashlib.sha256(f.read()).hexdigest()
    final_filename = os.path.splitext(
        torch_filename)[0] + '-' + sha_hash[:8] + '.pth'
    os.rename(torch_filename, final_filename)
    # fixed misplaced quote: the hash was previously printed inside the
    # filename's quotation marks
    print("=> Saved converted model to '{}', SHA256: {}".format(
        final_filename, sha_hash))