Example #1
def main(run_id, pretrained, data_files, model_params, training_params,
         device):
    best_acc1 = 0
    batch_size = training_params['batch_size']
    test_batch_size = training_params['test_batch_size']
    epochs = training_params['epochs']
    start_epoch = training_params['start_epoch']
    n_warmup_steps = training_params['n_warmup_steps']
    log_interval = training_params['log_interval']

    # the model is trained for binary classification (this affects the dataloader)
    binary_class = model_params['NUM_SPOOF_CLASS'] == 2

    kwargs = {
        'num_workers': 2,
        'pin_memory': True
    } if device == torch.device('cuda') else {}

    # create model
    model = Detector(**model_params).to(device)
    num_model_params = sum(p.numel() for p in model.parameters()
                           if p.requires_grad)
    print('===> Model total parameter: {}'.format(num_model_params))

    # Wrap model for multi-GPUs, if necessary
    if device == torch.device('cuda') and torch.cuda.device_count() > 1:
        print('multi-gpu')
        model = nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    optim = optimizer.ScheduledOptim(
        torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                         betas=(0.9, 0.98),
                         eps=1e-09,
                         weight_decay=1e-4,
                         lr=3e-4,
                         amsgrad=True), n_warmup_steps)

    # optionally resume from a checkpoint
    if pretrained:
        if os.path.isfile(pretrained):
            print("===> loading checkpoint '{}'".format(pretrained))
            checkpoint = torch.load(pretrained)
            start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optim.load_state_dict(checkpoint['optimizer'])
            print("===> loaded checkpoint '{}' (epoch {})".format(
                pretrained, checkpoint['epoch']))
        else:
            print("===> no checkpoint found at '{}'".format(pretrained))

    # Data loading code
    train_data = SpoofDatsetSystemID(data_files['train_scp'],
                                     data_files['train_utt2index'],
                                     binary_class)
    val_data = SpoofDatsetSystemID(data_files['dev_scp'],
                                   data_files['dev_utt2index'], binary_class)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               **kwargs)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=test_batch_size,
                                             shuffle=True,
                                             **kwargs)

    best_epoch = 0
    early_stopping, max_patience = 0, 100  # for early stopping
    os.makedirs("model_snapshots/" + run_id, exist_ok=True)
    for epoch in range(start_epoch, start_epoch + epochs):

        trainer.train(train_loader, model, optim, epoch, device, log_interval)
        acc1 = validate.validate(val_loader, data_files['dev_utt2systemID'],
                                 model, device, log_interval)

        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # adjust learning rate + early stopping
        if is_best:
            early_stopping = 0
            best_epoch = epoch + 1
        else:
            early_stopping += 1
            if epoch - best_epoch > 2:
                optim.increase_delta()
                best_epoch = epoch + 1
        if early_stopping == max_patience:
            break

        # save model
        optimizer.save_checkpoint(
            {
                'epoch': epoch + 1,  # so a resumed run starts at the next epoch
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optim.state_dict(),
            }, is_best, "model_snapshots/" + str(run_id),
            str(epoch) + ('_%.3f' % acc1) + ".pth.tar")
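
ScheduledOptim (and the save_checkpoint helper) come from this project's optimizer module and are not shown on this page. Below is a minimal sketch of the warmup wrapper only, matching the calls made above (per-step schedule, state_dict passthrough for checkpointing, and an increase_delta() hook for the plateau handling); the exact schedule formula and the delta semantics are assumptions, not the project's actual code.

class ScheduledOptim:
    """Warmup learning-rate wrapper (Noam-style), sketched from its usage above."""

    def __init__(self, optimizer, n_warmup_steps, base_lr=3e-4):
        self.optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.base_lr = base_lr
        self.n_steps = 0
        self.delta = 1.0  # grows when validation accuracy plateaus

    def step(self):
        # linear warmup to base_lr, then inverse-sqrt decay, scaled down by delta
        self.n_steps += 1
        scale = min(self.n_steps ** -0.5,
                    self.n_steps * self.n_warmup_steps ** -1.5)
        scale *= self.n_warmup_steps ** 0.5  # normalize so the peak equals base_lr
        for group in self.optimizer.param_groups:
            group['lr'] = self.base_lr * scale / self.delta
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()

    def increase_delta(self):
        self.delta *= 2.0  # halves the effective learning rate on plateau

    def state_dict(self):
        return self.optimizer.state_dict()

    def load_state_dict(self, state):
        self.optimizer.load_state_dict(state)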
Example #2
def train(output_model_dir: str, input_model_path: Optional[str] = None, tb_path: Optional[str] = None,
          nuscenes_version: str = 'v1.0-mini', data_path: str = "data/v1.0-mini", n_scenes: Optional[int] = None,
          learning_rate: float = 1e-4, n_dumps_per_epoch: int = 10, n_loader_workers: int = 4, batch_size: int = 12,
          n_epochs: int = 50, device_id: Optional[List[int]] = None) -> None:
    """
    Train model, log training statistics if tb_path is specified.
    :param output_model_dir: path to directory to save model weights to
    :param input_model_path: path to model weights. If None, create new model
    :param tb_path: name of the folder for tensorboard data to be stored in
    :param nuscenes_version: version of the dataset
    :param data_path: relative path to data folder
    :param n_scenes: number of scenes in dataset
    :param learning_rate: learning rate for Adam
    :param n_dumps_per_epoch: how many times per epoch to dump images to tensorboard (not implemented yet)
    :param n_loader_workers: number of CPU workers for data loader processing
    :param batch_size: batch size
    :param n_epochs: total number of epochs to train the model
    :param device_id: list of GPU device ids to use, e.g. [0, 1]
    """
    # create path for model save
    os.makedirs(output_model_dir, exist_ok=True)

    # set up computing device for pytorch
    if torch.cuda.is_available():
        if device_id is None:
            device_id = [0]
        if max(device_id) < torch.cuda.device_count():
            # device_id/s all exist on machine,
            # device is set as a root device
            device = torch.device(f'cuda:{device_id[0]}')
        else:
            # device_id is out of range, fall back to the default cuda:0
            print('Warning: specified device_id is out of range for the available GPUs; using cuda:0.')
            device = torch.device('cuda:0')
        print('Using device: GPU\n')
    else:
        device = torch.device('cpu')
        print('Using device: CPU\n')

    date = datetime.datetime.now().strftime('%b-%d-%Y-%H:%M:%S')

    # set up tensorboard writer
    if tb_path is not None:
        train_writer = SummaryWriter(log_dir=f'{tb_path}/{date}/train')
        val_writer = SummaryWriter(log_dir=f'{tb_path}/{date}/val')
        print(f'Logging tensorboard data to directory: {tb_path}/{date}\n')
    else:
        train_writer, val_writer = None, None
        print('No tensorboard logging will be performed\n')

    # set up dataset and model
    nuscenes = create_nuscenes(data_path, nuscenes_version)
    train_dataset = NuscenesBEVDataset(nuscenes=nuscenes, n_scenes=n_scenes, mode='train')
    val_dataset = NuscenesBEVDataset(nuscenes=nuscenes, n_scenes=n_scenes, mode='val')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=n_loader_workers,
                              collate_fn=frames_bboxes_collate_fn, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=n_loader_workers,
                            collate_fn=frames_bboxes_collate_fn, pin_memory=True)
    print('Loaders are ready.',
          f'Number of batches in train loader: {len(train_loader)}',
          f'Number of batches in validation loader: {len(val_loader)}', sep='\n')

    frame_depth, frame_width, frame_length = train_dataset.grid_size
    model = Detector(img_depth=frame_depth)
    if input_model_path is not None:
        model.load_state_dict(torch.load(input_model_path, map_location="cpu"))
    model = model.to(device)
    criterion = DetectionLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, gamma=0.5, step_size=50)  # TODO: adjust step_size empirically
    detector_out_shape = (batch_size, model.out_channels, frame_width // (2 ** model.n_pools),
                          frame_length // (2 ** model.n_pools))
    gt_former = GroundTruthFormer((frame_width, frame_length), detector_out_shape, device=device)

    if device_id is not None and len(device_id) > 1 and max(device_id) < torch.cuda.device_count():
        # more than one valid device_id was specified, so wrap the model in DataParallel
        model = nn.DataParallel(model, device_ids=device_id)
    model = model.to(device)

    best_val_score = float('-inf')
    for epoch in trange(n_epochs, desc="Epoch"):
        run_epoch(model, train_loader, criterion, gt_former, epoch, mode='train',
                  writer=train_writer, optimizer=optimizer, device=device)
        scheduler.step()
        val_loss, val_score = run_epoch(model, val_loader, criterion, gt_former, epoch,
                                        mode='val', train_loader_size=len(train_loader), writer=val_writer,
                                        device=device)
        # save model weights whenever the validation score improves
        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), f'{output_model_dir}/{date}.pth')
            print('\nModel checkpoint is saved.', f'loss: {val_loss:.3f}, score: {val_score:.3f}', sep='\n')
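
run_epoch is defined elsewhere in this project. The sketch below shows a compact epoch loop with the signature used above; the batch structure, the GroundTruthFormer call, and the (loss, score) return of the criterion are assumptions made for illustration.

import torch


def run_epoch(model, loader, criterion, gt_former, epoch, mode='train',
              train_loader_size=None, writer=None, optimizer=None, device='cpu'):
    """Run one train or validation epoch; return (mean_loss, mean_score)."""
    is_train = mode == 'train'
    model.train(is_train)
    total_loss, total_score, n_batches = 0.0, 0.0, 0
    with torch.set_grad_enabled(is_train):
        for i, (frames, boxes) in enumerate(loader):  # assumed collate output
            frames = frames.to(device)
            predictions = model(frames)
            gt = gt_former(boxes)  # assumed callable on a batch of boxes
            loss, score = criterion(predictions, gt)  # assumed return pair
            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            if writer is not None:
                global_step = epoch * (train_loader_size or len(loader)) + i
                writer.add_scalar(f'{mode}/loss', loss.item(), global_step)
            total_loss += loss.item()
            total_score += float(score)
            n_batches += 1
    return total_loss / n_batches, total_score / n_batches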
Example #3

def main(pretrained, data_files, model_params, training_params, device):
    """ forward pass dev and eval data to trained model  """
    # only the test batch size is needed for these forward passes
    test_batch_size = training_params['test_batch_size']

    kwargs = {
        'num_workers': 4,
        'pin_memory': True
    } if device == torch.device('cuda') else {}

    # create model
    model = Detector(**model_params).to(device)
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('===> Model total parameter: {}'.format(num_params))

    if device == torch.device('cuda') and torch.cuda.device_count() > 1:
        print('multi-gpu')
        model = nn.DataParallel(model).cuda()

    if pretrained:
        # expects a checkpoint path like model_snapshots/<run_id>/<epoch>_<acc>.pth.tar
        epoch_id = os.path.basename(pretrained).split('_')[0]
        pretrained_id = os.path.basename(os.path.dirname(pretrained))
        if os.path.isfile(pretrained):
            print("===> loading checkpoint '{}'".format(pretrained))
            checkpoint = torch.load(
                pretrained,
                map_location=lambda storage, loc: storage)  # load for cpu
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            print("===> loaded checkpoint '{}' (epoch {})".format(
                pretrained, checkpoint['epoch']))
        else:
            print("===> no checkpoint found at '{}'".format(pretrained))
            sys.exit(1)
    else:
        raise ValueError('a pretrained checkpoint path is required')

    # Data loading code (class analysis for multi-class classification only)
    val_data = SpoofDatsetSystemID(data_files['dev_scp'],
                                   data_files['dev_utt2index'],
                                   binary_class=False)
    eval_data = SpoofDatsetSystemID(data_files['eval_scp'],
                                    data_files['eval_utt2index'],
                                    binary_class=False)

    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=test_batch_size,
                                             shuffle=False,
                                             **kwargs)
    eval_loader = torch.utils.data.DataLoader(eval_data,
                                              batch_size=test_batch_size,
                                              shuffle=False,
                                              **kwargs)

    os.makedirs(data_files['scoring_dir'], exist_ok=True)
    # forward pass for dev
    print("===> forward pass for dev set")
    score_file_pth = os.path.join(
        data_files['scoring_dir'],
        '{}-epoch{}-dev_scores.txt'.format(pretrained_id, epoch_id))
    print("===> dev scoring file saved at: '{}'".format(score_file_pth))
    prediction.prediction(val_loader, model, device, score_file_pth,
                          data_files['dev_utt2systemID'])

    # forward pass for eval
    print("===> forward pass for eval set")
    score_file_pth = os.path.join(
        data_files['scoring_dir'],
        '{}-epoch{}-eval_scores.txt'.format(pretrained_id, epoch_id))
    print("===> eval scoring file saved at: '{}'".format(score_file_pth))
    prediction.prediction(eval_loader, model, device, score_file_pth,
                          data_files['eval_utt2systemID'])
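
prediction.prediction is project code that this page does not include. Below is a minimal sketch of a scoring pass with the same call shape, assuming each batch yields utterance ids plus features and that one line of per-class log-probabilities is written per utterance; utt2systemID_file would map ids to system labels for later analysis and is left unused here.

import torch
import torch.nn.functional as F


def prediction(data_loader, model, device, score_file_path, utt2systemID_file):
    """Forward every utterance and dump per-class log-probabilities to a text file."""
    model.eval()
    with open(score_file_path, 'w') as fout, torch.no_grad():
        for utt_ids, feats in data_loader:  # assumed loader output
            feats = feats.to(device)
            log_probs = F.log_softmax(model(feats), dim=-1)
            for utt_id, row in zip(utt_ids, log_probs.cpu().tolist()):
                fout.write(utt_id + ' ' + ' '.join('%.6f' % p for p in row) + '\n')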
Example #4

        print('Skipping:', args.run_dir)
        sys.exit(0)

    ckpt_dir = os.path.join(args.run_dir, 'ckpt')
    os.makedirs(ckpt_dir, exist_ok=True)

    params_file = os.path.join(args.run_dir, 'params.csv')
    params.to_csv(params_file, index=False)
    with pd.option_context('display.width', None, 'max_columns', None):
        print(params)

    log_file = os.path.join(args.run_dir, 'log.csv')
    log = pd.DataFrame()

    optimizer = Adam(detector.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    weights = train_data.weights
    sampler = WeightedRandomSampler(weights, len(weights))
    
    # Precompute embeddings for the train and val sets once
    dist_tag = 'cos' if args.distance == 'cosine' else 'euc'
    centroid_tag = 'med' if 'medoids' in args.centroids else 'centr'

    print('Precomputing embedded features: TRAIN')
    train_cache = 'cache/cache_train_{}_{}.pth'.format(dist_tag, centroid_tag)
    train_data = precompute_embeddings(features_state, train_data, model, args, cache=train_cache)

    print('Precomputing embedded features: VAL')
    val_cache = 'cache/cache_val_{}_{}.pth'.format(dist_tag, centroid_tag)
    val_paths, val_data = precompute_embeddings(features_state, val_data, model, args, return_paths=True,
                                                cache=val_cache)

    train_loader = DataLoader(train_data, sampler=sampler, pin_memory=True, batch_size=args.batch_size)
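
The weights passed to WeightedRandomSampler above are one weight per training sample. A short self-contained sketch of the usual inverse-class-frequency derivation (the label list is a hypothetical stand-in for the dataset's labels):

from collections import Counter

from torch.utils.data import WeightedRandomSampler

labels = [0, 0, 0, 1, 2]  # hypothetical per-sample class labels
counts = Counter(labels)
# inverse-frequency weights: samples from rare classes are drawn more often
weights = [1.0 / counts[y] for y in labels]

# draws len(weights) samples per epoch, with replacement by default,
# which yields approximately class-balanced batches
sampler = WeightedRandomSampler(weights, num_samples=len(weights))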
Example #5
                               args.image_size,
                               args.file_loc,
                               args.shuffle,
                               fold=0,
                               do_transform=False)
    val_loader = DataLoader(val_dataset,
                            num_workers=0,
                            shuffle=True,
                            batch_size=args.batch_size,
                            pin_memory=False,
                            drop_last=True)

    model = Detector(args.dropout_rate).cuda()
    set_requires_grad([model.feature_extractor], False)

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-6)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     T_max=args.epochs,
                                                     eta_min=args.min_lr,
                                                     last_epoch=-1)
    bce_loss = torch.nn.BCEWithLogitsLoss()

    epoch = 0
    step = 0
    writer = SummaryWriter(args.logging_path)

    if args.resume_from_last:
        args.checkpoint_path = get_last_checkpoint_filename(args.logging_path)

    if args.checkpoint_path != "":
        epoch, step = load_checkpoint(model, optimizer, args.checkpoint_path)
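
set_requires_grad, get_last_checkpoint_filename, and load_checkpoint are project helpers not shown on this page. Minimal sketches consistent with how they are called above; the checkpoint file layout and key names are assumptions.

import glob
import os

import torch


def set_requires_grad(modules, requires_grad):
    """Freeze or unfreeze every parameter of the given modules."""
    for module in modules:
        for param in module.parameters():
            param.requires_grad = requires_grad


def get_last_checkpoint_filename(logging_path):
    """Return the most recently modified checkpoint under logging_path, or ''."""
    checkpoints = glob.glob(os.path.join(logging_path, '*.pth'))
    return max(checkpoints, key=os.path.getmtime) if checkpoints else ''


def load_checkpoint(model, optimizer, checkpoint_path):
    """Restore model and optimizer state; return (epoch, step) to resume from."""
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])  # assumed key names
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch'], checkpoint['step']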