Example 1
def load_model(dirname, device, weights=None, half=False):
    """
    Load a model from disk
    """
    # resolve bare model names against the package's bundled models directory
    if not os.path.isdir(dirname) and os.path.isdir(
            os.path.join(__dir__, "models", dirname)):
        dirname = os.path.join(__dir__, "models", dirname)

    if not weights:  # take the latest checkpoint
        weight_files = glob(os.path.join(dirname, "weights_*.tar"))
        if not weight_files:
            raise FileNotFoundError("no model weights found in '%s'" % dirname)
        weights = max(
            int(re.sub(r".*_([0-9]+)\.tar", r"\1", w)) for w in weight_files)

    device = torch.device(device)
    config = os.path.join(dirname, 'config.toml')
    weights = os.path.join(dirname, 'weights_%s.tar' % weights)
    model = Model(toml.load(config))
    model.to(device)

    state_dict = torch.load(weights, map_location=device)
    # checkpoints saved from a DataParallel-wrapped model prefix every key
    # with 'module.'; strip it so the names match the bare model
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k.replace('module.', '')
        new_state_dict[name] = v

    model.load_state_dict(new_state_dict)

    if half: model = model.half()
    model.eval()
    return model
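
A minimal usage sketch for the loader above (the model directory name, device string, and checkpoint number here are hypothetical, and Model and __dir__ are assumed to come from the surrounding package):

# load the latest checkpoint from a bundled model directory onto the GPU,
# in half precision for faster inference
model = load_model("example_model", device="cuda", half=True)

# or pin a specific checkpoint number on the CPU
model = load_model("/path/to/run", device="cpu", weights=10)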
Example 2
def load_model(dirname, device, weights=None):
    """
    Load a model from disk
    """
    if not weights:  # take the latest checkpoint
        weight_files = glob(os.path.join(dirname, "weights_*.tar"))
        if not weight_files:
            raise FileNotFoundError("no model weights found in '%s'" % dirname)
        weights = max(
            int(re.sub(r".*_([0-9]+)\.tar", r"\1", w)) for w in weight_files)

    device = torch.device(device)
    config = os.path.join(dirname, 'config.toml')
    weights = os.path.join(dirname, 'weights_%s.tar' % weights)
    model = Model(toml.load(config))
    model.to(device)
    model.load_state_dict(torch.load(weights, map_location=device))
    model.eval()
    return model
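
This variant differs from Example 1 in that it does not strip the 'module.' prefix that torch.nn.DataParallel adds to parameter names, so it would fail on checkpoints saved from a wrapped model. A minimal sketch of where that prefix comes from (the layer here is illustrative):

import torch

net = torch.nn.Linear(4, 2)
wrapped = torch.nn.DataParallel(net)

print(list(net.state_dict())[0])      # 'weight'
print(list(wrapped.state_dict())[0])  # 'module.weight'

# Example 1's k.replace('module.', '') maps the second form back to the first.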
Example 3
    def objective(trial):

        config = toml.load(args.config)

        lr = 1e-3
        #lr = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)

        config['encoder']['activation'] = 'gelu'
        #config['block'][0]['stride'] = [trial.suggest_int('stride', 4, 6)]

        # C1
        config['block'][0]['kernel'] = [
            int(trial.suggest_discrete_uniform('c1_kernel', 1, 129, 2))
        ]
        config['block'][0]['filters'] = trial.suggest_int(
            'c1_filters', 1, 1024)

        # B1 - B5
        for i in range(1, 6):
            config['block'][i]['repeat'] = trial.suggest_int(
                'b%s_repeat' % i, 1, 9)
            config['block'][i]['filters'] = trial.suggest_int(
                'b%s_filters' % i, 1, 512)
            config['block'][i]['kernel'] = [
                int(trial.suggest_discrete_uniform('b%s_kernel' % i, 1, 129,
                                                   2))
            ]

        # C2
        config['block'][-2]['kernel'] = [
            int(trial.suggest_discrete_uniform('c2_kernel', 1, 129, 2))
        ]
        config['block'][-2]['filters'] = trial.suggest_int(
            'c2_filters', 1, 1024)

        # C3
        config['block'][-1]['kernel'] = [
            int(trial.suggest_discrete_uniform('c3_kernel', 1, 129, 2))
        ]
        config['block'][-1]['filters'] = trial.suggest_int(
            'c3_filters', 1, 1024)

        model = Model(config)
        num_params = sum(p.numel() for p in model.parameters())

        print("[trial %s]" % trial.number)

        if num_params > args.max_params:
            print("[pruned] network too large")
            raise optuna.exceptions.TrialPruned()

        model.to(args.device)
        model.train()

        os.makedirs(workdir, exist_ok=True)

        optimizer = AdamW(model.parameters(), amsgrad=True, lr=lr)
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)
        scheduler = CosineAnnealingLR(optimizer,
                                      args.epochs * len(train_loader))

        for epoch in range(1, args.epochs + 1):

            try:
                train_loss, duration = train(model,
                                             device,
                                             train_loader,
                                             optimizer,
                                             use_amp=True)
                val_loss, val_mean, val_median = test(model, device,
                                                      test_loader)
                print(
                    "[epoch {}] directory={} loss={:.4f} mean_acc={:.3f}% median_acc={:.3f}%"
                    .format(epoch, workdir, val_loss, val_mean, val_median))
            except KeyboardInterrupt:
                exit()
            except Exception:
                print("[pruned] exception")
                raise optuna.exceptions.TrialPruned()

            if np.isnan(val_loss): val_loss = 9.9  # sentinel so the pruner can act on NaNs
            trial.report(val_loss, epoch)

            if trial.should_prune():
                print("[pruned] unpromising")
                raise optuna.exceptions.TrialPruned()

        trial.set_user_attr('seed', args.seed)
        trial.set_user_attr('val_loss', val_loss)
        trial.set_user_attr('val_mean', val_mean)
        trial.set_user_attr('val_median', val_median)
        trial.set_user_attr('train_loss', train_loss)
        trial.set_user_attr('batchsize', args.batch)
        trial.set_user_attr('model_params', num_params)

        torch.save(model.state_dict(),
                   os.path.join(workdir, "weights_%s.tar" % trial.number))
        with open(os.path.join(workdir, 'config_%s.toml' % trial.number),
                  'w') as f:
            toml.dump(config, f)

        print("[loss] %.4f" % val_loss)
        return val_loss
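
For context, an Optuna objective like this is driven by a study; a minimal sketch of the driver (the pruner choice and trial count are assumptions, not taken from the snippet):

import optuna

study = optuna.create_study(
    direction="minimize",                  # the objective returns val_loss
    pruner=optuna.pruners.MedianPruner(),  # backs trial.should_prune()
)
study.optimize(objective, n_trials=100)
print(study.best_trial.params)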
Example 4
def main(args):

    workdir = os.path.expanduser(args.training_directory)

    if os.path.exists(workdir) and not args.force:
        print("[error] %s exists." % workdir)
        exit(1)

    init(args.seed, args.device)
    device = torch.device(args.device)

    print("[loading data]")
    chunks, chunk_lengths, targets, target_lengths = load_data(
        limit=args.chunks, shuffle=True, directory=args.directory)

    split = np.floor(chunks.shape[0] * args.validation_split).astype(np.int32)
    train_dataset = ChunkDataSet(chunks[:split], chunk_lengths[:split],
                                 targets[:split], target_lengths[:split])
    test_dataset = ChunkDataSet(chunks[split:], chunk_lengths[split:],
                                targets[split:], target_lengths[split:])
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch,
                             num_workers=4,
                             pin_memory=True)

    config = toml.load(args.config)
    argsdict = dict(training=vars(args))

    chunk_config = {}
    chunk_config_file = os.path.join(
        args.directory if args.directory else __data__, 'config.toml')
    if os.path.isfile(chunk_config_file):
        chunk_config = toml.load(chunk_config_file)

    print("[loading model]")
    model = Model(config)

    weights = os.path.join(workdir, 'weights.tar')
    if os.path.exists(weights): model.load_state_dict(torch.load(weights))

    model.to(device)
    model.train()

    os.makedirs(workdir, exist_ok=True)
    with open(os.path.join(workdir, 'config.toml'), 'w') as f:
        toml.dump({**config, **argsdict, **chunk_config}, f)

    optimizer = AdamW(model.parameters(), amsgrad=True, lr=args.lr)

    if args.amp:
        try:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level="O1",
                                              verbosity=0)
        except NameError:
            print("[error] cannot use AMP: the Apex package needs to be "
                  "installed manually, see https://github.com/NVIDIA/apex")
            exit(1)

    scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader))

    for epoch in range(1, args.epochs + 1):

        try:
            train_loss, duration = train(model,
                                         device,
                                         train_loader,
                                         optimizer,
                                         use_amp=args.amp)
            val_loss, val_mean, val_median = test(model, device, test_loader)
        except KeyboardInterrupt:
            break

        print(
            "[epoch {}] directory={} loss={:.4f} mean_acc={:.3f}% median_acc={:.3f}%"
            .format(epoch, workdir, val_loss, val_mean, val_median))

        torch.save(model.state_dict(),
                   os.path.join(workdir, "weights_%s.tar" % epoch))
        with open(os.path.join(workdir, 'training.csv'), 'a',
                  newline='') as csvfile:
            csvw = csv.writer(csvfile, delimiter=',')
            if epoch == 1:
                csvw.writerow([
                    'time', 'duration', 'epoch', 'train_loss',
                    'validation_loss', 'validation_mean', 'validation_median'
                ])
            csvw.writerow([
                datetime.today(),
                int(duration),
                epoch,
                train_loss,
                val_loss,
                val_mean,
                val_median,
            ])

        scheduler.step()
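
Note that the scheduler's period is args.epochs * len(train_loader), i.e. it is sized in optimizer steps, while scheduler.step() runs once per epoch here, so the cosine cycle is only partially traversed. A sketch of the per-batch stepping that sizing implies (assuming train() were unrolled into an explicit loop; compute_loss is a hypothetical helper):

for epoch in range(1, args.epochs + 1):
    for batch in train_loader:
        optimizer.zero_grad()
        loss = compute_loss(model, batch)  # hypothetical: forward pass + loss
        loss.backward()
        optimizer.step()
        scheduler.step()                   # one step per batch matches the period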