def find_lr(self, start_lr=1e-7, end_lr=100, num_iter=100):
    optimizer_state = self.optimizer.state_dict()
    self.update_lr(start_lr)
    self.lr_finder = LRFinder(self.model, self.optimizer, self.criterion,
                              self.device)
    self.lr_finder.range_test(self.train_loader,
                              end_lr=end_lr,
                              num_iter=num_iter)
    self.optimizer.load_state_dict(optimizer_state)
    self.lr_finder.plot()
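Note that this method snapshots and restores only the optimizer state around the range test; the model weights mutated by range_test are left as they end up. With the davidtvs pytorch-lr-finder package, the finder itself caches both model and optimizer state at construction, so a sketch that restores everything could simply rely on the finder's own reset():

# a minimal sketch: LRFinder.reset() restores both the model and the
# optimizer to the state captured when the finder was constructed
self.lr_finder.range_test(self.train_loader, end_lr=end_lr, num_iter=num_iter)
self.lr_finder.plot()
self.lr_finder.reset()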
Example 2
def lr_finder(model, optimizer, criterion, trainloader):
    lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
    lr_finder.range_test(trainloader,
                         end_lr=100,
                         num_iter=100,
                         step_mode="exp")
    lr_finder.plot()  # to plot the loss vs. learning rate curve
    lr_finder.reset()  # to reset the lr_finder
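After the range test, a learning rate still has to be chosen from the curve. One common heuristic, shown here as a sketch, reads the finder's recorded history (keys 'lr' and 'loss' in the davidtvs pytorch-lr-finder package) and takes the point of steepest descent:

import numpy as np

def suggest_lr(history, skip_start=10, skip_end=5):
    # steepest downward slope of the loss-vs-lr curve
    lrs = np.array(history['lr'][skip_start:-skip_end])
    losses = np.array(history['loss'][skip_start:-skip_end])
    return lrs[np.gradient(losses).argmin()]

# usage: lr = suggest_lr(lr_finder.history)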
Example 3
def lr_finder(model, train_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    optimizer_ft = optim.Adam(model.parameters(), lr=0.0000001)
    lr_finder = LRFinder(model, optimizer_ft, criterion, device=device)
    lr_finder.range_test(train_loader, end_lr=1, num_iter=1000)
    lr_finder.reset()
    lr_finder.plot()
Example 4
def executeLr_finder(model, optimizer, device, trainloader, criterion):

    # find and plot the best LR (use the device passed in rather than hard-coding "cuda")
    lr_finder = LRFinder(model, optimizer, criterion, device=device)
    lr_finder.range_test(trainloader,
                         end_lr=100,
                         num_iter=100,
                         step_mode="exp")
    lr_finder.plot()  # to inspect the loss-learning rate graph

    lr_finder.reset()  # to reset the model and optimizer to their initial state
Example 5
def get_model(batchsize=8, dropout=0.3):
    # Inputs
    inp_normal = keras.layers.Input(shape=(xtrain.shape[1] - len(embedding_features), ), name='inp_normal')
    inp_dow_embedding = keras.layers.Input(shape=(1, ), name='inp_dow_embedding')
    inp_hod_embedding = keras.layers.Input(shape=(1, ), name='inp_hod_embedding')

    # Embeddings
    dow_embedding = keras.layers.Embedding(input_dim=7, output_dim=3, input_length=1)(inp_dow_embedding)
    dow_embedding = keras.layers.Flatten()(dow_embedding)

    hod_embedding = keras.layers.Embedding(input_dim=24, output_dim=10, input_length=1)(inp_hod_embedding)
    hod_embedding = keras.layers.Flatten()(hod_embedding)

    # Hidden layers
    concat = keras.layers.Concatenate()([inp_normal, dow_embedding, hod_embedding])
    x = keras.layers.Dense(units=100, activation='relu')(concat)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(dropout)(x)
    x = keras.layers.Dense(units=40, activation='relu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(dropout)(x)
    x = keras.layers.Dense(units=10, activation='relu')(x)
    x = keras.layers.BatchNormalization()(x)
    out = keras.layers.Dense(units=1, activation='sigmoid')(x)

    nn = keras.Model(inputs=[inp_normal, inp_dow_embedding, inp_hod_embedding], outputs=out)
    nn.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.SGD(lr=0.0001),
        metrics=['accuracy', keras.metrics.Precision()]
    )

    lr_finder = LRFinder(0.0001, 0.1)
    nn.fit(
        x={
            'inp_normal': xtrain.drop(embedding_features, axis=1).values,
            'inp_dow_embedding': xtrain.dayofweek.values.reshape(-1, 1),
            'inp_hod_embedding': xtrain.hourofday.values.reshape(-1, 1)
        },
        y=ytrain.values,
        validation_data=(
            [
                xval.drop(embedding_features, axis=1).values,
                xval.dayofweek.values.reshape(-1, 1),
                xval.hourofday.values.reshape(-1, 1)
            ],
            yval.values
        ),
        epochs=2,
        batch_size=batchsize,
        callbacks=[lr_finder]
    )
    return nn
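The LRFinder used above is a Keras callback built as LRFinder(min_lr, max_lr), whose definition the snippet does not show. Below is a minimal sketch of such a callback, assuming the usual exponential ramp; the class name and behavior are assumptions, not the exact implementation imported here:

import numpy as np
import keras
import keras.backend as K

class LRFinder(keras.callbacks.Callback):
    """Minimal sketch: ramp the LR exponentially from min_lr to max_lr
    over the whole fit() run, recording (lr, loss) for every batch."""

    def __init__(self, min_lr, max_lr):
        super().__init__()
        self.min_lr, self.max_lr = min_lr, max_lr
        self.lrs, self.losses = [], []
        self.iteration = 0
        self.total_iterations = None

    def on_train_begin(self, logs=None):
        # params carries 'steps'/'epochs' for generator-style fits and
        # 'samples'/'batch_size' for array inputs; handle both
        steps = self.params.get('steps') or int(
            np.ceil(self.params['samples'] / self.params['batch_size']))
        self.total_iterations = steps * self.params['epochs']
        K.set_value(self.model.optimizer.lr, self.min_lr)

    def on_batch_end(self, batch, logs=None):
        self.iteration += 1
        frac = self.iteration / self.total_iterations
        lr = self.min_lr * (self.max_lr / self.min_lr) ** frac
        self.lrs.append(lr)
        self.losses.append(logs.get('loss'))
        K.set_value(self.model.optimizer.lr, lr)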
Example 6
	def find_lr(self, model, device, train_loader, lr_val=1e-8, decay=1e-2):
		criterion = nn.CrossEntropyLoss()
		optimizer = optim.SGD(model.parameters(), lr=lr_val, weight_decay=decay)
		lr_finder = LRFinder(model, optimizer, criterion, device)
		lr_finder.range_test(train_loader, end_lr=100, num_iter=100, step_mode="exp")
		lr_finder.plot()
		return lr_finder
Example 7
def lr_finder(net, optimizer, loss_fun, trainloader, testloader):
    # Using LRFinder
    lr_finder = LRFinder(net, optimizer, loss_fun, device='cuda')
    lr_finder.range_test(trainloader,
                         val_loader=testloader,
                         start_lr=1e-3,
                         end_lr=0.1,
                         num_iter=100,
                         step_mode='exp')
    lr_finder.plot(log_lr=False)
    lr_finder.reset()  # important: restores the model and optimizer to their initial state

    return lr_finder.history
Example 8
def train_and_test():
    # load network
    model = vgg11(365, 1)

    model.compile(
        loss='categorical_crossentropy',
        # Learning rate will be set by lr_finder
        optimizer=SGD(lr=0.0, momentum=0.9),
        metrics=['accuracy', top_5])

    # load data
    img_size = (224, 224)
    color_mode = 'grayscale'
    batch_size = 64
    train_dir = '/usr/local/data/gabriel/places365_line_drawings/train'
    test_dir = '/usr/local/data/gabriel/places365_line_drawings/val'

    # fixed for places365
    nb_train_samples = 1803460.
    nb_test_samples = 36500.

    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       shear_range=0.2,
                                       zoom_range=0.2,
                                       horizontal_flip=True)

    # no test data for now
    #  test_datagen = ImageDataGenerator(rescale=1. / 255)

    train_gen = train_datagen.flow_from_directory(train_dir,
                                                  target_size=img_size,
                                                  batch_size=batch_size,
                                                  class_mode='categorical',
                                                  color_mode=color_mode)

    # no test data for now
    #  test_gen = test_datagen.flow_from_directory(
    #  test_dir,
    #  target_size = img_size,
    #  batch_size = batch_size,
    #  class_mode = 'categorical',
    #  color_mode = color_mode
    #  )

    # find best learning rate
    lr_finder = LRFinder(min_lr=1e-5,
                         max_lr=1e-2,
                         steps_per_epoch=np.ceil(nb_train_samples /
                                                 batch_size),
                         epochs=4)

    model.fit_generator(train_gen,
                        steps_per_epoch=np.ceil(nb_train_samples / batch_size),
                        epochs=4,
                        callbacks=[lr_finder])

    # save loss and learning rate plots to files
    lr_finder.plot_loss('loss.png')
    lr_finder.plot_lr('lr.png')
Example 9
    def lr_finder(self, end_lr):

        lr_find = LRFinder(self.model, self.optimizer, self.criterion,
                           cfg.device)
        lr_find.range_test(self.data_loaders['val'],
                           end_lr=end_lr,
                           num_iter=2000)
        lr_find.plot()
Example 10
def _main():
    annotation_path = 'train.txt'
    classes_path = 'model_data/openimgs_classes.txt'
    anchors_path = 'model_data/yolo_anchors.txt'
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    anchors = get_anchors(anchors_path)

    input_shape = (416, 416)  # multiple of 32, hw

    # use darknet53 weights
    #model = create_model(input_shape, anchors, num_classes,
    #        freeze_body=2, weights_path='model_data/darknet53_weights.h5')
    model = create_model(input_shape,
                         anchors,
                         num_classes,
                         freeze_body=0,
                         weights_path='logs/001/trained_weights_stage_2.h5')

    val_split = 0.1
    with open(annotation_path) as f:
        lines = f.readlines()
    #np.random.seed(10101)
    np.random.shuffle(lines)
    #np.random.seed(None)
    num_val = int(len(lines) * val_split)
    num_val = 10000 if num_val > 10000 else num_val
    num_train = len(lines) - num_val

    if True:
        batch_size = 6
        lr_finder = LRFinder(min_lr=1e-10,
                             max_lr=2e-2,
                             steps_per_epoch=TOTAL_ITERATIONS,
                             epochs=1)
        for i in range(len(model.layers)):
            model.layers[i].trainable = True
        model.compile(optimizer=SGD(lr=1e-8),
                      loss={
                          'yolo_loss': lambda y_true, y_pred: y_pred
                      })

        print('train on {} samples, val on {} samples, with batch size {}.'.
              format(num_train, num_val, batch_size))
        model.fit_generator(data_generator_wrapper(lines[:num_train],
                                                   batch_size, input_shape,
                                                   anchors, num_classes),
                            steps_per_epoch=TOTAL_ITERATIONS,
                            validation_data=data_generator_wrapper(
                                lines[num_train:], batch_size, input_shape,
                                anchors, num_classes),
                            validation_steps=1,
                            epochs=1,
                            initial_epoch=0,
                            callbacks=[lr_finder])

        lr_finder.save_history('lr_finder_loss.csv')
        lr_finder.plot_loss('lr_finder_loss.png')
Example 11
def get_LR(model, trainloader, optimizer, criterion, device, testloader=None):

    # print("########## Tweaked version from fastai ###########")
    # lr_find = LRFinder(model, optimizer, criterion, device="cuda")
    # lr_find.range_test(trainloader, end_lr=100, num_iter=100)
    # best_lr=lr_find.plot()  # to inspect the loss-learning rate graph
    # lr_find.reset()
    # return best_lr

    # print("########## Tweaked version from fastai ###########")
    # lr_find = LRFinder(model, optimizer, criterion, device="cuda")
    # lr_find.range_test(trainloader, end_lr=1, num_iter=100)
    # lr_find.plot() # to inspect the loss-learning rate graph
    # lr_find.reset()
    # for index in range(len(lr_find.history['loss'])):
    #   item = lr_find.history['loss'][index]
    #   if item == lr_find.best_loss:
    #     min_val_index = index
    #     print(f"{min_val_index}")
    #
    # lr_find.plot(show_lr=lr_find.history['lr'][75])
    # lr_find.plot(show_lr=lr_find.history['lr'][min_val_index])
    #
    # val_index = 75
    # mid_val_index = math.floor((val_index + min_val_index)/2)
    # show_lr=[{'data': lr_find.history['lr'][val_index], 'linestyle': 'dashed'}, {'data': lr_find.history['lr'][mid_val_index], 'linestyle': 'solid'}, {'data': lr_find.history['lr'][min_val_index], 'linestyle': 'dashed'}]
    # # lr_find.plot_best_lr(skip_start=10, skip_end=5, log_lr=True, show_lr=show_lr, ax=None)
    #
    # best_lr = lr_find.history['lr'][mid_val_index]
    # print(f"LR to be used: {best_lr}")
    #
    # return best_lr

    print("########## Leslie Smith's approach ###########")
    lr_find = LRFinder(model, optimizer, criterion, device="cuda")
    lr_find.range_test(trainloader,
                       val_loader=testloader,
                       end_lr=1,
                       num_iter=100,
                       step_mode="linear")
    best_lr = lr_find.plot(log_lr=False)
    lr_find.reset()
    return best_lr
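Note that in the stock davidtvs package plot() returns the matplotlib Axes rather than a learning rate, so best_lr above assumes a tweaked plot. With newer stock releases the same thing can be had via suggest_lr (a sketch, assuming pytorch-lr-finder >= 0.2):

# plot(suggest_lr=True) returns (axes, suggested_lr), where the suggestion
# is the point of steepest loss descent
ax, best_lr = lr_find.plot(log_lr=False, suggest_lr=True)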
Example 12
import imgaug.augmenters as iaa

augs_train = iaa.Sequential([
    # iaa.Sometimes(0.1, iaa.Grayscale(alpha=(0.0, 1.0), from_colorspace="RGB", name="grayscale")),
    # iaa.Sometimes(0.2, iaa.AdditiveLaplaceNoise(scale=(0, 0.1*255), per_channel=True, name="gaus-noise")),
    # Color, Contrast, etc.
    iaa.Sometimes(0.2, iaa.Multiply((0.75, 1.25), per_channel=0.1, name="brightness")),
    iaa.Sometimes(0.2, iaa.GammaContrast((0.7, 1.3), per_channel=0.1, name="contrast")),
    iaa.Sometimes(0.2, iaa.AddToHueAndSaturation((-20, 20), name="hue-sat")),
    iaa.Sometimes(0.3, iaa.Add((-20, 20), per_channel=0.5, name="color-jitter")),
])
augs_test = iaa.Sequential([
    # Geometric Augs
    iaa.Scale((imsize, imsize), 0),
])


db_train = AlphaPilotSegmentation(
    input_dir='data/dataset/train/images', label_dir='data/dataset/train/labels',
    transform=augs_train,
    input_only=["gaus-blur", "grayscale", "gaus-noise", "brightness", "contrast", "hue-sat", "color-jitter"],
    return_image_name=False
)


trainloader = DataLoader(db_train, batch_size=p['trainBatchSize'], shuffle=True, num_workers=32, drop_last=True)


# %matplotlib inline

lr_finder = LRFinder(net, optimizer, criterion, device="cuda")
lr_finder.range_test(trainloader, end_lr=1, num_iter=100)
lr_finder.plot()
# plt.show()
Example 13
# import os
# os.environ['MKL_NUM_THREADS'] = '1'
# import numpy as np
# import utils
#
#
# def main():
#     inpt = np.random.randint(-10, 10, 150000)
#
#     for i in range(1000):
#         out = utils.spectrogram(inpt, 256)
#
#
# if __name__ == '__main__':
#     main()

from lr_finder import LRFinder
import torch.nn as nn
from torch import optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
lr_finder.range_test(train_loader, end_lr=1, num_iter=50, step_mode="exp")
lr_finder.get_best_lr()
# lr_finder.plot()
# lr_finder.history
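get_best_lr() is not part of the stock davidtvs LRFinder, so this snippet assumes a patched finder. A hypothetical equivalent built on the recorded history might look like:

import numpy as np
from lr_finder import LRFinder

class LRFinderWithBest(LRFinder):
    # hypothetical helper, not in the stock package
    def get_best_lr(self, skip_start=10, skip_end=5):
        lrs = np.array(self.history['lr'][skip_start:-skip_end])
        losses = np.array(self.history['loss'][skip_start:-skip_end])
        # rule of thumb: an order of magnitude below the loss minimum
        return lrs[losses.argmin()] / 10.0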
Example 14
def main(batch_size: int = 24,
         epochs: int = 384,
         train_path: str = 'train',
         val_path: str = 'val',
         multi_gpu_weights=None,
         weights=None,
         workers: int = 8,
         find_lr: bool = False):

    keras_model = MobileDetectNetModel.complete_model()
    keras_model.summary()

    if weights is not None:
        keras_model.load_weights(weights, by_name=True)

    train_seq = MobileDetectNetSequence(train_path,
                                        stage="train",
                                        batch_size=batch_size)
    val_seq = MobileDetectNetSequence(val_path,
                                      stage="val",
                                      batch_size=batch_size)

    keras_model = keras.utils.multi_gpu_model(keras_model,
                                              gpus=[0, 1],
                                              cpu_merge=True,
                                              cpu_relocation=False)
    if multi_gpu_weights is not None:
        keras_model.load_weights(multi_gpu_weights, by_name=True)

    callbacks = []

    def region_loss(classes):
        def loss_fn(y_true, y_pred):
            # Don't penalize bounding box errors when there is no object present
            return 10 * classes * K.abs(y_pred - y_true)

        return loss_fn

    keras_model.compile(optimizer=SGD(),
                        loss=[
                            'mean_absolute_error',
                            region_loss(
                                keras_model.get_layer('classes').output),
                            'binary_crossentropy'
                        ])

    if find_lr:
        from lr_finder import LRFinder
        lr_finder = LRFinder(keras_model)
        lr_finder.find_generator(train_seq,
                                 start_lr=0.000001,
                                 end_lr=1,
                                 epochs=5)
        lr_finder.plot_loss()
        return

    filepath = "weights-{epoch:02d}-{val_loss:.4f}-multi-gpu.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min')
    callbacks.append(checkpoint)

    sgdr_sched = SGDRScheduler(0.00001,
                               0.01,
                               steps_per_epoch=np.ceil(
                                   len(train_seq) / batch_size),
                               mult_factor=1.5)
    callbacks.append(sgdr_sched)

    keras_model.fit_generator(
        train_seq,
        validation_data=val_seq,
        epochs=epochs,
        steps_per_epoch=np.ceil(len(train_seq) / batch_size),
        validation_steps=np.ceil(len(val_seq) / batch_size),
        callbacks=callbacks,
        use_multiprocessing=True,
        workers=workers,
        shuffle=True)
Example 15
def main(args=None):
    set_random_seed(63)
    chainer.global_config.autotune = True
    chainer.cuda.set_max_workspace_size(512 * 1024 * 1024)
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--learnrate',
                        '-l',
                        type=float,
                        default=0.01,
                        help='Learning rate for SGD')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=80,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--loss-function',
                        choices=['focal', 'sigmoid'],
                        default='focal')
    parser.add_argument('--optimizer',
                        choices=['sgd', 'adam', 'adabound'],
                        default='adam')
    parser.add_argument('--size', type=int, default=224)
    parser.add_argument('--limit', type=int, default=None)
    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--dropout', type=float, default=0.5)  # referenced below via args.dropout; default assumed
    parser.add_argument('--lr-search', action='store_true')
    parser.add_argument('--pretrained', type=str, default='')
    parser.add_argument('--backbone',
                        choices=['resnet', 'seresnet', 'debug_model'],
                        default='resnet')
    parser.add_argument('--log-interval', type=int, default=100)
    parser.add_argument('--find-threshold', action='store_true')
    parser.add_argument('--finetune', action='store_true')
    parser.add_argument('--mixup', action='store_true')
    args = parser.parse_args() if args is None else parser.parse_args(args)

    print(args)

    if args.mixup and args.loss_function != 'focal':
        raise ValueError('mixup currently only works with focal loss')

    train, test, cooccurrence = get_dataset(args.data_dir, args.size,
                                            args.limit, args.mixup)
    base_model = backbone_catalog[args.backbone](args.dropout)

    if args.pretrained:
        print('loading pretrained model: {}'.format(args.pretrained))
        chainer.serializers.load_npz(args.pretrained, base_model, strict=False)
    model = TrainChain(base_model,
                       1,
                       loss_fn=args.loss_function,
                       cooccurrence=cooccurrence,
                       co_coef=0)
    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    if args.optimizer in ['adam', 'adabound']:
        optimizer = Adam(alpha=args.learnrate,
                         adabound=args.optimizer == 'adabound',
                         weight_decay_rate=1e-5,
                         gamma=5e-7)
    elif args.optimizer == 'sgd':
        optimizer = chainer.optimizers.MomentumSGD(lr=args.learnrate)

    optimizer.setup(model)

    if not args.finetune:
        print('freezing the feature extractor for the first epoch')
        model.freeze_extractor()

    train_iter = chainer.iterators.MultiprocessIterator(train,
                                                        args.batchsize,
                                                        n_processes=8,
                                                        n_prefetch=2)
    test_iter = chainer.iterators.MultithreadIterator(test,
                                                      args.batchsize,
                                                      n_threads=8,
                                                      repeat=False,
                                                      shuffle=False)

    if args.find_threshold:
        # some of the setup above (train_iter, optimizer, ...) is wasted in this branch
        print('searching for the best threshold, then exiting')
        chainer.serializers.load_npz(join(args.out, 'bestmodel_loss'),
                                     base_model)
        print('results for the model with the lowest loss:')
        find_threshold(base_model, test_iter, args.gpu, args.out)

        chainer.serializers.load_npz(join(args.out, 'bestmodel_f2'),
                                     base_model)
        print('results for the model with the highest f2:')
        find_threshold(base_model, test_iter, args.gpu, args.out)
        return

    # Set up a trainer
    updater = training.updaters.StandardUpdater(
        train_iter,
        optimizer,
        device=args.gpu,
        converter=lambda batch, device: chainer.dataset.concat_examples(
            batch, device=device))
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(FScoreEvaluator(test_iter, model, device=args.gpu))

    if args.optimizer == 'sgd':
        # weight decay reportedly does not work well with Adam
        optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))
        trainer.extend(extensions.ExponentialShift('lr', 0.1),
                       trigger=(3, 'epoch'))
        if args.lr_search:
            print('searching for the best learning rate')
            trainer.extend(LRFinder(1e-7, 1, 5, optimizer),
                           trigger=(1, 'iteration'))
    elif args.optimizer in ['adam', 'adabound']:
        if args.lr_search:
            print('searching for the best learning rate')
            trainer.extend(LRFinder(1e-7, 1, 5, optimizer, lr_key='alpha'),
                           trigger=(1, 'iteration'))

        trainer.extend(extensions.ExponentialShift('alpha', 0.2),
                       trigger=triggers.EarlyStoppingTrigger(
                           monitor='validation/main/loss'))

    # Take a snapshot of Trainer at each epoch
    trainer.extend(
        extensions.snapshot(filename='snaphot_epoch_{.updater.epoch}'),
        trigger=(10, 'epoch'))

    # Take a snapshot of Model which has best val loss.
    # Because searching best threshold for each evaluation takes too much time.
    trainer.extend(extensions.snapshot_object(model.model, 'bestmodel_loss'),
                   trigger=triggers.MinValueTrigger('validation/main/loss'))
    trainer.extend(extensions.snapshot_object(model.model, 'bestmodel_f2'),
                   trigger=triggers.MaxValueTrigger('validation/main/f2'))
    trainer.extend(extensions.snapshot_object(model.model,
                                              'model_{.updater.epoch}'),
                   trigger=(5, 'epoch'))

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.log_interval, 'iteration')))

    trainer.extend(
        extensions.PrintReport([
            'epoch', 'lr', 'elapsed_time', 'main/loss', 'main/co_loss',
            'validation/main/loss', 'validation/main/co_loss',
            'validation/main/precision', 'validation/main/recall',
            'validation/main/f2', 'validation/main/threshold'
        ]))

    trainer.extend(extensions.ProgressBar(update_interval=args.log_interval))
    trainer.extend(extensions.observe_lr(),
                   trigger=(args.log_interval, 'iteration'))
    trainer.extend(CommandsExtension())
    save_args(args, args.out)

    trainer.extend(lambda trainer: model.unfreeze_extractor(),
                   trigger=(1, 'epoch'))

    if args.resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(args.resume, trainer)

    # save args with pickle for prediction time
    pickle.dump(args, open(str(Path(args.out).joinpath('args.pkl')), 'wb'))

    # Run the training
    trainer.run()

    # find optimal threshold
    chainer.serializers.load_npz(join(args.out, 'bestmodel_loss'), base_model)
    print('results for the model with the lowest loss:')
    find_threshold(base_model, test_iter, args.gpu, args.out)

    chainer.serializers.load_npz(join(args.out, 'bestmodel_f2'), base_model)
    print('results for the model with the highest f2:')
    find_threshold(base_model, test_iter, args.gpu, args.out)
Example 16
from lr_finder import LRFinder
from src.model_lib.MultiFTNet import MultiFTNet
from src.model_lib.MiniFASNet import MiniFASNetV1, MiniFASNetV2, MiniFASNetV1SE, MiniFASNetV2SE
from src.utility import get_kernel
from torch.nn import CrossEntropyLoss, MSELoss
from torch import optim
from src.data_io.dataset_loader import get_train_loader, get_eval_loader
from src.default_config import get_default_config, update_config
from train import parse_args
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "3"
kernel_size = get_kernel(80, 60)
model = MultiFTNet(conv6_kernel=kernel_size)
cls_criterion = CrossEntropyLoss()
FT_criterion = MSELoss()
# optimizer = optim.SGD(model.parameters(),
#                       lr=0.1,
#                       weight_decay=5e-4,
#                       momentum=0.9)
optimizer = optim.AdamW(model.parameters())
lr_finder = LRFinder(model, optimizer, cls_criterion, FT_criterion)
conf = get_default_config()
args = parse_args()
conf = update_config(args, conf)
trainloader = get_train_loader(conf)
val_loader = get_eval_loader(conf)
lr_finder.range_test(trainloader, end_lr=1, num_iter=100, step_mode="linear")
lr_finder.plot(log_lr=False)
lr_finder.reset()
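The LRFinder imported here is a locally modified one that accepts two criteria (the classification loss plus the feature-map MSELoss); the stock davidtvs package takes a single criterion. A hypothetical adapter that folds both losses into one callable, assuming the model returns (logits, ft_map) and the loader packs targets as (labels, ft_target):

import torch.nn as nn

class JointCriterion(nn.Module):
    """Hypothetical adapter for single-criterion finders."""
    def __init__(self, cls_criterion, ft_criterion, ft_weight=1.0):
        super().__init__()
        self.cls_criterion = cls_criterion
        self.ft_criterion = ft_criterion
        self.ft_weight = ft_weight

    def forward(self, outputs, targets):
        logits, ft_map = outputs     # assumed model output structure
        labels, ft_target = targets  # assumed target packing
        return (self.cls_criterion(logits, labels)
                + self.ft_weight * self.ft_criterion(ft_map, ft_target))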
Example 17
from learner import Learner
from sklearn import model_selection as ms
from classifier import Classifier

if __name__ == '__main__':
    device = torch.device("cpu")

    net = Classifier('tf_efficientnet_b4_ns', 5, pretrained=True)

    transform = T.Compose([
        T.ToTensor(),
        T.Resize((380, 380)),
        T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    data_root = "/home/namnd/personal-workspace/cassava-leaf-disease-classification"
    df = pd.read_csv(os.path.join(data_root, 'train.csv'))
    # train_df, val_df = ms.train_test_split(df, test_size=0.2, random_state=42, stratify=df.label.values)
    #
    # train_dataset = CassavaLeafDiseaseDataset(data_root, df=train_df, transform=transform)
    # val_dataset = CassavaLeafDiseaseDataset(data_root, df=val_df, transform=transform)
    #
    # train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=mp.cpu_count())
    # val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=mp.cpu_count())

    dataloader = CassavaLeafDiseaseDataset(data_root, df, transform=transform)
    learner = Learner(net, dataloader, device)
    lr_finder = LRFinder(learner)
    lr_finder.find()
    lr_finder.plot()
Example 18
def main(args):
    # load train data into ram
    # data_path = '/mntlong/lanl_comp/data/'
    file_dir = os.path.dirname(__file__)
    data_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'data'))
    train_info_path = os.path.join(data_path, 'train_info.csv')
    train_data_path = os.path.join(data_path, 'train_compressed.npz')

    train_info = pd.read_csv(train_info_path, index_col='Unnamed: 0')
    train_info['exp_len'] = train_info['indx_end'] - train_info['indx_start']

    train_signal = np.load(train_data_path)['signal']
    train_quaketime = np.load(train_data_path)['quake_time']

    # use the last two waves (experiment segments) for validation
    val_start_idx = train_info.iloc[-2, :]['indx_start']

    val_signal = train_signal[val_start_idx:]
    val_quaketime = train_quaketime[val_start_idx:]

    train_signal = train_signal[:val_start_idx]
    train_quaketime = train_quaketime[:val_start_idx]

    # training params
    window_size = 150000
    overlap_size = int(window_size * 0.5)
    num_bins = 17

    model = models.BaselineNetRawSignalCnnRnnV1(out_size=num_bins-1)
    loss_fn = nn.CrossEntropyLoss()  # L1Loss() SmoothL1Loss() MSELoss()

    # logs_path = '/mntlong/scripts/logs/'
    logs_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'logs'))
    current_datetime = datetime.today().strftime('%b-%d_%H-%M-%S')
    log_writer_path = os.path.join(logs_path, 'runs',
                                   current_datetime + '_' + args.model_name)

    train_dataset = data.SignalDataset(train_signal, train_quaketime,
                                       num_bins=num_bins,
                                       idxs_wave_end=train_info['indx_end'].values,
                                       window_size=window_size,
                                       overlap_size=overlap_size)
    val_dataset = data.SignalDataset(val_signal, val_quaketime,
                                     num_bins=num_bins,
                                     idxs_wave_end=train_info['indx_end'].values,
                                     window_size=window_size,
                                     overlap_size=overlap_size)

    print('wave size:', train_dataset[0][0].size())

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=5,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=5,
                            pin_memory=True)

    if args.find_lr:
        from lr_finder import LRFinder
        optimizer = optim.Adam(model.parameters(), lr=1e-6)
        lr_find = LRFinder(model, optimizer, loss_fn, device='cuda')
        lr_find.range_test(train_loader, end_lr=1, num_iter=50, step_mode='exp')
        best_lr = lr_find.get_best_lr()
        lr_find.plot()
        lr_find.reset()
        print('best lr found: {:.2e}'.format(best_lr))
    else:
        best_lr = 3e-4

    optimizer = optim.Adam(model.parameters(), lr=best_lr)  # weight_decay=0.1
    lr_sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                    factor=0.5,
                                                    patience=3,
                                                    threshold=0.005)
    log_writer = SummaryWriter(log_writer_path)

    utils.train_clf_model(model=model, optimizer=optimizer, lr_scheduler=lr_sched,
                          train_loader=train_loader, val_loader=val_loader,
                          num_epochs=args.num_epochs, model_name=args.model_name,
                          logs_path=logs_path, log_writer=log_writer,
                          loss_fn=loss_fn, num_bins=num_bins)
Example 19
def main():
    global global_token_count, event_writer, train_step, train_loss, last_log_step, \
        best_val_loss, epoch, model

    if args.local_rank > 0:
        pass  # skip shutdown when rank is explicitly set + not zero rank
    else:
        os.system('shutdown -c')

    if not args.local:
        logger.info(
            f'Distributed initializing process group with {args.dist_backend}, {args.dist_url}, {util.get_world_size()}'
        )
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=util.get_world_size())
        assert (util.get_world_size() == dist.get_world_size())
        logger.info(
            f"Distributed: success ({args.local_rank}/{dist.get_world_size()})"
        )

    model = MemTransformerLM(ntokens,
                             args.n_layer,
                             args.n_head,
                             args.d_model,
                             args.d_head,
                             args.d_inner,
                             args.dropout,
                             args.dropatt,
                             tie_weight=args.tied,
                             d_embed=args.d_embed,
                             div_val=args.div_val,
                             tie_projs=tie_projs,
                             pre_lnorm=args.pre_lnorm,
                             tgt_len=args.tgt_len,
                             ext_len=args.ext_len,
                             mem_len=args.mem_len,
                             cutoffs=cutoffs,
                             same_length=args.same_length,
                             attn_type=args.attn_type,
                             clamp_len=args.clamp_len,
                             sample_softmax=args.sample_softmax)

    # log model info
    n_all_param = sum([p.nelement() for p in model.parameters()])
    log_tb('sizes/params', n_all_param)
    n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()])
    log_tb('sizes/non_emb_params', n_nonemb_param)
    logger.info('params %s non_emb_params %s', n_all_param, n_nonemb_param)

    # optimizer
    if args.optim.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.mom)
    elif args.optim.lower() == 'lamb':
        optimizer = Lamb(model.parameters(), lr=args.lr, weight_decay=args.wd)
    else:
        assert args.optim.lower() == 'adam'
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)

    # scheduler
    if args.scheduler == 'cosine':
        # Divide by 1e6 for numerical stability.
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         args.max_tokens //
                                                         1e6,
                                                         eta_min=args.eta_min)
    elif args.scheduler == 'finder':
        scheduler = LRFinder(optimizer,
                             args.max_tokens,
                             init_value=args.lr / 1e3)
    elif args.scheduler == 'constant':
        pass

    model.apply(weights_init)
    # ensure embedding init is not overridden by out_layer in case of weight sharing
    model.word_emb.apply(weights_init)

    if args.checkpoint:
        if global_rank == 0:
            util.restore_from_checkpoint(model=model,
                                         checkpoint_fn=args.checkpoint)

    model = model.to(device)
    if args.fp16:
        model = FP16_Module(model)
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.static_loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   dynamic_loss_args={'init_scale': 2**16},
                                   verbose=False)

    if args.local:
        model = nn.DataParallel(model, dim=1)
    else:
        # Uncomment find_unused_parameters and upgrade to torch 1.1 for adaptive embedding.
        model = DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank)  #, find_unused_parameters=True)

    if global_rank == 0:
        event_writer = SummaryWriter(args.logdir)

    event_writer.add_text('args', str(args))

    # test checkpoint writing
    if args.checkpoint_each_epoch:
        logger.info(f'Saving checkpoint for epoch {epoch}')
        util.dist_save_checkpoint(model, optimizer, args.logdir, suffix=f'{0}')

    # Loop over epochs.
    train_step = 0
    train_loss = 0
    last_log_step = 0
    best_val_loss = None
    va_iter, te_iter = [
        corpus.get_dist_iterator(split,
                                 global_rank,
                                 max_rank,
                                 args.batch_size * 2,
                                 args.tgt_len,
                                 device=device,
                                 ext_len=args.ext_len)
        for split in ('valid', 'test')
    ]

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in itertools.count(start=1):
            train(va_iter, optimizer, scheduler)
    except KeyboardInterrupt:
        logger.info('-' * 100)
        logger.info('Exiting from training early')
    except StopIteration:
        pass

    # Eval one more time.
    evaluate_and_log(optimizer, va_iter, 'val', train_step=-1)

    # Load the best saved model.
    logger.info("Loading best checkpoint")
    model_file = os.path.join(args.logdir, 'model-best.pt')
    if os.path.exists(model_file):
        with open(model_file, 'rb') as model_f:
            with timeit('load'):
                if args.local:
                    model = torch.load(model_f)
                else:
                    model = torch.load(model_f,
                                       map_location=lambda storage, loc:
                                       storage.cuda(args.local_rank))
                    model = DistributedDataParallel(
                        model,
                        device_ids=[args.local_rank],
                        output_device=args.local_rank)
    else:
        logger.warning('no model file, using current model for loss')

    # Run on test data.
    evaluate_and_log(optimizer, te_iter, 'test', -1)
Example 20
    sum_kl_loss = keras.backend.sum(kl_loss, axis=0)
    sum_g_loss = keras.backend.sum(g_loss, axis=0)
    sum_g_loss = sum_g_loss * alpha  # this effectively acts as a loss penalty
    loss = sum_g_loss + sum_kl_loss
    return loss


# Model: define inputs and outputs
model = Model(inputs=[in_1, in_2, in_3], outputs=out_vals)
opt = optimizers.Adam(
    clipnorm=1.,
    lr=lrate)  # tried a loss penalty instead of clipnorm; clipnorm works better
model.compile(loss=bin_loss, optimizer=opt)

if find_lr:
    lr_finder = LRFinder(model)

    X_train = [X1_train, X2_train, X3_train]
    lr_finder.find(X_train,
                   y_train,
                   start_lr=0.00000001,
                   end_lr=1,
                   batch_size=batch_size,
                   epochs=1)
    losses = lr_finder.losses
    lrs = lr_finder.lrs
    l_l = np.asarray([lrs, losses])
    np.savetxt(out_dir + 'lrs_losses.txt', l_l)
    num_epochs = 0
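A short sketch of reading the saved curve back for inspection (matplotlib assumed; the file layout is the 2 x N array written by np.savetxt above):

import numpy as np
import matplotlib.pyplot as plt

lrs, losses = np.loadtxt(out_dir + 'lrs_losses.txt')  # two rows: lrs, losses
plt.semilogx(lrs, losses)
plt.xlabel('learning rate')
plt.ylabel('loss')
plt.show()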

Example 21
def train_loop(folds, fold):

    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              drop_last=False)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer,
                                          mode='min',
                                          factor=CFG.factor,
                                          patience=CFG.patience,
                                          verbose=True,
                                          eps=CFG.eps)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer,
                                          T_max=CFG.T_max,
                                          eta_min=CFG.min_lr,
                                          last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                    T_0=CFG.T_0,
                                                    T_mult=1,
                                                    eta_min=CFG.min_lr,
                                                    last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = CustomModel(CFG.model_name, pretrained=False)
    model = torch.nn.DataParallel(model)
    model.load_state_dict(
        torch.load(f'{CFG.model_name}_student_fold{fold}_best_score.pth',
                   map_location=torch.device('cpu'))['model'])
    # model.load_state_dict(torch.load(f'0.9647/{CFG.model_name}_no_hflip_fold{fold}_best_score.pth', map_location=torch.device('cpu'))['model'])
    model.to(device)

    # criterion = nn.BCEWithLogitsLoss()
    criterion = FocalLoss(alpha=1, gamma=6)

    # optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    optimizer = SGD(model.parameters(),
                    lr=1e-2,
                    weight_decay=CFG.weight_decay,
                    momentum=0.9)

    find_lr = False
    if find_lr:
        from lr_finder import LRFinder
        lr_finder = LRFinder(model, optimizer, criterion, device=device)
        lr_finder.range_test(train_loader,
                             start_lr=1e-2,
                             end_lr=1e0,
                             num_iter=100,
                             accumulation_steps=1)

        fig_name = f'{CFG.model_name}_lr_finder.png'
        lr_finder.plot(fig_name)
        lr_finder.reset()
        return
    scheduler = get_scheduler(optimizer)
    swa_model = torch.optim.swa_utils.AveragedModel(model)
    swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=1e-3)
    swa_start = 9

    # ====================================================
    # loop
    # ====================================================

    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch,
                            scheduler, device)

        # eval (must run before the scheduler step: ReduceLROnPlateau needs avg_val_loss)
        avg_val_loss, preds, _ = valid_fn(valid_loader, model, criterion,
                                          device)
        if epoch > swa_start:
            swa_model.update_parameters(model)
            swa_scheduler.step()
        else:
            if isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step(avg_val_loss)
            elif isinstance(scheduler, CosineAnnealingLR):
                scheduler.step()
            elif isinstance(scheduler, CosineAnnealingWarmRestarts):
                scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s'
        )
        LOGGER.info(
            f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}'
        )

        if score > best_score:
            best_score = score
            LOGGER.info(
                f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict()}, OUTPUT_DIR +
                       f'{CFG.model_name}_no_hflip_fold{fold}_best_score.pth')

        # if avg_val_loss < best_loss:
        #     best_loss = avg_val_loss
        #     LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
        #     torch.save({'model': model.state_dict(),
        #                 'preds': preds},
        #                 OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_loss.pth')

    torch.optim.swa_utils.update_bn(train_loader, swa_model)
    avg_val_loss, preds, _ = valid_fn(valid_loader, swa_model, criterion,
                                      device)
    score, scores = get_score(valid_labels, preds)
    LOGGER.info(f'Save swa Score: {score:.4f} Model')
    torch.save({'model': swa_model.state_dict()},
               OUTPUT_DIR + f'swa_{CFG.model_name}_fold{fold}_{score:.4f}.pth')
    # if CFG.nprocs != 8:
    #     check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_score.pth')
    #     for c in [f'pred_{c}' for c in CFG.target_cols]:
    #         valid_folds[c] = np.nan
    #     try:
    #         valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds']
    #     except:
    #         pass

    return
Example 22
class Trainer:
    def __init__(self,
                 model,
                 criterion,
                 optimizer,
                 train_loader,
                 val_loader=None,
                 name="experiment",
                 experiments_dir="runs",
                 save_dir=None,
                 div_lr=1):
        self.device = device()
        self.model = model.to(self.device)
        self.criterion = criterion
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.div_lr = div_lr
        self.update_lr(self.optimizer.defaults['lr'])
        self.onecycle = None  # set by train_one_cycle(); checked in _train_epoch()
        self._epoch_count = 0
        self._best_loss = None
        self._best_acc = None
        if save_dir is None:
            save_dir = f"{self.get_num_dir(experiments_dir):04d}-{get_git_hash()}-{name}"
        self._save_dir = os.path.join(experiments_dir, save_dir)
        self.writer = Logger(self._save_dir)
        atexit.register(self.cleanup)

    def train(self, epochs=1):
        for epoch in range(epochs):
            self._epoch_count += 1
            print("\n----- epoch ", self._epoch_count, " -----")
            train_loss, train_acc = self._train_epoch()
            if self.val_loader:
                val_loss, val_acc = self._validate_epoch()
                if self._best_loss is None or val_loss < self._best_loss:
                    self.save_checkpoint('best_model')
                    self._best_loss = val_loss
                    print("new best val loss!")
                if self._best_acc is None or val_acc > self._best_acc:
                    self.save_checkpoint('best_model_acc')
                    self._best_acc = val_acc
                    print("new best val acc!")

    def test(self, test_loader):
        self.model.eval()
        running_loss = 0
        running_acc = 0
        for iter, (inputs, targets) in enumerate(tqdm(test_loader)):
            inputs = inputs.to(device())
            targets = targets.to(device())
            with torch.set_grad_enabled(False):
                outputs = self.model(inputs)
                batch_loss = self.criterion(outputs, targets)
                batch_acc = accuracy(outputs, targets)
            running_loss += batch_loss.item()
            running_acc += batch_acc.item()
        epoch_loss = running_loss / len(test_loader)
        epoch_acc = running_acc / len(test_loader)
        print(f"test loss: {epoch_loss:.5f} test acc: {epoch_acc:.5f}")
        return epoch_loss, epoch_acc

    def train_one_cycle(self, epochs=1, lr=None):
        if lr is None:
            lr = self.optimizer.defaults['lr']
        self.onecycle = OneCycle(len(self.train_loader) * epochs, lr)
        self.train(epochs)
        self.onecycle = None

    def _train_epoch(self, save_histogram=False):
        self.model.train()
        running_loss = 0
        running_acc = 0
        for iter, (inputs, targets) in enumerate(tqdm(self.train_loader)):
            inputs = inputs.to(device())
            targets = targets.to(device())
            if self.onecycle is not None:
                lr, mom = next(self.onecycle)
                self.update_lr(lr)
                self.update_mom(mom)
            with torch.set_grad_enabled(True):
                outputs = self.model(inputs)
                batch_loss = self.criterion(outputs, targets)
                batch_acc = accuracy(outputs, targets)
                batch_loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
            running_loss += batch_loss.item()
            running_acc += batch_acc.item()
            if self.log_every(iter):
                self.writer.add_scalars(
                    "loss", {"train_loss": running_loss / float(iter + 1)},
                    (self._epoch_count - 1) * len(self.train_loader) + iter)
                self.writer.add_scalars(
                    "acc", {"train_acc": running_acc / float(iter + 1)},
                    (self._epoch_count - 1) * len(self.train_loader) + iter)
        epoch_loss = running_loss / len(self.train_loader)
        epoch_acc = running_acc / len(self.train_loader)
        print(f"train loss: {epoch_loss:.5f} train acc: {epoch_acc:.5f}")
        return epoch_loss, epoch_acc

    def _validate_epoch(self):
        self.model.eval()
        running_loss = 0
        running_acc = 0
        for iter, (inputs, targets) in enumerate(tqdm(self.val_loader)):
            inputs = inputs.to(device())
            targets = targets.to(device())
            with torch.set_grad_enabled(False):
                outputs = self.model(inputs)
                batch_loss = self.criterion(outputs, targets)
                batch_acc = accuracy(outputs, targets)
            running_loss += batch_loss.item()
            running_acc += batch_acc.item()
            if self.log_every(iter):
                self.writer.add_scalars(
                    "loss", {"val_loss": running_loss / float(iter + 1)},
                    (self._epoch_count - 1) * len(self.val_loader) + iter)
                self.writer.add_scalars(
                    "acc", {"val_acc": running_acc / float(iter + 1)},
                    (self._epoch_count - 1) * len(self.val_loader) + iter)
        epoch_loss = running_loss / len(self.val_loader)
        epoch_acc = running_acc / len(self.val_loader)
        print(f"val loss: {epoch_loss:.5f} val acc: {epoch_acc:.5f}")
        return epoch_loss, epoch_acc

    def get_num_dir(self, path):
        num_dir = len(os.listdir(path))
        return num_dir

    def save_checkpoint(self, fname):
        path = os.path.join(self._save_dir, fname)
        torch.save(
            dict(
                epoch=self._epoch_count,
                best_loss=self._best_loss,
                best_acc=self._best_acc,
                model=self.model.state_dict(),
                optimizer=self.optimizer.state_dict(),
            ), path)

    def load_checkpoint(self, fname):
        path = os.path.join(self._save_dir, fname)
        checkpoint = torch.load(path,
                                map_location=lambda storage, loc: storage)
        self._epoch_count = checkpoint['epoch']
        self.model.load_state_dict(checkpoint['model'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])

    def log_every(self, i):
        return (i % 100) == 0

    def update_lr(self, lr):
        n = len(self.optimizer.param_groups) - 1
        for i, g in enumerate(self.optimizer.param_groups):
            g['lr'] = lr / (self.div_lr**(n - i))

    def update_mom(self, mom):
        for g in self.optimizer.param_groups:
            if 'momentum' in g.keys():
                g['momentum'] = mom
            elif 'betas' in g.keys():
                g['betas'] = mom if isinstance(mom, tuple) else (mom,
                                                                 g['betas'][1])
            else:
                raise ValueError

    def find_lr(self, start_lr=1e-7, end_lr=100, num_iter=100):
        optimizer_state = self.optimizer.state_dict()
        self.update_lr(start_lr)
        self.lr_finder = LRFinder(self.model, self.optimizer, self.criterion,
                                  self.device)
        self.lr_finder.range_test(self.train_loader,
                                  end_lr=end_lr,
                                  num_iter=num_iter)
        self.optimizer.load_state_dict(optimizer_state)
        self.lr_finder.plot()

    def cleanup(self):
        copy_runpy(self._save_dir)
        path = os.path.join(self._save_dir, "all_scalars.json")
        self.writer.export_scalars_to_json(path)
        self.writer.close()
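A hypothetical usage sketch of this Trainer (the model, criterion, optimizer, and data loaders are assumed to exist):

trainer = Trainer(model, criterion, optimizer, train_loader, val_loader=val_loader)
trainer.find_lr()                            # inspect the plot, pick a peak LR
trainer.train_one_cycle(epochs=10, lr=3e-2)  # 1cycle schedule via OneCycle
trainer.test(test_loader)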
Example 23
def main_loop():
    util.cancel_shutdown()
    losses = []

    args = g.args

    if not args.local:
        g.logger.info(
            f'Distributed initializing process group with '
            f'{args.dist_backend}, {args.dist_url}, {util.get_world_size()}')
        dist.init_process_group(
            backend=args.dist_backend,
            #init_method=args.dist_url,
            #world_size=util.get_world_size()
        )
        assert (util.get_world_size() == dist.get_world_size())
        g.logger.info(
            f"Distributed: success ({args.local_rank}/{dist.get_world_size()})"
        )

    g.logger.info("creating new model")
    g.state = TrainState(args)
    g.state.model = MemTransformerLM(g.ntokens,
                                     args.n_layer,
                                     args.n_head,
                                     args.d_model,
                                     args.d_head,
                                     args.d_inner,
                                     args.dropout,
                                     args.dropatt,
                                     tie_weight=args.tied,
                                     d_embed=args.d_embed,
                                     div_val=args.div_val,
                                     tie_projs=g.tie_projs,
                                     pre_lnorm=args.pre_lnorm,
                                     tgt_len=args.tgt_len,
                                     ext_len=args.ext_len,
                                     mem_len=args.mem_len,
                                     cutoffs=g.cutoffs,
                                     same_length=args.same_length,
                                     attn_type=args.attn_type,
                                     clamp_len=args.clamp_len,
                                     sample_softmax=args.sample_softmax,
                                     freeze_below=args.freeze_below)
    g.state.model.to(g.device)
    optimizer_setup(g.state)
    if args.checkpoint:
        if args.checkpoint_secondary:
            g.logger.info(f"restoring extra checkpoint")
            util.restore_from_checkpoint(g.state.model, g.state.optimizer,
                                         args.checkpoint_secondary,
                                         args.optim_state_dict)
        g.logger.info(f"Restoring model from {args.checkpoint}" +
                      f" and optimizer from {args.optim_state_dict}" if args.
                      optim_state_dict else "")
        util.restore_from_checkpoint(g.state.model, g.state.optimizer,
                                     args.checkpoint, args.optim_state_dict)

    else:
        g.state.model.apply(weights_init)
        # ensure embedding init is not overridden by out_layer in case of weight sharing
        g.state.model.word_emb.apply(weights_init)

    model: MemTransformerLM = g.state.model
    optimizer = g.state.optimizer

    if g.state.args.fp16:
        model = FP16_Module(model)
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=g.state.args.static_loss_scale,
            dynamic_loss_scale=g.state.args.dynamic_loss_scale,
            dynamic_loss_args={'init_scale': 2**16},
            verbose=False)

    # log model info
    # n_all_param = sum([p.nelement() for p in model.parameters()])
    # log_tb('sizes/params', n_all_param)
    # n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()])
    # log_tb('sizes/non_emb_params', n_nonemb_param)
    # g.logger.info('params %s non_emb_params %s', n_all_param, n_nonemb_param)

    # scheduler
    if args.scheduler == 'cosine':
        # Divide by 1e6 for numerical stability.
        g.state.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.max_tokens // 1e6, eta_min=args.eta_min)
    elif args.scheduler == 'finder':
        g.state.scheduler: LRFinder = LRFinder(optimizer,
                                               args.max_tokens,
                                               init_value=args.lr / 1e3)
    else:
        assert args.scheduler == 'constant'
        g.state.scheduler = util.NoOp()

    # Setup distributed model
    if args.local:
        model = nn.DataParallel(model, dim=1)
    else:
        # Uncomment find_unused_parameters and upgrade to torch 1.1 for adaptive embedding.
        model = DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank)  # , find_unused_parameters=True)

    if util.get_global_rank() == 0:
        if not args.test:
            wandb.config.update(vars(args))
            # wandb.watch(model)

    g.event_writer.add_text('args', str(args))  # TODO: replace with log_tb

    accumulated_loss = 0
    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in itertools.count(start=g.state.last_epoch):
            print(f"epoch -- {epoch}, token_count -- {g.state.token_count}")
            model.train()

            log_tb('sizes/batch_size', args.batch_size)
            log_tb('sizes/seq_size', args.tgt_len)

            if g.state.partial_epoch:
                # reuse previously loaded tr_iter and states
                assert g.state.tr_iter is not None
                assert g.state.mems is not None
            else:
                g.state.tr_iter = g.corpus.get_dist_iterator(
                    'train',
                    rank=util.get_global_rank(),
                    max_rank=util.get_world_size(),
                    bsz=args.batch_size,
                    bptt=args.tgt_len,
                    device=g.device,
                    ext_len=args.ext_len,
                    skip_files=g.args.skip_files)
                g.state.mems = tuple()
            g.state.last_epoch = epoch

            log_start_time = time.time()
            tokens_per_epoch = 0
            for batch, (data, target, seq_len) in enumerate(g.state.tr_iter):
                # assert seq_len == data.shape[0]
                # for i in range(1, data.shape[0]):
                #     assert torch.all(torch.eq(data[i], target[i - 1]))
                #     break

                # print(g.state.token_count, data)

                if g.state.train_step % args.eval_interval == 0:
                    evaluate_and_log(model,
                                     g.va_iter,
                                     'val_short-mem-1',
                                     generate_text=False,
                                     reset_mems_interval=1)
                    evaluate_and_log(model,
                                     g.va_iter,
                                     'val_short-mem-2',
                                     generate_text=False,
                                     reset_mems_interval=2)
                    evaluate_and_log(model,
                                     g.va_iter,
                                     'val_short-mem-3',
                                     generate_text=False,
                                     reset_mems_interval=3)
                    evaluate_and_log(model, g.va_iter, 'val')
                    if g.va_custom_iter:
                        evaluate_and_log(g.state.model,
                                         g.va_custom_iter,
                                         g.args.valid_custom,
                                         generate_text=False)

                batch_total = torch.tensor(data.shape[1]).to(g.device)
                if args.local:  # TODO(y): factor out (need way to see if dist was inited)
                    batch_total = batch_total.sum()
                else:
                    batch_total = util.dist_sum_tensor(
                        batch_total)  # global batch size
                batch_total = util.toscalar(batch_total)

                should_log = (g.state.train_step < args.verbose_log_steps) or \
                             (g.state.train_step + 1) % args.log_interval == 0

                model.zero_grad()

                ret = model(data, target, *g.state.mems)
                loss, g.state.mems = ret[0], ret[1:]

                loss: torch.Tensor = loss.float().mean().type_as(loss)
                with timeit('backwards', noop=not should_log):
                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()
                loss0 = util.toscalar(loss)
                util.record('loss', loss0)

                util.record('params', torch.sum(util.flat_param(model)).item())
                losses.append(loss0)
                accumulated_loss += loss0

                if args.fp16:
                    optimizer.clip_master_grads(args.clip)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.clip)

                # step-wise learning rate annealing
                if hasattr(optimizer, 'overflow') and optimizer.overflow:
                    g.logger.info("skipped iteration")
                else:
                    if args.scheduler in ['cosine', 'constant', 'dev_perf']:
                        # linear warmup stage
                        if g.state.token_count < args.warmup_tokens:
                            curr_lr = args.lr * float(
                                g.state.token_count) / args.warmup_tokens
                            optimizer.param_groups[0]['lr'] = curr_lr
                        elif args.scheduler == 'cosine':
                            # Divide by 1e6 for numerical stability.
                            g.state.scheduler.step(g.state.token_count //
                                                   1000 // 1000)
                    else:
                        g.state.scheduler.step(g.state.token_count)

                optimizer.step()
                g.state.train_step += 1

                consumed_tokens = data.shape[0] * data.shape[1]
                world_size = int(os.environ.get("WORLD_SIZE", "8"))
                if world_size > 8:  # correction factor for multiple machines
                    consumed_tokens = consumed_tokens * (world_size // 8)
                tokens_per_epoch += consumed_tokens
                g.state.token_count += consumed_tokens
                g.token_count = g.state.token_count
                if g.state.token_count >= args.max_tokens:
                    g.state.partial_epoch = True
                    raise StopIteration  # break out of parent train loop

                if should_log:
                    elapsed_time = time.time() - log_start_time
                    elapsed_steps = g.state.train_step - g.state.last_log_step

                    # compute average loss over last logging interval
                    cur_loss = accumulated_loss / elapsed_steps
                    cur_loss_mean = util.dist_mean(cur_loss)
                    log_str = f'| epoch {epoch:3d} step {g.state.train_step:>8d} ' \
                              f'| {batch:>6d} batches ' \
                              f'| lr {optimizer.param_groups[0]["lr"]:.3g} ' \
                              f'| ms/batch {elapsed_time * 1000 / elapsed_steps:5.2f} ' \
                              f'| loss {cur_loss:5.2f}'
                    if args.dataset in ['enwik8', 'text8']:
                        log_str += f' | bpc {cur_loss / math.log(2):9.5f}'
                    else:
                        log_str += f' | ppl {math.exp(cur_loss):9.3f}'
                    g.logger.info(log_str)
                    log_tb('learning/epoch', epoch)
                    log_tb('_loss', cur_loss_mean)  # the most important thing
                    log_tb('learning/loss', cur_loss_mean)
                    log_tb('learning/ppl', math.exp(cur_loss_mean))

                    # currently step timings are not synchronized in multi-machine
                    # case (see #4). Can add torch.distributed.barrier() to get
                    # more accurate timings, but this may add slowness.
                    log_tb('times/step', 1000 * elapsed_time / elapsed_steps)
                    current_lr = optimizer.param_groups[0]['lr']

                    log_tb('learning/lr', current_lr)

                    # 32 is the "canonical" batch size
                    linear_scaling_factor = batch_total / 32  # TODO(y): merge logic from master
                    log_tb('learning/base_lr',
                           current_lr / linear_scaling_factor)
                    if args.optim == 'lamb':
                        log_lamb_rs(optimizer, g.event_writer,
                                    g.state.token_count)

                    time_per_batch = elapsed_time / elapsed_steps
                    time_per_sample = time_per_batch / args.batch_size
                    time_per_token = time_per_sample / args.tgt_len

                    log_tb('times/batches_per_sec', 1 / time_per_batch)
                    log_tb('times/samples_per_sec', 1 / time_per_sample)
                    log_tb('times/tokens_per_sec', 1 / time_per_token)

                    if str(g.device) == 'cuda':
                        log_tb("memory/allocated_gb",
                               torch.cuda.memory_allocated() / 1e9)
                        log_tb("memory/max_allocated_gb",
                               torch.cuda.max_memory_allocated() / 1e9)
                        log_tb("memory/cached_gb",
                               torch.cuda.memory_cached() / 1e9)
                        log_tb("memory/max_cached_gb",
                               torch.cuda.max_memory_cached() / 1e9)

                    accumulated_loss = 0
                    log_start_time = time.time()
                    g.state.last_log_step = g.state.train_step

            if args.checkpoint_each_epoch:
                g.logger.info(f'Saving checkpoint for epoch {epoch}')
                util.dist_save_checkpoint(model,
                                          optimizer,
                                          args.logdir,
                                          suffix=f'{epoch}')
            if tokens_per_epoch == 0:
                g.logger.info("Zero tokens in last epoch, breaking")
                break

            g.state.partial_epoch = False

    except KeyboardInterrupt:
        g.logger.info('-' * 100)
        g.logger.info('Exiting from training early')
    except StopIteration:
        pass

    return losses
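The warmup branch inside the training loop above is easy to miss; distilled into a standalone helper it is just a linear ramp up to the base rate, after which the cosine scheduler takes over. A minimal sketch (hypothetical names; base_lr, token_count, and warmup_tokens stand in for args.lr, g.state.token_count, and args.warmup_tokens):

def warmup_lr(base_lr, token_count, warmup_tokens):
    # Linear warmup: scale lr proportionally to tokens seen, capped at base_lr.
    # After warmup, the cosine schedule in main_loop takes over.
    return base_lr * min(1.0, token_count / warmup_tokens)

# e.g. halfway through warmup the lr is half of args.lr:
# warmup_lr(2.5e-4, 5_000_000, 10_000_000) -> 1.25e-4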
Example 24
# snippet starts mid-call; head of the ReduceLROnPlateau callback reconstructed
reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.5,
                                              patience=4,
                                              min_lr=0.000001,
                                              cooldown=3,
                                              verbose=1)

stop_on_nan = keras.callbacks.TerminateOnNaN()

# LR finder
if opt.find_lr:
    # pre-train to avoid model being too far away from interesting range
    history = model.fit_generator(gen_x_train,
                                  epochs=2,
                                  verbose=1,
                                  callbacks=[clr])
    lr_finder = LRFinder(model)
    lr_finder.find_generator(gen_x_train, 0.00001, 1.0, 5)
    lr_finder.plot_loss()
    import pdb
    pdb.set_trace()

# Run training
if not opt.notrain:
    # Train classifier
    history = model.fit_generator(
        gen_x_train,
        epochs=epochs,
        verbose=1,
        callbacks=[early_stopping, clr, stop_on_nan],
        # alternative: callbacks=[early_stopping, reduce_lr],
Example 25
def main_loop():
    util.cancel_shutdown()
    losses = []

    args = g.args

    if not args.local:
        g.logger.info(
            f'Distributed initializing process group with {args.dist_backend}, {args.dist_url}, {util.get_world_size()}')
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=util.get_world_size())
        assert (util.get_world_size() == dist.get_world_size())
        g.logger.info(f"Distributed: success ({args.local_rank}/{dist.get_world_size()})")

    if args.load_state_fn:
        g.state = load_state(args.load_state_fn)
        g.logger.info(f"Restoring training from {args.load_state_fn}")
    else:
        g.logger.info("creating new model")
        g.state = TrainState(args)

        g.state.model = MemTransformerLM(g.ntokens, args.n_layer, args.n_head, args.d_model,
                                         args.d_head, args.d_inner, args.dropout, args.dropatt,
                                         tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val,
                                         tie_projs=g.tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len,
                                         ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=g.cutoffs,
                                         same_length=args.same_length, attn_type=args.attn_type,
                                         clamp_len=args.clamp_len, sample_softmax=args.sample_softmax)
        if args.checkpoint:
            util.restore_from_checkpoint(g.state.model, checkpoint_fn=args.checkpoint)
        else:
            g.state.model.apply(weights_init)
            g.state.model.word_emb.apply(
                weights_init)  # ensure embedding init is not overridden by out_layer in case of weight sharing
        g.state.model.to(g.device)
        optimizer_setup(g.state)

    model: MemTransformerLM = g.state.model
    optimizer = g.state.optimizer

    # log model info
    # n_all_param = sum([p.nelement() for p in model.parameters()])
    # log_tb('sizes/params', n_all_param)
    # n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()])
    # log_tb('sizes/non_emb_params', n_nonemb_param)
    # g.logger.info('params %s non_emb_params %s', n_all_param, n_nonemb_param)

    # scheduler
    if not g.args.load_state_fn:
        if args.scheduler == 'cosine':
            # Divide by 1e6 for numerical stability.
            g.state.scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_tokens // 1e6,
                                                                     eta_min=args.eta_min)
        elif args.scheduler == 'finder':
            g.state.scheduler: LRFinder = LRFinder(optimizer, args.max_tokens, init_value=args.lr / 1e3)
        else:
            assert args.scheduler == 'constant'
            g.state.scheduler = util.NoOp()

    # Setup distributed model
    if args.local:
        model = nn.DataParallel(model, dim=1)
    else:
        # Uncomment find_unused_parameters and upgrade to torch 1.1 for adaptive embedding.
        model = DistributedDataParallel(model, device_ids=[args.local_rank],
                                        output_device=args.local_rank)  # , find_unused_parameters=True)

    if util.get_global_rank() == 0:
        if not args.test:
            wandb.config.update(vars(args))
            # wandb.watch(model)

    g.event_writer.add_text('args', str(args))  # TODO: replace with log_tb

    accumulated_loss = 0
    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in itertools.count(start=g.state.last_epoch):
            print(f"epoch -- {epoch}, token_count -- {g.state.
Example 26
def main(args):
    # load train data into ram
    # data_path = '/mntlong/lanl_comp/data/'
    file_dir = os.path.dirname(__file__)
    data_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'data'))
    train_info_path = os.path.join(data_path, 'train_info.csv')
    train_data_path = os.path.join(data_path, 'train_compressed.npz')

    train_info = pd.read_csv(train_info_path, index_col='Unnamed: 0')
    train_info['exp_len'] = train_info['indx_end'] - train_info['indx_start']

    train_signal = np.load(train_data_path)['signal']
    train_quaketime = np.load(train_data_path)['quake_time']

    # Use the last 2 waves (experiment segments) for validation
    val_start_idx = train_info.iloc[-2, :]['indx_start']

    val_signal = train_signal[val_start_idx:]
    val_quaketime = train_quaketime[val_start_idx:]

    train_signal = train_signal[:val_start_idx]
    train_quaketime = train_quaketime[:val_start_idx]

    # training params
    large_ws = 1500000
    overlap_size = int(large_ws * 0.5)
    small_ws = 150000
    num_bins = 17

    cpc_meta_model = models.CPCv1(out_size=num_bins - 1)

    # logs_path = '/mntlong/scripts/logs/'
    logs_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'logs'))
    current_datetime = datetime.today().strftime('%b-%d_%H-%M-%S')
    log_writer_path = os.path.join(logs_path, 'runs',
                                   current_datetime + '_' + args.model_name)

    train_dataset = data.SignalCPCDataset(
        train_signal,
        train_quaketime,
        num_bins=num_bins,
        idxs_wave_end=train_info['indx_end'].values,
        large_ws=large_ws,
        overlap_size=overlap_size,
        small_ws=small_ws)
    val_dataset = data.SignalCPCDataset(
        val_signal,
        val_quaketime,
        num_bins=num_bins,
        idxs_wave_end=train_info['indx_end'].values,
        large_ws=large_ws,
        overlap_size=overlap_size,
        small_ws=small_ws)

    print('x_t size:', train_dataset[0][0].size())

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=5,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=5,
                            pin_memory=True)

    if args.find_lr:
        from lr_finder import LRFinder
        optimizer = optim.Adam(cpc_meta_model.parameters(), lr=1e-6)
        lr_find = LRFinder(cpc_meta_model,
                           optimizer,
                           criterion=None,
                           is_cpc=True,
                           device='cuda')
        lr_find.range_test(train_loader,
                           end_lr=2,
                           num_iter=75,
                           step_mode='exp')
        best_lr = lr_find.get_best_lr()
        lr_find.plot()
        lr_find.reset()
        print('best lr found: {:.2e}'.format(best_lr))
    else:
        best_lr = 3e-4
    # sys.exit()

    # model_path = os.path.join(logs_path, 'cpc_no_target_head_cont_last_state.pth')
    # cpc_meta_model.load_state_dict(torch.load(model_path)['model_state_dict'])
    # cpc_meta_model.to(torch.device('cuda'))

    optimizer = optim.Adam(cpc_meta_model.parameters(), lr=best_lr)
    # optimizer.load_state_dict(torch.load(model_path)['optimizer_state_dict'])
    lr_sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                    factor=0.5,
                                                    patience=3,
                                                    threshold=0.005)

    log_writer = SummaryWriter(log_writer_path)

    utils.train_cpc_model(cpc_meta_model=cpc_meta_model,
                          optimizer=optimizer,
                          num_bins=num_bins,
                          lr_scheduler=lr_sched,
                          train_loader=train_loader,
                          val_loader=val_loader,
                          num_epochs=args.num_epochs,
                          model_name=args.model_name,
                          logs_path=logs_path,
                          log_writer=log_writer)
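A side note on get_best_lr(): it comes from the project-local lr_finder module imported above, not from the upstream torch-lr-finder package. A minimal stand-in under that assumption, using the finder's history dict of "lr" and "loss" lists (the same dict the next example reads), picks the lr at the steepest loss descent:

import numpy as np

def get_best_lr(finder):
    # Steepest-descent heuristic: the lr where the loss is falling fastest.
    lrs = np.array(finder.history["lr"])
    losses = np.array(finder.history["loss"])
    return float(lrs[np.gradient(losses).argmin()])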
Example 27
def main(args):
    np.random.seed(432)
    torch.random.manual_seed(432)
    try:
        os.makedirs(args.outpath)
    except OSError:
        pass
    experiment_path = utils.get_new_model_path(args.outpath)
    print(experiment_path)

    train_writer = SummaryWriter(os.path.join(experiment_path, 'train_logs'))
    val_writer = SummaryWriter(os.path.join(experiment_path, 'val_logs'))
    trainer = train.Trainer(train_writer, val_writer)

    # todo: add config
    train_transform = data.build_preprocessing()
    eval_transform = data.build_preprocessing()

    trainds, evalds = data.build_dataset(args.datadir, None)
    trainds.transform = train_transform
    evalds.transform = eval_transform

    model = models.resnet34()
    opt = torch.optim.Adam(model.parameters(), lr=1e-8)

    trainloader = DataLoader(trainds,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=8,
                             pin_memory=True)
    evalloader = DataLoader(evalds,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=16,
                            pin_memory=True)

    #find lr fast ai
    criterion = torch.nn.BCEWithLogitsLoss()
    lr_finder = LRFinder(model, opt, criterion, device="cuda")
    #     lr_finder.range_test(trainloader, val_loader=evalloader, end_lr=1, num_iter=10, step_mode="exp")
    lr_finder.range_test(trainloader,
                         end_lr=100,
                         num_iter=100,
                         step_mode="exp")

    #plot graph fast ai
    skip_start = 6
    skip_end = 3
    lrs = lr_finder.history["lr"]
    losses = lr_finder.history["loss"]
    grad_norm = lr_finder.history["grad_norm"]

    #     ind = grad_norm.index(min(grad_norm))
    #     opt_lr = lrs[ind]
    #     print('LR with min grad_norm =', opt_lr)

    lrs = lrs[skip_start:-skip_end]
    losses = losses[skip_start:-skip_end]

    fig = plt.figure(figsize=(12, 9))
    plt.plot(lrs, losses)
    plt.xscale("log")
    plt.xlabel("Learning rate")
    plt.ylabel("Loss")
    train_writer.add_figure('loss_vs_lr', fig)

    lr_finder.reset()

    #     fixed_lr = 1e-3
    fixed_lr = 3e-4
    opt = torch.optim.Adam(model.parameters(), lr=fixed_lr)

    #     #new
    #     lr = 1e-3
    #     eta_min = 1e-5
    #     t_max = 10
    #     opt = torch.optim.Adam(model.parameters(), lr=lr)
    #     scheduler = CosineAnnealingLR(opt, T_max=t_max, eta_min=eta_min)
    #     #new

    #     one cycle for 5 ehoches
    #     scheduler = CosineAnnealingLR(opt, 519*4, eta_min=1e-4)
    scheduler = CosineAnnealingLR(opt, args.epochs)

    #     scheduler = CosineAnnealingLR(opt, 519, eta_min=1e-5)
    #     scheduler = StepLR(opt, step_size=3, gamma=0.1)

    state_list = []
    for epoch in range(args.epochs):
        #         t = epoch / args.epochs
        #         lr = np.exp((1 - t) * np.log(lr_begin) + t * np.log(lr_end))
        # set the lr for all parameter groups
        trainer.train_epoch(model, opt, trainloader, fixed_lr, scheduler)
        #         trainer.train_epoch(model, opt, trainloader, 3e-4, scheduler)
        #         trainer.train_epoch(model, opt, trainloader, 9.0451e-4, scheduler)
        metrics = trainer.eval_epoch(model, evalloader)

        state = dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=opt.state_dict(),
            loss=metrics['loss'],
            lwlrap=metrics['lwlrap'],
            global_step=trainer.global_step,
        )
        state_copy = copy.deepcopy(state)
        state_list.append(state_copy)
        export_path = os.path.join(experiment_path, 'last.pth')
        torch.save(state, export_path)

    # save the best path
    best_export_path = os.path.join(experiment_path, 'best.pth')

    max_lwlrap = 0
    max_lwlrap_ind = 0
    for i in range(args.epochs):
        if state_list[i]['lwlrap'] > max_lwlrap:
            max_lwlrap = state_list[i]['lwlrap']
            max_lwlrap_ind = i

    best_state = state_list[max_lwlrap_ind]
    torch.save(best_state, best_export_path)
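The checkpoint-selection loop above is an argmax over lwlrap; an equivalent one-liner over the same state_list:

best_state = max(state_list, key=lambda s: s['lwlrap'])
torch.save(best_state, best_export_path)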
Example 28

  train_loader = DataLoader(train_ds, batch_size=batch_size, sampler=BalanceClassSampler(labels=train_ds.get_labels(), mode="downsampling"), shuffle=False, num_workers=4)
else:
  train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)

plist = [
        {'params': model.backbone.parameters(),  'lr': learning_rate/50},
        {'params': model.meta_fc.parameters(),  'lr': learning_rate},
        # {'params': model.metric_classify.parameters(),  'lr': learning_rate},
    ]

optimizer = optim.Adam(plist, lr=learning_rate)
# lr_reduce_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=patience, verbose=True, threshold=1e-4, threshold_mode='rel', cooldown=0, min_lr=1e-7, eps=1e-08)
# cyclic_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=learning_rate, max_lr=10*learning_rate, step_size_up=2000, step_size_down=2000, mode='triangular', gamma=1.0, scale_fn=None, scale_mode='cycle', cycle_momentum=False, base_momentum=0.8, max_momentum=0.9, last_epoch=-1)

criterion = criterion_margin_focal_binary_cross_entropy
if load_model:
  tmp = torch.load(os.path.join(model_dir, model_name+'_loss.pth'))
  model.load_state_dict(tmp['model'])
  # optimizer.load_state_dict(tmp['optim'])
  # lr_reduce_scheduler.load_state_dict(tmp['scheduler'])
  # cyclic_scheduler.load_state_dict(tmp['cyclic_scheduler'])
  # amp.load_state_dict(tmp['amp'])
  prev_epoch_num = tmp['epoch']
  best_valid_loss = tmp['best_loss']
  del tmp
  print('Model Loaded!')
# model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
lr_finder.range_test(train_loader, end_lr=100, num_iter=500,  accumulation_steps=accum_step)
lr_finder.plot() # to inspect the loss-learning rate graph
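Worth noting: accumulation_steps makes the range test accumulate gradients over accum_step mini-batches before each optimizer step, so the sweep sees the same effective batch size as the gradient-accumulated training loop it is tuning for.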
Example 29
xval[cols_to_scale] = ss.transform(xval[cols_to_scale])
yval = yval.values

xtrain, xval = xtrain.values, xval.values

# %%
nn = keras.Sequential([
    keras.layers.Dense(units=15, activation='relu'),
    keras.layers.Dense(units=15, activation='relu'),
    keras.layers.Dense(units=1, activation='linear'),
])

nn.compile(keras.optimizers.SGD(lr=0.001), 'MAE', metrics=['MAE'])
nn.build((None, xtrain.shape[1]))
# Find optimal learning rate. Use the one with the steepest descent of loss (not minimum)
lrf = LRFinder(0.01, 1)
nn.fit(xtrain,
       ytrain,
       validation_data=(xval, yval),
       epochs=5,
       batch_size=32,
       callbacks=[lrf])

# %%
from CLRCallback import CyclicLR
clr = CyclicLR((10**-1) / 3, 10**-1)
h = nn.fit(xtrain,
           ytrain,
           validation_data=(xval, yval),
           epochs=25,
           batch_size=32,
Example 30
                                             shuffle=True, num_workers=4)
              for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Finetuning the convnet
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)

model = model.to(device)

criterion = nn.CrossEntropyLoss()

# Select a small learning rate for the start
optimizer_ft = optim.SGD(model.parameters(), lr=1e-5, momentum=0.9)
lr_finder = LRFinder(model, optimizer_ft, criterion, device="cuda")
# Using the train loss
lr_finder.range_test(dataloaders['train'], end_lr=100, num_iter=1000, step_mode='exp')
lr_finder.plot()

# Using the validation loss
lr_finder.reset()
lr_finder.range_test(dataloaders['train'], val_loader=dataloaders['val'],
                     end_lr=100, num_iter=200, step_mode='exp')
lr_finder.plot(skip_end=0)
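Passing val_loader makes the finder evaluate the loss on the validation set at each step, which gives a smoother, more trustworthy curve at the cost of a much slower sweep (hence the drop from 1000 to 200 iterations), and plot(skip_end=0) keeps the trailing points that the plot otherwise trims by default.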