def find_lr(self, start_lr=1e-7, end_lr=100, num_iter=100):
    optimizer_state = self.optimizer.state_dict()
    self.update_lr(start_lr)
    self.lr_finder = LRFinder(self.model, self.optimizer, self.criterion, self.device)
    self.lr_finder.range_test(self.train_loader, end_lr=end_lr, num_iter=num_iter)
    self.optimizer.load_state_dict(optimizer_state)
    self.lr_finder.plot()
def lr_finder(model, optimizer, criterion, trainloader):
    lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
    lr_finder.range_test(trainloader, end_lr=100, num_iter=100, step_mode="exp")
    lr_finder.plot()   # to plot the loss vs. learning rate curve
    lr_finder.reset()  # to reset the lr_finder
def lr_finder(model, train_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    optimizer_ft = optim.Adam(model.parameters(), lr=0.0000001)
    lr_finder = LRFinder(model, optimizer_ft, criterion, device=device)
    lr_finder.range_test(train_loader, end_lr=1, num_iter=1000)
    lr_finder.reset()
    lr_finder.plot()
def executeLr_finder(model, optimizer, device, trainloader, criterion):
    # finding and plotting the best LR
    lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
    lr_finder.range_test(trainloader, end_lr=100, num_iter=100, step_mode="exp")
    lr_finder.plot()   # to inspect the loss-learning rate graph
    lr_finder.reset()  # to reset the model and optimizer to their initial state
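# A minimal sketch (an assumption, not part of the snippets above) of how a
# suggested learning rate could be picked programmatically from the finder's
# history dict, which torch-lr-finder exposes as lists under the "lr" and
# "loss" keys (several snippets further down read lr_finder.history directly).
# The usual heuristic is the point of steepest loss descent, not the minimum.
import numpy as np

def suggest_lr_from_history(history, skip_start=10, skip_end=5):
    lrs = np.array(history["lr"][skip_start:-skip_end])
    losses = np.array(history["loss"][skip_start:-skip_end])
    # steepest negative slope of the loss with respect to log10(lr)
    grads = np.gradient(losses, np.log10(lrs))
    return lrs[np.argmin(grads)]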
def get_model(batchsize=8, dropout=0.3):
    # Inputs
    inp_normal = keras.layers.Input(shape=(xtrain.shape[1] - len(embedding_features), ), name='inp_normal')
    inp_dow_embedding = keras.layers.Input(shape=(1, ), name='inp_dow_embedding')
    inp_hod_embedding = keras.layers.Input(shape=(1, ), name='inp_hod_embedding')

    # Embeddings
    dow_embedding = keras.layers.Embedding(input_dim=7, output_dim=3, input_length=1)(inp_dow_embedding)
    dow_embedding = keras.layers.Flatten()(dow_embedding)
    hod_embedding = keras.layers.Embedding(input_dim=24, output_dim=10, input_length=1)(inp_hod_embedding)
    hod_embedding = keras.layers.Flatten()(hod_embedding)

    # Hidden layers
    concat = keras.layers.Concatenate()([inp_normal, dow_embedding, hod_embedding])
    x = keras.layers.Dense(units=100, activation='relu')(concat)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(dropout)(x)
    x = keras.layers.Dense(units=40, activation='relu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(dropout)(x)
    x = keras.layers.Dense(units=10, activation='relu')(x)
    x = keras.layers.BatchNormalization()(x)
    out = keras.layers.Dense(units=1, activation='sigmoid')(x)

    nn = keras.Model(inputs=[inp_normal, inp_dow_embedding, inp_hod_embedding], outputs=out)
    nn.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.SGD(lr=0.0001),
        metrics=['accuracy', keras.metrics.Precision()]
    )

    lr_finder = LRFinder(0.0001, 0.1)
    nn.fit(
        x={
            'inp_normal': xtrain.drop(embedding_features, axis=1).values,
            'inp_dow_embedding': xtrain.dayofweek.values.reshape(-1, 1),
            'inp_hod_embedding': xtrain.hourofday.values.reshape(-1, 1)
        },
        y=ytrain.values,
        validation_data=(
            [
                xval.drop(embedding_features, axis=1).values,
                xval.dayofweek.values.reshape(-1, 1),
                xval.hourofday.values.reshape(-1, 1)
            ],
            yval.values
        ),
        epochs=2,
        batch_size=batchsize,
        callbacks=[lr_finder]
    )
    return nn
def find_lr(self, model, device, train_loader, lr_val=1e-8, decay=1e-2):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr_val, weight_decay=decay)
    lr_finder = LRFinder(model, optimizer, criterion, device)
    lr_finder.range_test(train_loader, end_lr=100, num_iter=100, step_mode="exp")
    lr_finder.plot()
    return lr_finder
def lr_finder(net, optimizer, loss_fun, trainloader, testloader):
    # Using LRFinder
    lr_finder = LRFinder(net, optimizer, loss_fun, device='cuda')
    lr_finder.range_test(trainloader, val_loader=testloader, start_lr=1e-3, end_lr=0.1,
                         num_iter=100, step_mode='exp')
    lr_finder.plot(log_lr=False)
    lr_finder.reset()  # important to restore the model and optimizer's parameters to their initial state
    return lr_finder.history
def train_and_test():
    # load network
    model = vgg11(365, 1)
    model.compile(
        loss='categorical_crossentropy',
        # Learning rate will be set by lr_finder
        optimizer=SGD(lr=0.0, momentum=0.9),
        metrics=['accuracy', top_5])

    # load data
    img_size = (224, 224)
    color_mode = 'grayscale'
    batch_size = 64
    train_dir = '/usr/local/data/gabriel/places365_line_drawings/train'
    test_dir = '/usr/local/data/gabriel/places365_line_drawings/val'
    # fixed for places365
    nb_train_samples = 1803460.
    nb_test_samples = 36500.

    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       shear_range=0.2,
                                       zoom_range=0.2,
                                       horizontal_flip=True)
    # no test data for now
    # test_datagen = ImageDataGenerator(rescale=1. / 255)

    train_gen = train_datagen.flow_from_directory(train_dir,
                                                  target_size=img_size,
                                                  batch_size=batch_size,
                                                  class_mode='categorical',
                                                  color_mode=color_mode)
    # no test data for now
    # test_gen = test_datagen.flow_from_directory(
    #     test_dir,
    #     target_size=img_size,
    #     batch_size=batch_size,
    #     class_mode='categorical',
    #     color_mode=color_mode
    # )

    # find best learning rate
    lr_finder = LRFinder(min_lr=1e-5,
                         max_lr=1e-2,
                         steps_per_epoch=np.ceil(nb_train_samples / batch_size),
                         epochs=4)

    model.fit_generator(train_gen,
                        steps_per_epoch=np.ceil(nb_train_samples / batch_size),
                        epochs=4,
                        callbacks=[lr_finder])

    # save loss and learning rate plots to files
    lr_finder.plot_loss('loss.png')
    lr_finder.plot_lr('lr.png')
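# The Keras LRFinder callbacks in these snippets come from different custom
# implementations. As a rough, assumed sketch of what such a callback does
# internally (not the actual class used above): ramp the learning rate
# exponentially between two bounds, one step per batch, and record the loss
# so it can later be plotted against the learning rate.
from tensorflow import keras

class SimpleLRSweep(keras.callbacks.Callback):
    def __init__(self, min_lr=1e-6, max_lr=1e-1, total_steps=100):
        super().__init__()
        self.min_lr = min_lr
        self.max_lr = max_lr
        self.total_steps = total_steps
        self.lrs, self.losses = [], []

    def on_train_batch_begin(self, batch, logs=None):
        step = len(self.lrs)
        # exponential interpolation from min_lr to max_lr
        lr = self.min_lr * (self.max_lr / self.min_lr) ** (step / max(1, self.total_steps - 1))
        keras.backend.set_value(self.model.optimizer.lr, lr)

    def on_train_batch_end(self, batch, logs=None):
        self.lrs.append(float(keras.backend.get_value(self.model.optimizer.lr)))
        self.losses.append(logs["loss"])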
def lr_finder(self, end_lr):
    lr_find = LRFinder(self.model, self.optimizer, self.criterion, cfg.device)
    lr_find.range_test(self.data_loaders['val'], end_lr=end_lr, num_iter=2000)
    lr_find.plot()
def _main():
    annotation_path = 'train.txt'
    classes_path = 'model_data/openimgs_classes.txt'
    anchors_path = 'model_data/yolo_anchors.txt'
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    anchors = get_anchors(anchors_path)

    input_shape = (416, 416)  # multiple of 32, hw

    # use darknet53 weights
    # model = create_model(input_shape, anchors, num_classes,
    #                      freeze_body=2, weights_path='model_data/darknet53_weights.h5')
    model = create_model(input_shape, anchors, num_classes, freeze_body=0,
                         weights_path='logs/001/trained_weights_stage_2.h5')

    val_split = 0.1
    with open(annotation_path) as f:
        lines = f.readlines()
    # np.random.seed(10101)
    np.random.shuffle(lines)
    # np.random.seed(None)
    num_val = int(len(lines) * val_split)
    num_val = 10000 if num_val > 10000 else num_val
    num_train = len(lines) - num_val

    if True:
        batch_size = 6
        lr_finder = LRFinder(min_lr=1e-10, max_lr=2e-2,
                             steps_per_epoch=TOTAL_ITERATIONS, epochs=1)

        for i in range(len(model.layers)):
            model.layers[i].trainable = True
        model.compile(optimizer=SGD(lr=1e-8),
                      loss={'yolo_loss': lambda y_true, y_pred: y_pred})
        print('train on {} samples, val on {} samples, with batch size {}.'.format(
            num_train, num_val, batch_size))
        model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size,
                                                   input_shape, anchors, num_classes),
                            steps_per_epoch=TOTAL_ITERATIONS,
                            validation_data=data_generator_wrapper(lines[num_train:], batch_size,
                                                                   input_shape, anchors, num_classes),
                            validation_steps=1,
                            epochs=1,
                            initial_epoch=0,
                            callbacks=[lr_finder])
        lr_finder.save_history('lr_finder_loss.csv')
        lr_finder.plot_loss('lr_finder_loss.png')
def get_LR(model, trainloader, optimizer, criterion, device, testloader=None):
    # print("########## Tweaked version from fastai ###########")
    # lr_find = LRFinder(model, optimizer, criterion, device="cuda")
    # lr_find.range_test(trainloader, end_lr=100, num_iter=100)
    # best_lr = lr_find.plot()  # to inspect the loss-learning rate graph
    # lr_find.reset()
    # return best_lr

    # print("########## Tweaked version from fastai ###########")
    # lr_find = LRFinder(model, optimizer, criterion, device="cuda")
    # lr_find.range_test(trainloader, end_lr=1, num_iter=100)
    # lr_find.plot()  # to inspect the loss-learning rate graph
    # lr_find.reset()
    # for index in range(len(lr_find.history['loss'])):
    #     item = lr_find.history['loss'][index]
    #     if item == lr_find.best_loss:
    #         min_val_index = index
    #         print(f"{min_val_index}")
    #
    # # lr_find.plot(show_lr=lr_find.history['lr'][75])
    # lr_find.plot(show_lr=lr_find.history['lr'][min_val_index])
    #
    # val_index = 75
    # mid_val_index = math.floor((val_index + min_val_index) / 2)
    # show_lr = [{'data': lr_find.history['lr'][val_index], 'linestyle': 'dashed'},
    #            {'data': lr_find.history['lr'][mid_val_index], 'linestyle': 'solid'},
    #            {'data': lr_find.history['lr'][min_val_index], 'linestyle': 'dashed'}]
    #
    # lr_find.plot_best_lr(skip_start=10, skip_end=5, log_lr=True, show_lr=show_lr, ax=None)
    #
    # best_lr = lr_find.history['lr'][mid_val_index]
    # print(f"LR to be used: {best_lr}")
    #
    # return best_lr

    print("########## Leslie Smith's approach ###########")
    lr_find = LRFinder(model, optimizer, criterion, device="cuda")
    lr_find.range_test(trainloader, val_loader=testloader, end_lr=1, num_iter=100,
                       step_mode="linear")
    best_lr = lr_find.plot(log_lr=False)
    lr_find.reset()
    return best_lr
    # iaa.Sometimes(0.1, iaa.Grayscale(alpha=(0.0, 1.0), from_colorspace="RGB", name="grayscale")),
    # iaa.Sometimes(0.2, iaa.AdditiveLaplaceNoise(scale=(0, 0.1*255), per_channel=True, name="gaus-noise")),

    # Color, Contrast, etc.
    iaa.Sometimes(0.2, iaa.Multiply((0.75, 1.25), per_channel=0.1, name="brightness")),
    iaa.Sometimes(0.2, iaa.GammaContrast((0.7, 1.3), per_channel=0.1, name="contrast")),
    iaa.Sometimes(0.2, iaa.AddToHueAndSaturation((-20, 20), name="hue-sat")),
    iaa.Sometimes(0.3, iaa.Add((-20, 20), per_channel=0.5, name="color-jitter")),
])

augs_test = iaa.Sequential([
    # Geometric Augs
    iaa.Scale((imsize, imsize), 0),
])

db_train = AlphaPilotSegmentation(
    input_dir='data/dataset/train/images', label_dir='data/dataset/train/labels',
    transform=augs_train,
    input_only=["gaus-blur", "grayscale", "gaus-noise", "brightness", "contrast", "hue-sat", "color-jitter"],
    return_image_name=False
)

trainloader = DataLoader(db_train, batch_size=p['trainBatchSize'], shuffle=True,
                         num_workers=32, drop_last=True)

# %matplotlib inline
lr_finder = LRFinder(net, optimizer, criterion, device="cuda")
lr_finder.range_test(trainloader, end_lr=1, num_iter=100)
lr_finder.plot()
# plt.show()
#                 else tensor.size(self.ch_dim)
#                 for i in range(2)]
# pad = torch.empty(padding_size, dtype=tensor.dtype).fill_(self.fill_value)
# tensor = torch.cat((tensor, pad), dim=self.len_dim)

# import os
# os.environ['MKL_NUM_THREADS'] = '1'
# import numpy as np
# import utils
#
#
# def main():
#     inpt = np.random.randint(-10, 10, 150000)
#     for i in range(1000):
#         out = utils.spectrogram(inpt, 256)
#
#
# if __name__ == '__main__':
#     main()

from lr_finder import LRFinder

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
lr_finder.range_test(train_loader, end_lr=1, num_iter=50, step_mode="exp")
lr_finder.get_best_lr()
# lr_finder.plot()
# lr_finder.history
def main(batch_size: int = 24,
         epochs: int = 384,
         train_path: str = 'train',
         val_path: str = 'val',
         multi_gpu_weights=None,
         weights=None,
         workers: int = 8,
         find_lr: bool = False):

    keras_model = MobileDetectNetModel.complete_model()
    keras_model.summary()

    if weights is not None:
        keras_model.load_weights(weights, by_name=True)

    train_seq = MobileDetectNetSequence(train_path, stage="train", batch_size=batch_size)
    val_seq = MobileDetectNetSequence(val_path, stage="val", batch_size=batch_size)

    keras_model = keras.utils.multi_gpu_model(keras_model, gpus=[0, 1],
                                              cpu_merge=True, cpu_relocation=False)

    if multi_gpu_weights is not None:
        keras_model.load_weights(multi_gpu_weights, by_name=True)

    callbacks = []

    def region_loss(classes):
        def loss_fn(y_true, y_pred):
            # Don't penalize bounding box errors when there is no object present
            return 10 * classes * K.abs(y_pred - y_true)
        return loss_fn

    keras_model.compile(optimizer=SGD(), loss=[
        'mean_absolute_error',
        region_loss(keras_model.get_layer('classes').output),
        'binary_crossentropy'
    ])

    if find_lr:
        from lr_finder import LRFinder
        lr_finder = LRFinder(keras_model)
        lr_finder.find_generator(train_seq, start_lr=0.000001, end_lr=1, epochs=5)
        lr_finder.plot_loss()
        return

    filepath = "weights-{epoch:02d}-{val_loss:.4f}-multi-gpu.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    callbacks.append(checkpoint)

    sgdr_sched = SGDRScheduler(0.00001, 0.01,
                               steps_per_epoch=np.ceil(len(train_seq) / batch_size),
                               mult_factor=1.5)
    callbacks.append(sgdr_sched)

    keras_model.fit_generator(
        train_seq,
        validation_data=val_seq,
        epochs=epochs,
        steps_per_epoch=np.ceil(len(train_seq) / batch_size),
        validation_steps=np.ceil(len(val_seq) / batch_size),
        callbacks=callbacks,
        use_multiprocessing=True,
        workers=workers,
        shuffle=True)
def main(args=None): set_random_seed(63) chainer.global_config.autotune = True chainer.cuda.set_max_workspace_size(512 * 1024 * 1024) parser = argparse.ArgumentParser() parser.add_argument('--batchsize', '-b', type=int, default=64, help='Number of images in each mini-batch') parser.add_argument('--learnrate', '-l', type=float, default=0.01, help='Learning rate for SGD') parser.add_argument('--epoch', '-e', type=int, default=80, help='Number of sweeps over the dataset to train') parser.add_argument('--gpu', '-g', type=int, default=0, help='GPU ID (negative value indicates CPU)') parser.add_argument('--out', '-o', default='result', help='Directory to output the result') parser.add_argument('--resume', '-r', default='', help='Resume the training from snapshot') parser.add_argument('--loss-function', choices=['focal', 'sigmoid'], default='focal') parser.add_argument('--optimizer', choices=['sgd', 'adam', 'adabound'], default='adam') parser.add_argument('--size', type=int, default=224) parser.add_argument('--limit', type=int, default=None) parser.add_argument('--data-dir', type=str, default='data') parser.add_argument('--lr-search', action='store_true') parser.add_argument('--pretrained', type=str, default='') parser.add_argument('--backbone', choices=['resnet', 'seresnet', 'debug_model'], default='resnet') parser.add_argument('--log-interval', type=int, default=100) parser.add_argument('--find-threshold', action='store_true') parser.add_argument('--finetune', action='store_true') parser.add_argument('--mixup', action='store_true') args = parser.parse_args() if args is None else parser.parse_args(args) print(args) if args.mixup and args.loss_function != 'focal': raise ValueError('mixupを使うときはfocal lossしか使えません(いまんところ)') train, test, cooccurrence = get_dataset(args.data_dir, args.size, args.limit, args.mixup) base_model = backbone_catalog[args.backbone](args.dropout) if args.pretrained: print('loading pretrained model: {}'.format(args.pretrained)) chainer.serializers.load_npz(args.pretrained, base_model, strict=False) model = TrainChain(base_model, 1, loss_fn=args.loss_function, cooccurrence=cooccurrence, co_coef=0) if args.gpu >= 0: chainer.backends.cuda.get_device_from_id(args.gpu).use() model.to_gpu() if args.optimizer in ['adam', 'adabound']: optimizer = Adam(alpha=args.learnrate, adabound=args.optimizer == 'adabound', weight_decay_rate=1e-5, gamma=5e-7) elif args.optimizer == 'sgd': optimizer = chainer.optimizers.MomentumSGD(lr=args.learnrate) optimizer.setup(model) if not args.finetune: print('最初のエポックは特徴抽出層をfreezeします') model.freeze_extractor() train_iter = chainer.iterators.MultiprocessIterator(train, args.batchsize, n_processes=8, n_prefetch=2) test_iter = chainer.iterators.MultithreadIterator(test, args.batchsize, n_threads=8, repeat=False, shuffle=False) if args.find_threshold: # train_iter, optimizerなど無駄なsetupもあるが。。 print('thresholdを探索して終了します') chainer.serializers.load_npz(join(args.out, 'bestmodel_loss'), base_model) print('lossがもっとも小さかったモデルに対しての結果:') find_threshold(base_model, test_iter, args.gpu, args.out) chainer.serializers.load_npz(join(args.out, 'bestmodel_f2'), base_model) print('f2がもっとも大きかったモデルに対しての結果:') find_threshold(base_model, test_iter, args.gpu, args.out) return # Set up a trainer updater = training.updaters.StandardUpdater( train_iter, optimizer, device=args.gpu, converter=lambda batch, device: chainer.dataset.concat_examples( batch, device=device)) trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out) # Evaluate the model with the test dataset for each 
epoch trainer.extend(FScoreEvaluator(test_iter, model, device=args.gpu)) if args.optimizer == 'sgd': # Adamにweight decayはあんまりよくないらしい optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4)) trainer.extend(extensions.ExponentialShift('lr', 0.1), trigger=(3, 'epoch')) if args.lr_search: print('最適な学習率を探します') trainer.extend(LRFinder(1e-7, 1, 5, optimizer), trigger=(1, 'iteration')) elif args.optimizer in ['adam', 'adabound']: if args.lr_search: print('最適な学習率を探します') trainer.extend(LRFinder(1e-7, 1, 5, optimizer, lr_key='alpha'), trigger=(1, 'iteration')) trainer.extend(extensions.ExponentialShift('alpha', 0.2), trigger=triggers.EarlyStoppingTrigger( monitor='validation/main/loss')) # Take a snapshot of Trainer at each epoch trainer.extend( extensions.snapshot(filename='snaphot_epoch_{.updater.epoch}'), trigger=(10, 'epoch')) # Take a snapshot of Model which has best val loss. # Because searching best threshold for each evaluation takes too much time. trainer.extend(extensions.snapshot_object(model.model, 'bestmodel_loss'), trigger=triggers.MinValueTrigger('validation/main/loss')) trainer.extend(extensions.snapshot_object(model.model, 'bestmodel_f2'), trigger=triggers.MaxValueTrigger('validation/main/f2')) trainer.extend(extensions.snapshot_object(model.model, 'model_{.updater.epoch}'), trigger=(5, 'epoch')) # Write a log of evaluation statistics for each epoch trainer.extend( extensions.LogReport(trigger=(args.log_interval, 'iteration'))) trainer.extend( extensions.PrintReport([ 'epoch', 'lr', 'elapsed_time', 'main/loss', 'main/co_loss', 'validation/main/loss', 'validation/main/co_loss', 'validation/main/precision', 'validation/main/recall', 'validation/main/f2', 'validation/main/threshold' ])) trainer.extend(extensions.ProgressBar(update_interval=args.log_interval)) trainer.extend(extensions.observe_lr(), trigger=(args.log_interval, 'iteration')) trainer.extend(CommandsExtension()) save_args(args, args.out) trainer.extend(lambda trainer: model.unfreeze_extractor(), trigger=(1, 'epoch')) if args.resume: # Resume from a snapshot chainer.serializers.load_npz(args.resume, trainer) # save args with pickle for prediction time pickle.dump(args, open(str(Path(args.out).joinpath('args.pkl')), 'wb')) # Run the training trainer.run() # find optimal threshold chainer.serializers.load_npz(join(args.out, 'bestmodel_loss'), base_model) print('lossがもっとも小さかったモデルに対しての結果:') find_threshold(base_model, test_iter, args.gpu, args.out) chainer.serializers.load_npz(join(args.out, 'bestmodel_f2'), base_model) print('f2がもっとも大きかったモデルに対しての結果:') find_threshold(base_model, test_iter, args.gpu, args.out)
from lr_finder import LRFinder
from src.model_lib.MultiFTNet import MultiFTNet
from src.model_lib.MiniFASNet import MiniFASNetV1, MiniFASNetV2, MiniFASNetV1SE, MiniFASNetV2SE
from src.utility import get_kernel
from torch.nn import CrossEntropyLoss, MSELoss
from torch import optim
from src.data_io.dataset_loader import get_train_loader, get_eval_loader
from src.default_config import get_default_config, update_config
from train import parse_args
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

kernel_size = get_kernel(80, 60)
model = MultiFTNet(conv6_kernel=kernel_size)

cls_criterion = CrossEntropyLoss()
FT_criterion = MSELoss()

# optimizer = optim.SGD(model.parameters(),
#                       lr=0.1,
#                       weight_decay=5e-4,
#                       momentum=0.9)
optimizer = optim.AdamW(model.parameters())

lr_finder = LRFinder(model, optimizer, cls_criterion, FT_criterion)

conf = get_default_config()
args = parse_args()
conf = update_config(args, conf)
trainloader = get_train_loader(conf)
val_loader = get_eval_loader(conf)

lr_finder.range_test(trainloader, end_lr=1, num_iter=100, step_mode="linear")
lr_finder.plot(log_lr=False)
lr_finder.reset()
from learner import Learner
from sklearn import model_selection as ms
from classifier import Classifier

if __name__ == '__main__':
    device = torch.device("cpu")
    net = Classifier('tf_efficientnet_b4_ns', 5, pretrained=True)
    transform = T.Compose([
        T.ToTensor(),
        T.Resize((380, 380)),
        T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    data_root = "/home/namnd/personal-workspace/cassava-leaf-disease-classification"
    df = pd.read_csv(os.path.join(data_root, 'train.csv'))
    # train_df, val_df = ms.train_test_split(df, test_size=0.2, random_state=42, stratify=df.label.values)
    #
    # train_dataset = CassavaLeafDiseaseDataset(data_root, df=train_df, transform=transform)
    # val_dataset = CassavaLeafDiseaseDataset(data_root, df=val_df, transform=transform)
    #
    # train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=mp.cpu_count())
    # val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=mp.cpu_count())
    dataloader = CassavaLeafDiseaseDataset(data_root, df, transform=transform)
    learner = Learner(net, dataloader, device)
    lr_finder = LRFinder(learner)
    lr_finder.find()
    lr_finder.plot()
def main(args):
    # load train data into ram
    # data_path = '/mntlong/lanl_comp/data/'
    file_dir = os.path.dirname(__file__)
    data_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'data'))
    train_info_path = os.path.join(data_path, 'train_info.csv')
    train_data_path = os.path.join(data_path, 'train_compressed.npz')

    train_info = pd.read_csv(train_info_path, index_col='Unnamed: 0')
    train_info['exp_len'] = train_info['indx_end'] - train_info['indx_start']

    train_signal = np.load(train_data_path)['signal']
    train_quaketime = np.load(train_data_path)['quake_time']

    # Use the last two waves (parts of the experiment) for validation
    val_start_idx = train_info.iloc[-2, :]['indx_start']
    val_signal = train_signal[val_start_idx:]
    val_quaketime = train_quaketime[val_start_idx:]
    train_signal = train_signal[:val_start_idx]
    train_quaketime = train_quaketime[:val_start_idx]

    # training params
    window_size = 150000
    overlap_size = int(window_size * 0.5)
    num_bins = 17

    model = models.BaselineNetRawSignalCnnRnnV1(out_size=num_bins - 1)
    loss_fn = nn.CrossEntropyLoss()  # L1Loss() SmoothL1Loss() MSELoss()

    # logs_path = '/mntlong/scripts/logs/'
    logs_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'logs'))
    current_datetime = datetime.today().strftime('%b-%d_%H-%M-%S')
    log_writer_path = os.path.join(logs_path, 'runs',
                                   current_datetime + '_' + args.model_name)

    train_dataset = data.SignalDataset(train_signal, train_quaketime,
                                       num_bins=num_bins,
                                       idxs_wave_end=train_info['indx_end'].values,
                                       window_size=window_size,
                                       overlap_size=overlap_size)
    val_dataset = data.SignalDataset(val_signal, val_quaketime,
                                     num_bins=num_bins,
                                     idxs_wave_end=train_info['indx_end'].values,
                                     window_size=window_size,
                                     overlap_size=overlap_size)

    print('wave size:', train_dataset[0][0].size())

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=5,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=5,
                            pin_memory=True)

    if args.find_lr:
        from lr_finder import LRFinder
        optimizer = optim.Adam(model.parameters(), lr=1e-6)
        lr_find = LRFinder(model, optimizer, loss_fn, device='cuda')
        lr_find.range_test(train_loader, end_lr=1, num_iter=50, step_mode='exp')
        best_lr = lr_find.get_best_lr()
        lr_find.plot()
        lr_find.reset()
        print('best lr found: {:.2e}'.format(best_lr))
    else:
        best_lr = 3e-4

    optimizer = optim.Adam(model.parameters(), lr=best_lr)  # weight_decay=0.1
    lr_sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5,
                                                    patience=3, threshold=0.005)

    log_writer = SummaryWriter(log_writer_path)
    utils.train_clf_model(model=model, optimizer=optimizer, lr_scheduler=lr_sched,
                          train_loader=train_loader, val_loader=val_loader,
                          num_epochs=args.num_epochs, model_name=args.model_name,
                          logs_path=logs_path, log_writer=log_writer,
                          loss_fn=loss_fn, num_bins=num_bins)
def main(): global global_token_count, event_writer, train_step, train_loss, last_log_step, \ best_val_loss, epoch, model if args.local_rank > 0: pass # skip shutdown when rank is explicitly set + not zero rank else: os.system('shutdown -c') if not args.local: logger.info( f'Distributed initializing process group with {args.dist_backend}, {args.dist_url}, {util.get_world_size()}' ) dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=util.get_world_size()) assert (util.get_world_size() == dist.get_world_size()) logger.info( f"Distributed: success ({args.local_rank}/{dist.get_world_size()})" ) model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, args.d_head, args.d_inner, args.dropout, args.dropatt, tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, same_length=args.same_length, attn_type=args.attn_type, clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) # log model info n_all_param = sum([p.nelement() for p in model.parameters()]) log_tb('sizes/params', n_all_param) n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) log_tb('sizes/non_emb_params', n_nonemb_param) logger.info('params %s non_emb_params %s', n_all_param, n_nonemb_param) # optimizer if args.optim.lower() == 'sgd': optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.mom) elif args.optim.lower() == 'lamb': optimizer = Lamb(model.parameters(), lr=args.lr, weight_decay=args.wd) else: assert args.optim.lower() == 'adam' optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd) # scheduler if args.scheduler == 'cosine': # Divide by 1e6 for numerical stability. scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_tokens // 1e6, eta_min=args.eta_min) elif args.scheduler == 'finder': scheduler = LRFinder(optimizer, args.max_tokens, init_value=args.lr / 1e3) elif args.scheduler == 'constant': pass model.apply(weights_init) model.word_emb.apply( weights_init ) # ensure embedding init is not overridden by out_layer in case of weight sharing if args.checkpoint: if global_rank == 0: util.restore_from_checkpoint(model=model, checkpoint_fn=args.checkpoint) model = model.to(device) if args.fp16: model = FP16_Module(model) optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale, dynamic_loss_args={'init_scale': 2**16}, verbose=False) if args.local: model = nn.DataParallel(model, dim=1) else: # Uncomment find_unused_parameters and upgrade to torch 1.1 for adaptive embedding. model = DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) #, find_unused_parameters=True) if global_rank == 0: event_writer = SummaryWriter(args.logdir) event_writer.add_text('args', str(args)) # test checkpoint writing if args.checkpoint_each_epoch: logger.info(f'Saving checkpoint for epoch {epoch}') util.dist_save_checkpoint(model, optimizer, args.logdir, suffix=f'{0}') # Loop over epochs. train_step = 0 train_loss = 0 last_log_step = 0 best_val_loss = None va_iter, te_iter = [ corpus.get_dist_iterator(split, global_rank, max_rank, args.batch_size * 2, args.tgt_len, device=device, ext_len=args.ext_len) for split in ('valid', 'test') ] # At any point you can hit Ctrl + C to break out of training early. 
try: for epoch in itertools.count(start=1): train(va_iter, optimizer, scheduler) except KeyboardInterrupt: logger.info('-' * 100) logger.info('Exiting from training early') except StopIteration: pass # Eval one more time. evaluate_and_log(optimizer, va_iter, 'val', train_step=-1) # Load the best saved model. logger.info("Loading best checkpoint") model_file = os.path.join(args.logdir, 'model-best.pt') if os.path.exists(model_file): with open(model_file, 'rb') as model_f: with timeit('load'): if args.local: model = torch.load(model_f) else: model = torch.load(model_f, map_location=lambda storage, loc: storage.cuda(args.local_rank)) model = DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) else: logger.warn('no model file, using current model for loss') # Run on test data. evaluate_and_log(optimizer, te_iter, 'test', -1)
    sum_kl_loss = keras.backend.sum(kl_loss, axis=0)
    sum_g_loss = keras.backend.sum(g_loss, axis=0)
    sum_g_loss = sum_g_loss * alpha  # this is basically a loss penalty
    loss = sum_g_loss + sum_kl_loss
    return loss

# Model: define inputs and outputs
model = Model(inputs=[in_1, in_2, in_3], outputs=out_vals)  # probabilities
opt = optimizers.Adam(clipnorm=1., lr=lrate)  # remove clipnorm and add loss penalty - clipnorm works better
model.compile(loss=bin_loss, optimizer=opt)

if find_lr == True:
    lr_finder = LRFinder(model)
    X_train = [X1_train, X2_train, X3_train]
    lr_finder.find(X_train, y_train, start_lr=0.00000001, end_lr=1,
                   batch_size=batch_size, epochs=1)
    losses = lr_finder.losses
    lrs = lr_finder.lrs
    l_l = np.asarray([lrs, losses])
    np.savetxt(out_dir + 'lrs_losses.txt', l_l)

num_epochs = 0
def train_loop(folds, fold): if CFG.device == 'GPU': LOGGER.info(f"========== fold: {fold} training ==========") # ==================================================== # loader # ==================================================== trn_idx = folds[folds['fold'] != fold].index val_idx = folds[folds['fold'] == fold].index train_folds = folds.loc[trn_idx].reset_index(drop=True) valid_folds = folds.loc[val_idx].reset_index(drop=True) valid_labels = valid_folds[CFG.target_cols].values train_dataset = TrainDataset(train_folds, transform=get_transforms(data='train')) valid_dataset = TrainDataset(valid_folds, transform=get_transforms(data='valid')) train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, num_workers=CFG.num_workers, pin_memory=True, drop_last=True) valid_loader = DataLoader(valid_dataset, batch_size=CFG.batch_size * 2, shuffle=False, num_workers=CFG.num_workers, pin_memory=True, drop_last=False) # ==================================================== # scheduler # ==================================================== def get_scheduler(optimizer): if CFG.scheduler == 'ReduceLROnPlateau': scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.eps) elif CFG.scheduler == 'CosineAnnealingLR': scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1) elif CFG.scheduler == 'CosineAnnealingWarmRestarts': scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1) return scheduler # ==================================================== # model & optimizer # ==================================================== device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = CustomModel(CFG.model_name, pretrained=False) model = torch.nn.DataParallel(model) model.load_state_dict( torch.load(f'{CFG.model_name}_student_fold{fold}_best_score.pth', map_location=torch.device('cpu'))['model']) # model.load_state_dict(torch.load(f'0.9647/{CFG.model_name}_no_hflip_fold{fold}_best_score.pth', map_location=torch.device('cpu'))['model']) model.to(device) # criterion = nn.BCEWithLogitsLoss() criterion = FocalLoss(alpha=1, gamma=6) # optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False) optimizer = SGD(model.parameters(), lr=1e-2, weight_decay=CFG.weight_decay, momentum=0.9) find_lr = False if find_lr: from lr_finder import LRFinder lr_finder = LRFinder(model, optimizer, criterion, device=device) lr_finder.range_test(train_loader, start_lr=1e-2, end_lr=1e0, num_iter=100, accumulation_steps=1) fig_name = f'{CFG.model_name}_lr_finder.png' lr_finder.plot(fig_name) lr_finder.reset() return scheduler = get_scheduler(optimizer) swa_model = torch.optim.swa_utils.AveragedModel(model) swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=1e-3) swa_start = 9 # ==================================================== # loop # ==================================================== best_score = 0. 
best_loss = np.inf for epoch in range(CFG.epochs): start_time = time.time() # train avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device) # eval # avg_val_loss, preds, _ = valid_fn(valid_loader, model, criterion, device) if epoch > swa_start: swa_model.update_parameters(model) swa_scheduler.step() else: if isinstance(scheduler, ReduceLROnPlateau): scheduler.step(avg_val_loss) elif isinstance(scheduler, CosineAnnealingLR): scheduler.step() elif isinstance(scheduler, CosineAnnealingWarmRestarts): scheduler.step() # scoring avg_val_loss, preds, _ = valid_fn(valid_loader, model, criterion, device) score, scores = get_score(valid_labels, preds) elapsed = time.time() - start_time LOGGER.info( f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s' ) LOGGER.info( f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}' ) if score > best_score: best_score = score LOGGER.info( f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model') torch.save({'model': model.state_dict()}, OUTPUT_DIR + f'{CFG.model_name}_no_hflip_fold{fold}_best_score.pth') # if avg_val_loss < best_loss: # best_loss = avg_val_loss # LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model') # torch.save({'model': model.state_dict(), # 'preds': preds}, # OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_loss.pth') torch.optim.swa_utils.update_bn(train_loader, swa_model) avg_val_loss, preds, _ = valid_fn(valid_loader, swa_model, criterion, device) score, scores = get_score(valid_labels, preds) LOGGER.info(f'Save swa Score: {score:.4f} Model') torch.save({'model': swa_model.state_dict()}, OUTPUT_DIR + f'swa_{CFG.model_name}_fold{fold}_{score:.4f}.pth') # if CFG.nprocs != 8: # check_point = torch.load(OUTPUT_DIR+f'{CFG.model_name}_fold{fold}_best_score.pth') # for c in [f'pred_{c}' for c in CFG.target_cols]: # valid_folds[c] = np.nan # try: # valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds'] # except: # pass return
class Trainer: def __init__(self, model, criterion, optimizer, train_loader, val_loader=None, name="experiment", experiments_dir="runs", save_dir=None, div_lr=1): self.device = device() self.model = model.to(self.device) self.criterion = criterion self.optimizer = optimizer self.train_loader = train_loader self.val_loader = val_loader self.div_lr = div_lr self.update_lr(self.optimizer.defaults['lr']) self._epoch_count = 0 self._best_loss = None self._best_acc = None if save_dir is None: save_dir = f"{self.get_num_dir(experiments_dir):04d}-{get_git_hash()}-{name}" self._save_dir = os.path.join(experiments_dir, save_dir) self.writer = Logger(self._save_dir) atexit.register(self.cleanup) def train(self, epochs=1): for epoch in range(epochs): self._epoch_count += 1 print("\n----- epoch ", self._epoch_count, " -----") train_loss, train_acc = self._train_epoch() if self.val_loader: val_loss, val_acc = self._validate_epoch() if self._best_loss is None or val_loss < self._best_loss: self.save_checkpoint('best_model') self._best_loss = val_loss print("new best val loss!") if self._best_acc is None or val_acc > self._best_acc: self.save_checkpoint('best_model_acc') self._best_acc = val_acc print("new best val acc!") def test(self, test_loader): self.model.eval() running_loss = 0 running_acc = 0 for iter, (inputs, targets) in enumerate(tqdm(test_loader)): inputs = inputs.to(device()) targets = targets.to(device()) with torch.set_grad_enabled(False): outputs = self.model(inputs) batch_loss = self.criterion(outputs, targets) batch_acc = accuracy(outputs, targets) running_loss += batch_loss.item() running_acc += batch_acc.item() epoch_loss = running_loss / len(test_loader) epoch_acc = running_acc / len(test_loader) print(f"test loss: {epoch_loss:.5f} test acc: {epoch_acc:.5f}") return epoch_loss, epoch_acc def train_one_cycle(self, epochs=1, lr=None): if lr is None: lr = self.optimizer.defaults['lr'] self.onecycle = OneCycle(len(self.train_loader) * epochs, lr) self.train(epochs) self.onecycle = None def _train_epoch(self, save_histogram=False): self.model.train() running_loss = 0 running_acc = 0 for iter, (inputs, targets) in enumerate(tqdm(self.train_loader)): inputs = inputs.to(device()) targets = targets.to(device()) if self.onecycle is not None: lr, mom = next(self.onecycle) self.update_lr(lr) self.update_mom(mom) with torch.set_grad_enabled(True): outputs = self.model(inputs) batch_loss = self.criterion(outputs, targets) batch_acc = accuracy(outputs, targets) batch_loss.backward() self.optimizer.step() self.optimizer.zero_grad() running_loss += batch_loss.item() running_acc += batch_acc.item() if self.log_every(iter): self.writer.add_scalars( "loss", {"train_loss": running_loss / float(iter + 1)}, (self._epoch_count - 1) * len(self.train_loader) + iter) self.writer.add_scalars( "acc", {"train_acc": running_acc / float(iter + 1)}, (self._epoch_count - 1) * len(self.train_loader) + iter) epoch_loss = running_loss / len(self.train_loader) epoch_acc = running_acc / len(self.train_loader) print(f"train loss: {epoch_loss:.5f} train acc: {epoch_acc:.5f}") return epoch_loss, epoch_acc def _validate_epoch(self): self.model.eval() running_loss = 0 running_acc = 0 for iter, (inputs, targets) in enumerate(tqdm(self.val_loader)): inputs = inputs.to(device()) targets = targets.to(device()) with torch.set_grad_enabled(False): outputs = self.model(inputs) batch_loss = self.criterion(outputs, targets) batch_acc = accuracy(outputs, targets) running_loss += batch_loss.item() running_acc += batch_acc.item() if 
self.log_every(iter): self.writer.add_scalars( "loss", {"val_loss": running_loss / float(iter + 1)}, (self._epoch_count - 1) * len(self.val_loader) + iter) self.writer.add_scalars( "acc", {"val_acc": running_acc / float(iter + 1)}, (self._epoch_count - 1) * len(self.val_loader) + iter) epoch_loss = running_loss / len(self.val_loader) epoch_acc = running_acc / len(self.val_loader) print(f"val loss: {epoch_loss:.5f} val acc: {epoch_acc:.5f}") return epoch_loss, epoch_acc def get_num_dir(self, path): num_dir = len(os.listdir(path)) return num_dir def save_checkpoint(self, fname): path = os.path.join(self._save_dir, fname) torch.save( dict( epoch=self._epoch_count, best_loss=self._best_loss, best_acc=self._best_acc, model=self.model.state_dict(), optimizer=self.optimizer.state_dict(), ), path) def load_checkpoint(self, fname): path = os.path.join(self._save_dir, fname) checkpoint = torch.load(path, map_location=lambda storage, loc: storage) self._epoch_count = checkpoint['epoch'] self.model.load_state_dict(checkpoint['model']) self.optimizer.load_state_dict(checkpoint['optimizer']) def log_every(self, i): return (i % 100) == 0 def update_lr(self, lr): n = len(self.optimizer.param_groups) - 1 for i, g in enumerate(self.optimizer.param_groups): g['lr'] = lr / (self.div_lr**(n - i)) def update_mom(self, mom): keys = self.optimizer.param_groups[0].keys() for g in self.optimizer.param_groups: if 'momentum' in g.keys(): g['momentum'] = mom elif 'betas' in g.keys(): g['betas'] = mom if isinstance(mom, tuple) else (mom, g['betas'][1]) else: raise ValueError def find_lr(self, start_lr=1e-7, end_lr=100, num_iter=100): optimizer_state = self.optimizer.state_dict() self.update_lr(start_lr) self.lr_finder = LRFinder(self.model, self.optimizer, self.criterion, self.device) self.lr_finder.range_test(self.train_loader, end_lr=end_lr, num_iter=num_iter) self.optimizer.load_state_dict(optimizer_state) self.lr_finder.plot() def cleanup(self): copy_runpy(self._save_dir) path = os.path.join(self._save_dir, "./all_scalars.json") self.writer.export_scalars_to_json(path) self.writer.close()
def main_loop(): util.cancel_shutdown() losses = [] args = g.args if not args.local: g.logger.info( f'Distributed initializing process group with ' f'{args.dist_backend}, {args.dist_url}, {util.get_world_size()}') dist.init_process_group( backend=args.dist_backend, #init_method=args.dist_url, #world_size=util.get_world_size() ) assert (util.get_world_size() == dist.get_world_size()) g.logger.info( f"Distributed: success ({args.local_rank}/{dist.get_world_size()})" ) g.logger.info("creating new model") g.state = TrainState(args) g.state.model = MemTransformerLM(g.ntokens, args.n_layer, args.n_head, args.d_model, args.d_head, args.d_inner, args.dropout, args.dropatt, tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, tie_projs=g.tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=g.cutoffs, same_length=args.same_length, attn_type=args.attn_type, clamp_len=args.clamp_len, sample_softmax=args.sample_softmax, freeze_below=args.freeze_below) g.state.model.to(g.device) optimizer_setup(g.state) if args.checkpoint: if args.checkpoint_secondary: g.logger.info(f"restoring extra checkpoint") util.restore_from_checkpoint(g.state.model, g.state.optimizer, args.checkpoint_secondary, args.optim_state_dict) g.logger.info(f"Restoring model from {args.checkpoint}" + f" and optimizer from {args.optim_state_dict}" if args. optim_state_dict else "") util.restore_from_checkpoint(g.state.model, g.state.optimizer, args.checkpoint, args.optim_state_dict) else: g.state.model.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing g.state.model.word_emb.apply(weights_init) model: MemTransformerLM = g.state.model optimizer = g.state.optimizer if g.state.args.fp16: model = FP16_Module(model) optimizer = FP16_Optimizer( optimizer, static_loss_scale=g.state.args.static_loss_scale, dynamic_loss_scale=g.state.args.dynamic_loss_scale, dynamic_loss_args={'init_scale': 2**16}, verbose=False) # log model info # n_all_param = sum([p.nelement() for p in model.parameters()]) # log_tb('sizes/params', n_all_param) # n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) # log_tb('sizes/non_emb_params', n_nonemb_param) # g.logger.info('params %s non_emb_params %s', n_all_param, n_nonemb_param) # scheduler if args.scheduler == 'cosine': # Divide by 1e6 for numerical stability. g.state.scheduler = optim.lr_scheduler.CosineAnnealingLR( optimizer, args.max_tokens // 1e6, eta_min=args.eta_min) elif args.scheduler == 'finder': g.state.scheduler: LRFinder = LRFinder(optimizer, args.max_tokens, init_value=args.lr / 1e3) else: assert args.scheduler == 'constant' g.state.scheduler = util.NoOp() # Setup distributed model if args.local: model = nn.DataParallel(model, dim=1) else: # Uncomment find_unused_parameters and upgrade to torch 1.1 for adaptive embedding. model = DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) # , find_unused_parameters=True) if util.get_global_rank() == 0: if not args.test: wandb.config.update(vars(args)) # wandb.watch(model) g.event_writer.add_text('args', str(args)) # TODO: replace with log_tb accumulated_loss = 0 # At any point you can hit Ctrl + C to break out of training early. 
try: for epoch in itertools.count(start=g.state.last_epoch): print(f"epoch -- {epoch}, token_count -- {g.state.token_count}") model.train() log_tb('sizes/batch_size', args.batch_size) log_tb('sizes/seq_size', args.tgt_len) if g.state.partial_epoch: # reuse previously loaded tr_iter and states assert g.state.tr_iter is not None assert g.state.mems is not None else: g.state.tr_iter = g.corpus.get_dist_iterator( 'train', rank=util.get_global_rank(), max_rank=util.get_world_size(), bsz=args.batch_size, bptt=args.tgt_len, device=g.device, ext_len=args.ext_len, skip_files=g.args.skip_files) g.state.mems = tuple() g.state.last_epoch = epoch log_start_time = time.time() tokens_per_epoch = 0 for batch, (data, target, seq_len) in enumerate(g.state.tr_iter): # assert seq_len == data.shape[0] # for i in range(1, data.shape[0]): # assert torch.all(torch.eq(data[i], target[i - 1])) # break # print(g.state.token_count, data) if g.state.train_step % args.eval_interval == 0: evaluate_and_log(model, g.va_iter, 'val_short-mem-1', generate_text=False, reset_mems_interval=1) evaluate_and_log(model, g.va_iter, 'val_short-mem-2', generate_text=False, reset_mems_interval=2) evaluate_and_log(model, g.va_iter, 'val_short-mem-3', generate_text=False, reset_mems_interval=3) evaluate_and_log(model, g.va_iter, 'val') if g.va_custom_iter: evaluate_and_log(g.state.model, g.va_custom_iter, g.args.valid_custom, generate_text=False) batch_total = torch.tensor(data.shape[1]).to(g.device) if args.local: # TODO(y): factor out (need way to see if dist was inited) batch_total = batch_total.sum() else: batch_total = util.dist_sum_tensor( batch_total) # global batch size batch_total = util.toscalar(batch_total) should_log = (g.state.train_step < args.verbose_log_steps) or \ (g.state.train_step + 1) % args.log_interval == 0 model.zero_grad() ret = model(data, target, *g.state.mems) loss, g.state.mems = ret[0], ret[1:] loss: torch.Tensor = loss.float().mean().type_as(loss) with timeit('backwards', noop=not should_log): if args.fp16: optimizer.backward(loss) else: loss.backward() loss0 = util.toscalar(loss) util.record('loss', loss0) util.record('params', torch.sum(util.flat_param(model)).item()) losses.append(loss0) accumulated_loss += loss0 if args.fp16: optimizer.clip_master_grads(args.clip) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) # step-wise learning rate annealing if hasattr(optimizer, 'overflow') and optimizer.overflow: g.logger.info("skipped iteration") else: if args.scheduler in ['cosine', 'constant', 'dev_perf']: # linear warmup stage if g.state.token_count < args.warmup_tokens: curr_lr = args.lr * float( g.state.token_count) / args.warmup_tokens optimizer.param_groups[0]['lr'] = curr_lr elif args.scheduler == 'cosine': # Divide by 1e6 for numerical stability. 
g.state.scheduler.step(g.state.token_count // 1000 // 1000) else: g.state.scheduler.step(g.state.token_count) optimizer.step() g.state.train_step += 1 consumed_tokens = data.shape[0] * data.shape[1] world_size = int(os.environ.get("WORLD_SIZE", "8")) if world_size > 8: # correction factor for multiple machines consumed_tokens = consumed_tokens * (world_size // 8) tokens_per_epoch += consumed_tokens g.state.token_count += consumed_tokens g.token_count = g.state.token_count if g.state.token_count >= args.max_tokens: g.state.partial_epoch = True raise StopIteration # break out of parent train loop if should_log: elapsed_time = time.time() - log_start_time elapsed_steps = g.state.train_step - g.state.last_log_step # compute average loss over last logging interval cur_loss = accumulated_loss / elapsed_steps cur_loss_mean = util.dist_mean(cur_loss) log_str = f'| epoch {epoch:3d} step {g.state.train_step:>8d} ' \ f'| {batch:>6d} batches ' \ f'| lr {optimizer.param_groups[0]["lr"]:.3g} ' \ f'| ms/batch {elapsed_time * 1000 / elapsed_steps:5.2f} ' \ f'| loss {cur_loss:5.2f}' if args.dataset in ['enwik8', 'text8']: log_str += f' | bpc {cur_loss / math.log(2):9.5f}' else: log_str += f' | ppl {math.exp(cur_loss):9.3f}' g.logger.info(log_str) log_tb('learning/epoch', epoch) log_tb('_loss', cur_loss_mean) # the most important thing log_tb('learning/loss', cur_loss_mean) log_tb('learning/ppl', math.exp(cur_loss_mean)) # currently step timings are not synchronized in multi-machine # case (see #4). Can add torch.distributed.barrier() to get # more accurate timings, but this may add slowness. log_tb('times/step', 1000 * elapsed_time / elapsed_steps) current_lr = optimizer.param_groups[0]['lr'] log_tb('learning/lr', current_lr) # 32 is the "canonical" batch size linear_scaling_factor = batch_total / 32 # TODO(y): merge logic from master log_tb('learning/base_lr', current_lr / linear_scaling_factor) if args.optim == 'lamb': log_lamb_rs(optimizer, g.event_writer, g.state.token_count) time_per_batch = elapsed_time / elapsed_steps time_per_sample = time_per_batch / args.batch_size time_per_token = time_per_sample / args.tgt_len log_tb('times/batches_per_sec', 1 / time_per_batch) log_tb('times/samples_per_sec', 1 / time_per_sample) log_tb('times/tokens_per_sec', 1 / time_per_token) if str(g.device) == 'cuda': log_tb("memory/allocated_gb", torch.cuda.memory_allocated() / 1e9) log_tb("memory/max_allocated_gb", torch.cuda.max_memory_allocated() / 1e9) log_tb("memory/cached_gb", torch.cuda.memory_cached() / 1e9) log_tb("memory/max_cached_gb", torch.cuda.max_memory_cached() / 1e9) accumulated_loss = 0 log_start_time = time.time() g.state.last_log_step = g.state.train_step if args.checkpoint_each_epoch: g.logger.info(f'Saving checkpoint for epoch {epoch}') util.dist_save_checkpoint(model, optimizer, args.logdir, suffix=f'{epoch}') if tokens_per_epoch == 0: logging.info("Zero tokens in last epoch, breaking") break g.state.partial_epoch = False except KeyboardInterrupt: g.logger.info('-' * 100) g.logger.info('Exiting from training early') except StopIteration: pass return losses
    factor=0.5,
    patience=4,
    min_lr=0.000001,
    cooldown=3,
    verbose=1)

stop_on_nan = keras.callbacks.TerminateOnNaN()

# LR finder
if opt.find_lr:
    # pre-train to avoid model being too far away from interesting range
    history = model.fit_generator(gen_x_train, epochs=2, verbose=1, callbacks=[clr])
    lr_finder = LRFinder(model)
    lr_finder.find_generator(gen_x_train, 0.00001, 1.0, 5)
    lr_finder.plot_loss()
    import pdb
    pdb.set_trace()

# Run training
if not opt.notrain:
    # Train classifier
    history = model.fit_generator(
        gen_x_train,
        epochs=epochs,
        verbose=1,  # switch to 1 for more verbosity
        callbacks=[early_stopping, clr, stop_on_nan],  # , reduce_lr], #, lr, reduce_lr],
        # callbacks=[early_stopping, reduce_lr],  # , lr, reduce_lr],
def main_loop(): util.cancel_shutdown() losses = [] args = g.args if not args.local: g.logger.info( f'Distributed initializing process group with {args.dist_backend}, {args.dist_url}, {util.get_world_size()}') dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=util.get_world_size()) assert (util.get_world_size() == dist.get_world_size()) g.logger.info(f"Distributed: success ({args.local_rank}/{dist.get_world_size()})") if args.load_state_fn: g.state = load_state(args.load_state_fn) g.logger.info(f"Restoring training from {args.load_state_fn}") else: g.logger.info("creating new model") g.state = TrainState(args) g.state.model = MemTransformerLM(g.ntokens, args.n_layer, args.n_head, args.d_model, args.d_head, args.d_inner, args.dropout, args.dropatt, tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, tie_projs=g.tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=g.cutoffs, same_length=args.same_length, attn_type=args.attn_type, clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) if args.checkpoint: util.restore_from_checkpoint(g.state.model, checkpoint_fn=args.checkpoint) else: g.state.model.apply(weights_init) g.state.model.word_emb.apply( weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing g.state.model.to(g.device) optimizer_setup(g.state) model: MemTransformerLM = g.state.model optimizer = g.state.optimizer # log model info # n_all_param = sum([p.nelement() for p in model.parameters()]) # log_tb('sizes/params', n_all_param) # n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) # log_tb('sizes/non_emb_params', n_nonemb_param) # g.logger.info('params %s non_emb_params %s', n_all_param, n_nonemb_param) # scheduler if not g.args.load_state_fn: if args.scheduler == 'cosine': # Divide by 1e6 for numerical stability. g.state.scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_tokens // 1e6, eta_min=args.eta_min) elif args.scheduler == 'finder': g.state.scheduler: LRFinder = LRFinder(optimizer, args.max_tokens, init_value=args.lr / 1e3) else: assert args.scheduler == 'constant' g.state.scheduler = util.NoOp() # Setup distributed model if args.local: model = nn.DataParallel(model, dim=1) else: # Uncomment find_unused_parameters and upgrade to torch 1.1 for adaptive embedding. model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) # , find_unused_parameters=True) if util.get_global_rank() == 0: if not args.test: wandb.config.update(vars(args)) # wandb.watch(model) g.event_writer.add_text('args', str(args)) # TODO: replace with log_tb accumulated_loss = 0 # At any point you can hit Ctrl + C to break out of training early. try: for epoch in itertools.count(start=g.state.last_epoch): print(f"epoch -- {epoch}, token_count -- {g.state.
def main(args):
    # load train data into ram
    # data_path = '/mntlong/lanl_comp/data/'
    file_dir = os.path.dirname(__file__)
    data_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'data'))
    train_info_path = os.path.join(data_path, 'train_info.csv')
    train_data_path = os.path.join(data_path, 'train_compressed.npz')

    train_info = pd.read_csv(train_info_path, index_col='Unnamed: 0')
    train_info['exp_len'] = train_info['indx_end'] - train_info['indx_start']

    train_signal = np.load(train_data_path)['signal']
    train_quaketime = np.load(train_data_path)['quake_time']

    # Use the last two waves (parts of the experiment) for validation
    val_start_idx = train_info.iloc[-2, :]['indx_start']
    val_signal = train_signal[val_start_idx:]
    val_quaketime = train_quaketime[val_start_idx:]
    train_signal = train_signal[:val_start_idx]
    train_quaketime = train_quaketime[:val_start_idx]

    # training params
    large_ws = 1500000
    overlap_size = int(large_ws * 0.5)
    small_ws = 150000
    num_bins = 17

    cpc_meta_model = models.CPCv1(out_size=num_bins - 1)

    # logs_path = '/mntlong/scripts/logs/'
    logs_path = os.path.abspath(os.path.join(file_dir, os.path.pardir, 'logs'))
    current_datetime = datetime.today().strftime('%b-%d_%H-%M-%S')
    log_writer_path = os.path.join(logs_path, 'runs',
                                   current_datetime + '_' + args.model_name)

    train_dataset = data.SignalCPCDataset(
        train_signal, train_quaketime,
        num_bins=num_bins,
        idxs_wave_end=train_info['indx_end'].values,
        large_ws=large_ws, overlap_size=overlap_size, small_ws=small_ws)
    val_dataset = data.SignalCPCDataset(
        val_signal, val_quaketime,
        num_bins=num_bins,
        idxs_wave_end=train_info['indx_end'].values,
        large_ws=large_ws, overlap_size=overlap_size, small_ws=small_ws)

    print('x_t size:', train_dataset[0][0].size())

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=5,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=5,
                            pin_memory=True)

    if args.find_lr:
        from lr_finder import LRFinder
        optimizer = optim.Adam(cpc_meta_model.parameters(), lr=1e-6)
        lr_find = LRFinder(cpc_meta_model, optimizer, criterion=None,
                           is_cpc=True, device='cuda')
        lr_find.range_test(train_loader, end_lr=2, num_iter=75, step_mode='exp')
        best_lr = lr_find.get_best_lr()
        lr_find.plot()
        lr_find.reset()
        print('best lr found: {:.2e}'.format(best_lr))
    else:
        best_lr = 3e-4

    # sys.exit()

    # model_path = os.path.join(logs_path, 'cpc_no_target_head_cont_last_state.pth')
    # cpc_meta_model.load_state_dict(torch.load(model_path)['model_state_dict'])
    # cpc_meta_model.to(torch.device('cuda'))

    optimizer = optim.Adam(cpc_meta_model.parameters(), lr=best_lr)
    # optimizer.load_state_dict(torch.load(model_path)['optimizer_state_dict'])
    lr_sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5,
                                                    patience=3, threshold=0.005)

    log_writer = SummaryWriter(log_writer_path)
    utils.train_cpc_model(cpc_meta_model=cpc_meta_model,
                          optimizer=optimizer,
                          num_bins=num_bins,
                          lr_scheduler=lr_sched,
                          train_loader=train_loader,
                          val_loader=val_loader,
                          num_epochs=args.num_epochs,
                          model_name=args.model_name,
                          logs_path=logs_path,
                          log_writer=log_writer)
def main(args):
    np.random.seed(432)
    torch.random.manual_seed(432)

    try:
        os.makedirs(args.outpath)
    except OSError:
        pass
    experiment_path = utils.get_new_model_path(args.outpath)
    print(experiment_path)

    train_writer = SummaryWriter(os.path.join(experiment_path, 'train_logs'))
    val_writer = SummaryWriter(os.path.join(experiment_path, 'val_logs'))
    trainer = train.Trainer(train_writer, val_writer)

    # todo: add config
    train_transform = data.build_preprocessing()
    eval_transform = data.build_preprocessing()

    trainds, evalds = data.build_dataset(args.datadir, None)
    trainds.transform = train_transform
    evalds.transform = eval_transform

    model = models.resnet34()
    opt = torch.optim.Adam(model.parameters(), lr=1e-8)

    trainloader = DataLoader(trainds, batch_size=args.batch_size, shuffle=True,
                             num_workers=8, pin_memory=True)
    evalloader = DataLoader(evalds, batch_size=args.batch_size, shuffle=False,
                            num_workers=16, pin_memory=True)

    # find lr fast ai
    criterion = torch.nn.BCEWithLogitsLoss()
    lr_finder = LRFinder(model, opt, criterion, device="cuda")
    # lr_finder.range_test(trainloader, val_loader=evalloader, end_lr=1, num_iter=10, step_mode="exp")
    lr_finder.range_test(trainloader, end_lr=100, num_iter=100, step_mode="exp")

    # plot graph fast ai
    skip_start = 6
    skip_end = 3
    lrs = lr_finder.history["lr"]
    losses = lr_finder.history["loss"]
    grad_norm = lr_finder.history["grad_norm"]
    # ind = grad_norm.index(min(grad_norm))
    # opt_lr = lrs[ind]
    # print('LR with min grad_norm =', opt_lr)

    lrs = lrs[skip_start:-skip_end]
    losses = losses[skip_start:-skip_end]

    fig = plt.figure(figsize=(12, 9))
    plt.plot(lrs, losses)
    plt.xscale("log")
    plt.xlabel("Learning rate")
    plt.ylabel("Loss")
    train_writer.add_figure('loss_vs_lr', fig)

    lr_finder.reset()

    # fixed_lr = 1e-3
    fixed_lr = 3e-4
    opt = torch.optim.Adam(model.parameters(), lr=fixed_lr)

    # # new
    # lr = 1e-3
    # eta_min = 1e-5
    # t_max = 10
    # opt = torch.optim.Adam(model.parameters(), lr=lr)
    # scheduler = CosineAnnealingLR(opt, T_max=t_max, eta_min=eta_min)
    # # new

    # one cycle for 5 epochs
    # scheduler = CosineAnnealingLR(opt, 519*4, eta_min=1e-4)
    scheduler = CosineAnnealingLR(opt, args.epochs)
    # scheduler = CosineAnnealingLR(opt, 519, eta_min=1e-5)
    # scheduler = StepLR(opt, step_size=3, gamma=0.1)

    state_list = []
    for epoch in range(args.epochs):
        # t = epoch / args.epochs
        # lr = np.exp((1 - t) * np.log(lr_begin) + t * np.log(lr_end))
        # set the lr for all parameters
        trainer.train_epoch(model, opt, trainloader, fixed_lr, scheduler)
        # trainer.train_epoch(model, opt, trainloader, 3e-4, scheduler)
        # trainer.train_epoch(model, opt, trainloader, 9.0451e-4, scheduler)
        metrics = trainer.eval_epoch(model, evalloader)

        state = dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=opt.state_dict(),
            loss=metrics['loss'],
            lwlrap=metrics['lwlrap'],
            global_step=trainer.global_step,
        )
        state_copy = copy.deepcopy(state)
        state_list.append(state_copy)
        export_path = os.path.join(experiment_path, 'last.pth')
        torch.save(state, export_path)

    # save the best path
    best_export_path = os.path.join(experiment_path, 'best.pth')
    max_lwlrap = 0
    max_lwlrap_ind = 0
    for i in range(args.epochs):
        if state_list[i]['lwlrap'] > max_lwlrap:
            max_lwlrap = state_list[i]['lwlrap']
            max_lwlrap_ind = i
    best_state = state_list[max_lwlrap_ind]
    torch.save(best_state, best_export_path)
    train_loader = DataLoader(train_ds, batch_size=batch_size,
                              sampler=BalanceClassSampler(labels=train_ds.get_labels(), mode="downsampling"),
                              shuffle=False, num_workers=4)
else:
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)

plist = [
    {'params': model.backbone.parameters(), 'lr': learning_rate / 50},
    {'params': model.meta_fc.parameters(), 'lr': learning_rate},
    # {'params': model.metric_classify.parameters(), 'lr': learning_rate},
]
optimizer = optim.Adam(plist, lr=learning_rate)
# lr_reduce_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=patience, verbose=True, threshold=1e-4, threshold_mode='rel', cooldown=0, min_lr=1e-7, eps=1e-08)
# cyclic_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=learning_rate, max_lr=10*learning_rate, step_size_up=2000, step_size_down=2000, mode='triangular', gamma=1.0, scale_fn=None, scale_mode='cycle', cycle_momentum=False, base_momentum=0.8, max_momentum=0.9, last_epoch=-1)

criterion = criterion_margin_focal_binary_cross_entropy

if load_model:
    tmp = torch.load(os.path.join(model_dir, model_name + '_loss.pth'))
    model.load_state_dict(tmp['model'])
    # optimizer.load_state_dict(tmp['optim'])
    # lr_reduce_scheduler.load_state_dict(tmp['scheduler'])
    # cyclic_scheduler.load_state_dict(tmp['cyclic_scheduler'])
    # amp.load_state_dict(tmp['amp'])
    prev_epoch_num = tmp['epoch']
    best_valid_loss = tmp['best_loss']
    del tmp
    print('Model Loaded!')

# model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
lr_finder.range_test(train_loader, end_lr=100, num_iter=500, accumulation_steps=accum_step)
lr_finder.plot()  # to inspect the loss-learning rate graph
xval[cols_to_scale] = ss.transform(xval[cols_to_scale])
yval = yval.values
xtrain, xval = xtrain.values, xval.values

# %%
nn = keras.Sequential([
    keras.layers.Dense(units=15, activation='relu'),
    keras.layers.Dense(units=15, activation='relu'),
    keras.layers.Dense(units=1, activation='linear'),
])
nn.compile(keras.optimizers.SGD(lr=0.001), 'MAE', metrics=['MAE'])
nn.build((None, xtrain.shape[1]))

# Find optimal learning rate. Use the one with the steepest descent of loss (not minimum)
lrf = LRFinder(0.01, 1)
nn.fit(xtrain, ytrain, validation_data=(xval, yval),
       epochs=5, batch_size=32, callbacks=[lrf])

# %%
from CLRCallback import CyclicLR

clr = CyclicLR((10**-1) / 3, 10**-1)
h = nn.fit(xtrain, ytrain, validation_data=(xval, yval),
           epochs=25, batch_size=32,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Finetuning the convnet
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)
model = model.to(device)

criterion = nn.CrossEntropyLoss()

# Select a small learning rate for the start
optimizer_ft = optim.SGD(model.parameters(), lr=1e-5, momentum=0.9)

lr_finder = LRFinder(model, optimizer_ft, criterion, device="cuda")

# Using the train loss
lr_finder.range_test(dataloaders['train'], end_lr=100, num_iter=1000, step_mode='exp')
lr_finder.plot()

# Using the validation loss
lr_finder.reset()
lr_finder.range_test(dataloaders['train'], val_loader=dataloaders['val'],
                     end_lr=100, num_iter=200, step_mode='exp')
lr_finder.plot(skip_end=0)
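# A hedged sketch of the usual follow-up (the chosen value below is
# illustrative, not taken from any of the runs above): after reading the
# steepest-descent point off the plot, reset the finder so the model and
# optimizer return to their initial state, then rebuild the optimizer with
# the chosen learning rate for the real training run.
chosen_lr = 3e-3  # assumption: value read off the loss vs. learning-rate curve
lr_finder.reset()
optimizer_ft = optim.SGD(model.parameters(), lr=chosen_lr, momentum=0.9)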