def train_and_eval_IS(csv_fp, image_fp, label_colname, eval_key, dataset_name,
                      group_key, all_group_colnames=[], data_dir=DATA_DIR,
                      num_workers=64, seed=None, num_classes=2, num_epochs=20,
                      return_model=False, model_name='resnet18', **sgd_params):
    print("Training and evaluating IS")
    # set up the dataloaders
    dataloaders = get_data_loaders(data_dir=data_dir,
                                   csv_fp=csv_fp,
                                   image_fp=image_fp,
                                   label_colname=label_colname,
                                   eval_key=eval_key,
                                   dataset_name=dataset_name,
                                   all_group_colnames=all_group_colnames,
                                   this_group_key=group_key,
                                   sample_by_groups=True,
                                   weight_to_eval_set_distribution=True,
                                   num_workers=num_workers)
    return train_and_eval(dataloaders, eval_key, seed, num_classes, num_epochs,
                          return_model, model_name, **sgd_params)
def main():
    # Constants
    epochs = 100
    train_batchsize = 128
    valid_batchsize = 4
    log_interval = 50
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Components needed for training
    model = Net(num_class=10)
    train_loader, valid_loader = get_data_loaders(train_batchsize, valid_batchsize)
    criterion = F.nll_loss
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    log_writer = SummaryWriter('./log')

    # Start training
    train(epochs=epochs, model=model, train_loader=train_loader,
          valid_loader=valid_loader, criterion=criterion, optimizer=optimizer,
          writer=log_writer, device=device, log_interval=log_interval)

    # Save the model
    torch.save(model.state_dict(), './checkpoints/final_weights.pt')
    log_writer.close()
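# A minimal sketch of the train() helper assumed above: a standard epoch
# loop that logs the running loss to the SummaryWriter every `log_interval`
# steps. Names and behavior are assumptions, not the original implementation.
def train(epochs, model, train_loader, valid_loader, criterion,
          optimizer, writer, device, log_interval):
    model.to(device)
    step = 0
    for epoch in range(epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            step += 1
            if step % log_interval == 0:
                writer.add_scalar('train/loss', loss.item(), step)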
def learning_rate_validation(num_of_epochs, dataset_name, learning_rate_list,
                             batch_size, model_type, num_of_topics, hidden_size,
                             topic_hidden_size, drop_out_prob):
    result = dict()
    dump_file_name = ('./val_results/learning_rate_valid_result_'
                      + model_type + '_' + dataset_name)
    train_loader, validation_loader, test_loader = get_data_loaders(
        0.1, dataset_name=dataset_name)
    for learning_rate in learning_rate_list:
        net = Net(num_of_epochs, train_loader, test_loader, validation_loader,
                  learning_rate, model_type, early_stopping_mode,
                  early_stopping_min_delta, early_stopping_patience,
                  num_of_topics=num_of_topics, hidden_size=hidden_size,
                  input_size=300, topic_hidden_size=topic_hidden_size,
                  drop_out_prob=drop_out_prob)
        gc.collect()
        train_loss, validation_loss = net.train(batch_size=batch_size,
                                                validate=True)
        result[learning_rate] = [train_loss, validation_loss]
        # dump intermediate results after each learning rate
        with open(dump_file_name, 'wb') as fp:
            pickle.dump(result, fp)
def main():
    # setup the device for running
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader = dataset.get_data_loaders(100)
    classes = val_loader.dataset.classes

    # load model and set to evaluation mode
    model = load_model('ResNet')
    model.to(device)
    model.eval()

    paths = load_imagepaths_from_folder('data/test/999/')
    f = open("OneMoreSecond.txt", "w")  # opens the output file for writing
    for path in paths:
        # load the image
        image = imread(path)
        image = base_transform(image)
        image = image.view(-1, 3, 128, 128)
        image = image.to(device)
        # run the forward pass
        prediction = model(image)
        prediction = prediction.to('cpu')
        _, cls = torch.topk(prediction, dim=1, k=5)
        output = path
        for i in cls.data[0]:
            output = output + " " + str(classes[i.item()])
        a = output.split("/")
        output = a[1] + "/" + a[3]
        print(output)
        f.write(output + "\n")
        # os.system("echo %s > text1/output_file.txt" % output)
    f.close()
def run():
    # Parameters
    num_epochs = 10
    output_period = 100
    batch_size = 100

    # setup the device for running
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = resnet_18()
    model = model.to(device)

    train_loader, val_loader = dataset.get_data_loaders(batch_size)
    num_train_batches = len(train_loader)

    criterion = nn.CrossEntropyLoss().to(device)
    # TODO: optimizer is currently unoptimized
    # there's a lot of room for improvement/different optimizers
    optimizer = optim.SGD(model.parameters(), lr=1e-3)

    epoch = 1
    while epoch <= num_epochs:
        running_loss = 0.0
        for param_group in optimizer.param_groups:
            print('Current learning rate: ' + str(param_group['lr']))
        model.train()

        for batch_num, (inputs, labels) in enumerate(train_loader, 1):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if batch_num % output_period == 0:
                print('[%d:%.2f] loss: %.3f' % (
                    epoch, batch_num * 1.0 / num_train_batches,
                    running_loss / output_period))
                running_loss = 0.0
                gc.collect()

        gc.collect()
        # save after every epoch
        torch.save(model.state_dict(), "models/model_sgd.%d" % epoch)

        # TODO: Calculate classification error and Top-5 Error
        # on training and validation datasets here

        gc.collect()
        epoch += 1
    return
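# A hedged sketch of the evaluation the TODO above calls for: top-1 and
# top-5 error over a loader. This is one way the gap might be filled in,
# not the authors' actual code.
def top1_top5_error(model, loader, device):
    model.eval()
    top1_correct, top5_correct, total = 0, 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, top5 = outputs.topk(5, dim=1)
            match = top5.eq(labels.view(-1, 1))
            top1_correct += match[:, 0].sum().item()
            top5_correct += match.any(dim=1).sum().item()
            total += labels.size(0)
    return 1.0 - top1_correct / total, 1.0 - top5_correct / total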
def validate(learning_rate, batch_size, dataset_name, model_type):
    train_loader, validation_loader, test_loader = get_data_loaders(
        validation_percentage, dataset_name)
    net = Net(250, train_loader, test_loader, validation_loader, learning_rate,
              model_type=model_type)
    train_loss, test_loss = net.train(batch_size=batch_size, validate=True)
def run():
    # Parameters
    num_epochs = 10
    output_period = 100
    batch_size = 100

    # setup the device for running
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = resnet_18()
    model = model.to(device)

    train_loader, val_loader = dataset.get_data_loaders(batch_size)
    num_train_batches = len(train_loader)

    criterion = nn.CrossEntropyLoss().to(device)
    # optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-2)
    # optimizer = optim.Adagrad(model.parameters(), lr=1e-3, weight_decay=1e-2)
    # optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-3)
    # optimizer = optim.SGD(model.parameters(), lr=1e-3)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-2)

    trainAccLog = []
    valAccLog = []

    epoch = 1
    while epoch <= num_epochs:
        running_loss = 0.0
        for param_group in optimizer.param_groups:
            print('Current learning rate: ' + str(param_group['lr']))
        model.train()

        for batch_num, (inputs, labels) in enumerate(train_loader, 1):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if batch_num % output_period == 0:
                print('[%d:%.2f] loss: %.3f' % (
                    epoch, batch_num * 1.0 / num_train_batches,
                    running_loss / output_period))
                running_loss = 0.0
                gc.collect()

        gc.collect()
        # save after every epoch
        torch.save(model.state_dict(), "models/model.%d" % epoch)
        epoch += 1
def run(num_epochs, out_period, batch_size, model):
    # setup the device for running
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    train_loader, val_loader = dataset.get_data_loaders(batch_size)

    if len(sys.argv) > 1:
        # write top-5 predictions for the val set to a file
        epoch = sys.argv[1]  # epoch number of the checkpoint to load
        print("loading models/model.%s" % epoch)
        model.load_state_dict(torch.load("models/model.%s" % epoch))
        model.eval()

        # Opens file to write results to, will overwrite existing files
        out_file = open("resultsVAL.txt", "w")
        total = 0
        for (inputs, labels) in tqdm(val_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            top5 = torch.topk(outputs, 5)[1]
            # path = "test/" + '{0:08d}'.format(i) + ".jpg"
            for i in range(len(inputs)):
                filename = val_loader.dataset.samples[total][0]
                # formats string in the structure of "val/39/00000132.jpg 1 3 5 6 9"
                path_top5 = filename
                for j in top5[i]:
                    path_top5 = path_top5 + " " + str(j.item())
                out_file.write(path_top5 + "\n")
                # print(labels[i], "TOP5:", top5[i])
                # print(path_top5)
                total += 1
            gc.collect()
        # remove final newline
        out_file.seek(out_file.tell() - 2)
        out_file.truncate()
    else:
        # print accuracy for all epochs on the val set
        epoch = 1
        while epoch <= num_epochs:
            print("loading models/model.%s" % epoch)
            model.load_state_dict(torch.load("models/model.%s" % epoch))
            model.eval()
            # Calculate classification error and Top-5 Error
            # on training and validation datasets here
            printAccuracy(val_loader, device, model, "VALSET", epoch)
            gc.collect()
            epoch += 1
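# printAccuracy is called above (and in a later snippet) but not defined
# here. A minimal self-contained sketch, assuming it reports top-1 and
# top-5 error for the given loader; the signature matches the call sites.
def printAccuracy(loader, device, model, tag, epoch):
    top1, top5, total = 0, 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = torch.topk(model(inputs), 5, dim=1)[1]
            match = preds.eq(labels.view(-1, 1))
            top1 += match[:, 0].sum().item()
            top5 += match.any(dim=1).sum().item()
            total += labels.size(0)
    print('%s epoch %s: top-1 error %.2f%%, top-5 error %.2f%%' % (
        tag, epoch, 100.0 * (1 - top1 / total), 100.0 * (1 - top5 / total)))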
def train_and_eval_GDRO(csv_fp, image_fp, label_colname, eval_key, dataset_name,
                        group_key, gdro_params, all_group_colnames=[],
                        data_dir=DATA_DIR, num_workers=64, seed=None,
                        num_classes=2, num_epochs=20, return_model=False,
                        model_name='resnet18', **sgd_params):
    print("Training and Evaluating GDRO")
    # set up the dataloaders - GDRO needs sample_by_groups = True
    dataloaders = get_data_loaders(data_dir=data_dir,
                                   csv_fp=csv_fp,
                                   image_fp=image_fp,
                                   label_colname=label_colname,
                                   eval_key=eval_key,
                                   dataset_name=dataset_name,
                                   all_group_colnames=all_group_colnames,
                                   this_group_key=group_key,
                                   sample_by_groups=True,
                                   weight_to_eval_set_distribution=False,
                                   num_workers=num_workers)
    # update gdro params with dataset-specific parameters
    dataloader_train = dataloaders[0]
    gdro_params['group_key'] = group_key
    gdro_params['num_groups'] = len(dataloader_train.dataset.group_counts)
    gdro_params['group_sizes'] = dataloader_train.dataset.group_counts
    return train_and_eval(dataloaders, eval_key, seed, num_classes, num_epochs,
                          return_model, model_name, gdro=True,
                          gdro_params=gdro_params, **sgd_params)
def main():
    config = get_config()
    if config['train'] and not config['resume']:
        for key in ['folder_log', 'folder_out']:
            if os.path.exists(config[key]):
                raise FileExistsError(config[key])
            os.makedirs(config[key])
        with open(os.path.join(config['folder_out'], 'config.yaml'), 'w') as f:
            yaml.safe_dump(config, f)
    data_loaders, image_shape = get_data_loaders(config)
    config['image_shape'] = image_shape
    net = get_model(config)
    if config['train']:
        train_model(config, data_loaders, net)
    test_model(config, data_loaders, net)
    return
def main(args, seed):
    torch.random.manual_seed(seed)
    train_loader, val_loader, shape = get_data_loaders(
        config.Training.batch_size,
        start_idx=args.start_idx,
        test_batch_size=args.horizon,
    )
    n, d, t = shape
    model = models.ConvNet(d, seq_len=t)
    if args.ckpt is not None:
        state_dict = torch.load(args.ckpt)
        model.load_state_dict(state_dict)
    out = ar(val_loader, model)
    plot_output(*out)
    plt.show()
    plt.close()
def main():
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    config = get_config()
    if config['train'] and not config['resume']:
        for key in ['folder_log', 'folder_out']:
            if os.path.exists(config[key]):
                raise FileExistsError(config[key])
            os.makedirs(config[key])
        with open(os.path.join(config['folder_out'], 'config.yaml'), 'w') as f:
            yaml.safe_dump(config, f)
    strategy = tf.distribute.MirroredStrategy()
    data_loaders = get_data_loaders(strategy, config)
    net = get_model(config)
    if config['train']:
        train_model(strategy, config, data_loaders, net)
    test_model(strategy, config, data_loaders, net)
    return
def main():
    config = get_config()
    if config['train'] and not config['resume']:
        for key in ['folder_log', 'folder_out']:
            if os.path.exists(config[key]):
                raise FileExistsError(config[key])
            os.makedirs(config[key])
        with open(os.path.join(config['folder_out'], 'config.yaml'), 'w') as f:
            yaml.safe_dump(config, f)
    data_loaders, image_shape = get_data_loaders(config)
    config['image_shape'] = image_shape
    if 'crop_shape' not in config:
        config['crop_shape'] = [
            val if idx == 0 else val // 2
            for idx, val in enumerate(image_shape)
        ]
    net = get_model(config)
    net_gen = None if config['path_pretrain'] is None else get_model(config)
    if config['train']:
        train_model(config, data_loaders, net, net_gen)
    test_model(config, data_loaders, net)
    return
def test(learning_rate, batch_size, dataset_name, model_type,
         early_stopping_mode, early_stopping_min_delta, early_stopping_patience):
    train_loader, validation_loader, test_loader, embeddings = get_data_loaders(
        validation_percentage, dataset_name)
    net = Net(300, train_loader, test_loader, validation_loader, learning_rate,
              model_type, early_stopping_mode, early_stopping_min_delta,
              early_stopping_patience, input_size=input_size,
              num_of_topics=num_of_topics, hidden_size=hidden_size,
              topic_hidden_size=topic_hidden_size, drop_out_prob=drop_out_prob,
              embeddings=embeddings)
    result = net.train(batch_size=batch_size, validate=False)
    print(result)
def __init__(self, args):
    # Training configurations
    self.method = args.method
    self.dataset = args.dataset
    self.dim = args.dim
    self.lr = args.lr
    self.batch_size = args.batch_size
    self.val_batch_size = self.batch_size // 2
    self.iteration = args.iteration
    self.evaluation = args.evaluation
    self.show_iter = 1000
    self.update_epoch = 10
    self.balanced = args.balanced
    self.instances = args.instances
    self.cm = args.cm
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.file_name = '{}_{}_{}'.format(self.method, self.dataset, self.lr)
    print('========================================')
    print(json.dumps(vars(args), indent=2))
    print(self.file_name)

    # Paths
    self.root_dir = os.path.join('/', 'home', 'lyz')
    self.data_dir = os.path.join(self.root_dir, 'datasets', self.dataset)
    self.model_dir = self._get_path('./trained_model')
    self.code_dir = self._get_path(os.path.join('codes', self.dataset))
    self.fig_dir = self._get_path(
        os.path.join('fig', self.dataset, self.file_name))

    # Preparing data
    self.transforms = get_transform()
    self.datasets = get_datasets(dataset=self.dataset,
                                 data_dir=self.data_dir,
                                 transforms=self.transforms)
    self.cm_sampler = ClassMiningSampler(self.datasets['train'],
                                         batch_size=self.batch_size,
                                         n_instance=self.instances,
                                         balanced=self.balanced)
    self.data_loaders = get_data_loaders(
        datasets=self.datasets,
        batch_size=self.batch_size,
        val_batch_size=self.val_batch_size,
        n_instance=self.instances,
        balanced=self.balanced,
        cm=self.cm_sampler if self.cm else None)
    self.dataset_sizes = {x: len(self.datasets[x]) for x in ['train', 'test']}

    # Set up model
    self.model = get_model(self.device, self.dim)
    self.optimizer = optim.SGD(
        [{'params': self.model.google_net.parameters()},
         {'params': self.model.linear.parameters(),
          'lr': self.lr * 10, 'momentum': 0.9}],
        lr=self.lr, momentum=0.9)
    self.scheduler = lr_scheduler.StepLR(self.optimizer, step_size=2000,
                                         gamma=0.5)
n_class = 9
plt.ion()  # interactive mode

# def run_cv(img_size, pre_trained, target):
if __name__ == '__main__':
    image_files = get_img_files(data_dir)
    kf = KFold(n_splits=N_CV, random_state=RANDOM_STATE, shuffle=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for n, (train_idx, val_idx) in enumerate(kf.split(image_files)):
        # ---- Prepare Data ----
        train_files = image_files[train_idx]
        val_files = image_files[val_idx]
        data_loaders = get_data_loaders(train_files, val_files, img_size)
        dataset_sizes = [len(train_files), len(val_files)]
        print('dataset_sizes:', dataset_sizes)

        inputs, classes = next(iter(data_loaders[0]))
        # out = torchvision.utils.make_grid(inputs)
        # imshow(out, title=[x for x in classes])

        # ---- Prepare Model ----
        model_mobnet2 = MobileNetV2()
        model_mobnet2.load_state_dict(torch.load(pre_trained_mobnet2))
        model_mobnet2.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(model_mobnet2.last_channel, n_class),
        )
        model_mobnet2 = model_mobnet2.to(device)

        criterion = nn.CrossEntropyLoss()
        # Observe that all parameters are being optimized
def run(config):
    # Update the config dict as necessary
    # This is for convenience, to add settings derived from the user-specified
    # configuration into the config-dict (e.g. inferring the number of classes
    # and size of the images from the dataset, passing in a pytorch object
    # for the activation specified as a string)
    config['resolution'] = 64
    config['n_classes'] = 120
    config['G_activation'] = utils.activation_dict[config['G_nl']]
    config['D_activation'] = utils.activation_dict[config['D_nl']]
    # By default, skip init if resuming training.
    if config['resume']:
        print('Skipping initialization for training resumption...')
        config['skip_init'] = True
    config = utils.update_config_roots(config)
    device = 'cuda'

    # Seed RNG
    utils.seed_rng(config['seed'])
    # Prepare root folders if necessary
    utils.prepare_root(config)
    # Setup cudnn.benchmark for free speed
    torch.backends.cudnn.benchmark = True

    experiment_name = (config['experiment_name'] if config['experiment_name']
                       else 'generative_dog_images')
    print('Experiment name is %s' % experiment_name)

    G = BigGAN.Generator(**config).to(device)
    D = BigGAN.Discriminator(**config).to(device)

    # If using EMA, prepare it
    if config['ema']:
        print('Preparing EMA for G with decay of {}'.format(config['ema_decay']))
        G_ema = BigGAN.Generator(**{**config, 'skip_init': True,
                                    'no_optim': True}).to(device)
        ema = utils.ema(G, G_ema, config['ema_decay'], config['ema_start'])
    else:
        G_ema, ema = None, None

    GD = BigGAN.G_D(G, D)
    print(G)
    print(D)
    print('Number of params in G: {} D: {}'.format(
        *[sum([p.data.nelement() for p in net.parameters()]) for net in [G, D]]))

    # Prepare state dict, which holds things like epoch # and itr #
    state_dict = {'itr': 0, 'epoch': 0, 'save_num': 0, 'config': config}

    # If loading from a pre-trained model, load weights
    if config['resume']:
        print('Loading weights...')
        utils.load_weights(
            G, D, state_dict, config['weights_root'], experiment_name,
            config['load_weights'] if config['load_weights'] else None,
            G_ema if config['ema'] else None)

    # Prepare data; the Discriminator's batch size is all that needs to be passed
    # to the dataloader, as G doesn't require dataloading.
    # Note that at every loader iteration we pass in enough data to complete
    # a full D iteration (regardless of number of D steps and accumulations)
    D_batch_size = (config['batch_size'] * config['num_D_steps']
                    * config['num_D_accumulations'])
    loaders = dataset.get_data_loaders(data_root=config['data_root'],
                                       label_root=config['label_root'],
                                       batch_size=D_batch_size,
                                       num_workers=config['num_workers'],
                                       shuffle=config['shuffle'],
                                       pin_memory=config['pin_memory'],
                                       drop_last=True)

    # Prepare noise and randomly sampled label arrays
    # Allow for different batch sizes in G
    G_batch_size = max(config['G_batch_size'], config['batch_size'])
    z_, y_ = utils.prepare_z_y(G_batch_size, G.dim_z, config['n_classes'],
                               device=device, fp16=config['G_fp16'])
    # Prepare a fixed z & y to see individual sample evolution throughout training
    fixed_z, fixed_y = utils.prepare_z_y(G_batch_size, G.dim_z,
                                         config['n_classes'],
                                         device=device, fp16=config['G_fp16'])
    fixed_z.sample_()
    fixed_y.sample_()

    # Loaders are loaded, prepare the training function
    train = train_fns.create_train_fn(G, D, GD, z_, y_, ema, state_dict, config)

    print('Beginning training at epoch %d...' % state_dict['epoch'])
    start_time = time.perf_counter()
    total_iters = config['num_epochs'] * len(loaders[0])
    # Train for specified number of epochs, although we mostly track G iterations.
    for epoch in range(state_dict['epoch'], config['num_epochs']):
        for i, (x, y) in enumerate(loaders[0]):
            # Increment the iteration counter
            state_dict['itr'] += 1
            # Make sure G and D are in training mode, just in case they got set to eval
            # For D, which typically doesn't have BN, this shouldn't matter much.
            G.train()
            D.train()
            if config['ema']:
                G_ema.train()
            x, y = x.to(device), y.to(device)
            metrics = train(x, y)

            if not (state_dict['itr'] % config['log_interval']):
                curr_time = time.perf_counter()
                curr_time_str = datetime.datetime.fromtimestamp(
                    curr_time).strftime('%H:%M:%S')
                elapsed = str(datetime.timedelta(seconds=(curr_time - start_time)))
                log = ("[{}] [{}] [{} / {}] Ep {}, ".format(
                    curr_time_str, elapsed, state_dict['itr'], total_iters, epoch)
                    + ', '.join(['%s : %+4.3f' % (key, metrics[key])
                                 for key in metrics]))
                print(log)

            # Save weights and copies as configured at specified interval
            if not (state_dict['itr'] % config['save_every']):
                if config['G_eval_mode']:
                    print('Switching G to eval mode...')
                    G.eval()
                    # if config['ema']:
                    #     G_ema.eval()
                train_fns.save_and_sample(G, D, G_ema, z_, y_, fixed_z, fixed_y,
                                          state_dict, config, experiment_name)
        # Increment epoch counter at end of epoch
        state_dict['epoch'] += 1
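# For reference, a generic sketch of what an EMA helper like utils.ema does
# conceptually (an assumption, not the repo's actual implementation): after
# each G update, blend G's weights into the shadow copy G_ema.
def ema_update(source, target, decay=0.9999):
    with torch.no_grad():
        for s_p, t_p in zip(source.parameters(), target.parameters()):
            t_p.copy_(decay * t_p + (1.0 - decay) * s_p)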
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Load model.
model = load_fcn(num_classes=1)
model.to(device)

# Load optimizer and loss
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=config["lr"])

## Set up Data Loaders.
epochs = config["epochs"]
train_loader, val_loader, _ = get_data_loaders(
    args.id_path, args.im_path, args.label_path,
    batch_size=config["batch_size"])

## Begin training
best_val_iou = -np.inf
for epoch in range(epochs):
    print(f"Starting epoch {epoch+1}:")

    ## Metrics
    train_loss, val_loss = 0, 0
    train_iou, val_iou = 0, 0
    train_prec, val_prec = 0, 0
    train_recall, val_recall = 0, 0

    ## Training.
def run():
    # Parameters
    num_epochs = 10
    output_period = 100
    batch_size = 100

    # setup the device for running
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = resnet_18()
    model = model.to(device)

    train_loader, val_loader = dataset.get_data_loaders(batch_size)
    num_train_batches = len(train_loader)
    num_val_batches = len(val_loader)

    criterion = nn.CrossEntropyLoss().to(device)
    # TODO: optimizer is currently unoptimized
    # there's a lot of room for improvement/different optimizers
    # (e.g. stochastic gradient descent variants, Adagrad, Adadelta)
    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    # optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.05)

    epoch = 1
    with open(cwd + "/resnet_18-" + datestr + ".txt", "w") as outputfile:
        while epoch <= num_epochs:
            running_loss = 0.0
            epoch_train_loss = 0.0
            epoch_val_loss = 0.0
            correctInTrainEpoch = 0
            top5InTrainEpoch = 0
            epoch_samples = 0
            val_samples = 0
            for param_group in optimizer.param_groups:
                print('Current learning rate: ' + str(param_group['lr']))
            model.train()

            for batch_num, (inputs, labels) in enumerate(train_loader, 1):
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
                n = outputs.size(0)
                correctInTrainEpoch += acc1[0]
                top5InTrainEpoch += acc5[0]
                epoch_samples += n

                if batch_num % output_period == 0:
                    print('[%d:%.2f] loss: %.3f' % (
                        epoch, batch_num * 1.0 / num_train_batches,
                        running_loss / output_period))
                    epoch_train_loss += running_loss
                    running_loss = 0.0
                    gc.collect()

            gc.collect()
            # save after every epoch
            torch.save(model.state_dict(), "models/model.%d" % epoch)

            # Calculate classification error and Top-5 Error
            # on training and validation datasets here
            correctInValEpoch = 0
            top5InValEpoch = 0
            model.eval()
            with torch.no_grad():
                for batch_num, (inputs, labels) in enumerate(val_loader):
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
                    n = outputs.size(0)
                    correctInValEpoch += acc1[0]
                    top5InValEpoch += acc5[0]
                    epoch_val_loss += loss.item()
                    val_samples += n

            accuracyString = (
                'Epoch %d Train: T1 %.2f, T5 %.2f, Loss %.2f \n'
                'Epoch %d Val: V1 %.2f, V5 %.2f, Loss %.2f\n' % (
                    epoch,
                    100.0 - correctInTrainEpoch / epoch_samples * 100,
                    100.0 - top5InTrainEpoch / epoch_samples * 100,
                    epoch_train_loss / num_train_batches,
                    epoch,
                    100.0 - correctInValEpoch / val_samples * 100,
                    100.0 - top5InValEpoch / val_samples * 100,
                    epoch_val_loss / num_val_batches,
                ))
            print(accuracyString)
            outputfile.write(accuracyString)
            outputfile.write("\n")
            gc.collect()
            epoch += 1
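# accuracy() is used above but not defined in this snippet. A common sketch
# of such a helper, following the widely used torchvision/ImageNet-example
# pattern (assumed, not the authors' code): it returns the top-k precision
# percentage for each requested k on one batch.
def accuracy(output, target, topk=(1,)):
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        # top-maxk predicted class indices, shape (maxk, batch)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res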
def main(args):
    # Store name of experiment
    exp_name = args.exp_name
    exp_name = '{}_r{}_p{}_n{}_i{}_k{}'.format(exp_name, args.rho,
                                               args.pos_reward, args.neg_reward,
                                               args.class_imbalance,
                                               args.kldiv_lambda)

    # Create a directory for the output path
    args.output_path = os.path.join(args.output_path, args.exp_name)
    os.makedirs(args.output_path, exist_ok=True)
    utils.LOG_FILE = os.path.join(args.output_path, 'log.txt')
    LEARNING_PROFILE_FILE = os.path.join(args.output_path, 'learning_curve.txt')
    lpf = open(LEARNING_PROFILE_FILE, 'a')
    args.lpf = lpf

    # Set logging
    logging.basicConfig(filename=utils.LOG_FILE, filemode='a',
                        format='%(levelname)s :: %(asctime)s - %(message)s',
                        level=args.log_level, datefmt='%d/%m/%Y %I:%M:%S %p')
    console = logging.StreamHandler()
    console.setLevel(args.log_level)
    formatter = logging.Formatter('%(levelname)s :: %(asctime)s - %(message)s',
                                  datefmt='%d/%m/%Y %I:%M:%S %p')
    console.setFormatter(formatter)
    logging.getLogger().addHandler(console)
    logging.info('Beginning code for experiment {} and storing stuff in {}'.format(
        exp_name, args.output_path))
    logging.info('Loaded arguments as \n{}'.format(str(pprint.pformat(args))))

    # Begin of main code
    train_loader, val_loader, labelled_train_loader = dataset.get_data_loaders(args)
    model = models.select_model(args)
    my_eval_fn = compute.get_evaluation_function(args)

    if args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              momentum=args.momentum, lr=args.lr,
                              weight_decay=args.decay)
    else:
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr, weight_decay=args.decay)

    checkpoint_file = os.path.join(args.output_path,
                                   '{}_checkpoint.pth'.format(exp_name))
    best_checkpoint_file = os.path.join(args.output_path,
                                        '{}_best_checkpoint.pth'.format(exp_name))
    logging.info('Saving checkpoints at {} and best checkpoint at: {}'.format(
        checkpoint_file, best_checkpoint_file))

    start_epoch = 0
    best_score = -9999999

    # Load checkpoint if present in input arguments
    if args.checkpoint != '':
        logging.info('Starting from checkpoint: {}'.format(args.checkpoint))
        cp = torch.load(args.checkpoint)
        start_epoch = cp['epoch'] + 1
        model.load_state_dict(cp['model'])
        # optimizer.load_state_dict(cp['optimizer'])  TODO: - Why not do this?
        best_score = cp['best_score']

    for param_group in optimizer.param_groups:
        param_group['lr'] = args.lr
        param_group['weight_decay'] = args.decay

    num_epochs = args.num_epochs
    logging.info('Beginning train/validate cycle')

    time1 = time.time()
    if val_loader is not None:
        record, metric_idx, headers = compute.compute(start_epoch - 1, model,
                                                      val_loader, optimizer,
                                                      'eval', eval_fn=my_eval_fn,
                                                      args=args)
        if args.log_eval is not None:
            handler = open(args.log_eval, "a")
            print(','.join([str(round(x, 6)) if isinstance(x, float) else str(x)
                            for x in record]), file=handler)
            handler.close()
    print("Time taken:", time.time() - time1)

    if args.only_eval:
        logging.info('Ran only eval mode, now exiting')
        exit(0)

    # Start TRAINING
    for epoch in range(start_epoch, num_epochs):
        logging.info('Beginning epoch {}'.format(epoch))
        if labelled_train_loader is not None:
            record, metric_idx, _ = compute.compute(epoch, model,
                                                    labelled_train_loader,
                                                    optimizer, 'train_sup',
                                                    eval_fn=my_eval_fn, args=args)
        if train_loader is not None:
            record, metric_idx, _ = compute.compute(
                epoch, model, train_loader, optimizer, 'train_un',
                eval_fn=my_eval_fn, args=args,
                labelled_train_loader=labelled_train_loader)
        if val_loader is not None:
            record, metric_idx, _ = compute.compute(epoch, model, val_loader,
                                                    None, 'eval',
                                                    eval_fn=my_eval_fn, args=args)
        is_best = False
        logging.info('Best score: {}, This score: {}'.format(
            best_score, record[metric_idx]))
        if record[metric_idx] > best_score:
            best_score = record[metric_idx]
            is_best = True
        utils.save_checkpoint(
            {'epoch': epoch,
             'best_score': best_score,
             'model': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'is_best': is_best},
            epoch, is_best, checkpoint_file, best_checkpoint_file)
    args.lpf.close()
def run():
    # Parameters
    num_epochs = 10
    output_period = 100
    batch_size = 100

    # setup the device for running
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = resnet_18()
    model = model.to(device)

    train_loader, val_loader = dataset.get_data_loaders(batch_size)
    num_train_batches = len(train_loader)

    criterion = nn.CrossEntropyLoss().to(device)
    # TODO: optimizer is currently unoptimized
    # there's a lot of room for improvement/different optimizers
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.001)

    epoch = 1
    while epoch <= num_epochs:
        scheduler.step()
        running_loss = 0.0
        for param_group in optimizer.param_groups:
            print('Current learning rate: ' + str(param_group['lr']))
        model.train()

        for batch_num, (inputs, labels) in enumerate(train_loader, 1):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if batch_num % output_period == 0:
                print('[%d:%.2f] loss: %.3f' % (
                    epoch, batch_num * 1.0 / num_train_batches,
                    running_loss / output_period))
                running_loss = 0.0
                gc.collect()

        gc.collect()
        # save after every epoch
        torch.save(model.state_dict(), "models/model.%d" % epoch)

        # TODO: Calculate classification error and Top-5 Error
        # on training and validation datasets here

        gc.collect()
        epoch += 1
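# Note on the loop above: calling scheduler.step() at the top of the epoch,
# before any optimizer.step(), follows the pre-1.1 PyTorch convention; since
# PyTorch 1.1 the scheduler should be stepped after the optimizer, once per
# epoch. A sketch of the recommended ordering:
#
#     for epoch in range(num_epochs):
#         for inputs, labels in train_loader:
#             optimizer.zero_grad()
#             loss = criterion(model(inputs), labels)
#             loss.backward()
#             optimizer.step()
#         scheduler.step()  # after the epoch's optimizer steps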
def train_ignite(self):
    train_loader, validation_loader = get_data_loaders(self.config)
    writer = create_summary_writer(self.Model, train_loader, self.logs_save_dir)
    self.optimizer = Adam(self.Model.parameters(), lr=self.learning_rate,
                          betas=(0.9, 0.999))
    self.learning_rate_scheduler()
    loss = UNetCrossEntropyLoss().cuda()
    trainer = create_trainer(model=self.Model, optimizer=self.optimizer,
                             criterion=loss, device=self.device)
    evaluator = create_evaluator(self.Model,
                                 metrics={'CrossEntropy': Loss(loss),
                                          'PrecisionRecall': PrecisionRecall()},
                                 device=self.device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader),
                desc=desc.format(0))
    log_interval = 2

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)
            writer.add_scalar("training/logs", engine.state.output,
                              engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        self.scheduler.step()
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        cross_entropy_loss = metrics['CrossEntropy']
        tqdm.write(
            "Current Learning Rate:{:.10f}: Training Results - Epoch: {} "
            "Cross Entropy Loss: {:.2f}".format(
                self.optimizer.param_groups[0]['lr'], engine.state.epoch,
                cross_entropy_loss))
        writer.add_scalar("training/cross_entropy_loss", cross_entropy_loss,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        pbar.refresh()
        evaluator.run(validation_loader)
        metrics = evaluator.state.metrics
        cross_entropy_loss = metrics['CrossEntropy']
        precision_recall_loss = metrics['PrecisionRecall']
        tqdm.write(
            "Validation Results - Epoch: {} Cross Entropy Loss: {:.2f} \n"
            " Precision: {:.4f}, Recall: {:.4f}, "
            "Mean Euclidean Distance: {:.2f}".format(
                engine.state.epoch, cross_entropy_loss,
                precision_recall_loss['precision'],
                precision_recall_loss['recall'],
                precision_recall_loss['mean_euclidean_dist']))
        pbar.n = pbar.last_print_n = 0

        input = evaluator.state.batch['image']
        output = evaluator.state.output
        pred = output[0]
        mask = output[1]
        input_grid = torchvision.utils.make_grid(
            torch.stack([img.cpu() for img in input], dim=0), normalize=True)
        pred_grid = torchvision.utils.make_grid(
            torch.stack([img.cpu() for img in pred]))
        mask_grid = torchvision.utils.make_grid(
            torch.stack([img.cpu() for img in mask]))
        # torchvision.utils.save_image(pred_grid, "pred/pred_grid_" + str(engine.state.epoch) + ".png")
        # torchvision.utils.save_image(mask_grid, "pred/mask_grid_" + str(engine.state.epoch) + ".png")
        writer.add_image("Input", input_grid, engine.state.epoch)
        writer.add_image("Result", pred_grid, engine.state.epoch)
        writer.add_image("Ground Truth", mask_grid, engine.state.epoch)
        writer.add_scalar("validation/precision",
                          precision_recall_loss['precision'], engine.state.epoch)
        writer.add_scalar("validation/recall",
                          precision_recall_loss['recall'], engine.state.epoch)
        writer.add_scalar("validation/mean_euclidean_dist",
                          precision_recall_loss['mean_euclidean_dist'],
                          engine.state.epoch)
        writer.add_scalar("validation/cross_entropy_loss", cross_entropy_loss,
                          engine.state.epoch)

    checkpointer = ModelCheckpoint(self.model_save_path, 'unet_v_1_',
                                   save_interval=1, n_saved=50,
                                   require_empty=False, save_as_state_dict=True)
    # early_stopping = EarlyStopping(patience=5, score_function=self.score_function, trainer=trainer)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer,
                              {'epoch': self.Model})
    # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())
    # evaluator.add_event_handler(Events.COMPLETED, early_stopping)

    trainer.run(train_loader, max_epochs=self.epochs)
    pbar.close()
    writer.close()
# Set random seed for reproducibility
manualSeed = 999
# manualSeed = random.randint(1, 10000)  # use if you want new results
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)

parser = utils.prepare_parser()
config = vars(parser.parse_args())

loaders = dataset.get_data_loaders(
    data_root=config['data_root'],
    label_root=config['label_root'],
    batch_size=config['batch_size'],
    num_workers=config['num_workers'],
    shuffle=config['shuffle'],
    pin_memory=config['pin_memory'],
    drop_last=True,
    load_in_mem=config['load_in_mem'],
    mask_out=True,
)

image_size = IMG_SIZE
nc = 3  # number of channels in the training images
nz = config['dim_z']  # size of the latent z vector
# Size of feature maps in generator
ngf = config['G_ch']
args = parser.parse_args()
config = {}
config.update(vars(args))
args = utils.Map(config)

o2n, n2o = utils.get_template_id_maps(args.num_templates, args.exclude_t_ids)
args.o2n = o2n
args.n2o = n2o

for key in ['train_labels_path', 'val_labels_path']:
    if args[key] == 'None':
        args[key] = None

settings.set_settings(args)
train_loader, val_loader, labelled_train_loader = dataset.get_data_loaders(args)


def SIDX(template_id):
    # start index of a template's 7-column block (after 3 leading columns)
    return 3 + (template_id - 1) * 7


def EIDX(template_id):
    # end index (exclusive) of a template's 7-column block
    return 3 + (template_id - 1) * 7 + 7


# for i in val_loader.dataset.raw_data[:, SIDX(6)]:
#     print(i)
# print(val_loader.dataset.raw_data[:, SIDX(2)])
# my_score, max_score, similarity, rank, conditional_rank, mean, std
def main(args):
    # make sure that model dir exists
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    use_multi_gpu = args.multi_gpu
    gpu_index = args.gpu
    # if the given index is not available then we use index 0
    # also when using multi gpu we should specify index 0
    if gpu_index + 1 > torch.cuda.device_count() or use_multi_gpu:
        gpu_index = 0
    logging.info('using gpu cuda:{}, script PID {}'.format(gpu_index, os.getpid()))
    device = torch.device('cuda:{}'.format(gpu_index))

    # get the configuration file
    config = Config(args.config_type).create_config()
    if args.state:
        # if we provide a saved state then load config from there
        logging.info('loading config from {}'.format(args.state))
        best_state = torch.load(args.state)
        config = best_state['config']

    # sanity check to make sure old configs still work with new format
    config = backward_compatible_config(config)

    # size of input depends on sequence type, either difference or orientation
    input_size = 3
    if config['seq_type'] == 'orient':
        input_size = 4

    model = MortonNet(input_size=input_size,
                      conv_layers=config['conv_layers'],
                      rnn_layers=config['rnn_layers'],
                      hidden_size=config['hidden_size'])
    # we use MSE loss
    criterion = nn.MSELoss()
    model.to(device)

    # if using multi_gpu then convert the model to DataParallel
    if use_multi_gpu:
        model = nn.DataParallel(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    # we reduce the learning rate when validation doesn't improve after some patience
    scheduler = ReduceLROnPlateau(optimizer, mode='min',
                                  factor=config['lr_decay'],
                                  patience=config['lr_patience'], verbose=True)

    logging.info('Config {}'.format(config))

    phases = ['train', 'valid']
    dataloaders, datasets = get_data_loaders(
        root_dir=args.root_dir, phases=phases, shuffle=True,
        cluster=config['cluster'], batch_size=args.bs,
        chunk_size=config['chunk_size'], seq_len=config['seq_len'],
        random_sequence=config['random_sequence'], ratio=config['ratio'],
        seq_type=config['seq_type'])

    model_dir = generate_experiment_dir(args.model_dir, config)
    logging.info('TB logs and checkpoint will be saved in {}'.format(model_dir))

    # get TensorboardX writer
    writer = SummaryWriter(log_dir=model_dir)

    train(config=config, model=model, criterion=criterion, optimizer=optimizer,
          dataloaders=dataloaders, device=device, model_dir=model_dir,
          phases=phases, scheduler=scheduler, writer=writer,
          print_every=args.print, plot_every=args.plot, save_every=args.save)

    writer.close()
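# Note: ReduceLROnPlateau, unlike the epoch-based schedulers, must be stepped
# with the monitored quantity, presumably inside train() after each
# validation phase (assumed usage):
#
#     scheduler.step(val_loss)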
    total_f1, total_pr, total_rc = f1_score(P, G, S)
    total_loss = cum_loss / total_sample
    return total_loss, total_f1, total_pr, total_rc


if __name__ == '__main__':
    EP = 100
    SAVING_DIR = '../models/'
    tokenizer = BertTokenizer.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch',
        do_lower_case=True)
    train_loader, val_loader = get_data_loaders(
        rv_path='../data/TRAIN/Train_reviews.csv',
        lb_path='../data/TRAIN/Train_labels.csv',
        tokenizer=tokenizer,
        batch_size=12,
        val_split=0.15)
    model = OpinioNet.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch')
    model.cuda()
    optimizer = Adam(model.parameters(), lr=5e-6)
    scheduler = GradualWarmupScheduler(optimizer, total_epoch=2)
    best_val_f1 = 0
    best_val_loss = float('inf')
    for e in range(EP):
        print('Epoch [%d/%d] train:' % (e, EP))
        train_loss, train_f1, train_pr, train_rc = train_epoch(
            model, train_loader, optimizer, scheduler)
def run(num_epochs, out_period, batch_size, model):
    # setup the device for running
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    train_loader, val_loader = dataset.get_data_loaders(batch_size)
    num_train_batches = len(train_loader)

    criterion = nn.CrossEntropyLoss().to(device)
    # optimizer is currently unoptimized
    # there's a lot of room for improvement/different optimizers
    # optimizer = optim.SGD(model.parameters(), lr=1e-3, nesterov=True)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # printAccuracy(train_loader, device, model, "TRAINSET", 1)
    epoch = 1
    while epoch <= num_epochs:
        running_loss = 0.0
        # decay the learning rate each epoch, with a floor of 1e-4
        for param_group in optimizer.param_groups:
            param_group['lr'] = max(param_group['lr'] * 0.97, 1e-4)
            tqdm.write('Current learning rate: ' + str(param_group['lr']))
        model.train()

        for batch_num, (inputs, labels) in enumerate(tqdm(train_loader), 1):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if batch_num % out_period == 0:
                tqdm.write('[%d:%.2f] loss: %.3f' % (
                    epoch, batch_num * 1.0 / num_train_batches,
                    running_loss / out_period))
                running_loss = 0.0
                gc.collect()

        gc.collect()
        # save after every epoch
        torch.save(model.state_dict(), "models/model.%d" % epoch)

        # Calculate classification error and Top-5 Error
        # on training and validation datasets here
        model.eval()
        try:
            printAccuracy(train_loader, device, model, "TRAINSET", epoch)
            printAccuracy(val_loader, device, model, "VALSET", epoch)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            # accuracy reporting is best-effort; don't kill the training run
            pass

        gc.collect()
        epoch += 1
def train(seed, depth, maxlen, batch_size, accumulation_steps, model_name):
    config.seed = seed
    config.max_sequence_length = maxlen
    config.batch_size = batch_size
    config.accumulation_steps = accumulation_steps
    if depth != 24:
        config.bert_weight = f"../bert_weight/uncased_L-{depth}_H-768_A-12/"
    else:
        config.bert_weight = f"../bert_weight/uncased_L-{depth}_H-1024_A-16/"

    if model_name == 'bert':
        config.features = f"../bert_features_{maxlen}/"
    elif model_name == 'gpt2':
        config.features = f"../features_{maxlen}_gpt/"
    else:
        config.features = f"../features_{maxlen}_xlnet/"

    config.experiment = f"{depth}layers"
    config.checkpoint = (f"{config.logdir}/{config.today}/{model_name}_"
                         f"{config.experiment}_{config.batch_size}bs_"
                         f"{config.accumulation_steps}accum_{config.seed}seed_"
                         f"{config.max_sequence_length}/")
    print_config(config)

    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)
    torch.backends.cudnn.deterministic = True

    # Data loaders
    train_loader, valid_loader, valid_df, loss_weight = get_data_loaders(config)
    loaders = {"train": train_loader, "valid": valid_loader}

    # Criterion
    criterion = CustomLoss(loss_weight)

    # Model and optimizer: no weight decay on biases and LayerNorm weights
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    num_train_optimization_steps = np.ceil(
        len(train_loader.dataset) / config.batch_size /
        config.accumulation_steps) * config.epochs

    if model_name == 'bert':
        print("BERT MODEL")
        model = BertForTokenClassificationMultiOutput2.from_pretrained(
            config.bert_weight, cache_dir=None,
            num_aux_labels=config.n_aux_targets)
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}]
        optimizer = BertAdam(optimizer_grouped_parameters, lr=config.lr,
                             warmup=0.01, t_total=num_train_optimization_steps)
    elif model_name == 'gpt2':
        print("GPT2 MODEL")
        model = GPT2ClassificationMultioutput.from_pretrained(
            config.gpt2_weight, cache_dir=None,
            num_aux_labels=config.n_aux_targets)
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}]
        optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=config.lr,
                               warmup=0.01, t_total=num_train_optimization_steps)
    elif model_name == 'xlnet':
        model = XLNetWithMultiOutput.from_pretrained(
            config.xlnet_weight, clf_dropout=0.4, n_class=6
            # num_aux_labels=config.n_aux_targets
        )
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}]
        optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=config.lr,
                               warmup=0.01, t_total=num_train_optimization_steps)
    else:
        raise NotImplementedError("Model is not implemented")

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    model = model.cuda()

    from apex import amp
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    # if distributed_rank > -1:
    #     from apex.parallel import DistributedDataParallel
    #     model = DistributedDataParallel(model)
    model = torch.nn.DataParallel(model)

    if config.resume:
        checkpoint = torch.load(config.checkpoint + "/checkpoints/best.pth")
        # prefix keys with "module." so the DataParallel wrapper can load them
        new_state_dict = {}
        old_state_dict = checkpoint['model_state_dict']
        for k, v in old_state_dict.items():
            new_state_dict["module." + k] = v
        model.load_state_dict(new_state_dict)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        criterion.load_state_dict(checkpoint['criterion_state_dict'])
        print("!!! Loaded checkpoint", config.checkpoint + "/checkpoints/best.pth")

    identity_valid = valid_df[config.identity_columns].copy()
    target_valid = valid_df.target.values
    auc_callback = AucCallback(identity=identity_valid, target=target_valid)
    checkpoint_callback = IterationCheckpointCallback(
        save_n_last=2000,
        num_iters=10000,
    )

    # model runner
    runner = ModelRunner()

    # model training
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 main_metric='auc',
                 minimize_metric=False,
                 logdir=config.checkpoint,
                 num_epochs=config.epochs,
                 verbose=True,
                 fp16={"opt_level": "O1"},
                 callbacks=[auc_callback, checkpoint_callback])
def classification_accuracy(model, data_loader, device):
    correct = 0
    examples = 0
    for (inputs, labels) in data_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        _, top = torch.max(outputs, dim=1)
        for i in range(len(outputs)):
            if labels[i] == top[i]:
                correct += 1
        examples += len(outputs)
    return float(correct) / examples


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = 100
train_loader, val_loader, test_loader = dataset.get_data_loaders(batch_size)
model = NNmodel()
model = model.to(device)
model.load_state_dict(torch.load('models/model.101'))
model.eval()
print(classification_accuracy(model, test_loader, device))
def __init__(self, args):
    # Training configurations
    self.method = args.method
    self.dataset = args.dataset
    self.dim = args.dim
    self.lr_init = args.lr_init
    self.gamma_m = args.gamma_m
    self.gamma_s = args.gamma_s
    self.batch_size = args.batch_size
    self.val_batch_size = self.batch_size // 2
    self.iteration = args.iteration
    self.evaluation = args.evaluation
    self.show_iter = 1000
    self.update_epoch = args.update_epoch
    self.balanced = args.balanced
    self.instances = args.instances
    self.inter_test = args.intertest
    self.cm = args.cm
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.n_class = args.batch_size // args.instances
    self.classes = args.classes
    self.pretrained = args.pretrained
    self.model_save_interval = args.model_save_interval
    self.file_name = '{}_{}_{}'.format(self.method, self.dataset, self.iteration)
    print('========================================')
    print(json.dumps(vars(args), indent=2))
    print(self.file_name)

    # Paths
    self.root_dir = os.path.join('/', 'data')
    self.data_dir = os.path.join(self.root_dir, self.dataset)
    self.model_dir = self._get_path('./trained_model')
    self.plot_dir = self._get_path('./plot_model')
    self.code_dir = self._get_path(os.path.join('codes', self.dataset))
    self.fig_dir = self._get_path(
        os.path.join('fig', self.dataset, self.file_name))

    # Preparing data
    self.transforms = get_transform()
    self.datasets = get_datasets(dataset=self.dataset,
                                 data_dir=self.data_dir,
                                 transforms=self.transforms)
    self.data_loaders = get_data_loaders(
        datasets=self.datasets,
        batch_size=self.batch_size,
        val_batch_size=self.val_batch_size,
        n_instance=self.instances,
        balanced=self.balanced,
        # cm=self.cm_sampler if self.cm else None
    )
    self.dataset_sizes = {x: len(self.datasets[x]) for x in ['train', 'test']}

    # Class-pair statistics: means start at 1.5 off-diagonal (0.5 on the
    # diagonal) with a fixed 0.15 standard deviation
    self.mean = (torch.zeros((self.classes, self.classes)).add(1.5)
                 - 1.0 * torch.eye(self.classes)).to(self.device)
    self.std = torch.zeros((self.classes, self.classes)).add(0.15).to(self.device)
    self.last_delta_mean = torch.zeros(
        (self.classes, self.classes)).to(self.device)
    self.last_delta_std = torch.zeros(
        (self.classes, self.classes)).to(self.device)

    # Set up model
    self.ndmodel = nd.NDfdml(n_class=self.n_class,
                             batch_size=self.batch_size,
                             instances=self.instances,
                             pretrained=self.pretrained).to(self.device)
    optimizer_c = optim.SGD(
        [{'params': self.ndmodel.googlelayer.parameters()},
         {'params': self.ndmodel.embedding_layer.parameters(),
          'lr': self.lr_init * 10, 'momentum': 0.9}],
        lr=self.lr_init, momentum=0.9)
    self.scheduler = lr_scheduler.StepLR(optimizer_c, step_size=4000, gamma=0.9)