def train(model, criterion, optimizer, epochs, device, train_loader, valid_loader,
          print_every=60, step_track_every=30):
    total_start = time()
    model.epochs = epochs
    save_checkpoint(model, optimizer)
    train_losses, valid_losses = [], []
    steps = 0
    running_loss = 0
    print("==========================")
    print("Starting training of NN...")
    for epoch in range(epochs):
        print("==========================")
        print(f"Starting epoch #{epoch}...")
        start_epoch = start_step = time()
        for inputs, labels in train_loader:
            steps += 1
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            logps = model(inputs)
            loss = criterion(logps, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if steps % step_track_every == 0:
                print(f"Time in step #{steps}: {(time() - start_step):.3f} seconds")
                start_step = time()
            if steps % print_every == 0:
                start_test = time()
                valid_loss, accuracy = get_valid_loss_and_accuracy(
                    model, valid_loader, device, criterion)
                # average the running training loss over the steps since the last report
                cur_train_loss = running_loss / print_every
                running_loss = 0
                train_losses.append(cur_train_loss)
                valid_losses.append(valid_loss)
                print(f"Epoch {epoch}/{epochs}..."
                      f"Training loss: {cur_train_loss:.4f}..."
                      f"Validation loss: {valid_loss:.4f}..."
                      f"Test accuracy: {accuracy:.4f}\n")
                print(f"Time taken to test losses in epoch #{epoch}: {(time() - start_test):.3f} seconds")
        print(f"Time per epoch: {(time() - start_epoch):.3f} seconds")
    print(f"Total training time: {(time() - total_start):.3f} seconds")
    save_checkpoint(model, optimizer)
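# get_valid_loss_and_accuracy() is called by train() above but is not defined in this
# excerpt. The sketch below is a minimal, assumed implementation consistent with how it
# is called: the model is expected to return log-probabilities (e.g. a LogSoftmax head),
# and the function returns the average validation loss and accuracy over valid_loader.
import torch


def get_valid_loss_and_accuracy(model, valid_loader, device, criterion):
    model.eval()
    valid_loss = 0.0
    accuracy = 0.0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logps = model(inputs)
            valid_loss += criterion(logps, labels).item()
            # convert log-probabilities to probabilities and take the top class
            top_p, top_class = torch.exp(logps).topk(1, dim=1)
            equals = top_class == labels.view(*top_class.shape)
            accuracy += torch.mean(equals.float()).item()
    model.train()
    return valid_loss / len(valid_loader), accuracy / len(valid_loader)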
def do_run():
    config = get_input_config()

    # set up some globals
    global STANZA
    STANZA = config.get("name")

    http_proxy = config.get("http_proxy")
    https_proxy = config.get("https_proxy")
    proxies = {}
    if http_proxy is not None:
        proxies["http"] = http_proxy
    if https_proxy is not None:
        proxies["https"] = https_proxy

    request_timeout = int(config.get("request_timeout", 30))

    try:
        # keyword arguments for requests (verify/timeout/proxies), not URL query params
        req_args = {"verify": True, "timeout": float(request_timeout)}
        if proxies:
            req_args["proxies"] = proxies

        req = requests.get(
            url="https://publicdashacc.blob.core.windows.net/publicdata?restype=container&comp=list&prefix=data",
            **req_args)
        xmldom = etree.fromstring(req.content)
        blobs = xmldom.xpath('/EnumerationResults/Blobs/Blob')
        for blob in blobs:
            blob_etag = blob.xpath('Properties/Etag')[0].text
            blob_name = blob.xpath('Name')[0].text
            logging.info("Found file=%s etag=%s" % (blob_name, blob_etag))
            blob_url = "https://publicdashacc.blob.core.windows.net/publicdata/%s" % (blob_name)
            if not load_checkpoint(config, blob_etag):
                print("Processing file={}".format(blob_url))
                data_req = requests.get(url=blob_url, **req_args)
                data_json = data_req.json()
                iterate_json_data("overview", data_json, blob_name)
                iterate_json_data("countries", data_json, blob_name)
                iterate_json_data("regions", data_json, blob_name)
                iterate_json_data("utlas", data_json, blob_name)
                logging.info("Marking file={} etag={} as processed".format(blob_name, blob_etag))
                save_checkpoint(config, blob_etag)
    except RuntimeError as e:
        logging.error("Looks like an error: %s" % str(e))
        sys.exit(2)
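# load_checkpoint()/save_checkpoint() as used by do_run() track which blob etags have
# already been processed; they are not shown above. This is one possible file-based
# sketch, assuming the config dict carries a "checkpoint_dir" entry (hypothetical key),
# not the project's actual implementation.
import os


def load_checkpoint(config, etag):
    # returns True if this etag has already been processed
    chk_file = os.path.join(config.get("checkpoint_dir", "."), etag)
    return os.path.isfile(chk_file)


def save_checkpoint(config, etag):
    # mark this etag as processed by touching an empty marker file
    chk_file = os.path.join(config.get("checkpoint_dir", "."), etag)
    with open(chk_file, "w"):
        pass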
def main():
    in_args = train_input_args()
    train_dir = in_args.data_dir + '/train'
    valid_dir = in_args.data_dir + '/valid'

    train_transforms = transforms.Compose([
        transforms.RandomRotation(30),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    valid_transforms = transforms.Compose([
        transforms.Resize(255),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    train_data = datasets.ImageFolder(train_dir, transform=train_transforms)
    valid_data = datasets.ImageFolder(valid_dir, transform=valid_transforms)
    trainloader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_data, batch_size=64)
    dataloader = [trainloader, validloader]

    if in_args.arch == 'densenet121':
        model = models.densenet121(pretrained=True)
        classifier = nn.Sequential(
            OrderedDict([("fc1", nn.Linear(1024, in_args.hidden_units)),
                         ("ReLU1", nn.ReLU()),
                         ("dropout", nn.Dropout(0.5)),
                         ("fc2", nn.Linear(in_args.hidden_units, 102)),
                         ("output", nn.LogSoftmax(dim=1))]))
    elif in_args.arch == 'vgg16':
        model = models.vgg16(pretrained=True)
        classifier = nn.Sequential(
            OrderedDict([("fc1", nn.Linear(25088, in_args.hidden_units)),
                         ("ReLU1", nn.ReLU()),
                         ("dropout", nn.Dropout(0.5)),
                         ("fc2", nn.Linear(in_args.hidden_units, 102)),
                         ("output", nn.LogSoftmax(dim=1))]))
    else:
        raise ValueError("Unsupported architecture: {}".format(in_args.arch))

    # Freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = False
    model.classifier = classifier

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(), lr=in_args.learning_rate)
    device = torch.device('cuda' if in_args.gpu == 'gpu' else 'cpu')

    train(model, in_args, dataloader, optimizer, criterion, device)

    model.class_to_idx = train_data.class_to_idx
    save_checkpoint(in_args.save_dir, optimizer, in_args, classifier, model)
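# train_input_args() is defined elsewhere. Judging from the attributes accessed above
# (data_dir, arch, hidden_units, learning_rate, gpu, save_dir), it is presumably a small
# argparse wrapper along these lines; the exact flag names and defaults below are
# assumptions, not the project's actual parser.
import argparse


def train_input_args():
    parser = argparse.ArgumentParser(description='Train an image classifier')
    parser.add_argument('data_dir', type=str, help='directory containing train/ and valid/ folders')
    parser.add_argument('--arch', type=str, default='densenet121', choices=['densenet121', 'vgg16'])
    parser.add_argument('--hidden_units', type=int, default=512)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--gpu', type=str, default='cpu', help="pass 'gpu' to train on CUDA")
    parser.add_argument('--save_dir', type=str, default='.')
    return parser.parse_args()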
def main(train_set, learning_rate, n_epochs, batch_size, num_workers, hidden_size,
         model_file, cuda, checkpoint_interval, seed, n_disc):
    # make data between -1 and 1
    data_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    train_dataset = datasets.ImageFolder(root=os.path.join(os.getcwd(), train_set),
                                         transform=data_transform)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  num_workers=num_workers, drop_last=True)

    # initialize model
    if model_file:
        try:
            total_examples, fixed_noise, gen_losses, disc_losses, gen_loss_per_epoch, \
                disc_loss_per_epoch, prev_epoch, gen, disc = load_model_wgan(model_file, hidden_size)
            # TODO: upsampling method?
            print('model loaded successfully!')
        except Exception:
            print('could not load model! creating new model...')
            model_file = None

    if not model_file:
        print('creating new model...')
        gen = Generator(hidden_dim=hidden_size, leaky=0.2)
        disc = Discriminator(leaky=0.2)
        gen.weight_init(mean=0, std=0.02)
        disc.weight_init(mean=0, std=0.02)
        total_examples = 0
        disc_losses = []
        gen_losses = []
        disc_loss_per_epoch = []
        gen_loss_per_epoch = []
        prev_epoch = 0

        # Sample minibatch of m noise samples from noise prior p_g(z) and transform
        # (use torch.randn on both branches so CPU and GPU draw from the same normal prior)
        if cuda:
            fixed_noise = Variable(torch.randn(9, hidden_size).cuda())
        else:
            fixed_noise = Variable(torch.randn(9, hidden_size))

    if cuda:
        gen.cuda()
        disc.cuda()

    # RMSprop optimizer (as used for WGAN training)
    gen_optimizer = optim.RMSprop(gen.parameters(), lr=learning_rate, eps=1e-8)
    disc_optimizer = optim.RMSprop(disc.parameters(), lr=learning_rate, eps=1e-8)

    # results save folder
    gen_images_dir = 'results/wgan_generated_images'
    train_summaries_dir = 'results/wgan_training_summaries'
    checkpoint_dir = 'results/wgan_checkpoints'
    if not os.path.isdir('results'):
        os.mkdir('results')
    if not os.path.isdir(gen_images_dir):
        os.mkdir(gen_images_dir)
    if not os.path.isdir(train_summaries_dir):
        os.mkdir(train_summaries_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    np.random.seed(seed)  # reset training seed to ensure that batches remain the same between runs!
try:
    for epoch in range(prev_epoch, n_epochs):
        disc_losses_epoch = []
        gen_losses_epoch = []
        for idx, (true_batch, _) in enumerate(train_dataloader):
            disc.zero_grad()

            # Sample minibatch of examples from data generating distribution
            if cuda:
                true_batch = Variable(true_batch.cuda())
            else:
                true_batch = Variable(true_batch)

            # discriminator on true data
            true_disc_output = disc.forward(true_batch)

            # Sample minibatch of m noise samples from noise prior p_g(z) and transform
            if cuda:
                z = Variable(torch.randn(batch_size, hidden_size).cuda())
            else:
                z = Variable(torch.randn(batch_size, hidden_size))

            # discriminator on fake data
            fake_batch = gen.forward(z.view(-1, hidden_size, 1, 1))
            fake_disc_output = disc.forward(fake_batch.detach())  # detach so gradients not computed for generator

            # Optimize with new loss function
            disc_loss = wgan_Dloss(true_disc_output, fake_disc_output)
            disc_loss.backward()
            disc_optimizer.step()

            # Weight clipping as done by WGAN
            for p in disc.parameters():
                p.data.clamp_(-0.01, 0.01)

            # Store losses
            disc_losses_epoch.append(disc_loss.data[0])

            # Train generator after the discriminator has been trained n_disc times
            if (idx + 1) % n_disc == 0:
                gen.zero_grad()

                # Sample minibatch of m noise samples from noise prior p_g(z) and transform
                # (torch.randn on both branches so CPU and GPU use the same normal prior)
                if cuda:
                    z = Variable(torch.randn(batch_size, hidden_size).cuda())
                else:
                    z = Variable(torch.randn(batch_size, hidden_size))

                # train generator
                fake_batch = gen.forward(z.view(-1, hidden_size, 1, 1))
                fake_disc_output = disc.forward(fake_batch)

                # Optimize generator
                gen_loss = wgan_Gloss(fake_disc_output)
                gen_loss.backward()
                gen_optimizer.step()

                # Store losses
                gen_losses_epoch.append(gen_loss.data[0])

                # note the parentheses: report every n_disc * 4 examples
                if (total_examples != 0) and (total_examples % (n_disc * 4) == 0):
                    print('epoch {}: step {}/{} disc loss: {:.4f}, gen loss: {:.4f}'
                          .format(epoch + 1, idx + 1, len(train_dataloader),
                                  disc_loss.data[0], gen_loss.data[0]))

            # Checkpoint model
            total_examples += batch_size
            if (checkpoint_interval != 0) and (total_examples % checkpoint_interval == 0):
                disc_losses.extend(disc_losses_epoch)
                gen_losses.extend(gen_losses_epoch)
                save_checkpoint(total_examples=total_examples, fixed_noise=fixed_noise,
                                disc=disc, gen=gen, gen_losses=gen_losses,
                                disc_losses=disc_losses,
                                disc_loss_per_epoch=disc_loss_per_epoch,
                                gen_loss_per_epoch=gen_loss_per_epoch,
                                epoch=epoch, directory=checkpoint_dir)
                print("Checkpoint saved!")

                # sample images for inspection
                save_image_sample(batch=gen.forward(fixed_noise.view(-1, hidden_size, 1, 1)),
                                  cuda=cuda, total_examples=total_examples,
                                  directory=gen_images_dir)
                print("Saved images!")

                # save learning curves for inspection
                save_learning_curve(gen_losses=gen_losses, disc_losses=disc_losses,
                                    total_examples=total_examples,
                                    directory=train_summaries_dir)
                print("Saved learning curves!")

        disc_loss_per_epoch.append(np.average(disc_losses_epoch))
        gen_loss_per_epoch.append(np.average(gen_losses_epoch))

        # Save epoch learning curve
        save_learning_curve_epoch(gen_losses=gen_loss_per_epoch,
                                  disc_losses=disc_loss_per_epoch,
                                  total_epochs=epoch + 1,
                                  directory=train_summaries_dir)
        print("Saved learning curves!")
        print('epoch {}/{} disc loss: {:.4f}, gen loss: {:.4f}'
              .format(epoch + 1, n_epochs, np.array(disc_losses_epoch).mean(),
                      np.array(gen_losses_epoch).mean()))
        disc_losses.extend(disc_losses_epoch)
        gen_losses.extend(gen_losses_epoch)

except KeyboardInterrupt:
    print("Saving before quit...")
    save_checkpoint(total_examples=total_examples, fixed_noise=fixed_noise, disc=disc,
                    gen=gen, disc_loss_per_epoch=disc_loss_per_epoch,
                    gen_loss_per_epoch=gen_loss_per_epoch,
                    gen_losses=gen_losses, disc_losses=disc_losses,
                    epoch=epoch, directory=checkpoint_dir)
    print("Checkpoint saved!")

    # sample images for inspection
    save_image_sample(batch=gen.forward(fixed_noise.view(-1, hidden_size, 1, 1)),
                      cuda=cuda, total_examples=total_examples, directory=gen_images_dir)
    print("Saved images!")

    # save learning curves for inspection
    save_learning_curve(gen_losses=gen_losses, disc_losses=disc_losses,
                        total_examples=total_examples, directory=train_summaries_dir)
    print("Saved learning curves!")
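# wgan_Dloss() and wgan_Gloss() are imported from elsewhere. For a standard WGAN critic
# trained with weight clipping (as in the loop above), the losses are the empirical
# Wasserstein estimates below; this is a sketch, not necessarily the project's exact
# implementation.
import torch


def wgan_Dloss(true_disc_output, fake_disc_output):
    # the critic maximises D(x) - D(G(z)), i.e. minimises the negation
    return torch.mean(fake_disc_output) - torch.mean(true_disc_output)


def wgan_Gloss(fake_disc_output):
    # the generator maximises D(G(z)), i.e. minimises -D(G(z))
    return -torch.mean(fake_disc_output)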
def train_and_eval(model, train_loader, valid_loader, learning_rate, epochs, model_outdir, wts, task, metrics_every_iter, restore_chkpt=None, run_suffix=None): """ Contains the powerhouse of the network, ie. the training and validation iterations called through run_model(). All parameters from the command line/json are parsed and then passed into run_model(). Performs checkpointing each epoch, saved as 'last.pth.tar' and the best model thus far (based on validation AUC), saved as 'best.pth.tar' :param model: (nn.Module) :param train_loader: (torch DataLoader) :param valid_loader: (torch DataLoader) :param learning_rate: (float) - the learning rate, defaults to 1e-05 :param epochs: (int) - the number of epochs :param wts: (tensor) - class weights :param model_outdir: (str) - the output directory for checkpointing, checkpoints will be saved as output_dir/task/view/*.tar :param restore_chkpt: (str) - the directory to reload the checkpoint, if specified :param run_suffix: (str) - suffix to be appended to the event file :return: """ # output/task/my_run # goes back 3 levels up, putting the name in the same level as the output. two levels up would put them into output/ recover_root_dir = os.path.dirname( os.path.dirname(os.path.dirname( model_outdir))) # removes the task and view from the directory log_dir = os.path.join(recover_root_dir, "logs") run_name = re.split(r'/|\\', model_outdir)[-1] # task = re.split(r'/|\\', model_outdir)[-2] # have log folder naming structure same as models log_fn = os.path.join(log_dir, task, run_name) dtnow = datetime.now() # dtnow.strftime("%Y%m%d_%H%M%S") log_fn = os.path.join(log_fn, dtnow.strftime("%Y_%m_%d-%H_%M_%S")) # make directory if it doesn't exist. if not os.path.exists(log_fn): os.makedirs(log_fn) print('{} does not exist, creating..!'.format(log_fn)) else: print('{} already exists!'.format(log_fn)) # each tensorboard event file should ideally be saved to a unique folder, else the resulting graph will look like # it's time traveling because of overlapping logs # if run_suffix: # writer = tf.summary.create_file_writer(log_fn, filename_suffix=run_suffix) # else: # writer = tf.summary.create_file_writer(log_fn) # use cpu or cuda depending on availability device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") current_best_val_loss = float('Inf') # this needs to be outside of the loop else it'll keep resetting, right? same with the model optimizer = torch.optim.Adam(model.parameters(), learning_rate, weight_decay=0.01) # taken directly from MRNet code scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, patience=5, # how many epochs to wait for before acting factor=0.3, # factor to reduce LR by, LR = factor * LR threshold=1e-4) # threshold to measure new optimum # weight loss by training class positive weights, if use_wts is False then no weights are applied # criterion_d = {'bladder': torch.nn.BCEWithLogitsLoss(), 'view': torch.nn.CrossEntropyLoss(), 'granular':torch.nn.CrossEntropyLoss()} if wts is None: criterion = torch.nn.CrossEntropyLoss() else: wts = wts.to(device) criterion = torch.nn.CrossEntropyLoss(weight=wts) # # TODO: reloading checkpoint # if restore_chkpt: # logging.info("Restoring Checkpoint from {}".format(restore_chkpt)) # helpers.load_checkpoint(checkpoint = restore_chkpt, # model = model, # optimizer = optimizer, # # scheduler = scheduler, # epochs = epochs) # # so epochs - loaded_epoch is where we would need to start, right? 
# logging.info("Starting again at Epoch {}....".format(epochs)) # logging.info("Finished Restoring Checkpoint...") for epoch in range(epochs): logging.info('[Epoch {}]'.format(epoch + 1)) # main training loop epoch_loss, epoch_preds, epoch_labels = run_model( model=model, loader=train_loader, optimizer=optimizer, criterion=criterion, metrics_every_iter=metrics_every_iter, train=True) logging.info('[Epoch {}]\t\t Training Average Loss: {:.5f}'.format( epoch + 1, epoch_loss)) # logging.info('[Epoch {}]\t\tTraining Balanced Accuracy: {:.3f}\t Training Average Loss: {:.5f}'.format(epoch + 1, epoch_auc, epoch_loss)) # main validation loop epoch_val_loss, epoch_val_preds, epoch_val_labels = run_model( model=model, loader=valid_loader, optimizer=optimizer, criterion=criterion, metrics_every_iter= False, # default, just show the epoch validation metrics.. train=False) logging.info('[Epoch {}]\t\t Validation Average Loss: {:.5f}'.format( epoch + 1, epoch_val_loss)) # logging.info('[Epoch {}]\t\tValidation Balanced Accuracy: {:.3f}\t Validation Average Loss: {:.5f}'.format(epoch + 1, epoch_val_acc, epoch_val_loss)) scheduler.step(epoch_val_loss ) # check per epoch, how does the threshold work?!?!? logging.info('[Epoch {}]\t\tOptimizer Learning Rate: {}'.format( epoch + 1, {optimizer.param_groups[0]['lr']})) # with writer.as_default(): # tf.summary.scalar('Loss/train', epoch_loss, epoch + 1) # tf.summary.scalar('Loss/val', epoch_val_loss, epoch + 1) # tf.summary.scalar('BACC/train', epoch_acc, epoch + 1) # tf.summary.scalar('BACC/val', epoch_val_acc, epoch + 1) # check whether the most recent epoch loss is better than previous best # is_best_val_auc = epoch_val_auc >= current_best_val_auc is_best_val_loss = epoch_val_loss < current_best_val_loss # save state in a dictionary state = { 'epoch': epoch + 1, 'state_dict': model.state_dict(), # 'validation_acc': epoch_val_acc, 'best_validation_loss': epoch_val_loss, # 'metrics': metrics # read more into this 'scheduler_dict': scheduler.state_dict(), 'optim_dict': optimizer.state_dict() } # save as last epoch helpers.save_checkpoint(state, is_best=is_best_val_loss, checkpoint_dir=model_outdir) # epoch = epoch + 1) if is_best_val_loss: # set new best validation loss # current_best_val_auc = epoch_val_auc current_best_val_loss = epoch_val_loss # logging.info('[Epoch {}]\t\t******New Best Validation:\t AUC: {:.3f}******'.format(epoch + 1, epoch_val_auc)) logging.info( '[Epoch {}]\t\t******New Best Validation Loss: {:.3f}******'. format(epoch + 1, epoch_val_loss)) helpers.save_checkpoint(state, is_best=is_best_val_loss, checkpoint_dir=model_outdir)
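# helpers.save_checkpoint() is referenced above but not shown. Per the docstring of
# train_and_eval(), each epoch is written as 'last.pth.tar' and the best model so far is
# kept as 'best.pth.tar'; below is a minimal assumed sketch of such a helper, not the
# actual helpers module.
import os
import shutil
import torch


def save_checkpoint(state, is_best, checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)
    last_path = os.path.join(checkpoint_dir, 'last.pth.tar')
    torch.save(state, last_path)
    if is_best:
        # keep a separate copy of the best-so-far model
        shutil.copyfile(last_path, os.path.join(checkpoint_dir, 'best.pth.tar'))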
def main(train_set, learning_rate, n_epochs, beta_0, beta_1, batch_size, num_workers,
         hidden_size, model_file, cuda, display_result_every, checkpoint_interval,
         seed, label_smoothing, grad_clip, dropout, upsampling):
    # make data between -1 and 1
    data_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    train_dataset = datasets.ImageFolder(root=os.path.join(os.getcwd(), train_set),
                                         transform=data_transform)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  num_workers=num_workers, drop_last=True)

    # initialize model
    if model_file:
        try:
            total_examples, fixed_noise, gen_losses, disc_losses, gen_loss_per_epoch, \
                disc_loss_per_epoch, prev_epoch, gen, disc = load_model(model_file, hidden_size,
                                                                        upsampling, cuda)
            print('model loaded successfully!')
        except Exception:
            print('could not load model! creating new model...')
            model_file = None

    if not model_file:
        print('creating new model...')
        if upsampling == 'transpose':
            from models.model import Generator, Discriminator
        elif upsampling == 'nn':
            from models.model_nn import Generator, Discriminator
        elif upsampling == 'bilinear':
            from models.model_bilinear import Generator, Discriminator
        gen = Generator(hidden_dim=hidden_size, leaky=0.2, dropout=dropout)
        disc = Discriminator(leaky=0.2, dropout=dropout)
        gen.weight_init(mean=0, std=0.02)
        disc.weight_init(mean=0, std=0.02)
        total_examples = 0
        disc_losses = []
        gen_losses = []
        disc_loss_per_epoch = []
        gen_loss_per_epoch = []
        prev_epoch = 0

        # Sample minibatch of m noise samples from noise prior p_g(z) and transform
        # (use torch.randn on both branches so CPU and GPU draw from the same normal prior)
        if cuda:
            fixed_noise = Variable(torch.randn(9, hidden_size).cuda())
        else:
            fixed_noise = Variable(torch.randn(9, hidden_size))

    if cuda:
        gen.cuda()
        disc.cuda()

    # Binary Cross Entropy loss
    BCE_loss = nn.BCELoss()

    # Adam optimizer
    gen_optimizer = optim.Adam(gen.parameters(), lr=learning_rate, betas=(beta_0, beta_1), eps=1e-8)
    disc_optimizer = optim.Adam(disc.parameters(), lr=learning_rate, betas=(beta_0, beta_1), eps=1e-8)

    # results save folder
    gen_images_dir = 'results/generated_images'
    train_summaries_dir = 'results/training_summaries'
    checkpoint_dir = 'results/checkpoints'
    if not os.path.isdir('results'):
        os.mkdir('results')
    if not os.path.isdir(gen_images_dir):
        os.mkdir(gen_images_dir)
    if not os.path.isdir(train_summaries_dir):
        os.mkdir(train_summaries_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    np.random.seed(seed)  # reset training seed to ensure that batches remain the same between runs!
try:
    for epoch in range(prev_epoch, n_epochs):
        disc_losses_epoch = []
        gen_losses_epoch = []
        for idx, (true_batch, _) in enumerate(train_dataloader):
            disc.zero_grad()

            # hack 6 of https://github.com/soumith/ganhacks
            if label_smoothing:
                true_target = torch.FloatTensor(batch_size).uniform_(0.7, 1.2)
            else:
                true_target = torch.ones(batch_size)

            # Sample minibatch of examples from data generating distribution
            if cuda:
                true_batch = Variable(true_batch.cuda())
                true_target = Variable(true_target.cuda())
            else:
                true_batch = Variable(true_batch)
                true_target = Variable(true_target)

            # train discriminator on true data
            true_disc_result = disc.forward(true_batch)
            disc_train_loss_true = BCE_loss(true_disc_result.squeeze(), true_target)
            disc_train_loss_true.backward()
            torch.nn.utils.clip_grad_norm(disc.parameters(), grad_clip)

            # Sample minibatch of m noise samples from noise prior p_g(z) and transform
            if label_smoothing:
                fake_target = torch.FloatTensor(batch_size).uniform_(0, 0.3)
            else:
                fake_target = torch.zeros(batch_size)

            if cuda:
                z = Variable(torch.randn(batch_size, hidden_size).cuda())
                fake_target = Variable(fake_target.cuda())
            else:
                z = Variable(torch.randn(batch_size, hidden_size))
                fake_target = Variable(fake_target)

            # train discriminator on fake data
            fake_batch = gen.forward(z.view(-1, hidden_size, 1, 1))
            fake_disc_result = disc.forward(fake_batch.detach())  # detach so gradients not computed for generator
            disc_train_loss_false = BCE_loss(fake_disc_result.squeeze(), fake_target)
            disc_train_loss_false.backward()
            torch.nn.utils.clip_grad_norm(disc.parameters(), grad_clip)
            disc_optimizer.step()

            # compute performance statistics
            disc_train_loss = disc_train_loss_true + disc_train_loss_false
            disc_losses_epoch.append(disc_train_loss.data[0])
            disc_fake_accuracy = 1 - fake_disc_result.mean().data[0]
            disc_true_accuracy = true_disc_result.mean().data[0]

            # Sample minibatch of m noise samples from noise prior p_g(z) and transform
            if label_smoothing:
                true_target = torch.FloatTensor(batch_size).uniform_(0.7, 1.2)
            else:
                true_target = torch.ones(batch_size)

            # (torch.randn on both branches so CPU and GPU use the same normal prior)
            if cuda:
                z = Variable(torch.randn(batch_size, hidden_size).cuda())
                true_target = Variable(true_target.cuda())
            else:
                z = Variable(torch.randn(batch_size, hidden_size))
                true_target = Variable(true_target)

            # train generator
            gen.zero_grad()
            fake_batch = gen.forward(z.view(-1, hidden_size, 1, 1))
            disc_result = disc.forward(fake_batch)
            gen_train_loss = BCE_loss(disc_result.squeeze(), true_target)
            gen_train_loss.backward()
            torch.nn.utils.clip_grad_norm(gen.parameters(), grad_clip)
            gen_optimizer.step()
            gen_losses_epoch.append(gen_train_loss.data[0])

            if (total_examples != 0) and (total_examples % display_result_every == 0):
                print('epoch {}: step {}/{} disc true acc: {:.4f} disc fake acc: {:.4f} '
                      'disc loss: {:.4f}, gen loss: {:.4f}'.format(
                          epoch + 1, idx + 1, len(train_dataloader), disc_true_accuracy,
                          disc_fake_accuracy, disc_train_loss.data[0], gen_train_loss.data[0]))

            # Checkpoint model
            total_examples += batch_size
            if (total_examples != 0) and (total_examples % checkpoint_interval == 0):
                disc_losses.extend(disc_losses_epoch)
                gen_losses.extend(gen_losses_epoch)
                save_checkpoint(total_examples=total_examples, fixed_noise=fixed_noise,
                                disc=disc, gen=gen, gen_losses=gen_losses,
                                disc_losses=disc_losses,
                                disc_loss_per_epoch=disc_loss_per_epoch,
                                gen_loss_per_epoch=gen_loss_per_epoch,
                                epoch=epoch, directory=checkpoint_dir)
                print("Checkpoint saved!")

                # sample images for inspection
                save_image_sample(batch=gen.forward(fixed_noise.view(-1, hidden_size, 1, 1)),
                                  cuda=cuda, total_examples=total_examples,
                                  directory=gen_images_dir)
                print("Saved images!")

                # save learning curves for inspection
                save_learning_curve(gen_losses=gen_losses, disc_losses=disc_losses,
                                    total_examples=total_examples,
                                    directory=train_summaries_dir)
                print("Saved learning curves!")

        disc_loss_per_epoch.append(np.average(disc_losses_epoch))
        gen_loss_per_epoch.append(np.average(gen_losses_epoch))

        # Save epoch learning curve
        save_learning_curve_epoch(gen_losses=gen_loss_per_epoch,
                                  disc_losses=disc_loss_per_epoch,
                                  total_epochs=epoch + 1,
                                  directory=train_summaries_dir)
        print("Saved learning curves!")
        print('epoch {}/{} disc loss: {:.4f}, gen loss: {:.4f}'.format(
            epoch + 1, n_epochs, np.array(disc_losses_epoch).mean(),
            np.array(gen_losses_epoch).mean()))
        disc_losses.extend(disc_losses_epoch)
        gen_losses.extend(gen_losses_epoch)

except KeyboardInterrupt:
    print("Saving before quit...")
    save_checkpoint(total_examples=total_examples, fixed_noise=fixed_noise, disc=disc,
                    gen=gen, disc_loss_per_epoch=disc_loss_per_epoch,
                    gen_loss_per_epoch=gen_loss_per_epoch,
                    gen_losses=gen_losses, disc_losses=disc_losses,
                    epoch=epoch, directory=checkpoint_dir)
    print("Checkpoint saved!")

    # sample images for inspection
    save_image_sample(batch=gen.forward(fixed_noise.view(-1, hidden_size, 1, 1)),
                      cuda=cuda, total_examples=total_examples, directory=gen_images_dir)
    print("Saved images!")

    # save learning curves for inspection
    save_learning_curve(gen_losses=gen_losses, disc_losses=disc_losses,
                        total_examples=total_examples, directory=train_summaries_dir)
    print("Saved learning curves!")
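# save_image_sample() is defined elsewhere. Since fixed_noise holds 9 samples, one
# plausible implementation writes a 3x3 grid with torchvision; treat this as an assumed
# sketch rather than the project's actual helper.
import os
from torchvision.utils import save_image


def save_image_sample(batch, cuda, total_examples, directory):
    if cuda:
        batch = batch.cpu()
    # images were normalised to [-1, 1]; normalize=True maps them back to [0, 1]
    grid_path = os.path.join(directory, 'sample_{}.png'.format(total_examples))
    save_image(batch.data, grid_path, nrow=3, normalize=True)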
def train_and_eval(model, train_loader, valid_loader, learning_rate, epochs, model_outdir,
                   # pos_wt,
                   metrics_every_iter, task, tensorboard=False, restore_chkpt=None,
                   run_suffix=None):
    """
    Contains the powerhouse of the network, i.e. the training and validation iterations
    called through run_model(). All parameters from the command line/json are parsed and
    then passed into run_model(). Performs checkpointing each epoch, saved as
    'last.pth.tar', and the best model thus far (based on validation AUC), saved as
    'best.pth.tar'.

    :param model: (nn.Module)
    :param train_loader: (torch DataLoader)
    :param valid_loader: (torch DataLoader)
    :param learning_rate: (float) - the learning rate, defaults to 1e-05
    :param epochs: (int) - the number of epochs
    :param model_outdir: (str) - the output directory for checkpointing, checkpoints will be
        saved as output_dir/task/view/*.tar
    :param restore_chkpt: (str) - the directory to reload the checkpoint, if specified
    :param run_suffix: (str) - suffix to be appended to the event file. removed for now.
    :return:
    """
    log_fn = helpers.create_tb_log_dir(model_outdir)
    log_fn = log_fn.strip("/")  # remove leading forward slash which messes up tf log

    if tensorboard:
        import tensorflow as tf
        writer = tf.summary.create_file_writer(log_fn)  # tf 2.0+
        writer = tf.compat.v1.summary.FileWriter(log_fn)  # tf v1.15

    current_best_val_loss = float('Inf')
    optimizer = torch.optim.Adam(model.parameters(), learning_rate,
                                 weight_decay=0.01)  # taken directly from MRNet code
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=5,      # how many epochs to wait for before acting
        factor=0.3,      # factor to reduce LR by, LR = factor * LR
        threshold=1e-4)  # threshold to measure new optimum

    # pick the loss according to the task; criterion must be defined before it is used below
    losses = {'regression': torch.nn.MSELoss(),
              'classification': torch.nn.BCEWithLogitsLoss(),
              'multitask': torch.nn.MSELoss()}
    criterion = losses[task]
    print(criterion)
    metric = {'regression': 'MSE', 'classification': 'AUC', 'multitask': 'MSE'}  # Steve: Seems like this is mostly for logging?

    # # TODO: reloading checkpoint
    # if restore_chkpt:
    #     logging.info("Restoring Checkpoint from {}".format(restore_chkpt))
    #     helpers.load_checkpoint(checkpoint=restore_chkpt,
    #                             model=model,
    #                             optimizer=optimizer,
    #                             # scheduler=scheduler,
    #                             epochs=epochs)
    #     # print(loaded_epoch)
    #     # so epochs - loaded_epoch is where we would need to start, right?
    #     logging.info("Starting again at Epoch {}....".format(epochs))
    #     logging.info("Finished Restoring Checkpoint...")

    for epoch in range(epochs):
        logging.info('[Epoch {}]'.format(epoch + 1))

        # main training loop
        epoch_loss, epoch_metric, epoch_preds, epoch_labels, train_df = run_model(
            model=model,
            loader=train_loader,
            optimizer=optimizer,
            criterion=criterion,
            metrics_every_iter=metrics_every_iter,
            task=task,
            tensorboard=tensorboard,
            train=True)
        logging.info('[Epoch {}]\t\tTraining {}: {:.3f}\t Training Average Loss: {:.5f}'
                     .format(epoch + 1, metric[task], epoch_metric, epoch_loss))

        # main validation loop
        epoch_val_loss, epoch_val_metric, epoch_val_preds, epoch_val_labels, val_df = run_model(
            model=model,
            loader=valid_loader,
            optimizer=optimizer,
            criterion=criterion,
            task=task,
            tensorboard=tensorboard,
            metrics_every_iter=False,  # default, just show the epoch validation metrics..
            train=False)
        logging.info('[Epoch {}]\t\tValidation {}: {:.3f}\t Validation Average Loss: {:.5f}'
                     .format(epoch + 1, metric[task], epoch_val_metric, epoch_val_loss))

        scheduler.step(epoch_val_loss)  # check per epoch, how does the threshold work?!?!?
        logging.info('[Epoch {}]\t\tOptimizer Learning Rate: {}'
                     .format(epoch + 1, optimizer.param_groups[0]['lr']))

        # with writer:  # .as_default():
        #     temp = torch.tensor([epoch + 1])  # needs to be a tensor in tf v1.5?
        #     writer.add_summary(tf.compat.v1.summary.scalar('Loss/train', epoch_loss), temp).eval()
        #     writer.add_summary(tf.compat.v1.summary.scalar('Loss/val', epoch_val_loss), temp).eval()
        #     writer.add_summary(tf.compat.v1.summary.scalar('{}/train'.format(metric[task]), epoch_metric), temp).eval()
        #     writer.add_summary(tf.compat.v1.summary.scalar('{}/val'.format(metric[task]), epoch_val_metric), temp).eval()
        #     writer_flush = writer.flush()
        # with writer.as_default():
        #     tf.summary.scalar('Loss/train', epoch_loss, epoch + 1)
        #     tf.summary.scalar('Loss/val', epoch_val_loss, epoch + 1)
        #     tf.summary.scalar('{}/train'.format(metric[task]), epoch_metric, epoch + 1)
        #     tf.summary.scalar('{}/val'.format(metric[task]), epoch_val_metric, epoch + 1)
        print('Loss/train: {} for epoch: {}'.format(str(epoch_loss), str(epoch + 1)))
        print('Loss/val: {} for epoch: {}'.format(str(epoch_val_loss), str(epoch + 1)))
        print('{}/train: {} for epoch: {}'.format(metric[task], str(epoch_metric), str(epoch + 1)))
        print('{}/val: {} for epoch: {}'.format(metric[task], str(epoch_val_metric), str(epoch + 1)))

        # check whether the most recent epoch loss is better than previous best
        is_best_val_loss = epoch_val_loss < current_best_val_loss

        # save state in a dictionary
        state = {'epoch': epoch + 1,
                 'state_dict': model.state_dict(),
                 'validation_metric': epoch_val_metric,
                 'metric': metric[task],
                 'best_validation_loss': epoch_val_loss,
                 # 'metrics': metrics  # read more into this
                 'scheduler_dict': scheduler.state_dict(),
                 'optim_dict': optimizer.state_dict()}

        # save as last epoch
        helpers.save_checkpoint(state, is_best=is_best_val_loss, checkpoint_dir=model_outdir)

        if is_best_val_loss:
            current_best_val_loss = epoch_val_loss
            logging.info('[Epoch {}]\t\t******New Best Validation Loss: {:.3f}******'
                         .format(epoch + 1, epoch_val_loss))
            helpers.save_checkpoint(state, is_best=is_best_val_loss, checkpoint_dir=model_outdir)
            # if task == 'multitask':  # Steven: Seems like this should work the same if doing
            #     # regression or classification. I'll try doing the same for regression by
            #     # commenting out this if statement.
            #     train_df.to_csv(os.path.join(model_outdir, 'best_epoch_training_results.csv'))
            #     val_df.to_csv(os.path.join(model_outdir, 'best_epoch_validation_results.csv'))
            train_df.to_csv(os.path.join(model_outdir, 'best_epoch_training_results.csv'))
            val_df.to_csv(os.path.join(model_outdir, 'best_epoch_validation_results.csv'))
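# helpers.create_tb_log_dir() is not shown. The earlier, non-refactored train_and_eval()
# builds the log path inline (a logs/<task>/<run_name>/<timestamp> layout), so the helper
# is presumably equivalent to the following sketch; the exact layout is an assumption.
import os
import re
from datetime import datetime


def create_tb_log_dir(model_outdir):
    # go three levels up so logs/ sits alongside the output directory
    root_dir = os.path.dirname(os.path.dirname(os.path.dirname(model_outdir)))
    run_name = re.split(r'/|\\', model_outdir)[-1]
    task = re.split(r'/|\\', model_outdir)[-2]
    log_fn = os.path.join(root_dir, 'logs', task, run_name,
                          datetime.now().strftime('%Y_%m_%d-%H_%M_%S'))
    os.makedirs(log_fn, exist_ok=True)
    return log_fn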
def main_worker(gpu, ngpus_per_node, args): global best_acc1 args.gpu = gpu if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) # create model # if args.pretrained: # print("=> using pre-trained model '{}'".format(args.arch)) # model = models.__dict__[args.arch](pretrained=True) # model = autofit(model, args.arch, args.num_classes) # else: # print("=> creating model '{}'".format(args.arch)) # model = models.__dict__[args.arch](num_classes=args.num_classes) model = AutoFitNet(arch=args.arch, pretrained=args.pretrained, num_classes=args.num_classes) if args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. if args.gpu is not None: torch.cuda.set_device(args.gpu) model.cuda(args.gpu) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have args.batch_size = int(args.batch_size / ngpus_per_node) args.workers = int(args.workers / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) elif args.gpu is not None: torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) else: # DataParallel will divide and allocate batch_size to all available GPUs if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda(args.gpu) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] if args.gpu is not None: # best_acc1 may be from a checkpoint from a different GPU best_acc1 = best_acc1.to(args.gpu) model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') testdir = os.path.join(args.data, 'test') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # train_dataset = datasets.ImageFolder( # traindir, # transforms.Compose([ # transforms.Resize(256), # transforms.RandomResizedCrop(224), # # transforms.RandomHorizontalFlip(), # transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), # transforms.ToTensor(), # normalize, # 
])) train_dataset = CityFuncDataset( traindir, transforms.Compose([ transforms.Resize(256), transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), transforms.ToTensor(), normalize, ])) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) else: train_sampler = None train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) val_loader = torch.utils.data.DataLoader(CityFuncDataset( valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) if args.test: test_loader = torch.utils.data.DataLoader(CityFuncDataset( testdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) validate(test_loader, model, criterion, args) return if args.evaluate: validate(val_loader, model, criterion, args) return epoch_time = AverageMeter('Time', ':6.3f', 's') end = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) # learning rate decay adjust_learning_rate(optimizer, epoch, args.lr) # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args) # evaluate on validation set acc1 = validate(val_loader, model, criterion, args) # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) if not args.multiprocessing_distributed or ( args.multiprocessing_distributed and args.rank % ngpus_per_node == 0): save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_acc1': best_acc1, 'optimizer': optimizer.state_dict(), }, is_best) # measure elapsed time epoch_time.update(time.time() - end) eta = (args.epochs - epoch - 1) * epoch_time.avg eta_str = str(datetime.timedelta(seconds=int(eta))) print( 'Epoch: [{epoch:d}]\tTime:{time:6.3f}s\tETA:{eta:6.3f}s ({eta_str:s})' .format(epoch=epoch, time=epoch_time.val, eta=eta, eta_str=eta_str)) end = time.time()
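# adjust_learning_rate() is called in the epoch loop above but not shown. In the PyTorch
# ImageNet reference example this function decays the learning rate by a factor of 10
# every 30 epochs; the sketch below follows that convention, though the schedule actually
# used here may differ.
def adjust_learning_rate(optimizer, epoch, base_lr):
    # step decay: lr = base_lr * 0.1^(epoch // 30)
    lr = base_lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr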
def train(model, train_data, val_data, epochs, batch_size, learning_rate, savedir,
          alpha=3, beta=3, vc_flag=True, mix_flag=False):
    best_check = {'epoch': 0, 'best': 0, 'val_acc': 0}
    out_file_name = savedir + 'result.txt'
    total_train = len(train_data)
    train_loader = DataLoader(dataset=train_data, batch_size=1, shuffle=True)
    val_loaders = []
    for i in range(len(val_data)):
        val_loader = DataLoader(dataset=val_data[i], batch_size=1, shuffle=True)
        val_loaders.append(val_loader)

    # we observed that training the backbone does not make a very big difference but not training saves a lot of memory
    # if the backbone should be trained, then only with very small learning rate e.g. 1e-7
    for param in model.backbone.parameters():
        param.requires_grad = False

    if not vc_flag:
        model.conv1o1.weight.requires_grad = False
    else:
        model.conv1o1.weight.requires_grad = True

    if not mix_flag:
        model.mix_model.requires_grad = False
    else:
        model.mix_model.requires_grad = True

    classification_loss = nn.CrossEntropyLoss()
    cluster_loss = ClusterLoss()

    optimizer = torch.optim.Adagrad(params=filter(lambda param: param.requires_grad,
                                                  model.parameters()),
                                    lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.98)

    print('Training')
    for epoch in range(epochs):
        out_file = open(out_file_name, 'a')
        train_loss = 0.0
        correct = 0
        start = time.time()
        model.train()
        model.backbone.eval()
        for index, data in enumerate(train_loader):
            if index % 500 == 0 and index != 0:
                end = time.time()
                print('Epoch{}: {}/{}, Acc: {}, Loss: {} Time:{}'.format(
                    epoch + 1, index, total_train, correct.cpu().item() / index,
                    train_loss.cpu().item() / index, (end - start)))
                start = time.time()

            input, _, label = data
            input = input.cuda(device_ids[0])
            label = label.cuda(device_ids[0])

            output, vgg_feat, like = model(input)
            out = output.argmax(1)
            correct += torch.sum(out == label)
            class_loss = classification_loss(output, label) / output.shape[0]
            loss = class_loss
            if alpha != 0:
                clust_loss = cluster_loss(vgg_feat, model.conv1o1.weight) / output.shape[0]
                loss += alpha * clust_loss

            if beta != 0:
                mix_loss = like[0, label[0]]
                loss += -beta * mix_loss

            # with torch.autograd.set_detect_anomaly(True):
            loss.backward()

            # pseudo batches
            if np.mod(index, batch_size) == 0:  # and index != 0:
                optimizer.step()
                optimizer.zero_grad()

            train_loss += loss.detach() * input.shape[0]

        updated_clutter = update_clutter_model(model, device_ids)
        model.clutter_model = updated_clutter
        scheduler.step()
        train_acc = correct.cpu().item() / total_train
        train_loss = train_loss.cpu().item() / total_train
        out_str = 'Epochs: [{}/{}], Train Acc:{}, Train Loss:{}'.format(
            epoch + 1, epochs, train_acc, train_loss)
        print(out_str)
        out_file.write(out_str)

        # Evaluate Validation images
        model.eval()
        with torch.no_grad():
            correct = 0
            val_accs = []
            for i in range(len(val_loaders)):
                val_loader = val_loaders[i]
                correct_local = 0
                total_local = 0
                val_loss = 0
                out_pred = torch.zeros(len(val_data[i].images))
                for index, data in enumerate(val_loader):
                    input, _, label = data
                    input = input.cuda(device_ids[0])
                    label = label.cuda(device_ids[0])
                    output, _, _ = model(input)
                    out = output.argmax(1)
                    out_pred[index] = out
                    correct_local += torch.sum(out == label)
                    total_local += label.shape[0]
                    class_loss = classification_loss(output, label) / output.shape[0]
                    loss = class_loss
                    val_loss += loss.detach() * input.shape[0]

                correct += correct_local
                val_acc = correct_local.cpu().item() / total_local
                val_loss = val_loss.cpu().item() / total_local
                val_accs.append(val_acc)
                out_str = 'Epochs: [{}/{}], Val-Set {}, Val Acc:{} Val Loss:{}\n'.format(
                    epoch + 1, epochs, i, val_acc, val_loss)
                print(out_str)
                out_file.write(out_str)

            val_acc = np.mean(val_accs)
            out_file.write('Epochs: [{}/{}], Val Acc:{}\n'.format(epoch + 1, epochs, val_acc))

            if val_acc > best_check['val_acc']:
                print('BEST: {}'.format(val_acc))
                out_file.write('BEST: {}\n'.format(val_acc))
                best_check = {
                    'state_dict': model.state_dict(),
                    'val_acc': val_acc,
                    'epoch': epoch
                }
                save_checkpoint(best_check, savedir + 'vc' + str(epoch + 1) + '.pth', True)
            print('\n')

        out_file.close()
    return best_check
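# save_checkpoint() as called above takes (state, filename, is_best) and is defined
# elsewhere. A minimal assumed version that serialises the state dict with torch.save and
# additionally keeps a 'best.pth' copy when is_best is True:
import os
import shutil
import torch


def save_checkpoint(state, filename, is_best):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(os.path.dirname(filename), 'best.pth'))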
losses['validate']['history'].append(validate_loss)

# always save latest checkpoint after an epoch, and flag if best checkpoint
if (epoch + 1) % 5 == 0 or is_best_model:
    print('Saving checkpoint at epoch {}...'.format(epoch + 1))
    logging.info('Saving checkpoint at epoch {}...'.format(epoch + 1))
    model.cpu()
    save_checkpoint(
        {
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'losses': losses,
            'word_embeddings': model.word_embeddings.weight.data.numpy(),
            'pos_embeddings': model.cpu().pos_embeddings.weight.data.numpy(),
            'optimizer': optimizer.state_dict(),
        },
        LATEST_CHECKPOINT_RELATIVE_PATH,
        BEST_CHECKPOINT_RELATIVE_PATH,
        is_best_model)
    if CUDA:
        model.cuda()

if validate_loss > losses['validate']['min']['value'] and \
        epoch - losses['validate']['min']['epoch'] > 10:
    print('Ten epochs with no improvement have passed. Stopping training...')
    break  # assumed continuation: the original snippet is cut off mid-statement here
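# The snippet above assumes a `losses` dict that tracks the running minimum of the
# validation loss. A sketch of that bookkeeping (assumed structure, matching the keys
# used above), typically updated right after validate_loss is computed each epoch:
losses = {
    'validate': {
        'history': [],
        'min': {'value': float('inf'), 'epoch': 0},
    }
}


def update_validation_min(losses, validate_loss, epoch):
    """Return True (new best) and record the minimum when the validation loss improves."""
    if validate_loss < losses['validate']['min']['value']:
        losses['validate']['min'] = {'value': validate_loss, 'epoch': epoch}
        return True
    return False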