Example #1
def train(model,
          criterion,
          optimizer,
          epochs,
          device,
          train_loader,
          valid_loader,
          print_every=60,
          step_track_every=30):
    total_start = time()
    model.epochs = epochs
    save_checkpoint(model, optimizer)
    train_losses, valid_losses = [], []
    steps = 0
    running_loss = 0
    print("==========================")
    print("Starting training of NN...")
    for epoch in range(epochs):
        print("==========================")
        print(f"Starting epoch #{epoch}...")
        start_epoch = start_step = time()
        for inputs, labels in train_loader:
            steps += 1
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            logps = model.forward(inputs)
            loss = criterion(logps, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            if steps % step_track_every == 0:
                print(
                    f"Time in step #{steps}: {(time() - start_step):.3f} seconds"
                )
                start_step = time()
            if steps % print_every == 0:
                start_test = time()
                valid_loss, accuracy = get_valid_loss_and_accuracy(
                    model, valid_loader, device, criterion)
                cur_train_loss = running_loss / print_every  # average training loss since the last report
                running_loss = 0
                train_losses.append(cur_train_loss)
                valid_losses.append(valid_loss)
                print(f"Epoch {epoch}/{epochs}..."
                      f"Training loss: {cur_train_loss:.4f}..."
                      f"Validation loss: {valid_loss:.4f}..."
                      f"Test accuracy: {accuracy:.4f}\n")
                print(
                    f"Time taken to test losses in epoch #{epoch}: {(time() - start_test):.3f} seconds"
                )
        print(f"Time per epoch: {(time() - start_epoch):.3f} seconds")
    print(f"Total training time: {(time() - total_start):.3f} seconds")
    save_checkpoint(model, optimizer)
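
The loop above calls a get_valid_loss_and_accuracy helper that is not shown. A minimal sketch of what such a helper might look like, assuming the model outputs log-probabilities (as the NLL-style setup implies) and that the signature matches the call site:

import torch

def get_valid_loss_and_accuracy(model, valid_loader, device, criterion):
    """Hypothetical helper: average validation loss and overall accuracy."""
    model.eval()
    valid_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logps = model(inputs)  # log-probabilities
            valid_loss += criterion(logps, labels).item()
            correct += (logps.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
    model.train()
    return valid_loss / len(valid_loader), correct / total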
Example #2
def do_run():

    config = get_input_config()

    # set up some globals
    global STANZA
    STANZA = config.get("name")

    http_proxy = config.get("http_proxy")
    https_proxy = config.get("https_proxy")

    proxies = {}

    if http_proxy is not None:
        proxies["http"] = http_proxy
    if https_proxy is not None:
        proxies["https"] = https_proxy

    request_timeout = int(config.get("request_timeout", 30))

    try:
        req_args = {"verify": True, "timeout": float(request_timeout)}
        if proxies:
            req_args["proxies"] = proxies

        # verify/timeout/proxies are keyword arguments to requests.get, not query parameters
        req = requests.get(
            "https://publicdashacc.blob.core.windows.net/publicdata?restype=container&comp=list&prefix=data",
            **req_args)
        xmldom = etree.fromstring(req.content)

        blobs = xmldom.xpath('/EnumerationResults/Blobs/Blob')
        for blob in blobs:
            blob_etag = blob.xpath('Properties/Etag')[0].text
            blob_name = blob.xpath('Name')[0].text
            logging.info("Found file=%s etag=%s" % (blob_name, blob_etag))
            blob_url = "https://publicdashacc.blob.core.windows.net/publicdata/%s" % (
                blob_name)
            if not load_checkpoint(config, blob_etag):
                print("Processing file={}".format(blob_url))
                data_req = requests.get(url=blob_url, **req_args)
                data_json = data_req.json()

                iterate_json_data("overview", data_json, blob_name)
                iterate_json_data("countries", data_json, blob_name)
                iterate_json_data("regions", data_json, blob_name)
                iterate_json_data("utlas", data_json, blob_name)
            logging.info("Marking file={} etag={} as processed".format(
                blob_name, blob_etag))
            save_checkpoint(config, blob_etag)

    except RuntimeError as e:
        logging.error("Looks like an error: %s" % str(e))
        sys.exit(2)
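
The blob loop relies on load_checkpoint and save_checkpoint to skip blobs whose ETag has already been processed. One possible file-based implementation, assuming the input config exposes a checkpoint_dir entry (the real helpers are not shown here):

import os

def load_checkpoint(config, etag):
    # Hypothetical helper: True if this ETag has already been processed.
    checkpoint_dir = config.get("checkpoint_dir", ".")
    return os.path.isfile(os.path.join(checkpoint_dir, etag))

def save_checkpoint(config, etag):
    # Hypothetical helper: mark an ETag as processed by touching a marker file.
    checkpoint_dir = config.get("checkpoint_dir", ".")
    with open(os.path.join(checkpoint_dir, etag), "w"):
        pass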
Example #3
def main():
    in_args = train_input_args()
    train_dir = in_args.data_dir + '/train'
    valid_dir = in_args.data_dir + '/valid'

    train_transforms = transforms.Compose([
        transforms.RandomRotation(30),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    valid_transforms = transforms.Compose([
        transforms.Resize(255),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    train_data = datasets.ImageFolder(train_dir, transform=train_transforms)
    valid_data = datasets.ImageFolder(valid_dir, transform=valid_transforms)

    trainloader = torch.utils.data.DataLoader(train_data,
                                              batch_size=64,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_data, batch_size=64)
    dataloader = [trainloader, validloader]

    if in_args.arch == 'densenet121':
        model = models.densenet121(pretrained=True)
        classifier = nn.Sequential(
            OrderedDict([("fc1", nn.Linear(1024, in_args.hidden_units)),
                         ("ReLU1", nn.ReLU()), ("dropout", nn.Dropout(0.5)),
                         ("fc2", nn.Linear(in_args.hidden_units, 102)),
                         ("output", nn.LogSoftmax(dim=1))]))
    elif in_args.arch == 'vgg16':
        model = models.vgg16(pretrained=True)
        classifier = nn.Sequential(
            OrderedDict([("fc1", nn.Linear(25088, in_args.hidden_units)),
                         ("ReLU1", nn.ReLU()), ("dropout", nn.Dropout(0.5)),
                         ("fc2", nn.Linear(in_args.hidden_units, 102)),
                         ("output", nn.LogSoftmax(dim=1))]))
    else:
        raise ValueError("Unsupported architecture: {}".format(in_args.arch))
    # Freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = False
    model.classifier = classifier
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(),
                           lr=in_args.learning_rate)
    device = torch.device('cuda' if in_args.gpu == 'gpu' else 'cpu')
    train(model, in_args, dataloader, optimizer, criterion, device)
    model.class_to_idx = train_data.class_to_idx
    save_checkpoint(in_args.save_dir, optimizer, in_args, classifier, model)
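
The final save_checkpoint call is not defined in this snippet. A sketch of a checkpoint writer that matches the argument order used above (the field names are assumptions, not the original format):

import torch

def save_checkpoint(save_dir, optimizer, in_args, classifier, model):
    # Hypothetical helper: bundle everything needed to rebuild the model later.
    checkpoint = {
        'arch': in_args.arch,
        'hidden_units': in_args.hidden_units,
        'learning_rate': in_args.learning_rate,
        'classifier': classifier,
        'state_dict': model.state_dict(),
        'class_to_idx': model.class_to_idx,
        'optimizer_state': optimizer.state_dict(),
    }
    torch.save(checkpoint, save_dir + '/checkpoint.pth')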
Example #4
def main(train_set, learning_rate, n_epochs, batch_size, num_workers, hidden_size, model_file,
         cuda, checkpoint_interval, seed, n_disc):

    #  make data between -1 and 1
    data_transform = transforms.Compose([transforms.ToTensor(),
                                         transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])

    train_dataset = datasets.ImageFolder(root=os.path.join(os.getcwd(), train_set),
                                         transform=data_transform)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, num_workers=num_workers,
                                  drop_last=True)

    # initialize model
    if model_file:
        try:
            total_examples, fixed_noise, gen_losses, disc_losses, gen_loss_per_epoch, \
            disc_loss_per_epoch, prev_epoch, gen, disc = load_model_wgan(model_file, hidden_size)  # TODO: upsampling method?
            print('model loaded successfully!')

        except:
            print('could not load model! creating new model...')
            model_file = None

    if not model_file:
        print('creating new model...')

        gen = Generator(hidden_dim=hidden_size, leaky=0.2)
        disc = Discriminator(leaky=0.2)

        gen.weight_init(mean=0, std=0.02)
        disc.weight_init(mean=0, std=0.02)

        total_examples = 0
        disc_losses = []
        gen_losses = []
        disc_loss_per_epoch = []
        gen_loss_per_epoch = []
        prev_epoch = 0

        #  Sample minibatch of m noise samples from noise prior p_g(z) and transform
        if cuda:
            fixed_noise = Variable(torch.randn(9, hidden_size).cuda())
        else:
            fixed_noise = Variable(torch.randn(9, hidden_size))  # randn to match the CUDA branch

    if cuda:
        gen.cuda()
        disc.cuda()

    # RMSprop optimizer (as used in the original WGAN)
    gen_optimizer = optim.RMSprop(gen.parameters(), lr=learning_rate, eps=1e-8)
    disc_optimizer = optim.RMSprop(disc.parameters(), lr=learning_rate, eps=1e-8)

    # results save folder
    gen_images_dir = 'results/wgan_generated_images'
    train_summaries_dir = 'results/wgan_training_summaries'
    checkpoint_dir = 'results/wgan_checkpoints'
    if not os.path.isdir('results'):
        os.mkdir('results')
    if not os.path.isdir(gen_images_dir):
        os.mkdir(gen_images_dir)
    if not os.path.isdir(train_summaries_dir):
        os.mkdir(train_summaries_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    np.random.seed(seed)  # reset training seed to ensure that batches remain the same between runs!

    try:
        for epoch in range(prev_epoch, n_epochs):
            disc_losses_epoch = []
            gen_losses_epoch = []
            for idx, (true_batch, _) in enumerate(train_dataloader):
                disc.zero_grad()

                #  Sample  minibatch  of examples from data generating distribution
                if cuda:
                    true_batch = Variable(true_batch.cuda())
                else:
                    true_batch = Variable(true_batch)

                # discriminator on true data
                true_disc_output = disc.forward(true_batch)

                #  Sample minibatch of m noise samples from noise prior p_g(z) and transform
                if cuda:
                    z = Variable(torch.randn(batch_size, hidden_size).cuda())
                else:
                    z = Variable(torch.randn(batch_size, hidden_size))

                # discriminator on fake data
                fake_batch = gen.forward(z.view(-1, hidden_size, 1, 1))
                fake_disc_output = disc.forward(
                    fake_batch.detach())  # detach so gradients not computed for generator

                # Optimize with new loss function
                disc_loss = wgan_Dloss(true_disc_output, fake_disc_output)
                disc_loss.backward()
                disc_optimizer.step()

                # Weight clipping as done by WGAN
                for p in disc.parameters():
                    p.data.clamp_(-0.01, 0.01)

                #  Store losses
                disc_losses_epoch.append(disc_loss.data[0])

                # Train generator after the discriminator has been trained n_disc times
                if (idx+1) % n_disc == 0:
                    gen.zero_grad()

                    # Sample minibatch of m noise samples from noise prior p_g(z) and transform
                    if cuda:
                        z = Variable(torch.randn(batch_size, hidden_size).cuda())
                    else:
                        z = Variable(torch.randn(batch_size, hidden_size))  # randn to match the CUDA branch

                    # train generator
                    fake_batch = gen.forward(z.view(-1, hidden_size, 1, 1))
                    fake_disc_output = disc.forward(fake_batch)

                    # Optimize generator
                    gen_loss = wgan_Gloss(fake_disc_output)
                    gen_loss.backward()
                    gen_optimizer.step()

                    # Store losses
                    gen_losses_epoch.append(gen_loss.data[0])

                if (total_examples != 0) and (total_examples % (n_disc * 4) == 0):
                    print('epoch {}: step {}/{} disc loss: {:.4f}, gen loss: {:.4f}'
                          .format(epoch + 1, idx + 1, len(train_dataloader), disc_loss.data[0], gen_loss.data[0]))

                # Checkpoint model
                total_examples += batch_size
                if (checkpoint_interval != 0) and (total_examples % checkpoint_interval == 0):
                    disc_losses.extend(disc_losses_epoch)
                    gen_losses.extend(gen_losses_epoch)
                    save_checkpoint(total_examples=total_examples, fixed_noise=fixed_noise, disc=disc, gen=gen,
                                    gen_losses=gen_losses, disc_losses=disc_losses,
                                    disc_loss_per_epoch=disc_loss_per_epoch,
                                    gen_loss_per_epoch=gen_loss_per_epoch, epoch=epoch, directory=checkpoint_dir)
                    print("Checkpoint saved!")

                    #  sample images for inspection
                    save_image_sample(batch=gen.forward(fixed_noise.view(-1, hidden_size, 1, 1)),
                                      cuda=cuda, total_examples=total_examples, directory=gen_images_dir)
                    print("Saved images!")

                    # save learning curves for inspection
                    save_learning_curve(gen_losses=gen_losses, disc_losses=disc_losses,
                                        total_examples=total_examples, directory=train_summaries_dir)
                    print("Saved learning curves!")

            disc_loss_per_epoch.append(np.average(disc_losses_epoch))
            gen_loss_per_epoch.append(np.average(gen_losses_epoch))

            # Save epoch learning curve
            save_learning_curve_epoch(gen_losses=gen_loss_per_epoch, disc_losses=disc_loss_per_epoch,
                                      total_epochs=epoch + 1, directory=train_summaries_dir)
            print("Saved learning curves!")

            print('epoch {}/{} disc loss: {:.4f}, gen loss: {:.4f}'
                  .format(epoch + 1, n_epochs, np.array(disc_losses_epoch).mean(), np.array(gen_losses_epoch).mean()))

            disc_losses.extend(disc_losses_epoch)
            gen_losses.extend(gen_losses_epoch)

    except KeyboardInterrupt:
        print("Saving before quit...")
        save_checkpoint(total_examples=total_examples, fixed_noise=fixed_noise, disc=disc, gen=gen,
                        disc_loss_per_epoch=disc_loss_per_epoch,
                        gen_loss_per_epoch=gen_loss_per_epoch,
                        gen_losses=gen_losses, disc_losses=disc_losses, epoch=epoch, directory=checkpoint_dir)
        print("Checkpoint saved!")

        # sample images for inspection
        save_image_sample(batch=gen.forward(fixed_noise.view(-1, hidden_size, 1, 1)),
                          cuda=cuda, total_examples=total_examples, directory=gen_images_dir)
        print("Saved images!")

        # save learning curves for inspection
        save_learning_curve(gen_losses=gen_losses, disc_losses=disc_losses,
                            total_examples=total_examples, directory=train_summaries_dir)
        print("Saved learning curves!")
Example #5
def train_and_eval(model,
                   train_loader,
                   valid_loader,
                   learning_rate,
                   epochs,
                   model_outdir,
                   wts,
                   task,
                   metrics_every_iter,
                   restore_chkpt=None,
                   run_suffix=None):
    """
    Runs the core training and validation loops via run_model().
    All parameters from the command line/json are parsed and then passed into run_model().
    Checkpoints every epoch: the latest state is saved as 'last.pth.tar' and the best model so far
    (based on validation loss) is saved as 'best.pth.tar'.

    :param model: (nn.Module)
    :param train_loader: (torch DataLoader)
    :param valid_loader: (torch DataLoader)
    :param learning_rate: (float) - the learning rate, defaults to 1e-05
    :param epochs: (int) - the number of epochs
    :param wts: (tensor) - class weights
    :param model_outdir: (str) - the output directory for checkpointing, checkpoints will be saved as output_dir/task/view/*.tar
    :param restore_chkpt: (str) - the directory to reload the checkpoint, if specified
    :param run_suffix: (str) - suffix to be appended to the event file
    :return:
    """

    # model_outdir looks like output/task/my_run
    # go three levels up, putting the log folder at the same level as the output dir; two levels up would put it inside output/
    recover_root_dir = os.path.dirname(
        os.path.dirname(os.path.dirname(
            model_outdir)))  # removes the task and view from the directory
    log_dir = os.path.join(recover_root_dir, "logs")
    run_name = re.split(r'/|\\', model_outdir)[-1]
    # task = re.split(r'/|\\', model_outdir)[-2]
    # have log folder naming structure same as models
    log_fn = os.path.join(log_dir, task, run_name)

    dtnow = datetime.now()
    # dtnow.strftime("%Y%m%d_%H%M%S")
    log_fn = os.path.join(log_fn, dtnow.strftime("%Y_%m_%d-%H_%M_%S"))

    # make directory if it doesn't exist.
    if not os.path.exists(log_fn):
        os.makedirs(log_fn)
        print('{} does not exist, creating..!'.format(log_fn))
    else:
        print('{} already exists!'.format(log_fn))

    # each tensorboard event file should ideally be saved to a unique folder, else the resulting graph will look like
    # it's time traveling because of overlapping logs
    # if run_suffix:
    #     writer = tf.summary.create_file_writer(log_fn, filename_suffix=run_suffix)
    # else:
    #     writer = tf.summary.create_file_writer(log_fn)

    # use cpu or cuda depending on availability
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    current_best_val_loss = float('Inf')
    # the optimizer (like the model) must be created outside the epoch loop, otherwise its state would reset every epoch
    optimizer = torch.optim.Adam(model.parameters(),
                                 learning_rate,
                                 weight_decay=0.01)
    # taken directly from MRNet code
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=5,  # how many epochs to wait for before acting
        factor=0.3,  # factor to reduce LR by, LR = factor * LR
        threshold=1e-4)  # threshold to measure new optimum

    # weight loss by training class positive weights, if use_wts is False then no weights are applied
    # criterion_d = {'bladder': torch.nn.BCEWithLogitsLoss(), 'view': torch.nn.CrossEntropyLoss(), 'granular':torch.nn.CrossEntropyLoss()}
    if wts is None:
        criterion = torch.nn.CrossEntropyLoss()
    else:
        wts = wts.to(device)
        criterion = torch.nn.CrossEntropyLoss(weight=wts)

    # # TODO: reloading checkpoint
    # if restore_chkpt:
    #     logging.info("Restoring Checkpoint from {}".format(restore_chkpt))
    #     helpers.load_checkpoint(checkpoint = restore_chkpt,
    #                             model = model,
    #                             optimizer = optimizer,
    #                             # scheduler = scheduler,
    #                             epochs = epochs)
    #     # so epochs - loaded_epoch is where we would need to start, right?
    #     logging.info("Starting again at Epoch {}....".format(epochs))
    #     logging.info("Finished Restoring Checkpoint...")

    for epoch in range(epochs):
        logging.info('[Epoch {}]'.format(epoch + 1))
        # main training loop
        epoch_loss, epoch_preds, epoch_labels = run_model(
            model=model,
            loader=train_loader,
            optimizer=optimizer,
            criterion=criterion,
            metrics_every_iter=metrics_every_iter,
            train=True)
        logging.info('[Epoch {}]\t\t Training Average Loss: {:.5f}'.format(
            epoch + 1, epoch_loss))

        # logging.info('[Epoch {}]\t\tTraining Balanced Accuracy: {:.3f}\t Training Average Loss: {:.5f}'.format(epoch + 1, epoch_auc, epoch_loss))
        # main validation loop
        epoch_val_loss, epoch_val_preds, epoch_val_labels = run_model(
            model=model,
            loader=valid_loader,
            optimizer=optimizer,
            criterion=criterion,
            metrics_every_iter=
            False,  # default, just show the epoch validation metrics..
            train=False)

        logging.info('[Epoch {}]\t\t Validation Average Loss: {:.5f}'.format(
            epoch + 1, epoch_val_loss))

        # logging.info('[Epoch {}]\t\tValidation Balanced Accuracy: {:.3f}\t Validation Average Loss: {:.5f}'.format(epoch + 1, epoch_val_acc, epoch_val_loss))
        scheduler.step(epoch_val_loss)  # step once per epoch on the validation loss
        logging.info('[Epoch {}]\t\tOptimizer Learning Rate: {}'.format(
            epoch + 1, optimizer.param_groups[0]['lr']))

        # with writer.as_default():
        #     tf.summary.scalar('Loss/train', epoch_loss, epoch + 1)
        #     tf.summary.scalar('Loss/val', epoch_val_loss, epoch + 1)
        # tf.summary.scalar('BACC/train', epoch_acc, epoch + 1)
        # tf.summary.scalar('BACC/val', epoch_val_acc, epoch + 1)

        # check whether the most recent epoch loss is better than previous best
        # is_best_val_auc = epoch_val_auc >= current_best_val_auc
        is_best_val_loss = epoch_val_loss < current_best_val_loss

        # save state in a dictionary
        state = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            # 'validation_acc': epoch_val_acc,
            'best_validation_loss': epoch_val_loss,
            # 'metrics': metrics # read more into this
            'scheduler_dict': scheduler.state_dict(),
            'optim_dict': optimizer.state_dict()
        }

        # save as last epoch
        helpers.save_checkpoint(state,
                                is_best=is_best_val_loss,
                                checkpoint_dir=model_outdir)
        # epoch = epoch + 1)

        if is_best_val_loss:
            # set new best validation loss
            # current_best_val_auc = epoch_val_auc
            current_best_val_loss = epoch_val_loss
            # logging.info('[Epoch {}]\t\t******New Best Validation:\t AUC: {:.3f}******'.format(epoch + 1, epoch_val_auc))
            logging.info(
                '[Epoch {}]\t\t******New Best Validation Loss: {:.3f}******'.
                format(epoch + 1, epoch_val_loss))
            helpers.save_checkpoint(state,
                                    is_best=is_best_val_loss,
                                    checkpoint_dir=model_outdir)
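
helpers.save_checkpoint is not shown. Going by the docstring (latest state in 'last.pth.tar', best state copied to 'best.pth.tar'), a minimal sketch could look like this; the real helper may store more metadata:

import os
import shutil
import torch

def save_checkpoint(state, is_best, checkpoint_dir):
    # Hypothetical helper: always write the latest state, copy it when it is the best so far.
    os.makedirs(checkpoint_dir, exist_ok=True)
    last_path = os.path.join(checkpoint_dir, 'last.pth.tar')
    torch.save(state, last_path)
    if is_best:
        shutil.copyfile(last_path, os.path.join(checkpoint_dir, 'best.pth.tar'))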
Example #6
def main(train_set, learning_rate, n_epochs, beta_0, beta_1, batch_size,
         num_workers, hidden_size, model_file, cuda, display_result_every,
         checkpoint_interval, seed, label_smoothing, grad_clip, dropout,
         upsampling):

    #  make data between -1 and 1
    data_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])

    train_dataset = datasets.ImageFolder(root=os.path.join(
        os.getcwd(), train_set),
                                         transform=data_transform)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  drop_last=True)

    # initialize model
    if model_file:
        try:
            total_examples, fixed_noise, gen_losses, disc_losses, gen_loss_per_epoch, \
            disc_loss_per_epoch, prev_epoch, gen, disc = load_model(model_file, hidden_size, upsampling, cuda)
            print('model loaded successfully!')
        except:
            print('could not load model! creating new model...')
            model_file = None

    if not model_file:
        print('creating new model...')
        if upsampling == 'transpose':
            from models.model import Generator, Discriminator
        elif upsampling == 'nn':
            from models.model_nn import Generator, Discriminator
        elif upsampling == 'bilinear':
            from models.model_bilinear import Generator, Discriminator

        gen = Generator(hidden_dim=hidden_size, leaky=0.2, dropout=dropout)
        disc = Discriminator(leaky=0.2, dropout=dropout)

        gen.weight_init(mean=0, std=0.02)
        disc.weight_init(mean=0, std=0.02)

        total_examples = 0
        disc_losses = []
        gen_losses = []
        disc_loss_per_epoch = []
        gen_loss_per_epoch = []
        prev_epoch = 0

        #  Sample minibatch of m noise samples from noise prior p_g(z) and transform
        if cuda:
            fixed_noise = Variable(torch.randn(9, hidden_size).cuda())
        else:
            fixed_noise = Variable(torch.randn(9, hidden_size))  # randn to match the CUDA branch

    if cuda:
        gen.cuda()
        disc.cuda()

    # Binary Cross Entropy loss
    BCE_loss = nn.BCELoss()

    # Adam optimizer
    gen_optimizer = optim.Adam(gen.parameters(),
                               lr=learning_rate,
                               betas=(beta_0, beta_1),
                               eps=1e-8)
    disc_optimizer = optim.Adam(disc.parameters(),
                                lr=learning_rate,
                                betas=(beta_0, beta_1),
                                eps=1e-8)

    # results save folder
    gen_images_dir = 'results/generated_images'
    train_summaries_dir = 'results/training_summaries'
    checkpoint_dir = 'results/checkpoints'
    if not os.path.isdir('results'):
        os.mkdir('results')
    if not os.path.isdir(gen_images_dir):
        os.mkdir(gen_images_dir)
    if not os.path.isdir(train_summaries_dir):
        os.mkdir(train_summaries_dir)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    np.random.seed(
        seed
    )  # reset training seed to ensure that batches remain the same between runs!

    try:
        for epoch in range(prev_epoch, n_epochs):
            disc_losses_epoch = []
            gen_losses_epoch = []
            for idx, (true_batch, _) in enumerate(train_dataloader):
                disc.zero_grad()

                #  hack 6 of https://github.com/soumith/ganhacks
                if label_smoothing:
                    true_target = torch.FloatTensor(batch_size).uniform_(
                        0.7, 1.2)
                else:
                    true_target = torch.ones(batch_size)

                #  Sample  minibatch  of examples from data generating distribution
                if cuda:
                    true_batch = Variable(true_batch.cuda())
                    true_target = Variable(true_target.cuda())
                else:
                    true_batch = Variable(true_batch)
                    true_target = Variable(true_target)

                #  train discriminator on true data
                true_disc_result = disc.forward(true_batch)
                disc_train_loss_true = BCE_loss(true_disc_result.squeeze(),
                                                true_target)
                disc_train_loss_true.backward()
                torch.nn.utils.clip_grad_norm(disc.parameters(), grad_clip)

                #  Sample minibatch of m noise samples from noise prior p_g(z) and transform
                if label_smoothing:
                    fake_target = torch.FloatTensor(batch_size).uniform_(
                        0, 0.3)
                else:
                    fake_target = torch.zeros(batch_size)

                if cuda:
                    z = Variable(torch.randn(batch_size, hidden_size).cuda())
                    fake_target = Variable(fake_target.cuda())
                else:
                    z = Variable(torch.randn(batch_size, hidden_size))
                    fake_target = Variable(fake_target)

                #  train discriminator on fake data
                fake_batch = gen.forward(z.view(-1, hidden_size, 1, 1))
                fake_disc_result = disc.forward(fake_batch.detach(
                ))  # detach so gradients not computed for generator
                disc_train_loss_false = BCE_loss(fake_disc_result.squeeze(),
                                                 fake_target)
                disc_train_loss_false.backward()
                torch.nn.utils.clip_grad_norm(disc.parameters(), grad_clip)
                disc_optimizer.step()

                #  compute performance statistics
                disc_train_loss = disc_train_loss_true + disc_train_loss_false
                disc_losses_epoch.append(disc_train_loss.data[0])

                disc_fake_accuracy = 1 - fake_disc_result.mean().data[0]
                disc_true_accuracy = true_disc_result.mean().data[0]

                #  Sample minibatch of m noise samples from noise prior p_g(z) and transform
                if label_smoothing:
                    true_target = torch.FloatTensor(batch_size).uniform_(
                        0.7, 1.2)
                else:
                    true_target = torch.ones(batch_size)

                if cuda:
                    z = Variable(torch.randn(batch_size, hidden_size).cuda())
                    true_target = Variable(true_target.cuda())
                else:
                    z = Variable(torch.randn(batch_size, hidden_size))  # randn to match the CUDA branch
                    true_target = Variable(true_target)

                # train generator
                gen.zero_grad()
                fake_batch = gen.forward(z.view(-1, hidden_size, 1, 1))
                disc_result = disc.forward(fake_batch)
                gen_train_loss = BCE_loss(disc_result.squeeze(), true_target)

                gen_train_loss.backward()
                torch.nn.utils.clip_grad_norm(gen.parameters(), grad_clip)
                gen_optimizer.step()
                gen_losses_epoch.append(gen_train_loss.data[0])

                if (total_examples != 0) and (total_examples %
                                              display_result_every == 0):
                    print(
                        'epoch {}: step {}/{} disc true acc: {:.4f} disc fake acc: {:.4f} '
                        'disc loss: {:.4f}, gen loss: {:.4f}'.format(
                            epoch + 1, idx + 1, len(train_dataloader),
                            disc_true_accuracy, disc_fake_accuracy,
                            disc_train_loss.data[0], gen_train_loss.data[0]))

                # Checkpoint model
                total_examples += batch_size
                if (total_examples != 0) and (total_examples %
                                              checkpoint_interval == 0):

                    disc_losses.extend(disc_losses_epoch)
                    gen_losses.extend(gen_losses_epoch)
                    save_checkpoint(total_examples=total_examples,
                                    fixed_noise=fixed_noise,
                                    disc=disc,
                                    gen=gen,
                                    gen_losses=gen_losses,
                                    disc_losses=disc_losses,
                                    disc_loss_per_epoch=disc_loss_per_epoch,
                                    gen_loss_per_epoch=gen_loss_per_epoch,
                                    epoch=epoch,
                                    directory=checkpoint_dir)
                    print("Checkpoint saved!")

                    #  sample images for inspection
                    save_image_sample(batch=gen.forward(
                        fixed_noise.view(-1, hidden_size, 1, 1)),
                                      cuda=cuda,
                                      total_examples=total_examples,
                                      directory=gen_images_dir)
                    print("Saved images!")

                    # save learning curves for inspection
                    save_learning_curve(gen_losses=gen_losses,
                                        disc_losses=disc_losses,
                                        total_examples=total_examples,
                                        directory=train_summaries_dir)
                    print("Saved learning curves!")

            disc_loss_per_epoch.append(np.average(disc_losses_epoch))
            gen_loss_per_epoch.append(np.average(gen_losses_epoch))

            # Save epoch learning curve
            save_learning_curve_epoch(gen_losses=gen_loss_per_epoch,
                                      disc_losses=disc_loss_per_epoch,
                                      total_epochs=epoch + 1,
                                      directory=train_summaries_dir)
            print("Saved learning curves!")

            print('epoch {}/{} disc loss: {:.4f}, gen loss: {:.4f}'.format(
                epoch + 1, n_epochs,
                np.array(disc_losses_epoch).mean(),
                np.array(gen_losses_epoch).mean()))

            disc_losses.extend(disc_losses_epoch)
            gen_losses.extend(gen_losses_epoch)

    except KeyboardInterrupt:
        print("Saving before quit...")
        save_checkpoint(total_examples=total_examples,
                        fixed_noise=fixed_noise,
                        disc=disc,
                        gen=gen,
                        disc_loss_per_epoch=disc_loss_per_epoch,
                        gen_loss_per_epoch=gen_loss_per_epoch,
                        gen_losses=gen_losses,
                        disc_losses=disc_losses,
                        epoch=epoch,
                        directory=checkpoint_dir)
        print("Checkpoint saved!")

        # sample images for inspection
        save_image_sample(batch=gen.forward(
            fixed_noise.view(-1, hidden_size, 1, 1)),
                          cuda=cuda,
                          total_examples=total_examples,
                          directory=gen_images_dir)
        print("Saved images!")

        # save learning curves for inspection
        save_learning_curve(gen_losses=gen_losses,
                            disc_losses=disc_losses,
                            total_examples=total_examples,
                            directory=train_summaries_dir)
        print("Saved learning curves!")
Example #7
def train_and_eval(model, train_loader, valid_loader, learning_rate, epochs,
                   model_outdir, #pos_wt,
                   metrics_every_iter, task, tensorboard = False,
                   restore_chkpt = None, run_suffix = None):
    """
    Runs the core training and validation loops via run_model().
    All parameters from the command line/json are parsed and then passed into run_model().
    Checkpoints every epoch: the latest state is saved as 'last.pth.tar' and the best model so far
    (based on validation loss) is saved as 'best.pth.tar'.

    :param model: (nn.Module) -
    :param train_loader: (torch DataLoader)
    :param valid_loader: (torch DataLoader)
    :param learning_rate: (float) - the learning rate, defaults to 1e-05
    :param epochs: (int) - the number of epochs
    :param model_outdir: (str) - the output directory for checkpointing, checkpoints will be saved as output_dir/task/view/*.tar
    :param restore_chkpt: (str) - the directory to reload the checkpoint, if specified
    :param run_suffix: (str) - suffix to be appended to the event file. removed for now.
    :return:
    """

    log_fn = helpers.create_tb_log_dir(model_outdir)

    log_fn = log_fn.lstrip("/")  # remove the leading forward slash, which messes up the tf log path

    if tensorboard:
        import tensorflow as tf
        writer = tf.summary.create_file_writer(log_fn)  # tf 2.0+
        # writer = tf.compat.v1.summary.FileWriter(log_fn)  # tf 1.15 alternative; use one or the other

    current_best_val_loss = float('Inf')
    optimizer = torch.optim.Adam(model.parameters(), learning_rate, weight_decay=0.01)
    # taken directly from MRNet code
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience = 5, # how many epochs to wait for before acting
                                                           factor = 0.3, # factor to reduce LR by, LR = factor * LR
                                                           threshold = 1e-4)  # threshold to measure new optimum
    #
    losses = {'regression': torch.nn.MSELoss(),
              'classification': torch.nn.BCEWithLogitsLoss(),
              'multitask': torch.nn.MSELoss()}

    #
    criterion = losses[task]
    print(criterion)

    metric = {'regression':'MSE',
              'classification':'AUC',
              'multitask': 'MSE'} # Steve: Seems like this is mostly for logging?

    # # TODO: reloading checkpoint
    # if restore_chkpt:
    #     logging.info("Restoring Checkpoint from {}".format(restore_chkpt))
    #     helpers.load_checkpoint(checkpoint = restore_chkpt,
    #                             model = model,
    #                             optimizer = optimizer,
    #                             # scheduler = scheduler,
    #                             epochs = epochs)
    #     # print(loaded_epoch)
    #     # so epochs - loaded_epoch is where we would need to start, right?
    #     logging.info("Starting again at Epoch {}....".format(epochs))
    #     logging.info("Finished Restoring Checkpoint...")


    for epoch in range(epochs):
        logging.info('[Epoch {}]'.format(epoch + 1))

        # main training loop
        epoch_loss, epoch_metric, epoch_preds, epoch_labels, train_df = run_model(
                                               model = model,
                                               loader = train_loader,
                                               optimizer = optimizer,
                                               criterion = criterion,
                                               metrics_every_iter  = metrics_every_iter,
                                               task = task,
                                               tensorboard = tensorboard,
                                               train = True)

        logging.info('[Epoch {}]\t\tTraining {}: {:.3f}\t Training Average Loss: {:.5f}'\
                     .format(epoch + 1, metric[task], epoch_metric, epoch_loss))

        # main validation loop
        epoch_val_loss, epoch_val_metric, epoch_val_preds, epoch_val_labels, val_df = run_model(model = model,
                                                             loader = valid_loader,
                                                             optimizer = optimizer,
                                                             criterion = criterion,
                                                             task = task,
                                                             tensorboard = tensorboard,
                                                             metrics_every_iter = False, # default, just show the epoch validation metrics..
                                                             train = False)

        logging.info('[Epoch {}]\t\tValidation {}: {:.3f}\t Validation Average Loss: {:.5f}'.format(epoch + 1, metric[task], epoch_val_metric, epoch_val_loss))
        scheduler.step(epoch_val_loss)  # step once per epoch on the validation loss
        logging.info('[Epoch {}]\t\tOptimizer Learning Rate: {}'.format(epoch + 1, optimizer.param_groups[0]['lr']))

        # with writer:#.as_default():
        # temp = torch.tensor([epoch + 1]) # needs to be a tesor in tf v1.5?
        # writer.add_summary(tf.compat.v1.summary.scalar('Loss/train', epoch_loss), temp).eval()
        # writer.add_summary(tf.compat.v1.summary.scalar('Loss/val', epoch_val_loss), temp).eval()
        # writer.add_summary(tf.compat.v1.summary.scalar('{}/train'.format(metric[task]), epoch_metric), temp).eval()
        # writer.add_summary(tf.compat.v1.summary.scalar('{}/val'.format(metric[task]), epoch_val_metric), temp).eval()
        # writer_flush = writer.flush()
        # with writer.as_default():
        #     tf.summary.scalar('Loss/train', epoch_loss, epoch + 1)
        #     tf.summary.scalar('Loss/val', epoch_val_loss, epoch + 1)
        #     tf.summary.scalar('{}/train'.format(metric[task]), epoch_metric, epoch + 1)
        #     tf.summary.scalar('{}/val'.format(metric[task]), epoch_val_metric, epoch + 1)


        print('Loss/train: {} for epoch: {}'.format(str(epoch_loss), str(epoch + 1)))
        print('Loss/val: {} for epoch: {}'.format(str(epoch_val_loss), str(epoch + 1)))
        print('{}/train: {} for epoch: {}'.format(metric[task], str(epoch_metric), str(epoch + 1)))
        print('{}/val: {} for epoch: {}'.format(metric[task], str(epoch_val_metric), str(epoch + 1)))


        # check whether the most recent epoch loss is better than previous best
        is_best_val_loss = epoch_val_loss < current_best_val_loss

        # save state in a dictionary
        state = {'epoch': epoch + 1,
                 'state_dict': model.state_dict(),
                 'validation_metric': epoch_val_metric,
                 'metric': metric[task],
                 'best_validation_loss': epoch_val_loss,
                 # 'metrics': metrics # read more into this
                 'scheduler_dict': scheduler.state_dict(),
                 'optim_dict': optimizer.state_dict()}

        # save as last epoch
        helpers.save_checkpoint(state,
                                is_best = is_best_val_loss,
                                checkpoint_dir = model_outdir)

        if is_best_val_loss:
            current_best_val_loss = epoch_val_loss
            logging.info('[Epoch {}]\t\t******New Best Validation Loss: {:.3f}******'.format(epoch + 1, epoch_val_loss))
            helpers.save_checkpoint(state,
                                  is_best = is_best_val_loss,
                                  checkpoint_dir = model_outdir)
            #if task == 'multitask': # Steven: Seems like this should work the same if doing regression or classification. I'll try doing the same for regression by commenting out this if statement.
            #    train_df.to_csv(os.path.join(model_outdir, 'best_epoch_training_results.csv'))
            #    val_df.to_csv(os.path.join(model_outdir, 'best_epoch_validation_results.csv'))
            train_df.to_csv(os.path.join(model_outdir, 'best_epoch_training_results.csv'))
            val_df.to_csv(os.path.join(model_outdir, 'best_epoch_validation_results.csv'))
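
helpers.create_tb_log_dir is not shown; judging from the inline logic in example #5 it builds a timestamped log directory next to the model output. A speculative sketch under that assumption (the path layout and task extraction are guesses):

import os
import re
from datetime import datetime

def create_tb_log_dir(model_outdir):
    # Hypothetical helper: <root>/logs/<task>/<run_name>/<timestamp>, mirroring example #5.
    parts = re.split(r'/|\\', model_outdir)
    run_name, task = parts[-1], parts[-2]
    root_dir = os.path.dirname(os.path.dirname(os.path.dirname(model_outdir)))
    log_fn = os.path.join(root_dir, "logs", task, run_name,
                          datetime.now().strftime("%Y_%m_%d-%H_%M_%S"))
    os.makedirs(log_fn, exist_ok=True)
    return log_fn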
Example #8
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    # if args.pretrained:
    #     print("=> using pre-trained model '{}'".format(args.arch))
    #     model = models.__dict__[args.arch](pretrained=True)
    #     model = autofit(model, args.arch, args.num_classes)
    # else:
    #     print("=> creating model '{}'".format(args.arch))
    #     model = models.__dict__[args.arch](num_classes=args.num_classes)
    model = AutoFitNet(arch=args.arch,
                       pretrained=args.pretrained,
                       num_classes=args.num_classes)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    testdir = os.path.join(args.data, 'test')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # train_dataset = datasets.ImageFolder(
    #     traindir,
    #     transforms.Compose([
    #         transforms.Resize(256),
    #         transforms.RandomResizedCrop(224),
    #         # transforms.RandomHorizontalFlip(),
    #         transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
    #         transforms.ToTensor(),
    #         normalize,
    #     ]))
    train_dataset = CityFuncDataset(
        traindir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.1,
                                   contrast=0.1,
                                   saturation=0.1,
                                   hue=0.1),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(CityFuncDataset(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.test:
        test_loader = torch.utils.data.DataLoader(CityFuncDataset(
            testdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
                                                  batch_size=args.batch_size,
                                                  shuffle=False,
                                                  num_workers=args.workers,
                                                  pin_memory=True)
        validate(test_loader, model, criterion, args)
        return

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    epoch_time = AverageMeter('Time', ':6.3f', 's')
    end = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        # learning rate decay
        adjust_learning_rate(optimizer, epoch, args.lr)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)

        # measure elapsed time
        epoch_time.update(time.time() - end)
        eta = (args.epochs - epoch - 1) * epoch_time.avg
        eta_str = str(datetime.timedelta(seconds=int(eta)))
        print(
            'Epoch: [{epoch:d}]\tTime:{time:6.3f}s\tETA:{eta:6.3f}s ({eta_str:s})'
            .format(epoch=epoch, time=epoch_time.val, eta=eta,
                    eta_str=eta_str))
        end = time.time()
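
adjust_learning_rate is called once per epoch but not defined in this snippet; in the reference PyTorch ImageNet example it applies a step decay. A sketch matching the three-argument call used here, with the 30-epoch interval as an assumption:

def adjust_learning_rate(optimizer, epoch, initial_lr):
    # Hypothetical step decay: divide the initial learning rate by 10 every 30 epochs.
    lr = initial_lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr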
Example #9
def train(model,
          train_data,
          val_data,
          epochs,
          batch_size,
          learning_rate,
          savedir,
          alpha=3,
          beta=3,
          vc_flag=True,
          mix_flag=False):
    best_check = {'epoch': 0, 'best': 0, 'val_acc': 0}
    out_file_name = savedir + 'result.txt'
    total_train = len(train_data)
    train_loader = DataLoader(dataset=train_data, batch_size=1, shuffle=True)
    val_loaders = []

    for i in range(len(val_data)):
        val_loader = DataLoader(dataset=val_data[i],
                                batch_size=1,
                                shuffle=True)
        val_loaders.append(val_loader)

    # we observed that training the backbone does not make a big difference, while freezing it saves a lot of memory;
    # if the backbone is trained, use only a very small learning rate, e.g. 1e-7
    for param in model.backbone.parameters():
        param.requires_grad = False

    model.conv1o1.weight.requires_grad = vc_flag
    model.mix_model.requires_grad = mix_flag

    classification_loss = nn.CrossEntropyLoss()
    cluster_loss = ClusterLoss()

    optimizer = torch.optim.Adagrad(params=filter(
        lambda param: param.requires_grad, model.parameters()),
                                    lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer,
                                                       gamma=0.98)

    print('Training')

    for epoch in range(epochs):
        out_file = open(out_file_name, 'a')
        train_loss = 0.0
        correct = 0
        start = time.time()
        model.train()
        model.backbone.eval()
        for index, data in enumerate(train_loader):
            if index % 500 == 0 and index != 0:
                end = time.time()
                print('Epoch{}: {}/{}, Acc: {}, Loss: {} Time:{}'.format(
                    epoch + 1, index, total_train,
                    correct.cpu().item() / index,
                    train_loss.cpu().item() / index, (end - start)))
                start = time.time()

            input, _, label = data

            input = input.cuda(device_ids[0])
            label = label.cuda(device_ids[0])

            output, vgg_feat, like = model(input)

            out = output.argmax(1)
            correct += torch.sum(out == label)
            class_loss = classification_loss(output, label) / output.shape[0]

            loss = class_loss
            if alpha != 0:
                clust_loss = cluster_loss(
                    vgg_feat, model.conv1o1.weight) / output.shape[0]
                loss += alpha * clust_loss

            if beta != 0:
                mix_loss = like[0, label[0]]
                loss += -beta * mix_loss

            #with torch.autograd.set_detect_anomaly(True):
            loss.backward()

            # pseudo batches
            if np.mod(index, batch_size) == 0:  # and index!=0:
                optimizer.step()
                optimizer.zero_grad()

            train_loss += loss.detach() * input.shape[0]

        updated_clutter = update_clutter_model(model, device_ids)
        model.clutter_model = updated_clutter
        scheduler.step()
        train_acc = correct.cpu().item() / total_train
        train_loss = train_loss.cpu().item() / total_train
        out_str = 'Epochs: [{}/{}], Train Acc:{}, Train Loss:{}'.format(
            epoch + 1, epochs, train_acc, train_loss)
        print(out_str)
        out_file.write(out_str + '\n')

        # Evaluate Validation images
        model.eval()
        with torch.no_grad():
            correct = 0
            val_accs = []
            for i in range(len(val_loaders)):
                val_loader = val_loaders[i]
                correct_local = 0
                total_local = 0
                val_loss = 0
                out_pred = torch.zeros(len(val_data[i].images))
                for index, data in enumerate(val_loader):
                    input, _, label = data
                    input = input.cuda(device_ids[0])
                    label = label.cuda(device_ids[0])
                    output, _, _ = model(input)
                    out = output.argmax(1)
                    out_pred[index] = out
                    correct_local += torch.sum(out == label)
                    total_local += label.shape[0]

                    class_loss = classification_loss(output,
                                                     label) / output.shape[0]
                    loss = class_loss
                    val_loss += loss.detach() * input.shape[0]
                correct += correct_local
                val_acc = correct_local.cpu().item() / total_local
                val_loss = val_loss.cpu().item() / total_local
                val_accs.append(val_acc)
                out_str = 'Epochs: [{}/{}], Val-Set {}, Val Acc:{} Val Loss:{}\n'.format(
                    epoch + 1, epochs, i, val_acc, val_loss)
                print(out_str)
                out_file.write(out_str)
            val_acc = np.mean(val_accs)
            out_file.write('Epochs: [{}/{}], Val Acc:{}\n'.format(
                epoch + 1, epochs, val_acc))
            if val_acc > best_check['val_acc']:
                print('BEST: {}'.format(val_acc))
                out_file.write('BEST: {}\n'.format(val_acc))
                best_check = {
                    'state_dict': model.state_dict(),
                    'val_acc': val_acc,
                    'epoch': epoch
                }
                save_checkpoint(best_check,
                                savedir + 'vc' + str(epoch + 1) + '.pth', True)

            print('\n')
        out_file.close()
    return best_check
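
The loop above feeds single-sample batches and only steps the optimizer every batch_size iterations ("pseudo batches"), i.e. gradient accumulation. A standalone illustration of the pattern with hypothetical stand-in model and data:

import torch
import torch.nn as nn

# stand-ins, purely for illustration
model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
data = [(torch.randn(1, 10), torch.randint(0, 2, (1,))) for _ in range(32)]

accum_steps = 8  # effective batch size when each iteration sees one sample
optimizer.zero_grad()
for index, (inputs, labels) in enumerate(data):
    loss = criterion(model(inputs), labels) / accum_steps  # scale so accumulated gradients average out
    loss.backward()  # gradients accumulate until the next optimizer.step()
    if (index + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()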
Example #10
            losses['validate']['history'].append(validate_loss)

            # save a checkpoint every five epochs, or whenever this is the best model so far
            if (epoch + 1) % 5 == 0 or is_best_model:
                print('Saving checkpoint at epoch {}...'.format(epoch + 1))
                logging.info('Saving checkpoint at epoch {}...'.format(epoch +
                                                                       1))
                model.cpu()
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'model': model.state_dict(),
                        'losses': losses,
                        'word_embeddings': model.word_embeddings.weight.data.numpy(),
                        'pos_embeddings': model.cpu().pos_embeddings.weight.data.numpy(),
                        'optimizer': optimizer.state_dict(),
                    }, LATEST_CHECKPOINT_RELATIVE_PATH,
                    BEST_CHECKPOINT_RELATIVE_PATH, is_best_model)
                if CUDA:
                    model.cuda()

            if (validate_loss > losses['validate']['min']['value']
                    and epoch - losses['validate']['min']['epoch'] > 10):
                print(
                    'Ten epochs with no improvement have passed. Stopping training...'