Example #1
def main(opt, device):

    if not opt.nlog and not opt.test:
        sys.stdout = Logger(Path(opt.save_dir) / 'log_.txt')
    print_argument_options(opt)

    # Configure
    cuda = device.type != 'cpu'
    init_torch_seeds()

    dataset = load_datasets(opt.data, opt.batch_size, cuda, opt.workers)
    trainloader, testloader = dataset.trainloader, dataset.testloader
    opt.num_classes = dataset.num_classes
    print("Creat dataset: {}".format(opt.data))

    model = build_models(opt.model, opt.num_classes).to(device)
    print(model)
    if cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    print("Creat model: {}".format(opt.model))

    if opt.test:
        acc, err = __testing(opt, model, testloader, 0, device)
        print("==> Train Accuracy (%): {}\t Error rate(%): {}".format(
            acc, err))
        return

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt.lr,
                                weight_decay=5e-04,
                                momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=opt.stepsize,
                                                gamma=opt.gamma)

    if opt.amp:
        opt.scaler = torch.cuda.amp.GradScaler(enabled=True)

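    # Training loop: step the LR scheduler each epoch, evaluate every
    # eval_freq epochs (and on the final epoch), and save a checkpoint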
    start_time = time.time()
    for epoch in range(opt.max_epoch):
        print("==> Epoch {}/{}".format(epoch + 1, opt.max_epoch))
        __training(opt, model, criterion, optimizer, trainloader, epoch,
                   device)
        scheduler.step()

        if (opt.eval_freq > 0 and (epoch + 1) % opt.eval_freq == 0) or (
                epoch + 1) == opt.max_epoch:
            acc, err = __testing(opt, model, trainloader, epoch, device)
            print("==> Train Accuracy (%): {}\t Error rate(%): {}".format(
                acc, err))
            acc, err = __testing(opt, model, testloader, epoch, device)
            print("==> Test Accuracy (%): {}\t Error rate(%): {}".format(
                acc, err))
            save_model(model, epoch, name=opt.model, save_dir=opt.save_dir)

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
Example #2
def train(run_id,
          set_name,
          model_name,
          loss_type,
          m, d, k, alpha,
          n_iterations=1000,
          net_learning_rate=0.0001,
          cluster_learning_rate=0.001,
          chunk_size=32,
          refresh_clusters=50,
          norm_clusters=False,
          calc_acc_every=10,
          load_latest=True,
          save_every=200,
          save_path=configs.general.paths.models,
          plot_every=100,
          plots_path=configs.general.paths.graphing,
          plots_ext='.png',
          n_plot_samples=10,
          n_plot_classes=10):


    # Setup model directory
    save_path = os.path.join(save_path, "%s" % run_id)
    os.makedirs(save_path, exist_ok=True)

    # Setup plotting directory
    plots_path = os.path.join(plots_path, "%s" % run_id)
    os.makedirs(plots_path, exist_ok=True)

    net, input_size = load_net(model_name)

    # Load set and get train and test labels from datasets
    train_dataset, test_dataset = load_datasets(set_name, input_size=input_size)  # 299 for inception
    train_y = get_labels(train_dataset)
    test_y = get_labels(test_dataset)

    # Use the GPU
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    net.cuda()
    cudnn.benchmark = True

    # Build the list of cluster-refresh iterations if an interval int is given
    if isinstance(refresh_clusters, int):
        refresh_clusters = list(range(0, n_iterations, refresh_clusters))

    # Get initial embedding using all samples in training set
    initial_reps = compute_all_reps(net, train_dataset, chunk_size)

    # Create loss object (this stores the cluster centroids)
    if loss_type == "magnet":
        the_loss = MagnetLoss(train_y, k, m, d, alpha=alpha)

        # Initialise the embeddings/representations/clusters
        print("Initialising the clusters")
        the_loss.update_clusters(initial_reps)

        # Setup the optimizer
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=net_learning_rate)
        optimizerb = None
    elif loss_type == "repmet" or loss_type == "repmet2" or loss_type == "repmet3" or loss_type == "myloss1":
        if loss_type == "repmet":
            the_loss = RepMetLoss(train_y, k, m, d, alpha=alpha)
        elif loss_type == "repmet2":
            the_loss = RepMetLoss2(train_y, k, m, d, alpha=alpha)
        elif loss_type == "repmet3":
            the_loss = RepMetLoss3(train_y, k, m, d, alpha=alpha)
        elif loss_type == "myloss1":
            the_loss = MyLoss1(train_y, k, m, d, alpha=alpha)

        # Initialise the embeddings/representations/clusters
        print("Initialising the clusters")
        the_loss.update_clusters(initial_reps)

        # Setup the optimizer
        if cluster_learning_rate < 0:
            optimizer = torch.optim.Adam(list(filter(lambda p: p.requires_grad, net.parameters())) + [the_loss.centroids], lr=net_learning_rate)
            optimizerb = None
        else:
            optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=net_learning_rate)
            optimizerb = torch.optim.Adam([the_loss.centroids], lr=cluster_learning_rate)
    else:
        raise ValueError("Unknown loss_type: %s" % loss_type)

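    # Optionally resume from the most recent checkpoint in save_path
    # (with log.txt sorted to index 0, index 1 is the latest .pth)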
    l = os.listdir(save_path)
    if load_latest and len(l) > 1:
        l.sort(reverse=True)
        state = torch.load("%s/%s" % (save_path, l[1])) # ignore log.txt

        print("Loading model: %s/%s" % (save_path, l[1]))

        net.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])
        if optimizerb:
            optimizerb.load_state_dict(state['optimizerb'])

        start_iteration = state['iteration']+1
        best_acc = state['best_acc']
        the_loss = state['the_loss'] # overwrite the loss
        plot_sample_indexs = state['plot_sample_indexs']
        plot_classes = state['plot_classes']
        plot_test_sample_indexs = state['plot_test_sample_indexs']
        plot_test_classes = state['plot_test_classes']
        batch_losses = state['batch_losses']
        train_accs = state['train_accs']
        test_accs = state['test_accs']

        test_acc = test_accs[0][-1]
        train_acc = train_accs[0][-1]
        test_acc_b = test_accs[1][-1]
        train_acc_b = train_accs[1][-1]
        test_acc_c = test_accs[2][-1]
        train_acc_c = train_accs[2][-1]
        test_acc_d = test_accs[3][-1]
        train_acc_d = train_accs[3][-1]
    else:

        # Randomly sample the classes then the samples from each class to plot
        plot_sample_indexs, plot_classes = get_indexs(train_y, n_plot_classes, n_plot_samples)
        plot_test_sample_indexs, plot_test_classes = get_indexs(test_y, n_plot_classes, n_plot_samples, class_ids=plot_classes)

        batch_losses = []
        train_accs = [[], [], [], []]
        test_accs = [[], [], [], []]
        start_iteration = 0
        best_acc = 0
        test_acc = 0
        train_acc = 0
        test_acc_b = 0
        train_acc_b = 0
        test_acc_c = 0
        train_acc_c = 0
        test_acc_d = 0
        train_acc_d = 0

    # Let's plot the initial embeddings
    cluster_classes = the_loss.cluster_classes

    # Map cluster class indices to actual class ids so they can be matched against plot_classes
    for i in range(len(cluster_classes)):
        cluster_classes[i] = the_loss.unique_y[cluster_classes[i]]

    cluster_indexs = []
    for ci in range(len(the_loss.cluster_classes)):
        if the_loss.cluster_classes[ci] in plot_classes:
            cluster_indexs.append(ci)

    if not load_latest or len(l) < 2:
        # plot it
        graph(initial_reps[plot_sample_indexs], train_y[plot_sample_indexs],
              cluster_centers=ensure_numpy(the_loss.centroids)[cluster_indexs],
              cluster_classes=the_loss.cluster_classes[cluster_indexs],
              savepath="%s/emb-initial%s" % (plots_path, plots_ext))

    # Get some sample indices to run accuracy tests on; compare these to the accuracy coming out of the batch calculation
    test_train_inds, _ = get_indexs(train_y, len(set(train_y)), 10)

    # Set up the training loop
    iteration = None
    for iteration in range(start_iteration, n_iterations):
        # Sample batch and do forward-backward
        batch_example_inds, batch_class_inds = the_loss.gen_batch()

        # Get inputs and labels from the dataset
        batch_x = get_inputs(train_dataset, batch_example_inds).cuda()
        batch_y = torch.from_numpy(batch_class_inds).cuda()

        # Calc the outputs (embs) and then the loss + accs
        outputs = net(batch_x)
        batch_loss, batch_example_losses, batch_acc = the_loss.loss(outputs, batch_y)

        # Pass the gradient and update
        optimizer.zero_grad()
        if optimizerb:
            optimizerb.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if optimizerb:
            optimizerb.step()

            if norm_clusters:
                # Let's also normalise those centroids [because repmet pushes them away from unit sphere] to:
                # Option 1: sit on the hypersphere (use norm)
                # g = the_loss.centroids.norm(p=2,dim=0,keepdim=True)
                import torch.nn.functional as F
                the_loss.centroids.data = F.normalize(the_loss.centroids)

                # Option 2: sit on OR within the hypersphere (divide by max [scales all evenly]))
                # mx, _ = the_loss.centroids.max(0)
                # mx, _ = mx.max(0)
                # the_loss.centroids.data = the_loss.centroids/mx
                # Doesn't work as written, since it scales each axis independently...

        # Just changing some types
        batch_loss = float(ensure_numpy(batch_loss))
        batch_example_losses = ensure_numpy(batch_example_losses)

        # Update loss index
        the_loss.update_losses(batch_example_inds, batch_example_losses)

        if iteration > 0 and not iteration % calc_acc_every:
            # calc all the accs
            train_reps = compute_reps(net, train_dataset, test_train_inds, chunk_size)
            test_test_inds, _ = get_indexs(test_y, len(set(test_y)), 10)
            test_reps = compute_reps(net, test_dataset, test_test_inds, chunk_size)

            test_acc = the_loss.calc_accuracy(test_reps, test_y[test_test_inds], method='simple')
            train_acc = the_loss.calc_accuracy(train_reps, train_y[test_train_inds], method='simple')

            test_acc_b = the_loss.calc_accuracy(test_reps, test_y[test_test_inds], method='magnet')
            train_acc_b = the_loss.calc_accuracy(train_reps, train_y[test_train_inds], method='magnet')

            test_acc_c = the_loss.calc_accuracy(test_reps, test_y[test_test_inds], method='repmet')
            train_acc_c = the_loss.calc_accuracy(train_reps, train_y[test_train_inds], method='repmet')

            # removed because of failed runs with out of mem errors
            # test_acc_d = the_loss.calc_accuracy(test_reps, test_y[test_test_inds], method='unsupervised')
            # train_acc_d = the_loss.calc_accuracy(train_reps, train_y[test_train_inds], method='unsupervised')

            test_acc_d = test_acc_c
            train_acc_d = train_acc_c

            with open(save_path+'/log.txt', 'a') as f:
                f.write("Iteration %06d/%06d: Tr. L: %0.3f :: Batch. A: %0.3f :::: Tr. A - simple: %0.3f -- magnet: %0.3f -- repmet: %0.3f -- unsupervised: %0.3f :::: Te. A - simple: %0.3f -- magnet: %0.3f -- repmet: %0.3f -- unsupervised: %0.3f\n" % (iteration, n_iterations, batch_loss, batch_acc, train_acc, train_acc_b, train_acc_c, train_acc_d, test_acc, test_acc_b, test_acc_c, test_acc_d))
            print("Iteration %06d/%06d: Tr. L: %0.3f :: Batch. A: %0.3f :::: Tr. A - simple: %0.3f -- magnet: %0.3f -- repmet: %0.3f -- unsupervised: %0.3f :::: Te. A - simple: %0.3f -- magnet: %0.3f -- repmet: %0.3f -- unsupervised: %0.3f" % (iteration, n_iterations, batch_loss, batch_acc, train_acc, train_acc_b, train_acc_c, train_acc_d, test_acc, test_acc_b, test_acc_c, test_acc_d))

            batch_ass_ids = np.unique(the_loss.assignments[batch_example_inds])

            os.makedirs("%s/batch-emb/" % plots_path, exist_ok=True)
            os.makedirs("%s/batch-emb-all/" % plots_path, exist_ok=True)
            os.makedirs("%s/batch-clusters/" % plots_path, exist_ok=True)

            graph(ensure_numpy(outputs),
                  train_y[batch_example_inds],
                  cluster_centers=ensure_numpy(the_loss.centroids)[batch_ass_ids],
                  cluster_classes=the_loss.cluster_classes[batch_ass_ids],
                  savepath="%s/batch-emb/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(ensure_numpy(outputs),
                  train_y[batch_example_inds],
                  cluster_centers=ensure_numpy(the_loss.centroids),
                  cluster_classes=the_loss.cluster_classes,
                  savepath="%s/batch-emb-all/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(np.zeros_like(ensure_numpy(outputs)),
                  np.zeros_like(train_y[batch_example_inds]),
                  cluster_centers=ensure_numpy(the_loss.centroids),
                  cluster_classes=the_loss.cluster_classes,
                  savepath="%s/batch-clusters/i%06d%s" % (plots_path, iteration, plots_ext))

        train_reps_this_iter = False
        if iteration in refresh_clusters:
            with open(save_path+'/log.txt', 'a') as f:
                f.write('Refreshing clusters\n')
            print('Refreshing clusters')
            train_reps = compute_all_reps(net, train_dataset, chunk_size=chunk_size)
            the_loss.update_clusters(train_reps)

            cluster_classes = the_loss.cluster_classes
            train_reps_this_iter = True

        # store the stats to graph at end
        batch_losses.append(batch_loss)
        # batch_accs.append(batch_acc)
        train_accs[0].append(train_acc)
        test_accs[0].append(test_acc)
        train_accs[1].append(train_acc_b)
        test_accs[1].append(test_acc_b)
        train_accs[2].append(train_acc_c)
        test_accs[2].append(test_acc_c)
        train_accs[3].append(train_acc_d)
        test_accs[3].append(test_acc_d)

        if iteration > 0 and not iteration % plot_every:
            # Map cluster class indices to class ids for the classes (plot_classes) we are plotting
            for i in range(len(cluster_classes)):
                cluster_classes[i] = the_loss.unique_y[cluster_classes[i]]

            # Reuse these reps so 1. we don't recalculate, and 2. the k-means update occurred on them, which graphs better;
            # re-extracting with compute_reps() could give different embeddings due to batch norm and transforms
            if train_reps_this_iter:
                plot_train_emb = train_reps[test_train_inds]
            else:
                plot_train_emb = compute_reps(net, train_dataset, test_train_inds, chunk_size=chunk_size)

            plot_test_emb = compute_reps(net, test_dataset, plot_test_sample_indexs, chunk_size=chunk_size)

            os.makedirs("%s/train-emb/" % plots_path, exist_ok=True)
            os.makedirs("%s/test-emb/" % plots_path, exist_ok=True)
            os.makedirs("%s/train-emb-all/" % plots_path, exist_ok=True)
            os.makedirs("%s/test-emb-all/" % plots_path, exist_ok=True)
            os.makedirs("%s/cluster-losses/" % plots_path, exist_ok=True)
            os.makedirs("%s/cluster-counts/" % plots_path, exist_ok=True)

            graph(plot_train_emb,
                  train_y[plot_sample_indexs],
                  cluster_centers=ensure_numpy(the_loss.centroids)[cluster_indexs],
                  cluster_classes=the_loss.cluster_classes[cluster_indexs],
                  savepath="%s/train-emb/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(plot_test_emb,
                  test_y[plot_test_sample_indexs],
                  cluster_centers=ensure_numpy(the_loss.centroids)[cluster_indexs],
                  cluster_classes=the_loss.cluster_classes[cluster_indexs],
                  savepath="%s/test-emb/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(plot_train_emb,
                  # train_y[plot_sample_indexs],
                  train_y[test_train_inds],
                  cluster_centers=ensure_numpy(the_loss.centroids),
                  cluster_classes=the_loss.cluster_classes,
                  savepath="%s/train-emb-all/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(plot_test_emb,
                  test_y[plot_test_sample_indexs],
                  cluster_centers=ensure_numpy(the_loss.centroids),
                  cluster_classes=the_loss.cluster_classes,
                  savepath="%s/test-emb-all/i%06d%s" % (plots_path, iteration, plots_ext))

            plot_smooth({'loss': batch_losses,
                         'train acc': train_accs[0],
                         'test acc': test_accs[0]},
                        savepath="%s/loss_simple%s" % (plots_path, plots_ext))
            plot_smooth({'loss': batch_losses,
                         'train acc': train_accs[1],
                         'test acc': test_accs[1]},
                        savepath="%s/loss_magnet%s" % (plots_path, plots_ext))
            plot_smooth({'loss': batch_losses,
                         'train acc': train_accs[2],
                         'test acc': test_accs[2]},
                        savepath="%s/loss_repmet%s" % (plots_path, plots_ext))
            # plot_smooth({'loss': batch_losses,
            #              'train acc': train_accs[3],
            #              'test acc': test_accs[3]},
            #             savepath="%s/loss_unsupervised%s" % (plots_path, plots_ext))

            plot_cluster_data(the_loss.cluster_losses,
                              the_loss.cluster_classes,
                              title="cluster losses",
                              savepath="%s/cluster-losses/i%06d%s" % (plots_path, iteration, plots_ext))

            cluster_counts = []
            for c in range(len(the_loss.cluster_assignments)):
                cluster_counts.append(len(the_loss.cluster_assignments[c]))

            plot_cluster_data(cluster_counts,
                              the_loss.cluster_classes,
                              title="cluster counts",
                              savepath="%s/cluster-counts/i%06d%s" % (plots_path, iteration, plots_ext))

        if iteration > 0 and not iteration % save_every:
            if save_path:
                if test_acc_d > best_acc:
                    print("Saving model (is best): %s/i%06d%s" % (save_path, iteration, '.pth'))
                    best_acc = test_acc_d
                else:
                    print("Saving model: %s/i%06d%s" % (save_path, iteration, '.pth'))

                state = {
                    'iteration': iteration,
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'acc': test_acc_d,
                    'best_acc': best_acc,
                    'the_loss': the_loss,
                    'plot_sample_indexs': plot_sample_indexs,
                    'plot_classes': plot_classes,
                    'plot_test_sample_indexs': plot_test_sample_indexs,
                    'plot_test_classes': plot_test_classes,
                    'batch_losses': batch_losses,
                    'train_accs': train_accs,
                    'test_accs': test_accs,
                }
                if optimizerb:
                    state['optimizerb'] = optimizerb.state_dict()
                torch.save(state, "%s/i%06d%s" % (save_path, iteration, '.pth'))

    # END TRAINING LOOP

    # Plot curves and graphs
    plot_smooth({'loss': batch_losses,
                 'train acc': train_accs[0],
                 'test acc': test_accs[0]},
                savepath="%s/loss_simple%s" % (plots_path, plots_ext))
    plot_smooth({'loss': batch_losses,
                 'train acc': train_accs[1],
                 'test acc': test_accs[1]},
                savepath="%s/loss_magnet%s" % (plots_path, plots_ext))
    plot_smooth({'loss': batch_losses,
                 'train acc': train_accs[2],
                 'test acc': test_accs[2]},
                savepath="%s/loss_repmet%s" % (plots_path, plots_ext))
    plot_smooth({'loss': batch_losses,
                 'train acc': train_accs[3],
                 'test acc': test_accs[3]},
                savepath="%s/loss_unsupervised%s" % (plots_path, plots_ext))

    # Calculate and graph the final
    final_reps = compute_reps(net, train_dataset, plot_sample_indexs, chunk_size=chunk_size)
    graph(final_reps, train_y[plot_sample_indexs], savepath="%s/emb-final%s" % (plots_path, plots_ext))

    if save_path and iteration is not None:
        if test_acc_d > best_acc:
            print("Saving model (is best): %s/i%06d%s" % (save_path, iteration+1, '.pth'))
            best_acc = test_acc_d
        else:
            print("Saving model: %s/i%06d%s" % (save_path, iteration+1, '.pth'))

        state = {
            'iteration': iteration,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'acc': test_acc_d,
            'best_acc': best_acc,
            'the_loss': the_loss,
            'plot_sample_indexs': plot_sample_indexs,
            'plot_classes': plot_classes,
            'plot_test_sample_indexs': plot_test_sample_indexs,
            'plot_test_classes': plot_test_classes,
            'batch_losses': batch_losses,
            'train_accs': train_accs,
            'test_accs': test_accs,
        }
        if optimizerb:
            state['optimizerb'] = optimizerb.state_dict()
        torch.save(state, "%s/i%06d%s" % (save_path, iteration+1, '.pth'))
Example #3
def evaluate(run_id,
             set_name,
             model_name,
             chunk_size=32,
             split='test',
             load_iteration=-1,
             load_path=configs.general.paths.models,
             plots_path=configs.general.paths.graphing,
             plots_ext='.png'):

    # Setup load path
    load_path = os.path.join(load_path, "%s" % run_id)

    # Setup plotting directory
    plots_path = os.path.join(plots_path, "%s" % run_id)
    os.makedirs(plots_path, exist_ok=True)

    net, input_size = load_net(model_name)

    # Load set and get train and test labels from datasets
    train_dataset, test_dataset = load_datasets(set_name,
                                                input_size=input_size)
    if split == 'train':
        dataset = train_dataset
    else:
        dataset = test_dataset
    y = get_labels(dataset)

    # Use the GPU
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))
    net.cuda()
    cudnn.benchmark = True

    # Load the particular iteration we want
    if load_iteration < 0:
        l = os.listdir(load_path)
        l.sort(reverse=True)
        state = torch.load("%s/%s" % (load_path, l[1]))  # ignore log.txt
        print("Loading model: %s/%s" % (load_path, l[1]))
    else:
        if os.path.exists("%s/i%06d%s" % (load_path, load_iteration, '.pth')):
            state = torch.load(
                "%s/i%06d%s" %
                (load_path, load_iteration, '.pth'))
            print("%s/i%06d%s" % (load_path, load_iteration, '.pth'))
        else:
            print("%s/i%06d%s doesn't exist... awkies. :/" %
                  (load_path, load_iteration, '.pth'))
            return

    # Load the net state
    net.load_state_dict(state['state_dict'])

    # Load the loss and cluster centres
    the_loss = state['the_loss']

    # Compute the embeddings for the dataset
    x = compute_reps(net, dataset, list(range(len(y))), chunk_size=chunk_size)

    # Compute the accuracies
    test_acc = the_loss.calc_accuracy(x, y, method='simple')
    test_acc_b = the_loss.calc_accuracy(x, y, method='magnet')
    test_acc_c = the_loss.calc_accuracy(x, y, method='repmet')
    test_acc_d = the_loss.calc_accuracy(x, y, method='unsupervised')

    print(
        "simple: %0.3f -- magnet: %0.3f -- repmet: %0.3f -- unsupervised: %0.3f"
        % (test_acc, test_acc_b, test_acc_c, test_acc_d))

    # And hey, why not graph them all!
    graph(x,
          y,
          cluster_centers=ensure_numpy(the_loss.centroids),
          cluster_classes=the_loss.cluster_classes,
          savepath="%s/test-%s%s" % (plots_path, split, plots_ext))
Example #4
def main(args):
    # load train/test data
    datadir = os.path.join(args.volumedir, args.datadir)
    # train = imdb_data_load(datadir)
    train, test = load_datasets(datadir)
    # train, test = load_context_target_pairs(datadir, context_len = args.conlength)
    # train = sorted(train, key=lambda a: len(a), reverse=True)
    # train = train[:min(len(train), args.datacap)]
    # for msg in train:
    #     if "roster" in msg:
    #         print(msg)
    # return

    # Dynamically load modelBuilder class
    moduleName, klassName = args.modelbuilder.split(".")
    mod = __import__('models.%s' % moduleName, fromlist=[klassName])
    klass = getattr(mod, klassName)
    modelBuilder = klass(args)

    timestamp = int(time.time())
    logdir = os.path.join(args.volumedir,
                          datetime.datetime.today().strftime('%Y%m%d'),
                          args.logdir)
    if not os.path.isdir(logdir):
        os.makedirs(logdir)
    hdlr = logging.FileHandler(
        os.path.join(logdir, "training_output_%d.log" % timestamp))
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.INFO)

    checkpointdir = os.path.join(args.volumedir,
                                 datetime.datetime.today().strftime('%Y%m%d'),
                                 args.checkpointdir)
    if not os.path.isdir(checkpointdir):
        os.makedirs(checkpointdir)
    checkpointpath = configure_checkpointing(args, timestamp)
    checkpoint_callback = ModelCheckpoint(filepath=checkpointpath,
                                          save_weights_only=False)

    # Create or load existing model
    init_epoch = 0
    if args.textlineds:
        X, Y, vocab, tokens = SlackTextLineDataset(args, train).get_dataset()
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
    else:
        tokens, vocab, reverse_token_map = modelBuilder.tokenize(
            train, freq_threshold=args.freqthreshold)
    # text_ds = text_ds.shuffle(buffer_size=1024).batch(args.minibatchsize)
    # print(text_ds.cardinality().numpy())
    if args.loadmodel and os.path.exists(args.loadmodel):
        modelpath = args.loadmodel
        timestamp = int(modelpath.split(".")[1])
        init_epoch = int(modelpath.split(".")[2])
        loaddir = "/".join(modelpath.split("/")[:-1])
        model = load_model(modelpath, custom_objects={"EinsumOp": EinsumOp})
        vocab = load_vocab(loaddir, timestamp)
        # tokens = load_tokens(loaddir, timestamp)
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
    else:
        model = modelBuilder.create_model(vocab)
        save_vocab(vocab, checkpointdir, timestamp)
        if args.savetokens:
            save_tokens(tokens, checkpointdir, timestamp)

    plot_model(model,
               to_file='model_plot_2.png',
               show_shapes=True,
               show_layer_names=True)
    optimizer_map = {"adam": Adam, "rmsprop": RMSprop, "sgd": SGD}
    optimizer = optimizer_map.get(args.optimizer, RMSprop)
    lr_decay = ExponentialDecay(initial_learning_rate=args.learningrate,
                                decay_rate=args.decayrate,
                                decay_steps=args.decaysteps)
    custom_lr = CustomSchedule(args.hiddensize)
    opt = optimizer(learning_rate=lr_decay, clipvalue=3)
    # model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["accuracy"])
    # attn_4_output = model.get_layer("attention_values_4").output
    # dense_v_out = model.get_layer("dense_v_4").output
    # einsum_com_output = model.get_layer("einsum_com_4").output
    # inpt = model.get_layer("input")
    # attn_factor_model = keras.Model(inputs=inpt.input, outputs=attn_4_output)
    # einsum_com_model = keras.Model(inputs=inpt.input, outputs=einsum_com_output)
    # dense_v_model = keras.Model(inputs=inpt.input, outputs=dense_v_out)
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(name="loss"),
        run_eagerly=True,
        optimizer=opt,
        metrics=[
            tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
            tf.keras.metrics.SparseTopKCategoricalAccuracy(
                k=3, name="top_3_accuracy"),
            tf.keras.metrics.SparseTopKCategoricalAccuracy(
                k=5, name="top_5_accuracy"),
            last_word_prediction_accuracy(args.minibatchsize, args.seqlength)
        ])
    # last_word_prediction_topk_accuracy(args.minibatchsize, args.seqlength, 5)])

    model.summary(print_fn=logger.info)

    checkpointnames = args.checkpointnames % timestamp
    sample_func = lambda: modelBuilder.sample(model, tokens, vocab,
                                              reverse_token_map)
    callbacks = get_callbacks(args.volumedir, checkpointdir, checkpointnames,
                              timestamp, sample_func)
    sample_callback = LambdaCallback(
        on_epoch_end=lambda epoch, logs: sample_func())
    logger_callback = LambdaCallback(on_epoch_end=lambda epoch, logs: logger.
                                     info("Epoch %d: %s" % (epoch, str(logs))))

    if not args.textlineds:
        trainseqs = modelBuilder.get_input_sequences(tokens, reverse_token_map)
        # trainseqs, valseqs = validation_split(seqs, val_split=args.valsplit)

        if args.modelbuilder == "keras_word_lm.WordLanguageModelBuilder":
            trainvectors = SequenceVectors(args, trainseqs, vocab)
            history = model.fit(trainvectors,
                                epochs=args.numepochs,
                                initial_epoch=init_epoch,
                                callbacks=[
                                    sample_callback, logger_callback,
                                    checkpoint_callback
                                ])
            logger.info(history.history)
            plot_history(history.history, args.learningrate, logdir, timestamp)
            return
        X, Y, sample_weights = modelBuilder.build_input_vectors(
            trainseqs, vocab, reverse_token_map)

    # ds = modelBuilder.build_input_vectors(trainseqs, vocab, reverse_token_map)
    # model.fit(X, Y,
    # print(ds)
    # start_prompt = "this movie is"
    # start_tokens = [reverse_token_map[t] for t in start_prompt.split()]
    # num_tokens_generated = 40
    # text_gen_callback = TextGenerator(num_tokens_generated, args.seqlength, start_tokens, vocab)
    history = model.fit(
        X,
        Y,
        epochs=args.numepochs,
        initial_epoch=init_epoch,
        batch_size=args.minibatchsize,
        validation_split=0.1,
        shuffle=True,
        callbacks=[sample_callback, logger_callback, checkpoint_callback])
    logger.info(history.history)
    plot_history(history.history, args.learningrate, logdir, timestamp)
    return
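    # NOTE: the per-batch training loop below is unreachable because of the
    # return above; it also references valseqs, which is only defined by the
    # commented-out validation_split call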
    allmetrics = {}
    for epoch in range(init_epoch, args.numepochs):
        batches = rand_mini_batches(trainseqs, args.minibatchsize)
        for i, batch in enumerate(batches):
            X, Y, sample_weights = modelBuilder.build_input_vectors(
                batch, vocab, reverse_token_map)
            metrics = model.train_on_batch(X,
                                           Y,
                                           sample_weight=sample_weights,
                                           reset_metrics=i == 0,
                                           return_dict=True)
            if i % 100 == 0:
                valmetrics = evaluate_mini_batches(model, modelBuilder, vocab,
                                                   reverse_token_map, valseqs,
                                                   args.minibatchsize)
                metrics.update(valmetrics)
                for key in metrics.keys():
                    if key in allmetrics.keys():
                        allmetrics[key] += [metrics[key]]
                    else:
                        allmetrics[key] = [metrics[key]]
                print("Batch %d of %d in epoch %d: %s" %
                      (i, len(batches), epoch, str(metrics)))
        logger.info("Epoch %d: %s" % (epoch, str(metrics)))
        # logger.info("Validation metrics %s" % str(valmetrics))
        if args.runsamples:
            sample_output = sample_func()
            logger.info("\n" + sample_output)
        model.save(
            os.path.join(checkpointdir, checkpointnames).format(epoch=epoch))
        plot_history(allmetrics, args.learningrate, logdir, timestamp)
Example #5
def main(opt, device):
    best_acc1 = 0
    if not opt.nlog and not opt.test:
        sys.stdout = Logger(Path(opt.save_dir) / 'log_.txt')

    if opt.global_rank in [-1, 0]:
        print_argument_options(opt)

    # Configure
    cuda = device.type != 'cpu'
    init_torch_seeds()

    dataset = load_datasets(opt.data, opt.batch_size, cuda, opt.workers,
                            opt.global_rank)
    trainloader, testloader = dataset.trainloader, dataset.testloader
    opt.num_classes = dataset.num_classes

    if opt.global_rank in [-1, 0]:
        print("Creat dataset: {}".format(opt.data))

    model = build_models(opt.model, opt.num_classes, opt.input_size,
                         opt.model_size).to(device)

    if cuda and opt.global_rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    if cuda and opt.global_rank != -1:
        model = DDP(model,
                    device_ids=[opt.local_rank],
                    output_device=opt.local_rank)

    if opt.global_rank in [-1, 0]:
        print(model)
        print("Creat model: {}".format(opt.model))

    criterion = nn.CrossEntropyLoss()
    #criterion = SmoothCrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt.lr,
                                weight_decay=5e-04,
                                momentum=0.9)

    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            opt.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if opt.global_rank in [-1, 0]:
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    opt.resume, checkpoint['epoch']))
        else:
            if opt.global_rank in [-1, 0]:
                print("=> no checkpoint found at '{}'".format(opt.resume))

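    # Mixed-precision training: GradScaler rescales the loss so FP16 gradients
    # don't underflow during the backward pass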
    opt.scaler = torch.cuda.amp.GradScaler(enabled=True)

    if opt.global_rank in [-1, 0]:
        start_time = time.time()
    for epoch in range(opt.start_epoch, opt.max_epoch):
        if opt.global_rank != -1:
            trainloader.sampler.set_epoch(epoch)
        if opt.global_rank in [-1, 0]:
            print("==> Epoch {}/{}".format(epoch + 1, opt.max_epoch))
        __training(opt, model, criterion, optimizer, trainloader, epoch,
                   device, opt.global_rank)

        if (opt.eval_freq > 0 and (epoch + 1) % opt.eval_freq == 0) or (
                epoch + 1) == opt.max_epoch:
            #if cuda and opt.global_rank != -1:
            #    model.module.inference_mode()
            #else:
            #    model.inference_mode()
            acc1 = __testing(opt, model, testloader, epoch, device,
                             opt.global_rank)

            #if cuda and opt.global_rank != -1:
            #    model.module.training_mode()
            #else:
            #    model.training_mode()
            acc1 = __testing(opt, model, testloader, epoch, device,
                             opt.global_rank)

            # remember best acc@1 and save checkpoint
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)
            if opt.global_rank in [-1, 0]:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': opt.model,
                        'state_dict': model.state_dict(),
                        'best_acc1': best_acc1,
                        'optimizer': optimizer.state_dict(),
                    },
                    is_best,
                    save_dir=opt.save_dir)

    if opt.global_rank in [-1, 0]:
        elapsed = round(time.time() - start_time)
        elapsed = str(datetime.timedelta(seconds=elapsed))
        print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
Example #6
def main(args):
    volumedir = args.volumedir
    datadir = os.path.join(volumedir, args.datadir)
    checkpointdir = os.path.join(volumedir, args.checkpointdir)
    checkpointnames = args.checkpointnames
    mini_batch_size = args.minibatchsize
    learning_rate = args.learningrate
    dropout_rate = args.dropoutrate
    reg_factor = args.regfactor
    n_a = args.hiddensize
    num_epochs = args.numepochs
    loadmodel = args.loadmodel
    timestamp = int(time.time())
    checkpointnames = checkpointnames % timestamp

    train, test = load_datasets(datadir)
    train = train[:min(len(train), args.datacap)]
    m = len(train)
    chars = set()
    step = args.step
    maxlen = args.seqlength
    #for msg in train:
    #    chars = chars.union(set(msg))
    #chars = sorted(list(chars))
    #chars = ['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '*']
    chars = [
        '\n', ' ', '!', '"', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', ':', '?', '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
        'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
        'x', 'y', 'z', '~', '*'
    ]
    char_to_ix = {c: i for i, c in enumerate(chars)}

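    # Resume from an existing checkpoint if one was given, otherwise build a fresh seq2seq model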
    if loadmodel and os.path.exists(loadmodel):
        timestamp = int(loadmodel.split(".")[1])
        epoch_number = int(loadmodel.split(".")[2])
        model = load_checkpoint_model(loadmodel)
    else:
        model = create_seq2seq_model(chars,
                                     n_a,
                                     maxlen,
                                     learning_rate,
                                     dropout_rate=dropout_rate,
                                     reg_factor=reg_factor)
        epoch_number = 0

    hdlr = logging.FileHandler(
        os.path.join(volumedir, "training_output_%d.log" % timestamp))
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.INFO)
    metrics = []
    X, Y = format_x_y_no_seed(maxlen, chars, train, step, char_to_ix)
    callbacks = get_callbacks(volumedir, checkpointdir, checkpointnames, chars,
                              char_to_ix, train, model, timestamp)
    model.fit(X,
              Y,
              batch_size=mini_batch_size,
              epochs=num_epochs,
              initial_epoch=epoch_number,
              validation_split=0.2,
              shuffle=True,
              callbacks=callbacks)
Example #7
def train_model(batch_size,
                n_epochs,
                learning_rate,
                saved_epoch,
                run_id="def",
                set_name="stanford_dogs",
                save_every=1000,
                save_path=configs.models,
                plot_every=500,
                plot_path=configs.plots):
    # Setup save directories
    if save_path:
        save_path = os.path.join(save_path, "run_{}".format(run_id))
        os.makedirs(save_path, exist_ok=True)
    if plot_path:
        plot_path = os.path.join(plot_path, "run_{}".format(run_id))
        os.makedirs(plot_path, exist_ok=True)

    # Load network and use GPU
    net = models.Net2().cuda()
    cudnn.benchmark = True

    # Load dataset
    train_data, test_data, classes = load_datasets(set_name)
    #train_y, test_y = utils.get_labels(train_data), utils.get_labels(test_data)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

    # obtain one batch of training images
    dataiter = iter(train_loader)
    images, labels = next(dataiter)
    images = np.swapaxes(np.swapaxes(images.numpy(), 1, 2), 2, 3)

    # plot the images in the batch, along with the corresponding labels
    fig = plt.figure(figsize=(batch_size / 4 + 5, batch_size / 4 + 5))
    for idx in np.arange(batch_size):
        ax = fig.add_subplot(batch_size // 8, 8, idx + 1, xticks=[], yticks=[])
        ax.imshow(images[idx])
        ax.set_title(classes[labels[idx]], {'fontsize': batch_size / 5},
                     pad=0.4)
    plt.tight_layout(pad=1, w_pad=0, h_pad=0)
    if plot_path:
        plt.savefig(os.path.join(plot_path, "Initial_Visualization"))
    else:
        plt.show()
    plt.clf()

    # NLLLoss expects log-probabilities (e.g. from a final log_softmax layer);
    # nn.CrossEntropyLoss would combine softmax and nn.NLLLoss() in one class
    criterion = nn.NLLLoss()

    # stochastic gradient descent with a small learning rate
    optimizer = optim.SGD(net.parameters(), lr=learning_rate)

    # ToDo: Add to utils
    # Calculate accuracy before training
    correct = 0
    total = 0

    # Iterate through test dataset
    for images, labels in test_loader:
        images, labels = images.cuda(), labels.cuda()

        # forward pass to get outputs
        # the outputs are a series of class scores
        outputs = net(images)

        # get the predicted class from the maximum value in the output-list of class scores
        _, predicted = torch.max(outputs.data, 1)

        # count up total number of correct labels
        # for which the predicted and true labels are equal
        total += labels.size(0)
        correct += (predicted == labels).sum()

    # calculate the accuracy
    # to convert `correct` from a Tensor into a scalar, use .item()
    accuracy = 100.0 * correct.item() / total

    # print('Accuracy before training: ', accuracy)

    def train(n_epochs):
        net.train()
        loss_over_time = []  # to track the loss as the network trains

        for epoch in range(n_epochs):  # loop over the dataset multiple times
            output_epoch = epoch + saved_epoch
            running_loss = 0.0

            for batch_i, data in enumerate(train_loader):
                # get the input images and their corresponding labels
                inputs, labels = data
                inputs, labels = inputs.cuda(), labels.cuda()

                # zero the parameter (weight) gradients
                optimizer.zero_grad()

                # forward pass to get outputs
                outputs = net(inputs)

                # calculate the loss
                loss = criterion(outputs, labels)

                # backward pass to calculate the parameter gradients
                loss.backward()

                # update the parameters
                optimizer.step()

                # print loss statistics
                # to convert loss into a scalar and add it to running_loss, we use .item()
                running_loss += loss.item()

                if batch_i % 45 == 44:  # print every 45 batches
                    avg_loss = running_loss / 45
                    # record and print the avg loss over the last 45 batches
                    loss_over_time.append(avg_loss)
                    print('Epoch: {}, Batch: {}, Avg. Loss: {}'.format(
                        output_epoch + 1, batch_i + 1, avg_loss))
                    running_loss = 0.0
            if output_epoch % 100 == 99:  # save every 100 epochs
                torch.save(net.state_dict(),
                           'saved_models/Net2_{}.pt'.format(output_epoch + 1))

        print('Finished Training')
        return loss_over_time

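    # Resume weights from a previously saved epoch, if requested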
    if saved_epoch:
        net.load_state_dict(
            torch.load('saved_models/Net2_{}.pt'.format(saved_epoch)))

    # call train and record the loss over time
    training_loss = train(n_epochs)

    # visualize the loss as the network trained
    fig = plt.figure()
    plt.plot(45 * np.arange(len(training_loss)), training_loss)
    plt.rc('xtick', labelsize=12)
    plt.rc('ytick', labelsize=12)
    plt.xlabel('Number of Batches', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.ylim(0, 5.5)  # consistent scale
    plt.tight_layout()
    if plot_path:
        plt.savefig(os.path.join(plot_path, "Loss_Over_Time"))
        print("saved")
    else:
        plt.show()
    plt.clf()

    # initialize tensor and lists to monitor test loss and accuracy
    test_loss = torch.zeros(1).cuda()
    class_correct = list(0. for i in range(len(classes)))
    class_total = list(0. for i in range(len(classes)))

    # set the module to evaluation mode
    # used to turn off layers that are only useful for training
    # like dropout and batch_norm
    net.eval()

    for batch_i, data in enumerate(test_loader):

        # get the input images and their corresponding labels
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()

        # forward pass to get outputs
        outputs = net(inputs)

        # calculate the loss
        loss = criterion(outputs, labels)

        # update average test loss
        test_loss = test_loss + ((torch.ones(1).cuda() / (batch_i + 1)) *
                                 (loss.data - test_loss))

        # get the predicted class from the maximum value in the output-list of class scores
        _, predicted = torch.max(outputs.data, 1)

        # compare predictions to true label
        # this creates a `correct` Tensor that holds the number of correctly classified images in a batch
        correct = np.squeeze(predicted.eq(labels.data.view_as(predicted)))

        # calculate test accuracy for *each* object class
        # we get the scalar value of correct items for a class, by calling `correct[i].item()`
        for l, c in zip(labels.data, correct):
            class_correct[l] += c.item()
            class_total[l] += 1

    print('Test Loss: {:.6f}\n'.format(test_loss.cpu().numpy()[0]))

    for i in range(len(classes)):
        if class_total[i] > 0:
            print('Test Accuracy of %30s: %2d%% (%2d/%2d)' %
                  (classes[i], 100 * class_correct[i] / class_total[i],
                   np.sum(class_correct[i]), np.sum(class_total[i])))
        else:
            print('Test Accuracy of %5s: N/A (no training examples)' %
                  (classes[i]))

    print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' %
          (100. * np.sum(class_correct) / np.sum(class_total),
           np.sum(class_correct), np.sum(class_total)))

    # Visualize Sample Results (runs until a batch contains a misclassification)
    # plot the images in the batch, along with predicted and true labels
    fig = plt.figure(figsize=(batch_size / 4 + 5, batch_size / 4 + 5))
    misclassification_found = False
    while not misclassification_found:
        fig.clf()
        # obtain one batch of test images
        dataiter = iter(test_loader)
        images, labels = next(dataiter)
        images, labels = images.cuda(), labels.cuda()
        # get predictions
        preds = np.squeeze(
            net(images).data.max(1, keepdim=True)[1].cpu().numpy())
        images = np.swapaxes(np.swapaxes(images.cpu().numpy(), 1, 2), 2, 3)
        for idx in np.arange(batch_size):
            ax = fig.add_subplot(batch_size // 8,
                                 8,
                                 idx + 1,
                                 xticks=[],
                                 yticks=[])
            ax.imshow(images[idx])
            if preds[idx] == labels[idx]:
                ax.set_title("{}".format(classes[preds[idx]],
                                         classes[labels[idx]]),
                             color="green")
            else:
                ax.set_title("({})\n{}".format(classes[labels[idx]],
                                               classes[preds[idx]]),
                             color="red",
                             pad=.4)
                misclassification_found = True
    if plot_path:
        plt.savefig(os.path.join(plot_path, "Results Visualization"))
    else:
        plt.show()
    plt.clf()