Example #1
    args = parser.parse_args()

    hp = HParam(args.config)
    graphs = [
        read_graph(hp.model.graph0),
        read_graph(hp.model.graph1),
        read_graph(hp.model.graph2),
    ]
    print('Loading model from checkpoint...')
    model = RandWire(hp, graphs).cuda()
    checkpoint = torch.load(args.checkpoint_path)
    model.load_state_dict(checkpoint['model'])
    step = checkpoint['step']

    dataset = hp.data.type
    switcher = {
        'MNIST': MNIST_dataloader,
        'CIFAR10': CIFAR10_dataloader,
        'ImageNet': create_dataloader,
    }
    assert dataset in switcher.keys(), 'Dataset type currently not supported'
    dl_func = switcher[dataset]
    valset = dl_func(hp, args, False)

    print('Validating...')
    test_avg_loss, accuracy = validate(model, valset)

    print('Result on step %d:' % step)
    print('Average test loss: %.4f' % test_avg_loss)
    print('Accuracy: %.3f' % accuracy)
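
Every example in this section calls a project-specific validate() helper whose body is not shown. For orientation, here is a minimal sketch of what such a helper could look like for the two-argument call used in Example #1, assuming a classifier trained with F.nll_loss and a (data, target) validation loader; the body is an assumption, not the project's actual implementation.

import torch
import torch.nn.functional as F

def validate(model, valset):
    # Hypothetical sketch; the real validate() used by these examples is not shown.
    model.eval()
    total_loss, correct, count = 0.0, 0, 0
    with torch.no_grad():
        for data, target in valset:
            data, target = data.cuda(), target.cuda()
            output = model(data)
            # Sum per-sample losses so the average covers the whole validation set.
            total_loss += F.nll_loss(output, target, reduction='sum').item()
            correct += (output.argmax(dim=1) == target).sum().item()
            count += target.size(0)
    model.train()
    return total_loss / count, correct / count
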
Example #2
def train(out_dir, chkpt_path, trainset, valset, writer, logger, hp, hp_str, graphs, in_channels=3):
    model = RandWire(hp, graphs, in_channels=in_channels).cuda()

    # print(net)
    # print(count_parameters(model))
    # with torch.no_grad():
    #     x = torch.randn(1, 1, 28, 28).cuda()
    #     y = model(x)

    # Save onnx model for visualization
    # torch.onnx.export(model, x, "onnx/CElegans.onnx")
    # print('Model saved to disk')

    if hp.train.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=hp.train.adam*10)
    elif hp.train.optimizer == 'adabound':
        optimizer = adabound.AdaBound(model.parameters(),
                             lr=hp.train.adabound.initial,
                             final_lr=hp.train.adabound.final)
    else:
        raise Exception("Optimizer not supported: %s" % hp.train.optimizer)

    step = 0

    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        step = checkpoint['step']

        if hp_str != checkpoint['hp_str']:
            logger.warning("New hparams are different from checkpoint.")
            logger.warning("Will use new hparams.")
        # hp = load_hparam_str(hp_str)
    else:
        logger.info("Starting new training run")
        logger.info("Writing graph to tensorboardX...")
        #writer.write_graph(model, torch.randn(1, 1, 28, 28).cuda())
        logger.info("Finished.")

    try:
        model.train()
        epoch = 0
        best_acc = 0
        while epoch < 20: # TODO Change this to one epoch but make sure logging still works
            
            print(type(model.dagly3.weighted_adj))
            print(model.dagly3.weighted_adj)
            np.savetxt('adj/unweighted.txt', model.dagly3.unweighted_adj)
            np.savetxt('adj/after_{}_epochs.txt'.format(epoch), model.dagly3.weighted_adj)
            
            for data, target in trainset:
                data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()

                step += 1
                loss = loss.item()
                if loss > 1e8 or math.isnan(loss):
                    logger.error("Loss exploded to %.02f at step %d!" % (loss, step))
                    raise Exception("Loss exploded")

                if step % hp.train.summary_interval == 0:
                    writer.log_training(loss, step)
                    logger.info("Wrote summary at step %d, epoch %d" % (step, epoch))

                if step % len(trainset) == 0:  # was: step % hp.train.checkpoint_interval == 0
                    save_path = os.path.join(out_dir, 'chkpt_%07d.pt' % step)
                    torch.save({
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'step': step,
                        'hp_str': hp_str,
                    }, save_path)
                    logger.info("Saved checkpoint to: %s" % save_path)

                if step % len(trainset) == 0:  # was: step % hp.train.evaluation_interval == 0
                    test_loss, accuracy = validate(model, valset, writer, step)
                    
                    if accuracy > best_acc:
                        best_acc = accuracy
                        
                    logger.info("Evaluation saved at step %d, epoch %d | test_loss: %.5f | accuracy: %.4f"
                                    % (step, epoch, test_loss, accuracy))

                if step % hp.train.decay.step == 0:
                    temp = optimizer.state_dict()
                    temp['param_groups'][0]['lr'] *= hp.train.decay.gamma
                    optimizer.load_state_dict(temp)

            epoch += 1
            
        writer.log_best_acc(best_acc)
        
        print(type(model.dagly3.weighted_adj))
        print(model.dagly3.weighted_adj)
        np.savetxt('adj/after_{}_epochs.txt'.format(epoch), model.dagly3.weighted_adj)

    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
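
Example #2 (and Example #4 below) decays the learning rate by round-tripping the optimizer through state_dict()/load_state_dict(). The same step decay can be expressed more directly; a small sketch, reusing the optimizer, step, and hp.train.decay.* names from the example above.

# Drop-in alternative to the decay block above (same names as in the example):
if step % hp.train.decay.step == 0:
    for param_group in optimizer.param_groups:
        param_group['lr'] *= hp.train.decay.gamma

# Or let PyTorch manage the decay: StepLR multiplies the LR by gamma every
# step_size calls to scheduler.step(), so call scheduler.step() once per
# optimizer step in this setup.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=hp.train.decay.step, gamma=hp.train.decay.gamma)
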
Example #3
def train(out_dir, chkpt_path, trainset, valset, writer, logger, hp, hp_str):
    model = RandWire(hp).cuda()
    if hp.train.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam)
    elif hp.train.optimizer == 'adabound':
        optimizer = AdaBound(model.parameters(),
                             lr=hp.train.adabound.initial,
                             final_lr=hp.train.adabound.final)
    else:
        raise Exception("Optimizer not supported: %s" % hp.train.optimizer)

    step = 0

    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        step = checkpoint['step']

        if hp_str != checkpoint['hp_str']:
            logger.warning("New hparams are different from checkpoint.")
            logger.warning("Will use new hparams.")
        # hp = load_hparam_str(hp_str)
    else:
        logger.info("Starting new training run")

    try:
        model.train()
        while True:
            for data, target in trainset:
                data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()

                step += 1
                loss = loss.item()
                if loss > 1e8 or math.isnan(loss):
                    logger.error("Loss exploded to %.02f at step %d!" %
                                 (loss, step))
                    raise Exception("Loss exploded")

                if step % hp.train.summary_interval == 0:
                    writer.log_training(loss, step)
                    logger.info("Wrote summary at step %d" % step)

                if step % hp.train.checkpoint_interval == 0:
                    save_path = os.path.join(out_dir, 'chkpt_%07d.pt' % step)
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'step': step,
                            'hp_str': hp_str,
                        }, save_path)
                    logger.info("Saved checkpoint to: %s" % save_path)

                if step % hp.train.evaluation_interval == 0:
                    test_loss, accuracy = validate(model, valset, writer, step)
                    logger.info(
                        "Evaluation saved at step %d | test_loss: %.5f | accuracy: %.2f%%"
                        % (step, test_loss, accuracy))
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
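
All of these train() functions read their settings from a dotted hparams object (hp = HParam(args.config) in Example #1, loaded from a YAML file that is not shown). The sketch below uses types.SimpleNamespace purely to illustrate the fields the loops above actually access; every value is a placeholder, not the projects' real configuration.

from types import SimpleNamespace

# Hypothetical stand-in for the HParam object; field names come from the
# attribute accesses in the examples, values are placeholders only.
hp = SimpleNamespace(
    data=SimpleNamespace(type='CIFAR10'),
    model=SimpleNamespace(input_maps=3,
                          graph0='graph0.txt',
                          graph1='graph1.txt',
                          graph2='graph2.txt'),
    train=SimpleNamespace(
        optimizer='sgd',
        adam=1e-3,
        adabound=SimpleNamespace(initial=1e-3, final=0.1),
        sgd=SimpleNamespace(lr=0.1, momentum=0.9, weight_decay=5e-5),
        decay=SimpleNamespace(step=10000, gamma=0.1),
        summary_interval=100,
        checkpoint_interval=1000,
        evaluation_interval=1000,
        epoch=100,
    ),
)
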
Example #4
def train(out_dir, chkpt_path, trainset, valset, writer, logger, hp, hp_str,
          graphs):
    model = RandWire(hp, graphs).cuda()

    if hp.train.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam)
    elif hp.train.optimizer == 'adabound':
        optimizer = adabound.AdaBound(model.parameters(),
                                      lr=hp.train.adabound.initial,
                                      final_lr=hp.train.adabound.final)
    elif hp.train.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=hp.train.sgd.lr,
                                    momentum=hp.train.sgd.momentum,
                                    weight_decay=hp.train.sgd.weight_decay)
    else:
        raise Exception("Optimizer not supported: %s" % hp.train.optimizer)

    step = 0

    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        step = checkpoint['step']

        if hp_str != checkpoint['hp_str']:
            logger.warning("New hparams are different from checkpoint.")
            logger.warning("Will use new hparams.")
        # hp = load_hparam_str(hp_str)
    else:
        logger.info("Starting new training run")
        logger.info("Writing graph to tensorboardX...")
        writer.write_graph(
            model,
            torch.randn(7, hp.model.input_maps, 224, 224).cuda())
        logger.info("Finished.")

    try:
        model.train()
        while True:
            for data, target in trainset:
                data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()

                step += 1
                loss = loss.item()
                if loss > 1e8 or math.isnan(loss):
                    logger.error("Loss exploded to %.02f at step %d!" %
                                 (loss, step))
                    raise Exception("Loss exploded")

                if step % hp.train.summary_interval == 0:
                    writer.log_training(loss, step)
                    logger.info("Wrote summary at step %d" % step)

                if step % hp.train.checkpoint_interval == 0:
                    save_path = os.path.join(out_dir, 'chkpt_%07d.pt' % step)
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'step': step,
                            'hp_str': hp_str,
                        }, save_path)
                    logger.info("Saved checkpoint to: %s" % save_path)

                if step % hp.train.evaluation_interval == 0:
                    test_loss, accuracy = validate(model, valset, writer, step)
                    logger.info(
                        "Evaluation saved at step %d | test_loss: %.5f | accuracy: %.4f"
                        % (step, test_loss, accuracy))

                if step % hp.train.decay.step == 0:
                    temp = optimizer.state_dict()
                    temp['param_groups'][0]['lr'] *= hp.train.decay.gamma
                    optimizer.load_state_dict(temp)

    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
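
Examples #4 and #5 log the model graph and training loss through a project-specific writer (writer.write_graph, writer.log_training). If that wrapper is built on tensorboardX, those calls map naturally onto add_graph and add_scalar; a minimal sketch of such a wrapper, assuming tensorboardX is installed (the class below is an assumption, not the projects' own writer):

from tensorboardX import SummaryWriter

class MyWriter(SummaryWriter):
    # Hypothetical wrapper; the writer used by these examples is not shown.
    def write_graph(self, model, dummy_input):
        self.add_graph(model, dummy_input)

    def log_training(self, loss, step):
        self.add_scalar('train/loss', loss, step)
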
Example #5
def train(out_dir, chkpt_path, trainset, valset, writer, logger, hp, hp_str,
          graphs):
    model = RandWire(hp, graphs).cuda()
    if hp.train.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.train.adam)
    elif hp.train.optimizer == 'adabound':
        optimizer = adabound.AdaBound(model.parameters(),
                                      lr=hp.train.adabound.initial,
                                      final_lr=hp.train.adabound.final)
    elif hp.train.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=hp.train.sgd.lr,
                                    momentum=hp.train.sgd.momentum,
                                    weight_decay=hp.train.sgd.weight_decay)
    else:
        raise Exception("Optimizer not supported: %s" % hp.train.optimizer)

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, hp.train.epoch)

    init_epoch = -1
    step = 0

    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        step = checkpoint['step']
        init_epoch = checkpoint['epoch']

        if hp_str != checkpoint['hp_str']:
            logger.warning("New hparams are different from checkpoint.")
            logger.warning("Will use new hparams.")
        # hp = load_hparam_str(hp_str)
    else:
        logger.info("Starting new training run")
        logger.info("Writing graph to tensorboardX...")
        # print(model)
        # parameters = 0
        # for p in list(model.parameters()):
        #     nn = 1
        #     for s in list(p.size()):
        #         nn = nn * s
        #     parameters += nn
        # print("Parameters", parameters)

        # print("model", hp.model)
        # summary(model, (1, 224, 224))
        writer.write_graph(
            model,
            torch.randn(7, hp.model.input_maps, 224, 224).cuda())
        # dummy input shape: (batch, input_channels, height, width)
        logger.info("Finished.")

    try:
        model.train()
        for epoch in itertools.count(init_epoch + 1):
            loader = tqdm.tqdm(trainset, desc='Train data loader')
            for data, target in loader:
                data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()

                loss = loss.item()
                if loss > 1e8 or math.isnan(loss):
                    logger.error("Loss exploded to %.02f at step %d!" %
                                 (loss, step))
                    raise Exception("Loss exploded")

                writer.log_training(loss, step)
                loader.set_description('Loss %.02f at step %d' % (loss, step))
                step += 1

            save_path = os.path.join(out_dir, 'chkpt_%03d.pt' % epoch)
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'step': step,
                    'epoch': epoch,
                    'hp_str': hp_str,
                }, save_path)
            logger.info("Saved checkpoint to: %s" % save_path)

            validate(model, valset, writer, epoch)
            lr_scheduler.step()

    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
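
Unlike the step-based checkpoints in Examples #2-#4, Example #5 checkpoints once per epoch and anneals the learning rate with CosineAnnealingLR, calling lr_scheduler.step() after each epoch. A self-contained illustration of how that schedule evolves the learning rate (the toy optimizer below exists only for demonstration):

import torch

# Toy optimizer just to show the cosine schedule used in Example #5.
opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=5)  # T_max corresponds to hp.train.epoch
for epoch in range(5):
    print(epoch, opt.param_groups[0]['lr'])
    opt.step()    # one "epoch" of training would happen here
    sched.step()  # LR follows a cosine curve from 0.1 down to eta_min (0 by default)
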