Example #1
def train_loop(hp, logger, writer):
    # make dataloader
    logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train)
    logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f)

    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        logger.info("Starting new training run.")

    try:
        for model.epoch in itertools.count(model.epoch + 1):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        logger.info("End of Train")
    except Exception as e:
        logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
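A minimal driver for this snippet might look like the following; the helper names (load_hparam, make_logger, Writer) are assumptions inferred from how they are used above, not part of the original example.

# Hypothetical entry point for the train_loop above. load_hparam, make_logger
# and Writer are assumed helpers, not defined in the example itself.
if __name__ == "__main__":
    hp = load_hparam("config/default.yaml")   # hyperparameter/config object
    logger = make_logger(hp)                  # standard logging.Logger
    writer = Writer(hp, hp.log.log_dir)       # tensorboard / wandb wrapper
    train_loop(hp, logger, writer)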
Example #2
def _do_work(filenames):
    # This is the only unique thing to the handler. You have to
    # implement the method that operates on a file.
    new_filenames = train_model(filenames)

    data = []
    if len(new_filenames) > 0:
        multi_file_manifest = {}
        context, socket = zmq_connect(port=5557, pattern="REQ")
        for f in new_filenames:
            single_file_manifest = generateFileManifest(f,
                                                        purpose="train_model")
            for k in single_file_manifest:
                multi_file_manifest[k] = single_file_manifest[k]

        socket.send_string(json.dumps(multi_file_manifest))
        repl = socket.recv()
        print(f"\nGot {repl}")
    else:
        n = inspect.stack()[0][3]
        print("\nnew_filenames is empty")
        print(f"{n} failed on {filenames}")

    return new_filenames
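The snippet relies on a zmq_connect helper that is not shown. A plausible sketch using pyzmq, assuming the request/reply pattern implied by pattern="REQ" and a localhost endpoint, is:

import zmq

def zmq_connect(port=5557, pattern="REQ"):
    # Create a ZeroMQ context and a socket of the requested pattern,
    # then connect it to the given port on localhost.
    context = zmq.Context()
    socket = context.socket(getattr(zmq, pattern))
    socket.connect(f"tcp://localhost:{port}")
    return context, socket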
Example #3
def train_loop(rank, hp, world_size=1):
    # reload hp
    hp = DotDict(hp)
    if hp.model.device.lower() == "cuda" and world_size != 0:
        setup(hp, rank, world_size)
    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, hp.log.log_dir)
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")

    if hp.model.device.lower() == "cuda" and world_size != 0:
        hp.model.device = rank
        torch.cuda.set_device(rank)
    else:
        hp.model.device = hp.model.device.lower()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank,
                                     world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.MSELoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        epoch_step = 1 if hp.data.divide_dataset_per_gpu else world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_iter:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer)
        if world_size != 0:
            cleanup()
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.info("Exiting due to exception: %s" % e)
        traceback.print_exc()
        if world_size != 0:
            cleanup()
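This variant is written to be spawned once per GPU. A typical launcher, assuming hp has already been loaded (it exposes to_dict(), as used in the example) and that setup/cleanup wrap torch.distributed, might be:

import torch
import torch.multiprocessing as mp

# Hypothetical launcher for the train_loop above: one process per visible GPU.
# train_loop receives the process rank, a plain dict copy of hp, and world_size.
if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    if world_size > 1:
        mp.spawn(train_loop, args=(hp.to_dict(), world_size), nprocs=world_size)
    else:
        train_loop(0, hp.to_dict(), world_size=1)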
Example #4
def main_func(params):
    assert params.data_mean != '', "-data_mean is required"
    assert params.data_sd != '', "-data_sd is required"
    params.data_mean = [float(m) for m in params.data_mean.split(',')]
    params.data_sd = [float(s) for s in params.data_sd.split(',')]

    if params.seed > -1:
        set_seed(params.seed)
    rnd_generator = torch.Generator(device='cpu') if params.seed > -1 else None

    # Setup image training data
    training_data, num_classes, class_weights = load_dataset(
        data_path=params.data_path, val_percent=params.val_percent, batch_size=params.batch_size,
        input_mean=params.data_mean, input_sd=params.data_sd, use_caffe=not params.not_caffe,
        train_workers=params.train_workers, val_workers=params.val_workers,
        balance_weights=params.balance_classes, rnd_generator=rnd_generator)


    # Setup model definition
    cnn, is_start_model, base_model = setup_model(params.model_file, num_classes=num_classes, base_model=params.base_model, pretrained=not params.reset_weights)

    if params.optimizer == 'sgd':
        optimizer = optim.SGD(cnn.parameters(), lr=params.lr, momentum=0.9)
    elif params.optimizer == 'adam':
        optimizer = optim.Adam(cnn.parameters(), lr=params.lr)
    else:
        raise ValueError("Unsupported optimizer: " + params.optimizer)

    lrscheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.96)

    if params.balance_classes:
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(params.use_device))
    else:
        criterion = torch.nn.CrossEntropyLoss()

    # Maybe delete branches
    if params.delete_branches and not is_start_model:
        try:
            cnn.remove_branches()
            has_branches = False
        except Exception:
            has_branches = True
    else:
        has_branches = True


    # Load pretrained model weights
    start_epoch = 1
    if not params.reset_weights:
        cnn, optimizer, lrscheduler, start_epoch = load_checkpoint(cnn, params.model_file, optimizer, lrscheduler, num_classes, is_start_model=is_start_model)

    if params.delete_branches and is_start_model:
        try:
            cnn.remove_branches()
            has_branches = False
        except Exception:
            has_branches = True
    else:
        has_branches = True


    # Maybe freeze some model layers
    main_layer_list = ['conv1', 'conv2', 'conv3', 'mixed3a', 'mixed3b', 'mixed4a', 'mixed4b', 'mixed4c', 'mixed4d', 'mixed4e', 'mixed5a', 'mixed5b']
    if params.freeze_to != 'none':
        for layer in main_layer_list:
            if params.freeze_to == layer:
                break
            for param in getattr(cnn, layer).parameters():
                param.requires_grad = False
    branch_layer_list = ['loss_conv', 'loss_fc', 'loss_classifier']
    if params.freeze_aux1_to != 'none' and has_branches:
        for layer in branch_layer_list:
            if params.freeze_aux1_to == layer:
                break
            for param in getattr(getattr(cnn, 'aux1'), layer).parameters():
                param.requires_grad = False
    if params.freeze_aux2_to != 'none' and has_branches:
        for layer in branch_layer_list:
            if params.freeze_aux2_to == layer:
                break
            for param in getattr(getattr(cnn, 'aux2'), layer).parameters():
                param.requires_grad = False


    # Optionally freeze/unfreeze specific layers and sub-layers
    if params.toggle_layers != 'none':
        toggle_layers = [l.replace('\\', '/').replace('.', '/').split('/') for l in params.toggle_layers.split(',')]
        for layer in toggle_layers:
            if len(layer) == 2:
                for param in getattr(getattr(cnn, layer[0]), layer[1]).parameters():
                    param.requires_grad = not param.requires_grad
            else:
                for param in getattr(cnn, layer[0]).parameters():
                    param.requires_grad = not param.requires_grad


    n_learnable_params = sum(param.numel() for param in cnn.parameters() if param.requires_grad)
    print('Model has ' + "{:,}".format(n_learnable_params) + ' learnable parameters\n')


    cnn = cnn.to(params.use_device)
    if 'cuda' in params.use_device:
        if params.seed > -1:
            torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.enabled = True


    save_info = [[params.data_mean, params.data_sd, 'BGR'], num_classes, has_branches, base_model]

    # Train model
    train_model(model=cnn, dataloaders=training_data, criterion=criterion, optimizer=optimizer,
                lrscheduler=lrscheduler, num_epochs=params.num_epochs, start_epoch=start_epoch,
                save_epoch=params.save_epoch, output_name=params.output_name, device=params.use_device,
                has_branches=has_branches, fc_only=False, num_classes=num_classes,
                individual_acc=params.individual_acc, should_save_csv=params.save_csv,
                csv_path=params.csv_dir, save_info=save_info)
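The set_seed helper used near the top of this example is not shown. A common sketch that seeds the Python, NumPy and PyTorch RNGs (an assumption about what the helper does) is:

import random
import numpy as np
import torch

def set_seed(seed):
    # Seed every RNG the training path may touch so runs are repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)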
Example #5
    else:
        raise ValueError("Invalid Model Type")

    print(model)

    model = model.to(device)
    model = trans_to_cuda(model)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)
    model, log = train_model(model,
                             criterion,
                             optimizer,
                             exp_lr_scheduler,
                             dataloaders,
                             dataset_sizes,
                             device,
                             num_epochs=args.epochs)
    df = pd.DataFrame({
        'epoch': log['epoch'],
        'training_loss': log['training_loss'],
        'training_acc': log['training_acc'],
        'val_loss': log['val_loss'],
        'val_acc': log['val_acc'],
    })
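The fragment stops after filling the DataFrame; persisting the training log, for instance, could look like this (the output path is arbitrary):

# Write the collected training history to disk for later inspection.
df.to_csv("training_log.csv", index=False)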
Example #6
epochs = 100
batch_size = 32
iterations = 10  # number of times to repeat the training
iter_start = 0
model_type = 'OctFiResNet'  # name of the proposed model
folder_name = '{}_models_{}'.format(model_type, train_id)
save_dir = os.path.join(models_root, folder_name)
make_summary = False

if make_summary:
    # model.summary() and plot_model()
    summarize_model(save_dir, model_type)

# train against other dataset
for val_id in val_datasets:
    if train_id == val_id:
        dt_name = 'itself'
        val_id = None
    else:
        dt_name = datasets[val_id]['name']
    print('Training', datasets[train_id]['name'], 'dataset against', dt_name)
    train_model(iterations=iterations,
                train_dataset=train_id,
                iter_start=iter_start,
                test_dataset=val_id,
                save_dir=save_dir,
                batch_size=batch_size,
                epochs=epochs,
                input_shape=input_shape,
                debug=DEBUG)
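summarize_model is not defined in the snippet. Assuming a Keras model and a hypothetical build_model(model_type, input_shape) factory, a rough sketch consistent with the "model.summary() and plot_model()" comment could be:

import os
from tensorflow.keras.utils import plot_model

def summarize_model(save_dir, model_type):
    # Rough sketch: build one instance of the model, then dump its textual
    # summary and an architecture diagram into save_dir.
    # build_model(...) is a hypothetical factory, not from the example.
    os.makedirs(save_dir, exist_ok=True)
    model = build_model(model_type, input_shape)
    with open(os.path.join(save_dir, 'summary.txt'), 'w') as f:
        model.summary(print_fn=lambda line: f.write(line + '\n'))
    plot_model(model, to_file=os.path.join(save_dir, 'model.png'), show_shapes=True)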
Example #7
def train_loop(rank, cfg):
    logger = get_logger(cfg, os.path.basename(__file__))
    if cfg.device == "cuda" and cfg.dist.gpus != 0:
        cfg.device = rank
        # turn off background generator when distributed run is on
        cfg.data.use_background_generator = False
        setup(cfg, rank)
        torch.cuda.set_device(cfg.device)

    # setup writer (only the logging process writes; other ranks keep writer=None)
    writer = None
    if is_logging_process():
        # set log/checkpoint dir
        os.makedirs(cfg.log.chkpt_dir, exist_ok=True)
        # set writer (tensorboard / wandb)
        writer = Writer(cfg, "tensorboard")
        cfg_str = OmegaConf.to_yaml(cfg)
        logger.info("Config:\n" + cfg_str)
        if cfg.data.train_dir == "" or cfg.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info("BackgroundGenerator is turned off when Distributed running is on")

        # download MNIST dataset before making dataloader
        # TODO: This is example code. You should change this part as you need
        _ = torchvision.datasets.MNIST(
            root=hydra.utils.to_absolute_path("dataset/meta"),
            train=True,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
        _ = torchvision.datasets.MNIST(
            root=hydra.utils.to_absolute_path("dataset/meta"),
            train=False,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
    # Sync dist processes (because of download MNIST Dataset)
    if cfg.dist.gpus != 0:
        dist.barrier()

    # make dataloader
    if is_logging_process():
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(cfg, DataloaderMode.train, rank)
    if is_logging_process():
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(cfg, DataloaderMode.test, rank)

    # init Model
    net_arch = Net_arch(cfg)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(cfg, net_arch, loss_f, rank)

    # load training state / network checkpoint
    if cfg.load.resume_state_path is not None:
        model.load_training_state()
    elif cfg.load.network_chkpt_path is not None:
        model.load_network()
    else:
        if is_logging_process():
            logger.info("Starting new training run.")

    try:
        if cfg.dist.gpus == 0 or cfg.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = cfg.dist.gpus
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > cfg.num_epoch:
                break
            train_model(cfg, model, train_loader, writer)
            if model.epoch % cfg.log.chkpt_interval == 0:
                model.save_network()
                model.save_training_state()
            test_model(cfg, model, test_loader, writer)
        if is_logging_process():
            logger.info("End of Train")
    except Exception as e:
        if is_logging_process():
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if cfg.dist.gpus != 0:
            cleanup()
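This version comes from a Hydra/OmegaConf-based setup. An entry point that launches it, assuming the usual torch.multiprocessing.spawn pattern and a config exposing cfg.dist.gpus, might be:

import hydra
import torch.multiprocessing as mp
from omegaconf import DictConfig

# Hypothetical Hydra entry point for the train_loop above: spawn one process
# per GPU when cfg.dist.gpus != 0, otherwise run single-process.
@hydra.main(config_path="config", config_name="default")
def main(cfg: DictConfig):
    if cfg.device == "cuda" and cfg.dist.gpus != 0:
        mp.spawn(train_loop, args=(cfg,), nprocs=cfg.dist.gpus)
    else:
        train_loop(rank=0, cfg=cfg)

if __name__ == "__main__":
    main()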
Example #8
def train_loop(rank, hp, world_size=0):
    if hp.model.device == "cuda" and world_size != 0:
        hp.model.device = rank
        # turn off background generator when distributed run is on
        hp.data.use_background_generator = False
        setup(hp, rank, world_size)
        torch.cuda.set_device(hp.model.device)

    # setup logger / writer
    if rank != 0:
        logger = None
        writer = None
    else:
        # set logger
        logger = make_logger(hp)
        # set writer (tensorboard / wandb)
        writer = Writer(hp, os.path.join(hp.log.log_dir, "tensorboard"))
        hp_str = yaml.dump(hp.to_dict())
        logger.info("Config:")
        logger.info(hp_str)
        if hp.data.train_dir == "" or hp.data.test_dir == "":
            logger.error("train or test data directory cannot be empty.")
            raise Exception("Please specify directories of data")
        logger.info("Set up train process")
        logger.info(
            "BackgroundGenerator is turned off when Distributed running is on")

        # download MNIST dataset before making dataloader
        # TODO: This is example code. You should change this part as you need
        _ = torchvision.datasets.MNIST(
            root="dataset/meta",
            train=True,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
        _ = torchvision.datasets.MNIST(
            root="dataset/meta",
            train=False,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
    # Sync dist processes (because of download MNIST Dataset)
    if world_size != 0:
        dist.barrier()

    # make dataloader
    if logger is not None:
        logger.info("Making train dataloader...")
    train_loader = create_dataloader(hp, DataloaderMode.train, rank,
                                     world_size)
    if logger is not None:
        logger.info("Making test dataloader...")
    test_loader = create_dataloader(hp, DataloaderMode.test, rank, world_size)

    # init Model
    net_arch = Net_arch(hp)
    loss_f = torch.nn.CrossEntropyLoss()
    model = Model(hp, net_arch, loss_f, rank, world_size)

    # load training state / network checkpoint
    if hp.load.resume_state_path is not None:
        model.load_training_state(logger)
    elif hp.load.network_chkpt_path is not None:
        model.load_network(logger=logger)
    else:
        if logger is not None:
            logger.info("Starting new training run.")

    try:
        if world_size == 0 or hp.data.divide_dataset_per_gpu:
            epoch_step = 1
        else:
            epoch_step = world_size
        for model.epoch in itertools.count(model.epoch + 1, epoch_step):
            if model.epoch > hp.train.num_epoch:
                break
            train_model(hp, model, train_loader, writer, logger)
            if model.epoch % hp.log.chkpt_interval == 0:
                model.save_network(logger)
                model.save_training_state(logger)
            test_model(hp, model, test_loader, writer, logger)
        if logger is not None:
            logger.info("End of Train")
    except Exception as e:
        if logger is not None:
            logger.error(traceback.format_exc())
        else:
            traceback.print_exc()
    finally:
        if world_size != 0:
            cleanup()
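The setup and cleanup calls above are assumed to wrap torch.distributed process-group management. A minimal sketch, with the address, port and backend as assumptions, is:

import os
import torch.distributed as dist

def setup(hp, rank, world_size):
    # Minimal sketch: initialize the default process group so that
    # dist.barrier() and distributed training work. The address, port
    # and backend here are assumptions, not from the example.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    # Tear the process group down at the end of training.
    dist.destroy_process_group()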
Example #9
def main(argv=sys.argv):
    """ The main script """

    args = parse_args(argv)

    action = args.app_action
    train_folder_path = args.trp
    test_folder_path = args.tep
    folder_or_image = "" if args.path is None else args.path
    # Any value supplied for this flag is treated as True; no value means False
    generate_model_name = args.gen_name

    # If the action is train, the model is the name of the new model
    # that is going to be trained; if it's predict, the model is the
    # name of the model to use for prediction
    model = args.model

    if action == 'train':

        new_model = model
        if not new_model:
            if generate_model_name in truth_values:
                # The user wants us to generate a model name for them.
                # The trp and tep args are implicitly required for users coming from the app.
                if train_folder_path and test_folder_path:
                    # The user fulfilled the requirement, so we can proceed:
                    # generate a name
                    new_model = generate_name(train_folder_path)
                    train_model(new_model, train_folder_path, test_folder_path)
                    return
                # Here, the user supplied only one folder argument, or none at all
                print(
                    "\n Both training folder and test folder arguments are required"
                )
                return
            # The user did not supply a model name and did not ask us to generate one,
            # so this must be running from the console app.
            # We don't want to retrain our default model, so check whether the default
            # model has already been trained. If it exists, return.
            if default_model in all_models():
                print(
                    "Retraining the default model is forbidden. Supply a model name, "
                    "or delete the default model manually and try again."
                )
                return

            #Training our default model now
            new_model = default_model
            print("Training the default model now...")
            #We use train function directly here for obvious reasons
            return train(new_model)

        #Model name supplied
        new_model = model + model_extension
        if new_model in all_models():
            print(
                "There's already a model with that name. Please choose another name"
                " or find a model with name {}. Delete it and try again".
                format(new_model))
            return
        # From here on, we expect the user to supply training and test datasets.
        # The trp and tep args are implicitly required for users coming from the app.
        if train_folder_path and test_folder_path:
            # The user fulfilled the requirement, so we can proceed
            return train_model(new_model, train_folder_path, test_folder_path)
        # Here, the user supplied only one folder argument, or none at all
        print("\n Both training folder and test folder arguments are required")
        return

    elif action == 'predict':

        # If no model was given, use the default one
        if not model:
            model = default_model

        else:
            model = model + model_extension

            # If one was supplied, check that it actually exists
            if model not in all_models():
                print("No such model has been trained")
                return

        # if it's not a folder that was supplied, check if it's a file
        if not os.path.isdir(folder_or_image):
            if os.path.isfile(folder_or_image):
                if not folder_or_image.endswith(image_extensions):
                    print("\nError: An image file is required. Try again\n")
                    return
                input_type = 'file'
                # add logic before here to pass in the model we want to use in the predictor
                predictor(input_type, folder_or_image, model)
                return
            print(
                '\nError: Invalid path. Kindly supply a valid folder or image path\n'
            )
            return

        input_type = 'folder'

        # add logic before here to pass in the model we want to use in the predictor
        predictor(input_type, folder_or_image, model)
        if input_type == 'folder':
            print(f"\nDone! The results are in {folder_or_image}")

    elif action == 'delete':
        # Check that model name is provided.

        if not model:
            print("\n You must supply a model to delete")
            return

        model = model + model_extension

        if model not in all_models():
            print("That model does not exist")
            return

        model_delete(model)

        return

    elif action == 'retrieve_models':

        # List all models
        print(all_models())

        return

    else:
        print(
            '\nAction command is not supported\n for help: run python3 app.py -h'
        )
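The parse_args helper is not shown. A sketch of an argparse parser exposing the attributes accessed above (app_action, trp, tep, path, gen_name, model) could be; the flag names and help texts are assumptions:

import argparse

def parse_args(argv):
    # Sketch of a parser matching the attributes used in main();
    # flag spellings and defaults are assumptions, not from the example.
    parser = argparse.ArgumentParser(description="Train, predict with, or manage models")
    parser.add_argument("app_action",
                        choices=["train", "predict", "delete", "retrieve_models"],
                        help="action to perform")
    parser.add_argument("--model", default=None, help="model name to train, use, or delete")
    parser.add_argument("--trp", default=None, help="training folder path")
    parser.add_argument("--tep", default=None, help="test folder path")
    parser.add_argument("--path", default=None, help="folder or image path for prediction")
    parser.add_argument("--gen_name", nargs="?", const="true", default=None,
                        help="any value means True; omit the flag for False")
    return parser.parse_args(argv[1:])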