Example no. 1
def train_model(model: SLModel,
                trainset: NpDataset,
                valset: NpDataset,
                epochs=5,
                batch_size=32):

    # Create the generators
    logging.info("Training model for {} epochs and {} batch size".format(
        epochs, batch_size))
    logging.info("Flowing the train and validation sets")
    traingen = trainset.flow(
        batch_size=batch_size, shuffle=True, seed=utils.get_random_seed())
    valgen = valset.flow(batch_size=batch_size, shuffle=False)

    # Create the callbacks
    logging.info("Creating the callbacks")
    callbacks = [
        ModelCheckpoint(
            utils.get_model_path(RUN_ID),
            "val_loss",
            verbose=1,
            save_best_only=True),
        Plotter(
            "loss",
            scale='log',
            plot_during_train=True,
            save_to_file=utils.get_plot_path(RUN_ID),
            block_on_end=False),
        Plotter(
            "accuracy",
            scale='linear',
            plot_during_train=True,
            save_to_file=utils.get_plot_path(RUN_ID + "_acc"),
            block_on_end=False)
    ]

    # Create the optimizer
    logging.info("Creating the optimizer")
    params = [param for param in model.parameters() if param.requires_grad]
    # optimizer = optim.SGD(
    #     params,
    #     lr=0.01,
    #     momentum=0.9,
    #     nesterov=True)
    optimizer = optim.Adam(params)
    logging.info("Optimizer: %r" % optimizer)

    # Train the model
    logs = model.fit_generator(
        traingen,
        traingen.steps_per_epoch,
        epochs=epochs,
        optimizer=optimizer,
        validation_generator=valgen,
        validation_steps=valgen.steps_per_epoch,
        metrics=["accuracy"],
        callbacks=callbacks,
        verbose=1)

    return logs
Example no. 2
 def fit(self, X, y, *args, parallel=None, **kwargs):
     """
     :param X: n_samples X n_models
     :param y: n_samples
     :param args: placeholder
     :param kwargs: placeholder
     """
     data = NpDataset(X, y=y)
     self.__val_preds = np.zeros(y.shape)
     if self.__val_preds.ndim != 1:
         logging.error(
             "Shape of validation predictions is incorrect: {}".format(
                 self.__val_preds.shape))
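     # Fit one model per fold in parallel, collecting each fold's out-of-fold
     # validation predictions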
     model_and_preds = parallel(
         delayed(fit_fold_model)(self.models[i], train_data.x, train_data.y,
                                 val_data.x, val_data.y, *args, **kwargs)
         for i, (train_data,
                 val_data) in enumerate(data.kfold(self.k, shuffle=False)))
     cur_sample_ind = 0
     for i, (model, val_preds) in enumerate(model_and_preds):
         self.models[i] = model
         self.__val_preds[cur_sample_ind:cur_sample_ind +
                          val_preds.shape[0]] = val_preds
         cur_sample_ind += val_preds.shape[0]
     assert cur_sample_ind == X.shape[0]
Example no. 3
def test_model(model: SLModel, test_data: NpDataset, batch_size=BATCH_SIZE):
    logging.info("Testing model with batch size of {batch_size}".format(**locals()))
    logging.info("Flowing the test set")
    test_data.output_labels = False
    testgen = test_data.flow(batch_size=batch_size, shuffle=False)
    test_preds = model.predict_generator(
        testgen, testgen.steps_per_epoch, verbose=1)
    return test_preds.squeeze(-1)
Example no. 4
 def roc_auc(self, X, y, *args, **kwargs):
     data = NpDataset(X, y=y)
     score = 0.
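     # Score each fold's model on its held-out fold and average the ROC AUC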
     for i, (train_data,
             val_data) in enumerate(data.kfold(self.k, shuffle=False)):
         score = score + self.models[i].roc_auc(val_data.x, val_data.y, *
                                                args, **kwargs)
     return score / self.k
Example no. 5
def validate_model(model: SLModel, val_data: NpDataset, batch_size=32):
    logging.info("Validating model with batch size of {}".format(batch_size))
    val_data.output_labels = False
    logging.info("Flowing the validation set")
    valgen = val_data.flow(batch_size=batch_size, shuffle=False)
    logging.info("Getting validation predictions")
    val_preds = model.predict_generator(valgen, valgen.steps_per_epoch)
    score = roc_auc_score(val_data.y[:, 0], val_preds[:, 0])
    logging.info("Validation ROC AUC score: {}".format(score))
    return score
Example no. 6
def train_model(model: SLModel,
                trainset: NpDataset,
                valset: NpDataset,
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                plot=True):

    # Create the generators
    logging.info("Training model for {epochs} epochs and {batch_size} batch "
                 "size".format(**locals()))
    logging.info("Flowing the train and validation sets")
    traingen = trainset.flow(
        batch_size=batch_size, shuffle=True, seed=utils.get_random_seed())
    valgen = valset.flow(batch_size=batch_size, shuffle=False)

    # Create the callbacks
    logging.info("Creating the callbacks")
    callbacks = [
        ModelCheckpoint(
            utils.get_model_path(RUN_ID),
            "val_loss",
            verbose=1,
            save_best_only=True),
        Plotter(
            "bce",
            scale='log',
            plot_during_train=plot,
            save_to_file=utils.get_plot_path(RUN_ID + "_bce"),
            block_on_end=False),
        Plotter(
            "dice",
            plot_during_train=plot,
            save_to_file=utils.get_plot_path(RUN_ID + "_dice"),
            block_on_end=False),
        Plotter(
            "iou",
            plot_during_train=plot,
            save_to_file=utils.get_plot_path(RUN_ID + "_iou"),
            block_on_end=False),
    ]

    # Train the model
    logs = model.fit_generator(
        traingen,
        traingen.steps_per_epoch,
        epochs=epochs,
        validation_data=valgen,
        validation_steps=valgen.steps_per_epoch,
        callbacks=callbacks,
        metrics=["iou", mean_iou],
        verbose=1)

    return logs
Example no. 7
    def load_train(self):
        # Just load the data into a numpy dataset, it ain't that big
        logging.info("Loading train images from {self.path_to_train_images} "
                     "and masks from {self.path_to_train_masks}"
                     "".format(**locals()))
        img_paths = sorted(glob(self.glob_train_images))
        mask_paths = set(glob(self.glob_train_masks))  # Use a set for fast lookups
        # Initialize the numpy data containers
        x = np.zeros((len(img_paths), ) + self.img_size + (4, ))
        y = np.zeros((len(img_paths), ) + self.img_size + (1, ))
        ids = []
        for i, img_path in enumerate(tqdm(img_paths)):
            img_basename = os.path.basename(img_path)
            ids.append(os.path.splitext(img_basename)[0])

            x[i, ..., :3] = ImageDataset.load_img(img_path,
                                                  img_size=None,
                                                  mode=self.mode)[0]
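            # Store this image's depth value in the fourth channel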
            x[i, ..., 3] = self.depths.loc[ids[-1]]
            # Load the mask
            mask_path = os.path.join(self.path_to_train_masks, img_basename)
            # Use the all-zero mask if it's not there
            if mask_path not in mask_paths:
                logging.info("Could not find {img_basename} in masks"
                             "".format(**locals()))
                continue
            y[i] = ImageDataset.load_img(mask_path, img_size=None,
                                         mode="gray")[0]
        print("X shape:", x.shape)
        print("Y Shape:", y.shape)
        return NpDataset(x.astype('float32'),
                         y.astype('float32'),
                         ids=np.array(ids))
Example no. 8
def test_model(model: SLModel, test_data: NpDataset, batch_size=32):
    logging.info("Testing model with batch size of {}".format(batch_size))
    logging.info("Flowing the test set")
    testgen = test_data.flow(batch_size=batch_size, shuffle=False)
    test_preds = model.predict_generator(
        testgen, testgen.steps_per_epoch, verbose=1)
    return test_preds[:, 0]
Example no. 9
    def kfold(self, k=True, shuffle=False, seed=None):
        for train_split, val_split in self.original_dataset.get_kfold_indices(
                k, shuffle, seed):
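            # Build the training fold from every augmented dataset, but draw
            # the validation fold only from the original dataset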
            train_data = MultiNpDatasetAugmenter(*(NpDataset(
                dataset.x[train_split],
                y=None if not dataset.has_labels else dataset.y[train_split],
                ids=None if not dataset.has_ids else dataset.ids[train_split])
                                                   for dataset in self.datasets
                                                   ))

            val_data = NpDataset(self.original_dataset.x[val_split],
                                 y=None if not self.original_dataset.has_labels
                                 else self.original_dataset.y[val_split],
                                 ids=None if not self.original_dataset.has_ids
                                 else self.original_dataset.ids[val_split])
            yield train_data, val_data
Example no. 10
 def load_supervised(data):
     ids = data["ids"]
     text = data["texts"]
     if "labels" in data:
         labels = data["labels"]
     else:
         labels = None
     return ids, NpDataset(text, labels, ids=ids)
Example no. 11
    def fit(self, *args, **kwargs):
        scores = np.zeros(len(LABEL_NAMES))
        rocs = np.zeros(len(LABEL_NAMES))
        accs = np.zeros(len(LABEL_NAMES))
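        # For each label, search over the parameter grid and keep the best-scoring model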
        for label_num in range(len(LABEL_NAMES)):
            logging.info("Training for label {label}".format(
                label=LABEL_NAMES[label_num]))
            subdataset = NpDataset(self.base_dataset.x[..., label_num],
                                   y=self.base_dataset.y[..., label_num])
            # Best stuff
            best_score = float('inf')
            best_param_num = 0
            for param_num in trange(len(self.model[label_num])):
                # This will also save the val_preds
                self.model[label_num][param_num].fit(subdataset.x,
                                                     subdataset.y, *args,
                                                     **kwargs)

                # Save the model if its our best so far
                score = self.model[label_num][param_num].score(
                    subdataset.x, subdataset.y)
                print(self.grid[param_num])
                if score < best_score:
                    logging.info(
                        "Score improved from {best_score} to {score}".format(
                            best_score=best_score, score=score))
                    self.best_model.models[label_num] = copy.deepcopy(
                        self.model[label_num][param_num])
                    best_score = score
                    best_param_num = param_num
                    # Calculate the stats
                    scores[label_num] = best_score
                    rocs[label_num] = self.best_model.models[
                        label_num].roc_auc(subdataset.x, subdataset.y)
                    accs[label_num] = self.best_model.models[
                        label_num].accuracy(subdataset.x, subdataset.y)

                # Remove the current model from memory
                self.model[label_num][param_num] = None
            logging.info(
                "Best score achieved is {best_score} with params {best_params}"
                .format(best_score=best_score,
                        best_params=self.grid[best_param_num]))
            logging.info(
                "Metrics are: Accuracy - {acc} --- ROC AUC - {roc}".format(
                    acc=accs[label_num], roc=rocs[label_num]))

            logging.info("C")
        logging.info(
            "Average Scores: LogLoss - {loss} --- Accuracy - {acc} --- ROC AUC - {roc}"
            .format(loss=np.average(scores),
                    acc=np.average(accs),
                    roc=np.average(rocs)))
Example no. 12
def test_augmenter_basic():
    # Try different combinations of with labels and without
    data = NpDataset(x=np.ones((32, )), y=np.ones((32, )))
    augmenter = ZeroAugmenter(labels=False, augment_labels=False)
    assert not augmenter.labels
    assert not augmenter.augment_labels
    data.output_labels = False
    x = next(augmenter(data.flow(batch_size=32)))
    assert np.all(x == 0.)

    augmenter = ZeroAugmenter(labels=False, augment_labels=True)
    assert not augmenter.labels
    assert augmenter.augment_labels
    data.output_labels = False
    x = next(augmenter(data.flow(batch_size=32)))
    assert np.all(x == 0.)

    augmenter = ZeroAugmenter(labels=True, augment_labels=False)
    assert augmenter.labels
    assert not augmenter.augment_labels
    data.output_labels = True
    x, y = next(augmenter(data.flow(batch_size=32)))
    assert np.all(x == 0.)
    assert np.all(y == 1.)

    augmenter = ZeroAugmenter(labels=True, augment_labels=True)
    assert augmenter.labels
    assert augmenter.augment_labels
    data.output_labels = True
    x, y = next(augmenter(data.flow(batch_size=32)))
    assert np.all(x == 0.)
    assert np.all(y == 0.)

    # Try a generic python generator
    def datagen():
        yield np.ones((32, ))

    augmenter = ZeroAugmenter(labels=False, augment_labels=False)
    x = next(augmenter(datagen()))
    assert np.all(x == 0.)
Example no. 13
 def validation_split(self,
                      split=0.2,
                      shuffle=False,
                      seed=None,
                      stratified=False):
     """
     NOTE: Only use stratified if the labels are the same between the augmented sets.
     This will assume the first dataset provided is the original and others are augmented
     versions. Thus the validation set will only be pulled from the original dataset.
     """
     # Get the split indices
     train_split, val_split = self.original_dataset.get_split_indicies(
         split, shuffle, seed, stratified)
     # Create each subdataset
     train_data = MultiNpDatasetAugmenter(*(NpDataset(
         dataset.x[train_split],
         None if not self.output_labels else dataset.y[train_split])
                                            for dataset in self.datasets))
     # We use the original dataset for the validation set
     val_data = NpDataset(
         self.original_dataset.x[val_split], None
         if not self.output_labels else self.original_dataset.y[val_split])
     return train_data, val_data
Example no. 14
 def load_test(self):
     # Just load the data into a numpy dataset, it ain't that big
     logging.info("Loading test images from {self.path_to_test_images}"
                  " and glob {self.glob_test_images}".format(**locals()))
     img_paths = sorted(glob(self.glob_test_images))
     # Initialize the numpy data containers
     x = np.zeros((len(img_paths), ) + self.img_size + (4, ))
     ids = []
     for i, img_path in enumerate(tqdm(img_paths)):
         # Extract the image id first so it can be used for the depth lookup
         img_basename = os.path.basename(img_path)
         ids.append(os.path.splitext(img_basename)[0])
         x[i, ..., :3] = ImageDataset.load_img(img_path,
                                               img_size=None,
                                               mode=self.mode)[0]
         # Store the normalized depth for this image in the fourth channel
         x[i, ..., 3] = self.depths.loc[ids[-1]] / MAX_DEPTH
     print("Xte Shape:", x.shape)
     return NpDataset(x.astype('float32'), ids=np.array(ids))
Example no. 15
def create_predictions(model_names,
                       k,
                       seed=7,
                       savedir="../superlearner_preds/",
                       data_paths=tuple(),
                       batch_size=32):
    num_base_learners = len(model_names)
    logging.info("Using %s base learners" % num_base_learners)
    # Build the new train data to train the meta learner on
    predictions, pred_labels = None, None
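    # Stack each base learner's out-of-fold predictions into an
    # (n_samples, n_models, n_outputs) array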
    for j, model_name in enumerate(model_names):
        # Try to load it, otherwise create the predictions
        try:
            single_predictions, pred_labels = load_predictions(
                model_name, savedir, pred_labels=pred_labels)
        except Exception:
            # If the file is not there, create it
            logging.info("Couldn't load predictions for " + model_name +
                         ", creating instead")
            train_data = load_dataset(data_paths[j])
            single_predictions, pred_labels = predict_val(
                model_name,
                train_data,
                k,
                seed=seed,
                Y=pred_labels,
                batch_size=batch_size)
            save_predictions(single_predictions, predictions, model_names[j],
                             savedir)

        assert single_predictions.ndim == 2
        # Construct the X array if this is our first iteration
        if j == 0:
            predictions = np.zeros(
                (single_predictions.shape[0], num_base_learners,
                 single_predictions.shape[1]),
                dtype=np.float32)

        assert predictions.shape[0] == single_predictions.shape[0]
        assert predictions.shape[2] == single_predictions.shape[1]
        predictions[:, j] = single_predictions

    return NpDataset(predictions, y=pred_labels)
Example no. 16
def train_superlearner(pred_X, pred_Y):
    # Train one logistic regression (a single dense layer) per label as the meta-learner
    num_base_learners = pred_X.shape[1]
    weights = np.zeros((num_base_learners, len(LABEL_NAMES)))
    for i, label in enumerate(LABEL_NAMES):
        logging.info("Training logistic regression for label %s" % label)
        pred_dataset = NpDataset(x=pred_X[:, :, i], y=pred_Y[:, i:i + 1])
        datagen = DatasetGenerator(pred_dataset, batch_size=len(pred_dataset), shuffle=False)
        logistic_reg = build_model(num_base_learners, 1)
        optimizer = torch.optim.SGD(logistic_reg.parameters(), lr=0.01)
        train_logs, val_logs = logistic_reg.fit_generator(datagen, steps_per_epoch=datagen.steps_per_epoch, epochs=1000,
                                                          optimizer=optimizer,
                                                          loss_fn=F.binary_cross_entropy_with_logits,
                                                          metrics=[accuracy_with_logits], verbose=0)
        logging.info("Final Loss: %s" % train_logs["loss"][-1])
        logging.info("Final Accuracy: %s" % train_logs["accuracy_with_logits"][-1])
        weight = logistic_reg.torch_module.linear.weight.data
        weights[:, i] = weight.cpu().numpy().flatten()
        logging.info("Trained weights: {}".format(weights[:, i]))
    return weights
Example no. 17
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


model = Net()
if args.cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

# Turn the numpy dataset into a BatchGenerator
train_datagen = DatasetGenerator(NpDataset(xtr, y=ytr),
                                 batch_size=32,
                                 shuffle=True,
                                 seed=1234)
# Turn the val data into a BatchGenerator
val_datagen = DatasetGenerator(NpDataset(xval, y=yval),
                               batch_size=1000,
                               shuffle=True,
                               seed=1234)


def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_datagen):
        data, target = torch.Tensor(data), torch.LongTensor(target)
        if args.cuda:
Example no. 18
def train_model(model,
                trainset: NpDataset,
                valset: NpDataset,
                epochs=70,
                batch_size=32,
                val_batch_size=32,
                plot=True,
                run_id='default_model_name',
                augmenter=None,
                verbose=1,
                debug=False):

    # Create the generators
    logger.info(
        f'Training model for {epochs} epochs and {batch_size} batch size')
    logger.info('Flowing the train and validation sets')
    traingen = trainset.flow(batch_size=batch_size,
                             shuffle=True,
                             seed=utils.get_random_seed())
    valgen = valset.flow(batch_size=val_batch_size, shuffle=False)

    if augmenter is not None:
        logger.info(f'Training with augmenter {augmenter.image_augmenter}')
        augmenter.labels = True
        traingen = augmenter(traingen)

    # Create the callbacks
    logger.info('Creating the callbacks')
    callbacks = [
        ModelCheckpoint(utils.get_model_path(run_id),
                        'val_loss',
                        verbose=verbose,
                        save_best_only=True,
                        save_weights_only=True),
        ModelCheckpoint(utils.get_model_path(run_id + '_dice_coef'),
                        'val_dice_coef',
                        verbose=verbose,
                        save_best_only=True,
                        save_weights_only=True,
                        mode='max'),
        Plotter('loss',
                scale='log',
                plot_during_train=plot,
                save_to_file=utils.get_plot_path(run_id + '_loss'),
                block_on_end=False),
        Plotter('dice_coef',
                plot_during_train=plot,
                save_to_file=utils.get_plot_path(run_id + '_dice_coef'),
                block_on_end=False),
    ]

    train_steps = 3 if debug else traingen.steps_per_epoch
    val_steps = 3 if debug else valgen.steps_per_epoch
    epochs = 2 if debug else epochs

    # Train the model
    logs = model.fit_generator(traingen,
                               train_steps,
                               epochs=epochs,
                               validation_data=valgen,
                               validation_steps=val_steps,
                               callbacks=callbacks,
                               verbose=verbose,
                               max_queue_size=3)

    return logs
Example no. 19
def test_model(model,
               test_data: NpDataset,
               batch_size=32,
               num_augmentations=0,
               view_preds=False,
               debug=False):
    logger.info(f'Testing model with batch size of {batch_size}')
    logger.info('Flowing the test set')
    test_data.output_labels = False
    testgen = test_data.flow(batch_size=batch_size, shuffle=False)
    if num_augmentations:
        print(f'Testing with a flip augmenter')
        augmenter = FlipAugmenter(flipud=True, fliplr=True)
        aug_params = [
            dict(flipud=True, fliplr=True),
            dict(flipud=True, fliplr=False),
            dict(flipud=False, fliplr=True),
            dict(flipud=False, fliplr=False)
        ]
        augmenter.labels = False
        testgen = augmenter(testgen)
    else:
        num_augmentations = 1
        augmenter = None

    test_steps = 3 if debug else testgen.steps_per_epoch

    test_preds = 0.
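    # Accumulate predictions over each flip configuration (test-time augmentation)
    # and average below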
    for i in range(num_augmentations):
        if augmenter is not None:
            print(
                f'Testing for augmentation {i+1}/{num_augmentations} with flipud={aug_params[i]["flipud"]} and fliplr={aug_params[i]["fliplr"]}'
            )
            augmenter.flipud = aug_params[i]['flipud']
            augmenter.fliplr = aug_params[i]['fliplr']

        aug_test_preds = model.predict_generator(
            testgen, test_steps, verbose=1, max_queue_size=0, workers=0
        )  # Must set to workers=0 to maintain test prediction order
        # Reverse the augmentations
        # TODO: only works with flips, implement general solution for non-flips
        if augmenter is not None:
            print('Running reverse augmentation on predictions...')
            aug_test_preds = augmenter.reverse_augment(aug_test_preds)

        if view_preds:
            if augmenter:
                testgen.generator.restart()
                display_predictions(testgen.generator, aug_test_preds)
            else:
                display_predictions(testgen, aug_test_preds)

        test_preds = test_preds + aug_test_preds
    test_preds /= num_augmentations

    if debug:
        filler = np.zeros(
            (len(test_data) - len(test_preds), *test_preds.shape[1:]))
        test_preds = np.concatenate([test_preds, filler])

    if view_preds:
        display_predictions(testgen, test_preds)

    return test_preds.squeeze(-1)
Example no. 20
                        submission_file)


if __name__ == "__main__":
    args = parser.parse_args()
    # Load the train_config
    train_config = load_train_setup(args.train_id)
    trained_model = None
    PLOT = args.plot
    if args.train:
        # Load the train data
        train_ids, x_train, y_train = dsb.load_train_data(
            path_to_train="../input/train/",
            img_size=train_config["img_size"],
            num_channels=3)
        train_dataset = NpDataset(x=x_train, y=y_train, ids=train_ids)
        # train the models
        if not train_config["kfold"]:
            raise NotImplementedError("Non-kfold training is not implemented")
        trained_model = kfold(train_dataset,
                              train_config,
                              args.train_id,
                              num_completed=args.num_completed)

    if args.test:
        # Load the test data
        test_ids, x_test, sizes_test = dsb.load_test_data(
            path_to_test="../input/test/",
            img_size=train_config["img_size"],
            num_channels=3)
        test_dataset = NpDataset(x=x_test, ids=test_ids)
Example no. 21
model = MNISTModel()
model.add_loss(nn.CrossEntropyLoss())

# This will save the best scoring model weights to the current directory
best_model = ModelCheckpoint(
    "mnist_pyjet" + ".state",
    monitor="val_accuracy",
    mode="max",
    verbose=1,
    save_best_only=True,
)
# This will plot the model's accuracy during training
plotter = Plotter(scale="linear", monitor="accuracy")

# Turn the numpy dataset into a BatchGenerator
train_datagen = NpDataset(xtr, y=ytr).flow(batch_size=64, shuffle=True, seed=1234)
# Turn the val data into a BatchGenerator
val_datagen = NpDataset(xval, y=yval).flow(batch_size=1000, shuffle=True, seed=1234)

# Set up the optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
model.add_optimizer(optimizer)

# Add the LR scheduler
one_cycle = OneCycleScheduler(
    optimizer, (1e-4, 1e-2), (0.95, 0.85), train_datagen.steps_per_epoch * 5
)


class LR(Metric):
    def __init__(self, onecycle):
Example no. 22
 def load_test(self):
     ids, data = self.load_application_data(type='test')
     return NpDataset(data.values, ids=ids.values)
Example no. 23
 def load_train(self):
     # TODO: For now this just loads the training data
     ids, data, targets = self.load_application_data(type='train')
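     # Add a trailing axis so the targets have shape (n_samples, 1)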
     y = targets.values.astype(np.float32)[:, None]
     return NpDataset(data.values, y=y, ids=ids.values)