def evaluate(model):
    features, classes, sample_names, feature_names, class_names = read_data()

    # Now, here we take the other portion of our input data and use
    # that to test the model and ensure it performs well on data it
    # hasn't seen before.
    num_test_samples = int(0.15 * len(features))
    test_features, test_classes = \
        features[-num_test_samples:], classes[-num_test_samples:]

    print("Evaluating test accuracy...")
    evaluate_model(model, test_features, test_classes,
                   sample_names[-num_test_samples:], class_names)
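
# A minimal usage sketch (assumes a model previously saved by train_model();
# 'model.bin' is the default output path used there):
#
#   with open('model.bin', 'rb') as f:
#       model = pickle.load(f)
#   evaluate(model)
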
def evaluate_dataset(model, data_folder_path, intensity_correction=0.0):
    val = model_utils.load_dataset(data_folder_path)
    val_X, val_y = model_utils.convert_training_images_to_numpy_arrays(val)
    val_X += intensity_correction / (2**8 - 1)  # Adjust for differing light levels between the training set and this dataset
    val_X = model_utils.fake_colors(val_X)
    val_y = model_utils.replace_class(val_y, class_id=5)

    return model_utils.evaluate_model(model, val_X, val_y, num_classes=5)
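
# Hedged usage sketch (the path and intensity value below are illustrative
# assumptions, not taken from the source):
#
#   model = tf.keras.models.load_model("runs/some_run/model.hdf5")
#   conf_mat = evaluate_dataset(model, "/data/river/validation", intensity_correction=10.0)
#   print(conf_mat)
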
def train_model(output_file='model.bin'):
    # First, let's read all of the features that we got from feature_extract.
    # Fun fact: you could do ./feature_extract.py | ./classifier.py to execute
    # both the feature extraction and classification steps at once, without
    # writing the results to JSON first. Very handy for iterating on features.
    features, classes, sample_names, feature_names, class_names = read_data()

    # We'll use this percentage of the data to train, and the rest for testing.
    # Why not just train on all the data? That would result in an overfitted
    # model: one that does very well on the data it has seen but poorly on
    # data it hasn't seen.
    training_percentage = 0.85
    num_training_samples = int(len(features) * training_percentage)

    # Here we separate all of our features and classes into just the ones
    # we want to train on...
    training_features, training_classes = \
        features[:num_training_samples], classes[:num_training_samples]

    # ...and we do the training, which creates our model!
    # vvv MACHINE LEARNING HAPPENS ON THIS LINE BELOW vvv
    model = DecisionTreeClassifier(random_state=2).fit(training_features,
                                                       training_classes)
    # ^^^ MACHINE LEARNING HAPPENS ON THIS LINE ABOVE ^^^

    with open(output_file, 'wb') as out:
        pickle.dump(model, out)

    # These two lines write out a .pdf file of the model's decision tree.
    # It's useful if you want to explain the model, but requires
    # you to have Graphviz installed, so I've left it commented out.
    # from model_utils import explain_model
    # explain_model(model, feature_names, class_names)

    print("Evaluating training accuracy...")
    evaluate_model(model,
                   training_features,
                   training_classes,
                   sample_names[:num_training_samples],
                   class_names,
                   output=False)

    return model
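
# Minimal end-to-end sketch: train on the first 85% of the samples, then score
# the model (assumes read_data() returns samples in the same order both times,
# so the 85/15 split lines up between training and evaluation):
#
#   model = train_model(output_file='model.bin')
#   evaluate(model)
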
Example #4
# Indicators of interest
indicators = [
    'Wealth Index', 'Education completed (years)', 'Access to electricity',
    'Access to water (minutes)'
]

# ## OSM Features + Nighttime Lights

# In[ ]:
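
# The `scoring` object used below is defined in an earlier cell that is not part
# of this snippet. A plausible scikit-learn-style definition (an assumption, not
# taken from the source) would map metric names to scorer strings, with
# `refit='r2'` selecting which metric the final model is refit on:
#
#   scoring = {
#       'r2': 'r2',
#       'rmse': 'neg_root_mean_squared_error',
#       'mae': 'neg_mean_absolute_error',
#   }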

predictions = model_utils.evaluate_model(data=dhs,
                                         feature_cols=osm_ntl_cols,
                                         indicator_cols=indicators,
                                         wandb=wandb,
                                         scoring=scoring,
                                         model_type='random_forest',
                                         refit='r2',
                                         search_type='random',
                                         n_splits=5,
                                         n_iter=10,
                                         plot_importance=False,
                                         verbose=2)

# ## OSM Features Only

# In[7]:

predictions = model_utils.evaluate_model(data=dhs,
                                         feature_cols=osm_cols,
                                         indicator_cols=indicators,
                                         scoring=scoring,
                                         wandb=None,
Example #5
f.colorbar(points)


# ## Machine Learning Pipeline

# ### Using CNN feature embeddings + Regional indicators

# In[31]:


predictions = model_utils.evaluate_model(
    data=data,
    feature_cols=embedding_cols+region_cols,
    indicator_cols=indicators,
    wandb=wandb,
    scoring=scoring,
    model_type='ridge',
    refit='r2',
    search_type='grid',
    n_splits=5,
    n_workers=1
)


# ### Using CNN feature embeddings

# In[33]:


predictions = model_utils.evaluate_model(
    data=data,
    wandb=None,
Example #6
# model = Scattering2dNet()
#
# x = torch.rand(1,3,64,64)
# x_scatter = scattering(x)
# outp = model(x_scatter)
# print(outp)
# =============================================================================

#two_fc_classifier = TwoFullNet(100)
#two_fc_classifier = TwoConvTwoFullNet()
two_fc_classifier = ScatteringEqualNet_Batch_Good_Cuda_hiden()

#two_fc_classifier = Scattering2dNet()

if MU.use_cuda: two_fc_classifier.cuda()
evaluate_model(two_fc_classifier)

# =============================================================================
# #from scatwave.scattering import Scattering
# from kymatio import Scattering2D
# import kymatio.datasets as scattering_datasets
#
# #scat = Scattering2D(M=MU.imgsize[0]+8, N=MU.imgsize[1]+8, J=4, jit=True)
# scat = Scattering2D(J=4, shape= (MU.imgsize[0]+8, MU.imgsize[1]+8), L=8)
# if MU.use_cuda : scat = scat.cuda()
#
# print(scat['psi'])
#
# class scatteringfullnet(nn.module) :
#     """
#     implements a trainable model which is the concatenation
def run_from_dir(
        train_data_folder_path,
        val_data_folder_path,
        model_name="vgg16",
        freeze="all",
        run_path="/home/kitkat/PycharmProjects/river-segmentation/runs",
        batch_size=1,
        dropout=0):
    """
        Trains a CNN Unet model and saves the best model to file. Uses training images from disk instead of loading
        everything into RAM.

        :param train_data_folder_path: Path to the folder containing training images (.png format)
        :param val_data_folder_path: Path to the folder containing validation images (.png format)
        :param model_name: The name of the model. Supported models are: vgg16
        :param freeze: Determine how many blocks in the encoder that are frozen during training.
        Should be all, first, 1, 2, 3, 4, 5 or none
        :param run_path: Folder where the run information and model will be saved.
        :param dropout: Drop rate, [0.0, 1)
        :return: Writes model to the run folder, nothing is returned.
        """

    tf.keras.backend.clear_session()
    start_time = time.time()

    # Make run name based on parameters and timestamp
    run_name = f"{model_name}_freeze_{freeze}"
    date = str(datetime.datetime.now())
    run_path = os.path.join(run_path, f"{date}_{run_name}".replace(" ", "_"))
    os.makedirs(run_path, exist_ok=True)

    # Setup data generators
    image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=lambda x: x / (2**8 - 1))
    mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator()

    image_generator = image_datagen.flow_from_directory(os.path.join(
        train_data_folder_path, "images"),
                                                        class_mode=None,
                                                        target_size=(512, 512),
                                                        seed=1,
                                                        batch_size=batch_size)
    mask_generator = mask_datagen.flow_from_directory(os.path.join(
        train_data_folder_path, "labels"),
                                                      class_mode=None,
                                                      target_size=(512, 512),
                                                      seed=1,
                                                      batch_size=batch_size,
                                                      color_mode="grayscale")
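    # Both generators use seed=1 so their shuffling stays in sync and each image
    # batch is paired with the matching mask batch when zipped below.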
    train_generator = (pair for pair in zip(image_generator, mask_generator))

    # Validation data
    val = model_utils.load_dataset(val_data_folder_path)
    val_X, val_y = model_utils.convert_training_images_to_numpy_arrays(val)
    val_X = model_utils.fake_colors(val_X)
    val_y = model_utils.replace_class(val_y, class_id=5)

    # Load and compile model
    if model_name.lower() == "vgg16":
        model = vgg16_unet(freeze=freeze,
                           context_mode=False,
                           num_classes=5,
                           dropout=dropout)
    else:
        raise ValueError(f"Unsupported model name: {model_name}")
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(opt,
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

    # Define callbacks
    callbacks = []
    callbacks.append(
        tf.keras.callbacks.EarlyStopping(patience=10, monitor="val_loss"))

    checkpoint = tf.keras.callbacks.ModelCheckpoint(os.path.join(
        run_path, "model.hdf5"),
                                                    monitor="val_loss",
                                                    save_best_only=True)
    callbacks.append(checkpoint)

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=run_path,
                                                          histogram_freq=1)
    callbacks.append(tensorboard_callback)

    csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(
        run_path, "log.csv"))
    callbacks.append(csv_logger)

    # Train the model
    model.fit_generator(train_generator,
                        epochs=100,
                        validation_data=(val_X, val_y),
                        steps_per_epoch=int(np.ceil(57648 / batch_size)),
                        callbacks=callbacks,
                        verbose=2)

    # Print and save confusion matrix
    print("Confusion matrix on the validation data")
    conf_mat = model_utils.evaluate_model(model, val_X, val_y)
    with open(os.path.join(run_path, "conf_mat.txt"), "w+") as f:
        f.write(str(conf_mat))

    try:
        print(
            "The current process uses the following amount of RAM (in GB) at its peak"
        )
        print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20)
        print("System page size (bytes):", resource.getpagesize())
    except Exception:
        print(
            "Failed to print memory usage. This function is intended to run on a Linux system."
        )
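
# Hedged usage sketch for run_from_dir (paths and option values are placeholders,
# not taken from the source):
#
#   run_from_dir("/data/river/train", "/data/river/val",
#                model_name="vgg16", freeze="first",
#                run_path="/data/river/runs", batch_size=4, dropout=0.2)
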
def run(train_data_folder_path,
        val_data_folder_path,
        model_name="vgg16",
        freeze="all",
        image_augmentation=True,
        context_mode=False,
        run_path="/home/kitkat/PycharmProjects/river-segmentation/runs",
        replace_unknown=True,
        dropout=0):
    """
    Trains a CNN Unet model and saves the best model to file. If using large datasets consider using the run_from_dir
    function instead to decrease RAM usage.

    :param train_data_folder_path: Path to the folder containing training images (.tif format)
    :param val_data_folder_path: Path to the folder containing validation images (.tif format)
    :param model_name: The name of the model. Supported models are: vgg16
    :param freeze: Determine how many blocks in the encoder that are frozen during training.
    Should be all, first, 1, 2, 3, 4, 5 or none
    :param image_augmentation: Determines if image augmentation are used on the training data.
    :param context_mode: Determines if image context are included on the training data. Recommended set to False
    :param run_path: Folder where the run information and model will be saved.
    :param replace_unknown: When True the unknown class in the training date will be replaced using closest neighbor.
    :param dropout: Drop rate, [0.0, 1)
    :return: Writes model to the run folder, nothing is returned.
    """
    tf.keras.backend.clear_session()
    start_time = time.time()

    # Make run name based on parameters and timestamp
    augment = "with" if image_augmentation else "no"
    run_name = f"{model_name}_freeze_{freeze}_{augment}_augment"
    date = str(datetime.datetime.now())
    run_path = os.path.join(run_path, f"{date}_{run_name}".replace(" ", "_"))
    os.makedirs(run_path, exist_ok=True)

    # Load data
    # Training data
    train = model_utils.load_dataset(train_data_folder_path)
    print(f"Loading the training data took {time.time() - start_time} seconds")
    train_X, train_y = model_utils.convert_training_images_to_numpy_arrays(
        train)
    print(
        f"Converting to a numpy array took {time.time() - start_time} seconds")
    del train
    if replace_unknown:
        train_y = model_utils.replace_class(train_y, class_id=5)
    train_X = model_utils.fake_colors(train_X)
    if image_augmentation:
        train_X = model_utils.image_augmentation(train_X)
        train_y = model_utils.image_augmentation(train_y)
    print(
        f"Image augmentation and color faking took {time.time() - start_time} seconds"
    )

    # Validation data
    val = model_utils.load_dataset(val_data_folder_path)
    val_X, val_y = model_utils.convert_training_images_to_numpy_arrays(val)
    del val
    if replace_unknown:
        val_y = model_utils.replace_class(val_y, class_id=5)
    val_X = model_utils.fake_colors(val_X)

    # Load and compile model
    if model_name.lower() == "vgg16":
        model = vgg16_unet(freeze=freeze,
                           context_mode=context_mode,
                           num_classes=5 if replace_unknown else 6,
                           dropout=dropout)
    else:
        raise ValueError(f"Unsupported model name: {model_name}")
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(opt,
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

    # Define callbacks
    callbacks = []
    callbacks.append(
        tf.keras.callbacks.EarlyStopping(patience=10, monitor="val_loss"))

    checkpoint = tf.keras.callbacks.ModelCheckpoint(os.path.join(
        run_path, "model.hdf5"),
                                                    monitor="val_loss",
                                                    save_best_only=True)
    callbacks.append(checkpoint)

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=run_path,
                                                          histogram_freq=1)
    callbacks.append(tensorboard_callback)

    csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(
        run_path, "log.csv"))
    callbacks.append(csv_logger)

    # Train the model
    model.fit(train_X,
              train_y,
              batch_size=4,
              epochs=100,
              validation_data=(val_X, val_y),
              callbacks=callbacks)

    # Print and save confusion matrix
    print("Confusion matrix on the validation data")
    conf_mat = model_utils.evaluate_model(model, val_X, val_y)
    with open(os.path.join(run_path, "conf_mat.txt"), "w+") as f:
        f.write(str(conf_mat))

    try:
        print(
            "The current process uses the following amount of RAM (in GB) at its peak"
        )
        print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20)
        print("System page size (bytes):", resource.getpagesize())
    except Exception:
        print(
            "Failed to print memory usage. This function is intended to run on a Linux system."
        )
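
# Hedged usage sketch for run() (paths and option values are placeholders, not
# taken from the source):
#
#   run("/data/river/train_tif", "/data/river/val_tif",
#       model_name="vgg16", freeze="all",
#       image_augmentation=True, replace_unknown=True, dropout=0.1)
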
Example #9
    'Access to water (minutes)'
]

# In[14]:

wandb.init(project="tm-poverty-prediction")

# ### Random Forest

# In[15]:

predictions = model_utils.evaluate_model(data=dhs,
                                         feature_cols=feature_cols,
                                         indicator_cols=indicators,
                                         scoring=scoring,
                                         model_type='random_forest',
                                         refit='r2',
                                         search_type='random',
                                         n_splits=5,
                                         n_iter=10,
                                         wandb=wandb)

# ### XGBoost

# In[ ]:

predictions = model_utils.evaluate_model(data=dhs,
                                         feature_cols=feature_cols,
                                         indicator_cols=indicators,
                                         scoring=scoring,
                                         model_type='xgboost',
                                         refit='r2',