def evaluate(model):
    features, classes, sample_names, feature_names, class_names = read_data()

    # Now, here we take the other portion of our input data and use
    # that to test the model and ensure it performs well on data it
    # hasn't seen before.
    num_test_samples = int(0.15 * len(features))
    test_features, test_classes = \
        features[-num_test_samples:], classes[-num_test_samples:]

    print("Evaluating test accuracy...")
    evaluate_model(model, test_features, test_classes,
                   sample_names[-num_test_samples:], class_names)
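
# A minimal usage sketch (hypothetical session, left commented out like the
# other examples in this repo): load a model pickled by train_model() below
# and score it on the held-out 15%.
#
#     import pickle
#     with open('model.bin', 'rb') as f:
#         model = pickle.load(f)
#     evaluate(model)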
def evaluate_dataset(model, data_folder_path, intensity_correction=0.0):
    val = model_utils.load_dataset(data_folder_path)
    val_X, val_y = model_utils.convert_training_images_to_numpy_arrays(val)
    # Adjust for differing light levels in training and this dataset
    val_X += intensity_correction / (2**8 - 1)
    val_X = model_utils.fake_colors(val_X)
    val_y = model_utils.replace_class(val_y, class_id=5)
    return model_utils.evaluate_model(model, val_X, val_y, num_classes=5)
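
# Example call (hypothetical path and correction value): pixels are assumed to
# be normalized to [0, 1], so a correction of 10 raw 8-bit levels brightens
# every pixel by 10 / 255 before evaluation.
#
#     conf_mat = evaluate_dataset(model, "/data/river/val",
#                                 intensity_correction=10.0)
#     print(conf_mat)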
def train_model(output_file='model.bin'):
    # First, let's read all of the features that we got from feature_extract.
    # Fun fact: you could do ./feature_extract.py | ./classifier.py to execute
    # both the feature extraction and classification steps at once, without
    # writing the results to JSON first. Very handy for iterating on features.
    features, classes, sample_names, feature_names, class_names = read_data()

    # We'll use this percentage of the data to train, and the rest for testing.
    # Why not just train on all the data? That would result in a model that is
    # overfitted, or overly good at the data that it's seen and does poorly
    # with data that it hasn't seen.
    training_percentage = 0.85
    num_training_samples = int(len(features) * training_percentage)

    # Here we separate all of our features and classes into just the ones
    # we want to train on...
    training_features, training_classes = \
        features[:num_training_samples], classes[:num_training_samples]

    # ...and we do the training, which creates our model!
    # vvv MACHINE LEARNING HAPPENS ON THIS LINE BELOW vvv
    model = DecisionTreeClassifier(random_state=2).fit(training_features,
                                                       training_classes)
    # ^^^ MACHINE LEARNING HAPPENS ON THIS LINE ABOVE ^^^

    with open(output_file, 'wb') as out:
        pickle.dump(model, out)

    # These two lines write out a .pdf file of the model's decision tree.
    # It's useful if you want to explain the model, but requires
    # you to have Graphviz installed, so I've left it commented out.
    # from model_utils import explain_model
    # explain_model(model, feature_names, class_names)

    print("Evaluating training accuracy...")
    evaluate_model(model, training_features, training_classes,
                   sample_names[:num_training_samples], class_names,
                   output=False)
    return model
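
# End-to-end sketch, assuming feature_extract has already written the JSON
# that read_data() consumes: train on the first 85% of samples, then score
# the model on the held-out 15% with evaluate() above.
#
#     model = train_model(output_file='model.bin')
#     evaluate(model)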
# Indicators of interest
indicators = [
    'Wealth Index',
    'Education completed (years)',
    'Access to electricity',
    'Access to water (minutes)'
]

# ## OSM Features + Nighttime Lights

# In[ ]:

predictions = model_utils.evaluate_model(data=dhs,
                                         feature_cols=osm_ntl_cols,
                                         indicator_cols=indicators,
                                         wandb=wandb,
                                         scoring=scoring,
                                         model_type='random_forest',
                                         refit='r2',
                                         search_type='random',
                                         n_splits=5,
                                         n_iter=10,
                                         plot_importance=False,
                                         verbose=2)

# ## OSM Features Only

# In[7]:

predictions = model_utils.evaluate_model(data=dhs,
                                         feature_cols=osm_cols,
                                         indicator_cols=indicators,
                                         scoring=scoring,
                                         wandb=None,
f.colorbar(points)

# ## Machine Learning Pipeline

# ### Using CNN feature embeddings + Regional indicators

# In[31]:

predictions = model_utils.evaluate_model(
    data=data,
    feature_cols=embedding_cols+region_cols,
    indicator_cols=indicators,
    wandb=wandb,
    scoring=scoring,
    model_type='ridge',
    refit='r2',
    search_type='grid',
    n_splits=5,
    n_workers=1
)

# ### Using CNN feature embeddings

# In[33]:

predictions = model_utils.evaluate_model(
    data=data,
    wandb=None,
# model = Scattering2dNet()
#
# x = torch.rand(1, 3, 64, 64)
# x_scatter = scattering(x)
# outp = model(x_scatter)
# print(outp)
# =============================================================================

#two_fc_classifier = TwoFullNet(100)
#two_fc_classifier = TwoConvTwoFullNet()
two_fc_classifier = ScatteringEqualNet_Batch_Good_Cuda_hiden()
#two_fc_classifier = Scattering2dNet()
if MU.use_cuda:
    two_fc_classifier.cuda()

evaluate_model(two_fc_classifier)

# =============================================================================
# #from scatwave.scattering import Scattering
# from kymatio import Scattering2D
# import kymatio.datasets as scattering_datasets
#
# #scat = Scattering2D(M=MU.imgsize[0]+8, N=MU.imgsize[1]+8, J=4, jit=True)
# scat = Scattering2D(J=4, shape=(MU.imgsize[0]+8, MU.imgsize[1]+8), L=8)
# if MU.use_cuda:
#     scat = scat.cuda()
#
# print(scat['psi'])
#
# class ScatteringFullNet(nn.Module):
#     """
#     Implements a trainable model which is the concatenation
def run_from_dir(train_data_folder_path, val_data_folder_path,
                 model_name="vgg16", freeze="all",
                 run_path="/home/kitkat/PycharmProjects/river-segmentation/runs",
                 batch_size=1, dropout=0):
    """
    Trains a CNN Unet model and saves the best model to file.
    Uses training images from disk instead of loading everything into RAM.

    :param train_data_folder_path: Path to the folder containing training images (.png format)
    :param val_data_folder_path: Path to the folder containing validation images (.png format)
    :param model_name: The name of the model. Supported models are: vgg16
    :param freeze: Determines how many blocks in the encoder are frozen during training.
        Should be all, first, 1, 2, 3, 4, 5 or none
    :param run_path: Folder where the run information and model will be saved.
    :param batch_size: Number of images per training batch.
    :param dropout: Drop rate, [0.0, 1)
    :return: Writes model to the run folder, nothing is returned.
    """
    tf.keras.backend.clear_session()
    start_time = time.time()

    # Make run name based on parameters and timestamp
    run_name = f"{model_name}_freeze_{freeze}"
    date = str(datetime.datetime.now())
    run_path = os.path.join(run_path, f"{date}_{run_name}".replace(" ", "_"))
    os.makedirs(run_path, exist_ok=True)

    # Setup data generators
    image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=lambda x: x / (2**8 - 1))
    mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator()
    image_generator = image_datagen.flow_from_directory(
        os.path.join(train_data_folder_path, "images"), class_mode=None,
        target_size=(512, 512), seed=1, batch_size=batch_size)
    mask_generator = mask_datagen.flow_from_directory(
        os.path.join(train_data_folder_path, "labels"), class_mode=None,
        target_size=(512, 512), seed=1, batch_size=batch_size,
        color_mode="grayscale")
    train_generator = (pair for pair in zip(image_generator, mask_generator))

    # Validation data
    val = model_utils.load_dataset(val_data_folder_path)
    val_X, val_y = model_utils.convert_training_images_to_numpy_arrays(val)
    val_X = model_utils.fake_colors(val_X)
    val_y = model_utils.replace_class(val_y, class_id=5)

    # Load and compile model
    if model_name.lower() == "vgg16":
        model = vgg16_unet(freeze=freeze, context_mode=False, num_classes=5,
                           dropout=dropout)
    else:
        raise ValueError(f"Model name {model_name} is not supported.")
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(opt, loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

    # Define callbacks
    callbacks = []
    callbacks.append(
        tf.keras.callbacks.EarlyStopping(patience=10, monitor="val_loss"))
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(run_path, "model.hdf5"), monitor="val_loss",
        save_best_only=True)
    callbacks.append(checkpoint)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=run_path,
                                                          histogram_freq=1)
    callbacks.append(tensorboard_callback)
    csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(run_path, "log.csv"))
    callbacks.append(csv_logger)

    # Train the model
    model.fit_generator(train_generator, epochs=100,
                        validation_data=(val_X, val_y),
                        steps_per_epoch=int(np.ceil(57648 / batch_size)),
                        callbacks=callbacks, verbose=2)

    # Print and save confusion matrix
    print("Confusion matrix on the validation data")
    conf_mat = model_utils.evaluate_model(model, val_X, val_y)
    with open(os.path.join(run_path, "conf_mat.txt"), "w+") as f:
        f.write(str(conf_mat))

    try:
        print("The current process uses the following amount of RAM (in GB) at its peak")
        print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20)
        print(resource.getpagesize())
    except Exception:
        print("Failed to print memory usage. This function was intended to run on a Linux system.")
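
# Hypothetical invocation (the paths are placeholders, left commented so the
# module stays import-safe): trains a VGG16-encoder Unet with a fully frozen
# encoder, streaming batches of 4 images from disk.
#
#     run_from_dir("/data/river/train", "/data/river/val",
#                  model_name="vgg16", freeze="all", batch_size=4, dropout=0.2)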
def run(train_data_folder_path, val_data_folder_path, model_name="vgg16",
        freeze="all", image_augmentation=True, context_mode=False,
        run_path="/home/kitkat/PycharmProjects/river-segmentation/runs",
        replace_unknown=True, dropout=0):
    """
    Trains a CNN Unet model and saves the best model to file.
    If using large datasets consider using the run_from_dir function instead
    to decrease RAM usage.

    :param train_data_folder_path: Path to the folder containing training images (.tif format)
    :param val_data_folder_path: Path to the folder containing validation images (.tif format)
    :param model_name: The name of the model. Supported models are: vgg16
    :param freeze: Determines how many blocks in the encoder are frozen during training.
        Should be all, first, 1, 2, 3, 4, 5 or none
    :param image_augmentation: Determines if image augmentation is used on the training data.
    :param context_mode: Determines if image context is included in the training data.
        Recommended set to False
    :param run_path: Folder where the run information and model will be saved.
    :param replace_unknown: When True the unknown class in the training data
        will be replaced using the closest neighbor.
    :param dropout: Drop rate, [0.0, 1)
    :return: Writes model to the run folder, nothing is returned.
    """
    tf.keras.backend.clear_session()
    start_time = time.time()

    # Make run name based on parameters and timestamp
    augment = "with" if image_augmentation else "no"
    run_name = f"{model_name}_freeze_{freeze}_{augment}_augment"
    date = str(datetime.datetime.now())
    run_path = os.path.join(run_path, f"{date}_{run_name}".replace(" ", "_"))
    os.makedirs(run_path, exist_ok=True)

    # Load data
    # Training data
    train = model_utils.load_dataset(train_data_folder_path)
    print(f"Loading the training data took {time.time() - start_time} seconds")
    train_X, train_y = model_utils.convert_training_images_to_numpy_arrays(train)
    print(f"Converting to a numpy array took {time.time() - start_time} seconds")
    del train
    if replace_unknown:
        train_y = model_utils.replace_class(train_y, class_id=5)
    train_X = model_utils.fake_colors(train_X)
    if image_augmentation:
        train_X = model_utils.image_augmentation(train_X)
        train_y = model_utils.image_augmentation(train_y)
    print(f"Image augmentation and color faking took {time.time() - start_time} seconds")

    # Validation data
    val = model_utils.load_dataset(val_data_folder_path)
    val_X, val_y = model_utils.convert_training_images_to_numpy_arrays(val)
    del val
    if replace_unknown:
        val_y = model_utils.replace_class(val_y, class_id=5)
    val_X = model_utils.fake_colors(val_X)

    # Load and compile model
    if model_name.lower() == "vgg16":
        model = vgg16_unet(freeze=freeze, context_mode=context_mode,
                           num_classes=5 if replace_unknown else 6,
                           dropout=dropout)
    else:
        raise ValueError(f"Model name {model_name} is not supported.")
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(opt, loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

    # Define callbacks
    callbacks = []
    callbacks.append(
        tf.keras.callbacks.EarlyStopping(patience=10, monitor="val_loss"))
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(run_path, "model.hdf5"), monitor="val_loss",
        save_best_only=True)
    callbacks.append(checkpoint)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=run_path,
                                                          histogram_freq=1)
    callbacks.append(tensorboard_callback)
    csv_logger = tf.keras.callbacks.CSVLogger(os.path.join(run_path, "log.csv"))
    callbacks.append(csv_logger)

    # Train the model
    model.fit(train_X, train_y, batch_size=4, epochs=100,
              validation_data=(val_X, val_y), callbacks=callbacks)

    # Print and save confusion matrix
    print("Confusion matrix on the validation data")
    conf_mat = model_utils.evaluate_model(model, val_X, val_y)
    with open(os.path.join(run_path, "conf_mat.txt"), "w+") as f:
        f.write(str(conf_mat))

    try:
        print("The current process uses the following amount of RAM (in GB) at its peak")
        print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20)
        print(resource.getpagesize())
    except Exception:
        print("Failed to print memory usage. This function was intended to run on a Linux system.")
    'Access to water (minutes)'
]

# In[14]:

wandb.init(project="tm-poverty-prediction")

# ### Random Forest

# In[15]:

predictions = model_utils.evaluate_model(data=dhs,
                                         feature_cols=feature_cols,
                                         indicator_cols=indicators,
                                         scoring=scoring,
                                         model_type='random_forest',
                                         refit='r2',
                                         search_type='random',
                                         n_splits=5,
                                         n_iter=10,
                                         wandb=wandb)

# ### XGBoost

# In[ ]:

predictions = model_utils.evaluate_model(data=dhs,
                                         feature_cols=feature_cols,
                                         indicator_cols=indicators,
                                         scoring=scoring,
                                         model_type='xgboost',
                                         refit='r2',
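
# The `scoring` argument passed above is defined in an earlier cell. Given
# `refit='r2'`, a plausible sklearn-style definition (an assumption, not the
# notebook's actual code) would map metric names to sklearn scorer strings:
#
#     scoring = {
#         'r2': 'r2',
#         'rmse': 'neg_root_mean_squared_error',
#     }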