def test_generate_tfrecords(train, tmpdir):
    """Generate HSI + RGB tfrecords and check the shapes of the batched tensors.

    With ``train`` truthy the records are read in "ensemble" mode and each
    batch is paired with a (2, 6) label batch; otherwise they are read in
    "predict" mode and each batch is paired with a (2,) box index batch.
    """
    created_records = boxes.generate_tfrecords(
        shapefile=test_predictions,
        site=1,
        elevation=100,
        savedir=tmpdir,
        train=train,
        HSI_sensor_path=test_sensor_tile,
        RGB_sensor_path=test_sensor_tile,
        species_label_dict=None,
        RGB_size=100,
        HSI_size=20,
        classes=6)

    # Every path reported by the generator must exist on disk
    assert all(os.path.exists(record) for record in created_records)

    read_mode = "ensemble" if train else "predict"
    dataset = boxes.tf_dataset(created_records, batch_size=2, mode=read_mode)

    # Second element of each batch: labels when training, box indices otherwise
    expected_target_shape = (2, 6) if train else (2,)
    for (HSI, RGB), target_batch in dataset.take(3):
        assert HSI.shape == (2, 20, 20, 3)
        assert RGB.shape == (2, 100, 100, 3)
        assert target_batch.shape == expected_target_shape
def test_generate_tfrecords(train, tmpdir):
    """Generate single-sensor tfrecords and check the shapes of the batches.

    With ``train`` truthy the records are read in "train" mode and yield
    (image, label) batches; otherwise they are read in "predict" mode and
    yield (image, box_index) batches.
    """
    created_records = boxes.generate_tfrecords(
        shapefile=test_predictions,
        savedir=tmpdir,
        train=train,
        sensor_path=test_sensor_tile,
        height=20,
        width=20,
        classes=6)

    # Every path reported by the generator must exist on disk
    assert all(os.path.exists(record) for record in created_records)

    read_mode = "train" if train else "predict"
    dataset = boxes.tf_dataset(created_records, batch_size=2, mode=read_mode)

    # Second element of each batch: labels when training, box indices otherwise
    expected_target_shape = (2, 6) if train else (2,)
    for image_batch, target_batch in dataset.take(3):
        assert image_batch.shape == (2, 20, 20, 3)
        assert target_batch.shape == expected_target_shape
def read_data(self, mode="train", validation_split=False):
    """Read tfrecords into tf.data datasets according to the config.

    Sets ``self.train_records`` and ``self.train_split``, and sets
    ``self.val_split`` either to a dataset or to None.

    Args:
        mode: passed through unchanged to ``boxes.tf_dataset``.
        validation_split: True -> split the training tfrecords into a
            train part and a test part. This overrides the evaluation config!

    Raises:
        IOError: if no ``*.tfrecord`` file is found in the configured
            training directory.
    """
    train_dir = self.config["train"]["tfrecords"]

    # glob returns paths in arbitrary order; sort them so that the head-based
    # split below selects the same files on every run.
    self.train_records = sorted(
        glob.glob(os.path.join(train_dir, "*.tfrecord")))

    if not self.train_records:
        raise IOError("Cannot find .tfrecords at {}".format(train_dir))

    def _build(records):
        """Create a tf.data pipeline over `records` using the train settings."""
        return boxes.tf_dataset(
            tfrecords=records,
            batch_size=self.config["train"]["batch_size"],
            shuffle=self.config["train"]["shuffle"],
            mode=mode,
            cores=self.config["cpu_workers"])

    if validation_split:
        print("Splitting training set into train-test")
        train_df = pd.Series(self.train_records)

        # The first `training_fraction` of the (sorted) records is used for
        # training, the remaining records for validation.
        n_train = int(self.config["train"]["training_fraction"] *
                      train_df.shape[0])
        self.train_split_records = train_df.head(n_train).values
        self.test_split_records = train_df[~(
            train_df.isin(self.train_split_records))].values

        # Create training tf.data
        self.train_split = _build(self.train_split_records)

        # Create testing tf.data
        self.val_split = _build(self.test_split_records)
    else:
        # Create training tf.data
        self.train_split = _build(self.train_records)

        # Honor the evaluation config if validation_split is not set
        self.val_split = None
        if self.config["evaluation"]["tfrecords"] is not None:
            self.test_records = glob.glob(
                os.path.join(self.config["evaluation"]["tfrecords"],
                             "*.tfrecord"))
            self.val_split = _build(self.test_records)
def test_generate_records(tmpdir, ensemble_model):
    """Check that the generated tfrecords hold one sample per shapefile row."""
    generator_args = dict(
        shapefile=test_predictions,
        domain=1,
        site=1,
        elevation=100.0,
        savedir=tmpdir,
        HSI_sensor_path=test_hsi_tile,
        RGB_sensor_path=test_sensor_tile,
        species_label_dict=None,
        RGB_size=100,
        HSI_size=10,
        classes=6,
        number_of_sites=10,
        number_of_domains=10,
        ensemble_model=None)
    created_records = boxes.generate_tfrecords(**generator_args)

    expected_samples = gpd.read_file(test_predictions).shape[0]

    # batch_size=1, so the number of batches equals the number of samples
    dataset = boxes.tf_dataset(created_records, batch_size=1)
    n_batches = sum(1 for _ in dataset)

    assert n_batches == expected_samples
def test_main():
    """Run prepare_field_data.main and read the records back one sample at a time.

    The dataset is consumed through a one-shot iterator inside a
    ``tensorflow.Session``; the number of samples read must match the number
    of rows in the field data file.
    """
    created_records = prepare_field_data.main(
        field_data=data_path,
        hyperspectral_dir=hyperspectral_dir,
        RGB_size=height,
        HSI_size=width,
        rgb_dir=rgb_dir,
        hyperspectral_savedir=hyperspectral_savedir,
        extend_box=0.5)

    dataset = boxes.tf_dataset(created_records, batch_size=1, mode="RGB_train")
    next_element = dataset.make_one_shot_iterator().get_next()

    labels = []
    counter = 0
    with tensorflow.Session() as sess:
        while True:
            # The iterator signals exhaustion by raising OutOfRangeError
            try:
                data, label = sess.run(next_element)
            except tensorflow.errors.OutOfRangeError:
                break

            assert data.shape == (1, height, width, 3)
            assert label.shape == (1, 3)

            plt.imshow(data[0].astype("uint8"))
            labels.append(label)
            counter += 1

    input_data = gpd.read_file(data_path)
    assert counter == input_data.shape[0]
def test_metadata(created_records):
    """Check the shapes of the tensors yielded in "metadata" mode."""
    dataset = boxes.tf_dataset(created_records, batch_size=2, mode="metadata")

    # Each batch is ((elevation, site, domain), labels)
    for (elevation, site, domain), label_batch in dataset.take(1):
        assert elevation.numpy().shape == (2,)
        assert site.numpy().shape == (2, 10)
        assert domain.numpy().shape == (2, 16)
def test_main():
    """Run prepare_field_data.main for the rgb sensor and validate the records.

    The records are read back one sample at a time through a one-shot
    iterator inside a ``tensorflow.Session``. The test expects exactly three
    samples and at least one label whose argmax is non-zero.
    """
    created_records = prepare_field_data.main(
        field_data=data_path,
        hyperspectral_pool=hyperspec_pool,
        height=height,
        width=width,
        rgb_pool=rgb_pool,
        sensor="rgb",
        hyperspectral_savedir=hyperspectral_savedir,
        use_dask=False,
        extend_box=3)

    dataset = boxes.tf_dataset(created_records, batch_size=1)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    labels = []
    counter = 0
    with tensorflow.Session() as sess:
        while True:
            # Only the end-of-data signal terminates the loop. Catching a
            # broad Exception here would also swallow the AssertionErrors
            # raised by the shape checks below and hide the real failure.
            try:
                image, label = sess.run(next_element)
            except tensorflow.errors.OutOfRangeError:
                break

            assert image.shape == (1, height, width, 3)
            assert label.shape == (1, 2)

            plt.imshow(image[0].astype("uint8"))
            labels.append(label)
            counter += 1

    assert counter == 3

    # At least one sample must belong to a class other than class 0
    assert max(np.argmax(x) for x in labels) > 0
def predict_raster(self, tfrecords, batch_size=1):
    """Predict a set of tfrecords and return one label per (row, col) position.

    Args:
        tfrecords: records passed to ``boxes.tf_dataset`` in "predict" mode.
        batch_size: batch size used when reading the records.

    Returns:
        DataFrame with columns "label" (argmax of the model output), "row"
        and "col", sorted by row and then by column.
    """
    prediction_set = boxes.tf_dataset(tfrecords=tfrecords,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      mode="predict",
                                      cores=self.config["cpu_workers"])

    softmax_batches, rows, cols = [], [], []
    for image, x, y in prediction_set:
        try:
            softmax_batch = self.model.predict_on_batch(image)
            rows.append(x.numpy())
            cols.append(y.numpy())
            softmax_batches.append(softmax_batch)
        except tf.errors.OutOfRangeError:
            print("Completed {} predictions".format(len(softmax_batches)))

    # Stack the per-batch outputs and keep the most likely class per sample
    predicted_class = np.argmax(np.vstack(softmax_batches), 1)

    results = pd.DataFrame({
        "label": predicted_class,
        "row": np.concatenate(rows),
        "col": np.concatenate(cols)
    })

    return results.sort_values(by=["row", "col"])
def predict_boxes(self, tfrecords, batch_size=1):
    """Predict a set of tfrecords and return one taxon label per box.

    Args:
        tfrecords: records passed to ``boxes.tf_dataset`` in "predict" mode.
        batch_size: batch size used when reading the records.

    Returns:
        DataFrame with columns "label" (the "taxonID" looked up in
        ``self.classes_file`` for the argmax class) and "box_index"
        (decoded to str).
    """
    prediction_set = boxes.tf_dataset(tfrecords=tfrecords,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      mode="predict",
                                      cores=self.config["cpu_workers"])

    softmax_batches = []
    index_batches = []
    for image, box_index in prediction_set:
        try:
            softmax_batch = self.model.predict_on_batch(image)
            softmax_batches.append(softmax_batch)
            index_batches.append(box_index)
        except tf.errors.OutOfRangeError:
            print("Completed {} predictions".format(len(softmax_batches)))

    # Stack the per-batch outputs and keep the most likely class per box
    predicted_class = np.argmax(np.vstack(softmax_batches), 1)
    indices = np.concatenate(index_batches)

    # Translate class positions into taxon labels
    class_table = self.classes_file
    labels = []
    for class_position in predicted_class:
        matches = class_table.loc[class_table.index == class_position,
                                  "taxonID"]
        labels.append(matches.values[0])

    results = pd.DataFrame({"label": labels, "box_index": indices})

    # The box indices come back as bytes; decode them to str
    decoded = results["box_index"].apply(lambda raw: raw.decode())
    results["box_index"] = decoded.astype(str)

    return results
def test_ensemble(created_records):
    """Check the shapes of the tensors yielded in "ensemble" mode."""
    dataset = boxes.tf_dataset(created_records, batch_size=2, mode="ensemble")

    # Each batch is ((HSI, elevation, site, domain), labels)
    for (HSI, elevation, site, domain), label_batch in dataset.take(1):
        assert HSI.shape == (2, 40, 40, 369)
        assert elevation.numpy().shape == (2,)
        assert site.numpy().shape == (2, 10)
        assert domain.numpy().shape == (2, 16)
def test_RGB_submodel(created_records):
    """Check the batch layout yielded in "RGB_submodel" mode.

    The label element must have length 3 and its first entry must have
    shape (2, 6).
    """
    dataset = boxes.tf_dataset(created_records,
                               batch_size=2,
                               mode="RGB_submodel")

    for data, label in dataset.take(1):
        assert data.shape == (2, 100, 100, 3)
        assert len(label) == 3
        assert label[0].shape == (2, 6)
def test_neighbor(created_records):
    """Check the shapes of the tensors yielded in "neighbors" mode."""
    dataset = boxes.tf_dataset(created_records, batch_size=2, mode="neighbors")

    # Each batch is ((HSI, neighbor_array, elevation, site, domain), labels)
    for inputs, label_batch in dataset.take(1):
        HSI, neighbor_array, elevation, site, domain = inputs

        assert HSI.shape == (2, 20, 20, 369)
        assert neighbor_array.shape == (2, 5, 4)
        assert elevation.numpy().shape == (2,)
        assert site.numpy().shape == (2, 10)
        assert domain.numpy().shape == (2, 10)
def test_id_train(created_records):
    """Check that ids yielded with ``ids=True`` are index values of the shapefile."""
    shp = gpd.read_file(test_predictions)
    known_ids = shp.index.values

    dataset = boxes.tf_dataset(created_records,
                               batch_size=2,
                               ids=True,
                               mode="RGB")

    # With ids=True each element is (ids, (data, label))
    for ids, (data, label) in dataset.take(1):
        batch_ids = ids.numpy()
        assert batch_ids.shape == (2,)
        assert all(sample_id in known_ids for sample_id in batch_ids)
def test_id_train(created_records):
    """Check that ids yielded with ``ids=True`` match "<basename>_<index>" box indices.

    The expected ids are built from the shapefile basename (without
    extension) and each row index of the shapefile.
    """
    shp = gpd.read_file(test_predictions)

    # Expected box_index values: "<shapefile basename>_<row index>"
    basename = os.path.splitext(os.path.basename(test_predictions))[0]
    shp["box_index"] = [
        "{}_{}".format(basename, row_index) for row_index in shp.index.values
    ]

    dataset = boxes.tf_dataset(created_records,
                               batch_size=2,
                               ids=True,
                               mode="RGB")

    # With ids=True each element is (ids, (data, label))
    for ids, (data, label) in dataset.take(1):
        batch_ids = ids.numpy()
        assert batch_ids.shape == (2,)
        assert all(sample_id in shp.box_index.values for sample_id in batch_ids)
def test_generate_tfrecords(train, created_records):
    """Check that the created records exist and yield batches of the expected shape.

    With ``train`` truthy the dataset yields (data, label) batches; otherwise
    it is built with ``ids=True`` and yields (ids, (data, label)).
    """
    assert all(os.path.exists(x) for x in created_records)

    if train:
        dataset = boxes.tf_dataset(created_records, mode="RGB", batch_size=2)

        # Yield a batch of data and confirm its shape
        for data, label in dataset.take(1):
            assert data.shape == (2, 100, 100, 3)
            assert label.shape == (2, 6)
    else:
        dataset = boxes.tf_dataset(created_records,
                                   mode="RGB",
                                   batch_size=2,
                                   ids=True)

        for ids, (data, label) in dataset.take(3):
            assert data.shape == (2, 100, 100, 3)
            # `(2)` is just the integer 2; compare against a real 1-tuple
            assert ids.shape == (2,)
def test_generate_tfrecords(train, tmpdir):
    """Generate tfrecords with heights and site metadata and check batch shapes.

    With ``train`` truthy the records are read in "ensemble" mode, otherwise
    in "predict" mode.
    """
    shp = gpd.read_file(test_predictions)

    created_records = boxes.generate_tfrecords(
        shapefile=test_predictions,
        site=1,
        elevation=100,
        # One random height per shapefile row
        heights=np.random.random(shp.shape[0]) * 10,
        savedir=tmpdir,
        train=train,
        HSI_sensor_path=test_sensor_tile,
        RGB_sensor_path=test_sensor_tile,
        species_label_dict=None,
        RGB_size=100,
        HSI_size=20,
        classes=6,
        number_of_sites=10)

    assert all(os.path.exists(x) for x in created_records)

    if train:
        dataset = boxes.tf_dataset(created_records,
                                   batch_size=2,
                                   mode="ensemble")

        # Yield a batch of data and confirm its shape
        for inputs, label_batch in dataset.take(3):
            HSI, RGB, elevation, height, site = inputs
            assert HSI.shape == (2, 20, 20, 3)
            assert RGB.shape == (2, 100, 100, 3)
            # `(2)` is just the integer 2; compare against a real 1-tuple
            assert elevation.shape == (2,)
            assert site.shape == (2, 10)
            assert height.shape == (2,)
            assert label_batch.shape == (2, 6)
    else:
        dataset = boxes.tf_dataset(created_records,
                                   batch_size=2,
                                   mode="predict")

        for (HSI, RGB), box_index_batch in dataset.take(3):
            assert HSI.shape == (2, 20, 20, 3)
            assert RGB.shape == (2, 100, 100, 3)
            assert box_index_batch.shape == (2,)
def test_generate(mod):
    """Check that ``mod.generate`` writes one sample per shapefile row."""
    expected_samples = gpd.read_file(test_predictions).shape[0]

    created_records = mod.generate(
        shapefile=test_predictions,
        domain=1,
        site=0,
        elevation=100,
        HSI_sensor_path=test_sensor_hyperspec,
        RGB_sensor_path=test_sensor_tile,
        train=True,
        chunk_size=2,
        savedir=mod.config["train"]["tfrecords"])

    # Every path reported by the generator must exist on disk
    assert all(os.path.exists(record) for record in created_records)

    # batch_size=1, so the number of batches equals the number of samples
    dataset = boxes.tf_dataset(created_records, batch_size=1)
    n_batches = sum(1 for _ in dataset)

    assert n_batches == expected_samples
def test_metadata(tmpdir):
    """Generate tfrecords and check the tensors yielded in "metadata" mode."""
    shp = gpd.read_file(test_predictions)

    generator_args = dict(
        shapefile=test_predictions,
        site=1,
        # One random height per shapefile row
        heights=np.random.random(shp.shape[0]) * 10,
        elevation=100,
        savedir=tmpdir,
        HSI_sensor_path=test_sensor_tile,
        RGB_sensor_path=test_sensor_tile,
        species_label_dict=None,
        RGB_size=100,
        HSI_size=20,
        classes=6,
        number_of_sites=10)
    created_records = boxes.generate_tfrecords(**generator_args)

    dataset = boxes.tf_dataset(created_records, batch_size=2, mode="metadata")

    # Each batch is ((elevation, height, site), labels)
    for (elevation, height, site), label_batch in dataset.take(3):
        assert elevation.numpy().shape == (2,)
        assert site.numpy().shape == (2, 10)
def find_outliers(self):
    """Fit the autoencoder and return the samples with high reconstruction error.

    The outlier threshold is the configured quantile
    (``config["autoencoder"]["quantile"]``) of the per-sample errors on the
    training data; the same threshold is applied to the test data.

    Returns:
        (train_error_df, test_error_df): rows of ``self.train_shp`` and
        ``self.test_shp`` (merged with the errors on "point_id") whose error
        is above the threshold.
    """

    def _reconstruction_errors(dataset):
        """Return a DataFrame of per-sample MSE ("error") keyed by "point_id"."""
        mse = tf.keras.losses.MeanSquaredError()
        errors = []
        point_ids = []
        for index, batch in dataset:
            data, _ = batch
            prediction = self.autoencoder_model.predict(data)
            # Convert the id tensor once per batch instead of once per sample
            batch_ids = index.numpy()
            for i in range(prediction.shape[0]):
                error = mse(prediction[i, :, :, :], data[i, :, :, :])
                errors.append(error.numpy())
                point_ids.append(batch_ids[i])
        return pd.DataFrame({"error": errors, "point_id": point_ids})

    self.autoencoder_model = cleaning.autoencoder_model(
        height=self.HSI_size,
        width=self.HSI_size,
        channels=self.HSI_channels)

    self.autoencoder_model.fit(
        self.train_split,
        batch_size=self.config["train"]["batch_size"],
        epochs=self.config["autoencoder"]["epochs"],
        validation_data=self.val_split)

    ## training data ##
    # Unshuffled, un-augmented pass that also yields the sample ids
    self.train_split_with_ids = boxes.tf_dataset(
        tfrecords=self.train_records,
        batch_size=self.config["train"]["batch_size"],
        shuffle=False,
        mode="HSI_autoencoder",
        ids=True,
        cache=False,
        augmentation=False,
        cores=self.config["cpu_workers"])

    results = _reconstruction_errors(self.train_split_with_ids)

    # Merge the errors onto the original training data
    joined_gdf = self.train_shp.merge(results, on="point_id")

    # Outlier threshold, taken from the training errors
    threshold = joined_gdf.error.quantile(
        self.config["autoencoder"]["quantile"])
    train_error_df = joined_gdf[joined_gdf.error > threshold]

    ## repeat for test data ##
    # NOTE(review): self.val_split_with_ids is not assigned anywhere in this
    # method - confirm that it is created elsewhere before this is called.
    results = _reconstruction_errors(self.val_split_with_ids)

    # Merge the errors onto the original test data
    joined_gdf = self.test_shp.merge(results, on="point_id")

    # Same threshold as for the training data
    test_error_df = joined_gdf[joined_gdf.error > threshold]

    return train_error_df, test_error_df
import glob

from DeepTreeAttention.generators import boxes

# Count the samples stored in the training tfrecords
created_records = glob.glob(
    "/orange/idtrees-collab/DeepTreeAttention/tfrecords/train/*.tfrecord")
dataset = boxes.tf_dataset(created_records, batch_size=100)

# The first dimension of each image batch is the number of samples in it
counter = sum(image.shape[0] for image, label in dataset)
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from DeepTreeAttention.generators import boxes

# metadata
created_records = glob.glob(
    "/orange/idtrees-collab/DeepTreeAttention/tfrecords/evaluation/*.tfrecord")
dataset = boxes.tf_dataset(created_records, mode="metadata", batch_size=10)

counter = 0
labels = []

# The loop variable used to be called `data`, which silently replaced an
# unused `data` list defined above it; the dead list is gone and the batch
# has its own name.
for metadata, label in dataset:
    # The first metadata tensor gives the number of samples in the batch
    counter += metadata[0].shape[0]
    labels.append(label)

print(counter)

# Collapse the stacked label rows to class positions and count each class
labels = np.concatenate(labels)
labels = np.argmax(labels, 1)
pd.DataFrame({"label": labels}).groupby("label").size()
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from DeepTreeAttention.generators import boxes

TFRECORD_ROOT = "/orange/idtrees-collab/DeepTreeAttention/tfrecords"

# --- metadata pass over the evaluation records ---
created_records = glob.glob(TFRECORD_ROOT + "/evaluation/*.tfrecord")
dataset = boxes.tf_dataset(created_records, mode="metadata", batch_size=256)

counter = 0
labels, data = [], []
for metadata, label in dataset:
    # NOTE(review): other users of "metadata" mode unpack this element as a
    # tuple of tensors; confirm that `.shape` is valid here.
    counter += metadata.shape[0]
    print(counter)
    labels.append(label)
    data.append(metadata)

# --- ensemble pass over the training records ---
# The accumulators are reset, so only the results of this pass are kept
created_records = glob.glob(TFRECORD_ROOT + "/train/*.tfrecord")
dataset = boxes.tf_dataset(created_records, mode="ensemble", batch_size=256)

counter = 0
labels, data = [], []
for (HSI, RGB, elevation, height, sites), label in dataset:
    counter += RGB.shape[0]
    labels.append(label)
    data.append(RGB)

labels = np.vstack(labels)