def read_data(self, mode="train", validation_split=False):
    """Load tfrecord datasets described by ``self.config``.

    Args:
        mode: dataset mode forwarded to ``make_dataset.tf_dataset``
            (e.g. "train").
        validation_split: if True, hold out the last 10% of the training
            tfrecords as a validation set. This overrides the evaluation
            config!

    Side effects:
        Sets ``self.train_records``, ``self.train_split`` and
        ``self.val_split`` (plus ``self.train_split_records`` /
        ``self.test_split_records`` / ``self.test_records`` depending on
        the branch taken).
    """

    def _build(tfrecords):
        # Every split shares batch size / shuffle / worker settings from
        # the "train" section of the config.
        return make_dataset.tf_dataset(
            tfrecords=tfrecords,
            batch_size=self.config["train"]["batch_size"],
            shuffle=self.config["train"]["shuffle"],
            mode=mode,
            cores=self.config["cpu_workers"])

    self.train_records = glob.glob(
        os.path.join(self.config["train"]["tfrecords"], "*.tfrecord"))

    if validation_split:
        print("Splitting training set into train-test")
        train_df = pd.Series(self.train_records)
        # NOTE(review): despite the original comment about seeded
        # sampling, this is a deterministic head/tail split — first 90%
        # of the glob order becomes train, the remainder becomes test.
        self.train_split_records = train_df.head(
            int(0.9 * train_df.shape[0])).values
        self.test_split_records = train_df[~(
            train_df.isin(self.train_split_records))].values

        self.train_split = _build(self.train_split_records)
        self.val_split = _build(self.test_split_records)
    else:
        self.train_split = _build(self.train_records)

        # Honor config if validation not set
        self.val_split = None
        if self.config["evaluation"]["tfrecords"] is not None:
            self.test_records = glob.glob(
                os.path.join(self.config["evaluation"]["tfrecords"],
                             "*.tfrecord"))
            self.val_split = _build(self.test_records)
def predict_raster(self, tfrecords, batch_size=1):
    """Predict a set of tfrecords and return per-pixel class labels.

    Args:
        tfrecords: tfrecord paths; read in mode="predict", where each
            batch yields (image, row, col).
        batch_size: prediction batch size.

    Returns:
        pandas.DataFrame with columns "label" (argmax class index),
        "row" and "col", sorted by row then col.
    """
    prediction_set = make_dataset.tf_dataset(
        tfrecords=tfrecords,
        batch_size=batch_size,
        shuffle=False,
        mode="predict",
        cores=self.config["cpu_workers"])

    predictions = []
    row_list = []
    col_list = []
    # BUGFIX: the OutOfRangeError handler used to sit inside the loop
    # body, so an exhausted iterator was "handled" and iteration simply
    # continued, and the completion message only printed if the
    # exception fired. Hoist the try around the whole loop; in eager
    # mode the for loop terminates normally anyway.
    try:
        for image, x, y in prediction_set:
            softmax_batch = self.model.predict_on_batch(image)
            row_list.append(x.numpy())
            col_list.append(y.numpy())
            predictions.append(softmax_batch)
    except tf.errors.OutOfRangeError:
        pass
    print("Completed {} predictions".format(len(predictions)))

    # Stack batch results and collapse softmax scores to class indices
    predictions = np.vstack(predictions)
    row_list = np.concatenate(row_list)
    col_list = np.concatenate(col_list)
    predictions = np.argmax(predictions, 1)

    results = pd.DataFrame({
        "label": predictions,
        "row": row_list,
        "col": col_list
    })
    results = results.sort_values(by=["row", "col"])

    return results
def predict_boxes(self, tfrecords, batch_size=1, majority_vote=True):
    """Predict a set of tfrecords of crops and label each box.

    Args:
        tfrecords: tfrecord paths; read in mode="box", where each batch
            yields (image, box_index).
        batch_size: prediction batch size.
        majority_vote: if True, aggregate per-crop labels into one
            modal label per box index.

    Returns:
        pandas.DataFrame — either one row per crop ("label",
        "box_index") or, with majority_vote, the modal label indexed by
        box_index.
    """
    # BUGFIX: was a bare `tf_dataset(...)` call, inconsistent with the
    # other methods (and a NameError unless separately imported) — use
    # make_dataset.tf_dataset like read_data/predict_raster.
    prediction_set = make_dataset.tf_dataset(
        tfrecords=tfrecords,
        batch_size=batch_size,
        shuffle=False,
        mode="box",
        cores=self.config["cpu_workers"])

    predictions = []
    indices = []
    # BUGFIX: hoist the OutOfRangeError handler out of the loop body —
    # inside the loop it swallowed end-of-data and kept iterating, and
    # the completion message only printed when the exception fired.
    try:
        for image, box_index in prediction_set:
            softmax_batch = self.model.predict_on_batch(image)
            predictions.append(softmax_batch)
            indices.append(box_index)
    except tf.errors.OutOfRangeError:
        pass
    print("Completed {} predictions".format(len(predictions)))

    # Stack batches, collapse softmax to class indices, map to names
    predictions = np.vstack(predictions)
    predictions = np.argmax(predictions, 1)
    indices = np.concatenate(indices)
    labels = [self.config["class_labels"][x] for x in predictions]
    results = pd.DataFrame({"label": labels, "box_index": indices})

    if majority_vote:
        # BUGFIX: groupby(...).agg(pd.Series.mode) on the frame already
        # returns a DataFrame, and DataFrame has no .to_frame() —
        # aggregate the "label" Series instead, then promote to a frame.
        majority_results = (results.groupby("box_index")["label"]
                            .agg(pd.Series.mode).to_frame())
        return majority_results
    else:
        return results
def test_tf_dataset_train(train_tfrecords, ground_truth_raster):
    """One training epoch covers exactly the non-zero raster pixels.

    Checks batch shape/content on a 2-batch sample, then iterates a full
    epoch and compares pixel counts and label frequencies against the
    ground-truth raster.
    """
    print(train_tfrecords)
    # Tensorflow encodes string as b bytes
    dataset = make_dataset.tf_dataset(train_tfrecords,
                                      batch_size=6,
                                      shuffle=False)
    # CLEANUP: removed a dead `counter = 0` here (never used in this
    # loop) and the unused `label` loop variable.
    for data, _ in dataset.take(2):  # turn off repeat
        numpy_data = data.numpy()
        assert numpy_data.shape[0] == 6
        assert not numpy_data.sum() == 0

    # Full dataset
    dataset = make_dataset.tf_dataset(train_tfrecords,
                                      batch_size=6,
                                      shuffle=False)
    counter = 0
    labels = []
    center_pixels = []
    for data, label in dataset:  # turn off repeat
        counter += data.shape[0]
        center_pixels.append(data[:, 4, 4, 0])
        labels.append(label)

    labels = np.vstack(labels)
    labels = np.argmax(labels, 1)
    # NOTE(review): center_pixels is collected but never asserted on —
    # it still smoke-tests the (batch, 9, 9, bands) indexing; consider
    # adding an assertion or dropping it.
    center_pixels = np.concatenate(center_pixels)

    # One epoch should be every pixel in the raster minus the 0-label pixels
    src = rasterio.open(ground_truth_raster)
    raster = src.read()
    non_zero = raster[raster != 0]
    assert counter == len(non_zero)

    # The unique values should have the same indexing
    np.testing.assert_equal(np.unique(non_zero), np.unique(labels))

    # Same frequency, take one value as an example
    assert len(raster[raster == 7]) == len(labels[labels == 7])
def test_tf_dataset_predict(predict_tfrecords):
    """A predict-mode dataset yields (image, x, y) and covers every cell."""
    # Tensorflow encodes string as b bytes
    dataset = make_dataset.tf_dataset(predict_tfrecords,
                                      batch_size=20,
                                      shuffle=False,
                                      mode="predict")
    seen = 0
    for data, x, y in dataset:
        image_batch = data.numpy()
        assert not image_batch.sum() == 0
        # Materializing the indices verifies they are tensor-valued
        x = x.numpy()
        y = y.numpy()
        seen += data.shape[0]

    # Exactly one prediction per cell of the 25 x 50 raster
    assert seen == 25 * 50