def create_training_dataset(datapath, experiment, neighborhood_sizes, labels):
    """Create a training dataset that will feed the classifier in the
    training step

    Parameters
    ----------
    datapath : str
        Root of the data folder
    experiment : str
        Name of the experiment, used for identifying the accurate subfolder
    neighborhood_sizes : list
        List of neighborhood sizes (numbers of neighbors)
    labels : dict
        Dataset glossary

    Returns
    -------
    pd.DataFrame
        Shuffled training dataset, without point coordinates
    """
    dfs = []
    for label in labels.keys():
        df = io.load_features(datapath, experiment, neighborhood_sizes, label)
        if df is not None:
            df["label"] = labels[label]["id"]
            dfs.append(df)
    df = pd.concat(dfs, axis=0)
    return df.sample(frac=1.).drop(columns=["x", "y", "z"])
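
# Hypothetical usage sketch: the glossary structure (label name -> {"id": ...})
# mirrors the one exercised in the tests below; the path, experiment name and
# neighborhood sizes are illustrative only.
#
#   glossary = {"vegetation": {"id": 0}, "ground": {"id": 1}}
#   train = create_training_dataset("data", "b9", [10, 50, 200], glossary)
#   X, y = train.drop(columns=["label"]), train["label"]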
def main(opts):
    config_path = Path("config", opts.config_file)
    feature_config = io.read_config(config_path)
    experiment = opts.input_file.split(".")[0]
    data = io.load_features(opts.datapath, experiment, opts.neighbors)
    points = data[["x", "y", "z"]].copy()
    # Normalize every column except the planar coordinates; "z" is kept as a
    # clustering feature.
    for c in data.drop(columns=["x", "y"]):
        data[c] = max_normalize(data[c])
    data = add_accumulation_features(data, feature_config)
    update_features(data, feature_config)
    data.drop(columns=["x", "y"], inplace=True)
    logger.info("Compute %s clusters...", opts.nb_clusters)
    labels = compute_clusters(
        data, n_clusters=opts.nb_clusters, batch_size=KMEAN_BATCH, seed=SEED
    )
    # Postprocessing
    if opts.postprocessing_neighbors > 0:
        logger.info(
            "Post-process point labels by batches of %s", POSTPROCESSING_BATCH
        )
        tree = compute_tree(points, POSTPROCESSING_KDTREE_LEAFS)
        gen = postprocess.batch_points(points, POSTPROCESSING_BATCH)
        labels = postprocess.postprocess_batch_labels(
            gen, POSTPROCESSING_BATCH, labels, tree,
            opts.postprocessing_neighbors
        )
    colored_results = colorize_labels(points, labels)
    save_labels(
        colored_results, opts.datapath, experiment, opts.neighbors, "kmeans",
        opts.nb_clusters, config_path.stem, opts.postprocessing_neighbors,
        opts.xyz
    )
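
# `max_normalize` is not defined in this snippet; a minimal sketch of the
# behavior its name and use imply (per-column scaling by the maximum), under
# an assumed name so it cannot shadow the real helper:
def _max_normalize_sketch(series):
    """Divide a pandas Series by its maximum value (assumed behavior)."""
    return series / series.max()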
def test_load_features():
    """Test the feature loading process: it must work with a full scene as
    well as with sampled point clouds, and with several neighborhood sizes as
    well as with a single one.
    """
    # signature: load_features(datapath, experiment, neighbors, sample=None)
    NEIGHBORS = [10, 50, 200]
    N_FEATURES = 19
    # Test for sample == None
    # Expect 3 coordinate columns (x, y, z) plus 19 features per
    # neighborhood size
    features = io.load_features(DATADIR, "b9", NEIGHBORS)
    assert features.shape == (22300, 3 + len(NEIGHBORS) * N_FEATURES)
    # Test for sample not None
    features = io.load_features(DATADIR, "b9", NEIGHBORS, "foo")
    assert features is None  # 'foo' is not a valid entry
    # Test for a single neighborhood size
    features = io.load_features(DATADIR, "b9", [NEIGHBORS[0]])
    assert features.shape == (22300, 3 + N_FEATURES)
def test_normalize_features():
    """Test the feature normalization process; it normalizes each feature of
    the input dataframe.
    """
    features = load_features(DATADIR, EXPERIMENT, NEIGHBORHOOD_SIZES)
    norm_features = normalize_features(features)
    assert norm_features.shape == features.shape
    for feature in norm_features:
        assert abs(norm_features[feature].mean()) < 1e-3
        assert abs(norm_features[feature].std() - 1) < 1e-3
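
# These assertions imply z-score standardization (zero mean, unit standard
# deviation per column). A minimal sketch of the behavior they describe,
# under an assumed name so it cannot be confused with the real helper:
def _standardize_sketch(df):
    """Center each column and scale it to unit standard deviation."""
    return (df - df.mean()) / df.std()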
def test_compute_clusters():
    """Test the k-means clustering procedure: it must give as many labels as
    there are individuals in the dataset, and the labels must lie in the
    range [0, N_CLUSTERS).
    """
    features = load_features(DATADIR, EXPERIMENT, NEIGHBORHOOD_SIZES)
    labels = compute_clusters(features, n_clusters=N_CLUSTERS, batch_size=0)
    assert labels.shape == (features.shape[0],)
    assert set(np.unique(labels)) == set(range(N_CLUSTERS))
    b_labels = compute_clusters(features, n_clusters=N_CLUSTERS, batch_size=50)
    assert b_labels.shape == (features.shape[0],)
    assert set(np.unique(b_labels)) == set(range(N_CLUSTERS))
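
# The batch_size switch exercised above suggests compute_clusters dispatches
# between scikit-learn's plain KMeans (batch_size == 0) and MiniBatchKMeans
# (batch_size > 0). A hedged sketch of that dispatch, not the library's
# actual implementation:
from sklearn.cluster import KMeans, MiniBatchKMeans

def _compute_clusters_sketch(df, n_clusters, batch_size=0, seed=None):
    """Assumed KMeans/MiniBatchKMeans dispatch driven by batch_size."""
    if batch_size > 0:
        model = MiniBatchKMeans(
            n_clusters=n_clusters, batch_size=batch_size, random_state=seed
        )
    else:
        model = KMeans(n_clusters=n_clusters, random_state=seed)
    return model.fit_predict(df)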
def main(opts):
    experiment = opts.input_file.split(".")[0]
    logger.info("Load data from %s dataset...", experiment)
    df = io.load_features(opts.datapath, experiment, opts.neighbors)
    logger.info("Load the trained classifier...")
    model_dir = Path(opts.datapath, "trained_models")
    if opts.generalized_model:
        model_filename = "logreg-" + io.instance(opts.neighbors) + ".pkl"
    else:
        model_filename = (
            experiment + "-" + io.instance(opts.neighbors) + ".pkl"
        )
    with open(model_dir / model_filename, "rb") as fobj:
        clf = pickle.load(fobj)
    logger.info("Predict labels...")
    points = df[["x", "y", "z"]].copy()
    df.drop(columns=["x", "y", "z"], inplace=True)
    labels = clf.predict(df)
    # Postprocessing
    if opts.postprocessing_neighbors > 0:
        logger.info(
            "Post-process point labels by batches of %s", POSTPROCESSING_BATCH
        )
        tree = compute_tree(points, POSTPROCESSING_KDTREE_LEAFS)
        gen = postprocess.batch_points(points, POSTPROCESSING_BATCH)
        labels = postprocess.postprocess_batch_labels(
            gen, POSTPROCESSING_BATCH, labels, tree,
            opts.postprocessing_neighbors
        )
    logger.info("Save predictions on disk...")
    outdf = classification.colorize_labels(points, labels, GLOSSARY)
    classification.save_labels(
        outdf, opts.datapath, experiment, opts.neighbors,
        algorithm="logreg", config_name="full",
        pp_neighbors=opts.postprocessing_neighbors, xyz=opts.xyz
    )
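
# A hedged sketch of the training-side counterpart that would produce the
# pickle loaded above; scikit-learn's LogisticRegression is assumed from the
# "logreg" filename prefix, and `train` stands for the dataframe returned by
# create_training_dataset:
#
#   from sklearn.linear_model import LogisticRegression
#   clf = LogisticRegression(max_iter=1000).fit(
#       train.drop(columns=["label"]), train["label"]
#   )
#   with open(model_dir / model_filename, "wb") as fobj:
#       pickle.dump(clf, fobj)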
def test_colorize_labels():
    """Test the label colorization procedure: it must return a pandas
    dataframe with XYZ and RGB features, and the number of computed RGB
    triplets must correspond to the cluster quantity.

    The user may choose their own color palette; in such a case, the
    corresponding RGB triplets must be retrieved at the end of the process.
    """
    features = load_features(DATADIR, EXPERIMENT, NEIGHBORHOOD_SIZES)
    labels = np.random.randint(0, N_CLUSTERS, features.shape[0])
    df_color = colorize_labels(features[["x", "y", "z"]], labels)
    assert set(df_color.columns) == set("xyzrgb")
    assert len(df_color[["r", "g", "b"]].drop_duplicates()) == N_CLUSTERS
    colors = [(0, 0, 255), (51, 102, 153), (0, 255, 51), (255, 102, 204)]
    glossary = {
        "foo": {"id": 0, "color": (0.0, 0.0, 1.0)},
        "bar": {"id": 1, "color": (0.2, 0.4, 0.6)},
        "dummy": {"id": 2, "color": (0.0, 1.0, 0.2)},
        "doe": {"id": 3, "color": (1.0, 0.4, 0.8)},
    }
    df_color = colorize_labels(features[["x", "y", "z"]], labels, glossary)
    assert set(df_color.columns) == set("xyzrgb")
    assert len(df_color[["r", "g", "b"]].drop_duplicates()) == N_CLUSTERS
    unique_output_colors = df_color[["r", "g", "b"]].drop_duplicates().values
    assert np.all([c in unique_output_colors for c in colors])
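
# The glossary stores colors as floats in [0, 1] while the expected triplets
# above are 0-255 integers, so colorize_labels presumably rescales them. A
# hedged sketch of that mapping (default-palette handling omitted):
import pandas as pd

def _colorize_labels_sketch(points, labels, glossary):
    """Map label ids to 0-255 RGB triplets taken from the glossary colors."""
    palette = {
        item["id"]: tuple(int(round(255 * c)) for c in item["color"])
        for item in glossary.values()
    }
    rgb = pd.DataFrame(
        [palette[label] for label in labels],
        columns=["r", "g", "b"], index=points.index
    )
    return pd.concat([points, rgb], axis=1)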