Example #1
0
def create_training_dataset(datapath, experiment, neighborhood_sizes, labels):
    """Create a training dataset that will feed the classifier in the training
    step

    Features are loaded label by label, tagged with the label id found in the
    glossary, concatenated and finally shuffled.

    Parameters
    ----------
    datapath : str
        Root of the data folder
    experiment : str
        Name of the experiment, used for identifying the accurate subfolder
    neighborhood_sizes : list
        List of number of neighbors
    labels : dict
        Dataset glossary; keys are passed to ``io.load_features`` and each
        value must provide an ``"id"`` entry, stored in the ``label`` column

    Returns
    -------
    pd.DataFrame
        Shuffled training dataset, without point coordinates

    Raises
    ------
    ValueError
        If no features could be loaded for any label of the glossary
    """
    dfs = []
    for label, description in labels.items():
        df = io.load_features(datapath, experiment, neighborhood_sizes, label)
        # The loader may return None for a label; such labels are skipped
        if df is not None:
            df["label"] = description["id"]
            dfs.append(df)
    if not dfs:
        # Fail with an explicit message rather than the opaque pd.concat error
        raise ValueError(
            f"No features found for experiment '{experiment}' in "
            f"'{datapath}' with the provided labels"
        )
    df = pd.concat(dfs, axis=0)
    # frac=1. samples every row, i.e. the dataset is shuffled
    return df.sample(frac=1.).drop(columns=["x", "y", "z"])
Example #2
0
def main(opts):
    """Cluster the points of an experiment with k-means and save the result.

    The features are loaded, max-normalized (every column but "x" and "y"),
    enriched through the feature configuration file, clustered, optionally
    post-processed, then colorized and saved.

    Parameters
    ----------
    opts : object
        Program options; the attributes read here are ``config_file``,
        ``input_file``, ``datapath``, ``neighbors``, ``nb_clusters``,
        ``postprocessing_neighbors`` and ``xyz`` (presumably an
        ``argparse.Namespace`` -- confirm against the caller)
    """
    config_path = Path("config", opts.config_file)
    feature_config = io.read_config(config_path)

    # The experiment name is the input file name up to its first dot
    experiment = opts.input_file.split(".")[0]
    data = io.load_features(opts.datapath, experiment, opts.neighbors)
    # Keep the raw coordinates aside: they are needed for post-processing and
    # for the output, whereas "data" is transformed below
    points = data[["x", "y", "z"]].copy()

    # Note that "z" is normalized along with the other features; only "x" and
    # "y" are left untouched
    for c in data.drop(columns=["x", "y"]):
        data[c] = max_normalize(data[c])

    data = add_accumulation_features(data, feature_config)
    update_features(data, feature_config)
    data.drop(columns=["x", "y"], inplace=True)

    logger.info("Compute %s clusters...", opts.nb_clusters)
    labels = compute_clusters(data,
                              n_clusters=opts.nb_clusters,
                              batch_size=KMEAN_BATCH,
                              seed=SEED)

    # Postprocessing
    if opts.postprocessing_neighbors > 0:
        # Report the batch size that is actually used for post-processing
        logger.info("Post-process point labels by batches of %s",
                    POSTPROCESSING_BATCH)
        tree = compute_tree(points, POSTPROCESSING_KDTREE_LEAFS)
        gen = postprocess.batch_points(points, POSTPROCESSING_BATCH)
        labels = postprocess.postprocess_batch_labels(
            gen, POSTPROCESSING_BATCH, labels, tree,
            opts.postprocessing_neighbors)

    colored_results = colorize_labels(points, labels)
    save_labels(colored_results, opts.datapath, experiment, opts.neighbors,
                "kmeans", opts.nb_clusters, config_path.stem,
                opts.postprocessing_neighbors, opts.xyz)
Example #3
0
def test_load_features():
    """Check the feature loading process in three situations: the full scene
    with several neighborhood sizes, an unknown sample name, and the full
    scene with a single neighborhood size.
    """
    neighborhood_sizes = [10, 50, 200]
    features_per_size = 19
    point_count = 22300
    # Columns expected on top of the per-neighborhood features (presumably
    # the x, y, z coordinates -- confirm against the loader)
    extra_columns = 3

    # Full scene (no sample given), every neighborhood size
    full_scene = io.load_features(DATADIR, "b9", neighborhood_sizes)
    full_width = extra_columns + len(neighborhood_sizes) * features_per_size
    assert full_scene.shape == (point_count, full_width)

    # A sample name is given, but 'foo' is not a valid entry
    unknown_sample = io.load_features(DATADIR, "b9", neighborhood_sizes, "foo")
    assert unknown_sample is None

    # Full scene again, restricted to the first neighborhood size
    single_size = io.load_features(DATADIR, "b9", neighborhood_sizes[:1])
    assert single_size.shape == (point_count,
                                 extra_columns + features_per_size)
def test_normalize_features():
    """Test the feature normalization process; it normalizes each feature of
    the input dataframe.

    Every normalized column must keep the input shape, have a mean close to 0
    and a standard deviation close to 1.
    """
    features = load_features(DATADIR, EXPERIMENT, NEIGHBORHOOD_SIZES)
    norm_features = normalize_features(features)
    assert norm_features.shape == features.shape
    for feature in norm_features:
        # Compare absolute deviations: a one-sided check would accept a
        # strongly negative mean or a standard deviation far below 1
        assert abs(norm_features[feature].mean()) < 1e-3
        assert abs(norm_features[feature].std() - 1) < 1e-3
def test_compute_clusters():
    """Check the k-means clustering procedure: one label per individual of
    the dataset, and the set of label values must be exactly
    {0, ..., N_CLUSTERS - 1}.

    The same checks are run for two batch sizes, 0 then 50.
    """
    features = load_features(DATADIR, EXPERIMENT, NEIGHBORHOOD_SIZES)
    expected_shape = (features.shape[0], )
    expected_ids = set(range(N_CLUSTERS))
    # NOTE(review): batch_size=0 presumably disables batching -- confirm
    # against compute_clusters
    for batch_size in (0, 50):
        cluster_ids = compute_clusters(
            features, n_clusters=N_CLUSTERS, batch_size=batch_size)
        assert cluster_ids.shape == expected_shape
        assert set(np.unique(cluster_ids)) == expected_ids
Example #6
0
def main(opts):
    """Predict point labels with a previously trained classifier and save
    the colored result.

    Parameters
    ----------
    opts : object
        Program options; the attributes read here are ``input_file``,
        ``datapath``, ``neighbors``, ``generalized_model``,
        ``postprocessing_neighbors`` and ``xyz`` (presumably an
        ``argparse.Namespace`` -- confirm against the caller)
    """
    # The experiment name is the input file name up to its first dot
    experiment = opts.input_file.split(".")[0]
    logger.info("Load data from %s dataset...", experiment)
    df = io.load_features(opts.datapath, experiment, opts.neighbors)

    logger.info("Load the trained classifier...")
    model_dir = Path(opts.datapath, "trained_models")
    # A generalized model is stored under the "logreg" prefix, an
    # experiment-specific one under the experiment name
    model_prefix = "logreg" if opts.generalized_model else experiment
    model_filename = model_prefix + "-" + io.instance(opts.neighbors) + ".pkl"
    # NOTE(review): pickle executes arbitrary code on load; the model file is
    # assumed to be a trusted, locally produced artifact
    with open(model_dir / model_filename, "rb") as fobj:
        clf = pickle.load(fobj)

    logger.info("Predict labels...")
    coordinate_columns = ["x", "y", "z"]
    # Coordinates are set aside: the classifier only receives the remaining
    # columns, the coordinates are reused for post-processing and output
    points = df[coordinate_columns].copy()
    df.drop(columns=coordinate_columns, inplace=True)
    labels = clf.predict(df)

    # Postprocessing
    pp_neighbors = opts.postprocessing_neighbors
    if pp_neighbors > 0:
        logger.info("Post-process point labels by batches of %s",
                    POSTPROCESSING_BATCH)
        tree = compute_tree(points, POSTPROCESSING_KDTREE_LEAFS)
        batches = postprocess.batch_points(points, POSTPROCESSING_BATCH)
        labels = postprocess.postprocess_batch_labels(
            batches, POSTPROCESSING_BATCH, labels, tree, pp_neighbors)

    logger.info("Save predictions on disk...")
    outdf = classification.colorize_labels(points, labels, GLOSSARY)
    classification.save_labels(
        outdf,
        opts.datapath,
        experiment,
        opts.neighbors,
        algorithm="logreg",
        config_name="full",
        pp_neighbors=opts.postprocessing_neighbors,
        xyz=opts.xyz,
    )
def test_colorize_labels():
    """Test the label colorization procedure: it must return a pandas dataframe
    with XYZ and RGB features, and the number of computed RGB triplets must
    correspond to the cluster quantity.

    The user may choose its own color palette. In such a case, one must
    retrieve corresponding RGB triplets at the end of the process.
    """
    features = load_features(DATADIR, EXPERIMENT, NEIGHBORHOOD_SIZES)
    labels = np.random.randint(0, N_CLUSTERS, features.shape[0])
    df_color = colorize_labels(features[["x", "y", "z"]], labels)
    assert set(df_color.columns) == set("xyzrgb")
    assert len(df_color[["r", "g", "b"]].drop_duplicates()) == N_CLUSTERS
    # Expected 0-255 triplets for the glossary colors defined below
    colors = [(0, 0, 255), (51, 102, 153), (0, 255, 51), (255, 102, 204)]
    glossary = {
        "foo": {
            "id": 0,
            "color": (0.0, 0.0, 1.0)
        },
        "bar": {
            "id": 1,
            "color": (0.2, 0.4, 0.6)
        },
        "dummy": {
            "id": 2,
            "color": (0.0, 1.0, 0.2)
        },
        "doe": {
            "id": 3,
            "color": (1.0, 0.4, 0.8)
        }
    }
    df_color = colorize_labels(features[["x", "y", "z"]], labels, glossary)
    assert set(df_color.columns) == set("xyzrgb")
    assert len(df_color[["r", "g", "b"]].drop_duplicates()) == N_CLUSTERS
    unique_output_colors = df_color[["r", "g", "b"]].drop_duplicates().values
    # Compare whole RGB rows: "tuple in ndarray" is an element-wise test that
    # succeeds as soon as a single channel value matches
    output_triplets = {tuple(row) for row in unique_output_colors.tolist()}
    assert all(c in output_triplets for c in colors)