Example #1
0
File: cw2.py Project: arlyon/dmml
def train_test(ctx, test_dir, train_data_offset: int):
    """
    Train on one data set and evaluate on a separate one.

    The training set is loaded from ``ctx.obj["data_folder"]`` and the test
    set from ``test_dir``; both are passed through ``move_data`` together
    with ``train_data_offset`` before the model is built. Every metric the
    model reports for the test split is then printed.
    """
    settings = ctx.obj

    print("loading data...")
    train_images, train_labels = load_data(settings["data_folder"],
                                           shuffle_seed=settings["seed"])
    test_images, test_labels = load_data(test_dir,
                                         shuffle_seed=settings["seed"])

    (train_images, train_labels,
     test_images, test_labels) = move_data(train_data_offset,
                                           train_images,
                                           train_labels,
                                           test_images,
                                           test_labels)

    print("")
    banner = f"training {ctx.obj['classifier'].value} model with {train_data_offset} moved..."
    print(banner)
    model, _ = build_model(ctx,
                           train_images,
                           train_labels,
                           train_test.name,
                           batch_size=settings["batch_size"])

    # The test inputs are divided by 255 and the labels one-hot encoded
    # here, just before evaluation.
    scaled_test_images = test_images / 255
    encoded_test_labels = k.utils.to_categorical(test_labels)

    print("")
    print("evaluating model...")
    # Read the metric names before evaluating, then pair them with the values.
    metric_names = model.metrics_names
    metric_values = model.evaluate(scaled_test_images,
                                   encoded_test_labels,
                                   batch_size=settings["batch_size"],
                                   verbose=settings["verbosity"] > 1)
    for metric_name, metric_value in zip(metric_names, metric_values):
        print(f" - {metric_name}: {metric_value}")
Example #2
0
File: cw1.py Project: arlyon/dmml
def agglo_clustering(ctx):
    """
    Run agglomerative clustering on the data set and print its scores.

    Fits ``AgglomerativeClustering`` (ward linkage, 10 clusters) on the
    first value returned by ``load_data`` and prints five clustering scores
    comparing the predicted clusters with the third value returned by
    ``load_data``.
    """
    n_clusters = 10
    linkage_method = 'ward'

    print("loading data...")
    features, _, targets = load_data(ctx.obj["data_folder"],
                                     shuffle_seed=ctx.obj["seed"])

    print(
        "Running agglomerative clustering where linkage = {} and n_clusters = {}"
        .format(linkage_method, n_clusters))

    clusterer = AgglomerativeClustering(linkage=linkage_method,
                                        n_clusters=n_clusters)
    predicted = clusterer.fit_predict(features)

    targets = column_or_1d(targets)

    # (printed title, scoring function) pairs, reported in this order.
    # Note that the value printed as "Accuracy" is the adjusted Rand score.
    report = (
        ("Accuracy", metrics.adjusted_rand_score),
        ("Homogeneity Score", metrics.homogeneity_score),
        ("Completeness Score", metrics.completeness_score),
        ("V Measure Score", metrics.v_measure_score),
        ("Fowlkes Mallows Score", metrics.fowlkes_mallows_score),
    )
    for title, scorer in report:
        print(f"{title}: {scorer(targets, predicted)}.")
Example #3
0
File: cw1.py Project: arlyon/dmml
def em_clustering(ctx):
    """
    Run Gaussian Mixture clustering on the data set and print its scores.

    Fits a ``GaussianMixture`` (10 spherical components, verbose output) on
    the first value returned by ``load_data`` and prints five clustering
    scores comparing the predicted components with the third value returned
    by ``load_data``.
    """
    component_count = 10
    covariance_kind = 'spherical'

    print("loading data...")
    features, _, targets = load_data(ctx.obj["data_folder"],
                                     shuffle_seed=ctx.obj["seed"])

    print("Running Gaussian Mixture...")

    mixture = GaussianMixture(n_components=component_count,
                              covariance_type=covariance_kind,
                              verbose=2)
    predicted = mixture.fit_predict(features)

    targets = column_or_1d(targets)

    # (printed title, scoring function) pairs, reported in this order.
    # Note that the value printed as "Accuracy" is the adjusted Rand score.
    report = (
        ("Accuracy", metrics.adjusted_rand_score),
        ("Homogeneity Score", metrics.homogeneity_score),
        ("Completeness Score", metrics.completeness_score),
        ("V Measure Score", metrics.v_measure_score),
        ("Fowlkes Mallows Score", metrics.fowlkes_mallows_score),
    )
    for title, scorer in report:
        print(f"{title}: {scorer(targets, predicted)}.")
Example #4
0
File: cw1.py Project: arlyon/dmml
def bayes_complex(ctx, n):
    """
    Improve bayesian classification and make conclusions.

    For every feature count from 1 to ``n`` the classifiers are refitted on
    the union of each label's top features, and the average accuracy per
    feature count is plotted as a scatter graph. Per-label accuracy is also
    printed for the 2, 5 and 10 feature subsets; counts larger than ``n``
    were never computed and are left out of that report instead of raising
    ``KeyError``.

    - https://github.com/arlyon/dmml/issues/5

    :param ctx: click context; ``ctx.obj`` supplies ``data_folder``,
        ``seed``, ``save_plot`` and ``show_plot``.
    :param n: largest number of top features per label to analyse.
    """

    print("loading data...")
    x_train, y_train, _ = load_data(ctx.obj["data_folder"],
                                    shuffle_seed=ctx.obj["seed"])

    print("")
    print(
        f"building accuracy graph over {n} features sorted by correlation...")

    save_plot = ctx.obj["save_plot"]
    show_plot = ctx.obj["show_plot"]

    label_classifiers = fit_labels(x_train, y_train)

    # dictionary mapping a feature count to the analyses generated from the
    # union of every label's top features of that size
    feature_analyses = {}
    with click.progressbar(range(1, n + 1)) as bar:
        # a dedicated loop name keeps the parameter ``n`` intact
        for feature_count in bar:
            top_pixels = set(
                itertools.chain.from_iterable(
                    classifier.top_features[:feature_count]
                    for classifier in label_classifiers.values()))
            # the selected pixels are looked up as string column names
            columns = [str(pixel) for pixel in top_pixels]
            feature_analyses[feature_count] = fit_labels(
                x_train[columns], y_train)

    # for each feature count build an (average accuracy, feature count) pair
    average_data = [
        (sum(analysis.correct_count / analysis.total_count
             for analysis in analyses.values()) / len(analyses),
         feature_count)
        for feature_count, analyses in feature_analyses.items()
    ]

    average_accuracy = pandas.DataFrame(
        data=average_data,
        columns=["prediction accuracy", "number of features"])

    print("")
    print("accuracy for 2, 5, and 10 top features per label:")
    # only report the feature counts that were actually analysed (n may be < 10)
    reported_counts = [
        count for count in (2, 5, 10) if count in feature_analyses
    ]
    for label in label_mapping:
        features = " / ".join(
            f"{count} features {100 * feature_analyses[count][label].correct_count / feature_analyses[count][label].total_count:.2f}%"
            for count in reported_counts)
        print(f" - {label} / {features}")

    average_accuracy.plot(kind='scatter',
                          x='number of features',
                          y='prediction accuracy')
    plt.title("Accuracy using n top correlating features for each label")

    if save_plot is not None:
        path = os.path.join(save_plot, "feature_accuracy.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)

    if show_plot:
        plt.show()
Example #5
0
File: cw1.py Project: arlyon/dmml
def bayes_tan(ctx):
    """
    Bayesian Network classification and conclusions.

    Fits one ``DiscreteBayesNetClassifier`` analysis per label via
    ``fit_labels``, prints each label's accuracy and its ten most correlated
    pixels, draws a heat map per label and finally prints the average
    accuracy over all labels.

    - https://github.com/arlyon/dmml/issues/8

    """
    print("loading data...")
    x_train, y_train, _ = load_data(ctx.obj["data_folder"],
                                    shuffle_seed=ctx.obj["seed"])
    assert x_train is not None

    print("")
    print("running bayesian network classification on all features...")

    plot_dir = ctx.obj["save_plot"]
    interactive = ctx.obj["show_plot"]

    analyses = fit_labels(x_train,
                          y_train,
                          classifier=DiscreteBayesNetClassifier)

    for label, analysis in analyses.items():
        correct = analysis.correct_count
        total = analysis.total_count
        accuracy = f"{correct} out of {total} ({correct / total * 100:.2f}%)"
        styled_label = click.style(label, fg='green')
        styled_accuracy = click.style(accuracy, fg='bright_black')
        print(f" - {styled_label}: {styled_accuracy}")

        best_pixels = analysis.top_features[:10]
        styled_count = click.style(str(len(best_pixels)), fg='yellow')
        styled_coords = click.style(', '.join(best_pixels.pixel_coords()),
                                    fg='bright_black')
        print(f"   {styled_count} most correlated pixels: {styled_coords}")

        plt.imshow(analysis.heat_map, cmap='hot', interpolation='lanczos')
        plt.title("Heatmap for " + label)

        # one image per label, written only when an output directory is set
        if plot_dir is not None:
            plt.savefig(os.path.join(plot_dir, label + ".png"))

        if interactive:
            plt.show()

    mean_accuracy = sum(analysis.correct_count / analysis.total_count
                        for analysis in analyses.values()) / len(analyses)
    print(f"average accuracy: {mean_accuracy * 100:.2f}%")
Example #6
0
File: cw2.py Project: arlyon/dmml
def kfold(ctx, splits):
    """
    Train the model using k-fold method on one data set.

    The data set is split into ``splits`` stratified folds. For each fold a
    fresh model is built on the training part and evaluated on the held-out
    part; the metrics of every fold are collected into one table that is
    printed in full at the end.

    :param ctx: click context; ``ctx.obj`` supplies ``data_folder``,
        ``seed``, ``classifier``, ``batch_size`` and ``verbosity``.
    :param splits: number of folds to run.
    """
    print("loading data...")
    images, labels = load_data(ctx.obj["data_folder"],
                               shuffle_seed=ctx.obj["seed"])

    print("")
    print(
        f"running k-fold with {splits} folds on a {ctx.obj['classifier'].value} model..."
    )

    # No random_state is passed: it only has an effect together with
    # shuffle=True, and recent scikit-learn releases raise ValueError when it
    # is set without shuffling. The seed is already handed to load_data.
    splitter = StratifiedKFold(n_splits=splits)

    # one dict of metric name -> value per fold
    fold_scores = []
    for fold, (train_indices, test_indices) in enumerate(
            splitter.split(images, labels), start=1):
        print(f" - training fold {fold}")
        train_images = images.iloc[train_indices]
        train_labels = labels.iloc[train_indices]
        # NOTE(review): train_test divides its test images by 255 before
        # evaluating, this command does not - confirm whether build_model /
        # load_data already normalise the images for this path.
        test_images = images.iloc[test_indices]
        test_labels = k.utils.to_categorical(labels.iloc[test_indices])
        model, _ = build_model(ctx,
                               train_images,
                               train_labels,
                               kfold.name,
                               batch_size=ctx.obj["batch_size"])

        print(f"   evaluating fold {fold}")
        fold_scores.append(
            dict(
                zip(
                    model.metrics_names,
                    model.evaluate(test_images,
                                   test_labels,
                                   batch_size=ctx.obj["batch_size"],
                                   verbose=ctx.obj["verbosity"] > 1))))

    # Build the table once; DataFrame.append was removed in pandas 2.0.
    scores = pandas.DataFrame(fold_scores)

    print("")
    with pandas.option_context('display.max_rows', None, 'display.max_columns',
                               None):
        print(scores)
Example #7
0
File: cw1.py Project: arlyon/dmml
def k_clustering(ctx, sweep_features, sweep_variance, sweep_clusters):
    """
    K-means clustering function to be run on dataset.
    Includes simple analysis of results.

    Runs k-means with 10 clusters on the features alone and on the features
    plus the class column, optionally runs the feature / variance / cluster
    sweeps, and then re-runs k-means with 28 clusters on the 122 best
    features. A contingency matrix is plotted for the base and for the
    optimal predictions.

    :param ctx: click context; ``ctx.obj`` supplies ``data_folder``,
        ``seed``, ``save_plot`` and ``show_plot``.
    :param sweep_features: run the k-best feature sweep when truthy.
    :param sweep_variance: run the variance sweep when truthy.
    :param sweep_clusters: run the cluster count sweep when truthy.
    """
    save_plot = ctx.obj["save_plot"]
    show_plot = ctx.obj["show_plot"]

    print("loading data...")
    features, boolean_labels, labels = load_data(ctx.obj["data_folder"],
                                                 shuffle_seed=ctx.obj["seed"])
    _, n_features = features.shape
    # the class column is appended under the next free integer column name
    features_with_labels = features.copy()
    features_with_labels[n_features] = labels

    # Save the numpy random state so it can be restored before every fit
    seed = numpy.random.get_state()

    # Run k-clustering excluding the class attribute
    model = KMeans(n_clusters=10)
    print("running k-means clustering on all features except class...")
    numpy.random.set_state(seed)
    base_predictions = model.fit_predict(features)
    score_clustering(labels, base_predictions, print_score=True)

    # Run k-clustering including the class attribute
    print("running k-means clustering on all features including class...")
    numpy.random.set_state(seed)
    score_clustering(labels,
                     model.fit_predict(features_with_labels),
                     print_score=True)

    # Perform analytical sweeps of features, variance and clusters
    best_feature_n = None
    if sweep_features:
        best_feature_n = feature_sweep(features,
                                       boolean_labels,
                                       labels,
                                       seed,
                                       save_plot,
                                       show_plot,
                                       n_features=20)

    if sweep_variance:
        variance_sweep(features, labels, seed, save_plot, show_plot, step=500)

    best_cluster_n = None
    if sweep_clusters:
        best_cluster_n = cluster_sweep(features,
                                       labels,
                                       seed,
                                       save_plot,
                                       show_plot,
                                       n_clusters=50,
                                       step=1)

    # Plotting the contingency matrix for the base prediction
    _plot_contingency_matrix(
        labels,
        base_predictions,
        "Base Prediction mapping centroids against class labels",
        "base_prediction_matrix.png",
        save_plot,
        show_plot)

    # Running k-clustering with the hard-coded settings taken from the sweeps
    print("Running k-means clustering with optimal settings")
    model.set_params(n_clusters=28)
    selector = SelectKBest(k=122)
    numpy.random.set_state(seed)
    optimal_predictions = model.fit_predict(
        selector.fit_transform(features, column_or_1d(labels)))
    score_clustering(labels, optimal_predictions, print_score=True)

    # Plotting contingency matrix from optimal predictions
    _plot_contingency_matrix(
        labels,
        optimal_predictions,
        "Optimal Prediction mapping centroids against class labels",
        "optimal_prediction_matrix.png",
        save_plot,
        show_plot)

    # Print out optimal results from sweep analysis; each message is guarded
    # by the result of its own sweep.
    if best_feature_n is not None:
        print(f"Ideal number of k-best features is {best_feature_n}.")
    if best_cluster_n is not None:
        print(f"Ideal number of clusters is {best_cluster_n}.")

    print("Analysis Completed.")


def _plot_contingency_matrix(labels, predictions, title, filename, save_plot,
                             show_plot):
    """Draw the label/cluster contingency matrix, then save and/or show it."""
    matrix = metrics.cluster.contingency_matrix(column_or_1d(labels),
                                                predictions)
    plt.imshow(matrix, cmap="hot")
    plt.title(title)
    plt.xlabel("Cluster Centroid Label")
    plt.ylabel("Actual Label")
    if save_plot is not None:
        path = os.path.join(save_plot, filename)
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    # clear the figure so the next plot starts from an empty canvas
    plt.clf()
Example #8
0
File: cw1.py Project: arlyon/dmml
def bayes_simple(ctx):
    """
    Naive Bayesian Classification and Deeper Analysis.

    Fits one analysis per label via ``fit_labels``, prints each label's
    accuracy and ten most correlated pixels with a heat map, prints the
    average accuracy, plots which labels are mistaken for one another, lists
    the three labels most often mistaken with each label, and finally lists
    the ten pixels that appear most often among the labels' top features.

    - https://github.com/arlyon/dmml/issues/3
    - https://github.com/arlyon/dmml/issues/4
    """

    print("loading data...")
    x_train, y_train, labels = load_data(ctx.obj["data_folder"],
                                         shuffle_seed=ctx.obj["seed"])

    print("")
    print("running bayesian classification on all features...")

    plot_dir = ctx.obj["save_plot"]
    interactive = ctx.obj["show_plot"]

    analyses = fit_labels(x_train, y_train)

    for label, analysis in analyses.items():
        correct = analysis.correct_count
        total = analysis.total_count
        accuracy = f"{correct} out of {total} ({correct / total * 100:.2f}%)"
        styled_label = click.style(label, fg='green')
        styled_accuracy = click.style(accuracy, fg='bright_black')
        print(f" - {styled_label}: {styled_accuracy}")

        best_pixels = analysis.top_features[:10]
        styled_count = click.style(str(len(best_pixels)), fg='yellow')
        styled_coords = click.style(', '.join(best_pixels.pixel_coords()),
                                    fg='bright_black')
        print(f"   {styled_count} most correlated pixels: {styled_coords}")

        plt.imshow(analysis.heat_map, cmap='hot', interpolation='lanczos')
        plt.title("Heatmap for " + label)

        if plot_dir is not None:
            plt.savefig(os.path.join(plot_dir, label + ".png"))

        if interactive:
            plt.show()

    mean_accuracy = sum(analysis.correct_count / analysis.total_count
                        for analysis in analyses.values()) / len(analyses)
    print(f"average accuracy: {mean_accuracy * 100:.2f}%")

    print("")
    print("mistaken classifications:")
    confusion_heatmap = calculate_most_mistaken_heatmap(analyses, labels)
    plt.imshow(confusion_heatmap, cmap='hot')
    plt.title("Which signs are most frequently mislabeled as another?")
    plt.xlabel("mistaken label")
    plt.ylabel("actual label")

    if plot_dir is not None:
        plt.savefig(os.path.join(plot_dir, "mislabeled.png"))

    if interactive:
        plt.show()

    for label, analysis in analyses.items():
        # count, by name, the labels of the rows listed in mistake_indices
        mistaken_names = Counter(
            label_mapping[value]
            for value in labels.loc[analysis.mistake_indices].label)
        summary = ", ".join(f"{name} ({count})"
                            for name, count in mistaken_names.most_common(3))
        styled_label = click.style(label, fg='green')
        styled_summary = click.style(summary, fg='bright_black')
        print(f" - mistaken with {styled_label}: {styled_summary}")

    print("")
    print("10 most frequently influential features:")
    pixel_tally = Counter(
        itertools.chain.from_iterable(analysis.top_features[:10]
                                      for analysis in analyses.values()))
    for pixel, count in pixel_tally.most_common(10):
        # the pixel index is printed as (index % 48) x (index // 48)
        print(f" - {pixel % 48}x{pixel // 48} (top feature {count} times)")