def train_test(ctx, test_dir, train_data_offset: int):
    """Train the model using separate training and testing data sets."""
    print("loading data...")
    train_images, train_labels = load_data(ctx.obj["data_folder"],
                                           shuffle_seed=ctx.obj["seed"])
    test_images, test_labels = load_data(test_dir, shuffle_seed=ctx.obj["seed"])

    # move the requested number of samples between the training and testing sets
    train_images, train_labels, test_images, test_labels = move_data(
        train_data_offset, train_images, train_labels, test_images, test_labels)

    print("")
    print(
        f"training {ctx.obj['classifier'].value} model with {train_data_offset} samples moved..."
    )
    model, hist = build_model(ctx,
                              train_images,
                              train_labels,
                              train_test.name,
                              batch_size=ctx.obj["batch_size"])

    # normalise pixel values and one-hot encode the labels to match the model output
    test_images = test_images / 255
    test_labels = k.utils.to_categorical(test_labels)

    print("")
    print("evaluating model...")
    for key, value in zip(
            model.metrics_names,
            model.evaluate(test_images,
                           test_labels,
                           batch_size=ctx.obj["batch_size"],
                           verbose=ctx.obj["verbosity"] > 1)):
        print(f" - {key}: {value}")
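# `move_data`, used by train_test above, is defined elsewhere in the repository
# and is not shown here. A minimal sketch of one plausible implementation,
# assuming it simply shifts the first `offset` test samples into the training
# set (the actual direction and selection strategy may differ):
def _move_data_sketch(offset, train_x, train_y, test_x, test_y):
    """Hypothetical helper: move the first `offset` test samples into training."""
    moved_x, moved_y = test_x.iloc[:offset], test_y.iloc[:offset]
    train_x = pandas.concat([train_x, moved_x], ignore_index=True)
    train_y = pandas.concat([train_y, moved_y], ignore_index=True)
    return train_x, train_y, test_x.iloc[offset:], test_y.iloc[offset:]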
def agglo_clustering(ctx):
    """
    Agglomerative clustering function to be run on the dataset.
    """
    clusters = 10
    linkage = 'ward'

    print("loading data...")
    x_train, _, y_train = load_data(ctx.obj["data_folder"],
                                    shuffle_seed=ctx.obj["seed"])

    print(
        "Running agglomerative clustering where linkage = {} and n_clusters = {}"
        .format(linkage, clusters))
    model = AgglomerativeClustering(linkage=linkage, n_clusters=clusters)
    labels_predicted = model.fit_predict(x_train)

    # compare the predicted cluster assignments against the true labels
    y_train = column_or_1d(y_train)
    score = metrics.adjusted_rand_score(y_train, labels_predicted)
    print(f"Adjusted Rand Score: {score}.")
    score = metrics.homogeneity_score(y_train, labels_predicted)
    print(f"Homogeneity Score: {score}.")
    score = metrics.completeness_score(y_train, labels_predicted)
    print(f"Completeness Score: {score}.")
    score = metrics.v_measure_score(y_train, labels_predicted)
    print(f"V Measure Score: {score}.")
    score = metrics.fowlkes_mallows_score(y_train, labels_predicted)
    print(f"Fowlkes Mallows Score: {score}.")
def em_clustering(ctx):
    """
    Gaussian mixture (EM) clustering function to be run on the dataset.
    """
    covariance_type = 'spherical'
    n_components = 10

    print("loading data...")
    x_train, _, y_train = load_data(ctx.obj["data_folder"],
                                    shuffle_seed=ctx.obj["seed"])

    print("Running Gaussian Mixture...")
    model = GaussianMixture(n_components=n_components,
                            covariance_type=covariance_type,
                            verbose=2)
    labels_predicted = model.fit_predict(x_train)

    # compare the predicted cluster assignments against the true labels
    y_train = column_or_1d(y_train)
    score = metrics.adjusted_rand_score(y_train, labels_predicted)
    print(f"Adjusted Rand Score: {score}.")
    score = metrics.homogeneity_score(y_train, labels_predicted)
    print(f"Homogeneity Score: {score}.")
    score = metrics.completeness_score(y_train, labels_predicted)
    print(f"Completeness Score: {score}.")
    score = metrics.v_measure_score(y_train, labels_predicted)
    print(f"V Measure Score: {score}.")
    score = metrics.fowlkes_mallows_score(y_train, labels_predicted)
    print(f"Fowlkes Mallows Score: {score}.")
def bayes_complex(ctx, n):
    """
    Improve Bayesian classification and make conclusions.

    - https://github.com/arlyon/dmml/issues/5
    """
    print("loading data...")
    x_train, y_train, _ = load_data(ctx.obj["data_folder"],
                                    shuffle_seed=ctx.obj["seed"])

    print("")
    print(
        f"building accuracy graph over {n} features sorted by correlation...")

    save_plot = ctx.obj["save_plot"]
    show_plot = ctx.obj["show_plot"]

    label_classifiers = fit_labels(x_train, y_train)

    # dictionary mapping subsets of n features to the analyses generated from them
    feature_analyses = {}
    with click.progressbar(range(1, n + 1)) as bar:
        for n_top in bar:
            top_n_pixels = set(
                itertools.chain.from_iterable(
                    x.top_features[:n_top] for x in label_classifiers.values()))
            feature_analyses[n_top] = fit_labels(
                x_train[[str(x) for x in top_n_pixels]], y_train)

    # for each of the analyses, get a pair of (average accuracy, number of features)
    average_data = ((sum(a.correct_count / a.total_count
                         for a in analyses.values()) / len(analyses), count)
                    for count, analyses in feature_analyses.items())
    average_accuracy = pandas.DataFrame(
        data=average_data,
        columns=["prediction accuracy", "number of features"])

    print("")
    print("accuracy for 2, 5, and 10 top features per label:")
    for label in label_mapping:
        features = " / ".join(
            f"{x} features {100 * feature_analyses[x][label].correct_count / feature_analyses[x][label].total_count:.2f}%"
            for x in (2, 5, 10))
        print(f" - {label} / {features}")

    average_accuracy.plot(kind='scatter',
                          x='number of features',
                          y='prediction accuracy')
    plt.title("Accuracy using n top correlating features for each label")

    if save_plot is not None:
        path = os.path.join(save_plot, "feature_accuracy.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)

    if show_plot:
        plt.show()
def bayes_tan(ctx):
    """
    Bayesian Network and make conclusions.

    - https://github.com/arlyon/dmml/issues/8
    """
    print("loading data...")
    x_train, y_train, labels = load_data(ctx.obj["data_folder"],
                                         shuffle_seed=ctx.obj["seed"])
    assert x_train is not None

    print("")
    print("running bayesian network classification on all features...")

    save_plot = ctx.obj["save_plot"]
    show_plot = ctx.obj["show_plot"]

    label_classifiers = fit_labels(x_train,
                                   y_train,
                                   classifier=DiscreteBayesNetClassifier)
    for label, analysis in label_classifiers.items():
        accuracy = f"{analysis.correct_count} out of {analysis.total_count} ({analysis.correct_count / analysis.total_count * 100:.2f}%)"
        print(
            f" - {click.style(label, fg='green')}: {click.style(accuracy, fg='bright_black')}"
        )
        print(
            f"   {click.style(str(len(analysis.top_features[:10])), fg='yellow')} most correlated pixels: {click.style(', '.join(analysis.top_features[:10].pixel_coords()), fg='bright_black')}"
        )

        plt.imshow(analysis.heat_map, cmap='hot', interpolation='lanczos')
        plt.title("Heatmap for " + label)
        if save_plot is not None:
            plt.savefig(os.path.join(save_plot, label + ".png"))
        if show_plot:
            plt.show()

    print(
        f"average accuracy: {sum(analysis.correct_count / analysis.total_count for analysis in label_classifiers.values()) / len(label_classifiers) * 100:.2f}%"
    )
def kfold(ctx, splits):
    """Train the model using the k-fold method on one data set."""
    print("loading data...")
    images, labels = load_data(ctx.obj["data_folder"],
                               shuffle_seed=ctx.obj["seed"])

    print("")
    print(
        f"running k-fold with {splits} folds on a {ctx.obj['classifier'].value} model..."
    )

    scores = pandas.DataFrame()
    for fold, (train_indices, test_indices) in enumerate(
            StratifiedKFold(n_splits=splits,
                            random_state=ctx.obj["seed"]).split(
                                images, labels)):
        print(f" - training fold {fold+1}")
        train_images = images.iloc[train_indices]
        train_labels = labels.iloc[train_indices]
        test_images = images.iloc[test_indices]
        test_labels = k.utils.to_categorical(labels.iloc[test_indices])
        model, hist = build_model(ctx,
                                  train_images,
                                  train_labels,
                                  kfold.name,
                                  batch_size=ctx.obj["batch_size"])

        print(f"   evaluating fold {fold+1}")
        data = dict(
            zip(
                model.metrics_names,
                model.evaluate(test_images,
                               test_labels,
                               batch_size=ctx.obj["batch_size"],
                               verbose=ctx.obj["verbosity"] > 1)))
        scores = scores.append(data, ignore_index=True)

    print("")
    with pandas.option_context('display.max_rows', None, 'display.max_columns',
                               None):
        print(scores)
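# Note: two calls in kfold above rely on older library behaviour.
# `DataFrame.append` was removed in pandas 2.0, and recent scikit-learn releases
# reject a `random_state` on StratifiedKFold unless `shuffle=True` is also
# passed. A rough equivalent on current versions (a sketch, not the original
# code) would be:
#
#     folds = StratifiedKFold(n_splits=splits, shuffle=True,
#                             random_state=ctx.obj["seed"])
#     scores = pandas.concat([scores, pandas.DataFrame([data])],
#                            ignore_index=True)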
def k_clustering(ctx, sweep_features, sweep_variance, sweep_clusters):
    """
    K-means clustering function to be run on the dataset.

    Includes simple analysis of results.
    """
    save_plot = ctx.obj["save_plot"]
    show_plot = ctx.obj["show_plot"]

    print("loading data...")
    features, boolean_labels, labels = load_data(ctx.obj["data_folder"],
                                                 shuffle_seed=ctx.obj["seed"])
    n_samples, n_features = features.shape
    features_with_labels = features.copy()
    features_with_labels[n_features] = labels

    # Save the random state for consistent runs during the data analysis
    seed = numpy.random.get_state()

    # Run k-means clustering excluding the class attribute
    model = KMeans(n_clusters=10)
    print("running k-means clustering on all features except class...")
    numpy.random.set_state(seed)
    base_predictions = model.fit_predict(features)
    score_clustering(labels, base_predictions, print_score=True)

    # Run k-means clustering including the class attribute
    print("running k-means clustering on all features including class...")
    numpy.random.set_state(seed)
    score_clustering(labels,
                     model.fit_predict(features_with_labels),
                     print_score=True)

    # Perform analytical sweeps of features, variance and clusters
    best_feature_n = None
    if sweep_features:
        best_feature_n = feature_sweep(features,
                                       boolean_labels,
                                       labels,
                                       seed,
                                       save_plot,
                                       show_plot,
                                       n_features=20)
    if sweep_variance:
        variance_sweep(features, labels, seed, save_plot, show_plot, step=500)
    best_cluster_n = None
    if sweep_clusters:
        best_cluster_n = cluster_sweep(features,
                                       labels,
                                       seed,
                                       save_plot,
                                       show_plot,
                                       n_clusters=50,
                                       step=1)

    # Plot the contingency matrix for the base prediction
    matrix = metrics.cluster.contingency_matrix(column_or_1d(labels),
                                                base_predictions)
    plt.imshow(matrix, cmap="hot")
    plt.title("Base Prediction mapping centroids against class labels")
    plt.xlabel("Cluster Centroid Label")
    plt.ylabel("Actual Label")
    if save_plot is not None:
        path = os.path.join(save_plot, "base_prediction_matrix.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()

    # Run k-means clustering with results from the sweep analysis
    print("Running k-means clustering with optimal settings")
    model.set_params(n_clusters=28)
    selector = SelectKBest(k=122)
    numpy.random.set_state(seed)
    optimal_predictions = model.fit_predict(
        selector.fit_transform(features, column_or_1d(labels)))
    score_clustering(labels, optimal_predictions, print_score=True)

    # Plot the contingency matrix from the optimal predictions
    matrix = metrics.cluster.contingency_matrix(column_or_1d(labels),
                                                optimal_predictions)
    plt.imshow(matrix, cmap="hot")
    plt.title("Optimal Prediction mapping centroids against class labels")
    plt.xlabel("Cluster Centroid Label")
    plt.ylabel("Actual Label")
    if save_plot is not None:
        path = os.path.join(save_plot, "optimal_prediction_matrix.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()

    # Print out the optimal results from the sweep analysis
    if best_feature_n:
        print(f"Ideal number of k-best features is {best_feature_n}.")
    if best_cluster_n:
        print(f"Ideal number of clusters is {best_cluster_n}.")
    print("Analysis Completed.")
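# `score_clustering`, used by k_clustering above, is defined elsewhere in the
# repository. A minimal sketch of a helper with a compatible signature, assuming
# it reports the same external cluster-validity metrics that agglo_clustering
# and em_clustering print (the real helper may compute or return something
# different):
def _score_clustering_sketch(labels, predicted, print_score=False):
    """Hypothetical helper: score predicted clusters against the true labels."""
    y_true = column_or_1d(labels)
    scores = {
        "Adjusted Rand Score": metrics.adjusted_rand_score(y_true, predicted),
        "Homogeneity Score": metrics.homogeneity_score(y_true, predicted),
        "Completeness Score": metrics.completeness_score(y_true, predicted),
        "V Measure Score": metrics.v_measure_score(y_true, predicted),
        "Fowlkes Mallows Score": metrics.fowlkes_mallows_score(y_true, predicted),
    }
    if print_score:
        for name, value in scores.items():
            print(f"{name}: {value}.")
    return scores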
def bayes_simple(ctx):
    """
    Naive Bayesian Classification and Deeper Analysis.

    - https://github.com/arlyon/dmml/issues/3
    - https://github.com/arlyon/dmml/issues/4
    """
    print("loading data...")
    x_train, y_train, labels = load_data(ctx.obj["data_folder"],
                                         shuffle_seed=ctx.obj["seed"])

    print("")
    print("running bayesian classification on all features...")

    save_plot = ctx.obj["save_plot"]
    show_plot = ctx.obj["show_plot"]

    label_classifiers = fit_labels(x_train, y_train)
    for label, analysis in label_classifiers.items():
        accuracy = f"{analysis.correct_count} out of {analysis.total_count} ({analysis.correct_count / analysis.total_count * 100:.2f}%)"
        print(
            f" - {click.style(label, fg='green')}: {click.style(accuracy, fg='bright_black')}"
        )
        print(
            f"   {click.style(str(len(analysis.top_features[:10])), fg='yellow')} most correlated pixels: {click.style(', '.join(analysis.top_features[:10].pixel_coords()), fg='bright_black')}"
        )

        plt.imshow(analysis.heat_map, cmap='hot', interpolation='lanczos')
        plt.title("Heatmap for " + label)
        if save_plot is not None:
            plt.savefig(os.path.join(save_plot, label + ".png"))
        if show_plot:
            plt.show()

    print(
        f"average accuracy: {sum(analysis.correct_count / analysis.total_count for analysis in label_classifiers.values()) / len(label_classifiers) * 100:.2f}%"
    )

    print("")
    print("mistaken classifications:")
    most_mistaken = calculate_most_mistaken_heatmap(label_classifiers, labels)
    plt.imshow(most_mistaken, cmap='hot')
    plt.title("Which signs are most frequently mislabeled as another?")
    plt.xlabel("mistaken label")
    plt.ylabel("actual label")
    if save_plot is not None:
        plt.savefig(os.path.join(save_plot, "mislabeled.png"))
    if show_plot:
        plt.show()

    for label, analysis in label_classifiers.items():
        most_mistaken_label = Counter(
            label_mapping[x]
            for x in labels.loc[analysis.mistake_indices].label)
        n_most_mistaken = sorted(most_mistaken_label.items(),
                                 key=lambda x: x[1],
                                 reverse=True)[:3]
        most_mistaken = ", ".join(f"{key} ({count})"
                                  for key, count in n_most_mistaken)
        print(
            f" - mistaken with {click.style(label, fg='green')}: {click.style(most_mistaken, fg='bright_black')}"
        )

    print("")
    print("10 most frequently influential features:")
    counter = Counter(
        itertools.chain.from_iterable(x.top_features[:10]
                                      for x in label_classifiers.values()))
    for key, count in itertools.islice(
            sorted(counter.items(), key=lambda x: x[1], reverse=True), 10):
        print(f" - {key % 48}x{key // 48} (top feature {count} times)")