def _axis_label_for_symbol(symbol, coordinate=None, decomposition_method=None, distribution=None, prefix="", suffix=""): if decomposition_method: decomposition_method = proper_string( normalise_string(decomposition_method), DECOMPOSITION_METHOD_NAMES) decomposition_label = DECOMPOSITION_METHOD_LABEL[decomposition_method] else: decomposition_label = "" if decomposition_label: decomposition_label = "\\mathrm{{{}}}".format(decomposition_label) if coordinate: coordinate_text = "{{{} {}}}".format(decomposition_label, coordinate) else: coordinate_text = "" if distribution == "prior": distribution_symbol = "\\theta" elif distribution == "posterior": distribution_symbol = "\\phi" else: distribution_symbol = "" if distribution_symbol and coordinate_text: distribution_position = "_" coordinate_position = "^" elif distribution_symbol and not coordinate_text: distribution_position = "_" coordinate_position = "" elif not distribution_symbol and coordinate_text: distribution_position = "" coordinate_position = "_" else: distribution_position = "" coordinate_position = "" if coordinate_position == "^": coordinate_text = "{{(" + coordinate_text + ")}}" elif coordinate_position == "_": coordinate_text = "{{" + coordinate_text + "}}" axis_label = "$" + "".join([ prefix, symbol, distribution_position, distribution_symbol, coordinate_position, coordinate_text, suffix ]) + "$" return axis_label
def __init__(self, method, number_of_clusters=None, training_set_kind=None):
    """Store the prediction method, cluster count and training-set kind.

    Args:
        method: Name or alias of a prediction method; resolved with
            ``proper_string`` against the aliases listed in
            ``PREDICTION_METHODS``.
        number_of_clusters: Number of clusters; must not be ``None``.
        training_set_kind: Optional kind of the training set; passed
            through ``normalise_string`` when truthy, stored unchanged
            otherwise.

    Raises:
        ValueError: If the resolved method is not a key of
            ``PREDICTION_METHODS``.
        TypeError: If ``number_of_clusters`` is ``None``.
    """
    aliases_by_method = {}
    for name, specifications in PREDICTION_METHODS.items():
        aliases_by_method[name] = specifications["aliases"]

    resolved_method = proper_string(method, aliases_by_method)

    # The method is validated before the cluster count.
    if resolved_method not in PREDICTION_METHODS:
        raise ValueError(
            "Prediction method `{}` not found.".format(resolved_method))
    if number_of_clusters is None:
        raise TypeError("Number of clusters not set.")

    self.method = resolved_method
    self.number_of_clusters = number_of_clusters
    self.training_set_kind = (
        normalise_string(training_set_kind)
        if training_set_kind else training_set_kind
    )
def decompose(values, other_value_sets={}, centroids={}, method=None,
              number_of_components=None, random=False):
    """Fit a decomposition model to values and transform related data.

    Args:
        values: Matrix to fit the model to; dense or SciPy sparse.
        other_value_sets: Additional values to transform with the fitted
            model, either a single matrix or a dictionary of matrices
            (``None`` entries are passed through). They are not transformed
            when the method is t-SNE. The empty-dictionary default means
            "not requested".
        centroids: Dictionary of centroid parameters, either directly
            (containing a ``"means"`` key) or keyed by distribution one
            level up. ``"means"`` and ``"covariance_matrices"`` entries are
            projected; other entries are passed through. Only handled for
            PCA. The empty-dictionary default means "not requested".
        method: Name of the decomposition method; defaults to
            ``defaults["decomposition_method"]``.
        number_of_components: Number of components to decompose to;
            defaults to ``defaults["decomposition_dimensionality"]``.
        random: If false, a fixed random state (42) is used; it is only
            passed to the t-SNE model.

    Returns:
        A list starting with the decomposed values, followed by the
        decomposed other value sets if ``other_value_sets`` was requested
        and by the decomposed centroids if ``centroids`` was requested
        (each possibly ``None``).

    Raises:
        ValueError: If the method is not one of PCA, SVD, ICA, or t-SNE.
    """
    # The empty-dict defaults are only used as "not requested" markers and
    # are never mutated. The flags are computed up front since both
    # arguments may be re-bound below.
    other_value_sets_requested = not (
        isinstance(other_value_sets, dict) and not other_value_sets)
    centroids_requested = not (
        isinstance(centroids, dict) and not centroids)

    if method is None:
        method = defaults["decomposition_method"]
    method = proper_string(
        normalise_string(method), DECOMPOSITION_METHOD_NAMES)

    if number_of_components is None:
        number_of_components = defaults["decomposition_dimensionality"]

    # Wrap a bare matrix in a dictionary so that both forms share one code
    # path; it is unwrapped again before returning.
    other_values_provided_as_dictionary = True
    if (other_value_sets is not None
            and not isinstance(other_value_sets, dict)):
        other_value_sets = {"unknown": other_value_sets}
        other_values_provided_as_dictionary = False

    random_state = None if random else 42

    if method == "PCA":
        if (values.shape[1] <= MAXIMUM_FEATURE_SIZE_FOR_NORMAL_PCA
                and not scipy.sparse.issparse(values)):
            model = PCA(n_components=number_of_components)
        else:
            model = IncrementalPCA(
                n_components=number_of_components, batch_size=100)
    elif method == "SVD":
        model = TruncatedSVD(n_components=number_of_components)
    elif method == "ICA":
        model = FastICA(n_components=number_of_components)
    elif method == "t-SNE":
        tsne_method = "barnes_hut" if number_of_components < 4 else "exact"
        model = TSNE(
            n_components=number_of_components,
            method=tsne_method,
            random_state=random_state,
        )
    else:
        raise ValueError("Method `{}` not found.".format(method))

    values_decomposed = model.fit_transform(values)

    # The t-SNE model is not used to transform further data.
    if other_value_sets and method != "t-SNE":
        other_value_sets_decomposed = {
            other_set_name: (
                model.transform(other_values)
                if other_values is not None else None
            )
            for other_set_name, other_values in other_value_sets.items()
        }
    else:
        other_value_sets_decomposed = None

    if (other_value_sets_decomposed
            and not other_values_provided_as_dictionary):
        other_value_sets_decomposed = other_value_sets_decomposed["unknown"]

    if centroids is not None and method == "PCA":
        centroids_decomposed = _project_centroids_with_pca(
            model, centroids, number_of_components)
    else:
        centroids_decomposed = None

    output = [values_decomposed]
    if other_value_sets_requested:
        output.append(other_value_sets_decomposed)
    if centroids_requested:
        output.append(centroids_decomposed)

    return output


def _project_centroids_with_pca(model, centroids, number_of_components):
    """Project centroid means and covariance matrices with a fitted PCA."""
    # Only supports centroids without data sets as top levels
    if "means" in centroids:
        centroids = {"unknown": centroids}

    components = model.components_
    centroids_decomposed = {}

    for distribution, distribution_centroids in centroids.items():
        if not distribution_centroids:
            centroids_decomposed[distribution] = None
            continue

        centroids_distribution_decomposed = {}

        for parameter, parameter_values in distribution_centroids.items():
            if parameter == "means":
                shape = numpy.array(parameter_values.shape)
                original_dimension = shape[-1]
                # Flatten leading axes so that the model sees a 2-D matrix.
                flattened_values = parameter_values.reshape(
                    -1, original_dimension)
                projected_values = model.transform(flattened_values)
                shape[-1] = number_of_components
                new_parameter_values = projected_values.reshape(shape)
            elif parameter == "covariance_matrices":
                shape = numpy.array(parameter_values.shape)
                original_dimension = shape[-1]
                flattened_values = parameter_values.reshape(
                    -1, original_dimension, original_dimension)
                # Sized by the number of components rather than a fixed 2,
                # so projections to any dimensionality fit.
                projected_values = numpy.empty(shape=(
                    flattened_values.shape[0],
                    number_of_components,
                    number_of_components,
                ))
                for i, covariance_matrix in enumerate(flattened_values):
                    projected_values[i] = (
                        components @ covariance_matrix @ components.T)
                shape[-2:] = number_of_components
                new_parameter_values = projected_values.reshape(shape)
            else:
                new_parameter_values = parameter_values

            centroids_distribution_decomposed[parameter] = (
                new_parameter_values)

        centroids_decomposed[distribution] = (
            centroids_distribution_decomposed)

    # Undo the wrapping done above for centroids given without a top level.
    if "unknown" in centroids_decomposed:
        centroids_decomposed = centroids_decomposed["unknown"]

    return centroids_decomposed
def analyse_decompositions(data_sets, other_data_sets=None, centroids=None,
                           colouring_data_set=None, sampled_data_set=None,
                           decomposition_methods=None,
                           highlight_feature_indices=None,
                           symbol=None, title="data set", specifier=None,
                           analysis_level=None, export_options=None,
                           analyses_directory=None):
    """Decompose data sets to two components and plot them.

    For each data set and each decomposition method, the values are
    decomposed with ``decompose`` and plotted with ``figures.plot_values``
    using several colour codings (none, samples, labels, superset labels,
    classes, batches, predictions, count sum, and highlighted features),
    depending on what the colouring data set provides. Data sets that
    already have exactly two features are also plotted without any
    decomposition; data sets with at most one feature are skipped.

    Args:
        data_sets: A data set, or a list, tuple, or dictionary of data
            sets.
        other_data_sets: Optional data set(s) paired with ``data_sets``
            whose values are plotted instead of those of the data sets.
        centroids: Optional centroids; only used for data sets whose
            ``version`` is ``"z"`` or ``"z1"``.
        colouring_data_set: Data set used for colour coding; when not
            given, each data set is used to colour its own plots.
        sampled_data_set: Optional data set of samples to plot on top.
        decomposition_methods: Method name or list/tuple of method names;
            defaults to ``defaults["decomposition_method"]``.
        highlight_feature_indices: Feature index or list/tuple of feature
            indices to colour by; defaults to
            ``defaults["analyses"]["highlight_feature_indices"]``.
        symbol: Symbol for axis labels; when falsy, the specification
            returned by ``specifier`` is used instead.
        title: Base title used in messages and to derive figure names.
        specifier: Optional callable mapping a data set to a specification
            that is appended to the name and title.
        analysis_level: Per-class plots are only made at level
            ``"extensive"``; defaults to
            ``defaults["analyses"]["analysis_level"]``.
        export_options: Passed to ``figures.save_figure`` as ``options``.
        analyses_directory: Base directory for figures; defaults to
            ``defaults["analyses"]["directory"]``.

    Raises:
        ValueError: If ``data_sets`` and ``other_data_sets`` differ in
            length.
    """
    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]

    centroids_original = centroids
    # Kept separately so that the fall-back to the current data set is
    # decided anew for every data set.
    requested_colouring_data_set = colouring_data_set

    if isinstance(data_sets, dict):
        data_sets = list(data_sets.values())

    if not isinstance(data_sets, (list, tuple)):
        data_sets = [data_sets]

    if other_data_sets is None:
        other_data_sets = [None] * len(data_sets)
    elif not isinstance(other_data_sets, (list, tuple)):
        other_data_sets = [other_data_sets]

    if len(data_sets) != len(other_data_sets):
        raise ValueError(
            "Lists of data sets and alternative data sets do not have the "
            "same length.")

    specification = None

    base_symbol = symbol
    original_title = title

    if decomposition_methods is None:
        decomposition_methods = [defaults["decomposition_method"]]
    elif not isinstance(decomposition_methods, (list, tuple)):
        decomposition_methods = [decomposition_methods]
    else:
        # `list()` copies lists and also accepts tuples.
        decomposition_methods = list(decomposition_methods)
    # `None` stands for plotting two-feature data sets as they are.
    decomposition_methods.insert(0, None)

    if highlight_feature_indices is None:
        highlight_feature_indices = defaults["analyses"][
            "highlight_feature_indices"]
    elif not isinstance(highlight_feature_indices, (list, tuple)):
        highlight_feature_indices = [highlight_feature_indices]
    else:
        highlight_feature_indices = list(highlight_feature_indices)

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    for data_set, other_data_set in zip(data_sets, other_data_sets):

        if data_set.values.shape[1] <= 1:
            continue

        title = original_title
        name = normalise_string(title)

        if specifier:
            specification = specifier(data_set)

        if specification:
            name += "-" + str(specification)
            title += " for " + str(specification)

        title += " set"

        colouring_data_set = requested_colouring_data_set
        if not colouring_data_set:
            colouring_data_set = data_set

        if data_set.version in ["z", "z1"]:
            centroids = copy.deepcopy(centroids_original)
        else:
            centroids = None

        if other_data_set:
            title = "{} set values in {}".format(
                other_data_set.version, title)
            name = other_data_set.version + "-" + name

        decompositions_directory = os.path.join(analyses_directory, name)

        for decomposition_method in decomposition_methods:

            other_values = other_data_set.values if other_data_set else None
            sampled_values = (
                sampled_data_set.values if sampled_data_set else None)

            values_decomposed = data_set.values
            other_values_decomposed = other_values
            sampled_values_decomposed = sampled_values
            centroids_decomposed = centroids

            if not decomposition_method:
                # Without a method, only two-feature data sets can be
                # plotted directly.
                if data_set.number_of_features != 2:
                    continue
            else:
                decomposition_method = proper_string(
                    decomposition_method, DECOMPOSITION_METHOD_NAMES)

                other_value_sets_decomposed = {}
                if other_values is not None:
                    other_value_sets_decomposed["other"] = other_values
                if sampled_values is not None:
                    other_value_sets_decomposed["sampled"] = sampled_values
                if not other_value_sets_decomposed:
                    other_value_sets_decomposed = None

                if decomposition_method == "t-SNE":
                    if (data_set.number_of_examples
                            > MAXIMUM_NUMBER_OF_EXAMPLES_FOR_TSNE):
                        print(
                            "The number of examples for {}".format(title),
                            "is too large to decompose it",
                            "using {}. Skipping.".format(
                                decomposition_method))
                        print()
                        continue

                    elif (data_set.number_of_features
                            > MAXIMUM_NUMBER_OF_FEATURES_FOR_TSNE):
                        number_of_pca_components_before_tsne = min(
                            MAXIMUM_NUMBER_OF_PCA_COMPONENTS_BEFORE_TSNE,
                            data_set.number_of_examples - 1)
                        print(
                            "The number of features for {}".format(title),
                            "is too large to decompose it",
                            "using {} in due time.".format(
                                decomposition_method))
                        print("Decomposing {} to {} components using PCA "
                              "beforehand.".format(
                                  title,
                                  number_of_pca_components_before_tsne))
                        decompose_time_start = time()
                        (values_decomposed, other_value_sets_decomposed,
                         centroids_decomposed) = decompose(
                            values_decomposed,
                            other_value_sets=other_value_sets_decomposed,
                            centroids=centroids_decomposed,
                            method="pca",
                            number_of_components=(
                                number_of_pca_components_before_tsne))
                        decompose_duration = time() - decompose_time_start
                        print("{} pre-decomposed ({}).".format(
                            capitalise_string(title),
                            format_duration(decompose_duration)))

                    else:
                        # `toarray()` replaces the deprecated `.A`
                        # shorthand of sparse matrices.
                        if scipy.sparse.issparse(values_decomposed):
                            values_decomposed = values_decomposed.toarray()
                        if scipy.sparse.issparse(other_values_decomposed):
                            other_values_decomposed = (
                                other_values_decomposed.toarray())
                        if scipy.sparse.issparse(sampled_values_decomposed):
                            sampled_values_decomposed = (
                                sampled_values_decomposed.toarray())

                print("Decomposing {} using {}.".format(
                    title, decomposition_method))
                decompose_time_start = time()
                (values_decomposed, other_value_sets_decomposed,
                 centroids_decomposed) = decompose(
                    values_decomposed,
                    other_value_sets=other_value_sets_decomposed,
                    centroids=centroids_decomposed,
                    method=decomposition_method,
                    number_of_components=2)
                decompose_duration = time() - decompose_time_start
                print("{} decomposed ({}).".format(
                    capitalise_string(title),
                    format_duration(decompose_duration)))
                print()

                # NOTE(review): `decompose` returns no other value sets
                # for t-SNE, in which case the other and sampled values
                # stay un-decomposed here -- confirm this is intended.
                if other_value_sets_decomposed:
                    other_values_decomposed = (
                        other_value_sets_decomposed.get("other"))
                    sampled_values_decomposed = (
                        other_value_sets_decomposed.get("sampled"))

            symbol = base_symbol if base_symbol else specification

            x_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=1,
                decomposition_method=decomposition_method,
            )
            y_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=2,
                decomposition_method=decomposition_method,
            )

            figure_labels = {
                "title": decomposition_method,
                "x label": x_label,
                "y label": y_label
            }

            if other_data_set:
                plot_values_decomposed = other_values_decomposed
            else:
                plot_values_decomposed = values_decomposed

            if plot_values_decomposed is None:
                print("No values to plot.\n")
                return

            print("Plotting {}{}.".format(
                "decomposed " if decomposition_method else "", title))

            # Options shared by every plot of this decomposition.
            plot_options = {
                "centroids": centroids_decomposed,
                "figure_labels": figure_labels,
                "example_tag": data_set.tags["example"],
                "name": name,
            }
            coloured_plot_options = dict(
                plot_options, colouring_data_set=colouring_data_set)
            save_options = {
                "options": export_options,
                "directory": decompositions_directory,
            }

            # No colour-coding
            plot_time_start = time()
            _plot_and_save_decomposition(
                plot_values_decomposed, plot_options, save_options)
            _report_decomposition_plotted(title, "", plot_time_start)

            # Samples
            if sampled_data_set:
                plot_time_start = time()
                _plot_and_save_decomposition(
                    plot_values_decomposed, plot_options, save_options,
                    sampled_values=sampled_values_decomposed)
                _report_decomposition_plotted(
                    title, " (with samples)", plot_time_start)

            # Labels
            if colouring_data_set.labels is not None:
                plot_time_start = time()
                _plot_and_save_decomposition(
                    plot_values_decomposed, coloured_plot_options,
                    save_options, colour_coding="labels")
                _report_decomposition_plotted(
                    title, " (with labels)", plot_time_start)

                # Superset labels
                if colouring_data_set.superset_labels is not None:
                    plot_time_start = time()
                    _plot_and_save_decomposition(
                        plot_values_decomposed, coloured_plot_options,
                        save_options, colour_coding="superset labels")
                    _report_decomposition_plotted(
                        title, " (with superset labels)", plot_time_start)

                # For each class
                if analysis_level == "extensive":
                    if colouring_data_set.number_of_classes <= 10:
                        plot_time_start = time()
                        for class_name in colouring_data_set.class_names:
                            _plot_and_save_decomposition(
                                plot_values_decomposed,
                                coloured_plot_options, save_options,
                                colour_coding="class",
                                class_name=class_name)
                        _report_decomposition_plotted(
                            title, " (for each class)", plot_time_start)

                    # The class count is read from the same data set whose
                    # superset class names are iterated over.
                    if (colouring_data_set.superset_labels is not None
                            and (colouring_data_set
                                 .number_of_superset_classes) <= 10):
                        plot_time_start = time()
                        for superset_class_name in (
                                colouring_data_set.superset_class_names):
                            _plot_and_save_decomposition(
                                plot_values_decomposed,
                                coloured_plot_options, save_options,
                                colour_coding="superset class",
                                class_name=superset_class_name)
                        _report_decomposition_plotted(
                            title, " (for each superset class)",
                            plot_time_start)

            # Batches
            if colouring_data_set.has_batches:
                plot_time_start = time()
                _plot_and_save_decomposition(
                    plot_values_decomposed, coloured_plot_options,
                    save_options, colour_coding="batches")
                _report_decomposition_plotted(
                    title, " (with batches)", plot_time_start)

            # Cluster IDs
            if colouring_data_set.has_predicted_cluster_ids:
                plot_time_start = time()
                _plot_and_save_decomposition(
                    plot_values_decomposed, coloured_plot_options,
                    save_options, colour_coding="predicted cluster IDs")
                _report_decomposition_plotted(
                    title, " (with predicted cluster IDs)",
                    plot_time_start)

            # Predicted labels
            if colouring_data_set.has_predicted_labels:
                plot_time_start = time()
                _plot_and_save_decomposition(
                    plot_values_decomposed, coloured_plot_options,
                    save_options, colour_coding="predicted labels")
                _report_decomposition_plotted(
                    title, " (with predicted labels)", plot_time_start)

            if colouring_data_set.has_predicted_superset_labels:
                plot_time_start = time()
                _plot_and_save_decomposition(
                    plot_values_decomposed, coloured_plot_options,
                    save_options,
                    colour_coding="predicted superset labels")
                _report_decomposition_plotted(
                    title, " (with predicted superset labels)",
                    plot_time_start)

            # Count sum
            plot_time_start = time()
            _plot_and_save_decomposition(
                plot_values_decomposed, coloured_plot_options,
                save_options, colour_coding="count sum")
            _report_decomposition_plotted(
                title, " (with count sum)", plot_time_start)

            # Features
            for feature_index in highlight_feature_indices:
                plot_time_start = time()
                _plot_and_save_decomposition(
                    plot_values_decomposed, coloured_plot_options,
                    save_options, colour_coding="feature",
                    feature_index=feature_index)
                _report_decomposition_plotted(
                    title,
                    " (with {})".format(
                        data_set.feature_names[feature_index]),
                    plot_time_start)

            print()


def _plot_and_save_decomposition(values, plot_options, save_options,
                                 **extra_plot_options):
    """Plot values with `figures.plot_values` and save the figure."""
    figure, figure_name = figures.plot_values(
        values, **plot_options, **extra_plot_options)
    figures.save_figure(figure=figure, name=figure_name, **save_options)


def _report_decomposition_plotted(title, qualifier, plot_time_start):
    """Print that plots for `title` were saved and how long it took."""
    plot_duration = time() - plot_time_start
    print(" {}{} plotted and saved ({}).".format(
        capitalise_string(title), qualifier,
        format_duration(plot_duration)))