def parse_distribution(distribution, model_type=None):
    """Resolve a distribution name to the matching key of a registry.

    The registry searched depends on `model_type`: `DISTRIBUTIONS` when it
    is ``None``, `LATENT_DISTRIBUTIONS` for ``"VAE"`` and
    `GAUSSIAN_MIXTURE_DISTRIBUTIONS` for ``"GMVAE"``. Names are compared
    after being passed through `normalise_string`.

    Returns:
        The registry key whose normalised form equals the normalised
        `distribution`. If several keys match, the last one is returned.

    Raises:
        TypeError: If `model_type` is neither ``None`` nor a string.
        ValueError: If `model_type` is an unknown string, or if no registry
            key matches `distribution`.
    """
    distribution = normalise_string(distribution)

    if model_type is None:
        kind = "reconstruction"
        distributions = DISTRIBUTIONS
    elif not isinstance(model_type, str):
        raise TypeError("`model_type` should be a string.")
    else:
        kind = "latent"
        if model_type == "VAE":
            distributions = LATENT_DISTRIBUTIONS
        elif model_type == "GMVAE":
            distributions = GAUSSIAN_MIXTURE_DISTRIBUTIONS
        else:
            raise ValueError("Model type not found.")

    # Every key is checked (no early exit), so the last match wins.
    matching_names = [
        candidate for candidate in list(distributions.keys())
        if normalise_string(candidate) == distribution
    ]

    if not matching_names:
        model_suffix = " for {}".format(model_type) if model_type else ""
        raise ValueError("{} distribution `{}` not supported{}.".format(
            kind.capitalize(), distribution, model_suffix))

    return matching_names[-1]
def _build_preprocessed_path(self, map_features=None,
                             preprocessing_methods=None,
                             feature_selection=None,
                             feature_selection_parameters=None,
                             example_filter=None,
                             example_filter_parameters=None,
                             splitting_method=None,
                             splitting_fraction=None,
                             split_indices=None):
    """Build the path of the preprocessed file for a given configuration.

    The path starts with ``self.preprocess_directory`` joined with
    ``self.name``; one part is then appended for each enabled option, the
    parts are joined with "-", and `PREPROCESSED_EXTENSION` is added.

    Parts are added in this order: feature mapping, feature selection (with
    its parameters), example filter (with its parameters), preprocessing
    methods, and splitting.

    Returns:
        The path as a string.
    """

    def _part_with_parameters(method_name, method_parameters):
        # "<method>_<parameter>_<parameter>...", every piece normalised.
        part = normalise_string(method_name)
        if method_parameters:
            for parameter in method_parameters:
                part += "_" + normalise_string(str(parameter))
        return part

    base_path = os.path.join(self.preprocess_directory, self.name)
    filename_parts = [base_path]

    if map_features:
        filename_parts.append("features_mapped")

    if feature_selection:
        filename_parts.append(_part_with_parameters(
            feature_selection, feature_selection_parameters))

    if example_filter:
        filename_parts.append(_part_with_parameters(
            example_filter, example_filter_parameters))

    if preprocessing_methods:
        filename_parts.extend(map(normalise_string, preprocessing_methods))

    if splitting_method:
        filename_parts.append("split")

        # `split_indices` defaults to None; guard it so that calling with
        # splitting_method="indices" but no indices does not fail on len().
        has_three_index_sets = (
            splitting_method == "indices"
            and split_indices is not None
            and len(split_indices) == 3
        )

        if has_three_index_sets or not splitting_fraction:
            filename_parts.append(splitting_method)
        else:
            filename_parts.append(
                "{}_{}".format(splitting_method, splitting_fraction))

    return "-".join(filename_parts) + PREPROCESSED_EXTENSION
def parse_model_versions(proposed_versions):
    """Translate model version names or aliases to canonical version names.

    Args:
        proposed_versions: A single version or a list of versions. The
            exact value ``"all"`` (or ``["all"]``) selects every version.

    Returns:
        A list of canonical version names, in the order proposed (or in
        definition order for "all").

    Raises:
        ValueError: If a proposed version is neither a canonical name nor
            one of its aliases after normalisation.
    """
    version_alias_sets = {
        "end_of_training": ["eot", "end", "finish", "finished"],
        "best_model": ["bm", "best", "optimal_parameters", "op", "optimal"],
        "early_stopping": ["es", "early", "stop", "stopped"]
    }

    if not isinstance(proposed_versions, list):
        proposed_versions = [proposed_versions]

    if proposed_versions == ["all"]:
        return list(version_alias_sets.keys())

    # Flat lookup table: every canonical name and alias -> canonical name.
    canonical_version_for = {}
    for version, version_aliases in version_alias_sets.items():
        canonical_version_for[version] = version
        for version_alias in version_aliases:
            canonical_version_for[version_alias] = version

    parsed_versions = []
    for proposed_version in proposed_versions:
        lookup_key = normalise_string(proposed_version)
        if lookup_key not in canonical_version_for:
            raise ValueError(
                "`{}` is not a model version.".format(proposed_version))
        parsed_versions.append(canonical_version_for[lookup_key])

    return parsed_versions
def save(data_dictionary, tables_file, group_title=None):
    """Write the entries of `data_dictionary` into `tables_file`.

    When `group_title` is given, a new group is created directly under "/"
    (named with the normalised title); otherwise entries go into the file
    root. Each entry is dispatched to a saver based on its value type
    first and its title second; entries whose title ends with "set" are
    saved recursively as their own group.

    Raises:
        NotImplementedError: If no saver applies to an entry.
    """
    if group_title:
        group = tables_file.create_group(
            "/", normalise_string(group_title), group_title)
    else:
        group = tables_file.root

    for title, value in data_dictionary.items():
        # The order of these checks is significant: type checks take
        # precedence over title-based checks.
        if isinstance(value, scipy.sparse.csr_matrix):
            _save_sparse_matrix(value, title, group, tables_file)
            continue
        if isinstance(value, (numpy.ndarray, list)):
            _save_array(value, title, group, tables_file)
            continue
        if title == "split indices":
            _save_split_indices(value, title, group, tables_file)
            continue
        if title == "feature mapping":
            _save_feature_mapping(value, title, group, tables_file)
            continue
        if value is None:
            # A missing value is stored as the string "None".
            _save_string(str(value), title, group, tables_file)
            continue
        if title.endswith("set"):
            save(value, tables_file, group_title=title)
            continue
        raise NotImplementedError(
            "Saving type {} for title \"{}\" has not been implemented."
            .format(type(value), title))
def decorator(function):
    """Register `function` in `PREDICTION_METHODS` and return it unchanged.

    The registry key is `name`, taken from the enclosing scope. Two aliases
    are stored with it: the normalised name, and the normalised name with
    underscores removed.
    """
    normalised_name = normalise_string(name)
    PREDICTION_METHODS[name] = {
        "aliases": {normalised_name, normalised_name.replace("_", "")},
        "function": function
    }
    return function
def name(self):
    """Return an identifier built from the method, cluster count and kind.

    The method and the number of clusters are always included. The
    training set kind is added only when it is set and differs from
    "training". Each part is converted to a string, normalised and
    stripped of underscores; the parts are then joined with "_".
    """
    parts = [self.method, self.number_of_clusters]

    if self.training_set_kind and self.training_set_kind != "training":
        parts.append(self.training_set_kind)

    return "_".join(
        normalise_string(str(part)).replace("_", "") for part in parts)
def _save_sparse_matrix(sparse_matrix, title, group, tables_file):
    """Save a sparse matrix as a group of four arrays.

    A sub-group named after the normalised `title` is created in `group`,
    and the matrix attributes ``data``, ``indices``, ``indptr`` and
    ``shape`` are each converted to a numpy array and stored in it with
    `_save_array`.
    """
    matrix_group = tables_file.create_group(
        group, normalise_string(title), title)

    for attribute_name in ("data", "indices", "indptr", "shape"):
        attribute_values = numpy.array(getattr(sparse_matrix, attribute_name))
        _save_array(attribute_values, attribute_name, matrix_group,
                    tables_file)
def _save_split_indices(split_indices, title, group, tables_file):
    """Save a mapping of subset names to slices as start/stop arrays.

    A sub-group named after the normalised `title` is created in `group`.
    For every subset, a two-element array ``[start, stop]`` taken from its
    slice is stored under the subset name with `_save_array`.
    """
    indices_group = tables_file.create_group(
        group, normalise_string(title), title)

    for subset_name, subset_slice in split_indices.items():
        slice_bounds = numpy.array([subset_slice.start, subset_slice.stop])
        _save_array(slice_bounds, subset_name, indices_group, tables_file)
def plot_centroid_covariance_matrices_evolution(covariance_matrices,
                                                distribution, name=None):
    """Plot, per centroid, the covariance determinant over epochs.

    The determinant of each covariance matrix is computed as the product
    of its diagonal, so only the diagonal entries are used.

    Args:
        covariance_matrices: Array whose shape unpacks into four
            dimensions; the first two are used as epochs and centroids.
        distribution: Distribution name; normalised and used in the figure
            name and the axis label.
        name: Optional extra name(s) passed to
            `saving.build_figure_name`.

    Returns:
        A tuple of the figure and the figure name.
    """
    distribution = normalise_string(distribution)

    figure_name = "centroids_evolution-{}-covariance_matrices".format(
        distribution)
    figure_name = saving.build_figure_name(figure_name, name)

    y_label = _axis_label_for_symbol(
        symbol="\\Sigma",
        distribution=distribution,
        prefix="|",
        suffix="(y = k)|"
    )

    n_epochs, n_centroids, __, __ = covariance_matrices.shape

    # Product of the diagonal of every (epoch, centroid) matrix.
    determinants = numpy.diagonal(
        covariance_matrices, axis1=2, axis2=3).prod(axis=2).astype(float)

    # Default to a linear axis so that the scale is always defined; a
    # logarithmic axis is only valid when every determinant is positive.
    y_scale = "linear"
    if (determinants > 0).all():
        line_range_ratio = determinants.max(axis=0) / determinants.min(axis=0)
        range_ratio = line_range_ratio.max() / line_range_ratio.min()
        if range_ratio > 1e2:
            y_scale = "log"

    centroids_palette = style.darker_palette(n_centroids)
    epochs = numpy.arange(n_epochs) + 1

    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.despine()

    for k in range(n_centroids):
        axis.plot(
            epochs,
            determinants[:, k],
            color=centroids_palette[k],
            label="$k = {}$".format(k)
        )

    axis.set_xlabel("Epochs")
    axis.set_ylabel(y_label)
    axis.set_yscale(y_scale)
    axis.legend(loc="best")

    return figure, figure_name
def _axis_label_for_symbol(symbol, coordinate=None, decomposition_method=None,
                           distribution=None, prefix="", suffix=""):
    """Compose a math-mode axis label for `symbol`.

    The label is ``$<prefix><symbol><distribution part><coordinate part>
    <suffix>$``. The distribution part is ``_\\theta`` for "prior" and
    ``_\\phi`` for "posterior"; any other value adds nothing. The
    coordinate part is a superscript in parentheses when a distribution
    part is present, and a subscript otherwise. A decomposition method, if
    given, is resolved through `proper_string` and its label is set in
    ``\\mathrm`` in front of the coordinate.

    Returns:
        The label as a string delimited by "$".
    """
    decomposition_label = ""
    if decomposition_method:
        decomposition_method = proper_string(
            normalise_string(decomposition_method),
            DECOMPOSITION_METHOD_NAMES
        )
        decomposition_label = DECOMPOSITION_METHOD_LABEL[decomposition_method]

    if decomposition_label:
        decomposition_label = "\\mathrm{{{}}}".format(decomposition_label)

    coordinate_text = (
        "{{{} {}}}".format(decomposition_label, coordinate)
        if coordinate else ""
    )

    if distribution == "prior":
        distribution_symbol = "\\theta"
    elif distribution == "posterior":
        distribution_symbol = "\\phi"
    else:
        distribution_symbol = ""

    distribution_position = "_" if distribution_symbol else ""

    if not coordinate_text:
        coordinate_position = ""
    elif distribution_symbol:
        # The subscript is taken by the distribution, so the coordinate
        # becomes a parenthesised superscript.
        coordinate_position = "^"
        coordinate_text = "{{(" + coordinate_text + ")}}"
    else:
        coordinate_position = "_"
        coordinate_text = "{{" + coordinate_text + "}}"

    return (
        "$" + prefix + symbol + distribution_position + distribution_symbol
        + coordinate_position + coordinate_text + suffix + "$"
    )
def _save_array(array, title, group, tables_file):
    """Store `array` as a chunked array in `group`.

    The node name is the normalised `title`. Lists are converted to numpy
    arrays and get the suffix "_was_list" in their node name. Arrays of
    unicode strings are UTF-8 encoded element-wise and stored as byte
    strings.
    """
    node_name = normalise_string(title)

    if isinstance(array, list):
        array = numpy.array(array)
        node_name += "_was_list"

    if array.dtype.char == "U":
        to_utf8 = numpy.vectorize(lambda s: s.encode("UTF-8"))
        array = to_utf8(array).astype("S")

    data_store = tables_file.create_carray(
        group,
        node_name,
        tables.Atom.from_dtype(array.dtype),
        array.shape,
        title
    )
    data_store[:] = array
def _find_list_of_names(list_name_guesses, kind):
    """Look up a list of names in `table` using guessed key names.

    `table` comes from the enclosing scope. When no guesses are given, the
    defaults in ``LIST_NAME_GUESSES[kind]`` are used; a single guess is
    wrapped in a list. Guesses are tried in order against the normalised
    keys of `table`.

    Returns:
        The value found for the first guess that yields one (the last
        matching key wins for that guess), or ``None`` if nothing matches.
    """
    if list_name_guesses is None:
        guesses = LIST_NAME_GUESSES[kind]
    elif isinstance(list_name_guesses, list):
        guesses = list_name_guesses
    else:
        guesses = [list_name_guesses]

    for guess in guesses:
        found_names = None
        # No early exit: a later matching key overrides an earlier one.
        for table_key in table:
            if guess == normalise_string(table_key):
                found_names = table[table_key]
        if found_names is not None:
            return found_names

    return None
def find_data_set(name, directory):
    """Find the title and specifications of the data set called `name`.

    A JSON file at ``<directory>/<name>/<name>.json`` is tried first. If
    it does not exist or yields no title, the loaded data set metadata is
    searched for a title equal to `name` after normalisation.

    Returns:
        A tuple of the data set title and its specifications.

    Raises:
        KeyError: If no title could be determined.
    """
    data_sets = _load_data_set_metadata()

    title, data_set = None, None

    json_path = os.path.join(directory, name, name + ".json")
    if os.path.exists(json_path):
        title, data_set = _data_set_from_json_file(json_path)

    if not title:
        first_match = next(
            (
                (candidate_title, candidate_specifications)
                for candidate_title, candidate_specifications
                in data_sets.items()
                if normalise_string(candidate_title) == normalise_string(name)
            ),
            None
        )
        if first_match is not None:
            title, data_set = first_match

    if not title:
        raise KeyError("Data set not found.")

    return title, data_set
def _find_list_of_names(list_name_guesses, kind):
    """Look up a list of names in `table`, generating defaults if needed.

    `table` and `n` come from the enclosing scope. When no guesses are
    given, the defaults in ``LIST_NAME_GUESSES[kind]`` are used; a single
    guess is wrapped in a list. Guesses are tried in order against the
    normalised keys of `table`.

    Returns:
        The value found for the first guess that yields one (the last
        matching key wins for that guess). If nothing matches, an array of
        generated names "<kind> 1", "<kind> 2", ... with ``n[kind]``
        entries.
    """
    if list_name_guesses is None:
        list_name_guesses = LIST_NAME_GUESSES[kind]
    elif not isinstance(list_name_guesses, list):
        list_name_guesses = [list_name_guesses]

    list_of_names = None

    for list_name_guess in list_name_guesses:
        for table_key in table:
            if list_name_guess == normalise_string(table_key):
                list_of_names = table[table_key]
        if list_of_names is not None:
            break

    # NOTE(review): the previous version overwrote the search result
    # unconditionally and returned nothing. The generated names are now
    # only a fallback and the result is returned -- confirm against
    # callers.
    if list_of_names is None:
        list_of_names = numpy.array(
            ["{} {}".format(kind, i + 1) for i in range(n[kind])])

    return list_of_names
def default_feature_parameters(self):
    """Return the default parameters for the configured feature selection.

    Returns:
        ``[0.5]`` for "keep_variances_above"; a one-element list holding
        half the number of features (truncated to an integer) for
        "keep_highest_variances" when the number of features is known;
        ``None`` in every other case, including when no feature selection
        is set.
    """
    if not self.feature_selection:
        return None

    selection_method = normalise_string(self.feature_selection)

    if selection_method == "keep_variances_above":
        return [0.5]

    if (selection_method == "keep_highest_variances"
            and self.number_of_features is not None):
        return [int(self.number_of_features / 2)]

    return None
def save_values(values, name, row_names=None, column_names=None,
                directory=None):
    """Write `values` as a tab-separated table to ``<name>.tsv.gz``.

    Each "-"-separated part of `name` is normalised to form the file name.
    The output directory is created if it does not exist.

    Args:
        values: Data for the table.
        name: Base name of the file.
        row_names: Optional index for the table.
        column_names: Optional column labels for the table.
        directory: Directory to write to. Although it defaults to ``None``,
            a path is required: joining ``None`` with the file name raises
            ``TypeError``.
    """
    safe_name = "-".join(
        normalise_string(name_part) for name_part in name.split("-"))
    filename = "{}.tsv.gz".format(safe_name)
    path = os.path.join(directory, filename)

    table = pandas.DataFrame(
        data=values, index=row_names, columns=column_names)

    # `exist_ok` avoids the race between checking for the directory and
    # creating it.
    os.makedirs(directory, exist_ok=True)

    table.to_csv(path, sep="\t")
def build_figure_name(base_name, other_names=None):
    """Join a base name and optional other names into a figure name.

    Args:
        base_name: A string used as is, or a list whose first element is
            normalised and used as the base while the remaining elements
            are appended after `other_names`.
        other_names: Optional extra name or list of names. ``None``
            entries are skipped; the rest are converted to strings and
            normalised. The caller's list is never modified.

    Returns:
        The base name followed by the normalised other names, joined with
        "-". If there are no other names left after skipping ``None``
        entries, only the base name is returned.
    """
    # Work on a private list so that the argument is not mutated and a
    # single non-list name can be combined with a list `base_name`.
    if not other_names:
        name_parts = []
    elif isinstance(other_names, list):
        name_parts = list(other_names)
    else:
        name_parts = [other_names]

    if isinstance(base_name, list):
        name_parts.extend(base_name[1:])
        base_name = normalise_string(base_name[0])

    figure_name = base_name

    name_parts = [str(part) for part in name_parts if part is not None]

    # Skip the separator entirely when nothing is left to append.
    if name_parts:
        figure_name += "-" + "-".join(map(normalise_string, name_parts))

    return figure_name
def parse_input(input_file_or_name):
    """Interpret the input as a JSON specification, a data file or a name.

    * A path ending in ".json" is loaded as the data set dictionary. When
      it has no "URLs" entry, its "values" entry (required) and "labels"
      entry (optional) are joined onto the directory of the JSON file.
    * Any other existing file becomes the "values" of a new dictionary,
      with the format taken from the file extension without its first
      character, or ``None`` when there is no extension.
    * Anything else is treated as a data set name and normalised; the
      dictionary is then ``None``.

    Returns:
        A tuple of the name and the data set dictionary (or ``None``).

    Raises:
        KeyError: If a JSON specification has neither "URLs" nor "values".
    """
    if input_file_or_name.endswith(".json"):
        json_path = input_file_or_name

        with open(json_path, "r") as json_file:
            data_set_dictionary = json.load(json_file)

        name = _base_name(json_path)

        if "URLs" not in data_set_dictionary:
            if "values" not in data_set_dictionary:
                raise KeyError("Missing path or URL to values.")

            # Local paths are given relative to the JSON file.
            json_directory = os.path.dirname(json_path)
            data_set_dictionary["values"] = os.path.join(
                json_directory, data_set_dictionary["values"])

            if "labels" in data_set_dictionary:
                data_set_dictionary["labels"] = os.path.join(
                    json_directory, data_set_dictionary["labels"])

        return name, data_set_dictionary

    if os.path.isfile(input_file_or_name):
        file_path = input_file_or_name
        file_extension = extension(os.path.basename(file_path))
        data_format = file_extension[1:] if file_extension else None
        name = _base_name(file_path)
        return name, {"values": file_path, "format": data_format}

    return normalise_string(input_file_or_name), None
def _save_feature_mapping(feature_mapping, title, group, tables_file):
    """Save a feature mapping as three flat arrays in a new sub-group.

    The mapping is flattened into the feature names, the number of IDs per
    feature, and all feature IDs concatenated in mapping order. The three
    lists are stored as "feature_names", "feature_counts" and
    "feature_ids" in a sub-group named after the normalised `title`.
    """
    mapping_group = tables_file.create_group(
        group, normalise_string(title), title)

    feature_lists = {
        "feature_names": [],
        "feature_counts": [],
        "feature_ids": []
    }

    for feature_name, feature_id_set in feature_mapping.items():
        feature_lists["feature_names"].append(feature_name)
        feature_lists["feature_counts"].append(len(feature_id_set))
        feature_lists["feature_ids"].extend(feature_id_set)

    for list_name, list_values in feature_lists.items():
        _save_array(numpy.array(list_values), list_name, mapping_group,
                    tables_file)
def __init__(self, method, number_of_clusters=None, training_set_kind=None):
    """Store the prediction method, cluster count and training set kind.

    `method` is resolved with `proper_string` against the aliases
    registered in `PREDICTION_METHODS`. A truthy `training_set_kind` is
    normalised before it is stored.

    Raises:
        ValueError: If the resolved method is not a registered prediction
            method.
        TypeError: If `number_of_clusters` is ``None``.
    """
    method_aliases = {}
    for method_name, specifications in PREDICTION_METHODS.items():
        method_aliases[method_name] = specifications["aliases"]

    method = proper_string(method, method_aliases)

    if method not in PREDICTION_METHODS:
        raise ValueError(
            "Prediction method `{}` not found.".format(method))

    if number_of_clusters is None:
        raise TypeError("Number of clusters not set.")

    self.method = method
    self.number_of_clusters = number_of_clusters
    self.training_set_kind = (
        normalise_string(training_set_kind)
        if training_set_kind else training_set_kind
    )
def plot_centroid_probabilities_evolution(probabilities, distribution,
                                          linestyle="solid", name=None):
    """Plot the probability of each centroid over epochs.

    Args:
        probabilities: Array whose shape unpacks into epochs and
            centroids.
        distribution: Distribution name; normalised and used in the figure
            name and the axis label.
        linestyle: Line style used for every centroid.
        name: Optional extra name(s) passed to
            `saving.build_figure_name`.

    Returns:
        A tuple of the figure and the figure name.
    """
    distribution = normalise_string(distribution)

    y_label = _axis_label_for_symbol(
        symbol="\\pi", distribution=distribution, suffix="^k")

    figure_name = saving.build_figure_name(
        "centroids_evolution-{}-probabilities".format(distribution), name)

    n_epochs, n_centroids = probabilities.shape

    centroids_palette = style.darker_palette(n_centroids)
    epochs = numpy.arange(n_epochs) + 1

    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.despine()

    for centroid_index, centroid_colour in zip(
            range(n_centroids), centroids_palette):
        axis.plot(
            epochs,
            probabilities[:, centroid_index],
            color=centroid_colour,
            linestyle=linestyle,
            label="$k = {}$".format(centroid_index)
        )

    axis.set_xlabel("Epochs")
    axis.set_ylabel(y_label)
    axis.legend(loc="best")

    return figure, figure_name
def plot_centroid_means_evolution(means, distribution, decomposed=False,
                                  name=None):
    """Plot the paths of two-dimensional centroid means over epochs.

    Each centroid is drawn as a line through its means, with scatter
    points coloured by epoch. An extra scatter plot of the first centroid
    in a neutral colour map provides the colour bar for the epochs.

    Args:
        means: Array whose shape unpacks into epochs, centroids and latent
            size; the latent size must be 2.
        distribution: Distribution name; normalised and used in the figure
            name and the axis labels.
        decomposed: If true, the axis labels are marked with "PCA".
        name: Optional extra name(s) passed to
            `saving.build_figure_name`.

    Returns:
        A tuple of the figure and the figure name.

    Raises:
        ValueError: If the latent size of `means` is not 2.
    """
    symbol = "\\mu"
    decomposition_method = "PCA" if decomposed else ""

    distribution = normalise_string(distribution)
    suffix = "(y = k)"

    x_label = _axis_label_for_symbol(
        symbol=symbol,
        coordinate=1,
        decomposition_method=decomposition_method,
        distribution=distribution,
        suffix=suffix
    )
    y_label = _axis_label_for_symbol(
        symbol=symbol,
        coordinate=2,
        decomposition_method=decomposition_method,
        distribution=distribution,
        suffix=suffix
    )

    figure_name = "centroids_evolution-{}-means".format(distribution)
    figure_name = saving.build_figure_name(figure_name, name)

    n_epochs, n_centroids, latent_size = means.shape

    # Both coordinates 0 and 1 are indexed below, so exactly two latent
    # dimensions are needed (a size of 1 used to slip through this check).
    if latent_size != 2:
        raise ValueError("Dimensions of means should be 2.")

    centroids_palette = style.darker_palette(n_centroids)
    epochs = numpy.arange(n_epochs) + 1

    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.despine()

    # Drawn underneath everything else; only used for the colour bar.
    colour_bar_scatter_plot = axis.scatter(
        means[:, 0, 0], means[:, 0, 1],
        c=epochs,
        cmap=seaborn.dark_palette(style.NEUTRAL_COLOUR, as_cmap=True),
        zorder=0
    )

    for k in range(n_centroids):
        colour = centroids_palette[k]
        colour_map = seaborn.dark_palette(colour, as_cmap=True)
        axis.plot(
            means[:, k, 0], means[:, k, 1],
            color=colour,
            label="$k = {}$".format(k),
            zorder=k + 1
        )
        axis.scatter(
            means[:, k, 0], means[:, k, 1],
            c=epochs,
            cmap=colour_map,
            zorder=n_centroids + k + 1
        )

    axis.legend(loc="best")

    colour_bar = figure.colorbar(colour_bar_scatter_plot)
    colour_bar.outline.set_linewidth(0)
    colour_bar.set_label("Epochs")

    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)

    return figure, figure_name
def select_features(values_dictionary, feature_names, method=None,
                    parameters=None):
    """Select features in every version of the values using `method`.

    The selection is computed from ``values_dictionary["original"]`` and
    applied to the columns of every non-``None`` entry of the dictionary
    as well as to `feature_names`.

    Supported methods (after normalisation):

    * "remove_zeros": keep features whose column sum is non-zero.
    * "keep_variances_above": keep features with variance above
      ``parameters[0]`` (default 0.5).
    * "keep_highest_variances": keep the ``parameters[0]`` features with
      the highest variance (default: half the number of features).

    Returns:
        A tuple of the dictionary of feature-selected values and the array
        of selected feature names.

    Raises:
        TypeError: If `values_dictionary` is not a dictionary.
        ValueError: If the method is not supported.
        Exception: If the selection would not exclude any feature.
    """

    def _as_flat_array(statistic):
        # Reductions over matrix-like values can come back as
        # `numpy.matrix`; flatten those to plain one-dimensional arrays.
        if isinstance(statistic, numpy.matrix):
            statistic = statistic.A.squeeze()
        return statistic

    method = normalise_string(method)

    if not isinstance(values_dictionary, dict):
        raise TypeError("`values_dictionary` should be a dictionary.")

    print("Selecting features.")
    start_time = time()

    values = values_dictionary["original"]
    __, n_features = values.shape

    if method == "remove_zeros":
        total_feature_sum = _as_flat_array(values.sum(axis=0))
        indices = total_feature_sum != 0

    elif method == "keep_variances_above":
        variances = _as_flat_array(values.var(axis=0))
        threshold = float(parameters[0]) if parameters else 0.5
        indices = variances > threshold

    elif method == "keep_highest_variances":
        variances = _as_flat_array(values.var(axis=0))
        variance_sorted_indices = numpy.argsort(variances)
        if parameters:
            number_to_keep = int(parameters[0])
        else:
            # Default to half the features (not half the examples), in
            # line with the default feature selection parameters.
            number_to_keep = int(n_features / 2)
        indices = numpy.sort(variance_sorted_indices[-number_to_keep:])

    else:
        raise ValueError(
            "Feature selection `{}` not found.".format(method))

    # A selection that keeps every feature is treated as an error.
    if indices.dtype == bool:
        nothing_excluded = bool(indices.all())
    else:
        nothing_excluded = len(indices) == n_features
    if nothing_excluded:
        raise Exception(
            "No features excluded using feature selection {}.".format(method))

    feature_selected_values = {}
    for version, version_values in values_dictionary.items():
        if version_values is not None:
            feature_selected_values[version] = version_values[:, indices]
        else:
            feature_selected_values[version] = None

    feature_selected_feature_names = feature_names[indices]

    n_features_changed = len(feature_selected_feature_names)

    duration = time() - start_time
    print("{} features selected, {} excluded ({}).".format(
        n_features_changed,
        n_features - n_features_changed,
        format_duration(duration)
    ))

    return feature_selected_values, feature_selected_feature_names
def plot_values(values, colour_coding=None, colouring_data_set=None,
                centroids=None, sampled_values=None, class_name=None,
                feature_index=None, figure_labels=None, example_tag=None,
                name="scatter"):
    """Draw a scatter plot of the first two columns of `values`.

    The examples are shuffled with a fixed random state before plotting.
    Depending on `colour_coding` (normalised), points are coloured by
    labels/IDs/batches, by a single highlighted class, by count sum, or by
    a feature; with no colour coding they are drawn in black. Prior
    centroids and a hexbin of sampled values can be drawn on top.

    Args:
        values: Two-dimensional array or sparse matrix of examples.
        colour_coding: Optional name of the colour coding.
        colouring_data_set: Data set providing labels, palettes, counts or
            features; required whenever `colour_coding` is given.
        centroids: Optional dictionary with a "prior" entry holding
            "probabilities", "means" and "covariance_matrices".
        sampled_values: Optional values shown as a hexbin background.
        class_name: Class to highlight for class-based colour codings.
        feature_index: Index of the feature for the "feature" colour
            coding.
        figure_labels: Optional dictionary with "title", "x label" and
            "y label".
        example_tag: Unused in this function.
        name: Base of the figure name.

    Returns:
        A tuple of the figure and the figure name.

    Raises:
        ValueError: If a colour coding is given without a colouring data
            set, if the colour coding is unknown, or if the feature index
            is missing or not below the number of features.
    """
    figure_name = name

    if figure_labels:
        title = figure_labels.get("title")
        x_label = figure_labels.get("x label")
        y_label = figure_labels.get("y label")
    else:
        title = "none"
        x_label = "$x$"
        y_label = "$y$"

    if not title:
        title = "none"

    figure_name += "-" + normalise_string(title)

    if colour_coding:
        # Checked first: the data set is dereferenced right below.
        if colouring_data_set is None:
            raise ValueError("Colouring data set not given.")
        colour_coding = normalise_string(colour_coding)
        figure_name += "-" + colour_coding
        if "predicted" in colour_coding:
            if colouring_data_set.prediction_specifications:
                figure_name += "-" + (
                    colouring_data_set.prediction_specifications.name)
            else:
                # Separated with "-" like every other part of the name.
                figure_name += "-unknown_prediction_method"

    if sampled_values is not None:
        figure_name += "-samples"

    values = values.copy()[:, :2]
    if scipy.sparse.issparse(values):
        values = values.toarray()

    # Randomise examples in values to remove any prior order
    n_examples, __ = values.shape
    random_state = numpy.random.RandomState(117)
    shuffled_indices = random_state.permutation(n_examples)
    values = values[shuffled_indices]

    # Adjust marker size based on number of examples
    style._adjust_marker_size_for_scatter_plots(n_examples)

    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.despine()

    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)

    colour_map = seaborn.dark_palette(style.STANDARD_PALETTE[0], as_cmap=True)

    alpha = 1
    if sampled_values is not None:
        alpha = 0.5

    if colour_coding and (
            "labels" in colour_coding
            or "ids" in colour_coding
            or "class" in colour_coding
            or colour_coding == "batches"):

        if colour_coding == "predicted_cluster_ids":
            labels = colouring_data_set.predicted_cluster_ids
            class_names = numpy.unique(labels).tolist()
            number_of_classes = len(class_names)
            class_palette = None
            label_sorter = None
        elif colour_coding == "predicted_labels":
            labels = colouring_data_set.predicted_labels
            class_names = colouring_data_set.predicted_class_names
            number_of_classes = colouring_data_set.number_of_predicted_classes
            class_palette = colouring_data_set.predicted_class_palette
            label_sorter = colouring_data_set.predicted_label_sorter
        elif colour_coding == "predicted_superset_labels":
            labels = colouring_data_set.predicted_superset_labels
            class_names = colouring_data_set.predicted_superset_class_names
            number_of_classes = (
                colouring_data_set.number_of_predicted_superset_classes)
            class_palette = colouring_data_set.predicted_superset_class_palette
            label_sorter = colouring_data_set.predicted_superset_label_sorter
        elif "superset" in colour_coding:
            labels = colouring_data_set.superset_labels
            class_names = colouring_data_set.superset_class_names
            number_of_classes = colouring_data_set.number_of_superset_classes
            class_palette = colouring_data_set.superset_class_palette
            label_sorter = colouring_data_set.superset_label_sorter
        elif colour_coding == "batches":
            labels = colouring_data_set.batch_indices.flatten()
            class_names = colouring_data_set.batch_names
            number_of_classes = colouring_data_set.number_of_batches
            class_palette = None
            label_sorter = None
        else:
            labels = colouring_data_set.labels
            class_names = colouring_data_set.class_names
            number_of_classes = colouring_data_set.number_of_classes
            class_palette = colouring_data_set.class_palette
            label_sorter = colouring_data_set.label_sorter

        if not class_palette:
            index_palette = style.lighter_palette(number_of_classes)
            class_palette = {
                class_name: index_palette[i] for i, class_name in enumerate(
                    sorted(class_names, key=label_sorter))
            }

        def _legend_order(label_and_handle):
            # Order legend entries by label only (handles are not
            # orderable), using the label sorter when there is one.
            legend_label = label_and_handle[0]
            if label_sorter:
                return label_sorter(legend_label)
            return legend_label

        # Examples are shuffled, so should their labels be
        labels = labels[shuffled_indices]

        if ("labels" in colour_coding or "ids" in colour_coding
                or colour_coding == "batches"):
            colours = []
            classes = set()

            for i, label in enumerate(labels):
                colour = class_palette[label]
                colours.append(colour)

                # Plot one example for each class to add labels
                if label not in classes:
                    classes.add(label)
                    axis.scatter(
                        values[i, 0], values[i, 1],
                        color=colour, label=label, alpha=alpha
                    )

            axis.scatter(values[:, 0], values[:, 1], c=colours, alpha=alpha)

            class_handles, class_labels = axis.get_legend_handles_labels()

            if class_labels:
                class_labels, class_handles = zip(*sorted(
                    zip(class_labels, class_handles), key=_legend_order))
                class_label_maximum_width = max(map(len, class_labels))
                if class_label_maximum_width <= 5 and number_of_classes <= 20:
                    axis.legend(class_handles, class_labels, loc="best")
                else:
                    if number_of_classes <= 20:
                        class_label_columns = 2
                    else:
                        class_label_columns = 3
                    axis.legend(
                        class_handles,
                        class_labels,
                        bbox_to_anchor=(-0.1, 1.05, 1.1, 0.95),
                        loc="lower left",
                        ncol=class_label_columns,
                        mode="expand",
                        borderaxespad=0.,
                    )

        elif "class" in colour_coding:
            colours = []
            figure_name += "-" + normalise_string(str(class_name))
            ordered_indices_set = {str(class_name): [], "Remaining": []}

            for i, label in enumerate(labels):
                if label == class_name:
                    colour = class_palette[label]
                    ordered_indices_set[str(class_name)].append(i)
                else:
                    colour = style.NEUTRAL_COLOUR
                    ordered_indices_set["Remaining"].append(i)
                colours.append(colour)

            colours = numpy.array(colours)

            # The remaining examples go to the back; the highlighted class
            # is drawn on top of them.
            z_order_index = 1
            for label, ordered_indices in sorted(ordered_indices_set.items()):
                if label == "Remaining":
                    z_order = 0
                else:
                    z_order = z_order_index
                    z_order_index += 1
                ordered_values = values[ordered_indices]
                ordered_colours = colours[ordered_indices]
                axis.scatter(
                    ordered_values[:, 0], ordered_values[:, 1],
                    c=ordered_colours, label=label, alpha=alpha,
                    zorder=z_order
                )

            legend_handles, legend_labels = axis.get_legend_handles_labels()
            # The previous key returned None for every entry when there
            # was no label sorter, which `sorted` cannot compare.
            legend_labels, legend_handles = zip(*sorted(
                zip(legend_labels, legend_handles), key=_legend_order))
            axis.legend(
                legend_handles,
                legend_labels,
                bbox_to_anchor=(-0.1, 1.05, 1.1, 0.95),
                loc="lower left",
                ncol=2,
                mode="expand",
                borderaxespad=0.
            )

    elif colour_coding == "count_sum":
        n = colouring_data_set.count_sum[shuffled_indices].flatten()
        scatter_plot = axis.scatter(
            values[:, 0], values[:, 1], c=n, cmap=colour_map, alpha=alpha)
        colour_bar = figure.colorbar(scatter_plot)
        colour_bar.outline.set_linewidth(0)
        colour_bar.set_label("Total number of {}s per {}".format(
            colouring_data_set.terms["item"],
            colouring_data_set.terms["example"]
        ))

    elif colour_coding == "feature":
        if feature_index is None:
            raise ValueError("Feature number not given.")
        # An index equal to the number of features is already out of
        # range.
        if feature_index >= colouring_data_set.number_of_features:
            raise ValueError("Feature number higher than number of features.")

        feature_name = colouring_data_set.feature_names[feature_index]
        figure_name += "-{}".format(normalise_string(feature_name))

        f = colouring_data_set.values[shuffled_indices, feature_index]
        if scipy.sparse.issparse(f):
            f = f.toarray()
        f = f.squeeze()

        scatter_plot = axis.scatter(
            values[:, 0], values[:, 1], c=f, cmap=colour_map, alpha=alpha)
        colour_bar = figure.colorbar(scatter_plot)
        colour_bar.outline.set_linewidth(0)
        colour_bar.set_label(feature_name)

    elif colour_coding is None:
        axis.scatter(
            values[:, 0], values[:, 1], c="k",
            alpha=alpha, edgecolors="none")

    else:
        raise ValueError("Colour coding `{}` not found.".format(colour_coding))

    if centroids:
        prior_centroids = centroids["prior"]

        if prior_centroids:
            n_centroids = prior_centroids["probabilities"].shape[0]
        else:
            n_centroids = 0

        if n_centroids > 1:
            centroids_palette = style.darker_palette(n_centroids)

            means = prior_centroids["means"]
            covariance_matrices = prior_centroids["covariance_matrices"]

            for k in range(n_centroids):
                # Thick black cross underneath a coloured one.
                axis.scatter(
                    means[k, 0], means[k, 1],
                    s=60, marker="x", color="black", linewidth=3
                )
                axis.scatter(
                    means[k, 0], means[k, 1],
                    marker="x", facecolor=centroids_palette[k],
                    edgecolors="black"
                )
                ellipse_fill, ellipse_edge = _covariance_matrix_as_ellipse(
                    covariance_matrices[k],
                    means[k],
                    colour=centroids_palette[k]
                )
                axis.add_patch(ellipse_edge)
                axis.add_patch(ellipse_fill)

    if sampled_values is not None:
        sampled_values = sampled_values.copy()[:, :2]
        if scipy.sparse.issparse(sampled_values):
            sampled_values = sampled_values.toarray()

        sample_colour_map = seaborn.blend_palette(
            ("white", "purple"), as_cmap=True)

        # Keep the axis limits set by the examples; the hexbin must not
        # rescale the plot.
        x_limits = axis.get_xlim()
        y_limits = axis.get_ylim()

        axis.hexbin(
            sampled_values[:, 0], sampled_values[:, 1],
            gridsize=75,
            cmap=sample_colour_map,
            linewidths=0., edgecolors="none",
            zorder=-100
        )

        axis.set_xlim(x_limits)
        axis.set_ylim(y_limits)

    # Reset marker size
    style.reset_plot_look()

    return figure, figure_name
def _setup_model(data_set, model_type=None,
                 latent_size=None, hidden_sizes=None,
                 number_of_importance_samples=None,
                 number_of_monte_carlo_samples=None,
                 inference_architecture=None, latent_distribution=None,
                 number_of_classes=None, parameterise_latent_posterior=False,
                 prior_probabilities_method=None,
                 generative_architecture=None,
                 reconstruction_distribution=None,
                 number_of_reconstruction_classes=None, count_sum=None,
                 proportion_of_free_nats_for_y_kl_divergence=None,
                 minibatch_normalisation=None, batch_correction=None,
                 dropout_keep_probabilities=None,
                 number_of_warm_up_epochs=None, kl_weight=None,
                 models_directory=None):
    """Instantiate the model selected by `model_type` for `data_set`.

    Missing `model_type` and `batch_correction` values are taken from
    ``defaults["model"]``. Batch correction is switched off when the data
    set has no batches. The normalised model type "vae" builds a
    `VariationalAutoencoder` and "gmvae" builds a
    `GaussianMixtureVariationalAutoencoder`. For the latter, the prior
    probabilities method "infer" is passed on as "custom" together with
    the class probabilities of the data set.

    Returns:
        The constructed model.

    Raises:
        ValueError: If the model type is neither "vae" nor "gmvae" after
            normalisation.
    """
    if model_type is None:
        model_type = defaults["model"]["type"]
    if batch_correction is None:
        batch_correction = defaults["model"]["batch_correction"]

    feature_size = data_set.number_of_features
    number_of_batches = data_set.number_of_batches

    if not data_set.has_batches:
        batch_correction = False

    # Keyword arguments accepted by both model classes.
    shared_arguments = dict(
        feature_size=feature_size,
        latent_size=latent_size,
        hidden_sizes=hidden_sizes,
        number_of_monte_carlo_samples=number_of_monte_carlo_samples,
        number_of_importance_samples=number_of_importance_samples,
        latent_distribution=latent_distribution,
        number_of_latent_clusters=number_of_classes,
        reconstruction_distribution=reconstruction_distribution,
        number_of_reconstruction_classes=number_of_reconstruction_classes,
        minibatch_normalisation=minibatch_normalisation,
        batch_correction=batch_correction,
        number_of_batches=number_of_batches,
        dropout_keep_probabilities=dropout_keep_probabilities,
        count_sum=count_sum,
        number_of_warm_up_epochs=number_of_warm_up_epochs,
        kl_weight=kl_weight,
        log_directory=models_directory
    )

    normalised_model_type = normalise_string(model_type)

    if normalised_model_type == "vae":
        return VariationalAutoencoder(
            inference_architecture=inference_architecture,
            parameterise_latent_posterior=parameterise_latent_posterior,
            generative_architecture=generative_architecture,
            **shared_arguments
        )

    if normalised_model_type == "gmvae":
        if prior_probabilities_method == "infer":
            prior_probabilities_method_for_model = "custom"
            prior_probabilities = data_set.class_probabilities
        else:
            prior_probabilities_method_for_model = prior_probabilities_method
            prior_probabilities = None

        return GaussianMixtureVariationalAutoencoder(
            prior_probabilities_method=prior_probabilities_method_for_model,
            prior_probabilities=prior_probabilities,
            proportion_of_free_nats_for_y_kl_divergence=(
                proportion_of_free_nats_for_y_kl_divergence),
            **shared_arguments
        )

    raise ValueError("Model type not found: `{}`.".format(model_type))
def evaluate(data_set_file_or_name, data_format=None, data_directory=None,
             map_features=None, feature_selection=None, example_filter=None,
             noisy_preprocessing_methods=None, preprocessing_methods=None,
             split_data_set=None, splitting_method=None,
             splitting_fraction=None,
             model_type=None, latent_size=None, hidden_sizes=None,
             number_of_importance_samples=None,
             number_of_monte_carlo_samples=None,
             inference_architecture=None, latent_distribution=None,
             number_of_classes=None, parameterise_latent_posterior=False,
             prior_probabilities_method=None,
             generative_architecture=None,
             reconstruction_distribution=None,
             number_of_reconstruction_classes=None, count_sum=None,
             proportion_of_free_nats_for_y_kl_divergence=None,
             minibatch_normalisation=None, batch_correction=None,
             dropout_keep_probabilities=None,
             number_of_warm_up_epochs=None, kl_weight=None,
             minibatch_size=None, run_id=None, models_directory=None,
             included_analyses=None, analysis_level=None,
             decomposition_methods=None, highlight_feature_indices=None,
             export_options=None, analyses_directory=None,
             evaluation_set_kind=None, sample_size=None,
             prediction_method=None, prediction_training_set_kind=None,
             model_versions=None, **keyword_arguments):
    """Evaluate model on data set.

    The function loads (and optionally splits) the data set, builds the
    model with ``_setup_model``, analyses the model, and then, for every
    requested model version, evaluates the model on the evaluation set,
    optionally samples from it, optionally predicts labels from the latent
    representation, and finally analyses the results.

    Arguments left as ``None`` for splitting, directories, the evaluation
    set kind, sample size, prediction settings and model versions are
    filled in from ``defaults``.  When ``reconstruction_distribution`` is
    ``"bernoulli"``, values are binarised: either by making ``"binarise"``
    the last noisy preprocessing method, or, if there are no noisy
    preprocessing methods, by asking the data set to binarise its values.
    The caller's ``noisy_preprocessing_methods`` list is never modified.

    The model versions ``"best_model"`` and ``"early_stopping"`` are dropped
    when ``better_model_exists`` / ``model_stopped_early`` report that no
    such version exists for the model and ``run_id``.

    Extra keyword arguments are accepted and ignored.

    Returns:
        ``0`` on completion.

    Raises:
        ValueError: If the data set is split and no subset has the
            requested evaluation set kind, or, when a prediction method is
            used, the requested prediction training set kind.
    """
    if split_data_set is None:
        split_data_set = defaults["data"]["split_data_set"]
    if splitting_method is None:
        splitting_method = defaults["data"]["splitting_method"]
    if splitting_fraction is None:
        splitting_fraction = defaults["data"]["splitting_fraction"]
    if models_directory is None:
        models_directory = defaults["models"]["directory"]
    if evaluation_set_kind is None:
        evaluation_set_kind = defaults["evaluation"]["data_set_name"]
    if sample_size is None:
        sample_size = defaults["models"]["sample_size"]
    if prediction_method is None:
        prediction_method = defaults["evaluation"]["prediction_method"]
    if prediction_training_set_kind is None:
        prediction_training_set_kind = defaults["evaluation"][
            "prediction_training_set_kind"]
    if model_versions is None:
        model_versions = defaults["evaluation"]["model_versions"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    evaluation_set_kind = normalise_string(evaluation_set_kind)
    prediction_training_set_kind = normalise_string(
        prediction_training_set_kind)
    model_versions = parse_model_versions(model_versions)

    print(title("Data"))

    binarise_values = False
    if reconstruction_distribution == "bernoulli":
        if noisy_preprocessing_methods:
            if noisy_preprocessing_methods[-1] != "binarise":
                # Build a new list so the caller's list is left untouched.
                noisy_preprocessing_methods = (
                    list(noisy_preprocessing_methods) + ["binarise"])
        else:
            binarise_values = True

    data_set = DataSet(
        data_set_file_or_name,
        data_format=data_format,
        directory=data_directory,
        map_features=map_features,
        feature_selection=feature_selection,
        example_filter=example_filter,
        preprocessing_methods=preprocessing_methods,
        binarise_values=binarise_values,
        noisy_preprocessing_methods=noisy_preprocessing_methods)

    if not split_data_set or evaluation_set_kind == "full":
        data_set.load()

    evaluation_set = None
    prediction_training_set = None

    if split_data_set:
        training_set, validation_set, test_set = data_set.split(
            method=splitting_method, fraction=splitting_fraction)
        data_subsets = [data_set, training_set, validation_set, test_set]
        for data_subset in data_subsets:
            # Subsets used for neither evaluation nor prediction training
            # are cleared.
            clear_data_subset = True
            if data_subset.kind == evaluation_set_kind:
                evaluation_set = data_subset
                clear_data_subset = False
            if data_subset.kind == prediction_training_set_kind:
                prediction_training_set = data_subset
                clear_data_subset = False
            if clear_data_subset:
                data_subset.clear()
        if evaluation_set is None:
            raise ValueError(
                "Evaluation set kind `{}` not found among data "
                "subsets.".format(evaluation_set_kind))
        if prediction_method and prediction_training_set is None:
            raise ValueError(
                "Prediction training set kind `{}` not found among data "
                "subsets.".format(prediction_training_set_kind))
    else:
        splitting_method = None
        splitting_fraction = None
        evaluation_set = data_set
        prediction_training_set = data_set

    evaluation_subset_indices = indices_for_evaluation_subset(evaluation_set)

    models_directory = build_directory_path(
        models_directory,
        data_set=evaluation_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)
    analyses_directory = build_directory_path(
        analyses_directory,
        data_set=evaluation_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)

    print(title("Model"))

    if number_of_classes is None:
        if evaluation_set.has_labels:
            number_of_classes = (
                evaluation_set.number_of_classes
                - evaluation_set.number_of_excluded_classes)

    model = _setup_model(
        data_set=evaluation_set,
        model_type=model_type,
        latent_size=latent_size,
        hidden_sizes=hidden_sizes,
        number_of_importance_samples=number_of_importance_samples,
        number_of_monte_carlo_samples=number_of_monte_carlo_samples,
        inference_architecture=inference_architecture,
        latent_distribution=latent_distribution,
        number_of_classes=number_of_classes,
        parameterise_latent_posterior=parameterise_latent_posterior,
        prior_probabilities_method=prior_probabilities_method,
        generative_architecture=generative_architecture,
        reconstruction_distribution=reconstruction_distribution,
        number_of_reconstruction_classes=number_of_reconstruction_classes,
        count_sum=count_sum,
        proportion_of_free_nats_for_y_kl_divergence=(
            proportion_of_free_nats_for_y_kl_divergence),
        minibatch_normalisation=minibatch_normalisation,
        batch_correction=batch_correction,
        dropout_keep_probabilities=dropout_keep_probabilities,
        number_of_warm_up_epochs=number_of_warm_up_epochs,
        kl_weight=kl_weight,
        models_directory=models_directory)

    if ("best_model" in model_versions
            and not better_model_exists(model, run_id=run_id)):
        model_versions.remove("best_model")
    if ("early_stopping" in model_versions
            and not model_stopped_early(model, run_id=run_id)):
        model_versions.remove("early_stopping")

    print(subtitle("Analysis"))
    analyses.analyse_model(
        model=model,
        run_id=run_id,
        included_analyses=included_analyses,
        analysis_level=analysis_level,
        export_options=export_options,
        analyses_directory=analyses_directory)

    print(title("Results"))
    print("Evaluation set: {} set.".format(evaluation_set.kind))
    print("Model version{}: {}.".format(
        "" if len(model_versions) == 1 else "s",
        enumerate_strings(
            [v.replace("_", " ") for v in model_versions],
            conjunction="and")))

    if prediction_method:
        prediction_specifications = PredictionSpecifications(
            method=prediction_method,
            number_of_clusters=number_of_classes,
            training_set_kind=prediction_training_set.kind)
        print("Prediction method: {}.".format(
            prediction_specifications.method))
        print("Number of clusters: {}.".format(
            prediction_specifications.number_of_clusters))
        print("Prediction training set: {} set.".format(
            prediction_specifications.training_set_kind))
    print()

    for model_version in model_versions:

        use_best_model = False
        use_early_stopping_model = False
        if model_version == "best_model":
            use_best_model = True
        elif model_version == "early_stopping":
            use_early_stopping_model = True

        print(subtitle(model_version.replace("_", " ").capitalize()))

        print(heading("{} evaluation".format(
            model_version.replace("_", "-").capitalize())))
        (transformed_evaluation_set, reconstructed_evaluation_set,
         latent_evaluation_sets) = model.evaluate(
            evaluation_set=evaluation_set,
            evaluation_subset_indices=evaluation_subset_indices,
            minibatch_size=minibatch_size,
            run_id=run_id,
            use_best_model=use_best_model,
            use_early_stopping_model=use_early_stopping_model,
            output_versions="all")
        print()

        if sample_size:
            print(heading("{} sampling".format(
                model_version.replace("_", "-").capitalize())))
            sample_reconstruction_set, __ = model.sample(
                sample_size=sample_size,
                minibatch_size=minibatch_size,
                run_id=run_id,
                use_best_model=use_best_model,
                use_early_stopping_model=use_early_stopping_model)
            print()
        else:
            sample_reconstruction_set = None

        if prediction_method:
            print(heading("{} prediction".format(
                model_version.replace("_", "-").capitalize())))

            # Latent representation of the set the predictor is fitted on.
            latent_prediction_training_sets = model.evaluate(
                evaluation_set=prediction_training_set,
                minibatch_size=minibatch_size,
                run_id=run_id,
                use_best_model=use_best_model,
                use_early_stopping_model=use_early_stopping_model,
                output_versions="latent",
                log_results=False)
            print()

            cluster_ids, predicted_labels, predicted_superset_labels = (
                predict_labels(
                    training_set=latent_prediction_training_sets["z"],
                    evaluation_set=latent_evaluation_sets["z"],
                    specifications=prediction_specifications))

            # Attach the same predictions to every version of the
            # evaluation set.
            evaluation_set_versions = [
                transformed_evaluation_set, reconstructed_evaluation_set
            ] + list(latent_evaluation_sets.values())

            for evaluation_set_version in evaluation_set_versions:
                evaluation_set_version.update_predictions(
                    prediction_specifications=prediction_specifications,
                    predicted_cluster_ids=cluster_ids,
                    predicted_labels=predicted_labels,
                    predicted_superset_labels=predicted_superset_labels)
            print()

        print(heading("{} analysis".format(
            model_version.replace("_", "-").capitalize())))
        analyses.analyse_results(
            evaluation_set=transformed_evaluation_set,
            reconstructed_evaluation_set=reconstructed_evaluation_set,
            latent_evaluation_sets=latent_evaluation_sets,
            model=model,
            run_id=run_id,
            sample_reconstruction_set=sample_reconstruction_set,
            decomposition_methods=decomposition_methods,
            evaluation_subset_indices=evaluation_subset_indices,
            highlight_feature_indices=highlight_feature_indices,
            best_model=use_best_model,
            early_stopping=use_early_stopping_model,
            included_analyses=included_analyses,
            analysis_level=analysis_level,
            export_options=export_options,
            analyses_directory=analyses_directory)

    return 0
def decompose(values, other_value_sets={}, centroids={}, method=None,
              number_of_components=None, random=False):
    """Decompose ``values`` and, where possible, related values/centroids.

    A model is chosen from ``method`` (after normalisation against
    ``DECOMPOSITION_METHOD_NAMES``), fitted on ``values``, and used to
    transform ``values``.  For every method except t-SNE the fitted model
    also transforms the other value sets.  For PCA only, centroid means are
    transformed with the model and covariance matrices are projected with
    the PCA components.

    Arguments:
        values: Matrix to fit the decomposition on.
        other_value_sets: Either a dictionary of named value sets, a single
            value set, or ``None``.  The empty-dictionary default acts as a
            "not provided" marker (see Returns); it is never mutated.
        centroids: ``None``, a dictionary with a ``"means"`` entry (and
            optionally ``"covariance_matrices"``), or a dictionary of such
            dictionaries keyed by distribution.  The empty-dictionary
            default acts as a "not provided" marker; it is never mutated.
        method: Name of the decomposition method; defaults to
            ``defaults["decomposition_method"]``.
        number_of_components: Number of components; defaults to
            ``defaults["decomposition_dimensionality"]``.
        random: If false, t-SNE is seeded with a fixed random state (42).

    Returns:
        A list starting with the decomposed values.  The decomposed other
        value sets are appended unless ``other_value_sets`` equals ``{}``,
        and the decomposed centroids are appended unless ``centroids``
        equals ``{}``.  Either appended item may be ``None``.  A single
        (non-dictionary) other value set is returned as a single array.

    Raises:
        ValueError: If ``method`` is not a supported method.
    """
    if method is None:
        method = defaults["decomposition_method"]
    method = proper_string(
        normalise_string(method), DECOMPOSITION_METHOD_NAMES)

    if number_of_components is None:
        number_of_components = defaults["decomposition_dimensionality"]

    other_values_provided_as_dictionary = True
    if other_value_sets is not None and not isinstance(
            other_value_sets, dict):
        # Wrap a bare value set in a dictionary so it is handled like named
        # sets below; it is unwrapped again before returning.
        other_value_sets = {"unknown": other_value_sets}
        other_values_provided_as_dictionary = False

    random_state = None if random else 42

    if method == "PCA":
        if (values.shape[1] <= MAXIMUM_FEATURE_SIZE_FOR_NORMAL_PCA
                and not scipy.sparse.issparse(values)):
            model = PCA(n_components=number_of_components)
        else:
            model = IncrementalPCA(
                n_components=number_of_components, batch_size=100)
    elif method == "SVD":
        model = TruncatedSVD(n_components=number_of_components)
    elif method == "ICA":
        model = FastICA(n_components=number_of_components)
    elif method == "t-SNE":
        if number_of_components < 4:
            tsne_method = "barnes_hut"
        else:
            tsne_method = "exact"
        model = TSNE(
            n_components=number_of_components,
            method=tsne_method,
            random_state=random_state)
    else:
        raise ValueError("Method `{}` not found.".format(method))

    values_decomposed = model.fit_transform(values)

    # The fitted t-SNE model is not used to transform further values.
    if other_value_sets and method != "t-SNE":
        other_value_sets_decomposed = {}
        for other_set_name, other_values in other_value_sets.items():
            if other_values is not None:
                other_value_decomposed = model.transform(other_values)
            else:
                other_value_decomposed = None
            other_value_sets_decomposed[other_set_name] = (
                other_value_decomposed)
    else:
        other_value_sets_decomposed = None

    if other_value_sets_decomposed and not other_values_provided_as_dictionary:
        other_value_sets_decomposed = other_value_sets_decomposed["unknown"]

    # Only supports centroids without data sets as top levels
    if centroids is not None and method == "PCA":
        if "means" in centroids:
            # A single set of centroids: wrap it so it can be handled like
            # a dictionary of distributions, and unwrap it afterwards.
            centroids = {"unknown": centroids}
        components = model.components_
        centroids_decomposed = {}
        for distribution, distribution_centroids in centroids.items():
            if distribution_centroids:
                centroids_distribution_decomposed = {}
                for parameter, parameter_values in (
                        distribution_centroids.items()):
                    if parameter == "means":
                        shape = numpy.array(parameter_values.shape)
                        original_dimension = shape[-1]
                        reshaped_parameter_values = parameter_values.reshape(
                            -1, original_dimension)
                        decomposed_parameter_values = model.transform(
                            reshaped_parameter_values)
                        shape[-1] = number_of_components
                        new_parameter_values = (
                            decomposed_parameter_values.reshape(shape))
                    elif parameter == "covariance_matrices":
                        shape = numpy.array(parameter_values.shape)
                        original_dimension = shape[-1]
                        reshaped_parameter_values = parameter_values.reshape(
                            -1, original_dimension, original_dimension)
                        # Project every covariance matrix onto the
                        # components.  Matrix multiplication broadcasts over
                        # the leading axis, so the result has one
                        # (components x components) matrix per centroid for
                        # any number of components.
                        decomposed_parameter_values = (
                            components
                            @ reshaped_parameter_values
                            @ components.T)
                        shape[-2:] = number_of_components
                        new_parameter_values = (
                            decomposed_parameter_values.reshape(shape))
                    else:
                        new_parameter_values = parameter_values
                    centroids_distribution_decomposed[parameter] = (
                        new_parameter_values)
                centroids_decomposed[distribution] = (
                    centroids_distribution_decomposed)
            else:
                centroids_decomposed[distribution] = None
        if "unknown" in centroids_decomposed:
            centroids_decomposed = centroids_decomposed["unknown"]
    else:
        centroids_decomposed = None

    output = [values_decomposed]

    # The empty-dictionary defaults mean "argument not provided": only
    # provided arguments (including ``None``) get a slot in the output.
    if other_value_sets != {}:
        output.append(other_value_sets_decomposed)
    if centroids != {}:
        output.append(centroids_decomposed)

    return output
def analyse_centroid_probabilities(centroids, name=None,
                                   analysis_level=None,
                                   export_options=None,
                                   analyses_directory=None):
    """Plot and save the centroid probabilities of a mixture.

    The probabilities are read from ``centroids["posterior"]`` and/or
    ``centroids["prior"]`` (entry ``"probabilities"``).  They are plotted
    with ``figures.plot_probabilities`` and saved with
    ``figures.save_figure``.  If neither distribution has centroids, a
    message is printed and nothing is plotted.

    Arguments:
        centroids: Dictionary which may contain ``"posterior"`` and
            ``"prior"`` entries; ``None`` is treated as empty.
        name: Optional name, normalised and prepended to the plot name.
        analysis_level: Defaults to
            ``defaults["analyses"]["analysis_level"]``; not otherwise used
            by this function.
        export_options: Forwarded to ``figures.save_figure``.
        analyses_directory: Directory for the figure; defaults to
            ``defaults["analyses"]["directory"]``.
    """
    if name:
        name = normalise_string(name)
    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    if not centroids:
        centroids = {}

    posterior_probabilities = None
    prior_probabilities = None

    if "posterior" in centroids and centroids["posterior"]:
        posterior_probabilities = centroids["posterior"]["probabilities"]
        n_centroids = len(posterior_probabilities)
    if "prior" in centroids and centroids["prior"]:
        prior_probabilities = centroids["prior"]["probabilities"]
        n_centroids = len(prior_probabilities)

    if posterior_probabilities is None and prior_probabilities is None:
        # Without this guard the palette size, axis label and plot name
        # below would all be undefined.
        print("No centroid probabilities to plot.")
        return

    print("Plotting centroid probabilities.")
    plot_time_start = time()

    centroids_palette = style.darker_palette(n_centroids)
    x_label = "$k$"

    # The axis label refers to the posterior whenever it is available and
    # to the prior otherwise.
    if posterior_probabilities is not None:
        label_distribution = "posterior"
        if prior_probabilities is not None:
            plotted_distributions = ["posterior", "prior"]
        else:
            plotted_distributions = ["posterior"]
    else:
        label_distribution = "prior"
        plotted_distributions = ["prior"]

    y_label = _axis_label_for_symbol(
        symbol="\\pi",
        distribution=normalise_string(label_distribution),
        suffix="^k")

    if name:
        plot_name = [name] + plotted_distributions
    elif len(plotted_distributions) == 1:
        # A single unnamed distribution is passed as a plain string.
        plot_name = plotted_distributions[0]
    else:
        plot_name = plotted_distributions

    figure, figure_name = figures.plot_probabilities(
        posterior_probabilities,
        prior_probabilities,
        x_label=x_label,
        y_label=y_label,
        palette=centroids_palette,
        uniform=False,
        name=plot_name)
    figures.save_figure(
        figure=figure,
        name=figure_name,
        options=export_options,
        directory=analyses_directory)

    plot_duration = time() - plot_time_start
    print("Centroid probabilities plotted and saved ({}).".format(
        format_duration(plot_duration)))
def analyse_decompositions(data_sets, other_data_sets=None, centroids=None,
                           colouring_data_set=None, sampled_data_set=None,
                           decomposition_methods=None,
                           highlight_feature_indices=None,
                           symbol=None, title="data set", specifier=None,
                           analysis_level=None, export_options=None,
                           analyses_directory=None):
    """Decompose data sets to two dimensions and plot them.

    For every data set with more than one feature column, and for every
    decomposition method, the values are decomposed to two components with
    ``decompose`` and plotted with ``figures.plot_values`` using several
    colour codings (none, samples, labels, superset labels, classes,
    batches, predicted cluster IDs, predicted labels, count sum, and
    highlighted features).  Figures are saved in a subdirectory of
    ``analyses_directory`` named after the data set.

    A leading ``None`` method is always tried first: it plots the values
    without decomposition, and only applies to data sets with exactly two
    features.  For t-SNE, data sets with too many examples are skipped, and
    data sets with too many features are first reduced with PCA.

    Arguments:
        data_sets: A data set, or a list, tuple or dictionary of data sets.
        other_data_sets: Optional data set(s), paired with ``data_sets``,
            whose values are plotted in the decomposition of the paired
            data set.
        centroids: Optional centroids; only used for data sets whose
            ``version`` is ``"z"`` or ``"z1"``.
        colouring_data_set: Data set supplying labels, batches and
            predictions for colour coding; defaults to the first data set
            that is processed.
        sampled_data_set: Optional data set of samples to plot on top.
        decomposition_methods: Method name or list/tuple of names; defaults
            to ``defaults["decomposition_method"]``.  Not modified.
        highlight_feature_indices: Feature index or list/tuple of indices;
            defaults to ``defaults["analyses"]["highlight_feature_indices"]``.
        symbol: Symbol for axis labels; defaults to the specification.
        title: Base title of the plotted data sets.
        specifier: Optional callable mapping a data set to a specification
            added to the name and title.
        analysis_level: Per-class plots are made for ``"extensive"``;
            defaults to ``defaults["analyses"]["analysis_level"]``.
        export_options: Forwarded to ``figures.save_figure``.
        analyses_directory: Base directory for figures; defaults to
            ``defaults["analyses"]["directory"]``.

    Raises:
        ValueError: If ``data_sets`` and ``other_data_sets`` differ in
            length.
    """
    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]

    centroids_original = centroids

    if isinstance(data_sets, dict):
        data_sets = list(data_sets.values())

    if not isinstance(data_sets, (list, tuple)):
        data_sets = [data_sets]

    if other_data_sets is None:
        other_data_sets = [None] * len(data_sets)
    elif not isinstance(other_data_sets, (list, tuple)):
        other_data_sets = [other_data_sets]

    if len(data_sets) != len(other_data_sets):
        raise ValueError(
            "Lists of data sets and alternative data sets do not have the "
            "same length.")

    specification = None

    base_symbol = symbol
    original_title = title

    if decomposition_methods is None:
        decomposition_methods = [defaults["decomposition_method"]]
    elif not isinstance(decomposition_methods, (list, tuple)):
        decomposition_methods = [decomposition_methods]
    else:
        # Copy as a list: tuples are accepted above but have neither
        # ``copy`` nor ``insert``.
        decomposition_methods = list(decomposition_methods)
    # ``None`` stands for plotting the values without decomposition.
    decomposition_methods.insert(0, None)

    if highlight_feature_indices is None:
        highlight_feature_indices = defaults["analyses"][
            "highlight_feature_indices"]
    elif not isinstance(highlight_feature_indices, (list, tuple)):
        highlight_feature_indices = [highlight_feature_indices]
    else:
        highlight_feature_indices = list(highlight_feature_indices)

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    for data_set, other_data_set in zip(data_sets, other_data_sets):

        # A single feature cannot be shown in two dimensions.
        if data_set.values.shape[1] <= 1:
            continue

        title = original_title
        name = normalise_string(title)

        if specifier:
            specification = specifier(data_set)

        if specification:
            name += "-" + str(specification)
            title += " for " + str(specification)

        title += " set"

        if not colouring_data_set:
            colouring_data_set = data_set

        if data_set.version in ["z", "z1"]:
            centroids = copy.deepcopy(centroids_original)
        else:
            centroids = None

        if other_data_set:
            title = "{} set values in {}".format(
                other_data_set.version, title)
            name = other_data_set.version + "-" + name

        decompositions_directory = os.path.join(analyses_directory, name)

        for decomposition_method in decomposition_methods:

            other_values = None
            sampled_values = None

            if other_data_set:
                other_values = other_data_set.values

            if sampled_data_set:
                sampled_values = sampled_data_set.values

            if not decomposition_method:
                # Values are only plotted as they are when they already
                # have two features.
                if data_set.number_of_features == 2:
                    values_decomposed = data_set.values
                    other_values_decomposed = other_values
                    sampled_values_decomposed = sampled_values
                    centroids_decomposed = centroids
                else:
                    continue
            else:
                decomposition_method = proper_string(
                    decomposition_method, DECOMPOSITION_METHOD_NAMES)

                values_decomposed = data_set.values
                other_values_decomposed = other_values
                sampled_values_decomposed = sampled_values
                centroids_decomposed = centroids

                other_value_sets_decomposed = {}
                if other_values is not None:
                    other_value_sets_decomposed["other"] = other_values
                if sampled_values is not None:
                    other_value_sets_decomposed["sampled"] = sampled_values
                if not other_value_sets_decomposed:
                    other_value_sets_decomposed = None

                if decomposition_method == "t-SNE":
                    if (data_set.number_of_examples
                            > MAXIMUM_NUMBER_OF_EXAMPLES_FOR_TSNE):
                        print(
                            "The number of examples for {}".format(title),
                            "is too large to decompose it",
                            "using {}. Skipping.".format(
                                decomposition_method))
                        print()
                        continue

                    elif (data_set.number_of_features
                            > MAXIMUM_NUMBER_OF_FEATURES_FOR_TSNE):
                        number_of_pca_components_before_tsne = min(
                            MAXIMUM_NUMBER_OF_PCA_COMPONENTS_BEFORE_TSNE,
                            data_set.number_of_examples - 1)
                        print(
                            "The number of features for {}".format(title),
                            "is too large to decompose it",
                            "using {} in due time.".format(
                                decomposition_method))
                        print("Decomposing {} to {} components using PCA "
                              "beforehand.".format(
                                  title,
                                  number_of_pca_components_before_tsne))
                        decompose_time_start = time()
                        (values_decomposed, other_value_sets_decomposed,
                         centroids_decomposed) = decompose(
                            values_decomposed,
                            other_value_sets=other_value_sets_decomposed,
                            centroids=centroids_decomposed,
                            method="pca",
                            number_of_components=(
                                number_of_pca_components_before_tsne))
                        decompose_duration = time() - decompose_time_start
                        print("{} pre-decomposed ({}).".format(
                            capitalise_string(title),
                            format_duration(decompose_duration)))

                    else:
                        # Convert sparse values to dense arrays;
                        # ``toarray()`` is used instead of the ``.A``
                        # shorthand, which is not available for all sparse
                        # containers.
                        if scipy.sparse.issparse(values_decomposed):
                            values_decomposed = values_decomposed.toarray()
                        if scipy.sparse.issparse(other_values_decomposed):
                            other_values_decomposed = (
                                other_values_decomposed.toarray())
                        if scipy.sparse.issparse(sampled_values_decomposed):
                            sampled_values_decomposed = (
                                sampled_values_decomposed.toarray())

                print("Decomposing {} using {}.".format(
                    title, decomposition_method))
                decompose_time_start = time()
                (values_decomposed, other_value_sets_decomposed,
                 centroids_decomposed) = decompose(
                    values_decomposed,
                    other_value_sets=other_value_sets_decomposed,
                    centroids=centroids_decomposed,
                    method=decomposition_method,
                    number_of_components=2)
                decompose_duration = time() - decompose_time_start
                print("{} decomposed ({}).".format(
                    capitalise_string(title),
                    format_duration(decompose_duration)))
                print()

                if other_value_sets_decomposed:
                    other_values_decomposed = (
                        other_value_sets_decomposed.get("other"))
                    sampled_values_decomposed = (
                        other_value_sets_decomposed.get("sampled"))

            if base_symbol:
                symbol = base_symbol
            else:
                symbol = specification

            x_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=1,
                decomposition_method=decomposition_method,
            )
            y_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=2,
                decomposition_method=decomposition_method,
            )

            figure_labels = {
                "title": decomposition_method,
                "x label": x_label,
                "y label": y_label
            }

            # With a paired data set, its values are the ones plotted.
            if other_data_set:
                plot_values_decomposed = other_values_decomposed
            else:
                plot_values_decomposed = values_decomposed

            if plot_values_decomposed is None:
                print("No values to plot.\n")
                return

            print("Plotting {}{}.".format(
                "decomposed " if decomposition_method else "", title))

            # No colour-coding
            plot_time_start = time()
            figure, figure_name = figures.plot_values(
                plot_values_decomposed,
                centroids=centroids_decomposed,
                figure_labels=figure_labels,
                example_tag=data_set.tags["example"],
                name=name)
            figures.save_figure(
                figure=figure,
                name=figure_name,
                options=export_options,
                directory=decompositions_directory)
            plot_duration = time() - plot_time_start
            print(" {} plotted and saved ({}).".format(
                capitalise_string(title),
                format_duration(plot_duration)))

            # Samples
            if sampled_data_set:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    centroids=centroids_decomposed,
                    sampled_values=sampled_values_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(" {} (with samples) plotted and saved ({}).".format(
                    capitalise_string(title),
                    format_duration(plot_duration)))

            # Labels
            if colouring_data_set.labels is not None:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(" {} (with labels) plotted and saved ({}).".format(
                    capitalise_string(title),
                    format_duration(plot_duration)))

                # Superset labels
                if colouring_data_set.superset_labels is not None:
                    plot_time_start = time()
                    figure, figure_name = figures.plot_values(
                        plot_values_decomposed,
                        colour_coding="superset labels",
                        colouring_data_set=colouring_data_set,
                        centroids=centroids_decomposed,
                        figure_labels=figure_labels,
                        example_tag=data_set.tags["example"],
                        name=name)
                    figures.save_figure(
                        figure=figure,
                        name=figure_name,
                        options=export_options,
                        directory=decompositions_directory)
                    plot_duration = time() - plot_time_start
                    print(
                        " "
                        "{} (with superset labels) plotted and saved ({})."
                        .format(
                            capitalise_string(title),
                            format_duration(plot_duration)))

                # For each class
                if analysis_level == "extensive":
                    if colouring_data_set.number_of_classes <= 10:
                        plot_time_start = time()
                        for class_name in colouring_data_set.class_names:
                            figure, figure_name = figures.plot_values(
                                plot_values_decomposed,
                                colour_coding="class",
                                colouring_data_set=colouring_data_set,
                                centroids=centroids_decomposed,
                                class_name=class_name,
                                figure_labels=figure_labels,
                                example_tag=data_set.tags["example"],
                                name=name)
                            figures.save_figure(
                                figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
                        plot_duration = time() - plot_time_start
                        print(
                            " {} (for each class) plotted and saved ({})."
                            .format(
                                capitalise_string(title),
                                format_duration(plot_duration)))

                    # NOTE(review): the class count above is read from the
                    # colouring data set, but the superset class count here
                    # from the decomposed data set -- confirm this is
                    # intended.
                    if (colouring_data_set.superset_labels is not None
                            and data_set.number_of_superset_classes <= 10):
                        plot_time_start = time()
                        for superset_class_name in (
                                colouring_data_set.superset_class_names):
                            figure, figure_name = figures.plot_values(
                                plot_values_decomposed,
                                colour_coding="superset class",
                                colouring_data_set=colouring_data_set,
                                centroids=centroids_decomposed,
                                class_name=superset_class_name,
                                figure_labels=figure_labels,
                                example_tag=data_set.tags["example"],
                                name=name)
                            figures.save_figure(
                                figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
                        plot_duration = time() - plot_time_start
                        print(" {} (for each superset class) plotted and "
                              "saved ({}).".format(
                                  capitalise_string(title),
                                  format_duration(plot_duration)))

            # Batches
            if colouring_data_set.has_batches:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="batches",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    " "
                    "{} (with batches) plotted and saved ({}).".format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # Cluster IDs
            if colouring_data_set.has_predicted_cluster_ids:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted cluster IDs",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    " "
                    "{} (with predicted cluster IDs) plotted and saved ({})."
                    .format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # Predicted labels
            if colouring_data_set.has_predicted_labels:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    " "
                    "{} (with predicted labels) plotted and saved ({})."
                    .format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # Predicted superset labels
            if colouring_data_set.has_predicted_superset_labels:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted superset labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    " {} (with predicted superset labels) plotted and saved"
                    " ({}).".format(
                        capitalise_string(title),
                        format_duration(plot_duration)))

            # Count sum
            plot_time_start = time()
            figure, figure_name = figures.plot_values(
                plot_values_decomposed,
                colour_coding="count sum",
                colouring_data_set=colouring_data_set,
                centroids=centroids_decomposed,
                figure_labels=figure_labels,
                example_tag=data_set.tags["example"],
                name=name)
            figures.save_figure(
                figure=figure,
                name=figure_name,
                options=export_options,
                directory=decompositions_directory)
            plot_duration = time() - plot_time_start
            print(" {} (with count sum) plotted and saved ({}).".format(
                capitalise_string(title),
                format_duration(plot_duration)))

            # Features
            for feature_index in highlight_feature_indices:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="feature",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    feature_index=feature_index,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(
                    figure=figure,
                    name=figure_name,
                    options=export_options,
                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(" {} (with {}) plotted and saved ({}).".format(
                    capitalise_string(title),
                    data_set.feature_names[feature_index],
                    format_duration(plot_duration)))

            print()
def split_data_set(data_dictionary, method=None, fraction=None):
    """Split a data set into training, validation, and test subsets.

    Arguments:
        data_dictionary (dict): Data set to split. Must contain the keys
            ``"values"``, ``"example names"``, ``"feature names"``, and
            ``"class names"``. The optional keys ``"labels"``,
            ``"preprocessed values"``, ``"binarised values"``, and
            ``"batch indices"`` are split as well when present and not
            ``None``. The key ``"split indices"`` is required for the
            ``"indices"`` method.
        method (str, optional): Splitting method. Falls back to
            ``defaults["data"]["splitting_method"]`` when ``None``.
            Supported methods:

            * ``"default"``: ``"indices"`` if ``data_dictionary`` has
              ``"split indices"``, otherwise ``"random"``. This name is
              matched before the method string is normalised.
            * ``"random"`` / ``"sequential"``: the first
              ``int(fraction * n)`` examples (of a fixed-seed permutation
              or of the original order, respectively) form the combined
              training and validation part; the first ``fraction`` of
              that part is the training set and the remaining examples
              are the test set.
            * ``"indices"``: use ``data_dictionary["split indices"]``
              with the keys ``"training"``, ``"test"``, and optionally
              ``"validation"``.
            * ``"macosko"``: examples with more than 900 non-zero values
              form the training set; the remaining ones are shuffled and
              divided into validation and test sets.
        fraction (float, optional): Splitting fraction. Falls back to
            ``defaults["data"]["splitting_fraction"]`` when ``None``.

    Returns:
        dict: Dictionary with the keys ``"training set"``,
        ``"validation set"``, and ``"test set"`` (each a dictionary with
        ``"values"``, ``"preprocessed values"``, ``"binarised values"``,
        ``"labels"``, ``"example names"``, and ``"batch indices"``)
        together with ``"feature names"`` and ``"class names"`` copied
        from ``data_dictionary``.

    Raises:
        ValueError: If the normalised method is not one of the supported
            methods.
    """

    if method is None:
        method = defaults["data"]["splitting_method"]
    if fraction is None:
        fraction = defaults["data"]["splitting_fraction"]

    print("Splitting data set.")
    start_time = time()

    # Resolve the placeholder method before normalising the name.
    if method == "default":
        method = "indices" if "split indices" in data_dictionary else "random"

    method = normalise_string(method)

    n_examples = data_dictionary["values"].shape[0]

    # Fixed seed, so random splits are reproducible between runs.
    random_state = numpy.random.RandomState(42)

    if method in ("random", "sequential"):
        n_training_validation = int(fraction * n_examples)
        n_training = int(fraction * n_training_validation)

        if method == "random":
            ordering = random_state.permutation(n_examples)
        else:
            ordering = numpy.arange(n_examples)

        training_indices = ordering[:n_training]
        validation_indices = ordering[n_training:n_training_validation]
        test_indices = ordering[n_training_validation:]

    elif method == "indices":
        split_indices = data_dictionary["split indices"]
        training_indices = split_indices["training"]
        test_indices = split_indices["test"]

        if "validation" not in split_indices:
            # No validation range supplied: carve one off the end of the
            # training range with the same length as the test range.
            # NOTE(review): relies on the training and test indices having
            # a ``stop`` attribute (presumably slices) -- confirm.
            validation_end = training_indices.stop
            n_total = test_indices.stop
            n_test = n_total - validation_end
            validation_start = validation_end - n_test
            training_indices = slice(validation_start)
            validation_indices = slice(validation_start, validation_end)
        else:
            validation_indices = split_indices["validation"]

    elif method == "macosko":
        minimum_number_of_non_zero_elements = 900
        non_zero_counts = (data_dictionary["values"] != 0).sum(axis=1)

        training_indices = numpy.nonzero(
            non_zero_counts > minimum_number_of_non_zero_elements
        )[0]
        held_out_indices = numpy.nonzero(
            non_zero_counts <= minimum_number_of_non_zero_elements
        )[0]

        random_state.shuffle(held_out_indices)

        n_validation = int((1 - fraction) * len(held_out_indices))
        validation_indices = held_out_indices[:n_validation]
        test_indices = held_out_indices[n_validation:]

    else:
        raise ValueError("Splitting method `{}` not found.".format(method))

    subset_indices = {
        "training set": training_indices,
        "validation set": validation_indices,
        "test set": test_indices,
    }

    # Mandatory fields are split right away; optional fields start out as
    # ``None`` and are filled in below when available.
    split_data_dictionary = {
        subset_name: {
            "values": data_dictionary["values"][indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names": data_dictionary["example names"][indices],
            "batch indices": None,
        }
        for subset_name, indices in subset_indices.items()
    }
    split_data_dictionary["feature names"] = data_dictionary["feature names"]
    split_data_dictionary["class names"] = data_dictionary["class names"]

    optional_fields = (
        "labels",
        "preprocessed values",
        "binarised values",
        "batch indices",
    )
    for field in optional_fields:
        if field not in data_dictionary or data_dictionary[field] is None:
            continue
        for subset_name, indices in subset_indices.items():
            split_data_dictionary[subset_name][field] = (
                data_dictionary[field][indices])

    duration = time() - start_time
    print("Data set split ({}).".format(format_duration(duration)))

    return split_data_dictionary