Example #1
def parse_distribution(distribution, model_type=None):

    distribution = normalise_string(distribution)

    if model_type is None:
        kind = "reconstruction"
        distributions = DISTRIBUTIONS

    elif isinstance(model_type, str):
        kind = "latent"
        if model_type == "VAE":
            distributions = LATENT_DISTRIBUTIONS
        elif model_type == "GMVAE":
            distributions = GAUSSIAN_MIXTURE_DISTRIBUTIONS
        else:
            raise ValueError("Model type not found.")

    else:
        raise TypeError("`model_type` should be a string.")

    distribution_names = list(distributions.keys())
    parsed_distribution_name = None

    for distribution_name in distribution_names:
        if normalise_string(distribution_name) == distribution:
            parsed_distribution_name = distribution_name

    if parsed_distribution_name is None:
        raise ValueError("{} distribution `{}` not supported{}.".format(
            kind.capitalize(), distribution,
            " for {}".format(model_type) if model_type else ""))

    return parsed_distribution_name
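
A hedged usage sketch (not from the original source): it assumes `normalise_string` lower-cases its argument and that the distribution dictionaries contain the keys shown.

# Hypothetical usage, assuming the distribution dictionaries contain these keys:
parse_distribution("Poisson")
# -> "poisson" (a reconstruction distribution)
parse_distribution("gaussian mixture", model_type="GMVAE")
# -> "gaussian mixture" (a latent distribution for the GMVAE)
parse_distribution("poisson", model_type="SVM")
# -> ValueError: Model type not found.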
Example #2
    def _build_preprocessed_path(self,
                                 map_features=None,
                                 preprocessing_methods=None,
                                 feature_selection=None,
                                 feature_selection_parameters=None,
                                 example_filter=None,
                                 example_filter_parameters=None,
                                 splitting_method=None,
                                 splitting_fraction=None,
                                 split_indices=None):

        base_path = os.path.join(self.preprocess_directory, self.name)

        filename_parts = [base_path]

        if map_features:
            filename_parts.append("features_mapped")

        if feature_selection:
            feature_selection_part = normalise_string(feature_selection)
            if feature_selection_parameters:
                for parameter in feature_selection_parameters:
                    feature_selection_part += "_" + normalise_string(
                        str(parameter))
            filename_parts.append(feature_selection_part)

        if example_filter:
            example_filter_part = normalise_string(example_filter)
            if example_filter_parameters:
                for parameter in example_filter_parameters:
                    example_filter_part += "_" + normalise_string(
                        str(parameter))
            filename_parts.append(example_filter_part)

        if preprocessing_methods:
            filename_parts.extend(map(normalise_string, preprocessing_methods))

        if splitting_method:
            filename_parts.append("split")

            if ((splitting_method == "indices" and len(split_indices) == 3)
                    or not splitting_fraction):
                filename_parts.append(splitting_method)
            else:
                filename_parts.append("{}_{}".format(splitting_method,
                                                     splitting_fraction))

        path = "-".join(filename_parts) + PREPROCESSED_EXTENSION

        return path
Example #3
def parse_model_versions(proposed_versions):

    version_alias_sets = {
        "end_of_training": ["eot", "end", "finish", "finished"],
        "best_model": ["bm", "best", "optimal_parameters", "op", "optimal"],
        "early_stopping": ["es", "early", "stop", "stopped"]
    }

    parsed_versions = []

    if not isinstance(proposed_versions, list):
        proposed_versions = [proposed_versions]

    if proposed_versions == ["all"]:
        parsed_versions = list(version_alias_sets.keys())

    else:
        for proposed_version in proposed_versions:

            normalised_proposed_version = normalise_string(proposed_version)
            parsed_version = None

            for version, version_aliases in version_alias_sets.items():
                if (normalised_proposed_version == version
                        or normalised_proposed_version in version_aliases):
                    parsed_version = version
                    break

            if parsed_version:
                parsed_versions.append(parsed_version)
            else:
                raise ValueError(
                    "`{}` is not a model version.".format(proposed_version))

    return parsed_versions
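
A usage sketch based only on the alias sets defined above; it assumes `normalise_string` lower-cases its input and keeps underscores.

parse_model_versions("all")
# -> ["end_of_training", "best_model", "early_stopping"]
parse_model_versions(["best", "eot"])
# -> ["best_model", "end_of_training"]
parse_model_versions("final")
# -> ValueError: `final` is not a model version.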
Example #4
    def save(data_dictionary, tables_file, group_title=None):

        if group_title:
            group = tables_file.create_group("/",
                                             normalise_string(group_title),
                                             group_title)
        else:
            group = tables_file.root

        for title, value in data_dictionary.items():

            if isinstance(value, scipy.sparse.csr_matrix):
                _save_sparse_matrix(value, title, group, tables_file)
            elif isinstance(value, (numpy.ndarray, list)):
                _save_array(value, title, group, tables_file)
            elif title == "split indices":
                _save_split_indices(value, title, group, tables_file)
            elif title == "feature mapping":
                _save_feature_mapping(value, title, group, tables_file)
            elif value is None:
                _save_string(str(value), title, group, tables_file)
            elif title.endswith("set"):
                save(value, tables_file, group_title=title)
            else:
                raise NotImplementedError(
                    "Saving type {} for title \"{}\" has not been implemented."
                    .format(type(value), title))
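
A hedged sketch of how `save` might be driven with PyTables; the dictionary keys and file name are illustrative, and it assumes `normalise_string` turns spaces into underscores so the resulting node names are valid identifiers.

import numpy
import scipy.sparse
import tables

data_dictionary = {
    "values": scipy.sparse.csr_matrix(numpy.eye(3)),  # handled by _save_sparse_matrix
    "example names": numpy.array(["a", "b", "c"]),     # handled by _save_array
}

with tables.open_file("preprocessed.h5", mode="w") as tables_file:
    save(data_dictionary, tables_file)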
Example #5
 def decorator(function):
     aliases = set()
     alias = normalise_string(name)
     aliases.add(alias)
     alias = alias.replace("_", "")
     aliases.add(alias)
     PREDICTION_METHODS[name] = {"aliases": aliases, "function": function}
     return function
Example #6
 def name(self):
     name_parts = [self.method, self.number_of_clusters]
     if self.training_set_kind and self.training_set_kind != "training":
         name_parts.append(self.training_set_kind)
     name = "_".join(
         map(lambda s: normalise_string(str(s)).replace("_", ""),
             name_parts))
     return name
Example #7
def _save_sparse_matrix(sparse_matrix, title, group, tables_file):

    name = normalise_string(title)
    group = tables_file.create_group(group, name, title)

    for attribute in ("data", "indices", "indptr", "shape"):
        array = numpy.array(getattr(sparse_matrix, attribute))
        _save_array(array, attribute, group, tables_file)
Example #8
def _save_split_indices(split_indices, title, group, tables_file):

    name = normalise_string(title)
    group = tables_file.create_group(group, name, title)

    for subset_name, subset_slice in split_indices.items():
        subset_slice_array = numpy.array(
            [subset_slice.start, subset_slice.stop])
        _save_array(subset_slice_array, subset_name, group, tables_file)
Example #9
def plot_centroid_covariance_matrices_evolution(covariance_matrices,
                                                distribution,
                                                name=None):

    distribution = normalise_string(distribution)
    figure_name = "centroids_evolution-{}-covariance_matrices".format(
        distribution)
    figure_name = saving.build_figure_name(figure_name, name)

    y_label = _axis_label_for_symbol(symbol="\\Sigma",
                                     distribution=distribution,
                                     prefix="|",
                                     suffix="(y = k)|")

    n_epochs, n_centroids, __, __ = covariance_matrices.shape
    determinants = numpy.empty([n_epochs, n_centroids])

    for e in range(n_epochs):
        for k in range(n_centroids):
            determinants[e,
                         k] = numpy.prod(numpy.diag(covariance_matrices[e, k]))

    # Use a logarithmic y-axis only when all determinants are positive and
    # span a wide enough range; otherwise keep a linear scale.
    y_scale = "linear"
    if (determinants > 0).all():
        line_range_ratio = numpy.empty(n_centroids)
        for k in range(n_centroids):
            determinants_min = determinants[:, k].min()
            determinants_max = determinants[:, k].max()
            line_range_ratio[k] = determinants_max / determinants_min
        range_ratio = line_range_ratio.max() / line_range_ratio.min()
        if range_ratio > 1e2:
            y_scale = "log"

    centroids_palette = style.darker_palette(n_centroids)
    epochs = numpy.arange(n_epochs) + 1

    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.despine()

    for k in range(n_centroids):
        axis.plot(epochs,
                  determinants[:, k],
                  color=centroids_palette[k],
                  label="$k = {}$".format(k))

    axis.set_xlabel("Epochs")
    axis.set_ylabel(y_label)

    axis.set_yscale(y_scale)

    axis.legend(loc="best")

    return figure, figure_name
Example #10
def _axis_label_for_symbol(symbol,
                           coordinate=None,
                           decomposition_method=None,
                           distribution=None,
                           prefix="",
                           suffix=""):

    if decomposition_method:
        decomposition_method = proper_string(
            normalise_string(decomposition_method), DECOMPOSITION_METHOD_NAMES)
        decomposition_label = DECOMPOSITION_METHOD_LABEL[decomposition_method]
    else:
        decomposition_label = ""

    if decomposition_label:
        decomposition_label = "\\mathrm{{{}}}".format(decomposition_label)

    if coordinate:
        coordinate_text = "{{{} {}}}".format(decomposition_label, coordinate)
    else:
        coordinate_text = ""

    if distribution == "prior":
        distribution_symbol = "\\theta"
    elif distribution == "posterior":
        distribution_symbol = "\\phi"
    else:
        distribution_symbol = ""

    if distribution_symbol and coordinate_text:
        distribution_position = "_"
        coordinate_position = "^"
    elif distribution_symbol and not coordinate_text:
        distribution_position = "_"
        coordinate_position = ""
    elif not distribution_symbol and coordinate_text:
        distribution_position = ""
        coordinate_position = "_"
    else:
        distribution_position = ""
        coordinate_position = ""

    if coordinate_position == "^":
        coordinate_text = "{{(" + coordinate_text + ")}}"
    elif coordinate_position == "_":
        coordinate_text = "{{" + coordinate_text + "}}"

    axis_label = "$" + "".join([
        prefix, symbol, distribution_position, distribution_symbol,
        coordinate_position, coordinate_text, suffix
    ]) + "$"

    return axis_label
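
Two worked examples of the label construction above, using arguments that appear elsewhere in this listing and avoiding the decomposition-method lookup (whose dictionaries are not shown here).

_axis_label_for_symbol(symbol="\\pi", distribution="posterior", suffix="^k")
# -> "$\pi_\phi^k$"
_axis_label_for_symbol(symbol="\\Sigma", distribution="prior",
                       prefix="|", suffix="(y = k)|")
# -> "$|\Sigma_\theta(y = k)|$"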
Example #11
def _save_array(array, title, group, tables_file):
    name = normalise_string(title)
    if isinstance(array, list):
        array = numpy.array(array)
        name += "_was_list"
    if array.dtype.char == "U":
        encode = numpy.vectorize(lambda s: s.encode("UTF-8"))
        array = encode(array).astype("S")
    atom = tables.Atom.from_dtype(array.dtype)
    data_store = tables_file.create_carray(group, name, atom, array.shape,
                                           title)
    data_store[:] = array
Example #12
 def _find_list_of_names(list_name_guesses, kind):
     if list_name_guesses is None:
         list_name_guesses = LIST_NAME_GUESSES[kind]
     elif not isinstance(list_name_guesses, list):
         list_name_guesses = [list_name_guesses]
     list_of_names = None
     for list_name_guess in list_name_guesses:
         for table_key in table:
             if list_name_guess == normalise_string(table_key):
                 list_of_names = table[table_key]
         if list_of_names is not None:
             break
     return list_of_names
Example #13
def find_data_set(name, directory):

    data_sets = _load_data_set_metadata()

    title = None
    data_set = None

    json_path = os.path.join(directory, name, name + ".json")
    if os.path.exists(json_path):
        title, data_set = _data_set_from_json_file(json_path)

    if not title:
        for data_set_title, data_set_specifications in data_sets.items():
            if normalise_string(data_set_title) == normalise_string(name):
                title = data_set_title
                data_set = data_set_specifications
                break

    if not title:
        raise KeyError("Data set not found.")

    return title, data_set
Example #14
 def _find_list_of_names(list_name_guesses, kind):
     if list_name_guesses is None:
         list_name_guesses = LIST_NAME_GUESSES[kind]
     elif not isinstance(list_name_guesses, list):
         list_name_guesses = [list_name_guesses]
     list_of_names = None
     for list_name_guess in list_name_guesses:
         for table_key in table:
             if list_name_guess == normalise_string(table_key):
                 list_of_names = table[table_key]
         if list_of_names is not None:
             break
     if list_of_names is None:
         # Fall back to generic names when no matching list is found.
         list_of_names = numpy.array(
             ["{} {}".format(kind, i + 1) for i in range(n[kind])])
     return list_of_names
Example #15
    def default_feature_parameters(self):

        feature_selection_parameters = None

        if self.feature_selection:
            feature_selection = normalise_string(self.feature_selection)

            if feature_selection == "keep_variances_above":
                feature_selection_parameters = [0.5]

            elif feature_selection == "keep_highest_variances":
                if self.number_of_features is not None:
                    feature_selection_parameters = [
                        int(self.number_of_features / 2)
                    ]

        return feature_selection_parameters
Example #16
def save_values(values,
                name,
                row_names=None,
                column_names=None,
                directory=None):

    safe_name = "-".join([normalise_string(part) for part in name.split("-")])
    filename = "{}.tsv.gz".format(safe_name)
    path = os.path.join(directory, filename)

    table = pandas.DataFrame(data=values,
                             index=row_names,
                             columns=column_names)

    if not os.path.exists(directory):
        os.makedirs(directory)

    table.to_csv(path, sep="\t")
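
A hedged sketch of calling `save_values`; the directory and names are illustrative, and it assumes `normalise_string` leaves these lower-case parts unchanged.

import numpy

values = numpy.random.rand(3, 2)
save_values(values,
            name="latent-means",
            row_names=["cell_1", "cell_2", "cell_3"],
            column_names=["z_1", "z_2"],
            directory="analyses")
# Writes a gzip-compressed TSV to analyses/latent-means.tsv.gz.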
Example #17
def build_figure_name(base_name, other_names=None):

    if isinstance(base_name, list):
        if not other_names:
            other_names = []
        other_names.extend(base_name[1:])
        base_name = normalise_string(base_name[0])

    figure_name = base_name

    if other_names:
        if not isinstance(other_names, list):
            other_names = str(other_names)
            other_names = [other_names]
        else:
            other_names = [
                str(name) for name in other_names if name is not None
            ]
        figure_name += "-" + "-".join(map(normalise_string, other_names))

    return figure_name
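
A small sketch of the naming behaviour, assuming `normalise_string` lower-cases and keeps underscores.

build_figure_name("scatter", "PCA")
# -> "scatter-pca"
build_figure_name(["centroids_evolution", "prior", "probabilities"])
# -> "centroids_evolution-prior-probabilities"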
Example #18
def parse_input(input_file_or_name):

    if input_file_or_name.endswith(".json"):

        json_path = input_file_or_name

        with open(json_path, "r") as json_file:
            data_set_dictionary = json.load(json_file)

        name = _base_name(json_path)

        if "URLs" not in data_set_dictionary:

            if "values" in data_set_dictionary:
                json_directory = os.path.dirname(json_path)
                data_set_dictionary["values"] = os.path.join(
                    json_directory, data_set_dictionary["values"])
            else:
                raise KeyError("Missing path or URL to values.")

            if "labels" in data_set_dictionary:
                json_directory = os.path.dirname(json_path)
                data_set_dictionary["labels"] = os.path.join(
                    json_directory, data_set_dictionary["labels"])

    elif os.path.isfile(input_file_or_name):
        file_path = input_file_or_name
        filename = os.path.basename(file_path)
        file_extension = extension(filename)
        data_format = file_extension[1:] if file_extension else None
        name = _base_name(file_path)
        data_set_dictionary = {"values": file_path, "format": data_format}
    else:
        name = input_file_or_name
        name = normalise_string(name)
        data_set_dictionary = None

    return name, data_set_dictionary
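
A hedged sketch of the three input branches; it assumes `_base_name` strips the directory and extension, `extension` returns the suffix with its dot, and `normalise_string` lower-cases and replaces spaces with underscores.

parse_input("data_set.json")            # an existing JSON specification
# -> ("data_set", {...})  values/labels paths resolved relative to the JSON file
parse_input("data/counts.tsv")          # an existing data file
# -> ("counts", {"values": "data/counts.tsv", "format": "tsv"})
parse_input("Sample Data Set")          # neither a JSON file nor an existing path
# -> ("sample_data_set", None)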
Example #19
def _save_feature_mapping(feature_mapping, title, group, tables_file):

    name = normalise_string(title)
    group = tables_file.create_group(group, name, title)

    feature_names = []
    feature_counts = []
    feature_ids = []

    for feature_name, feature_id_set in feature_mapping.items():
        feature_names.append(feature_name)
        feature_counts.append(len(feature_id_set))
        feature_ids.extend(feature_id_set)

    feature_lists = {
        "feature_names": feature_names,
        "feature_counts": feature_counts,
        "feature_ids": feature_ids
    }

    for feature_list_name, feature_list in feature_lists.items():
        feature_list_array = numpy.array(feature_list)
        _save_array(feature_list_array, feature_list_name, group, tables_file)
Example #20
    def __init__(self,
                 method,
                 number_of_clusters=None,
                 training_set_kind=None):

        prediction_method_names = {
            name: specifications["aliases"]
            for name, specifications in PREDICTION_METHODS.items()
        }
        method = proper_string(method, prediction_method_names)

        if method not in PREDICTION_METHODS:
            raise ValueError(
                "Prediction method `{}` not found.".format(method))

        if number_of_clusters is None:
            raise TypeError("Number of clusters not set.")

        self.method = method
        self.number_of_clusters = number_of_clusters

        if training_set_kind:
            training_set_kind = normalise_string(training_set_kind)
        self.training_set_kind = training_set_kind
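
A hedged usage sketch, assuming `PREDICTION_METHODS` contains a "k-means" entry (registered via the decorator in Example #5) and that `normalise_string` lower-cases its input and replaces hyphens with underscores.

specifications = PredictionSpecifications(method="k-means",
                                          number_of_clusters=10,
                                          training_set_kind="Validation")
specifications.method             # -> "k-means"
specifications.training_set_kind  # -> "validation"
specifications.name               # -> e.g. "kmeans_10_validation" (see Example #6)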
Example #21
def plot_centroid_probabilities_evolution(probabilities,
                                          distribution,
                                          linestyle="solid",
                                          name=None):

    distribution = normalise_string(distribution)

    y_label = _axis_label_for_symbol(symbol="\\pi",
                                     distribution=distribution,
                                     suffix="^k")

    figure_name = "centroids_evolution-{}-probabilities".format(distribution)
    figure_name = saving.build_figure_name(figure_name, name)

    n_epochs, n_centroids = probabilities.shape

    centroids_palette = style.darker_palette(n_centroids)
    epochs = numpy.arange(n_epochs) + 1

    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.despine()

    for k in range(n_centroids):
        axis.plot(epochs,
                  probabilities[:, k],
                  color=centroids_palette[k],
                  linestyle=linestyle,
                  label="$k = {}$".format(k))

    axis.set_xlabel("Epochs")
    axis.set_ylabel(y_label)

    axis.legend(loc="best")

    return figure, figure_name
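
A sketch of driving the plotting function above with synthetic data; it assumes the module's `style`, `saving`, `seaborn`, and `pyplot` are available as in the original code.

import numpy

n_epochs, n_centroids = 50, 5
probabilities = numpy.random.dirichlet(numpy.ones(n_centroids), size=n_epochs)

figure, figure_name = plot_centroid_probabilities_evolution(
    probabilities, distribution="prior")
figure.savefig(figure_name + ".png")
# figure_name -> "centroids_evolution-prior-probabilities"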
Example #22
def plot_centroid_means_evolution(means,
                                  distribution,
                                  decomposed=False,
                                  name=None):

    symbol = "\\mu"
    if decomposed:
        decomposition_method = "PCA"
    else:
        decomposition_method = ""
    distribution = normalise_string(distribution)
    suffix = "(y = k)"

    x_label = _axis_label_for_symbol(symbol=symbol,
                                     coordinate=1,
                                     decomposition_method=decomposition_method,
                                     distribution=distribution,
                                     suffix=suffix)
    y_label = _axis_label_for_symbol(symbol=symbol,
                                     coordinate=2,
                                     decomposition_method=decomposition_method,
                                     distribution=distribution,
                                     suffix=suffix)

    figure_name = "centroids_evolution-{}-means".format(distribution)
    figure_name = saving.build_figure_name(figure_name, name)

    n_epochs, n_centroids, latent_size = means.shape

    if latent_size != 2:
        raise ValueError("Means must be two-dimensional to be plotted.")

    centroids_palette = style.darker_palette(n_centroids)
    epochs = numpy.arange(n_epochs) + 1

    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.despine()

    colour_bar_scatter_plot = axis.scatter(means[:, 0, 0],
                                           means[:, 0, 1],
                                           c=epochs,
                                           cmap=seaborn.dark_palette(
                                               style.NEUTRAL_COLOUR,
                                               as_cmap=True),
                                           zorder=0)

    for k in range(n_centroids):
        colour = centroids_palette[k]
        colour_map = seaborn.dark_palette(colour, as_cmap=True)
        axis.plot(means[:, k, 0],
                  means[:, k, 1],
                  color=colour,
                  label="$k = {}$".format(k),
                  zorder=k + 1)
        axis.scatter(means[:, k, 0],
                     means[:, k, 1],
                     c=epochs,
                     cmap=colour_map,
                     zorder=n_centroids + k + 1)

    axis.legend(loc="best")

    colour_bar = figure.colorbar(colour_bar_scatter_plot)
    colour_bar.outline.set_linewidth(0)
    colour_bar.set_label("Epochs")

    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)

    return figure, figure_name
Example #23
def select_features(values_dictionary, feature_names, method=None,
                    parameters=None):

    method = normalise_string(method)

    print("Selecting features.")
    start_time = time()

    if isinstance(values_dictionary, dict):
        values = values_dictionary["original"]

    n_examples, n_features = values.shape

    if method == "remove_zeros":
        total_feature_sum = values.sum(axis=0)
        if isinstance(total_feature_sum, numpy.matrix):
            total_feature_sum = total_feature_sum.A.squeeze()
        indices = total_feature_sum != 0

    elif method == "keep_variances_above":
        variances = values.var(axis=0)
        if isinstance(variances, numpy.matrix):
            variances = variances.A.squeeze()
        if parameters:
            threshold = float(parameters[0])
        else:
            threshold = 0.5
        indices = variances > threshold

    elif method == "keep_highest_variances":
        variances = values.var(axis=0)
        if isinstance(variances, numpy.matrix):
            variances = variances.A.squeeze()
        variance_sorted_indices = numpy.argsort(variances)
        if parameters:
            number_to_keep = int(parameters[0])
        else:
            number_to_keep = int(n_features / 2)
        indices = numpy.sort(variance_sorted_indices[-number_to_keep:])

    else:
        raise ValueError(
            "Feature selection `{}` not found.".format(method))

    if method:
        error = Exception(
            "No features excluded using feature selection {}.".format(method))
        if indices.dtype == "bool" and all(indices):
            raise error
        elif indices.dtype != "bool" and len(indices) == n_features:
            raise error

    feature_selected_values = {}

    for version, values in values_dictionary.items():
        if values is not None:
            feature_selected_values[version] = values[:, indices]
        else:
            feature_selected_values[version] = None

    feature_selected_feature_names = feature_names[indices]

    n_features_changed = len(feature_selected_feature_names)

    duration = time() - start_time
    print("{} features selected, {} excluded ({}).".format(
        n_features_changed,
        n_features - n_features_changed,
        format_duration(duration)
    ))

    return feature_selected_values, feature_selected_feature_names
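
A small hedged example of the "remove_zeros" branch, assuming `normalise_string` leaves "remove_zeros" unchanged and `format_duration` is available.

import numpy

values_dictionary = {"original": numpy.array([[0, 1, 2],
                                              [0, 3, 4]])}
feature_names = numpy.array(["f1", "f2", "f3"])

selected_values, selected_names = select_features(
    values_dictionary, feature_names, method="remove_zeros")
# selected_values["original"] -> [[1, 2], [3, 4]]
# selected_names -> ["f2", "f3"]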
Example #24
def plot_values(values,
                colour_coding=None,
                colouring_data_set=None,
                centroids=None,
                sampled_values=None,
                class_name=None,
                feature_index=None,
                figure_labels=None,
                example_tag=None,
                name="scatter"):

    figure_name = name

    if figure_labels:
        title = figure_labels.get("title")
        x_label = figure_labels.get("x label")
        y_label = figure_labels.get("y label")
    else:
        title = "none"
        x_label = "$x$"
        y_label = "$y$"

    if not title:
        title = "none"

    figure_name += "-" + normalise_string(title)

    if colour_coding:
        colour_coding = normalise_string(colour_coding)
        figure_name += "-" + colour_coding
        if colouring_data_set is None:
            raise ValueError("Colouring data set not given.")
        if "predicted" in colour_coding:
            if colouring_data_set.prediction_specifications:
                figure_name += "-" + (
                    colouring_data_set.prediction_specifications.name)
            else:
                figure_name += "-unknown_prediction_method"

    if sampled_values is not None:
        figure_name += "-samples"

    values = values.copy()[:, :2]
    if scipy.sparse.issparse(values):
        values = values.A

    # Randomise examples in values to remove any prior order
    n_examples, __ = values.shape
    random_state = numpy.random.RandomState(117)
    shuffled_indices = random_state.permutation(n_examples)
    values = values[shuffled_indices]

    # Adjust marker size based on number of examples
    style._adjust_marker_size_for_scatter_plots(n_examples)

    figure = pyplot.figure()
    axis = figure.add_subplot(1, 1, 1)
    seaborn.despine()

    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)

    colour_map = seaborn.dark_palette(style.STANDARD_PALETTE[0], as_cmap=True)

    alpha = 1
    if sampled_values is not None:
        alpha = 0.5

    if colour_coding and ("labels" in colour_coding or "ids" in colour_coding
                          or "class" in colour_coding
                          or colour_coding == "batches"):

        if colour_coding == "predicted_cluster_ids":
            labels = colouring_data_set.predicted_cluster_ids
            class_names = numpy.unique(labels).tolist()
            number_of_classes = len(class_names)
            class_palette = None
            label_sorter = None
        elif colour_coding == "predicted_labels":
            labels = colouring_data_set.predicted_labels
            class_names = colouring_data_set.predicted_class_names
            number_of_classes = colouring_data_set.number_of_predicted_classes
            class_palette = colouring_data_set.predicted_class_palette
            label_sorter = colouring_data_set.predicted_label_sorter
        elif colour_coding == "predicted_superset_labels":
            labels = colouring_data_set.predicted_superset_labels
            class_names = colouring_data_set.predicted_superset_class_names
            number_of_classes = (
                colouring_data_set.number_of_predicted_superset_classes)
            class_palette = colouring_data_set.predicted_superset_class_palette
            label_sorter = colouring_data_set.predicted_superset_label_sorter
        elif "superset" in colour_coding:
            labels = colouring_data_set.superset_labels
            class_names = colouring_data_set.superset_class_names
            number_of_classes = colouring_data_set.number_of_superset_classes
            class_palette = colouring_data_set.superset_class_palette
            label_sorter = colouring_data_set.superset_label_sorter
        elif colour_coding == "batches":
            labels = colouring_data_set.batch_indices.flatten()
            class_names = colouring_data_set.batch_names
            number_of_classes = colouring_data_set.number_of_batches
            class_palette = None
            label_sorter = None
        else:
            labels = colouring_data_set.labels
            class_names = colouring_data_set.class_names
            number_of_classes = colouring_data_set.number_of_classes
            class_palette = colouring_data_set.class_palette
            label_sorter = colouring_data_set.label_sorter

        if not class_palette:
            index_palette = style.lighter_palette(number_of_classes)
            class_palette = {
                class_name: index_palette[i]
                for i, class_name in enumerate(
                    sorted(class_names, key=label_sorter))
            }

        # Examples are shuffled, so should their labels be
        labels = labels[shuffled_indices]

        if ("labels" in colour_coding or "ids" in colour_coding
                or colour_coding == "batches"):
            colours = []
            classes = set()

            for i, label in enumerate(labels):
                colour = class_palette[label]
                colours.append(colour)

                # Plot one example for each class to add labels
                if label not in classes:
                    classes.add(label)
                    axis.scatter(values[i, 0],
                                 values[i, 1],
                                 color=colour,
                                 label=label,
                                 alpha=alpha)

            axis.scatter(values[:, 0], values[:, 1], c=colours, alpha=alpha)

            class_handles, class_labels = axis.get_legend_handles_labels()

            if class_labels:
                class_labels, class_handles = zip(
                    *sorted(zip(class_labels, class_handles),
                            key=(lambda t: label_sorter(t[0])
                                 ) if label_sorter else None))
                class_label_maximum_width = max(map(len, class_labels))
                if class_label_maximum_width <= 5 and number_of_classes <= 20:
                    axis.legend(class_handles, class_labels, loc="best")
                else:
                    if number_of_classes <= 20:
                        class_label_columns = 2
                    else:
                        class_label_columns = 3
                    axis.legend(
                        class_handles,
                        class_labels,
                        bbox_to_anchor=(-0.1, 1.05, 1.1, 0.95),
                        loc="lower left",
                        ncol=class_label_columns,
                        mode="expand",
                        borderaxespad=0.,
                    )

        elif "class" in colour_coding:
            colours = []
            figure_name += "-" + normalise_string(str(class_name))
            ordered_indices_set = {str(class_name): [], "Remaining": []}

            for i, label in enumerate(labels):
                if label == class_name:
                    colour = class_palette[label]
                    ordered_indices_set[str(class_name)].append(i)
                else:
                    colour = style.NEUTRAL_COLOUR
                    ordered_indices_set["Remaining"].append(i)
                colours.append(colour)

            colours = numpy.array(colours)

            z_order_index = 1
            for label, ordered_indices in sorted(ordered_indices_set.items()):
                if label == "Remaining":
                    z_order = 0
                else:
                    z_order = z_order_index
                    z_order_index += 1
                ordered_values = values[ordered_indices]
                ordered_colours = colours[ordered_indices]
                axis.scatter(ordered_values[:, 0],
                             ordered_values[:, 1],
                             c=ordered_colours,
                             label=label,
                             alpha=alpha,
                             zorder=z_order)

                handles, labels = axis.get_legend_handles_labels()
                labels, handles = zip(*sorted(zip(labels, handles),
                                              key=lambda t: label_sorter(t[0])
                                              if label_sorter else None))
                axis.legend(handles,
                            labels,
                            bbox_to_anchor=(-0.1, 1.05, 1.1, 0.95),
                            loc="lower left",
                            ncol=2,
                            mode="expand",
                            borderaxespad=0.)

    elif colour_coding == "count_sum":

        n = colouring_data_set.count_sum[shuffled_indices].flatten()
        scatter_plot = axis.scatter(values[:, 0],
                                    values[:, 1],
                                    c=n,
                                    cmap=colour_map,
                                    alpha=alpha)
        colour_bar = figure.colorbar(scatter_plot)
        colour_bar.outline.set_linewidth(0)
        colour_bar.set_label("Total number of {}s per {}".format(
            colouring_data_set.terms["item"],
            colouring_data_set.terms["example"]))

    elif colour_coding == "feature":
        if feature_index is None:
            raise ValueError("Feature number not given.")
        if feature_index > colouring_data_set.number_of_features:
            raise ValueError("Feature number higher than number of features.")

        feature_name = colouring_data_set.feature_names[feature_index]
        figure_name += "-{}".format(normalise_string(feature_name))

        f = colouring_data_set.values[shuffled_indices, feature_index]
        if scipy.sparse.issparse(f):
            f = f.A
        f = f.squeeze()

        scatter_plot = axis.scatter(values[:, 0],
                                    values[:, 1],
                                    c=f,
                                    cmap=colour_map,
                                    alpha=alpha)
        colour_bar = figure.colorbar(scatter_plot)
        colour_bar.outline.set_linewidth(0)
        colour_bar.set_label(feature_name)

    elif colour_coding is None:
        axis.scatter(values[:, 0],
                     values[:, 1],
                     c="k",
                     alpha=alpha,
                     edgecolors="none")

    else:
        raise ValueError("Colour coding `{}` not found.".format(colour_coding))

    if centroids:
        prior_centroids = centroids["prior"]

        if prior_centroids:
            n_centroids = prior_centroids["probabilities"].shape[0]
        else:
            n_centroids = 0

        if n_centroids > 1:
            centroids_palette = style.darker_palette(n_centroids)
            classes = numpy.arange(n_centroids)

            means = prior_centroids["means"]
            covariance_matrices = prior_centroids["covariance_matrices"]

            for k in range(n_centroids):
                axis.scatter(means[k, 0],
                             means[k, 1],
                             s=60,
                             marker="x",
                             color="black",
                             linewidth=3)
                axis.scatter(means[k, 0],
                             means[k, 1],
                             marker="x",
                             facecolor=centroids_palette[k],
                             edgecolors="black")
                ellipse_fill, ellipse_edge = _covariance_matrix_as_ellipse(
                    covariance_matrices[k],
                    means[k],
                    colour=centroids_palette[k])
                axis.add_patch(ellipse_edge)
                axis.add_patch(ellipse_fill)

    if sampled_values is not None:

        sampled_values = sampled_values.copy()[:, :2]
        if scipy.sparse.issparse(sampled_values):
            sampled_values = sampled_values.A

        sample_colour_map = seaborn.blend_palette(("white", "purple"),
                                                  as_cmap=True)

        x_limits = axis.get_xlim()
        y_limits = axis.get_ylim()

        axis.hexbin(sampled_values[:, 0],
                    sampled_values[:, 1],
                    gridsize=75,
                    cmap=sample_colour_map,
                    linewidths=0.,
                    edgecolors="none",
                    zorder=-100)

        axis.set_xlim(x_limits)
        axis.set_ylim(y_limits)

    # Reset marker size
    style.reset_plot_look()

    return figure, figure_name
Example #25
def _setup_model(data_set,
                 model_type=None,
                 latent_size=None,
                 hidden_sizes=None,
                 number_of_importance_samples=None,
                 number_of_monte_carlo_samples=None,
                 inference_architecture=None,
                 latent_distribution=None,
                 number_of_classes=None,
                 parameterise_latent_posterior=False,
                 prior_probabilities_method=None,
                 generative_architecture=None,
                 reconstruction_distribution=None,
                 number_of_reconstruction_classes=None,
                 count_sum=None,
                 proportion_of_free_nats_for_y_kl_divergence=None,
                 minibatch_normalisation=None,
                 batch_correction=None,
                 dropout_keep_probabilities=None,
                 number_of_warm_up_epochs=None,
                 kl_weight=None,
                 models_directory=None):

    if model_type is None:
        model_type = defaults["model"]["type"]
    if batch_correction is None:
        batch_correction = defaults["model"]["batch_correction"]

    feature_size = data_set.number_of_features
    number_of_batches = data_set.number_of_batches

    if not data_set.has_batches:
        batch_correction = False

    if normalise_string(model_type) == "vae":
        model = VariationalAutoencoder(
            feature_size=feature_size,
            latent_size=latent_size,
            hidden_sizes=hidden_sizes,
            number_of_monte_carlo_samples=number_of_monte_carlo_samples,
            number_of_importance_samples=number_of_importance_samples,
            inference_architecture=inference_architecture,
            latent_distribution=latent_distribution,
            number_of_latent_clusters=number_of_classes,
            parameterise_latent_posterior=parameterise_latent_posterior,
            generative_architecture=generative_architecture,
            reconstruction_distribution=reconstruction_distribution,
            number_of_reconstruction_classes=number_of_reconstruction_classes,
            minibatch_normalisation=minibatch_normalisation,
            batch_correction=batch_correction,
            number_of_batches=number_of_batches,
            dropout_keep_probabilities=dropout_keep_probabilities,
            count_sum=count_sum,
            number_of_warm_up_epochs=number_of_warm_up_epochs,
            kl_weight=kl_weight,
            log_directory=models_directory)

    elif normalise_string(model_type) == "gmvae":
        prior_probabilities_method_for_model = prior_probabilities_method
        if prior_probabilities_method == "uniform":
            prior_probabilities = None
        elif prior_probabilities_method == "infer":
            prior_probabilities_method_for_model = "custom"
            prior_probabilities = data_set.class_probabilities
        else:
            prior_probabilities = None

        model = GaussianMixtureVariationalAutoencoder(
            feature_size=feature_size,
            latent_size=latent_size,
            hidden_sizes=hidden_sizes,
            number_of_monte_carlo_samples=number_of_monte_carlo_samples,
            number_of_importance_samples=number_of_importance_samples,
            prior_probabilities_method=prior_probabilities_method_for_model,
            prior_probabilities=prior_probabilities,
            latent_distribution=latent_distribution,
            number_of_latent_clusters=number_of_classes,
            proportion_of_free_nats_for_y_kl_divergence=(
                proportion_of_free_nats_for_y_kl_divergence),
            reconstruction_distribution=reconstruction_distribution,
            number_of_reconstruction_classes=number_of_reconstruction_classes,
            minibatch_normalisation=minibatch_normalisation,
            batch_correction=batch_correction,
            number_of_batches=number_of_batches,
            dropout_keep_probabilities=dropout_keep_probabilities,
            count_sum=count_sum,
            number_of_warm_up_epochs=number_of_warm_up_epochs,
            kl_weight=kl_weight,
            log_directory=models_directory)

    else:
        raise ValueError("Model type not found: `{}`.".format(model_type))

    return model
Example #26
def evaluate(data_set_file_or_name,
             data_format=None,
             data_directory=None,
             map_features=None,
             feature_selection=None,
             example_filter=None,
             noisy_preprocessing_methods=None,
             preprocessing_methods=None,
             split_data_set=None,
             splitting_method=None,
             splitting_fraction=None,
             model_type=None,
             latent_size=None,
             hidden_sizes=None,
             number_of_importance_samples=None,
             number_of_monte_carlo_samples=None,
             inference_architecture=None,
             latent_distribution=None,
             number_of_classes=None,
             parameterise_latent_posterior=False,
             prior_probabilities_method=None,
             generative_architecture=None,
             reconstruction_distribution=None,
             number_of_reconstruction_classes=None,
             count_sum=None,
             proportion_of_free_nats_for_y_kl_divergence=None,
             minibatch_normalisation=None,
             batch_correction=None,
             dropout_keep_probabilities=None,
             number_of_warm_up_epochs=None,
             kl_weight=None,
             minibatch_size=None,
             run_id=None,
             models_directory=None,
             included_analyses=None,
             analysis_level=None,
             decomposition_methods=None,
             highlight_feature_indices=None,
             export_options=None,
             analyses_directory=None,
             evaluation_set_kind=None,
             sample_size=None,
             prediction_method=None,
             prediction_training_set_kind=None,
             model_versions=None,
             **keyword_arguments):
    """Evaluate model on data set."""

    if split_data_set is None:
        split_data_set = defaults["data"]["split_data_set"]
    if splitting_method is None:
        splitting_method = defaults["data"]["splitting_method"]
    if splitting_fraction is None:
        splitting_fraction = defaults["data"]["splitting_fraction"]
    if models_directory is None:
        models_directory = defaults["models"]["directory"]
    if evaluation_set_kind is None:
        evaluation_set_kind = defaults["evaluation"]["data_set_name"]
    if sample_size is None:
        sample_size = defaults["models"]["sample_size"]
    if prediction_method is None:
        prediction_method = defaults["evaluation"]["prediction_method"]
    if prediction_training_set_kind is None:
        prediction_training_set_kind = defaults["evaluation"][
            "prediction_training_set_kind"]
    if model_versions is None:
        model_versions = defaults["evaluation"]["model_versions"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    evaluation_set_kind = normalise_string(evaluation_set_kind)
    prediction_training_set_kind = normalise_string(
        prediction_training_set_kind)
    model_versions = parse_model_versions(model_versions)

    print(title("Data"))

    binarise_values = False
    if reconstruction_distribution == "bernoulli":
        if noisy_preprocessing_methods:
            if noisy_preprocessing_methods[-1] != "binarise":
                noisy_preprocessing_methods.append("binarise")
        else:
            binarise_values = True

    data_set = DataSet(data_set_file_or_name,
                       data_format=data_format,
                       directory=data_directory,
                       map_features=map_features,
                       feature_selection=feature_selection,
                       example_filter=example_filter,
                       preprocessing_methods=preprocessing_methods,
                       binarise_values=binarise_values,
                       noisy_preprocessing_methods=noisy_preprocessing_methods)

    if not split_data_set or evaluation_set_kind == "full":
        data_set.load()

    if split_data_set:
        training_set, validation_set, test_set = data_set.split(
            method=splitting_method, fraction=splitting_fraction)
        data_subsets = [data_set, training_set, validation_set, test_set]
        for data_subset in data_subsets:
            clear_data_subset = True
            if data_subset.kind == evaluation_set_kind:
                evaluation_set = data_subset
                clear_data_subset = False
            if data_subset.kind == prediction_training_set_kind:
                prediction_training_set = data_subset
                clear_data_subset = False
            if clear_data_subset:
                data_subset.clear()
    else:
        splitting_method = None
        splitting_fraction = None
        evaluation_set = data_set
        prediction_training_set = data_set

    evaluation_subset_indices = indices_for_evaluation_subset(evaluation_set)

    models_directory = build_directory_path(
        models_directory,
        data_set=evaluation_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)
    analyses_directory = build_directory_path(
        analyses_directory,
        data_set=evaluation_set,
        splitting_method=splitting_method,
        splitting_fraction=splitting_fraction)

    print(title("Model"))

    if number_of_classes is None:
        if evaluation_set.has_labels:
            number_of_classes = (evaluation_set.number_of_classes -
                                 evaluation_set.number_of_excluded_classes)

    model = _setup_model(
        data_set=evaluation_set,
        model_type=model_type,
        latent_size=latent_size,
        hidden_sizes=hidden_sizes,
        number_of_importance_samples=number_of_importance_samples,
        number_of_monte_carlo_samples=number_of_monte_carlo_samples,
        inference_architecture=inference_architecture,
        latent_distribution=latent_distribution,
        number_of_classes=number_of_classes,
        parameterise_latent_posterior=parameterise_latent_posterior,
        prior_probabilities_method=prior_probabilities_method,
        generative_architecture=generative_architecture,
        reconstruction_distribution=reconstruction_distribution,
        number_of_reconstruction_classes=number_of_reconstruction_classes,
        count_sum=count_sum,
        proportion_of_free_nats_for_y_kl_divergence=(
            proportion_of_free_nats_for_y_kl_divergence),
        minibatch_normalisation=minibatch_normalisation,
        batch_correction=batch_correction,
        dropout_keep_probabilities=dropout_keep_probabilities,
        number_of_warm_up_epochs=number_of_warm_up_epochs,
        kl_weight=kl_weight,
        models_directory=models_directory)

    if ("best_model" in model_versions
            and not better_model_exists(model, run_id=run_id)):
        model_versions.remove("best_model")

    if ("early_stopping" in model_versions
            and not model_stopped_early(model, run_id=run_id)):
        model_versions.remove("early_stopping")

    print(subtitle("Analysis"))

    analyses.analyse_model(model=model,
                           run_id=run_id,
                           included_analyses=included_analyses,
                           analysis_level=analysis_level,
                           export_options=export_options,
                           analyses_directory=analyses_directory)

    print(title("Results"))

    print("Evaluation set: {} set.".format(evaluation_set.kind))
    print("Model version{}: {}.".format(
        "" if len(model_versions) == 1 else "s",
        enumerate_strings([v.replace("_", " ") for v in model_versions],
                          conjunction="and")))

    if prediction_method:
        prediction_specifications = PredictionSpecifications(
            method=prediction_method,
            number_of_clusters=number_of_classes,
            training_set_kind=prediction_training_set.kind)
        print("Prediction method: {}.".format(
            prediction_specifications.method))
        print("Number of clusters: {}.".format(
            prediction_specifications.number_of_clusters))
        print("Prediction training set: {} set.".format(
            prediction_specifications.training_set_kind))

    print()

    for model_version in model_versions:

        use_best_model = False
        use_early_stopping_model = False
        if model_version == "best_model":
            use_best_model = True
        elif model_version == "early_stopping":
            use_early_stopping_model = True

        print(subtitle(model_version.replace("_", " ").capitalize()))

        print(
            heading("{} evaluation".format(
                model_version.replace("_", "-").capitalize())))

        (transformed_evaluation_set, reconstructed_evaluation_set,
         latent_evaluation_sets) = model.evaluate(
             evaluation_set=evaluation_set,
             evaluation_subset_indices=evaluation_subset_indices,
             minibatch_size=minibatch_size,
             run_id=run_id,
             use_best_model=use_best_model,
             use_early_stopping_model=use_early_stopping_model,
             output_versions="all")
        print()

        if sample_size:
            print(
                heading("{} sampling".format(
                    model_version.replace("_", "-").capitalize())))

            sample_reconstruction_set, __ = model.sample(
                sample_size=sample_size,
                minibatch_size=minibatch_size,
                run_id=run_id,
                use_best_model=use_best_model,
                use_early_stopping_model=use_early_stopping_model)
            print()
        else:
            sample_reconstruction_set = None

        if prediction_method:
            print(
                heading("{} prediction".format(
                    model_version.replace("_", "-").capitalize())))

            latent_prediction_training_sets = model.evaluate(
                evaluation_set=prediction_training_set,
                minibatch_size=minibatch_size,
                run_id=run_id,
                use_best_model=use_best_model,
                use_early_stopping_model=use_early_stopping_model,
                output_versions="latent",
                log_results=False)
            print()

            cluster_ids, predicted_labels, predicted_superset_labels = (
                predict_labels(
                    training_set=latent_prediction_training_sets["z"],
                    evaluation_set=latent_evaluation_sets["z"],
                    specifications=prediction_specifications))

            evaluation_set_versions = [
                transformed_evaluation_set, reconstructed_evaluation_set
            ] + list(latent_evaluation_sets.values())

            for evaluation_set_version in evaluation_set_versions:
                evaluation_set_version.update_predictions(
                    prediction_specifications=prediction_specifications,
                    predicted_cluster_ids=cluster_ids,
                    predicted_labels=predicted_labels,
                    predicted_superset_labels=predicted_superset_labels)
            print()

        print(
            heading("{} analysis".format(
                model_version.replace("_", "-").capitalize())))

        analyses.analyse_results(
            evaluation_set=transformed_evaluation_set,
            reconstructed_evaluation_set=reconstructed_evaluation_set,
            latent_evaluation_sets=latent_evaluation_sets,
            model=model,
            run_id=run_id,
            sample_reconstruction_set=sample_reconstruction_set,
            decomposition_methods=decomposition_methods,
            evaluation_subset_indices=evaluation_subset_indices,
            highlight_feature_indices=highlight_feature_indices,
            best_model=use_best_model,
            early_stopping=use_early_stopping_model,
            included_analyses=included_analyses,
            analysis_level=analysis_level,
            export_options=export_options,
            analyses_directory=analyses_directory)

    return 0
Example #27
def decompose(values,
              other_value_sets={},
              centroids={},
              method=None,
              number_of_components=None,
              random=False):

    if method is None:
        method = defaults["decomposition_method"]
    method = proper_string(normalise_string(method),
                           DECOMPOSITION_METHOD_NAMES)

    if number_of_components is None:
        number_of_components = defaults["decomposition_dimensionality"]

    other_values_provided_as_dictionary = True
    if other_value_sets is not None and not isinstance(other_value_sets, dict):
        # Wrap a bare array in a dictionary so it can be handled uniformly.
        other_value_sets = {"unknown": other_value_sets}
        other_values_provided_as_dictionary = False

    if random:
        random_state = None
    else:
        random_state = 42

    if method == "PCA":
        if (values.shape[1] <= MAXIMUM_FEATURE_SIZE_FOR_NORMAL_PCA
                and not scipy.sparse.issparse(values)):
            model = PCA(n_components=number_of_components)
        else:
            model = IncrementalPCA(n_components=number_of_components,
                                   batch_size=100)
    elif method == "SVD":
        model = TruncatedSVD(n_components=number_of_components)
    elif method == "ICA":
        model = FastICA(n_components=number_of_components)
    elif method == "t-SNE":
        if number_of_components < 4:
            tsne_method = "barnes_hut"
        else:
            tsne_method = "exact"
        model = TSNE(n_components=number_of_components,
                     method=tsne_method,
                     random_state=random_state)
    else:
        raise ValueError("Method `{}` not found.".format(method))

    values_decomposed = model.fit_transform(values)

    if other_value_sets and method != "t-SNE":
        other_value_sets_decomposed = {}
        for other_set_name, other_values in other_value_sets.items():
            if other_values is not None:
                other_value_decomposed = model.transform(other_values)
            else:
                other_value_decomposed = None
            other_value_sets_decomposed[other_set_name] = (
                other_value_decomposed)
    else:
        other_value_sets_decomposed = None

    if other_value_sets_decomposed and not other_values_provided_as_dictionary:
        other_value_sets_decomposed = other_value_sets_decomposed["unknown"]

    # Only supports centroids without data sets as top levels
    if centroids is not None and method == "PCA":
        if "means" in centroids:
            centroids = {"unknown": centroids}
        components = model.components_
        centroids_decomposed = {}
        for distribution, distribution_centroids in centroids.items():
            if distribution_centroids:
                centroids_distribution_decomposed = {}
                for parameter, parameter_values in (
                        distribution_centroids.items()):
                    if parameter == "means":
                        shape = numpy.array(parameter_values.shape)
                        original_dimension = shape[-1]
                        reshaped_parameter_values = parameter_values.reshape(
                            -1, original_dimension)
                        decomposed_parameter_values = model.transform(
                            reshaped_parameter_values)
                        shape[-1] = number_of_components
                        new_parameter_values = (
                            decomposed_parameter_values.reshape(shape))
                    elif parameter == "covariance_matrices":
                        shape = numpy.array(parameter_values.shape)
                        original_dimension = shape[-1]
                        reshaped_parameter_values = parameter_values.reshape(
                            -1, original_dimension, original_dimension)
                        n_centroids = reshaped_parameter_values.shape[0]
                        decomposed_parameter_values = numpy.empty(
                            shape=(n_centroids, number_of_components,
                                   number_of_components))
                        for i in range(n_centroids):
                            # Project each covariance matrix onto the
                            # principal components: C' = W C W^T.
                            decomposed_parameter_values[i] = (
                                components @ reshaped_parameter_values[i]
                                @ components.T)
                        shape[-2:] = number_of_components
                        new_parameter_values = (
                            decomposed_parameter_values.reshape(shape))
                    else:
                        new_parameter_values = parameter_values
                    centroids_distribution_decomposed[parameter] = (
                        new_parameter_values)
                centroids_decomposed[distribution] = (
                    centroids_distribution_decomposed)
            else:
                centroids_decomposed[distribution] = None
        if "unknown" in centroids_decomposed:
            centroids_decomposed = centroids_decomposed["unknown"]
    else:
        centroids_decomposed = None

    output = [values_decomposed]

    if other_value_sets != {}:
        output.append(other_value_sets_decomposed)

    if centroids != {}:
        output.append(centroids_decomposed)

    return output
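
A minimal usage sketch for the `decompose` function above, with hypothetical toy data. It assumes the function and its module-level dependencies (`defaults`, `DECOMPOSITION_METHOD_NAMES`, the scikit-learn models) are in scope.

import numpy

# Hypothetical data: any 2-D array works as the main values.
random_state = numpy.random.RandomState(0)
values = random_state.normal(size=(100, 50))
validation_values = random_state.normal(size=(20, 50))

# Decompose the main values to two PCA components and project the validation
# values with the same fitted model.
values_2d, other_sets_2d, centroids_2d = decompose(
    values,
    other_value_sets={"validation": validation_values},
    centroids=None,
    method="PCA",
    number_of_components=2)

print(values_2d.shape)                    # (100, 2)
print(other_sets_2d["validation"].shape)  # (20, 2)
print(centroids_2d)                       # None, since no centroids were given
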
Example #28
0
def analyse_centroid_probabilities(centroids,
                                   name=None,
                                   analysis_level=None,
                                   export_options=None,
                                   analyses_directory=None):

    if name:
        name = normalise_string(name)
    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    print("Plotting centroid probabilities.")
    plot_time_start = time()

    posterior_probabilities = None
    prior_probabilities = None
    n_centroids = None

    if "posterior" in centroids and centroids["posterior"]:
        posterior_probabilities = centroids["posterior"]["probabilities"]
        n_centroids = len(posterior_probabilities)
    if "prior" in centroids and centroids["prior"]:
        prior_probabilities = centroids["prior"]["probabilities"]
        n_centroids = len(prior_probabilities)

    if n_centroids is None:
        raise ValueError(
            "No posterior or prior probabilities found in `centroids`.")

    centroids_palette = style.darker_palette(n_centroids)
    x_label = "$k$"
    if prior_probabilities is not None:
        if posterior_probabilities is not None:
            y_label = _axis_label_for_symbol(
                symbol="\\pi",
                distribution=normalise_string("posterior"),
                suffix="^k")
            if name:
                plot_name = [name, "posterior", "prior"]
            else:
                plot_name = ["posterior", "prior"]
        else:
            y_label = _axis_label_for_symbol(
                symbol="\\pi",
                distribution=normalise_string("prior"),
                suffix="^k")
            if name:
                plot_name = [name, "prior"]
            else:
                plot_name = "prior"
    elif posterior_probabilities is not None:
        y_label = _axis_label_for_symbol(
            symbol="\\pi",
            distribution=normalise_string("posterior"),
            suffix="^k")
        if name:
            plot_name = [name, "posterior"]
        else:
            plot_name = "posterior"

    figure, figure_name = figures.plot_probabilities(posterior_probabilities,
                                                     prior_probabilities,
                                                     x_label=x_label,
                                                     y_label=y_label,
                                                     palette=centroids_palette,
                                                     uniform=False,
                                                     name=plot_name)
    figures.save_figure(figure=figure,
                        name=figure_name,
                        options=export_options,
                        directory=analyses_directory)

    plot_duration = time() - plot_time_start
    print("Centroid probabilities plotted and saved ({}).".format(
        format_duration(plot_duration)))
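
A hedged sketch of the `centroids` structure that `analyse_centroid_probabilities` expects, with hypothetical probabilities; the `figures`, `style` and `defaults` helpers it calls are assumed to come from the surrounding package.

import numpy

# Hypothetical prior and posterior mixture probabilities for five components.
centroids = {
    "prior": {"probabilities": numpy.full(5, 0.2)},
    "posterior": {"probabilities": numpy.array([0.4, 0.3, 0.2, 0.05, 0.05])}
}

analyse_centroid_probabilities(
    centroids,
    name="gmvae",                   # hypothetical model name
    analyses_directory="analyses")  # hypothetical output directory
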
Example #29
0
def analyse_decompositions(data_sets,
                           other_data_sets=None,
                           centroids=None,
                           colouring_data_set=None,
                           sampled_data_set=None,
                           decomposition_methods=None,
                           highlight_feature_indices=None,
                           symbol=None,
                           title="data set",
                           specifier=None,
                           analysis_level=None,
                           export_options=None,
                           analyses_directory=None):

    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]

    centroids_original = centroids

    if isinstance(data_sets, dict):
        data_sets = list(data_sets.values())

    if not isinstance(data_sets, (list, tuple)):
        data_sets = [data_sets]

    if other_data_sets is None:
        other_data_sets = [None] * len(data_sets)
    elif not isinstance(other_data_sets, (list, tuple)):
        other_data_sets = [other_data_sets]

    if len(data_sets) != len(other_data_sets):
        raise ValueError(
            "Lists of data sets and alternative data sets do not have the "
            "same length.")

    specification = None

    base_symbol = symbol

    original_title = title

    if decomposition_methods is None:
        decomposition_methods = [defaults["decomposition_method"]]
    elif not isinstance(decomposition_methods, (list, tuple)):
        decomposition_methods = [decomposition_methods]
    else:
        decomposition_methods = decomposition_methods.copy()
    decomposition_methods.insert(0, None)

    if highlight_feature_indices is None:
        highlight_feature_indices = defaults["analyses"][
            "highlight_feature_indices"]
    elif not isinstance(highlight_feature_indices, (list, tuple)):
        highlight_feature_indices = [highlight_feature_indices]
    else:
        highlight_feature_indices = highlight_feature_indices.copy()

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    for data_set, other_data_set in zip(data_sets, other_data_sets):

        if data_set.values.shape[1] <= 1:
            continue

        title = original_title
        name = normalise_string(title)

        if specifier:
            specification = specifier(data_set)

        if specification:
            name += "-" + str(specification)
            title += " for " + specification

        title += " set"

        if not colouring_data_set:
            colouring_data_set = data_set

        if data_set.version in ["z", "z1"]:
            centroids = copy.deepcopy(centroids_original)
        else:
            centroids = None

        if other_data_set:
            title = "{} set values in {}".format(other_data_set.version, title)
            name = other_data_set.version + "-" + name

        decompositions_directory = os.path.join(analyses_directory, name)

        for decomposition_method in decomposition_methods:

            other_values = None
            sampled_values = None

            if other_data_set:
                other_values = other_data_set.values

            if sampled_data_set:
                sampled_values = sampled_data_set.values

            if not decomposition_method:
                if data_set.number_of_features == 2:
                    values_decomposed = data_set.values
                    other_values_decomposed = other_values
                    sampled_values_decomposed = sampled_values
                    centroids_decomposed = centroids
                else:
                    continue
            else:
                decomposition_method = proper_string(
                    decomposition_method, DECOMPOSITION_METHOD_NAMES)

                values_decomposed = data_set.values
                other_values_decomposed = other_values
                sampled_values_decomposed = sampled_values
                centroids_decomposed = centroids

                other_value_sets_decomposed = {}
                if other_values is not None:
                    other_value_sets_decomposed["other"] = other_values
                if sampled_values is not None:
                    other_value_sets_decomposed["sampled"] = sampled_values
                if not other_value_sets_decomposed:
                    other_value_sets_decomposed = None

                if decomposition_method == "t-SNE":
                    if (data_set.number_of_examples >
                            MAXIMUM_NUMBER_OF_EXAMPLES_FOR_TSNE):
                        print(
                            "The number of examples for {}".format(title),
                            "is too large to decompose it",
                            "using {}. Skipping.".format(decomposition_method))
                        print()
                        continue

                    elif (data_set.number_of_features >
                          MAXIMUM_NUMBER_OF_FEATURES_FOR_TSNE):
                        number_of_pca_components_before_tsne = min(
                            MAXIMUM_NUMBER_OF_PCA_COMPONENTS_BEFORE_TSNE,
                            data_set.number_of_examples - 1)
                        print(
                            "The number of features for {}".format(title),
                            "is too large to decompose it",
                            "using {} in due time.".format(
                                decomposition_method))
                        print("Decomposing {} to {} components using PCA "
                              "beforehand.".format(
                                  title, number_of_pca_components_before_tsne))
                        decompose_time_start = time()
                        (values_decomposed, other_value_sets_decomposed,
                         centroids_decomposed) = decompose(
                             values_decomposed,
                             other_value_sets=other_value_sets_decomposed,
                             centroids=centroids_decomposed,
                             method="pca",
                             number_of_components=(
                                 number_of_pca_components_before_tsne))
                        decompose_duration = time() - decompose_time_start
                        print("{} pre-decomposed ({}).".format(
                            capitalise_string(title),
                            format_duration(decompose_duration)))

                    else:
                        if scipy.sparse.issparse(values_decomposed):
                            values_decomposed = values_decomposed.A
                        if scipy.sparse.issparse(other_values_decomposed):
                            other_values_decomposed = other_values_decomposed.A
                        if scipy.sparse.issparse(sampled_values_decomposed):
                            sampled_values_decomposed = (
                                sampled_values_decomposed.A)

                print("Decomposing {} using {}.".format(
                    title, decomposition_method))
                decompose_time_start = time()
                (values_decomposed, other_value_sets_decomposed,
                 centroids_decomposed) = decompose(
                     values_decomposed,
                     other_value_sets=other_value_sets_decomposed,
                     centroids=centroids_decomposed,
                     method=decomposition_method,
                     number_of_components=2)
                decompose_duration = time() - decompose_time_start
                print("{} decomposed ({}).".format(
                    capitalise_string(title),
                    format_duration(decompose_duration)))
                print()

                if other_value_sets_decomposed:
                    other_values_decomposed = other_value_sets_decomposed.get(
                        "other")
                    sampled_values_decomposed = (
                        other_value_sets_decomposed.get("sampled"))

            if base_symbol:
                symbol = base_symbol
            else:
                symbol = specification

            x_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=1,
                decomposition_method=decomposition_method,
            )
            y_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=2,
                decomposition_method=decomposition_method,
            )

            figure_labels = {
                "title": decomposition_method,
                "x label": x_label,
                "y label": y_label
            }

            if other_data_set:
                plot_values_decomposed = other_values_decomposed
            else:
                plot_values_decomposed = values_decomposed

            if plot_values_decomposed is None:
                print("No values to plot.\n")
                return

            print("Plotting {}{}.".format(
                "decomposed " if decomposition_method else "", title))

            # No colour-coding
            plot_time_start = time()
            figure, figure_name = figures.plot_values(
                plot_values_decomposed,
                centroids=centroids_decomposed,
                figure_labels=figure_labels,
                example_tag=data_set.tags["example"],
                name=name)
            figures.save_figure(figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
            plot_duration = time() - plot_time_start
            print("    {} plotted and saved ({}).".format(
                capitalise_string(title), format_duration(plot_duration)))

            # Samples
            if sampled_data_set:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    centroids=centroids_decomposed,
                    sampled_values=sampled_values_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with samples) plotted and saved ({}).".format(
                    capitalise_string(title), format_duration(plot_duration)))

            # Labels
            if colouring_data_set.labels is not None:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with labels) plotted and saved ({}).".format(
                    capitalise_string(title), format_duration(plot_duration)))

                # Superset labels
                if colouring_data_set.superset_labels is not None:
                    plot_time_start = time()
                    figure, figure_name = figures.plot_values(
                        plot_values_decomposed,
                        colour_coding="superset labels",
                        colouring_data_set=colouring_data_set,
                        centroids=centroids_decomposed,
                        figure_labels=figure_labels,
                        example_tag=data_set.tags["example"],
                        name=name)
                    figures.save_figure(figure=figure,
                                        name=figure_name,
                                        options=export_options,
                                        directory=decompositions_directory)
                    plot_duration = time() - plot_time_start
                    print("    "
                          "{} (with superset labels) plotted and saved ({}).".
                          format(capitalise_string(title),
                                 format_duration(plot_duration)))

                # For each class
                if analysis_level == "extensive":
                    if colouring_data_set.number_of_classes <= 10:
                        plot_time_start = time()
                        for class_name in colouring_data_set.class_names:
                            figure, figure_name = figures.plot_values(
                                plot_values_decomposed,
                                colour_coding="class",
                                colouring_data_set=colouring_data_set,
                                centroids=centroids_decomposed,
                                class_name=class_name,
                                figure_labels=figure_labels,
                                example_tag=data_set.tags["example"],
                                name=name)
                            figures.save_figure(
                                figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
                        plot_duration = time() - plot_time_start
                        print(
                            "    {} (for each class) plotted and saved ({}).".
                            format(capitalise_string(title),
                                   format_duration(plot_duration)))

                    if (colouring_data_set.superset_labels is not None
                            and data_set.number_of_superset_classes <= 10):
                        plot_time_start = time()
                        for superset_class_name in (
                                colouring_data_set.superset_class_names):
                            figure, figure_name = figures.plot_values(
                                plot_values_decomposed,
                                colour_coding="superset class",
                                colouring_data_set=colouring_data_set,
                                centroids=centroids_decomposed,
                                class_name=superset_class_name,
                                figure_labels=figure_labels,
                                example_tag=data_set.tags["example"],
                                name=name)
                            figures.save_figure(
                                figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
                        plot_duration = time() - plot_time_start
                        print("    {} (for each superset class) plotted and "
                              "saved ({}).".format(
                                  capitalise_string(title),
                                  format_duration(plot_duration)))

            # Batches
            if colouring_data_set.has_batches:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="batches",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    "
                      "{} (with batches) plotted and saved ({}).".format(
                          capitalise_string(title),
                          format_duration(plot_duration)))

            # Cluster IDs
            if colouring_data_set.has_predicted_cluster_ids:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted cluster IDs",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    "
                    "{} (with predicted cluster IDs) plotted and saved ({}).".
                    format(capitalise_string(title),
                           format_duration(plot_duration)))

            # Predicted labels
            if colouring_data_set.has_predicted_labels:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    "
                      "{} (with predicted labels) plotted and saved ({}).".
                      format(capitalise_string(title),
                             format_duration(plot_duration)))

            if colouring_data_set.has_predicted_superset_labels:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted superset labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    {} (with predicted superset labels) plotted and saved"
                    " ({}).".format(capitalise_string(title),
                                    format_duration(plot_duration)))

            # Count sum
            plot_time_start = time()
            figure, figure_name = figures.plot_values(
                plot_values_decomposed,
                colour_coding="count sum",
                colouring_data_set=colouring_data_set,
                centroids=centroids_decomposed,
                figure_labels=figure_labels,
                example_tag=data_set.tags["example"],
                name=name)
            figures.save_figure(figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
            plot_duration = time() - plot_time_start
            print("    {} (with count sum) plotted and saved ({}).".format(
                capitalise_string(title), format_duration(plot_duration)))

            # Features
            for feature_index in highlight_feature_indices:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="feature",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    feature_index=feature_index,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with {}) plotted and saved ({}).".format(
                    capitalise_string(title),
                    data_set.feature_names[feature_index],
                    format_duration(plot_duration)))

            print()
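
A hypothetical call pattern for `analyse_decompositions`. The data-set objects (with attributes such as `.values`, `.version`, `.tags` and `.labels`) and the centroid dictionary are assumed to come from the surrounding package, so this only sketches the calling convention.

# `latent_data_set`, `evaluation_data_set` and `latent_centroids` are assumed
# to be provided by the surrounding package.
analyse_decompositions(
    latent_data_set,                # a `.version` of "z" enables centroid plotting
    centroids=latent_centroids,
    colouring_data_set=evaluation_data_set,
    decomposition_methods=["PCA", "t-SNE"],
    highlight_feature_indices=[0],  # hypothetical feature index to colour by
    symbol="z",
    title="latent space",
    analyses_directory="analyses")
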
Example #30
0
def split_data_set(data_dictionary, method=None, fraction=None):

    if method is None:
        method = defaults["data"]["splitting_method"]
    if fraction is None:
        fraction = defaults["data"]["splitting_fraction"]

    print("Splitting data set.")
    start_time = time()

    if method == "default":
        if "split indices" in data_dictionary:
            method = "indices"
        else:
            method = "random"

    method = normalise_string(method)

    n = data_dictionary["values"].shape[0]

    random_state = numpy.random.RandomState(42)

    if method in ["random", "sequential"]:

        n_training_validation = int(fraction * n)
        n_training = int(fraction * n_training_validation)

        if method == "random":
            indices = random_state.permutation(n)
        else:
            indices = numpy.arange(n)

        training_indices = indices[:n_training]
        validation_indices = indices[n_training:n_training_validation]
        test_indices = indices[n_training_validation:]

    elif method == "indices":

        split_indices = data_dictionary["split indices"]

        training_indices = split_indices["training"]
        test_indices = split_indices["test"]

        if "validation" in split_indices:
            validation_indices = split_indices["validation"]
        else:
            # No validation indices provided: carve a validation subset of the
            # same size as the test set out of the end of the training indices.
            n_training_validation = training_indices.stop
            n_all = test_indices.stop

            n_training = n_training_validation - (
                n_all - n_training_validation)

            training_indices = slice(n_training)
            validation_indices = slice(n_training, n_training_validation)

    elif method == "macosko":

        values = data_dictionary["values"]

        minimum_number_of_non_zero_elements = 900
        number_of_non_zero_elements = (values != 0).sum(axis=1)

        training_indices = numpy.nonzero(
            number_of_non_zero_elements > minimum_number_of_non_zero_elements
        )[0]

        test_validation_indices = numpy.nonzero(
            number_of_non_zero_elements <= minimum_number_of_non_zero_elements
        )[0]

        random_state.shuffle(test_validation_indices)

        n_validation_test = len(test_validation_indices)
        n_validation = int((1 - fraction) * n_validation_test)

        validation_indices = test_validation_indices[:n_validation]
        test_indices = test_validation_indices[n_validation:]

    else:
        raise ValueError("Splitting method `{}` not found.".format(method))

    split_data_dictionary = {
        "training set": {
            "values": data_dictionary["values"][training_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names":
                data_dictionary["example names"][training_indices],
            "batch indices": None
        },
        "validation set": {
            "values": data_dictionary["values"][validation_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names":
                data_dictionary["example names"][validation_indices],
            "batch indices": None
        },
        "test set": {
            "values": data_dictionary["values"][test_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names": data_dictionary["example names"][test_indices],
            "batch indices": None
        },
        "feature names": data_dictionary["feature names"],
        "class names": data_dictionary["class names"]
    }

    if "labels" in data_dictionary and data_dictionary["labels"] is not None:
        split_data_dictionary["training set"]["labels"] = (
            data_dictionary["labels"][training_indices])
        split_data_dictionary["validation set"]["labels"] = (
            data_dictionary["labels"][validation_indices])
        split_data_dictionary["test set"]["labels"] = (
            data_dictionary["labels"][test_indices])

    if ("preprocessed values" in data_dictionary
            and data_dictionary["preprocessed values"] is not None):
        split_data_dictionary["training set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][training_indices])
        split_data_dictionary["validation set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][validation_indices])
        split_data_dictionary["test set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][test_indices])

    if ("binarised values" in data_dictionary
            and data_dictionary["binarised values"] is not None):
        split_data_dictionary["training set"]["binarised values"] = (
            data_dictionary["binarised values"][training_indices])
        split_data_dictionary["validation set"]["binarised values"] = (
            data_dictionary["binarised values"][validation_indices])
        split_data_dictionary["test set"]["binarised values"] = (
            data_dictionary["binarised values"][test_indices])

    if ("batch indices" in data_dictionary
            and data_dictionary["batch indices"] is not None):
        split_data_dictionary["training set"]["batch indices"] = (
            data_dictionary["batch indices"][training_indices])
        split_data_dictionary["validation set"]["batch indices"] = (
            data_dictionary["batch indices"][validation_indices])
        split_data_dictionary["test set"]["batch indices"] = (
            data_dictionary["batch indices"][test_indices])

    duration = time() - start_time
    print("Data set split ({}).".format(format_duration(duration)))

    return split_data_dictionary
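
A minimal sketch of the dictionary layout that `split_data_set` expects, with hypothetical toy data; `defaults["data"]` is assumed to provide the splitting method and fraction when they are not given explicitly.

import numpy

n_examples, n_features = 100, 10
random_state = numpy.random.RandomState(0)

# Hypothetical count matrix together with the keys that `split_data_set` reads.
data_dictionary = {
    "values": random_state.poisson(1.0, size=(n_examples, n_features)),
    "example names": numpy.array(
        ["example_{}".format(i) for i in range(n_examples)]),
    "feature names": numpy.array(
        ["feature_{}".format(j) for j in range(n_features)]),
    "class names": None,
    "labels": None
}

split = split_data_set(data_dictionary, method="random", fraction=0.8)
print(split["training set"]["values"].shape)    # (64, 10)
print(split["validation set"]["values"].shape)  # (16, 10)
print(split["test set"]["values"].shape)        # (20, 10)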