Ejemplo n.º 1
0
def _axis_label_for_symbol(symbol,
                           coordinate=None,
                           decomposition_method=None,
                           distribution=None,
                           prefix="",
                           suffix=""):

    if decomposition_method:
        decomposition_method = proper_string(
            normalise_string(decomposition_method), DECOMPOSITION_METHOD_NAMES)
        decomposition_label = DECOMPOSITION_METHOD_LABEL[decomposition_method]
    else:
        decomposition_label = ""

    if decomposition_label:
        decomposition_label = "\\mathrm{{{}}}".format(decomposition_label)

    if coordinate:
        coordinate_text = "{{{} {}}}".format(decomposition_label, coordinate)
    else:
        coordinate_text = ""

    if distribution == "prior":
        distribution_symbol = "\\theta"
    elif distribution == "posterior":
        distribution_symbol = "\\phi"
    else:
        distribution_symbol = ""

    if distribution_symbol and coordinate_text:
        distribution_position = "_"
        coordinate_position = "^"
    elif distribution_symbol and not coordinate_text:
        distribution_position = "_"
        coordinate_position = ""
    elif not distribution_symbol and coordinate_text:
        distribution_position = ""
        coordinate_position = "_"
    else:
        distribution_position = ""
        coordinate_position = ""

    if coordinate_position == "^":
        coordinate_text = "{{(" + coordinate_text + ")}}"
    elif coordinate_position == "_":
        coordinate_text = "{{" + coordinate_text + "}}"

    axis_label = "$" + "".join([
        prefix, symbol, distribution_position, distribution_symbol,
        coordinate_position, coordinate_text, suffix
    ]) + "$"

    return axis_label
Ejemplo n.º 2
0
    def __init__(self,
                 method,
                 number_of_clusters=None,
                 training_set_kind=None):
        """Validate and store the specifications of a prediction method.

        Arguments:
            method: Name or alias of a prediction method; it is resolved
                against the aliases listed in ``PREDICTION_METHODS``.
            number_of_clusters: Number of clusters; must not be ``None``.
            training_set_kind: Optional kind of training set, stored in
                normalised form when truthy and as given otherwise.

        Raises:
            ValueError: If the resolved method is not a key of
                ``PREDICTION_METHODS``.
            TypeError: If ``number_of_clusters`` is ``None``.
        """

        # Map each canonical method name to the aliases it may go by.
        aliases_by_method = {}
        for name, specifications in PREDICTION_METHODS.items():
            aliases_by_method[name] = specifications["aliases"]

        method = proper_string(method, aliases_by_method)

        if method not in PREDICTION_METHODS:
            raise ValueError(
                "Prediction method `{}` not found.".format(method))

        if number_of_clusters is None:
            raise TypeError("Number of clusters not set.")

        self.method = method
        self.number_of_clusters = number_of_clusters
        self.training_set_kind = (
            normalise_string(training_set_kind)
            if training_set_kind else training_set_kind)
Ejemplo n.º 3
0
def decompose(values,
              other_value_sets={},
              centroids={},
              method=None,
              number_of_components=None,
              random=False):
    """Decompose values and optionally other value sets and centroids.

    A decomposition model is fitted to ``values`` and, where the method
    supports it, reused to transform the other value sets and centroids.

    Arguments:
        values: Values to fit the decomposition model to.
        other_value_sets: Either a dictionary of named value sets or a
            single value set. Entries that are ``None`` stay ``None``.
            Other value sets are not transformed when using t-SNE.
            The default ``{}`` means "not requested" (see Returns); it is
            never mutated.
        centroids: Dictionary of centroid parameters, either directly
            (containing the key ``"means"``) or grouped by distribution.
            Only the parameters ``"means"`` and ``"covariance_matrices"``
            are transformed, and only when using PCA. The default ``{}``
            means "not requested" (see Returns); it is never mutated.
        method: Name of the decomposition method (PCA, SVD, ICA, or
            t-SNE). Defaults to ``defaults["decomposition_method"]``.
        number_of_components: Number of components to decompose to.
            Defaults to ``defaults["decomposition_dimensionality"]``.
        random: If false, a fixed random state is used. The random state
            is only passed on to t-SNE.

    Returns:
        A list starting with the decomposed values. The decomposed other
        value sets are appended unless ``other_value_sets`` was ``{}``,
        and the decomposed centroids are appended unless ``centroids``
        was ``{}``. Either of these is ``None`` if it could not be
        computed for the chosen method. A single value set passed in
        without a dictionary is also returned without one.

    Raises:
        ValueError: If the method is not one of the supported methods.
    """

    if method is None:
        method = defaults["decomposition_method"]
    method = proper_string(normalise_string(method),
                           DECOMPOSITION_METHOD_NAMES)

    if number_of_components is None:
        number_of_components = defaults["decomposition_dimensionality"]

    # Decide up front which optional results to return: comparing with
    # `{}` later would be ambiguous for array-like arguments.
    other_values_provided_as_dictionary = True
    if other_value_sets is None or isinstance(other_value_sets, dict):
        include_other_value_sets = other_value_sets != {}
    else:
        # A single value set: wrap it so it can be handled like the rest.
        other_value_sets = {"unknown": other_value_sets}
        other_values_provided_as_dictionary = False
        include_other_value_sets = True

    include_centroids = centroids != {}

    if random:
        random_state = None
    else:
        random_state = 42

    if method == "PCA":
        if (values.shape[1] <= MAXIMUM_FEATURE_SIZE_FOR_NORMAL_PCA
                and not scipy.sparse.issparse(values)):
            model = PCA(n_components=number_of_components)
        else:
            model = IncrementalPCA(n_components=number_of_components,
                                   batch_size=100)
    elif method == "SVD":
        model = TruncatedSVD(n_components=number_of_components)
    elif method == "ICA":
        model = FastICA(n_components=number_of_components)
    elif method == "t-SNE":
        if number_of_components < 4:
            tsne_method = "barnes_hut"
        else:
            tsne_method = "exact"
        model = TSNE(n_components=number_of_components,
                     method=tsne_method,
                     random_state=random_state)
    else:
        raise ValueError("Method `{}` not found.".format(method))

    values_decomposed = model.fit_transform(values)

    # The t-SNE model is only fitted to `values`; it is not used to
    # transform other value sets.
    if other_value_sets and method != "t-SNE":
        other_value_sets_decomposed = {}
        for other_set_name, other_values in other_value_sets.items():
            if other_values is not None:
                other_value_decomposed = model.transform(other_values)
            else:
                other_value_decomposed = None
            other_value_sets_decomposed[other_set_name] = (
                other_value_decomposed)
    else:
        other_value_sets_decomposed = None

    if other_value_sets_decomposed and not other_values_provided_as_dictionary:
        other_value_sets_decomposed = other_value_sets_decomposed["unknown"]

    # Only supports centroids without data sets as top levels
    if centroids is not None and method == "PCA":
        if "means" in centroids:
            centroids = {"unknown": centroids}
        components = model.components_
        centroids_decomposed = {}
        for distribution, distribution_centroids in centroids.items():
            if distribution_centroids:
                centroids_distribution_decomposed = {}
                for parameter, parameter_values in (
                        distribution_centroids.items()):
                    if parameter == "means":
                        # Flatten leading dimensions, transform, and
                        # restore them with the new last dimension.
                        shape = numpy.array(parameter_values.shape)
                        original_dimension = shape[-1]
                        reshaped_parameter_values = parameter_values.reshape(
                            -1, original_dimension)
                        decomposed_parameter_values = model.transform(
                            reshaped_parameter_values)
                        shape[-1] = number_of_components
                        new_parameter_values = (
                            decomposed_parameter_values.reshape(shape))
                    elif parameter == "covariance_matrices":
                        shape = numpy.array(parameter_values.shape)
                        original_dimension = shape[-1]
                        reshaped_parameter_values = parameter_values.reshape(
                            -1, original_dimension, original_dimension)
                        n_centroids = reshaped_parameter_values.shape[0]
                        # Each projected covariance matrix is square with
                        # one row and column per component.
                        decomposed_parameter_values = numpy.empty(
                            shape=(n_centroids, number_of_components,
                                   number_of_components))
                        for i in range(n_centroids):
                            decomposed_parameter_values[i] = (
                                components @ reshaped_parameter_values[i]
                                @ components.T)
                        shape[-2:] = number_of_components
                        new_parameter_values = (
                            decomposed_parameter_values.reshape(shape))
                    else:
                        new_parameter_values = parameter_values
                    centroids_distribution_decomposed[parameter] = (
                        new_parameter_values)
                centroids_decomposed[distribution] = (
                    centroids_distribution_decomposed)
            else:
                centroids_decomposed[distribution] = None
        if "unknown" in centroids_decomposed:
            centroids_decomposed = centroids_decomposed["unknown"]
    else:
        centroids_decomposed = None

    output = [values_decomposed]

    if include_other_value_sets:
        output.append(other_value_sets_decomposed)

    if include_centroids:
        output.append(centroids_decomposed)

    return output
Ejemplo n.º 4
0
def _plot_and_save_figures(plot_values, plot_option_sets, shared_plot_options,
                           export_options, directory):
    """Plot and save one figure per option set and return the duration.

    Each mapping in ``plot_option_sets`` is combined with
    ``shared_plot_options`` and passed as keyword arguments to
    ``figures.plot_values`` together with ``plot_values``. Every figure is
    saved to ``directory`` using ``figures.save_figure``.

    Returns:
        The time elapsed while plotting and saving all figures, as
        measured by ``time()``.
    """
    plot_time_start = time()
    for plot_options in plot_option_sets:
        figure, figure_name = figures.plot_values(
            plot_values, **shared_plot_options, **plot_options)
        figures.save_figure(figure=figure,
                            name=figure_name,
                            options=export_options,
                            directory=directory)
    return time() - plot_time_start


def analyse_decompositions(data_sets,
                           other_data_sets=None,
                           centroids=None,
                           colouring_data_set=None,
                           sampled_data_set=None,
                           decomposition_methods=None,
                           highlight_feature_indices=None,
                           symbol=None,
                           title="data set",
                           specifier=None,
                           analysis_level=None,
                           export_options=None,
                           analyses_directory=None):
    """Decompose data sets to two dimensions and plot them.

    For every data set and every decomposition method, the values are
    decomposed and plotted without colour-coding as well as colour-coded
    by whatever the colouring data set provides (labels, superset labels,
    batches, predicted cluster IDs, predicted labels, predicted superset
    labels), by count sum, and by each highlighted feature. Data sets with
    exactly two features are additionally plotted without decomposition.
    Data sets with at most one feature are skipped.

    Arguments:
        data_sets: A data set, or a list, tuple, or dictionary of data
            sets.
        other_data_sets: Optional data set(s) matching ``data_sets``.
            When given, its values are plotted instead, using the
            decomposition fitted to the corresponding data set.
        centroids: Optional centroids; only used for data sets whose
            version is ``"z"`` or ``"z1"``.
        colouring_data_set: Data set used for colour-coding. Defaults to
            the data set currently being analysed.
        sampled_data_set: Optional data set of samples to plot on top.
        decomposition_methods: Name or list/tuple of names of
            decomposition methods. Defaults to
            ``defaults["decomposition_method"]``.
        highlight_feature_indices: Index or list/tuple of indices of
            features to colour-code by. Defaults to
            ``defaults["analyses"]["highlight_feature_indices"]``.
        symbol: Symbol used for axis labels. Falls back to the
            specification of the data set if not given.
        title: Base title used in messages and for naming the output.
        specifier: Optional callable returning a specification for a
            data set, which is added to the name and the title.
        analysis_level: Level of analysis; ``"extensive"`` also produces
            one plot per class. Defaults to
            ``defaults["analyses"]["analysis_level"]``.
        export_options: Options passed on to ``figures.save_figure``.
        analyses_directory: Base directory for saving figures. Defaults
            to ``defaults["analyses"]["directory"]``.

    Raises:
        ValueError: If the numbers of data sets and other data sets
            differ.
    """

    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]

    centroids_original = centroids

    if isinstance(data_sets, dict):
        data_sets = list(data_sets.values())

    if not isinstance(data_sets, (list, tuple)):
        data_sets = [data_sets]

    if other_data_sets is None:
        other_data_sets = [None] * len(data_sets)
    elif not isinstance(other_data_sets, (list, tuple)):
        other_data_sets = [other_data_sets]

    if len(data_sets) != len(other_data_sets):
        raise ValueError(
            "Lists of data sets and alternative data sets do not have the "
            "same length.")

    specification = None

    base_symbol = symbol

    original_title = title

    # Always work on a new list: tuples support neither `copy` nor
    # `insert`, and the caller's list should not be modified.
    if decomposition_methods is None:
        decomposition_methods = [defaults["decomposition_method"]]
    elif not isinstance(decomposition_methods, (list, tuple)):
        decomposition_methods = [decomposition_methods]
    else:
        decomposition_methods = list(decomposition_methods)
    # `None` stands for plotting the values without decomposing them.
    decomposition_methods.insert(0, None)

    if highlight_feature_indices is None:
        highlight_feature_indices = defaults["analyses"][
            "highlight_feature_indices"]
    elif not isinstance(highlight_feature_indices, (list, tuple)):
        highlight_feature_indices = [highlight_feature_indices]
    else:
        highlight_feature_indices = list(highlight_feature_indices)

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    for data_set, other_data_set in zip(data_sets, other_data_sets):

        if data_set.values.shape[1] <= 1:
            continue

        title = original_title
        name = normalise_string(title)

        if specifier:
            specification = specifier(data_set)

        if specification:
            name += "-" + str(specification)
            title += " for " + str(specification)

        title += " set"

        # Fall back to the current data set for colouring. A separate
        # variable keeps the fallback from carrying over to the next
        # data set.
        if colouring_data_set:
            current_colouring_data_set = colouring_data_set
        else:
            current_colouring_data_set = data_set

        if data_set.version in ["z", "z1"]:
            centroids = copy.deepcopy(centroids_original)
        else:
            centroids = None

        if other_data_set:
            title = "{} set values in {}".format(other_data_set.version, title)
            name = other_data_set.version + "-" + name

        decompositions_directory = os.path.join(analyses_directory, name)

        for decomposition_method in decomposition_methods:

            other_values = None
            sampled_values = None

            if other_data_set:
                other_values = other_data_set.values

            if sampled_data_set:
                sampled_values = sampled_data_set.values

            # Without a decomposition method only data sets that already
            # are two-dimensional can be plotted.
            if not decomposition_method and data_set.number_of_features != 2:
                continue

            values_decomposed = data_set.values
            other_values_decomposed = other_values
            sampled_values_decomposed = sampled_values
            centroids_decomposed = centroids

            if decomposition_method:
                decomposition_method = proper_string(
                    decomposition_method, DECOMPOSITION_METHOD_NAMES)

                other_value_sets_decomposed = {}
                if other_values is not None:
                    other_value_sets_decomposed["other"] = other_values
                if sampled_values is not None:
                    other_value_sets_decomposed["sampled"] = sampled_values
                if not other_value_sets_decomposed:
                    other_value_sets_decomposed = None

                if decomposition_method == "t-SNE":
                    if (data_set.number_of_examples >
                            MAXIMUM_NUMBER_OF_EXAMPLES_FOR_TSNE):
                        print(
                            "The number of examples for {}".format(title),
                            "is too large to decompose it",
                            "using {}. Skipping.".format(decomposition_method))
                        print()
                        continue

                    elif (data_set.number_of_features >
                          MAXIMUM_NUMBER_OF_FEATURES_FOR_TSNE):
                        # Reduce the dimensionality using PCA first.
                        number_of_pca_components_before_tsne = min(
                            MAXIMUM_NUMBER_OF_PCA_COMPONENTS_BEFORE_TSNE,
                            data_set.number_of_examples - 1)
                        print(
                            "The number of features for {}".format(title),
                            "is too large to decompose it",
                            "using {} in due time.".format(
                                decomposition_method))
                        print("Decomposing {} to {} components using PCA "
                              "beforehand.".format(
                                  title, number_of_pca_components_before_tsne))
                        decompose_time_start = time()
                        (values_decomposed, other_value_sets_decomposed,
                         centroids_decomposed) = decompose(
                             values_decomposed,
                             other_value_sets=other_value_sets_decomposed,
                             centroids=centroids_decomposed,
                             method="pca",
                             number_of_components=(
                                 number_of_pca_components_before_tsne))
                        decompose_duration = time() - decompose_time_start
                        print("{} pre-decomposed ({}).".format(
                            capitalise_string(title),
                            format_duration(decompose_duration)))

                    else:
                        # Convert sparse values to dense arrays.
                        if scipy.sparse.issparse(values_decomposed):
                            values_decomposed = values_decomposed.A
                        if scipy.sparse.issparse(other_values_decomposed):
                            other_values_decomposed = other_values_decomposed.A
                        if scipy.sparse.issparse(sampled_values_decomposed):
                            sampled_values_decomposed = (
                                sampled_values_decomposed.A)

                print("Decomposing {} using {}.".format(
                    title, decomposition_method))
                decompose_time_start = time()
                (values_decomposed, other_value_sets_decomposed,
                 centroids_decomposed) = decompose(
                     values_decomposed,
                     other_value_sets=other_value_sets_decomposed,
                     centroids=centroids_decomposed,
                     method=decomposition_method,
                     number_of_components=2)
                decompose_duration = time() - decompose_time_start
                print("{} decomposed ({}).".format(
                    capitalise_string(title),
                    format_duration(decompose_duration)))
                print()

                # NOTE(review): if `decompose` returns no other value
                # sets, the other and sampled values keep their
                # undecomposed form here -- confirm this is intended.
                if other_value_sets_decomposed:
                    other_values_decomposed = other_value_sets_decomposed.get(
                        "other")
                    sampled_values_decomposed = (
                        other_value_sets_decomposed.get("sampled"))

            if base_symbol:
                symbol = base_symbol
            else:
                symbol = specification

            x_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=1,
                decomposition_method=decomposition_method,
            )
            y_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=2,
                decomposition_method=decomposition_method,
            )

            figure_labels = {
                "title": decomposition_method,
                "x label": x_label,
                "y label": y_label
            }

            if other_data_set:
                plot_values_decomposed = other_values_decomposed
            else:
                plot_values_decomposed = values_decomposed

            if plot_values_decomposed is None:
                print("No values to plot.\n")
                # NOTE(review): this also ends the analysis of all
                # remaining methods and data sets.
                return

            print("Plotting {}{}.".format(
                "decomposed " if decomposition_method else "", title))

            # Keyword arguments shared by all plots of this decomposition
            plain_plot_options = {
                "centroids": centroids_decomposed,
                "figure_labels": figure_labels,
                "example_tag": data_set.tags["example"],
                "name": name,
            }
            coloured_plot_options = dict(
                plain_plot_options,
                colouring_data_set=current_colouring_data_set)

            # No colour-coding
            plot_duration = _plot_and_save_figures(
                plot_values_decomposed, [{}], plain_plot_options,
                export_options, decompositions_directory)
            print("    {} plotted and saved ({}).".format(
                capitalise_string(title), format_duration(plot_duration)))

            # Samples
            if sampled_data_set:
                plot_duration = _plot_and_save_figures(
                    plot_values_decomposed,
                    [{"sampled_values": sampled_values_decomposed}],
                    plain_plot_options,
                    export_options, decompositions_directory)
                print("    {} (with samples) plotted and saved ({}).".format(
                    capitalise_string(title), format_duration(plot_duration)))

            # Labels
            if current_colouring_data_set.labels is not None:
                plot_duration = _plot_and_save_figures(
                    plot_values_decomposed,
                    [{"colour_coding": "labels"}],
                    coloured_plot_options,
                    export_options, decompositions_directory)
                print("    {} (with labels) plotted and saved ({}).".format(
                    capitalise_string(title), format_duration(plot_duration)))

                # Superset labels
                if current_colouring_data_set.superset_labels is not None:
                    plot_duration = _plot_and_save_figures(
                        plot_values_decomposed,
                        [{"colour_coding": "superset labels"}],
                        coloured_plot_options,
                        export_options, decompositions_directory)
                    print("    "
                          "{} (with superset labels) plotted and saved ({}).".
                          format(capitalise_string(title),
                                 format_duration(plot_duration)))

                # For each class
                if analysis_level == "extensive":
                    if current_colouring_data_set.number_of_classes <= 10:
                        plot_duration = _plot_and_save_figures(
                            plot_values_decomposed,
                            ({"colour_coding": "class",
                              "class_name": class_name}
                             for class_name in (
                                 current_colouring_data_set.class_names)),
                            coloured_plot_options,
                            export_options, decompositions_directory)
                        print(
                            "    {} (for each class) plotted and saved ({}).".
                            format(capitalise_string(title),
                                   format_duration(plot_duration)))

                    # The class count is taken from the same data set
                    # that provides the superset class names.
                    if (current_colouring_data_set.superset_labels
                            is not None
                            and (current_colouring_data_set
                                 .number_of_superset_classes) <= 10):
                        plot_duration = _plot_and_save_figures(
                            plot_values_decomposed,
                            ({"colour_coding": "superset class",
                              "class_name": superset_class_name}
                             for superset_class_name in (
                                 current_colouring_data_set
                                 .superset_class_names)),
                            coloured_plot_options,
                            export_options, decompositions_directory)
                        print("    {} (for each superset class) plotted and "
                              "saved ({}).".format(
                                  capitalise_string(title),
                                  format_duration(plot_duration)))

            # Batches
            if current_colouring_data_set.has_batches:
                plot_duration = _plot_and_save_figures(
                    plot_values_decomposed,
                    [{"colour_coding": "batches"}],
                    coloured_plot_options,
                    export_options, decompositions_directory)
                print("    "
                      "{} (with batches) plotted and saved ({}).".format(
                          capitalise_string(title),
                          format_duration(plot_duration)))

            # Cluster IDs
            if current_colouring_data_set.has_predicted_cluster_ids:
                plot_duration = _plot_and_save_figures(
                    plot_values_decomposed,
                    [{"colour_coding": "predicted cluster IDs"}],
                    coloured_plot_options,
                    export_options, decompositions_directory)
                print(
                    "    "
                    "{} (with predicted cluster IDs) plotted and saved ({}).".
                    format(capitalise_string(title),
                           format_duration(plot_duration)))

            # Predicted labels
            if current_colouring_data_set.has_predicted_labels:
                plot_duration = _plot_and_save_figures(
                    plot_values_decomposed,
                    [{"colour_coding": "predicted labels"}],
                    coloured_plot_options,
                    export_options, decompositions_directory)
                print("    "
                      "{} (with predicted labels) plotted and saved ({}).".
                      format(capitalise_string(title),
                             format_duration(plot_duration)))

            # Predicted superset labels
            if current_colouring_data_set.has_predicted_superset_labels:
                plot_duration = _plot_and_save_figures(
                    plot_values_decomposed,
                    [{"colour_coding": "predicted superset labels"}],
                    coloured_plot_options,
                    export_options, decompositions_directory)
                print(
                    "    {} (with predicted superset labels) plotted and saved"
                    " ({}).".format(capitalise_string(title),
                                    format_duration(plot_duration)))

            # Count sum
            plot_duration = _plot_and_save_figures(
                plot_values_decomposed,
                [{"colour_coding": "count sum"}],
                coloured_plot_options,
                export_options, decompositions_directory)
            print("    {} (with count sum) plotted and saved ({}).".format(
                capitalise_string(title), format_duration(plot_duration)))

            # Features
            for feature_index in highlight_feature_indices:
                plot_duration = _plot_and_save_figures(
                    plot_values_decomposed,
                    [{"colour_coding": "feature",
                      "feature_index": feature_index}],
                    coloured_plot_options,
                    export_options, decompositions_directory)
                print("    {} (with {}) plotted and saved ({}).".format(
                    capitalise_string(title),
                    data_set.feature_names[feature_index],
                    format_duration(plot_duration)))

            print()