Example #1
def analyse_predictions(evaluation_set, analyses_directory=None):

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    print("Saving predictions.")

    predictions_directory = os.path.join(analyses_directory, "predictions")

    table_name = "predictions"

    if evaluation_set.prediction_specifications:
        table_name += "-" + evaluation_set.prediction_specifications.name
    else:
        table_name += "-unknown_prediction_method"

    if evaluation_set.has_predicted_cluster_ids:
        saving_time_start = time()
        save_values(values=evaluation_set.predicted_cluster_ids,
                    name="{}-predicted_cluster_ids".format(table_name),
                    row_names=evaluation_set.example_names,
                    column_names=["Cluster ID"],
                    directory=predictions_directory)
        saving_duration = time() - saving_time_start
        print("    Predicted cluster IDs saved ({}).".format(
            format_duration(saving_duration)))

    if evaluation_set.has_predicted_labels:
        saving_time_start = time()
        save_values(values=evaluation_set.predicted_labels,
                    name="{}-predicted_labels".format(table_name),
                    row_names=evaluation_set.example_names,
                    column_names=[evaluation_set.terms["class"].capitalize()],
                    directory=predictions_directory)
        saving_duration = time() - saving_time_start
        print("    Predicted labels saved ({}).".format(
            format_duration(saving_duration)))

    if evaluation_set.has_predicted_superset_labels:
        saving_time_start = time()
        save_values(values=evaluation_set.predicted_superset_labels,
                    name="{}-predicted_superset_labels".format(table_name),
                    row_names=evaluation_set.example_names,
                    column_names=[evaluation_set.terms["class"].capitalize()],
                    directory=predictions_directory)
        saving_duration = time() - saving_time_start
        print("    Predicted superset labels saved ({}).".format(
            format_duration(saving_duration)))

    print()
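`save_values` is a helper from the surrounding project and is not shown here; a minimal stand-in, assuming tab-separated output is acceptable, might look like this:

import os

import pandas


def save_values(values, name, row_names=None, column_names=None,
                directory="."):
    # Hypothetical stand-in: write one value table as a TSV file.
    os.makedirs(directory, exist_ok=True)
    table = pandas.DataFrame(values, index=row_names, columns=column_names)
    table.to_csv(os.path.join(directory, name + ".tsv"), sep="\t")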
Example #2
def load_original_data_set(paths, data_format):

    print("Loading original data set.")
    loading_time_start = time()

    if data_format is None:
        raise ValueError("Data format not specified.")
    elif data_format.startswith("tsv"):
        data_format = "matrix_ebf"

    load = LOADERS.get(data_format)

    if load is None:
        raise ValueError(
            "Data format `{}` not recognised.".format(data_format))

    data_dictionary = load(paths=paths)

    loading_duration = time() - loading_time_start
    print("Original data set loaded ({}).".format(
        format_duration(loading_duration)))

    if not isinstance(data_dictionary["values"], scipy.sparse.csr_matrix):

        print()

        print("Converting data set value array to sparse matrix.")
        sparse_time_start = time()

        data_dictionary["values"] = scipy.sparse.csr_matrix(
            data_dictionary["values"])

        sparse_duration = time() - sparse_time_start
        print("Data set value array converted ({}).".format(
            format_duration(sparse_duration)))

    return data_dictionary
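The conversion to `scipy.sparse.csr_matrix` pays off because count matrices are mostly zeros. A self-contained illustration with synthetic data (not the project's loaders):

import numpy
import scipy.sparse

# A mostly-zero count matrix, typical of single-cell data.
dense_values = numpy.random.poisson(0.1, size=(1000, 500))

sparse_values = scipy.sparse.csr_matrix(dense_values)
density = sparse_values.nnz / numpy.prod(sparse_values.shape)
print("Non-zero fraction: {:.1%}".format(density))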
Example #3
def load_data_dictionary(path):
    def load(tables_file, group=None):

        if not group:
            group = tables_file.root

        data_dictionary = {}

        for node in tables_file.iter_nodes(group):
            node_title = node._v_title
            if node == group:
                pass
            elif isinstance(node, tables.Group):
                if node_title.endswith("set"):
                    data_dictionary[node_title] = load(tables_file, group=node)
                elif node_title.endswith("values"):
                    data_dictionary[node_title] = _load_sparse_matrix(
                        tables_file, group=node)
                elif node_title == "split indices":
                    data_dictionary[node_title] = _load_split_indices(
                        tables_file, group=node)
                elif node_title == "feature mapping":
                    data_dictionary[node_title] = _load_feature_mapping(
                        tables_file, group=node)
                else:
                    raise NotImplementedError(
                        "Loading group `{}` not implemented.".format(
                            node_title))
            elif isinstance(node, tables.Array):
                data_dictionary[node_title] = _load_array_or_other_type(node)
            else:
                raise NotImplementedError(
                    "Loading node `{}` not implemented.".format(node_title))

        return data_dictionary

    start_time = time()

    with tables.open_file(path, "r") as tables_file:
        data_dictionary = load(tables_file)

    duration = time() - start_time
    print("Data loaded ({}).".format(format_duration(duration)))

    return data_dictionary
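`_load_sparse_matrix` (and its saving counterpart in the next example) are private helpers not shown here. A plausible sketch, assuming the group stores the CSR components `data`, `indices`, `indptr` and the `shape` as separate arrays:

import scipy.sparse
import tables


def _save_sparse_matrix(matrix, title, group, tables_file):
    # Assumed layout: one subgroup holding the CSR component arrays.
    subgroup = tables_file.create_group(group, title, title)
    for attribute in ["data", "indices", "indptr"]:
        tables_file.create_array(subgroup, attribute,
                                 getattr(matrix, attribute))
    tables_file.create_array(subgroup, "shape", matrix.shape)


def _load_sparse_matrix(tables_file, group):
    arrays = {node.name: node.read() for node in group}
    return scipy.sparse.csr_matrix(
        (arrays["data"], arrays["indices"], arrays["indptr"]),
        shape=tuple(arrays["shape"]))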
Example #4
def save_data_dictionary(data_dictionary, path):

    directory, _ = os.path.split(path)

    if directory and not os.path.exists(directory):
        os.makedirs(directory)

    def save(data_dictionary, tables_file, group_title=None):

        if group_title:
            group = tables_file.create_group("/",
                                             normalise_string(group_title),
                                             group_title)
        else:
            group = tables_file.root

        for title, value in data_dictionary.items():

            if isinstance(value, scipy.sparse.csr_matrix):
                _save_sparse_matrix(value, title, group, tables_file)
            elif isinstance(value, (numpy.ndarray, list)):
                _save_array(value, title, group, tables_file)
            elif title == "split indices":
                _save_split_indices(value, title, group, tables_file)
            elif title == "feature mapping":
                _save_feature_mapping(value, title, group, tables_file)
            elif value is None:
                _save_string(str(value), title, group, tables_file)
            elif title.endswith("set"):
                save(value, tables_file, group_title=title)
            else:
                raise NotImplementedError(
                    "Saving type {} for title \"{}\" has not been implemented."
                    .format(type(value), title))

    start_time = time()

    filters = tables.Filters(complib="zlib", complevel=5)

    with tables.open_file(path, "w", filters=filters) as tables_file:
        save(data_dictionary, tables_file)

    duration = time() - start_time
    print("Data saved ({}).".format(format_duration(duration)))
Example #5
def analyse_centroid_probabilities(centroids,
                                   name=None,
                                   analysis_level=None,
                                   export_options=None,
                                   analyses_directory=None):

    if name:
        name = normalise_string(name)
    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]
    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    print("Plotting centroid probabilities.")
    plot_time_start = time()

    if centroids is None:
        centroids = {}

    posterior_probabilities = None
    prior_probabilities = None

    if "posterior" in centroids and centroids["posterior"]:
        posterior_probabilities = centroids["posterior"]["probabilities"]
        n_centroids = len(posterior_probabilities)
    if "prior" in centroids and centroids["prior"]:
        prior_probabilities = centroids["prior"]["probabilities"]
        n_centroids = len(prior_probabilities)

    if posterior_probabilities is None and prior_probabilities is None:
        raise ValueError("No centroid probabilities found to plot.")

    centroids_palette = style.darker_palette(n_centroids)
    x_label = "$k$"
    if prior_probabilities is not None:
        if posterior_probabilities is not None:
            y_label = _axis_label_for_symbol(
                symbol="\\pi",
                distribution=normalise_string("posterior"),
                suffix="^k")
            if name:
                plot_name = [name, "posterior", "prior"]
            else:
                plot_name = ["posterior", "prior"]
        else:
            y_label = _axis_label_for_symbol(
                symbol="\\pi",
                distribution=normalise_string("prior"),
                suffix="^k")
            if name:
                plot_name = [name, "prior"]
            else:
                plot_name = "prior"
    elif posterior_probabilities is not None:
        y_label = _axis_label_for_symbol(
            symbol="\\pi",
            distribution=normalise_string("posterior"),
            suffix="^k")
        if name:
            plot_name = [name, "posterior"]
        else:
            plot_name = "posterior"

    figure, figure_name = figures.plot_probabilities(posterior_probabilities,
                                                     prior_probabilities,
                                                     x_label=x_label,
                                                     y_label=y_label,
                                                     palette=centroids_palette,
                                                     uniform=False,
                                                     name=plot_name)
    figures.save_figure(figure=figure,
                        name=figure_name,
                        options=export_options,
                        directory=analyses_directory)

    plot_duration = time() - plot_time_start
    print("Centroid probabilities plotted and saved ({}).".format(
        format_duration(plot_duration)))
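`figures.plot_probabilities` belongs to the project's plotting module. A rough standalone equivalent as a grouped bar chart (the layout is an assumption, not the original figure):

import matplotlib.pyplot as plt
import numpy

n_centroids = 10
prior_probabilities = numpy.full(n_centroids, 1 / n_centroids)
posterior_probabilities = numpy.random.dirichlet(numpy.ones(n_centroids))

k = numpy.arange(n_centroids)
width = 0.4
plt.bar(k - width / 2, prior_probabilities, width, label="prior")
plt.bar(k + width / 2, posterior_probabilities, width, label="posterior")
plt.xlabel("$k$")
plt.ylabel("$\\pi^k$")
plt.legend()
plt.savefig("centroid_probabilities.png")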
Example #6
def analyse_decompositions(data_sets,
                           other_data_sets=None,
                           centroids=None,
                           colouring_data_set=None,
                           sampled_data_set=None,
                           decomposition_methods=None,
                           highlight_feature_indices=None,
                           symbol=None,
                           title="data set",
                           specifier=None,
                           analysis_level=None,
                           export_options=None,
                           analyses_directory=None):

    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]

    centroids_original = centroids

    if isinstance(data_sets, dict):
        data_sets = list(data_sets.values())

    if not isinstance(data_sets, (list, tuple)):
        data_sets = [data_sets]

    if other_data_sets is None:
        other_data_sets = [None] * len(data_sets)
    elif not isinstance(other_data_sets, (list, tuple)):
        other_data_sets = [other_data_sets]

    if len(data_sets) != len(other_data_sets):
        raise ValueError(
            "Lists of data sets and alternative data sets do not have the "
            "same length.")

    specification = None

    base_symbol = symbol

    original_title = title

    if decomposition_methods is None:
        decomposition_methods = [defaults["decomposition_method"]]
    elif not isinstance(decomposition_methods, (list, tuple)):
        decomposition_methods = [decomposition_methods]
    else:
        # Use list() so that tuples are accepted and the input is copied.
        decomposition_methods = list(decomposition_methods)
    decomposition_methods.insert(0, None)

    if highlight_feature_indices is None:
        highlight_feature_indices = defaults["analyses"][
            "highlight_feature_indices"]
    elif not isinstance(highlight_feature_indices, (list, tuple)):
        highlight_feature_indices = [highlight_feature_indices]
    else:
        highlight_feature_indices = list(highlight_feature_indices)

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]

    for data_set, other_data_set in zip(data_sets, other_data_sets):

        if data_set.values.shape[1] <= 1:
            continue

        title = original_title
        name = normalise_string(title)

        if specifier:
            specification = specifier(data_set)

        if specification:
            name += "-" + str(specification)
            title += " for " + specification

        title += " set"

        if not colouring_data_set:
            colouring_data_set = data_set

        if data_set.version in ["z", "z1"]:
            centroids = copy.deepcopy(centroids_original)
        else:
            centroids = None

        if other_data_set:
            title = "{} set values in {}".format(other_data_set.version, title)
            name = other_data_set.version + "-" + name

        decompositions_directory = os.path.join(analyses_directory, name)

        for decomposition_method in decomposition_methods:

            other_values = None
            sampled_values = None

            if other_data_set:
                other_values = other_data_set.values

            if sampled_data_set:
                sampled_values = sampled_data_set.values

            if not decomposition_method:
                if data_set.number_of_features == 2:
                    values_decomposed = data_set.values
                    other_values_decomposed = other_values
                    sampled_values_decomposed = sampled_values
                    centroids_decomposed = centroids
                else:
                    continue
            else:
                decomposition_method = proper_string(
                    decomposition_method, DECOMPOSITION_METHOD_NAMES)

                values_decomposed = data_set.values
                other_values_decomposed = other_values
                sampled_values_decomposed = sampled_values
                centroids_decomposed = centroids

                other_value_sets_decomposed = {}
                if other_values is not None:
                    other_value_sets_decomposed["other"] = other_values
                if sampled_values is not None:
                    other_value_sets_decomposed["sampled"] = sampled_values
                if not other_value_sets_decomposed:
                    other_value_sets_decomposed = None

                if decomposition_method == "t-SNE":
                    if (data_set.number_of_examples >
                            MAXIMUM_NUMBER_OF_EXAMPLES_FOR_TSNE):
                        print(
                            "The number of examples for {}".format(title),
                            "is too large to decompose it",
                            "using {}. Skipping.".format(decomposition_method))
                        print()
                        continue

                    elif (data_set.number_of_features >
                          MAXIMUM_NUMBER_OF_FEATURES_FOR_TSNE):
                        number_of_pca_components_before_tsne = min(
                            MAXIMUM_NUMBER_OF_PCA_COMPONENTS_BEFORE_TSNE,
                            data_set.number_of_examples - 1)
                        print(
                            "The number of features for {}".format(title),
                            "is too large to decompose it",
                            "using {} in due time.".format(
                                decomposition_method))
                        print("Decomposing {} to {} components using PCA "
                              "beforehand.".format(
                                  title, number_of_pca_components_before_tsne))
                        decompose_time_start = time()
                        (values_decomposed, other_value_sets_decomposed,
                         centroids_decomposed) = decompose(
                             values_decomposed,
                             other_value_sets=other_value_sets_decomposed,
                             centroids=centroids_decomposed,
                             method="pca",
                             number_of_components=(
                                 number_of_pca_components_before_tsne))
                        decompose_duration = time() - decompose_time_start
                        print("{} pre-decomposed ({}).".format(
                            capitalise_string(title),
                            format_duration(decompose_duration)))

                    else:
                        if scipy.sparse.issparse(values_decomposed):
                            values_decomposed = values_decomposed.A
                        if scipy.sparse.issparse(other_values_decomposed):
                            other_values_decomposed = other_values_decomposed.A
                        if scipy.sparse.issparse(sampled_values_decomposed):
                            sampled_values_decomposed = (
                                sampled_values_decomposed.A)

                print("Decomposing {} using {}.".format(
                    title, decomposition_method))
                decompose_time_start = time()
                (values_decomposed, other_value_sets_decomposed,
                 centroids_decomposed) = decompose(
                     values_decomposed,
                     other_value_sets=other_value_sets_decomposed,
                     centroids=centroids_decomposed,
                     method=decomposition_method,
                     number_of_components=2)
                decompose_duration = time() - decompose_time_start
                print("{} decomposed ({}).".format(
                    capitalise_string(title),
                    format_duration(decompose_duration)))
                print()

                if other_value_sets_decomposed:
                    other_values_decomposed = other_value_sets_decomposed.get(
                        "other")
                    sampled_values_decomposed = (
                        other_value_sets_decomposed.get("sampled"))

            if base_symbol:
                symbol = base_symbol
            else:
                symbol = specification

            x_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=1,
                decomposition_method=decomposition_method,
            )
            y_label = _axis_label_for_symbol(
                symbol=symbol,
                coordinate=2,
                decomposition_method=decomposition_method,
            )

            figure_labels = {
                "title": decomposition_method,
                "x label": x_label,
                "y label": y_label
            }

            if other_data_set:
                plot_values_decomposed = other_values_decomposed
            else:
                plot_values_decomposed = values_decomposed

            if plot_values_decomposed is None:
                print("No values to plot.\n")
                return

            print("Plotting {}{}.".format(
                "decomposed " if decomposition_method else "", title))

            # No colour-coding
            plot_time_start = time()
            figure, figure_name = figures.plot_values(
                plot_values_decomposed,
                centroids=centroids_decomposed,
                figure_labels=figure_labels,
                example_tag=data_set.tags["example"],
                name=name)
            figures.save_figure(figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
            plot_duration = time() - plot_time_start
            print("    {} plotted and saved ({}).".format(
                capitalise_string(title), format_duration(plot_duration)))

            # Samples
            if sampled_data_set:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    centroids=centroids_decomposed,
                    sampled_values=sampled_values_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with samples) plotted and saved ({}).".format(
                    capitalise_string(title), format_duration(plot_duration)))

            # Labels
            if colouring_data_set.labels is not None:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with labels) plotted and saved ({}).".format(
                    capitalise_string(title), format_duration(plot_duration)))

                # Superset labels
                if colouring_data_set.superset_labels is not None:
                    plot_time_start = time()
                    figure, figure_name = figures.plot_values(
                        plot_values_decomposed,
                        colour_coding="superset labels",
                        colouring_data_set=colouring_data_set,
                        centroids=centroids_decomposed,
                        figure_labels=figure_labels,
                        example_tag=data_set.tags["example"],
                        name=name)
                    figures.save_figure(figure=figure,
                                        name=figure_name,
                                        options=export_options,
                                        directory=decompositions_directory)
                    plot_duration = time() - plot_time_start
                    print("    "
                          "{} (with superset labels) plotted and saved ({}).".
                          format(capitalise_string(title),
                                 format_duration(plot_duration)))

                # For each class
                if analysis_level == "extensive":
                    if colouring_data_set.number_of_classes <= 10:
                        plot_time_start = time()
                        for class_name in colouring_data_set.class_names:
                            figure, figure_name = figures.plot_values(
                                plot_values_decomposed,
                                colour_coding="class",
                                colouring_data_set=colouring_data_set,
                                centroids=centroids_decomposed,
                                class_name=class_name,
                                figure_labels=figure_labels,
                                example_tag=data_set.tags["example"],
                                name=name)
                            figures.save_figure(
                                figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
                        plot_duration = time() - plot_time_start
                        print(
                            "    {} (for each class) plotted and saved ({}).".
                            format(capitalise_string(title),
                                   format_duration(plot_duration)))

                    if (colouring_data_set.superset_labels is not None
                            and data_set.number_of_superset_classes <= 10):
                        plot_time_start = time()
                        for superset_class_name in (
                                colouring_data_set.superset_class_names):
                            figure, figure_name = figures.plot_values(
                                plot_values_decomposed,
                                colour_coding="superset class",
                                colouring_data_set=colouring_data_set,
                                centroids=centroids_decomposed,
                                class_name=superset_class_name,
                                figure_labels=figure_labels,
                                example_tag=data_set.tags["example"],
                                name=name)
                            figures.save_figure(
                                figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
                        plot_duration = time() - plot_time_start
                        print("    {} (for each superset class) plotted and "
                              "saved ({}).".format(
                                  capitalise_string(title),
                                  format_duration(plot_duration)))

            # Batches
            if colouring_data_set.has_batches:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="batches",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    "
                      "{} (with batches) plotted and saved ({}).".format(
                          capitalise_string(title),
                          format_duration(plot_duration)))

            # Cluster IDs
            if colouring_data_set.has_predicted_cluster_ids:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted cluster IDs",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    "
                    "{} (with predicted cluster IDs) plotted and saved ({}).".
                    format(capitalise_string(title),
                           format_duration(plot_duration)))

            # Predicted labels
            if colouring_data_set.has_predicted_labels:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    "
                      "{} (with predicted labels) plotted and saved ({}).".
                      format(capitalise_string(title),
                             format_duration(plot_duration)))

            if colouring_data_set.has_predicted_superset_labels:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="predicted superset labels",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name,
                )
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print(
                    "    {} (with predicted superset labels) plotted and saved"
                    " ({}).".format(capitalise_string(title),
                                    format_duration(plot_duration)))

            # Count sum
            plot_time_start = time()
            figure, figure_name = figures.plot_values(
                plot_values_decomposed,
                colour_coding="count sum",
                colouring_data_set=colouring_data_set,
                centroids=centroids_decomposed,
                figure_labels=figure_labels,
                example_tag=data_set.tags["example"],
                name=name)
            figures.save_figure(figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=decompositions_directory)
            plot_duration = time() - plot_time_start
            print("    {} (with count sum) plotted and saved ({}).".format(
                capitalise_string(title), format_duration(plot_duration)))

            # Features
            for feature_index in highlight_feature_indices:
                plot_time_start = time()
                figure, figure_name = figures.plot_values(
                    plot_values_decomposed,
                    colour_coding="feature",
                    colouring_data_set=colouring_data_set,
                    centroids=centroids_decomposed,
                    feature_index=feature_index,
                    figure_labels=figure_labels,
                    example_tag=data_set.tags["example"],
                    name=name)
                figures.save_figure(figure=figure,
                                    name=figure_name,
                                    options=export_options,
                                    directory=decompositions_directory)
                plot_duration = time() - plot_time_start
                print("    {} (with {}) plotted and saved ({}).".format(
                    capitalise_string(title),
                    data_set.feature_names[feature_index],
                    format_duration(plot_duration)))

            print()
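The PCA-before-t-SNE step above is a standard speed-up for wide data. A self-contained sketch with scikit-learn (the thresholds and component counts are illustrative, not the module's constants):

import numpy
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

values = numpy.random.rand(1000, 2000)  # examples by features

# Reduce wide data with PCA first so t-SNE finishes in reasonable time.
if values.shape[1] > 100:
    values = PCA(n_components=50).fit_transform(values)

values_decomposed = TSNE(n_components=2).fit_transform(values)
print(values_decomposed.shape)  # (1000, 2)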
Example #7
def analyse_distributions(data_set,
                          colouring_data_set=None,
                          cutoffs=None,
                          preprocessed=False,
                          analysis_level="normal",
                          export_options=None,
                          analyses_directory=None):

    if not colouring_data_set:
        colouring_data_set = data_set

    if analysis_level is None:
        analysis_level = defaults["analyses"]["analysis_level"]

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]
    distribution_directory = os.path.join(analyses_directory, "histograms")

    data_set_title = data_set.kind + " set"
    data_set_name = data_set.kind
    if data_set.version != "original":
        data_set_title = data_set.version + " " + data_set_title
        data_set_name = None

    data_set_discreteness = data_set.discreteness and not preprocessed

    print("Plotting distributions for {}.".format(data_set_title))

    # Class distribution
    if (data_set.number_of_classes and data_set.number_of_classes < 100
            and colouring_data_set == data_set):
        distribution_time_start = time()
        figure, figure_name = figures.plot_class_histogram(
            labels=data_set.labels,
            class_names=data_set.class_names,
            class_palette=data_set.class_palette,
            normed=True,
            scale="linear",
            label_sorter=data_set.label_sorter,
            name=data_set_name)
        figures.save_figure(figure=figure,
                            name=figure_name,
                            options=export_options,
                            directory=distribution_directory)
        distribution_duration = time() - distribution_time_start
        print("    Class distribution plotted and saved ({}).".format(
            format_duration(distribution_duration)))

    # Superset class distribution
    if data_set.label_superset and colouring_data_set == data_set:
        distribution_time_start = time()
        figure, figure_name = figures.plot_class_histogram(
            labels=data_set.superset_labels,
            class_names=data_set.superset_class_names,
            class_palette=data_set.superset_class_palette,
            normed=True,
            scale="linear",
            label_sorter=data_set.superset_label_sorter,
            name=[data_set_name, "superset"])
        figures.save_figure(figure=figure,
                            name=figure_name,
                            options=export_options,
                            directory=distribution_directory)
        distribution_duration = time() - distribution_time_start
        print("    Superset class distribution plotted and saved ({}).".format(
            format_duration(distribution_duration)))

    # Count distribution
    if scipy.sparse.issparse(data_set.values):
        series = data_set.values.data
        # For sparse matrices, `.size` counts only stored entries, so the
        # number of implicit zeros has to come from the full shape.
        excess_zero_count = numpy.prod(data_set.values.shape) - series.size
    else:
        series = data_set.values.reshape(-1)
        excess_zero_count = 0
    distribution_time_start = time()
    for x_scale in ["linear", "log"]:
        figure, figure_name = figures.plot_histogram(
            series=series,
            excess_zero_count=excess_zero_count,
            label=data_set.tags["value"].capitalize() + "s",
            discrete=data_set_discreteness,
            normed=True,
            x_scale=x_scale,
            y_scale="log",
            name=["counts", data_set_name])
        figures.save_figure(figure=figure,
                            name=figure_name,
                            options=export_options,
                            directory=distribution_directory)
    distribution_duration = time() - distribution_time_start
    print("    Count distribution plotted and saved ({}).".format(
        format_duration(distribution_duration)))

    # Count distributions with cut-off
    if (analysis_level == "extensive" and cutoffs
            and data_set.example_type == "counts"):
        distribution_time_start = time()
        for cutoff in cutoffs:
            figure, figure_name = figures.plot_cutoff_count_histogram(
                series=series,
                excess_zero_count=excess_zero_count,
                cutoff=cutoff,
                normed=True,
                scale="log",
                name=data_set_name)
            figures.save_figure(figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=distribution_directory + "-counts")
        distribution_duration = time() - distribution_time_start
        print("    Count distributions with cut-offs plotted and saved ({}).".
              format(format_duration(distribution_duration)))

    # Count sum distribution
    distribution_time_start = time()
    figure, figure_name = figures.plot_histogram(
        series=data_set.count_sum,
        label="Total number of {}s per {}".format(data_set.tags["item"],
                                                  data_set.tags["example"]),
        normed=True,
        y_scale="log",
        name=["count sum", data_set_name])
    figures.save_figure(figure=figure,
                        name=figure_name,
                        options=export_options,
                        directory=distribution_directory)
    distribution_duration = time() - distribution_time_start
    print("    Count sum distribution plotted and saved ({}).".format(
        format_duration(distribution_duration)))

    # Count distributions and count sum distributions for each class
    if analysis_level == "extensive" and colouring_data_set.labels is not None:

        class_count_distribution_directory = distribution_directory
        if data_set.version == "original":
            class_count_distribution_directory += "-classes"

        if colouring_data_set.label_superset:
            labels = colouring_data_set.superset_labels
            class_names = colouring_data_set.superset_class_names
            class_palette = colouring_data_set.superset_class_palette
            label_sorter = colouring_data_set.superset_label_sorter
        else:
            labels = colouring_data_set.labels
            class_names = colouring_data_set.class_names
            class_palette = colouring_data_set.class_palette
            label_sorter = colouring_data_set.label_sorter

        if not class_palette:
            index_palette = style.lighter_palette(
                colouring_data_set.number_of_classes)
            class_palette = {
                class_name: index_palette[i]
                for i, class_name in enumerate(
                    sorted(class_names, key=label_sorter))
            }

        distribution_time_start = time()
        for class_name in class_names:

            class_indices = labels == class_name

            if not class_indices.any():
                continue

            values_label = data_set.values[class_indices]

            if scipy.sparse.issparse(values_label):
                series = values_label.data
                excess_zero_count = (
                    numpy.prod(values_label.shape) - series.size)
            else:
                # Use the class subset, not the full value matrix.
                series = values_label.reshape(-1)
                excess_zero_count = 0

            figure, figure_name = figures.plot_histogram(
                series=series,
                excess_zero_count=excess_zero_count,
                label=data_set.tags["value"].capitalize() + "s",
                discrete=data_set_discreteness,
                normed=True,
                y_scale="log",
                colour=class_palette[class_name],
                name=["counts", data_set_name, "class", class_name])
            figures.save_figure(figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=class_count_distribution_directory)

        distribution_duration = time() - distribution_time_start
        print("    Count distributions for each class plotted and saved ({}).".
              format(format_duration(distribution_duration)))

        distribution_time_start = time()
        for class_name in class_names:

            class_indices = labels == class_name
            if not class_indices.any():
                continue

            figure, figure_name = figures.plot_histogram(
                series=data_set.count_sum[class_indices],
                label="Total number of {}s per {}".format(
                    data_set.tags["item"], data_set.tags["example"]),
                normed=True,
                y_scale="log",
                colour=class_palette[class_name],
                name=["count sum", data_set_name, "class", class_name])
            figures.save_figure(figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=class_count_distribution_directory)

        distribution_duration = time() - distribution_time_start
        print("    "
              "Count sum distributions for each class plotted and saved ({}).".
              format(format_duration(distribution_duration)))

    print()
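The excess-zero bookkeeping above avoids densifying a sparse matrix just to draw a histogram. The same logic standalone:

import numpy
import scipy.sparse

values = scipy.sparse.random(1000, 500, density=0.05, format="csr")

# Histogram only the stored entries; count the implicit zeros separately.
series = values.data
excess_zero_count = numpy.prod(values.shape) - series.size

counts, bin_edges = numpy.histogram(series, bins=50, range=(0.0, 1.0))
counts[0] += excess_zero_count  # fold the implicit zeros into the first bin

print(counts[0], excess_zero_count)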
Example #8
def analyse_matrices(data_set,
                     plot_distances=False,
                     name=None,
                     export_options=None,
                     analyses_directory=None):

    if plot_distances:
        base_name = "distances"
    else:
        base_name = "heat_maps"

    if analyses_directory is None:
        analyses_directory = defaults["analyses"]["directory"]
    analyses_directory = os.path.join(analyses_directory, base_name)

    if not name:
        name = []
    elif not isinstance(name, list):
        name = [name]

    name.insert(0, base_name)

    # Subsampling indices (if necessary)
    random_state = numpy.random.RandomState(57)
    shuffled_indices = random_state.permutation(data_set.number_of_examples)

    # Feature selection for plotting (if necessary)
    feature_indices_for_plotting = None
    if (not plot_distances and data_set.number_of_features >
            MAXIMUM_NUMBER_OF_FEATURES_FOR_HEAT_MAPS):
        feature_variances = data_set.values.var(axis=0)
        if isinstance(feature_variances, numpy.matrix):
            feature_variances = feature_variances.A.squeeze()
        feature_indices_for_plotting = numpy.argsort(
            feature_variances)[-MAXIMUM_NUMBER_OF_FEATURES_FOR_HEAT_MAPS:]
        feature_indices_for_plotting.sort()

    # Class palette
    class_palette = data_set.class_palette
    if data_set.labels is not None and not class_palette:
        index_palette = style.lighter_palette(data_set.number_of_classes)
        class_palette = {
            class_name: tuple(index_palette[i])
            for i, class_name in enumerate(
                sorted(data_set.class_names, key=data_set.label_sorter))
        }

    # Axis labels
    example_label = data_set.tags["example"].capitalize() + "s"
    feature_label = data_set.tags["feature"].capitalize() + "s"
    value_label = data_set.tags["value"].capitalize() + "s"

    version = data_set.version
    symbol = None
    value_name = "values"

    if version in ["z", "x"]:
        symbol = "$\\mathbf{{{}}}$".format(version)
        value_name = "component"
    elif version in ["y"]:
        symbol = "${}$".format(version)
        value_name = "value"

    if version in ["y", "z"]:
        feature_label = " ".join([symbol, value_name + "s"])

    if plot_distances:
        if version in ["y", "z"]:
            value_label = symbol
        else:
            value_label = version

    if feature_indices_for_plotting is not None:
        feature_label = "{} most varying {}".format(
            len(feature_indices_for_plotting), feature_label.lower())

    plot_string = "Plotting heat map for {} values."
    if plot_distances:
        plot_string = "Plotting pairwise distances in {} space."
    print(plot_string.format(data_set.version))

    sorting_methods = ["hierarchical_clustering"]

    if data_set.labels is not None:
        sorting_methods.insert(0, "labels")

    for sorting_method in sorting_methods:

        distance_metrics = [None]

        if plot_distances or sorting_method == "hierarchical_clustering":
            distance_metrics = ["Euclidean", "cosine"]

        for distance_metric in distance_metrics:

            start_time = time()

            if (sorting_method == "hierarchical_clustering"
                    and data_set.number_of_examples >
                    MAXIMUM_NUMBER_OF_EXAMPLES_FOR_DENDROGRAM):
                sample_size = MAXIMUM_NUMBER_OF_EXAMPLES_FOR_DENDROGRAM
            elif (data_set.number_of_examples >
                  MAXIMUM_NUMBER_OF_EXAMPLES_FOR_HEAT_MAPS):
                sample_size = MAXIMUM_NUMBER_OF_EXAMPLES_FOR_HEAT_MAPS
            else:
                sample_size = None

            indices = numpy.arange(data_set.number_of_examples)

            if sample_size:
                indices = shuffled_indices[:sample_size]
                example_label = "{} randomly sampled {}".format(
                    sample_size, data_set.tags["example"] + "s")

            figure, figure_name = figures.plot_matrix(
                feature_matrix=data_set.values[indices],
                plot_distances=plot_distances,
                example_label=example_label,
                feature_label=feature_label,
                value_label=value_label,
                sorting_method=sorting_method,
                distance_metric=distance_metric,
                labels=(data_set.labels[indices]
                        if data_set.labels is not None else None),
                label_kind=data_set.tags["class"],
                class_palette=class_palette,
                feature_indices_for_plotting=feature_indices_for_plotting,
                name_parts=name +
                [data_set.version, distance_metric, sorting_method])
            figures.save_figure(figure=figure,
                                name=figure_name,
                                options=export_options,
                                directory=analyses_directory)

            duration = time() - start_time

            plot_kind_string = "Heat map for {} values".format(
                data_set.version)

            if plot_distances:
                plot_kind_string = "{} distances in {} space".format(
                    distance_metric.capitalize(), data_set.version)

            subsampling_string = ""

            if sample_size:
                subsampling_string = "{} {} randomly sampled examples".format(
                    "for" if plot_distances else "of", sample_size)

            sort_string = "sorted using {}".format(
                sorting_method.replace("_", " "))

            if (not plot_distances
                    and sorting_method == "hierarchical_clustering"):
                sort_string += " (with {} distances)".format(distance_metric)

            print("    " + " ".join([
                s for s in [
                    plot_kind_string, subsampling_string, sort_string,
                    "plotted and saved", "({})".format(
                        format_duration(duration))
                ] if s
            ]) + ".")

    print()
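`figures.plot_matrix` is project code; its hierarchical-clustering row ordering can be approximated with SciPy (a sketch, not the original figure):

import matplotlib.pyplot as plt
import numpy
from scipy.cluster.hierarchy import leaves_list, linkage

values = numpy.random.rand(200, 50)

# Order the examples by hierarchical clustering of the rows.
row_order = leaves_list(linkage(values, method="average", metric="cosine"))

plt.imshow(values[row_order], aspect="auto", interpolation="none")
plt.xlabel("Features")
plt.ylabel("Examples")
plt.savefig("heat_map.png")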
Example #9
def predict_labels(training_set,
                   evaluation_set,
                   specifications=None,
                   method=None,
                   number_of_clusters=None):

    if specifications is None:
        if method is None:
            method = defaults["evaluation"]["prediction_method"]
        specifications = PredictionSpecifications(
            method=method,
            number_of_clusters=number_of_clusters,
            training_set=training_set.kind)

    method = specifications.method
    number_of_clusters = specifications.number_of_clusters

    predict = PREDICTION_METHODS[specifications.method]["function"]

    print("Predicting labels for evaluation set using {} with {} components.".
          format(method, number_of_clusters))
    prediction_time_start = time()

    if evaluation_set.has_labels:

        class_names_to_class_ids = numpy.vectorize(
            lambda class_name:
                evaluation_set.class_name_to_class_id[class_name])
        class_ids_to_class_names = numpy.vectorize(
            lambda class_id: evaluation_set.class_id_to_class_name[class_id])

        evaluation_label_ids = class_names_to_class_ids(evaluation_set.labels)

        if evaluation_set.excluded_classes:
            excluded_class_ids = class_names_to_class_ids(
                evaluation_set.excluded_classes)
        else:
            excluded_class_ids = []

    if evaluation_set.has_superset_labels:

        superset_class_names_to_superset_class_ids = numpy.vectorize(
            lambda superset_class_name: (
                evaluation_set.superset_class_name_to_superset_class_id[
                    superset_class_name]))
        superset_class_ids_to_superset_class_names = numpy.vectorize(
            lambda superset_class_id: (
                evaluation_set.superset_class_id_to_superset_class_name[
                    superset_class_id]))

        evaluation_superset_label_ids = (
            superset_class_names_to_superset_class_ids(
                evaluation_set.superset_labels))

        if evaluation_set.excluded_superset_classes:
            excluded_superset_class_ids = (
                superset_class_names_to_superset_class_ids(
                    evaluation_set.excluded_superset_classes))
        else:
            excluded_superset_class_ids = []

    cluster_ids, predicted_labels, predicted_superset_labels = predict(
        training_set=training_set,
        evaluation_set=evaluation_set,
        number_of_clusters=number_of_clusters)

    if cluster_ids is not None:

        if predicted_labels is None and evaluation_set.has_labels:
            predicted_label_ids = map_cluster_ids_to_label_ids(
                evaluation_label_ids, cluster_ids, excluded_class_ids)
            predicted_labels = class_ids_to_class_names(predicted_label_ids)

        if (predicted_superset_labels is None
                and evaluation_set.has_superset_labels):
            predicted_superset_label_ids = map_cluster_ids_to_label_ids(
                evaluation_superset_label_ids, cluster_ids,
                excluded_superset_class_ids)
            predicted_superset_labels = (
                superset_class_ids_to_superset_class_names(
                    predicted_superset_label_ids))

    prediction_duration = time() - prediction_time_start
    print("Labels predicted ({}).".format(
        format_duration(prediction_duration)))

    return cluster_ids, predicted_labels, predicted_superset_labels
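`map_cluster_ids_to_label_ids` is not shown in these examples; a plausible majority-vote reading of what such a mapping does (hypothetical sketch, the module's own implementation may differ):

import numpy


def map_cluster_ids_to_label_ids(label_ids, cluster_ids,
                                 excluded_class_ids=()):
    # Assign each cluster the most frequent (non-excluded) true label
    # among its members.
    predicted_label_ids = numpy.empty_like(label_ids)
    for cluster_id in numpy.unique(cluster_ids):
        members = cluster_ids == cluster_id
        candidates, counts = numpy.unique(label_ids[members],
                                          return_counts=True)
        keep = ~numpy.isin(candidates, excluded_class_ids)
        if not keep.any():
            keep = numpy.ones_like(keep)
        majority = candidates[keep][counts[keep].argmax()]
        predicted_label_ids[members] = majority
    return predicted_label_ids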
Example #10
def acquire_data_set(title, urls, directory):

    paths = {}

    if not urls:
        return paths

    if not os.path.exists(directory):
        os.makedirs(directory)

    for values_or_labels in urls:
        paths[values_or_labels] = {}

        for kind in urls[values_or_labels]:

            url = urls[values_or_labels][kind]

            if not url:
                paths[values_or_labels][kind] = None
                continue

            url_filename = os.path.split(url)[-1]
            file_extension = extension(url_filename)

            filename = "-".join(
                map(normalise_string, [title, values_or_labels, kind]))
            path = os.path.join(directory, filename) + file_extension

            paths[values_or_labels][kind] = path

            if not os.path.isfile(path):

                if url.startswith("."):
                    raise Exception(
                        "Data set files have to be placed manually in the "
                        "correct folder.")
                if os.path.isfile(url):

                    print("Copying {} for {} set.".format(
                        values_or_labels, kind, title))
                    start_time = time()

                    copy_file(url, path)

                    duration = time() - start_time
                    print("Data set copied ({}).".format(
                        format_duration(duration)))
                    print()

                else:

                    print("Downloading {} for {} set.".format(
                        values_or_labels, kind, title))
                    start_time = time()

                    download_file(url, path)

                    duration = time() - start_time
                    print("Data set downloaded ({}).".format(
                        format_duration(duration)))
                    print()

    return paths
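`copy_file` and `download_file` are small utilities from the same module; minimal stand-ins, assuming plain local paths and HTTP(S) URLs:

import shutil
import urllib.request


def copy_file(url, path):
    # In acquire_data_set, this branch receives a local file path.
    shutil.copyfile(url, path)


def download_file(url, path):
    # Stream the remote file to disk (no progress reporting in this sketch).
    urllib.request.urlretrieve(url, path)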
Example #11
def select_features(values_dictionary, feature_names, method=None,
                    parameters=None):

    if method is not None:
        method = normalise_string(method)

    print("Selecting features.")
    start_time = time()

    if isinstance(values_dictionary, dict):
        values = values_dictionary["original"]
    else:
        raise TypeError("`values_dictionary` should be a dictionary.")

    n_examples, n_features = values.shape

    if method == "remove_zeros":
        total_feature_sum = values.sum(axis=0)
        if isinstance(total_feature_sum, numpy.matrix):
            total_feature_sum = total_feature_sum.A.squeeze()
        indices = total_feature_sum != 0

    elif method == "keep_variances_above":
        variances = values.var(axis=0)
        if isinstance(variances, numpy.matrix):
            variances = variances.A.squeeze()
        if parameters:
            threshold = float(parameters[0])
        else:
            threshold = 0.5
        indices = variances > threshold

    elif method == "keep_highest_variances":
        variances = values.var(axis=0)
        if isinstance(variances, numpy.matrix):
            variances = variances.A.squeeze()
        variance_sorted_indices = numpy.argsort(variances)
        if parameters:
            number_to_keep = int(parameters[0])
        else:
            number_to_keep = int(n_examples/2)
        indices = numpy.sort(variance_sorted_indices[-number_to_keep:])

    else:
        raise ValueError(
            "Feature selection `{}` not found.".format(method))

    if method:
        error = Exception(
            "No features excluded using feature selection {}.".format(method))
        if indices.dtype == "bool" and all(indices):
            raise error
        elif indices.dtype != "bool" and len(indices) == n_features:
            raise error

    feature_selected_values = {}

    for version, values in values_dictionary.items():
        if values is not None:
            feature_selected_values[version] = values[:, indices]
        else:
            feature_selected_values[version] = None

    feature_selected_feature_names = feature_names[indices]

    n_features_changed = len(feature_selected_feature_names)

    duration = time() - start_time
    print("{} features selected, {} excluded ({}).".format(
        n_features_changed,
        n_features - n_features_changed,
        format_duration(duration)
    ))

    return feature_selected_values, feature_selected_feature_names
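Toy usage of `select_features`, assuming the function above is in scope (the feature names are made up):

import numpy

values = numpy.random.poisson(1.0, size=(100, 6)).astype(float)
values[:, 2] = 0  # a feature containing nothing but zeros
feature_names = numpy.array(["f{}".format(i) for i in range(6)])

selected_values, selected_names = select_features(
    {"original": values}, feature_names, method="remove_zeros")

print(selected_names)  # "f2" has been excluded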
Example #12
def split_data_set(data_dictionary, method=None, fraction=None):

    if method is None:
        method = defaults["data"]["splitting_method"]
    if fraction is None:
        fraction = defaults["data"]["splitting_fraction"]

    print("Splitting data set.")
    start_time = time()

    if method == "default":
        if "split indices" in data_dictionary:
            method = "indices"
        else:
            method = "random"

    method = normalise_string(method)

    n = data_dictionary["values"].shape[0]

    random_state = numpy.random.RandomState(42)

    if method in ["random", "sequential"]:

        n_training_validation = int(fraction * n)
        n_training = int(fraction * n_training_validation)

        if method == "random":
            indices = random_state.permutation(n)
        else:
            indices = numpy.arange(n)

        training_indices = indices[:n_training]
        validation_indices = indices[n_training:n_training_validation]
        test_indices = indices[n_training_validation:]

    elif method == "indices":

        split_indices = data_dictionary["split indices"]

        training_indices = split_indices["training"]
        test_indices = split_indices["test"]

        if "validation" in split_indices:
            validation_indices = split_indices["validation"]
        else:
            n_training_validation = training_indices.stop
            n_all = test_indices.stop

            n_training = n_training_validation - (
                n_all - n_training_validation)

            training_indices = slice(n_training)
            validation_indices = slice(n_training, n_training_validation)

    elif method == "macosko":

        values = data_dictionary["values"]

        minimum_number_of_non_zero_elements = 900
        number_of_non_zero_elements = (values != 0).sum(axis=1)

        training_indices = numpy.nonzero(
            number_of_non_zero_elements > minimum_number_of_non_zero_elements
        )[0]

        test_validation_indices = numpy.nonzero(
            number_of_non_zero_elements <= minimum_number_of_non_zero_elements
        )[0]

        random_state.shuffle(test_validation_indices)

        n_validation_test = len(test_validation_indices)
        n_validation = int((1 - fraction) * n_validation_test)

        validation_indices = test_validation_indices[:n_validation]
        test_indices = test_validation_indices[n_validation:]

    else:
        raise ValueError("Splitting method `{}` not found.".format(method))

    split_data_dictionary = {
        "training set": {
            "values": data_dictionary["values"][training_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names":
                data_dictionary["example names"][training_indices],
            "batch indices": None
        },
        "validation set": {
            "values": data_dictionary["values"][validation_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names":
                data_dictionary["example names"][validation_indices],
            "batch indices": None
        },
        "test set": {
            "values": data_dictionary["values"][test_indices],
            "preprocessed values": None,
            "binarised values": None,
            "labels": None,
            "example names": data_dictionary["example names"][test_indices],
            "batch indices": None
        },
        "feature names": data_dictionary["feature names"],
        "class names": data_dictionary["class names"]
    }

    if "labels" in data_dictionary and data_dictionary["labels"] is not None:
        split_data_dictionary["training set"]["labels"] = (
            data_dictionary["labels"][training_indices])
        split_data_dictionary["validation set"]["labels"] = (
            data_dictionary["labels"][validation_indices])
        split_data_dictionary["test set"]["labels"] = (
            data_dictionary["labels"][test_indices])

    if ("preprocessed values" in data_dictionary
            and data_dictionary["preprocessed values"] is not None):
        split_data_dictionary["training set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][training_indices])
        split_data_dictionary["validation set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][validation_indices])
        split_data_dictionary["test set"]["preprocessed values"] = (
            data_dictionary["preprocessed values"][test_indices])

    if ("binarised values" in data_dictionary
            and data_dictionary["binarised values"] is not None):
        split_data_dictionary["training set"]["binarised values"] = (
            data_dictionary["binarised values"][training_indices])
        split_data_dictionary["validation set"]["binarised values"] = (
            data_dictionary["binarised values"][validation_indices])
        split_data_dictionary["test set"]["binarised values"] = (
            data_dictionary["binarised values"][test_indices])

    if ("batch indices" in data_dictionary
            and data_dictionary["batch indices"] is not None):
        split_data_dictionary["training set"]["batch indices"] = (
            data_dictionary["batch indices"][training_indices])
        split_data_dictionary["validation set"]["batch indices"] = (
            data_dictionary["batch indices"][validation_indices])
        split_data_dictionary["test set"]["batch indices"] = (
            data_dictionary["batch indices"][test_indices])

    duration = time() - start_time
    print("Data set split ({}).".format(format_duration(duration)))

    return split_data_dictionary
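
A hedged usage sketch for `split_data_set`; the dictionary keys match the ones accessed above, while the toy data and the fraction are assumptions:

import numpy

data_dictionary = {
    "values": numpy.random.poisson(1.0, size=(100, 10)),
    "example names": numpy.array(
        ["cell_{}".format(i) for i in range(100)]),
    "feature names": numpy.array(
        ["gene_{}".format(i) for i in range(10)]),
    "class names": None
}

split = split_data_set(data_dictionary, method="random", fraction=0.9)

# With n = 100 and fraction = 0.9: 81 training, 9 validation and
# 10 test examples.
print(split["training set"]["values"].shape)    # (81, 10)
print(split["validation set"]["values"].shape)  # (9, 10)
print(split["test set"]["values"].shape)        # (10, 10)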
Example #13
def filter_examples(values_dictionary, example_names,
                    method=None, parameters=None,
                    labels=None, excluded_classes=None,
                    superset_labels=None, excluded_superset_classes=None,
                    batch_indices=None, count_sum=None):

    print("Filtering examples.")
    start_time = time()

    method = normalise_string(method)

    if superset_labels is not None:
        filter_labels = superset_labels.copy()
        filter_excluded_classes = excluded_superset_classes
    elif labels is not None:
        filter_labels = labels.copy()
        filter_excluded_classes = excluded_classes
    else:
        filter_labels = None

    if filter_labels is not None:
        filter_class_names = numpy.unique(filter_labels)
    else:
        filter_class_names = None

    if isinstance(values_dictionary, dict):
        values = values_dictionary["original"]

    n_examples, n_features = values.shape

    filter_indices = numpy.arange(n_examples)

    if method == "macosko":
        minimum_number_of_non_zero_elements = 900
        number_of_non_zero_elements = (values != 0).sum(axis=1)
        filter_indices = numpy.nonzero(
            number_of_non_zero_elements > minimum_number_of_non_zero_elements
        )[0]

    elif method == "inverse_macosko":
        maximum_number_of_non_zero_elements = 900
        number_of_non_zero_elements = (values != 0).sum(axis=1)
        filter_indices = numpy.nonzero(
            number_of_non_zero_elements <= maximum_number_of_non_zero_elements
        )[0]

    elif method in ["keep", "remove", "excluded_classes"]:

        if filter_labels is None:
            raise ValueError(
                "Cannot filter examples based on labels, "
                "since data set is unlabelled."
            )

        if method == "excluded_classes":
            method = "remove"
            parameters = filter_excluded_classes

        if method == "keep":
            label_indices = set()

            for parameter in parameters:
                for class_name in filter_class_names:

                    normalised_class_name = normalise_string(str(class_name))
                    normalised_parameter = normalise_string(str(parameter))

                    if normalised_class_name == normalised_parameter:
                        class_indices = filter_labels == class_name
                        label_indices.update(filter_indices[class_indices])

            # Sort the collected indices to preserve the original example
            # order (iteration order over a set is not guaranteed).
            filter_indices = filter_indices[sorted(label_indices)]

        elif method == "remove":

            for parameter in parameters:
                for class_name in filter_class_names:

                    normalised_class_name = normalise_string(str(class_name))
                    normalised_parameter = normalise_string(str(parameter))

                    if normalised_class_name == normalised_parameter:
                        label_indices = filter_labels != class_name
                        filter_labels = filter_labels[label_indices]
                        filter_indices = filter_indices[label_indices]

    elif method == "remove_count_sum_above":
        threshold = int(parameters[0])
        filter_indices = filter_indices[count_sum.reshape(-1) <= threshold]

    elif method == "random":
        n_samples = int(parameters[0])
        n_samples = min(n_samples, n_examples)
        random_state = numpy.random.RandomState(90)
        filter_indices = random_state.permutation(n_examples)[:n_samples]

    else:
        raise ValueError(
            "Example filter `{}` not found.".format(method))

    if method and len(filter_indices) == n_examples:
        raise Exception(
            "No examples filtered out using example filter `{}`."
            .format(method)
        )

    example_filtered_values = {}

    for version, values in values_dictionary.items():
        if values is not None:
            example_filtered_values[version] = values[filter_indices, :]
        else:
            example_filtered_values[version] = None

    example_filtered_example_names = example_names[filter_indices]

    if labels is not None:
        example_filtered_labels = labels[filter_indices]
    else:
        example_filtered_labels = None

    if batch_indices is not None:
        example_filtered_batch_indices = batch_indices[filter_indices]
    else:
        example_filtered_batch_indices = None

    n_examples_changed = len(example_filtered_example_names)

    duration = time() - start_time
    print("{} examples filtered out, {} remaining ({}).".format(
        n_examples - n_examples_changed,
        n_examples_changed,
        format_duration(duration)
    ))

    return (example_filtered_values, example_filtered_example_names,
            example_filtered_labels, example_filtered_batch_indices)
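
A small usage sketch for `filter_examples`, keeping only two of three labelled classes; the toy data are hypothetical, and `normalise_string` is assumed to leave plain lower-case names unchanged:

import numpy

values = numpy.random.poisson(1.0, size=(6, 4))
example_names = numpy.array(["cell_{}".format(i) for i in range(6)])
labels = numpy.array(["a", "a", "b", "b", "c", "c"])

(filtered_values, filtered_example_names,
 filtered_labels, filtered_batch_indices) = filter_examples(
    {"original": values},
    example_names,
    method="keep",
    parameters=["a", "b"],
    labels=labels)

# The two "c" examples are filtered out.
assert list(filtered_labels) == ["a", "a", "b", "b"]
assert filtered_batch_indices is None  # no batch indices were given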
Example #14
    def binarise(self):

        if self.preprocessed_values is None:
            raise NotImplementedError(
                "Data set values have to have been preprocessed and "
                "feature-selected first.")

        binarise_preprocessing = ["binarise"]

        sparse_path = self._build_preprocessed_path(
            map_features=self.map_features,
            preprocessing_methods=binarise_preprocessing,
            feature_selection=self.feature_selection,
            feature_selection_parameters=self.feature_selection_parameters,
            example_filter=self.example_filter,
            example_filter_parameters=self.example_filter_parameters)

        if os.path.isfile(sparse_path):
            print("Loading binarised data.")
            data_dictionary = internal_io.load_data_dictionary(sparse_path)

        else:

            binarising_time_start = time()

            if self.preprocessing_methods != binarise_preprocessing:

                print("Binarising values.")
                start_time = time()

                binarisation_function = processing.build_preprocessor(
                    binarise_preprocessing)
                binarised_values = binarisation_function(self.values)

                duration = time() - start_time
                print("Values binarised ({}).".format(
                    format_duration(duration)))

                print()

            else:
                binarised_values = self.preprocessed_values

            data_dictionary = {
                "values": self.values,
                "preprocessed values": binarised_values,
                "feature names": self.feature_names
            }

            binarising_duration = time() - binarising_time_start

            if binarising_duration > MINIMUM_NUMBER_OF_SECONDS_BEFORE_SAVING:

                if not os.path.exists(self.preprocess_directory):
                    os.makedirs(self.preprocess_directory)

                print("Saving binarised data set.")
                internal_io.save_data_dictionary(data_dictionary, sparse_path)

        # Convert to the sparse representation regardless of whether the
        # binarised values were just computed or loaded from disk.
        binarised_values = sparse.SparseRowMatrix(
            data_dictionary["preprocessed values"])

        self.update(binarised_values=binarised_values)
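
The binarisation step itself comes from `processing.build_preprocessor`, which is not shown here; a stand-alone sketch of what a `binarise` preprocessor is assumed to do (map every nonzero entry to one):

import numpy

def binarise_values(values):
    # Nonzero counts become 1; zeros stay 0.
    return (values != 0).astype(values.dtype)

counts = numpy.array([[0, 2, 5],
                      [1, 0, 0]])
print(binarise_values(counts))
# [[0 1 1]
#  [1 0 0]]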
Example #15
    def preprocess(self):

        if (not self.map_features and not self.preprocessing_methods
                and not self.feature_selection and not self.example_filter):
            self.update(preprocessed_values=None)
            return

        sparse_path = self._build_preprocessed_path(
            map_features=self.map_features,
            preprocessing_methods=self.preprocessing_methods,
            feature_selection=self.feature_selection,
            feature_selection_parameters=self.feature_selection_parameters,
            example_filter=self.example_filter,
            example_filter_parameters=self.example_filter_parameters)

        if os.path.isfile(sparse_path):
            print("Loading preprocessed data.")
            data_dictionary = internal_io.load_data_dictionary(sparse_path)
            if "preprocessed values" not in data_dictionary:
                data_dictionary["preprocessed values"] = None
            if self.map_features:
                self.features_mapped = True
                self.tags = _update_tag_for_mapped_features(self.tags)
            print()
        else:

            preprocessing_time_start = time()

            values = self.values
            example_names = self.example_names
            feature_names = self.feature_names

            if self.map_features and not self.features_mapped:

                print(
                    "Mapping {} original features to {} new features.".format(
                        self.number_of_features, len(self.feature_mapping)))
                start_time = time()

                values, feature_names = processing.map_features(
                    values, feature_names, self.feature_mapping)

                self.features_mapped = True
                self.tags = _update_tag_for_mapped_features(self.tags)

                duration = time() - start_time
                print("Features mapped ({}).".format(
                    format_duration(duration)))

                print()

            if not self.preprocessed and self.preprocessing_methods:

                print("Preprocessing values.")
                start_time = time()

                preprocessing_function = processing.build_preprocessor(
                    self.preprocessing_methods)
                preprocessed_values = preprocessing_function(values)

                duration = time() - start_time
                print("Values preprocessed ({}).".format(
                    format_duration(duration)))

                print()

            else:
                preprocessed_values = None

            if self.feature_selection:
                values_dictionary, feature_names = processing.select_features(
                    {
                        "original": values,
                        "preprocessed": preprocessed_values
                    }, self.feature_names, self.feature_selection,
                    self.feature_selection_parameters)

                values = values_dictionary["original"]
                preprocessed_values = values_dictionary["preprocessed"]

                print()

            if self.example_filter:
                values_dictionary, example_names, labels, batch_indices = (
                    processing.filter_examples(
                        {
                            "original": values,
                            "preprocessed": preprocessed_values
                        },
                        self.example_names,
                        self.example_filter,
                        self.example_filter_parameters,
                        labels=self.labels,
                        excluded_classes=self.excluded_classes,
                        superset_labels=self.superset_labels,
                        excluded_superset_classes=(
                            self.excluded_superset_classes),
                        batch_indices=self.batch_indices,
                        count_sum=self.count_sum))

                values = values_dictionary["original"]
                preprocessed_values = values_dictionary["preprocessed"]

                print()

            data_dictionary = {
                "values": values,
                "preprocessed values": preprocessed_values,
            }

            if self.features_mapped or self.feature_selection:
                data_dictionary["feature names"] = feature_names

            if self.example_filter:
                data_dictionary["example names"] = example_names
                data_dictionary["labels"] = labels
                data_dictionary["batch indices"] = batch_indices

            preprocessing_duration = time() - preprocessing_time_start

            if (preprocessing_duration >
                    MINIMUM_NUMBER_OF_SECONDS_BEFORE_SAVING):

                if not os.path.exists(self.preprocess_directory):
                    os.makedirs(self.preprocess_directory)

                print("Saving preprocessed data set.")
                internal_io.save_data_dictionary(data_dictionary, sparse_path)
                print()

        values = data_dictionary["values"]
        preprocessed_values = data_dictionary["preprocessed values"]

        if preprocessed_values is None:
            # Fall back to the unmodified values when no preprocessing
            # methods were applied.
            preprocessed_values = values

        if self.features_mapped or self.feature_selection:
            feature_names = data_dictionary["feature names"]
        else:
            feature_names = self.feature_names

        if self.example_filter:
            example_names = data_dictionary["example names"]
            labels = data_dictionary["labels"]
            batch_indices = data_dictionary["batch indices"]
        else:
            example_names = self.example_names
            labels = self.labels
            batch_indices = self.batch_indices

        values = sparse.SparseRowMatrix(values)
        preprocessed_values = sparse.SparseRowMatrix(preprocessed_values)

        self.update(values=values,
                    preprocessed_values=preprocessed_values,
                    example_names=example_names,
                    feature_names=feature_names,
                    labels=labels,
                    batch_indices=batch_indices)
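
A hypothetical calling sequence for the two methods above; the class name `DataSet` and its constructor arguments are assumptions, since the class itself is not shown in these examples:

# Assumed construction of a data set object providing the attributes used
# above (values, preprocessing_methods, feature_selection, and so on).
data_set = DataSet(
    "sample",
    feature_selection="keep_highest_variances",
    feature_selection_parameters=[5000])

data_set.preprocess()  # maps, preprocesses, selects features and filters
data_set.binarise()    # requires preprocess() to have been run first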