Ejemplo n.º 1
0
def get_steps():
    steps = [
        ('embedding', ts.TakensEmbedding()),
        ('window', ts.SlidingWindow(width=5, stride=1)),
        ('diagram', hl.VietorisRipsPersistence()),
        ('rescaler', diag.Scaler()),
        ('filter', diag.Filtering(epsilon=0.1)),
        ('entropy', diag.PersistenceEntropy()),
        ('scaling', skprep.MinMaxScaler(copy=True)),
   ]
    return steps
Ejemplo n.º 2
0
    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape: (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of
            the point cloud (i.e. the dimension of the point cloud space).

        y : None
            Ignored.

        Returns
        -------
        self : object
            Returns self.

        """

        steps = [
            ('diagram', hl.VietorisRipsPersistence(
                metric=self.metric,
                max_edge_length=self.max_edge_length,
                homology_dimensions=self.homology_dimensions,
                n_jobs=self.n_jobs)),
            ('rescaler', diag.Scaler(
                metric=self.scaler_metric,
                metric_params=self.scaler_metric_params,
                function=self.function,
                n_jobs=self.n_jobs)),
            ('filter', diag.Filtering(
                epsilon=self.epsilon,
                homology_dimensions=self.homology_dimensions)),
            ('landscape', diag.PersistenceLandscape(
                n_values=self.n_values, n_layers=self.n_layers))]

        self._pipeline = Pipeline(steps).fit(X)
        return self
Ejemplo n.º 3
0
    def fit(self, X, y=None):
        """Create a giotto :class:`Pipeline` object and fit it. Then, return
        the estimator.

        This method is there to implement the usual scikit-learn API and hence
        work in pipelines.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_points, n_dimensions)
            Input data. ``n_samples`` is the number of point clouds,
            ``n_points`` is the number of points per point cloud and
            ``n_dimensions`` is the number of features for each point of the
            point cloud (i.e. the dimension of the point cloud space)

        y : None
            Ignored.

        Returns
        -------
        self : object

        """

        steps = [('diagram',
                  hl.VietorisRipsPersistence(
                      metric=self.metric,
                      max_edge_length=self.max_edge_length,
                      homology_dimensions=self.homology_dimensions,
                      n_jobs=self.n_jobs)),
                 ('scaler',
                  diag.Scaler(metric=self.scaler_metric,
                              metric_params=self.scaler_metric_params,
                              function=self.scaler_function,
                              n_jobs=self.n_jobs)),
                 ('filter', diag.Filtering(epsilon=self.filter_epsilon)),
                 ('betticurve', diag.BettiCurve(n_values=self.n_values))]

        self._pipeline = Pipeline(steps).fit(X)
        return self
Ejemplo n.º 4
0
def varying_noise(n_steps, n_series, args_stable, args_aperiodic):
    # noise parameters
    min_noise = 0.0
    max_noise = 2.1
    step_size = 0.1
    std = 0.1

    parameters_type = "fixed"
    embedding_dimension = 2
    embedding_time_delay = 3
    n_jobs = 1

    window_width = 121 - ((embedding_dimension - 1) * embedding_time_delay + 1)
    # window_stride = 1

    metric = "euclidean"
    max_edge_length = 10
    homology_dimensions = [0, 1]

    epsilon = 0.0

    steps = [
        (
            "embedding",
            ts.TakensEmbedding(
                parameters_type=parameters_type,
                dimension=embedding_dimension,
                time_delay=embedding_time_delay,
                n_jobs=n_jobs,
            ),
        ),
        ("window", ts.SlidingWindow(width=window_width, stride=1)),
        (
            "diagrams",
            hl.VietorisRipsPersistence(
                metric=metric,
                max_edge_length=max_edge_length,
                homology_dimensions=homology_dimensions,
                n_jobs=n_jobs,
            ),
        ),
        ("diagrams_scaler", diag.Scaler()),
        ("diagrams_filter", diag.Filtering(epsilon=epsilon)),
    ]

    pipeline = Pipeline(steps)

    # maximal number of repetitions per noise level (for confidence intervals)
    max_itr = 5

    # data frames to save performance
    perf_train = pd.DataFrame(
        columns={"Score", "Type", "Mean Standard Deviation of Noise"}
    )
    perf_test = pd.DataFrame(
        columns={"Score", "Type", "Mean Standard Deviation of Noise"}
    )

    mb = master_bar(np.arange(min_noise, max_noise, step_size))
    for noise in mb:
        for _ in progress_bar(range(max_itr), parent=mb):
            mb.child.comment = "Repetitions per noise level"
            data = simulate_data(
                noise, std, n_steps, n_series, args_stable, args_aperiodic
            )
            # group data by type and series id
            grouped_data = data.groupby(["type", "series_id"])

            y_true = np.repeat([1, 0], n_series)
            id_train, id_test, y_train, y_test = train_test_split(
                range(2 * n_series), y_true, train_size=0.7, random_state=0
            )

            # classical k-means ###########################################################
            X = data["adults"].values.reshape((2 * n_series, -1))
            # train/test data
            X_train = X[id_train, :]
            X_test = X[id_test, :]

            # k means
            kmeans = KMeans(n_clusters=2, random_state=0)
            kmeans.fit(X_train)

            perf_train = perf_train.append(
                {
                    "Score": homogeneity_score(y_train, kmeans.labels_),
                    "Type": "Classic",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

            perf_test = perf_test.append(
                {
                    "Score": homogeneity_score(y_test, kmeans.predict(X_test)),
                    "Type": "Classic",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

            # threshold to determine whether a hole is relevant or not
            frac = 0.7

            # TDA k-means
            features = []
            for name, _ in grouped_data:
                X_filtered = pipeline.fit_transform(
                    grouped_data.get_group(name)["adults"].values
                )
                n_windows, n_points, _ = X_filtered.shape
                features.append(
                    get_mean_lifetime(X_filtered, n_windows, n_points)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=frac)
                    + get_n_rel_holes(X_filtered, n_windows, n_points, frac=0.0)
                    + get_max_lifetime(X_filtered, n_windows, n_points)
                    + get_amplitude(X_filtered)
                )

            # define data matrix for k-means
            X_tda = np.array(features)

            X_tda_train = X_tda[id_train, :]
            X_tda_test = X_tda[id_test, :]

            # k means
            kmeans_tda = KMeans(n_clusters=2, random_state=0)
            kmeans_tda.fit(X_tda_train)

            perf_train = perf_train.append(
                {
                    "Score": homogeneity_score(y_train, kmeans_tda.labels_),
                    "Type": "TDA",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )

            perf_test = perf_test.append(
                {
                    "Score": homogeneity_score(y_test, kmeans_tda.predict(X_tda_test)),
                    "Type": "TDA",
                    "Mean Standard Deviation of Noise": noise,
                },
                ignore_index=True,
            )
        mb.first_bar.comment = "Noise level"

    # write performance metrics to disk
    with open("models/performance_metrics_train.pkl", "wb") as file:
        pickle.dump(perf_train, file)

    with open("models/performance_metrics_test.pkl", "wb") as file:
        pickle.dump(perf_test, file)