Example #1
    def test_with_df(self):
        # give everything
        test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}])
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_df, "id", "sort", "kind", "value")

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_kind, "kind")
        self.assertIn("a", set(result_df[column_kind]))
        six.assertCountEqual(self, list(result_df.columns),
                             ["id", "value", "kind"])
        self.assertEqual(
            list(result_df[result_df[column_kind] == "a"]["value"]), [3])
        self.assertEqual(list(result_df[result_df[column_kind] == "a"]["id"]),
                         [0])

        # give no kind
        test_df = pd.DataFrame([{"id": 0, "value": 3, "sort": 1}])
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_df, "id", "sort", None, "value")

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_kind, "_variables")
        self.assertIn("feature", set(result_df[column_kind]))
        six.assertCountEqual(self, list(result_df.columns),
                             ["id", "value", "_variables"])
        self.assertEqual(
            list(result_df[result_df[column_kind] == "feature"]["value"]), [3])
        self.assertEqual(
            list(result_df[result_df[column_kind] == "feature"]["id"]), [0])

        # Let the function find the values
        test_df = pd.DataFrame([{"id": 0, "a": 3, "b": 5, "sort": 1}])
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_df, "id", "sort", None, None)

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "_values")
        self.assertEqual(column_kind, "_variables")
        self.assertIn("a", set(result_df[column_kind]))
        self.assertIn("b", set(result_df[column_kind]))
        six.assertCountEqual(self, list(result_df.columns),
                             ["_values", "_variables", "id"])
        self.assertEqual(
            list(result_df[result_df[column_kind] == "a"]["_values"]), [3])
        self.assertEqual(list(result_df[result_df[column_kind] == "a"]["id"]),
                         [0])
        self.assertEqual(
            list(result_df[result_df[column_kind] == "b"]["_values"]), [5])
        self.assertEqual(list(result_df[result_df[column_kind] == "b"]["id"]),
                         [0])
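
A hedged aside, not part of the original test: when neither column_kind nor column_value is given, the normalization amounts to melting the non-id, non-sort columns into long format. A minimal sketch of that layout, assuming only pandas:

import pandas as pd

# Sketch of the molten layout the last assertion block expects: one row per
# (id, kind, value) triple, with kinds taken from the former column names.
test_df = pd.DataFrame([{"id": 0, "a": 3, "b": 5, "sort": 1}])
molten = test_df.drop(columns="sort").melt(
    id_vars="id", var_name="_variables", value_name="_values")
print(molten.to_dict("records"))
# [{'id': 0, '_variables': 'a', '_values': 3}, {'id': 0, '_variables': 'b', '_values': 5}]
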
Example #2
    def test_with_df(self):
        # give everything
        test_df = pd.DataFrame([{"id": 0, "kind": "a", "value": 3, "sort": 1}])
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_df, "id", "sort", "kind", "value")

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_kind, "kind")
        self.assertIn("a", set(result_df[column_kind]))
        six.assertCountEqual(self, list(result_df.columns), ["id", "value", "kind"])
        self.assertEqual(list(result_df[result_df[column_kind] == "a"]["value"]), [3])
        self.assertEqual(list(result_df[result_df[column_kind] == "a"]["id"]), [0])

        # give no kind
        test_df = pd.DataFrame([{"id": 0, "value": 3, "sort": 1}])
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_df, "id", "sort", None, "value")

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_kind, "_variables")
        self.assertIn("value", set(result_df[column_kind]))
        six.assertCountEqual(self, list(result_df.columns), ["id", "value", "_variables"])
        self.assertEqual(list(result_df[result_df[column_kind] == "value"]["value"]), [3])
        self.assertEqual(list(result_df[result_df[column_kind] == "value"]["id"]), [0])

        # Let the function find the values
        test_df = pd.DataFrame([{"id": 0, "a": 3, "b": 5, "sort": 1}])
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_df, "id", "sort", None, None)

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "_values")
        self.assertEqual(column_kind, "_variables")
        self.assertIn("a", set(result_df[column_kind]))
        self.assertIn("b", set(result_df[column_kind]))
        six.assertCountEqual(self, list(result_df.columns), ["_values", "_variables", "id"])
        self.assertEqual(list(result_df[result_df[column_kind] == "a"]["_values"]), [3])
        self.assertEqual(list(result_df[result_df[column_kind] == "a"]["id"]), [0])
        self.assertEqual(list(result_df[result_df[column_kind] == "b"]["_values"]), [5])
        self.assertEqual(list(result_df[result_df[column_kind] == "b"]["id"]), [0])
Example #3
    def test_with_dictionaries_two_rows_sorted(self):
        test_df = pd.DataFrame([{"value": 2, "id": "id_1"},
                                {"value": 1, "id": "id_1"}])
        test_dict = {"a": test_df, "b": test_df}

        # Pass the id
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_dict, "id", None, None, "value")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")

        self.assertEqual(result_df[result_df[column_kind] == "a"].iloc[0].to_dict(), {"_variables": "a", "value": 2, "id": "id_1"})
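
The dictionary case can be pictured as a concat in which each dict key becomes the kind label. A sketch under that assumption (not the library's actual implementation):

import pandas as pd

test_df = pd.DataFrame([{"value": 2, "id": "id_1"},
                        {"value": 1, "id": "id_1"}])
test_dict = {"a": test_df, "b": test_df}

# Each dictionary key turns into the "_variables" (kind) label for its rows.
flat = pd.concat(
    [df.assign(_variables=kind) for kind, df in test_dict.items()],
    ignore_index=True)
print(flat[flat["_variables"] == "a"].iloc[0].to_dict())
# {'value': 2, 'id': 'id_1', '_variables': 'a'}
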
Example #4
    def test_with_df_2(self):
        # give no kind
        test_df = pd.DataFrame([{"id": 0, "value": 3, "sort": 1}])
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_df, "id", "sort", None, "value")

        self.assertEqual(column_id, "id")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_kind, "_variables")
        self.assertIn("value", set(result_df[column_kind]))
        self.assertCountEqual(list(result_df.columns), ["id", "value", "_variables"])
        self.assertEqual(list(result_df[result_df[column_kind] == "value"]["value"]), [3])
        self.assertEqual(list(result_df[result_df[column_kind] == "value"]["id"]), [0])
Example #5
    def test_wide_dataframe_order_preserved(self):
        """ verifies that the order of the time series inside a wide time series container are preserved
        (columns_sort=None)
        """
        test_df = pd.DataFrame({'id': ["a", "a", "a", "b"],
                                'v1': [4, 3, 2, 1],
                                'v2': [14, 13, 12, 11]})

        melt_df, _, _, _ = \
            dataframe_functions._normalize_input_to_internal_representation(
                test_df, column_id="id", column_sort=None, column_kind=None, column_value=None)

        assert (test_df.query("id=='a'")["v1"].values ==
                melt_df.query("id=='a'").query("_variables=='v1'")["_values"].values).all()
        assert (test_df.query("id=='a'")["v2"].values ==
                melt_df.query("id=='a'").query("_variables=='v2'")["_values"].values).all()
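
The order guarantee can be cross-checked with a plain pandas melt, which stacks the wide columns without reordering rows. A standalone sketch, assuming only pandas:

import pandas as pd

test_df = pd.DataFrame({'id': ["a", "a", "a", "b"],
                        'v1': [4, 3, 2, 1],
                        'v2': [14, 13, 12, 11]})
# melt keeps the original row order within each former column
melted = test_df.melt(id_vars="id", var_name="_variables", value_name="_values")
print(melted.query("id == 'a' and _variables == 'v1'")["_values"].tolist())  # [4, 3, 2]
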
Example #6
    def test_wide_dataframe_order_preserved_with_sort_column(self):
        """ verifies that the order of the sort column from a wide time series container is preserved
        """

        test_df = pd.DataFrame({'id': ["a", "a", "b"],
                                'v1': [3, 2, 1],
                                'v2': [13, 12, 11],
                                'sort': [103, 102, 101]})

        melt_df, _, _, _ = \
            dataframe_functions._normalize_input_to_internal_representation(
                test_df, column_id="id", column_sort="sort", column_kind=None, column_value=None)

        assert (test_df.sort_values("sort").query("id=='a'")["v1"].values ==
                melt_df.query("id=='a'").query("_variables=='v1'")["_values"].values).all()
        assert (test_df.sort_values("sort").query("id=='a'")["v2"].values ==
                melt_df.query("id=='a'").query("_variables=='v2'")["_values"].values).all()
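
For the sorted case, the invariant under test is simply that the melted output follows the ascending sort column. A quick standalone check of that expectation:

import pandas as pd

test_df = pd.DataFrame({'id': ["a", "a", "b"],
                        'v1': [3, 2, 1],
                        'sort': [103, 102, 101]})
# ascending sort column => v1 for id "a" must come out as [2, 3]
print(test_df.sort_values("sort").query("id == 'a'")["v1"].tolist())  # [2, 3]
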
Example #7
    def test_with_dictionaries_two_rows(self):
        test_df = pd.DataFrame([{"value": 2, "sort": 2, "id": "id_1"},
                                {"value": 1, "sort": 1, "id": "id_1"}])
        test_dict = {"a": test_df, "b": test_df}

        # If there is more than one candidate value column, the algorithm cannot choose the correct one
        self.assertRaises(ValueError, dataframe_functions._normalize_input_to_internal_representation, test_dict,
                          "id", None, None, None)

        # Sorting should work
        result_df, column_id, column_kind, column_value = \
            dataframe_functions._normalize_input_to_internal_representation(test_dict, "id", "sort", None, "value")
        self.assertEqual(column_value, "value")
        self.assertEqual(column_id, "id")

        # Assert sorted and without sort column
        self.assertEqual(result_df[result_df[column_kind] == "a"].iloc[0].to_dict(), {"_variables": "a", "value": 1, "id": "id_1"})
        self.assertEqual(result_df[result_df[column_kind] == "a"].iloc[1].to_dict(), {"_variables": "a", "value": 2, "id": "id_1"})
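
The ValueError in the first assertion is easy to reproduce by hand: with neither column_sort nor column_value given, two candidate columns remain once the id column is set aside, so no unique value column can be inferred. A standalone illustration of the ambiguity:

import pandas as pd

df = pd.DataFrame([{"value": 2, "sort": 2, "id": "id_1"}])
candidates = [c for c in df.columns if c != "id"]
print(candidates)  # ['value', 'sort'] -- two candidates, hence the ValueError
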
Example #8
def extract_features(timeseries_container, default_fc_parameters=None,
                     kind_to_fc_parameters=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     chunksize=defaults.CHUNKSIZE,
                     n_jobs=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS,
                     disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                     impute_function=defaults.IMPUTE_FUNCTION,
                     profile=defaults.PROFILING,
                     profiling_filename=defaults.PROFILING_FILENAME,
                     profiling_sorting=defaults.PROFILING_SORTING,
                     distributor=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.ComprehensiveFCParameters` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the :class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value) will be used instead of the default_fc_parameters. This means that kinds,
            for which kind_to_fc_parameters does not have any entries, will be ignored by the feature extraction.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param chunksize: The size of one chunk that is submitted to the worker
        process for the parallelisation, where one chunk is defined as the
        time series for one id and one kind. If you set the chunksize
        to 10, then it means that one task is to calculate all features for 10
        time series. If it is set to None, depending on the distributor,
        heuristics are used to find the optimal chunksize. If you get out-of-memory
        exceptions, you can try it with the dask distributor and a
        smaller chunksize.
    :type chunksize: None or int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen, or the function to call for imputing.
    :type impute_function: None or callable

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package for
           more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :param distributor: Advanced parameter: set this to a class name that you want to use as a
             distributor. See the utilities/distribution.py for more information. Leave to None, if you want
             TSFresh to choose the best distributor.
    :type distributor: class

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    df_melt, column_id, column_kind, column_value = \
        dataframe_functions._normalize_input_to_internal_representation(
            timeseries_container=timeseries_container,
            column_id=column_id, column_kind=column_kind,
            column_sort=column_sort,
            column_value=column_value)
    # Use the standard settings if the user did not supply their own.
    if default_fc_parameters is None and kind_to_fc_parameters is None:
        default_fc_parameters = ComprehensiveFCParameters()
    elif default_fc_parameters is None and kind_to_fc_parameters is not None:
        default_fc_parameters = {}

    # If requested, do profiling (advanced feature)
    if profile:
        profiler = profiling.start_profiling()

    with warnings.catch_warnings():
        if not show_warnings:
            warnings.simplefilter("ignore")
        else:
            warnings.simplefilter("default")

        result = _do_extraction(df=df_melt,
                                column_id=column_id, column_value=column_value,
                                column_kind=column_kind,
                                n_jobs=n_jobs, chunk_size=chunksize,
                                disable_progressbar=disable_progressbar,
                                default_fc_parameters=default_fc_parameters,
                                kind_to_fc_parameters=kind_to_fc_parameters,
                                distributor=distributor)

        # Impute the result if requested
        if impute_function is not None:
            impute_function(result)

    # Turn off profiling if it was turned on
    if profile:
        profiling.end_profiling(profiler, filename=profiling_filename,
                                sorting=profiling_sorting)

    return result
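
Beyond the doctest in the docstring, a hedged usage sketch that restricts the feature set; MinimalFCParameters is a small built-in settings object, and the import paths below are assumed from tsfresh's public API:

from tsfresh.examples import load_robot_execution_failures
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters

# Swap the comprehensive default for a handful of cheap features.
df, _ = load_robot_execution_failures()
X = extract_features(df, column_id="id", column_sort="time",
                     default_fc_parameters=MinimalFCParameters())
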
Example #9
def extract_features(timeseries_container, default_fc_parameters=None,
                     kind_to_fc_parameters=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     chunksize=defaults.CHUNKSIZE,
                     n_jobs=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS,
                     disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                     impute_function=defaults.IMPUTE_FUNCTION,
                     profile=defaults.PROFILING,
                     profiling_filename=defaults.PROFILING_FILENAME,
                     profiling_sorting=defaults.PROFILING_SORTING,
                     distributor=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.ComprehensiveFCParameters` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the :class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value) will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param chunksize: The size of one chunk for the parallelisation
    :type chunksize: None or int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen, or the function to call for imputing.
    :type impute_function: None or callable

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package for
           more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :param distributor: Advanced parameter: set this to a class name that you want to use as a
             distributor. See the utilities/distribution.py for more information. Leave to None, if you want
             TSFresh to choose the best distributor.
    :type distributor: class

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    import logging
    logging.basicConfig()

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    df_melt, column_id, column_kind, column_value = \
        dataframe_functions._normalize_input_to_internal_representation(timeseries_container=timeseries_container,
                                                                        column_id=column_id, column_kind=column_kind,
                                                                        column_sort=column_sort,
                                                                        column_value=column_value)

    # Use the standard settings if the user did not supply their own.
    if default_fc_parameters is None:
        default_fc_parameters = ComprehensiveFCParameters()

    # If requested, do profiling (advanced feature)
    if profile:
        profiler = profiling.start_profiling()

    with warnings.catch_warnings():
        if not show_warnings:
            warnings.simplefilter("ignore")
        else:
            warnings.simplefilter("default")

        result = _do_extraction(df=df_melt,
                                column_id=column_id, column_value=column_value, column_kind=column_kind,
                                n_jobs=n_jobs, chunk_size=chunksize,
                                disable_progressbar=disable_progressbar,
                                default_fc_parameters=default_fc_parameters,
                                kind_to_fc_parameters=kind_to_fc_parameters,
                                distributor=distributor)

        # Impute the result if requested
        if impute_function is not None:
            impute_function(result)

    # Turn off profiling if it was turned on
    if profile:
        profiling.end_profiling(profiler, filename=profiling_filename,
                                sorting=profiling_sorting)

    return result
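
A sketch of kind_to_fc_parameters, the per-kind override described in the docstring. The kind names and the tiny frame are made up; "maximum" and "minimum" are standard tsfresh calculator names:

import pandas as pd
from tsfresh.feature_extraction import extract_features

ts = pd.DataFrame({"id": [1, 1, 1, 1],
                   "kind": ["temperature", "temperature", "pressure", "pressure"],
                   "value": [20.0, 21.0, 1.0, 1.1]})
# per-kind settings override default_fc_parameters for the kinds listed here
X = extract_features(ts, column_id="id", column_kind="kind", column_value="value",
                     kind_to_fc_parameters={"temperature": {"maximum": None},
                                            "pressure": {"minimum": None}})
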
Example #10
def compress(ts, compression_functions, interval_length, column_id,
             column_sort, column_kind, column_value):
    """
    This method compresses time series by applying a compression function on bins. Then the values of the compression
    function over the bins are returned as a new, compressed time series.

    This decreases the memory footprint of the time series. E.g. applying a single compression function to chunks
    of size 10 compresses the time series by a factor of 10.

    It is also possible to use multiple compression functions.

    The time series container ts must be in one of the formats that are supported by the tsfresh package.

    :param ts: The pandas.DataFrame with the time series to compute the features for, or a dictionary of pandas.DataFrames.
    :type ts: pandas.DataFrame or dict

    :param compression_functions: mapping from feature calculator names to parameters. See tsfresh documentation
    :type compression_functions: dict

    :param interval_length: the length of each bin to which the aggregation functions are applied
    :type interval_length: int

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str
    """

    dd, column_id, column_kind, column_value = \
        _normalize_input_to_internal_representation(ts, column_id, column_sort, column_kind, column_value)

    def create_bins(v):
        # assign each of the len(v) samples to consecutive bins of size interval_length
        n_bins = int(np.ceil(len(v) / float(interval_length)))
        return np.repeat(np.arange(n_bins), interval_length)[:len(v)]

    dd[column_id] = dd[column_id].apply(str) + "_bin_" + \
                    dd.groupby([column_id, column_kind])[column_value].transform(create_bins).apply(str)

    dd = extract_features(dd,
                          column_id=column_id,
                          column_value=column_value,
                          column_kind=column_kind,
                          default_fc_parameters=compression_functions)

    dd.columns = [x.replace("__", "_") for x in dd.columns]
    dd.columns = [x.replace("feature", "map") for x in dd.columns]
    dd.reset_index(drop=False, inplace=True)

    ids = dd[column_id].str.split("_bin_").apply(lambda s: s[0])
    # parse the bin label back into a number instead of eval-ing an arbitrary string
    bin_number = dd[column_id].str.split("_bin_").apply(lambda s: float(s[1]))

    dd[column_id] = ids
    dd["bin"] = bin_number

    return dd.sort_values(by=[column_id, "bin"])
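
A hypothetical call, assuming the compress helper above is in scope; "maximum" is a standard tsfresh calculator name and the tiny frame is made up:

import pandas as pd

ts = pd.DataFrame({"id": [1, 1, 1, 1],
                   "time": [0, 1, 2, 3],
                   "value": [1.0, 4.0, 2.0, 3.0]})
# bins of length 2, keeping each bin's maximum: 4.0 for bin 0, 3.0 for bin 1
compressed = compress(ts, compression_functions={"maximum": None},
                      interval_length=2, column_id="id", column_sort="time",
                      column_kind=None, column_value="value")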