Esempio n. 1
0
    def test_dask_dataframe_without_kind(self):
        """Check to_tsdata on a wide dask dataframe without a kind column.

        Covers both the plain case and the case with an explicit sort
        column (where values must come back ordered by "sort").
        """

        def test_f(chunk):
            # chunk is an (id, kind, timeseries) triple; re-assemble it
            # into a long-format frame so results can be compared easily.
            return pd.DataFrame({
                "id": chunk[0],
                "variable": chunk[1],
                "value": chunk[2]
            })

        # The meta must describe the frame test_f actually produces:
        # "variable" holds the original column names (strings) and the
        # asserted "value" column is float - matching the dtypes used by
        # the ApplyDistributor meta in the extraction code.
        meta = (("id", "int"), ("variable", "object"), ("value", "float64"))

        test_df = dd.from_pandas(pd.DataFrame({
            "id": [1, 2],
            "value_a": [1, 2],
            "value_b": [3, 4]
        }),
                                 npartitions=1)

        result = to_tsdata(test_df, column_id="id")
        self.assertEqual(result.column_id, "id")

        return_f = result.apply(test_f, meta=meta).compute()
        pd.testing.assert_frame_equal(
            return_f.reset_index(drop=True),
            pd.DataFrame({
                "id": [1, 2, 1, 2],
                "variable": ["value_a", "value_a", "value_b", "value_b"],
                "value": [1.0, 2.0, 3.0, 4.0]
            }))

        # With a sort column the per-id values must follow "sort" order.
        test_df = dd.from_pandas(pd.DataFrame({
            "id": [1, 1],
            "sort": [2, 1],
            "value_a": [1, 2],
            "value_b": [3, 4]
        }),
                                 npartitions=1)

        result = to_tsdata(test_df, column_id="id", column_sort="sort")
        self.assertEqual(result.column_id, "id")

        return_f = result.apply(test_f, meta=meta).compute()

        pd.testing.assert_frame_equal(
            return_f.reset_index(drop=True),
            pd.DataFrame({
                "id": [1, 1, 1, 1],
                "variable": ["value_a", "value_a", "value_b", "value_b"],
                "value": [2.0, 1.0, 4.0, 3.0]
            }))
Esempio n. 2
0
    def test_dask_dataframe_with_kind(self):
        """Check to_tsdata on a long dask dataframe with a kind column."""
        test_df = dd.from_pandas(pd.DataFrame({
            "id": [1, 2],
            "kind": ["a", "a"],
            "value": [1, 2]
        }),
                                 npartitions=1)

        result = to_tsdata(test_df, column_id="id", column_kind="kind")
        self.assertEqual(result.column_id, "id")
        self.assertEqual(result.column_kind, "kind")
        self.assertEqual(result.column_value, "value")

        def test_f(chunk):
            # chunk is an (id, kind, timeseries) triple.
            return pd.DataFrame({
                "id": chunk[0],
                "variable": chunk[1],
                "value": chunk[2]
            })

        # "variable" carries the kind (a string) and the asserted "value"
        # column is float, so declare the meta with matching dtypes
        # (consistent with the ApplyDistributor meta in the extraction code).
        return_f = result.apply(test_f,
                                meta=(("id", "int"), ("variable", "object"),
                                      ("value", "float64"))).compute()
        pd.testing.assert_frame_equal(
            return_f,
            pd.DataFrame({
                "id": [1, 2],
                "variable": ["a", "a"],
                "value": [1.0, 2.0]
            }))
Esempio n. 3
0
    def test_simple_data_sample_four_timeseries(self):
        """to_tsdata must split the standard sample into the expected chunks."""
        sample = self.create_test_data_sample()
        sample.index.name = None
        # Pre-sort so the produced chunk order is deterministic.
        sample.sort_values(by=["id", "kind", "sort"], inplace=True)

        tsdata = to_tsdata(sample, "id", "kind", "val", "sort")

        self.assert_data_chunk_object_equal(tsdata, TEST_DATA_EXPECTED_TUPLES)
Esempio n. 4
0
    def test_simple_data_sample_two_timeseries(self):
        """Check chunk extraction for a frame with one id and two kinds."""
        frame = pd.DataFrame({
            "id": [10, 10, 10, 10],
            "kind": ["a", "a", "b", "b"],
            "val": [36, 71, 78, 37]
        })
        # Use the id column as (unnamed) index as well.
        frame.set_index("id", drop=False, inplace=True)
        frame.index.name = None

        tsdata = to_tsdata(frame, "id", "kind", "val")
        # One chunk per (id, kind) pair; the index keeps the id values.
        expected = [
            (10, 'a', pd.Series([36, 71], index=[10, 10], name="val")),
            (10, 'b', pd.Series([78, 37], index=[10, 10], name="val")),
        ]
        self.assert_data_chunk_object_equal(tsdata, expected)
Esempio n. 5
0
    def test_with_dictionaries_two_rows(self):
        """A dict of dataframes must yield one chunk per dictionary key."""
        frame = pd.DataFrame([
            {"value": 1, "id": "id_1"},
            {"value": 2, "id": "id_1"},
        ])

        tsdata = to_tsdata({"a": frame, "b": frame},
                           column_id="id",
                           column_value="value")
        # The dictionary key becomes the kind of each chunk.
        expected = [
            ("id_1", 'a', pd.Series([1, 2], index=[0, 1], name="value")),
            ("id_1", 'b', pd.Series([1, 2], index=[0, 1], name="value")),
        ]
        self.assert_data_chunk_object_equal(tsdata, expected)
Esempio n. 6
0
    def test_wide_dataframe_order_preserved_with_sort_column(self):
        """The order given by the sort column of a wide container is kept."""
        frame = pd.DataFrame({
            'id': ["a", "a", "b"],
            'v1': [3, 2, 1],
            'v2': [13, 12, 11],
            'sort': [103, 102, 101]
        })

        tsdata = to_tsdata(frame, column_id="id", column_sort="sort")

        # Rows come back re-ordered by "sort"; the original index labels
        # travel along with the values.
        expected = [
            ("a", 'v1', pd.Series([2, 3], index=[1, 0], name="v1")),
            ("a", 'v2', pd.Series([12, 13], index=[1, 0], name="v2")),
            ("b", 'v1', pd.Series([1], index=[2], name="v1")),
            ("b", 'v2', pd.Series([11], index=[2], name="v2")),
        ]
        self.assert_data_chunk_object_equal(tsdata, expected)
Esempio n. 7
0
def _do_extraction(df, column_id, column_value, column_kind, column_sort,
                   default_fc_parameters, kind_to_fc_parameters, n_jobs,
                   chunk_size, disable_progressbar, show_warnings, distributor,
                   pivot):
    """
    Wrapper around the _do_extraction_on_chunk, which calls it on all chunks in the data frame.
    A chunk is a subset of the data, with a given kind and id - so a single time series.

    The data is separated out into those single time series and the _do_extraction_on_chunk is
    called on each of them. The results are then combined into a single pandas DataFrame.

    The call is either happening in parallel or not and is showing a progress bar or not depending
    on the given flags.

    :param df: The dataframe in the normalized format which is used for extraction.
    :type df: pd.DataFrame

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_sort: The name of the column to sort the time series by (or None).
    :type column_sort: str

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param chunk_size: The size of one chunk for the parallelization
    :type chunk_size: None or int

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param distributor: Advanced parameter:  See the utilities/distribution.py for more information.
                         Leave to None, if you want TSFresh to choose the best distributor.
    :type distributor: DistributorBaseClass

    :param pivot: If False, return the raw map_reduce result instead of pivoting it into a feature matrix.
    :type pivot: bool

    :return: the extracted features
    :rtype: pd.DataFrame
    """
    data = to_tsdata(df, column_id, column_kind, column_value, column_sort)

    # Choose a sensible default distributor if the caller did not pass one.
    if distributor is None:
        if not isinstance(data, Iterable):
            # Non-iterable data (e.g. dask-backed) is processed via apply.
            distributor = ApplyDistributor(meta=[
                (data.column_id, 'int64'),
                ('variable', 'object'),
                ('value', 'float64'),
            ])
        elif n_jobs == 0:
            # No parallelization requested: plain sequential map.
            distributor = MapDistributor(
                disable_progressbar=disable_progressbar,
                progressbar_title="Feature Extraction")
        else:
            distributor = MultiprocessingDistributor(
                n_workers=n_jobs,
                disable_progressbar=disable_progressbar,
                progressbar_title="Feature Extraction",
                show_warnings=show_warnings)

    if not isinstance(distributor, DistributorBaseClass):
        raise ValueError(
            "the passed distributor is not an DistributorBaseClass object")

    result = distributor.map_reduce(
        _do_extraction_on_chunk,
        data=data,
        chunk_size=chunk_size,
        function_kwargs=dict(default_fc_parameters=default_fc_parameters,
                             kind_to_fc_parameters=kind_to_fc_parameters))

    if pivot:
        return data.pivot(result)

    return result