def test_dask_dataframe_without_kind(self):
    """Wide dask dataframes (no kind column) are melted into long format."""

    def _melt_chunk(chunk):
        # Rebuild a long-format frame from an (id, variable, value) chunk.
        return pd.DataFrame({
            "id": chunk[0],
            "variable": chunk[1],
            "value": chunk[2]
        })

    meta = (("id", "int"), ("variable", "int"), ("value", "int"))

    # Case 1: no sort column - values keep their original row order.
    wide_df = dd.from_pandas(
        pd.DataFrame({
            "id": [1, 2],
            "value_a": [1, 2],
            "value_b": [3, 4]
        }),
        npartitions=1)
    tsdata = to_tsdata(wide_df, column_id="id")
    self.assertEqual(tsdata.column_id, "id")

    melted = tsdata.apply(_melt_chunk, meta=meta).compute()
    pd.testing.assert_frame_equal(
        melted.reset_index(drop=True),
        pd.DataFrame({
            "id": [1, 2, 1, 2],
            "variable": ["value_a", "value_a", "value_b", "value_b"],
            "value": [1.0, 2.0, 3.0, 4.0]
        }))

    # Case 2: with a sort column - rows are ordered by "sort" per id.
    wide_df = dd.from_pandas(
        pd.DataFrame({
            "id": [1, 1],
            "sort": [2, 1],
            "value_a": [1, 2],
            "value_b": [3, 4]
        }),
        npartitions=1)
    tsdata = to_tsdata(wide_df, column_id="id", column_sort="sort")
    self.assertEqual(tsdata.column_id, "id")

    melted = tsdata.apply(_melt_chunk, meta=meta).compute()
    pd.testing.assert_frame_equal(
        melted.reset_index(drop=True),
        pd.DataFrame({
            "id": [1, 1, 1, 1],
            "variable": ["value_a", "value_a", "value_b", "value_b"],
            "value": [2.0, 1.0, 4.0, 3.0]
        }))
def test_dask_dataframe_with_kind(self):
    """Long-format dask dataframes with an explicit kind column pass through."""
    wide_df = dd.from_pandas(
        pd.DataFrame({
            "id": [1, 2],
            "kind": ["a", "a"],
            "value": [1, 2]
        }),
        npartitions=1)

    tsdata = to_tsdata(wide_df, column_id="id", column_kind="kind")
    # All three column roles should be picked up from the input.
    self.assertEqual(tsdata.column_id, "id")
    self.assertEqual(tsdata.column_kind, "kind")
    self.assertEqual(tsdata.column_value, "value")

    def _melt_chunk(chunk):
        # Rebuild a long-format frame from an (id, variable, value) chunk.
        return pd.DataFrame({
            "id": chunk[0],
            "variable": chunk[1],
            "value": chunk[2]
        })

    melted = tsdata.apply(
        _melt_chunk,
        meta=(("id", "int"), ("variable", "int"), ("value", "int"))).compute()
    pd.testing.assert_frame_equal(
        melted,
        pd.DataFrame({
            "id": [1, 2],
            "variable": ["a", "a"],
            "value": [1.0, 2.0]
        }))
def test_simple_data_sample_four_timeseries(self):
    """The canonical test sample decomposes into the expected chunk tuples."""
    sample = self.create_test_data_sample()
    sample.index.name = None
    sample.sort_values(by=["id", "kind", "sort"], inplace=True)

    chunks = to_tsdata(sample, "id", "kind", "val", "sort")
    self.assert_data_chunk_object_equal(chunks, TEST_DATA_EXPECTED_TUPLES)
def test_simple_data_sample_two_timeseries(self):
    """A single id with two kinds yields one chunk per (id, kind) pair."""
    frame = pd.DataFrame({
        "id": [10] * 4,
        "kind": ["a", "a", "b", "b"],
        "val": [36, 71, 78, 37]
    })
    # Use the id column as index too, mimicking a pre-indexed input.
    frame.set_index("id", drop=False, inplace=True)
    frame.index.name = None

    chunks = to_tsdata(frame, "id", "kind", "val")
    expected = [
        (10, 'a', pd.Series([36, 71], index=[10, 10], name="val")),
        (10, 'b', pd.Series([78, 37], index=[10, 10], name="val")),
    ]
    self.assert_data_chunk_object_equal(chunks, expected)
def test_with_dictionaries_two_rows(self):
    """Dict inputs use the dict keys as the kind of each contained frame."""
    frame = pd.DataFrame([
        {"value": 1, "id": "id_1"},
        {"value": 2, "id": "id_1"},
    ])

    chunks = to_tsdata({"a": frame, "b": frame},
                       column_id="id", column_value="value")
    expected = [
        ("id_1", 'a', pd.Series([1, 2], index=[0, 1], name="value")),
        ("id_1", 'b', pd.Series([1, 2], index=[0, 1], name="value")),
    ]
    self.assert_data_chunk_object_equal(chunks, expected)
def test_wide_dataframe_order_preserved_with_sort_column(self):
    """
    verifies that the order of the sort column from a wide time series
    container is preserved
    """
    frame = pd.DataFrame({
        'id': ["a", "a", "b"],
        'v1': [3, 2, 1],
        'v2': [13, 12, 11],
        'sort': [103, 102, 101]
    })

    chunks = to_tsdata(frame, column_id="id", column_sort="sort")
    # Within each id, rows must come back ordered by the sort column
    # (ascending), carrying their original positional index along.
    expected = [
        ("a", 'v1', pd.Series([2, 3], index=[1, 0], name="v1")),
        ("a", 'v2', pd.Series([12, 13], index=[1, 0], name="v2")),
        ("b", 'v1', pd.Series([1], index=[2], name="v1")),
        ("b", 'v2', pd.Series([11], index=[2], name="v2")),
    ]
    self.assert_data_chunk_object_equal(chunks, expected)
def _do_extraction(df, column_id, column_value, column_kind, column_sort,
                   default_fc_parameters, kind_to_fc_parameters,
                   n_jobs, chunk_size, disable_progressbar, show_warnings,
                   distributor, pivot):
    """
    Wrapper around the _do_extraction_on_chunk, which calls it on all chunks in the data frame.
    A chunk is a subset of the data, with a given kind and id - so a single time series.

    The data is separated out into those single time series and the
    _do_extraction_on_chunk is called on each of them. The results are then combined into a single
    pandas DataFrame.

    The call is either happening in parallel or not and is showing a progress bar or not depending
    on the given flags.

    :param df: The dataframe in the normalized format which is used for extraction.
    :type df: pd.DataFrame

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those
           names which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param column_sort: The name of the column to sort each time series by (or None).
    :type column_sort: str

    :param chunk_size: The size of one chunk for the parallelization
    :type chunk_size: None or int

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param distributor: Advanced parameter: See the utilities/distribution.py for more information.
                         Leave to None, if you want TSFresh to choose the best distributor.
    :type distributor: DistributorBaseClass

    :param pivot: If False, return the raw (unpivoted) extraction result instead of a feature matrix.
    :type pivot: bool

    :return: the extracted features
    :rtype: pd.DataFrame
    """
    data = to_tsdata(df, column_id, column_kind, column_value, column_sort)

    if distributor is None:
        # Choose a sensible default distributor: plain iterables are handled
        # locally (serially or via multiprocessing); everything else (e.g. a
        # dask-backed container) goes through the apply-based distributor.
        if isinstance(data, Iterable):
            if n_jobs == 0:
                distributor = MapDistributor(
                    disable_progressbar=disable_progressbar,
                    progressbar_title="Feature Extraction")
            else:
                distributor = MultiprocessingDistributor(
                    n_workers=n_jobs,
                    disable_progressbar=disable_progressbar,
                    progressbar_title="Feature Extraction",
                    show_warnings=show_warnings)
        else:
            distributor = ApplyDistributor(
                meta=[(data.column_id, 'int64'),
                      ('variable', 'object'),
                      ('value', 'float64')])

    if not isinstance(distributor, DistributorBaseClass):
        # Fixed grammar of the error message ("an" -> "a").
        raise ValueError(
            "the passed distributor is not a DistributorBaseClass object")

    kwargs = dict(default_fc_parameters=default_fc_parameters,
                  kind_to_fc_parameters=kind_to_fc_parameters)

    result = distributor.map_reduce(_do_extraction_on_chunk, data=data,
                                    chunk_size=chunk_size,
                                    function_kwargs=kwargs)

    if not pivot:
        return result

    # Pivot the long (id, variable, value) result into a feature matrix.
    return_df = data.pivot(result)
    return return_df