Beispiel #1
0
    def test_local_dask_cluster_extraction(self):
        """Check feature extraction when run through a ``LocalDaskDistributor``.

        Builds a single-worker ``LocalDaskDistributor``, extracts features
        from the sample produced by ``self.create_test_data_sample()`` and
        verifies that the result is a ``pd.DataFrame`` whose selected feature
        columns hold the expected per-id values.
        """
        distributor = LocalDaskDistributor(n_workers=1)

        df = self.create_test_data_sample()
        extracted_features = extract_features(df,
                                              column_id="id",
                                              column_sort="sort",
                                              column_kind="kind",
                                              column_value="val",
                                              distributor=distributor)

        self.assertIsInstance(extracted_features, pd.DataFrame)

        # Features whose expected values are whole numbers are compared
        # exactly. ``err_msg`` names the feature so a failure points at the
        # offending column instead of a bare "False is not true".
        exact_expected = {
            "a__maximum": [71, 77],
            "a__sum_values": [691, 1017],
            "a__abs_energy": [32211, 63167],
            "b__sum_values": [757, 695],
            "b__minimum": [3, 1],
            "b__abs_energy": [36619, 35483],
        }
        for feature, expected in exact_expected.items():
            np.testing.assert_array_equal(extracted_features[feature],
                                          np.array(expected),
                                          err_msg=feature)

        # Fractional statistics are compared with a tolerance: a literal such
        # as 37.85 has no exact binary representation, so ``==`` on floats is
        # fragile.
        approx_expected = {
            "b__mean": [37.85, 34.75],
            "b__median": [39.5, 28.0],
        }
        for feature, expected in approx_expected.items():
            np.testing.assert_allclose(extracted_features[feature],
                                       np.array(expected),
                                       err_msg=feature)
# Load the sales export: semicolon-separated, decimal commas, day-first dates.
# Only the listed columns are read and 'Date' becomes the parsed index.
# The path uses a forward slash: the previous 'data\K...' literal contained
# the invalid escape sequence '\K' (a warning today, an error in future Python
# versions) and a backslash separator only resolves on Windows.
raw_data = pd.read_csv('data/Key Metrics_full.csv',
                       sep=";",
                       decimal=',',
                       index_col='Date',
                       parse_dates=True,
                       dayfirst=True,
                       usecols=[
                           'Date', 'Team Name', 'Product Category',
                           'Product Subcategory', 'Sales'
                       ])

# Clean & Prep Data
clean_df = DataPrep.DataPrepping(raw_data)
# The raw frame is no longer needed once the cleaned copy exists.
del raw_data

# One time series per (team, product subcategory) pair: the concatenated
# names serve as the series identifier.
clean_df['TS_ID'] = clean_df['Team Name'] + clean_df['Product Subcategory']
# Drop the descriptive columns so they are not passed on to extract_features.
TS_Ready = clean_df.drop(
    ['Team Name', 'Product Category', 'Product Subcategory'], axis=1)

Distributor = LocalDaskDistributor(n_workers=3)

# NOTE(review): 'Salesdate' is not among the columns read above; presumably
# DataPrep.DataPrepping creates or renames it -- confirm before relying on it.
extracted_features = extract_features(timeseries_container=TS_Ready,
                                      column_id='TS_ID',
                                      column_sort="Salesdate",
                                      distributor=Distributor)

# Copy the index into a regular column so it survives the index-less CSV
# export below.
extracted_features['TS_ID'] = extracted_features.index
# Forward slash for the same reason as the input path ('\T' is an invalid
# escape sequence).
extracted_features.to_csv('data/TSFresh Features.csv', index=False)

# Bare expression: shows the column index in an interactive session or
# notebook; it has no visible effect when the file runs as a script.
extracted_features.columns