def test_from_columns(self):
    tsn = "TEST_TIME_SERIES"

    fset = FeatureExtractionSettings()
    self.assertRaises(TypeError, fset.from_columns, 42)
    self.assertRaises(ValueError, fset.from_columns, ["This is not a column name"])
    self.assertRaises(ValueError, fset.from_columns, ["This__neither"])
    self.assertRaises(ValueError, fset.from_columns, ["This__also__not"])

    # Aggregate functions
    feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]

    # Aggregate functions with params
    feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70', tsn + '__number_peaks__n_30',
                      tsn + '__value_count__value_inf', tsn + '__value_count__value_-inf',
                      tsn + '__value_count__value_nan']

    # Apply functions
    feature_names += [tsn + '__ar_coefficient__k_20__coeff_4', tsn + '__ar_coefficient__coeff_10__k_-1']

    cset = fset.from_columns(feature_names)

    six.assertCountEqual(self, list(cset.kind_to_calculation_settings_mapping[tsn].keys()),
                         ["sum_values", "median", "length", "sample_entropy", "quantile",
                          "number_peaks", "ar_coefficient", "value_count"])

    self.assertEqual(cset.kind_to_calculation_settings_mapping[tsn]["sum_values"], None)
    self.assertEqual(cset.kind_to_calculation_settings_mapping[tsn]["ar_coefficient"],
                     [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])
    self.assertEqual(cset.kind_to_calculation_settings_mapping[tsn]["value_count"],
                     [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}])
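# Hedged usage sketch (not from the test suite): from_columns() reconstructs a
# settings object from extracted column names, which follow the pattern
# "<kind>__<calculator>" or "<kind>__<calculator>__<param>_<value>".
# The kind name "my_ts" is illustrative only.
settings = FeatureExtractionSettings()
cset = settings.from_columns(["my_ts__median", "my_ts__ar_coefficient__k_20__coeff_4"])
# Per the assertions above: parameterless calculators map to None,
# parameterized ones to a list of parameter dicts, i.e.
# {"median": None, "ar_coefficient": [{"k": 20, "coeff": 4}]}
print(cset.kind_to_calculation_settings_mapping["my_ts"])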
def test_functional_equality(self):
    """
    `extract_relevant_features` should be equivalent to running `extract_features` with impute first and
    `select_features` afterwards. That is, it should select the same relevant features, and the values of
    these features should be identical.
    """
    df, y = self.create_test_data_sample_with_target()

    relevant_features = extract_relevant_features(df, y, column_id='id', column_value='val',
                                                  column_kind='kind', column_sort='sort')

    extraction_settings = FeatureExtractionSettings()
    extraction_settings.IMPUTE = impute
    extracted_features = extract_features(df, feature_extraction_settings=extraction_settings,
                                          column_id='id', column_value='val',
                                          column_kind='kind', column_sort='sort')
    selected_features = select_features(extracted_features, y)

    self.assertEqual(set(relevant_features.columns), set(selected_features.columns),
                     "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(relevant_features.columns,
                                                                                   selected_features.columns))
    self.assertTrue((relevant_features.values == selected_features.values).all().all(),
                    "Should calculate the same feature values")
def test_profiling_file_written_out(self):
    fes = FeatureExtractionSettings()
    fes.PROFILING = True
    fes.PROFILING_FILENAME = "test_profiling.txt"

    df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "value": np.random.normal(0, 1, 20)})
    X = extract_features(df, column_id="id", column_value="value", feature_extraction_settings=fes)

    self.assertTrue(os.path.isfile(fes.PROFILING_FILENAME))
    os.remove(fes.PROFILING_FILENAME)
def setUp(self):
    self.test_df = self.create_test_data_sample()

    self.settings = FeatureExtractionSettings()
    self.settings.set_default_parameters("a")

    # Restrict the calculated features to "length" only, for both kinds
    calculation_settings_mapping = {
        "length": self.settings.kind_to_calculation_settings_mapping["a"]["length"]
    }
    self.settings.kind_to_calculation_settings_mapping = {
        "a": calculation_settings_mapping.copy(),
        "b": calculation_settings_mapping.copy()
    }
def __init__(self, evaluate_only_added_features=True,
             feature_selection_settings=None, feature_extraction_settings=None,
             column_id=None, column_sort=None, column_kind=None, column_value=None,
             timeseries_container=None):
    """
    Create a new RelevantFeatureAugmenter instance.

    :param evaluate_only_added_features: Whether to touch the manually-created features during feature
        selection or not.
    :type evaluate_only_added_features: bool
    :param feature_selection_settings: The feature selection settings. Leave empty to use the default ones.
    :type feature_selection_settings: tsfresh.feature_selection.settings.FeatureSelectionSettings
    :param feature_extraction_settings: The feature extraction settings. Leave empty to use the default ones.
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings
    :param column_id: The column with the id. See :mod:`~tsfresh.feature_extraction.extraction`.
    :type column_id: basestring
    :param column_sort: The column with the sort data. See :mod:`~tsfresh.feature_extraction.extraction`.
    :type column_sort: basestring
    :param column_kind: The column with the kind data. See :mod:`~tsfresh.feature_extraction.extraction`.
    :type column_kind: basestring
    :param column_value: The column with the values. See :mod:`~tsfresh.feature_extraction.extraction`.
    :type column_value: basestring
    """
    # We require imputing for feature selection, so if no settings were
    # supplied, range imputation will be our default strategy.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute_dataframe_range

    self.feature_extractor = FeatureAugmenter(feature_extraction_settings,
                                              column_id, column_sort, column_kind, column_value)
    self.feature_selector = FeatureSelector(feature_selection_settings)

    self.evaluate_only_added_features = evaluate_only_added_features
    self.timeseries_container = timeseries_container
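# Hedged usage sketch, mirroring the commented-out pipeline code elsewhere in
# this repo: the augmenter plugs into a scikit-learn Pipeline, with the raw
# time series handed in separately from the design matrix. `df_ts` (the time
# series container), `X` (one row per id) and `y` are assumed to exist.
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
                     ('classifier', DecisionTreeClassifier())])
pipeline.set_params(augmenter__timeseries_container=df_ts)
pipeline.fit(X, y)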
def transform(self, X):
    """
    After the fit step, it is known which features are relevant. Only extract those from the time series
    handed in with the function :func:`~set_timeseries_container`.

    If evaluate_only_added_features is False, also delete the irrelevant, already present features in the
    data frame.

    :param X: the data sample to add the relevant (and delete the irrelevant) features to.
    :type X: pandas.DataFrame or numpy.array

    :return: a data sample with the same information as X, but with added relevant time series features and
        deleted irrelevant information (only if evaluate_only_added_features is False).
    :rtype: pandas.DataFrame
    """
    if self.feature_selector.relevant_features is None:
        raise RuntimeError("You have to call fit before.")

    if self.timeseries_container is None:
        raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

    self.feature_extractor.set_timeseries_container(self.timeseries_container)

    relevant_time_series_features = set(self.feature_selector.relevant_features) - set(pd.DataFrame(X).columns)

    relevant_extraction_settings = FeatureExtractionSettings.from_columns(relevant_time_series_features)
    relevant_extraction_settings.set_default = False

    # Set the imputing strategy: reuse the column statistics recorded during
    # fit if the default range imputation is in place, otherwise keep the
    # user's own strategy.
    if self.feature_extractor.settings.IMPUTE is impute_dataframe_range:
        relevant_extraction_settings.IMPUTE = partial(impute_dataframe_range,
                                                      col_to_max=self.col_to_max,
                                                      col_to_min=self.col_to_min,
                                                      col_to_median=self.col_to_median)
    else:
        relevant_extraction_settings.IMPUTE = self.feature_extractor.settings.IMPUTE

    relevant_feature_extractor = FeatureAugmenter(settings=relevant_extraction_settings,
                                                  column_id=self.feature_extractor.column_id,
                                                  column_sort=self.feature_extractor.column_sort,
                                                  column_kind=self.feature_extractor.column_kind,
                                                  column_value=self.feature_extractor.column_value)

    relevant_feature_extractor.set_timeseries_container(self.feature_extractor.timeseries_container)

    X_augmented = relevant_feature_extractor.transform(X)

    return X_augmented.copy().loc[:, self.feature_selector.relevant_features]
def setUp(self):
    self.settings = FeatureExtractionSettings()
    self.settings.PROFILING = False
    self.settings.n_processes = 2

    # Only calculate some features to reduce load on Travis CI
    self.name_to_param = {"maximum": None,
                          "sum_values": None,
                          "abs_energy": None,
                          "minimum": None,
                          "mean": None,
                          "median": None}
def test_default_calculates_all_features(self):
    """
    Test that by default a FeatureExtractionSettings object is set up to calculate all features defined
    in tsfresh.feature_extraction.feature_calculators.
    """
    settings = FeatureExtractionSettings()
    all_feature_calculators = [name for name, func in feature_calculators.__dict__.items()
                               if hasattr(func, "fctype")]

    for calculator in all_feature_calculators:
        self.assertIn(calculator, settings.name_to_param,
                      msg='Default FeatureExtractionSettings object does not set up calculation of {}'
                          .format(calculator))
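# Hedged sketch: since the default settings enable every calculator that
# carries an "fctype" attribute (as the test above verifies), the fixtures in
# this file restrict name_to_param to compute only a cheap subset. Something
# along these lines (the chosen calculators are illustrative):
settings = FeatureExtractionSettings()
settings.name_to_param = {"maximum": None, "minimum": None, "mean": None}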
def transform(self, X):
    """
    After the fit step, it is known which features are relevant. Only extract those from the time series
    handed in with the function :func:`~set_timeseries_container`.

    If evaluate_only_added_features is False, also delete the irrelevant, already present features in the
    data frame.

    :param X: the data sample to add the relevant (and delete the irrelevant) features to.
    :type X: pandas.DataFrame or numpy.array

    :return: a data sample with the same information as X, but with added relevant time series features and
        deleted irrelevant information (only if evaluate_only_added_features is False).
    :rtype: pandas.DataFrame
    """
    if self.feature_selector.relevant_features is None:
        raise RuntimeError("You have to call fit before.")

    if self.timeseries_container is None:
        raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

    self.feature_extractor.set_timeseries_container(self.timeseries_container)

    # We can only extract features that originate from time series
    relevant_extraction_settings = FeatureExtractionSettings.from_columns(
        list(set(self.feature_selector.relevant_features) - set(pd.DataFrame(X).columns)))
    relevant_extraction_settings.set_default = False
    relevant_extraction_settings.IMPUTE = self.feature_extractor.settings.IMPUTE

    feature_augmenter_restricted = FeatureAugmenter(settings=relevant_extraction_settings,
                                                    column_id=self.feature_extractor.column_id,
                                                    column_sort=self.feature_extractor.column_sort,
                                                    column_kind=self.feature_extractor.column_kind,
                                                    column_value=self.feature_extractor.column_value)
    feature_augmenter_restricted.set_timeseries_container(self.feature_extractor.timeseries_container)

    if self.evaluate_only_added_features:
        X_tsfresh = feature_augmenter_restricted.transform(X).loc[:, self.feature_selector.relevant_features]
        return pd.concat([X_tsfresh, X], axis=1)
    else:
        X_tsfresh = feature_augmenter_restricted.transform(X)
        return X_tsfresh.loc[:, self.feature_selector.relevant_features]
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series or
    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    Here no ``column_kind`` is given; the rules for which columns may be omitted are described in
    :ref:`data-formats-label`.

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
        dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param feature_extraction_settings: settings object that controls which features are calculated
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :return: The (maybe imputed) DataFrame with the extracted features.
    :rtype: pandas.DataFrame
    """
    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id,
                                                                       column_sort, column_kind, column_value)

    # Use the standard settings if the user did not supply any.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Extract the time series features for every kind of time series and concatenate them together.
    all_possible_unique_id_values = set(id_value for kind, df in kind_to_df_map.items()
                                        for id_value in df[column_id])
    df_with_ids = pd.DataFrame(index=all_possible_unique_id_values)

    pool = Pool(feature_extraction_settings.n_processes)
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           settings=feature_extraction_settings)
    extracted_features = pool.map(partial_extract_features_for_one_time_series, kind_to_df_map.items())
    # Reclaim the worker processes once the map is done
    pool.close()
    pool.join()

    # Add time series features to result
    result = pd.concat([df_with_ids] + extracted_features, axis=1, join='outer',
                       join_axes=[df_with_ids.index]).astype(np.float64)

    # Impute the result if requested
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
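# Hedged usage sketch for extract_features above: per its docstring, the
# container may also be a dict mapping each kind to its own DataFrame.
# `df_temperature` and `df_pressure` are placeholder frames with "id", "time"
# and "value" columns.
kind_to_df = {"temperature": df_temperature, "pressure": df_pressure}
X = extract_features(kind_to_df, column_id="id", column_sort="time", column_value="value")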
def setUp(self):
    self.settings = FeatureExtractionSettings()
    self.settings.PROFILING = False
class FeatureAugmenterTestCase(DataTestCase):
    def setUp(self):
        self.test_df = self.create_test_data_sample()

        self.settings = FeatureExtractionSettings()
        self.settings.set_default_parameters("a")

        calculation_settings_mapping = {
            "length": self.settings.kind_to_calculation_settings_mapping["a"]["length"]
        }
        self.settings.kind_to_calculation_settings_mapping = {
            "a": calculation_settings_mapping.copy(),
            "b": calculation_settings_mapping.copy()
        }

    def test_fit_and_transform(self):
        augmenter = FeatureAugmenter(column_value="val", column_id="id", column_sort="sort",
                                     column_kind="kind", settings=self.settings)

        # Fit should do nothing except return self
        returned_df = augmenter.fit()
        self.assertEqual(returned_df, augmenter)
        self.assertRaises(RuntimeError, augmenter.transform, None)

        augmenter.set_timeseries_container(self.test_df)

        # Add features to all time series
        X_with_index = pd.DataFrame([{"feature_1": 1}] * 2, index=[1, 5])
        X_transformed = augmenter.transform(X_with_index)

        # Require same shape
        for i in X_transformed.index:
            self.assertIn(i, X_with_index.index)
        for i in X_with_index.index:
            self.assertIn(i, X_transformed.index)
        self.assertEqual(X_transformed.shape, (2, 3))

        # Preserve old features
        self.assertEqual(list(X_transformed.columns), ["feature_1", "a__length", "b__length"])

        # Features are not allowed to be NaN
        for index, row in X_transformed.iterrows():
            self.assertFalse(np.isnan(row["a__length"]))
            self.assertFalse(np.isnan(row["b__length"]))

    def test_add_features_to_only_a_part(self):
        augmenter = FeatureAugmenter(column_value="val", column_id="id", column_sort="sort",
                                     column_kind="kind", settings=self.settings)
        augmenter.set_timeseries_container(self.test_df)

        X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[1])
        X_transformed = augmenter.transform(X_with_not_all_ids)

        for i in X_transformed.index:
            self.assertIn(i, X_with_not_all_ids.index)
        for i in X_with_not_all_ids.index:
            self.assertIn(i, X_transformed.index)
        self.assertEqual(X_transformed.shape, (1, 3))
        self.assertEqual(X_transformed.index, [1])

        # Features are not allowed to be NaN
        for index, row in X_transformed.iterrows():
            self.assertFalse(np.isnan(row["a__length"]))
            self.assertFalse(np.isnan(row["b__length"]))
def run(filename='data/clean_data.csv', city_regions_file='data/CityRegions.csv',
        load_from_file=True, grid_search=False, baseline=False):
    if city_regions_file is None:
        temp = [['Abilene', 'Texas', 'South'],
                ['West Jordan', 'Utah', 'West'],
                ['Yonkers', 'New York', 'Northeast']]
        city_regions = pd.DataFrame(temp, columns=['City', 'State', 'Region'])
    else:
        city_regions = pd.read_csv(city_regions_file, header=0).reset_index(drop=True)

    FEATURE_EXTRACTION = 'data/data_with_features.csv'
    if not os.path.isfile(FEATURE_EXTRACTION):
        df = pd.read_csv(filename, header=0)
        df.dropna(inplace=True)
        X_labels = ['City', 'State', 'dt', 'AverageTemperature', 'CityIndex']
        df = df[X_labels]
        df = df.dropna()

        # city_state = df[['City', 'State']]
        # Sadness because multiple cities with same name.......
        # df['CityIndex'] = city_state.apply(number_cities, axis=1)
        # df.to_csv('data/clean_data.csv', index=False)

        orig_cities = city_regions[['City', 'State']]
        print "Total cities ", len(orig_cities)

        y_regions = city_regions['Region']
        y_regions = y_regions.apply(number_regions)

        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute
        feat_extractor = FeatureAugmenter(feature_extraction_settings,
                                          column_id='CityIndex', column_sort='dt',
                                          column_value='AverageTemperature')

        empty_df = pd.DataFrame(index=y_regions.index)
        feat_extractor.set_timeseries_container(df)
        output = feat_extractor.fit_transform(empty_df, y_regions)

        output['City'] = city_regions['City']
        output['State'] = city_regions['State']
        output['Region'] = city_regions['Region']
        output.to_csv(FEATURE_EXTRACTION, index=False)
    else:
        output = pd.read_csv(FEATURE_EXTRACTION)

    output = output.drop(['City', 'State', 'Region'], axis=1)
    if baseline:
        output = output['AverageTemperature__mean'].to_frame()

    train, test, validation = split_data(output, city_regions)

    """
    aug = FeatureAugmenter(feature_extraction_settings, column_id='CityIndex', column_sort='dt',
                           column_value='AverageTemperature', timeseries_container=train['df'])
    output = aug.fit_transform(train['X'], train['y'])
    output['City_Name'] = train['city_names']
    output.to_csv('data/features_from_tsfresh.csv', index=False)
    """

    if load_from_file:
        clf = joblib.load('./model.joblib.pkl')
    else:
        clf = DecisionTreeClassifier(criterion='entropy', max_features=None, min_samples_split=0.1,
                                     max_depth=50, class_weight=None)

        # feat_extractor = RelevantFeatureAugmenter(column_id='CityIndex', column_sort='dt',
        #                                           column_value='AverageTemperature')

        # For the fit on the train set, we set the fresh__timeseries_container to `df_train`
        if grid_search and not baseline:
            grid = {'max_features': [2, 10, 20, 30, 50, 100, 200, None],
                    'max_depth': [1, 25, 50, 100],
                    'class_weight': [None, 'balanced'],
                    'min_samples_split': [0.1, 0.25, 0.75, 1.0]}
            scorer = metrics.make_scorer(metrics.accuracy_score)
            clf = GridSearchCV(clf, grid, scoring=scorer, n_jobs=multiprocessing.cpu_count())

        clf.fit(train['X'], train['y'])
        # pipeline.set_params(augmenter__timeseries_container=train['df'])
        # pipeline.fit(train['X'], train['y'])

        y_pred = pd.Series(clf.predict(train['X']))
        y_true = pd.Series(np.array(train['y']))

        result = train['city_names']
        result.reset_index(drop=True, inplace=True)
        result['Orig'] = y_true
        result['Pred'] = y_pred
        result['Correct'] = y_true == y_pred
        result.to_csv('data/results_train.csv', index=False)

        if grid_search and not baseline:
            print "Best Parameters found from grid search: "
            print clf.best_params_

        print "train accuracy ", accuracy_score(y_true, y_pred)
        cm_train = confusion_matrix(y_true, y_pred)
        print "Confusion matrix for training\n", cm_train

        # For the predict on the test set, we set the fresh__timeseries_container to `df_test`
        joblib.dump(clf, './model.joblib.pkl')
    #### ENDIF

    y_pred = pd.Series(clf.predict(test['X']))
    y_true = pd.Series(np.array(test['y']))
    result = test['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    result['Correct'] = y_true == y_pred
    result.to_csv('data/results_test.csv', index=False)

    print "test accuracy ", accuracy_score(y_true, y_pred)
    cm_test = confusion_matrix(y_true, y_pred)
    print "Confusion matrix for testing\n", cm_test

    class_names = ['Northeast', 'Midwest', 'West', 'South']
    if not load_from_file:
        plot_confusion_matrix(cm_train, class_names)
        plt.tight_layout()
        plt.savefig('train_cm.png')
        plt.hold(False)

    plot_confusion_matrix(cm_test, class_names)
    plt.tight_layout()
    plt.savefig('test_cm.png')

    if not load_from_file and not grid_search:
        features = output.columns.values
        importances = clf.feature_importances_
        with open("tree_viz.dot", "w") as f:
            f = tree.export_graphviz(clf, out_file=f)

        top_n = 20
        ndx = np.argsort(importances)[::-1]
        sorted_features = features[ndx][:top_n]
        sorted_importances = importances[ndx][:top_n]
        print '%80s & %s' % ('Feature', 'Importance')
        for f, i in zip(sorted_features, sorted_importances):
            print '%s & %.2f \\\\' % (f[20:], i)

    y_pred = pd.Series(clf.predict(validation['X']))
    y_true = pd.Series(np.array(validation['y']))
    result = validation['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    result['Correct'] = y_true == y_pred
    result.to_csv('data/results_val.csv', index=False)

    print "validation accuracy ", accuracy_score(y_true, y_pred)
    cm_val = confusion_matrix(y_true, y_pred)
    print "Confusion matrix for validation\n", cm_val
    print "done"

    plt.hold(False)
    plot_confusion_matrix(cm_val, class_names)
    plt.tight_layout()
    plt.savefig('val_cm.png')
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     parallelization=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series or
    * a dictionary of :class:`pandas.DataFrame` each containing one type of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    Here no ``column_kind`` is given; the rules for which columns may be omitted are described in
    :ref:`data-formats-label`.

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
        dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param parallelization: Either ``'per_sample'`` or ``'per_kind'``, see
        :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
        :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
        :ref:`parallelization-label` for details.
    :type parallelization: str

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    import logging
    logging.basicConfig()

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(df_or_dict=timeseries_container,
                                                                       column_id=column_id,
                                                                       column_sort=column_sort,
                                                                       column_kind=column_kind,
                                                                       column_value=column_value)

    # Use the standard settings if the user did not supply any.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # Choose the parallelization according to a rule-of-thumb
    if parallelization is None:
        parallelization = 'per_sample' if (feature_extraction_settings.n_processes / 2) > len(kind_to_df_map) \
            else 'per_kind'

    _logger.info('Parallelizing feature calculation {}'.format(parallelization))

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Calculate the result
    if parallelization == 'per_kind':
        result = _extract_features_per_kind(kind_to_df_map, feature_extraction_settings,
                                            column_id, column_value)
    elif parallelization == 'per_sample':
        result = _extract_features_parallel_per_sample(kind_to_df_map, feature_extraction_settings,
                                                       column_id, column_value)
    elif parallelization == 'no_parallelization':
        result = _extract_features_per_kind(kind_to_df_map, feature_extraction_settings,
                                            column_id, column_value, serial=True)
    else:
        raise ValueError("Argument parallelization must be one of: 'per_kind', 'per_sample', "
                         "'no_parallelization'")

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
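# Hedged sketch of overriding the parallelization rule-of-thumb above; the
# values this function accepts are 'per_sample', 'per_kind' and
# 'no_parallelization'. `df` is a placeholder time series frame with "id"
# and "time" columns.
settings = FeatureExtractionSettings()
settings.n_processes = 4
X = extract_features(df, feature_extraction_settings=settings,
                     column_id="id", column_sort="time", parallelization="per_kind")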
X = pd.DataFrame(index=y.index)
print X.shape
print X_empty.shape

"""
pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
                     ('classifier', DecisionTreeClassifier())])
pipeline.set_params(augmenter__timeseries_container=df_ts)
pipeline.fit(X, y)
quit()
"""

print y_regions.shape

feature_extraction_settings = FeatureExtractionSettings()
feature_extraction_settings.IMPUTE = impute
pipeline = Pipeline([('augmenter', FeatureAugmenter(feature_extraction_settings, column_id='City',
                                                    column_sort='dt', column_value='AverageTemperature')),
                     ('classifier', DecisionTreeClassifier(criterion='entropy'))])

pipeline.set_params(augmenter__timeseries_container=X_train)
pipeline.fit(X_empty, y_regions)

"""
aug = RelevantFeatureAugmenter(column_id='City', column_sort='dt', column_value="AverageTemperature",
                               timeseries_container=X_train)
new_X = aug.fit_transform(X_empty, y_regions)
clf = DecisionTreeClassifier(criterion='entropy')
"""