Example #1
    def test_from_columns(self):
        tsn = "TEST_TIME_SERIES"

        fset = FeatureExtractionSettings()
        self.assertRaises(TypeError, fset.from_columns, 42)
        self.assertRaises(ValueError, fset.from_columns, ["This is not a column name"])
        self.assertRaises(ValueError, fset.from_columns, ["This__neither"])
        self.assertRaises(ValueError, fset.from_columns, ["This__also__not"])

        # Aggregate functions
        feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]

        # Aggregate functions with params
        feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70', tsn + '__number_peaks__n_30',
                          tsn + '__value_count__value_inf', tsn + '__value_count__value_-inf',
                          tsn + '__value_count__value_nan']

        # Apply functions
        feature_names += [tsn + '__ar_coefficient__k_20__coeff_4', tsn + '__ar_coefficient__coeff_10__k_-1']

        cset = fset.from_columns(feature_names)

        six.assertCountEqual(self, list(cset.kind_to_calculation_settings_mapping[tsn].keys()),
                             ["sum_values", "median", "length", "sample_entropy", "quantile",
                              "number_peaks", "ar_coefficient", "value_count"])
        
        self.assertEqual(cset.kind_to_calculation_settings_mapping[tsn]["sum_values"], None)
        self.assertEqual(cset.kind_to_calculation_settings_mapping[tsn]["ar_coefficient"],
                         [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])

        self.assertEqual(cset.kind_to_calculation_settings_mapping[tsn]["value_count"],
                         [{"value": np.inf}, {"value": -np.inf}, {"value": np.nan}])
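
The column names above follow tsfresh's `<kind>__<calculator>__<param>_<value>` convention, and `from_columns` parses them back into a settings mapping. A minimal sketch of that round trip, with a made-up `temperature` kind and feature list (same 0.x-era API as in these examples):

from tsfresh.feature_extraction.settings import FeatureExtractionSettings

# Rebuild a settings object that recomputes exactly these two features
columns = ["temperature__quantile__q_10", "temperature__ar_coefficient__k_20__coeff_4"]
settings = FeatureExtractionSettings().from_columns(columns)

# Each name splits on "__" into kind, calculator name and parameter parts
print(sorted(settings.kind_to_calculation_settings_mapping["temperature"]))
# -> ['ar_coefficient', 'quantile']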
Example #2
    def test_functional_equality(self):
        """
        `extract_relevant_features` should be equivalent to running first `extract_features` with impute and
        `select_features` afterwards.
        Meaning it should produce the same relevant features and the values of these features should be identical.
        :return:
        """
        df, y = self.create_test_data_sample_with_target()

        relevant_features = extract_relevant_features(df,
                                                      y,
                                                      column_id='id',
                                                      column_value='val',
                                                      column_kind='kind',
                                                      column_sort='sort')

        extraction_settings = FeatureExtractionSettings()
        extraction_settings.IMPUTE = impute
        extracted_features = extract_features(
            df,
            feature_extraction_settings=extraction_settings,
            column_id='id',
            column_value='val',
            column_kind='kind',
            column_sort='sort')
        selected_features = select_features(extracted_features, y)

        self.assertEqual(
            set(relevant_features.columns), set(selected_features.columns),
            "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(
                relevant_features.columns, selected_features.columns))
        self.assertTrue(
            (relevant_features.values == selected_features.values).all().all(),
            "Should calculate the same feature values")
Example #3
    def test_profiling_file_written_out(self):
        fes = FeatureExtractionSettings()
        fes.PROFILING = True
        fes.PROFILING_FILENAME = "test_profiling.txt"

        df = pd.DataFrame(data={"id": np.repeat([1, 2], 10), "value": np.random.normal(0, 1, 20)})
        X = extract_features(df, column_id="id", column_value="value", feature_extraction_settings=fes)

        self.assertTrue(os.path.isfile(fes.PROFILING_FILENAME))
        os.remove(fes.PROFILING_FILENAME)
Example #4
    def setUp(self):
        self.test_df = self.create_test_data_sample()
        self.settings = FeatureExtractionSettings()
        self.settings.set_default_parameters("a")
        calculation_settings_mapping = {
            "length": self.settings.kind_to_calculation_settings_mapping["a"]["length"]
        }
        self.settings.kind_to_calculation_settings_mapping = {
            "a": calculation_settings_mapping.copy(),
            "b": calculation_settings_mapping.copy()
        }
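
With the mapping restricted like this, extraction computes only the length feature for the kinds "a" and "b". A hedged sketch of how a test method in this class might use these settings (extract_features import assumed; column names as in the other examples):

        # Inside a test method of this class: with only "length" registered,
        # extraction yields just the a__length and b__length columns
        X = extract_features(self.test_df,
                             feature_extraction_settings=self.settings,
                             column_id="id", column_sort="sort",
                             column_kind="kind", column_value="val")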
Example #5
    def __init__(self,
                 evaluate_only_added_features=True,
                 feature_selection_settings=None,
                 feature_extraction_settings=None,
                 column_id=None,
                 column_sort=None,
                 column_kind=None,
                 column_value=None,
                 timeseries_container=None):
        """
        Create a new RelevantFeatureAugmenter instance.

        :param evaluate_only_added_features: Whether to evaluate only the newly added features during the feature
                                             selection step, leaving manually created features untouched.
        :type evaluate_only_added_features: bool
        :param feature_selection_settings: The feature selection settings. Leave empty to use the default ones.
        :type feature_selection_settings: tsfresh.feature_selection.settings.FeatureSelectionSettings
        :param feature_extraction_settings: The feature extraction settings. Leave empty to use the default ones.
        :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings
        :param column_id: The column with the id. See :mod:`~tsfresh.feature_extraction.extraction`.
        :type column_id: basestring
        :param column_sort: The column with the sort data. See :mod:`~tsfresh.feature_extraction.extraction`.
        :type column_sort: basestring
        :param column_kind: The column with the kind data. See :mod:`~tsfresh.feature_extraction.extraction`.
        :type column_kind: basestring
        :param column_value: The column with the values. See :mod:`~tsfresh.feature_extraction.extraction`.
        :type column_value: basestring
        :param timeseries_container: The time series to extract the features from. Can also be set later via
                                     :func:`~set_timeseries_container`.
        :type timeseries_container: pandas.DataFrame or dict
        """

        # Imputing is required, so create default settings if none were given
        if feature_extraction_settings is None:
            feature_extraction_settings = FeatureExtractionSettings()

        # Range will be our default imputation strategy
        feature_extraction_settings.IMPUTE = impute_dataframe_range

        self.feature_extractor = FeatureAugmenter(feature_extraction_settings,
                                                  column_id, column_sort,
                                                  column_kind, column_value)

        self.feature_selector = FeatureSelector(feature_selection_settings)

        self.evaluate_only_added_features = evaluate_only_added_features

        self.timeseries_container = timeseries_container
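
In practice the augmenter sits at the front of an sklearn Pipeline, with the raw time series handed in separately from the (often feature-less) design matrix. A minimal sketch adapted from the commented pipeline in Example #15, assuming df_ts, X and y already exist:

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
                     ('classifier', DecisionTreeClassifier())])

# The time series live outside the design matrix and are set as a parameter
pipeline.set_params(augmenter__timeseries_container=df_ts)
pipeline.fit(X, y)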
Example #6
    def transform(self, X):
        """
        After the fit step, it is known which features are relevant. Only extract those from the time series handed in
        with the function :func:`~set_timeseries_container`.

        If evaluate_only_added_features is False, irrelevant features that were already present in the data frame
        are deleted as well.

        :param X: the data sample to add the relevant (and delete the irrelevant) features to.
        :type X: pandas.DataFrame or numpy.array

        :return: a data sample with the same information as X, but with added relevant time series features and
            deleted irrelevant information (only if evaluate_only_added_features is False).
        :rtype: pandas.DataFrame
        """
        if self.feature_selector.relevant_features is None:
            raise RuntimeError("You have to call fit before calling transform.")

        if self.timeseries_container is None:
            raise RuntimeError(
                "You have to provide a time series using the set_timeseries_container function before."
            )

        self.feature_extractor.set_timeseries_container(
            self.timeseries_container)

        relevant_time_series_features = set(
            self.feature_selector.relevant_features) - set(
                pd.DataFrame(X).columns)

        relevant_extraction_settings = FeatureExtractionSettings.from_columns(
            relevant_time_series_features)
        relevant_extraction_settings.set_default = False

        # Set imputing strategy
        if self.feature_extractor.settings.IMPUTE is impute_dataframe_range:
            relevant_extraction_settings.IMPUTE = partial(
                impute_dataframe_range,
                col_to_max=self.col_to_max,
                col_to_min=self.col_to_min,
                col_to_median=self.col_to_median)
        else:
            relevant_extraction_settings.IMPUTE = self.feature_extractor.settings.IMPUTE

        relevant_feature_extractor = FeatureAugmenter(
            settings=relevant_extraction_settings,
            column_id=self.feature_extractor.column_id,
            column_sort=self.feature_extractor.column_sort,
            column_kind=self.feature_extractor.column_kind,
            column_value=self.feature_extractor.column_value)

        relevant_feature_extractor.set_timeseries_container(
            self.feature_extractor.timeseries_container)

        X_augmented = relevant_feature_extractor.transform(X)

        return X_augmented.copy().loc[:,
                                      self.feature_selector.relevant_features]
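
A hedged sketch of the contract this method enforces: fit first learns the relevant features on the training data, then transform extracts exactly those for new samples (the augmenter, data frames and target are assumed to exist):

# Fit on the training time series to learn which features are relevant
augmenter.set_timeseries_container(df_train)
augmenter.fit(X_train, y_train)

# Transform new samples: only the relevant features are extracted and appended
augmenter.set_timeseries_container(df_test)
X_test_augmented = augmenter.transform(X_test)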
Example #7
    def setUp(self):
        self.settings = FeatureExtractionSettings()
        self.settings.PROFILING = False
        self.settings.n_processes = 2

        # only calculate some features to reduce the load on Travis CI
        self.name_to_param = {"maximum": None,
                              "sum_values": None,
                              "abs_energy": None,
                              "minimum": None,
                              "mean": None,
                              "median": None}
Example #8
    def test_default_calculates_all_features(self):
        """
        Test that by default a FeatureExtractionSettings object should be set up to calculate all features defined
        in tsfresh.feature_extraction.feature_calculators
        """
        settings = FeatureExtractionSettings()
        all_feature_calculators = [name for name, func in feature_calculators.__dict__.items()
                                   if hasattr(func, "fctype")]

        for calculator in all_feature_calculators:
            self.assertIn(calculator, settings.name_to_param,
                          msg='Default FeatureExtractionSettings object does not set up calculation of {}'
                          .format(calculator))
Example #9
    def transform(self, X):
        """
        After the fit step, it is known which features are relevant. Only extract those from the time series handed in
        with the function :func:`~set_timeseries_container`.

        If evaluate_only_added_features is False, irrelevant features that were already present in the data frame
        are deleted as well.

        :param X: the data sample to add the relevant (and delete the irrelevant) features to.
        :type X: pandas.DataFrame or numpy.array

        :return: a data sample with the same information as X, but with added relevant time series features and
            deleted irrelevant information (only if evaluate_only_added_features is False).
        :rtype: pandas.DataFrame
        """
        if self.feature_selector.relevant_features is None:
            raise RuntimeError("You have to call fit before calling transform.")

        if self.timeseries_container is None:
            raise RuntimeError(
                "You have to provide a time series using the set_timeseries_container function before."
            )

        self.feature_extractor.set_timeseries_container(
            self.timeseries_container)

        # We can only extract features that originate from time series
        relevant_extraction_settings = FeatureExtractionSettings.from_columns(
            list(
                set(self.feature_selector.relevant_features) -
                set(pd.DataFrame(X).columns)))

        relevant_extraction_settings.set_default = False
        relevant_extraction_settings.IMPUTE = self.feature_extractor.settings.IMPUTE

        feature_augmenter_restricted = FeatureAugmenter(
            settings=relevant_extraction_settings,
            column_id=self.feature_extractor.column_id,
            column_sort=self.feature_extractor.column_sort,
            column_kind=self.feature_extractor.column_kind,
            column_value=self.feature_extractor.column_value)

        feature_augmenter_restricted.set_timeseries_container(
            self.feature_extractor.timeseries_container)

        if self.evaluate_only_added_features:
            X_tsfresh = feature_augmenter_restricted.transform(
                X).loc[:, self.feature_selector.relevant_features]
            return pd.concat([X_tsfresh, X], axis=1)
        else:
            X_tsfresh = feature_augmenter_restricted.transform(X)
            return X_tsfresh.loc[:, self.feature_selector.relevant_features]
Example #10
def extract_features(timeseries_container,
                     feature_extraction_settings=None,
                     column_id=None,
                     column_sort=None,
                     column_kind=None,
                     column_value=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` objects, each containing one kind of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    For a flat DataFrame like this one, the column_kind parameter must not be passed; apart from that, the usual
    rules for leaving out columns apply (see :ref:`data-formats-label`).

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param column_id: The name of the id column to group by.
    :type column_id: str
    :param column_sort: The name of the sort column.
    :type column_sort: str
    :param column_kind: The name of the column containing the kind of each value.
    :type column_kind: str
    :param column_value: The name of the column containing the values themselves.
    :type column_value: str

    :param feature_extraction_settings: settings object that controls which features are calculated
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :return: The (maybe imputed) DataFrame with the extracted features.
    :rtype: pandas.DataFrame
    """

    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(timeseries_container, column_id, column_sort,
                                                                       column_kind, column_value)

    # Use the standard settings if the user did not supply any.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Extract the time series features for every type of time series and concatenate them together.
    all_possible_unique_id_values = set(id_value
                                        for kind, df in kind_to_df_map.items()
                                        for id_value in df[column_id])
    df_with_ids = pd.DataFrame(index=all_possible_unique_id_values)

    pool = Pool(feature_extraction_settings.n_processes)
    partial_extract_features_for_one_time_series = partial(
        _extract_features_for_one_time_series,
        column_id=column_id,
        column_value=column_value,
        settings=feature_extraction_settings)
    extracted_features = pool.map(partial_extract_features_for_one_time_series,
                                  kind_to_df_map.items())

    # Add time series features to result
    result = pd.concat([df_with_ids] + extracted_features, axis=1, join='outer')\
        .reindex(df_with_ids.index).astype(np.float64)

    # Impute the result if requested
    if feature_extraction_settings.IMPUTE is not None:
        feature_extraction_settings.IMPUTE(result)

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(
            profiler,
            filename=feature_extraction_settings.PROFILING_FILENAME,
            sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
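
Besides the flat DataFrame shown in the doctest, a dictionary mapping each kind to its own DataFrame is accepted as well. A small hedged sketch with illustrative column names:

import numpy as np
import pandas as pd

kind_to_df = {
    "temperature": pd.DataFrame({"id": np.repeat([1, 2], 5),
                                 "time": np.tile(np.arange(5), 2),
                                 "value": np.random.normal(size=10)}),
    "pressure": pd.DataFrame({"id": np.repeat([1, 2], 5),
                              "time": np.tile(np.arange(5), 2),
                              "value": np.random.normal(size=10)}),
}

# One row per id; columns are named temperature__..., pressure__...
X = extract_features(kind_to_df, column_id="id", column_sort="time", column_value="value")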
Example #11
    def setUp(self):
        self.settings = FeatureExtractionSettings()
        self.settings.PROFILING = False
Example #12
class FeatureAugmenterTestCase(DataTestCase):
    def setUp(self):
        self.test_df = self.create_test_data_sample()
        self.settings = FeatureExtractionSettings()
        self.settings.set_default_parameters("a")
        calculation_settings_mapping = {
            "length": self.settings.kind_to_calculation_settings_mapping["a"]["length"]
        }
        self.settings.kind_to_calculation_settings_mapping = {
            "a": calculation_settings_mapping.copy(),
            "b": calculation_settings_mapping.copy()
        }

    def test_fit_and_transform(self):
        augmenter = FeatureAugmenter(column_value="val",
                                     column_id="id",
                                     column_sort="sort",
                                     column_kind="kind",
                                     settings=self.settings)

        # Fit should do nothing but return the augmenter itself
        returned_object = augmenter.fit()
        self.assertEqual(returned_object, augmenter)

        self.assertRaises(RuntimeError, augmenter.transform, None)

        augmenter.set_timeseries_container(self.test_df)

        # Add features to all time series
        X_with_index = pd.DataFrame([{"feature_1": 1}] * 2, index=[1, 5])
        X_transformed = augmenter.transform(X_with_index)

        # Require same shape
        for i in X_transformed.index:
            self.assertIn(i, X_with_index.index)

        for i in X_with_index.index:
            self.assertIn(i, X_transformed.index)

        self.assertEqual(X_transformed.shape, (2, 3))

        # Preserve old features
        self.assertEqual(list(X_transformed.columns),
                         ["feature_1", "a__length", "b__length"])

        # Features are not allowed to be NaN
        for index, row in X_transformed.iterrows():
            print(index, row)
            self.assertFalse(np.isnan(row["a__length"]))
            self.assertFalse(np.isnan(row["b__length"]))

    def test_add_features_to_only_a_part(self):
        augmenter = FeatureAugmenter(column_value="val",
                                     column_id="id",
                                     column_sort="sort",
                                     column_kind="kind",
                                     settings=self.settings)

        augmenter.set_timeseries_container(self.test_df)

        X_with_not_all_ids = pd.DataFrame([{"feature_1": 1}], index=[1])
        X_transformed = augmenter.transform(X_with_not_all_ids)

        for i in X_transformed.index:
            self.assertIn(i, X_with_not_all_ids.index)

        for i in X_with_not_all_ids.index:
            self.assertIn(i, X_transformed.index)

        self.assertEqual(X_transformed.shape, (1, 3))
        self.assertEqual(X_transformed.index, [1])

        # Features are not allowed to be NaN
        for index, row in X_transformed.iterrows():
            print(index, row)
            self.assertFalse(np.isnan(row["a__length"]))
            self.assertFalse(np.isnan(row["b__length"]))
Example #13
def run(filename='data/clean_data.csv', city_regions_file='data/CityRegions.csv', load_from_file=True, grid_search=False, baseline=False):
    if city_regions_file is None:
        temp = [['Abilene', 'Texas', 'South'], ['West Jordan', 'Utah', 'West'], ['Yonkers', 'New York', 'Northeast']]
        city_regions = pd.DataFrame(temp, columns=['City', 'State', 'Region'])
    else:
        city_regions = pd.read_csv(city_regions_file, header=0).reset_index(drop=True)

    FEATURE_EXTRACTION = 'data/data_with_features.csv'
    if not os.path.isfile(FEATURE_EXTRACTION):
        df = pd.read_csv(filename, header=0)
        df.dropna(inplace=True)

        X_labels = ['City', 'State', 'dt', 'AverageTemperature', 'CityIndex']
        df = df[X_labels]
        df = df.dropna()
        #city_state = df[['City', 'State']]
        # Needed because multiple cities share the same name
        #df['CityIndex'] = city_state.apply(number_cities, axis=1)
        #df.to_csv('data/clean_data.csv', index=False)

        orig_cities = city_regions[['City','State']]
        print("Total cities", len(orig_cities))
        y_regions = city_regions['Region']
        y_regions = y_regions.apply(number_regions)

        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute
        feat_extractor = FeatureAugmenter(feature_extraction_settings,
                                          column_id='CityIndex', column_sort='dt', column_value='AverageTemperature')

        empty_df = pd.DataFrame(index=y_regions.index)
        feat_extractor.set_timeseries_container(df)
        output = feat_extractor.fit_transform(empty_df, y_regions)
        output['City'] = city_regions['City']
        output['State'] = city_regions['State']
        output['Region'] = city_regions['Region']

        output.to_csv(FEATURE_EXTRACTION, index=False)
    else:
        output = pd.read_csv(FEATURE_EXTRACTION)

    output = output.drop(['City', 'State', 'Region'], axis=1)

    if baseline:
        output = output['AverageTemperature__mean'].to_frame()

    train, test, validation = split_data(output, city_regions)

    """
    aug = FeatureAugmenter(feature_extraction_settings, column_id='CityIndex',
                    column_sort='dt', column_value='AverageTemperature',
                    timeseries_container=train['df'])
    output = aug.fit_transform(train['X'], train['y'])
    output['City_Name'] = train['city_names']
    output.to_csv('data/features_from_tsfresh.csv', index=False)
    """
    if load_from_file:
        clf = joblib.load('./model.joblib.pkl')
    else:
        clf = DecisionTreeClassifier(criterion='entropy', max_features=None,
                                     min_samples_split=0.1, max_depth=50, class_weight=None)
        # feat_extractor = RelevantFeatureAugmenter(column_id='CityIndex', column_sort='dt', column_value='AverageTemperature')

        # for the fit on the training set, we would set the fresh__timeseries_container to `df_train`
        if grid_search and not baseline:
            grid = {'max_features': [2, 10, 20, 30, 50, 100, 200, None],
                    'max_depth': [1, 25, 50, 100],
                    'class_weight': [None, 'balanced'],
                    'min_samples_split': [0.1, 0.25, 0.75, 1.0]}
            scorer = metrics.make_scorer(metrics.accuracy_score)
            clf = GridSearchCV(clf, grid, scoring=scorer, n_jobs=multiprocessing.cpu_count())

        clf.fit(train['X'], train['y'])
        # pipeline.set_params(augmenter__timeseries_container=train['df'])
        # pipeline.fit(train['X'], train['y'])

        y_pred = pd.Series(clf.predict(train['X']))
        y_true = pd.Series(np.array(train['y']))
        result = train['city_names']
        result.reset_index(drop=True, inplace=True)
        result['Orig'] = y_true
        result['Pred'] = y_pred
        correct = y_true == y_pred
        result['Correct'] = correct
        result.to_csv('data/results_train.csv', index=False)

        if grid_search and not baseline:
            print("Best parameters found from grid search:")
            print(clf.best_params_)

        print("train accuracy", accuracy_score(y_true, y_pred))
        cm_train = confusion_matrix(y_true, y_pred)
        print("Confusion matrix for training\n", cm_train)
        # for the predict on the test set, we would set the fresh__timeseries_container to `df_test`
        joblib.dump(clf, './model.joblib.pkl')
    #### ENDIF

    y_pred = pd.Series(clf.predict(test['X']))
    y_true = pd.Series(np.array(test['y']))
    result = test['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    correct = y_true == y_pred
    result['Correct'] = correct
    result.to_csv('data/results_test.csv', index=False)

    print("test accuracy", accuracy_score(y_true, y_pred))
    cm_test = confusion_matrix(y_true, y_pred)
    print("Confusion matrix for testing\n", cm_test)

    class_names = ['Northeast', 'Midwest', 'West', 'South']
    if not load_from_file:
        plot_confusion_matrix(cm_train, class_names)
        plt.tight_layout()
        plt.savefig('train_cm.png')
    plt.figure()  # start a fresh figure for the next confusion matrix
    plot_confusion_matrix(cm_test, class_names)
    plt.tight_layout()
    plt.savefig('test_cm.png')

    if not load_from_file and not grid_search:
        features = output.columns.values
        importances = clf.feature_importances_
        with open("tree_viz.dot", "w") as f:
            tree.export_graphviz(clf, out_file=f)
        top_n = 20
        ndx = np.argsort(importances)[::-1]
        sorted_features = features[ndx][:top_n]
        sorted_importances = importances[ndx][:top_n]
        print('%80s & %s' % ('Feature', 'Importance'))
        for f, i in zip(sorted_features, sorted_importances):
            print('%s & %.2f \\\\' % (f[20:], i))

    y_pred = pd.Series(clf.predict(validation['X']))
    y_true = pd.Series(np.array(validation['y']))
    result = validation['city_names']
    result.reset_index(drop=True, inplace=True)
    result['Orig'] = y_true
    result['Pred'] = y_pred
    correct = y_true == y_pred
    result['Correct'] = correct
    result.to_csv('data/results_val.csv', index=False)

    print("validation accuracy", accuracy_score(y_true, y_pred))
    cm_val = confusion_matrix(y_true, y_pred)
    print("Confusion matrix for validation\n", cm_val)
    print("done")

    plt.figure()  # start a fresh figure for the next confusion matrix
    plot_confusion_matrix(cm_val, class_names)
    plt.tight_layout()
    plt.savefig('val_cm.png')
Example #14
def extract_features(timeseries_container, feature_extraction_settings=None,
                     column_id=None, column_sort=None, column_kind=None, column_value=None,
                     parallelization=None):
    """
    Extract features from

    * a :class:`pandas.DataFrame` containing the different time series

    or

    * a dictionary of :class:`pandas.DataFrame` objects, each containing one kind of time series

    In both cases a :class:`pandas.DataFrame` with the calculated features will be returned.

    For a list of all the calculated time series features, please see the
    :class:`~tsfresh.feature_extraction.settings.FeatureExtractionSettings` class,
    which is used to control which features with which parameters are calculated.

    For a detailed explanation of the different parameters and data formats please see :ref:`data-formats-label`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_features
    >>> df, _ = load_robot_execution_failures()
    >>> X = extract_features(df, column_id='id', column_sort='time')

    For a flat DataFrame like this one, the column_kind parameter must not be passed; apart from that, the usual
    rules for leaving out columns apply (see :ref:`data-formats-label`).

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
    :type timeseries_container: pandas.DataFrame or dict

    :param feature_extraction_settings: settings object that controls which features are calculated
    :type feature_extraction_settings: tsfresh.feature_extraction.settings.FeatureExtractionSettings

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column containing the kind of each value.
    :type column_kind: str

    :param column_value: The name of the column containing the values themselves.
    :type column_value: str

    :param parallelization: Either ``'per_sample'``, ``'per_kind'`` or ``'no_parallelization'``, see
                            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_sample`,
                            :func:`~tsfresh.feature_extraction.extraction._extract_features_parallel_per_kind` and
                            :ref:`parallelization-label` for details.
    :type parallelization: str

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    import logging
    logging.basicConfig()
    
    # Always use the standardized way of storing the data.
    # See the function normalize_input_to_internal_representation for more information.
    kind_to_df_map, column_id, column_value = \
        dataframe_functions.normalize_input_to_internal_representation(df_or_dict=timeseries_container,
                                                                       column_id=column_id,
                                                                       column_sort=column_sort,
                                                                       column_kind=column_kind,
                                                                       column_value=column_value)

    # Use the standard settings if the user did not supply any.
    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        for key in kind_to_df_map:
            feature_extraction_settings.set_default_parameters(key)

    # Choose the parallelization according to a rule-of-thumb
    if parallelization is None:
        parallelization = 'per_sample' if (feature_extraction_settings.n_processes / 2) > len(kind_to_df_map) \
            else 'per_kind'

    _logger.info('Parallelizing feature calculation {}'.format(parallelization))

    # If requested, do profiling (advanced feature)
    if feature_extraction_settings.PROFILING:
        profiler = profiling.start_profiling()

    # Calculate the result
    if parallelization == 'per_kind':
        result = _extract_features_per_kind(kind_to_df_map, feature_extraction_settings,
                                            column_id, column_value)
    elif parallelization == 'per_sample':
        result = _extract_features_parallel_per_sample(kind_to_df_map, feature_extraction_settings,
                                                       column_id, column_value)
    elif parallelization == 'no_parallelization':
        result = _extract_features_per_kind(kind_to_df_map, feature_extraction_settings,
                                            column_id, column_value, serial=True)
    else:
        raise ValueError("Argument parallelization must be one of: 'per_kind', 'per_sample', 'no_parallelization'")

    # Turn off profiling if it was turned on
    if feature_extraction_settings.PROFILING:
        profiling.end_profiling(profiler, filename=feature_extraction_settings.PROFILING_FILENAME,
                                sorting=feature_extraction_settings.PROFILING_SORTING)

    return result
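
The parallelization strategy can also be forced instead of relying on the rule of thumb above; 'no_parallelization' is the easiest to debug and profile. A hedged sketch reusing the small DataFrame from Example #3:

import numpy as np
import pandas as pd

df = pd.DataFrame({"id": np.repeat([1, 2], 10), "value": np.random.normal(0, 1, 20)})

# Run the calculation serially in the current process
X = extract_features(df, column_id="id", column_value="value",
                     parallelization="no_parallelization")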
Example #15
X = pd.DataFrame(index=y.index)
print(X.shape)
print(X_empty.shape)

"""
pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='time')),
                ('classifier', DecisionTreeClassifier())])
pipeline.set_params(augmenter__timeseries_container=df_ts)
pipeline.fit(X, y)
quit()

"""


print(y_regions.shape)
feature_extraction_settings = FeatureExtractionSettings()
feature_extraction_settings.IMPUTE = impute
pipeline = Pipeline([('augmenter', FeatureAugmenter(feature_extraction_settings, column_id='City',
                                                    column_sort='dt', column_value='AverageTemperature')),
                     ('classifier', DecisionTreeClassifier(criterion='entropy'))])

pipeline.set_params(augmenter__timeseries_container=X_train)
pipeline.fit(X_empty, y_regions)

"""
aug = RelevantFeatureAugmenter(column_id='City', column_sort='dt', column_value="AverageTemperature", timeseries_container=X_train)
new_X = aug.fit_transform(X_empty, y_regions)

clf = DecisionTreeClassifier(criterion='entropy')
"""