def extractFeatures(dataSetToExtractFrom, feature_settings="minimal"):
    """ Extracts features of the given dataset and returns a new dataset of features only.

    Keyword arguments:
    dataSetToExtractFrom     -- Dataset (type: pandas.core.frame.DataFrame)
    feature_settings         -- Feature extraction parameter (type: string, options: 'minimal','maximal', 'findBest')

    Returns:
        pandas.core.frame.DataFrame
    """

    # Metadata columns are stripped so only the raw time-series channels are fed
    # into the feature calculators.
    series_only = dataSetToExtractFrom.drop(
        columns=['label', 'hand', 'annotator'])

    # Map the convenience strings onto the tsfresh settings classes.
    settings_by_name = {
        "minimal": MinimalFCParameters,
        "maximal": ComprehensiveFCParameters,
        "findBest": EfficientFCParameters,
    }
    settings_cls = settings_by_name.get(feature_settings)
    if settings_cls is None:
        settings_cls = MinimalFCParameters
        print('Given value for feature_parameter not valid! Minimal feature set is used instead.')

    return extract_features(series_only, column_id="punch_id",
                            column_sort="timestamp", impute_function=impute,
                            default_fc_parameters=settings_cls())
Ejemplo n.º 2
0
    def test_from_columns_correct_for_different_kind_datatypes(self):
        """Round-trip extraction settings through `settings.from_columns()`.

        `from_columns` reconstructs feature-extraction settings by parsing
        the column names of an extracted dataframe. With 'long' format input
        the dtype of the 'kind' column is lost in that round trip (int kind
        values become str keys in the settings dict), so re-extracting with
        the parsed settings must still succeed and yield the same number of
        features.
        """
        long_df = pd.DataFrame({
            'id': [1, 1, 1, 1],
            'time': [1, 1, 2, 2],
            'kind': [1, 2, 1, 2],
            'value': [1, 2, 3, 4]
        })
        shared_columns = dict(column_id='id', column_sort='time',
                              column_kind='kind', column_value='value')

        first_pass = extract_features(
            long_df,
            default_fc_parameters=MinimalFCParameters(),
            **shared_columns)
        parsed_settings = from_columns(first_pass)
        second_pass = extract_features(
            long_df,
            kind_to_fc_parameters=parsed_settings,
            **shared_columns)
        assert second_pass.shape == (1, 2 * len(MinimalFCParameters()))
def extract_sub_window(df_x,
                       y,
                       window,
                       start_index,
                       lag,
                       fc_parameters="min",
                       n_jobs=-1):
    """Extract relevant tsfresh features for one rolling sub-window.

    fc_parameters may be the string "min" (resolved to MinimalFCParameters)
    or an explicit tsfresh settings object. n_jobs == -1 means "use every
    available CPU core". Returns a (features, y) tuple where y is restricted
    to the window ids that survived the rolling transformation.
    """
    from tsfresh import extract_relevant_features
    from tsfresh.feature_extraction.settings import MinimalFCParameters

    if fc_parameters == "min":
        fc_parameters = MinimalFCParameters()

    window_start, window_end = window
    rolled = get_rolling_timeseries(df_x, start_index, lag, window_start,
                                    window_end)
    worker_count = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs
    # Keep only the targets whose window survived the rolling step.
    y = y[y.index.isin(rolled.window_id)]
    extracted = extract_relevant_features(rolled,
                                          y,
                                          column_id="window_id",
                                          column_sort="timestamp",
                                          column_value=None,
                                          default_fc_parameters=fc_parameters,
                                          n_jobs=worker_count)
    extracted = extracted.add_suffix(f"_{window_start}_{window_end}")
    return (extracted, y)
Ejemplo n.º 4
0
    def test_make_forecasting_frame_feature_extraction(self):
        """Frames produced by make_forecasting_frame must be consumable by extract_relevant_features."""
        hourly_index = pd.date_range('1/1/2011', periods=4, freq='H')
        source_series = pd.Series(data=range(4), index=hourly_index)
        frame, target = dataframe_functions.make_forecasting_frame(
            x=source_series, kind="test", max_timeshift=1, rolling_direction=1)

        extract_relevant_features(frame, target,
                                  column_id="id", column_sort="time",
                                  column_value="value",
                                  default_fc_parameters=MinimalFCParameters())
Ejemplo n.º 5
0
def get_ts_features(X: Union[np.ndarray, torch.Tensor],
                    y: Union[None, np.ndarray, torch.Tensor] = None,
                    features: Union[str, dict] = 'min',
                    n_jobs: Optional[int] = None,
                    **kwargs):
    """
    Extract tsfresh features from a batch of time series.

    Args:
        X: np.array or torch.Tensor of shape [samples, dimensions, timesteps].
        y: Not required for unlabeled data. Otherwise, you need to pass it.
        features: 'min', 'efficient', 'all', or a dictionary. Be aware that 'efficient' and 'all' may require substantial memory and time.
        n_jobs: number of tsfresh worker processes; defaults to all available cpus.
        **kwargs: forwarded to tsfresh.extract_features. May contain an explicit
            'default_fc_parameters', which takes precedence over `features`.

    Returns:
        pd.DataFrame of extracted features, with 'target'/'target_i' column(s)
        appended when y is given.
    """
    df = to_tsfresh_df(X)
    n_jobs = ifnone(n_jobs, defaults.cpus)
    if 'default_fc_parameters' in kwargs:
        # BUG FIX: the original read an undefined local name
        # (`default_fc_parameters = default_fc_parameters` -> NameError) and,
        # had it worked, would have passed the kwarg twice via **kwargs.
        # Pop it so it is forwarded exactly once.
        default_fc_parameters = kwargs.pop('default_fc_parameters')
    elif features == 'min':
        default_fc_parameters = MinimalFCParameters()
    elif features == 'efficient':
        default_fc_parameters = EfficientFCParameters()
    elif features == 'all':
        default_fc_parameters = ComprehensiveFCParameters()
    else:
        # Unknown string: let tsfresh fall back to its own default.
        default_fc_parameters = None
    df = tsfresh.extract_features(df,
                                  column_id="id",
                                  n_jobs=n_jobs,
                                  default_fc_parameters=default_fc_parameters,
                                  **kwargs)
    if y is not None:
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        # One target column per output dimension; single-output keeps the
        # plain 'target' name for backward compatibility.
        for i in range(y.shape[-1]):
            df['target' if y.shape[-1] == 1 else f'target_{i}'] = y[:, i]
    return df
Ejemplo n.º 6
0
    def _get_extraction_params(self):
        """Build the keyword-argument dict for tsfresh feature extraction.

        Starts from tsfresh's shipped defaults, overrides any entry for which
        ``self`` carries a matching non-None attribute, and finally resolves
        ``self.default_fc_parameters`` — which may be one of the convenience
        strings 'minimal' / 'efficient' / 'comprehensive' — into a tsfresh
        settings object.

        Returns
        -------
        dict
            Parameters ready to pass to tsfresh's extraction functions.

        Raises
        ------
        ValueError
            If ``default_fc_parameters`` is a string outside the supported set.
        """
        # make n_jobs compatible with scikit-learn
        self.n_jobs = check_n_jobs(self.n_jobs)

        # lazy imports to avoid hard dependency
        from tsfresh.defaults import CHUNKSIZE
        from tsfresh.defaults import DISABLE_PROGRESSBAR
        from tsfresh.utilities.dataframe_functions import impute
        from tsfresh.defaults import N_PROCESSES
        from tsfresh.defaults import PROFILING
        from tsfresh.defaults import PROFILING_FILENAME
        from tsfresh.defaults import PROFILING_SORTING
        from tsfresh.defaults import SHOW_WARNINGS
        from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
        from tsfresh.feature_extraction.settings import EfficientFCParameters
        from tsfresh.feature_extraction.settings import MinimalFCParameters

        # Set defaults from tsfresh
        extraction_params = {
            "kind_to_fc_parameters": self.kind_to_fc_parameters,
            "n_jobs": N_PROCESSES,
            "chunksize": CHUNKSIZE,
            "show_warnings": SHOW_WARNINGS,
            "disable_progressbar": DISABLE_PROGRESSBAR,
            "impute_function": impute,
            "profiling_sorting": PROFILING_SORTING,
            "profiling_filename": PROFILING_FILENAME,
            "profile": PROFILING,
        }

        # Replace defaults with user defined parameters
        for name in extraction_params.keys():
            if hasattr(self, name):
                value = getattr(self, name)
                if value is not None:
                    extraction_params[name] = value

        # Convert convenience string arguments to tsfresh parameters classes
        fc_param_lookup = {
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters(),
            "comprehensive": ComprehensiveFCParameters(),
        }
        if isinstance(self.default_fc_parameters, str):
            if self.default_fc_parameters not in fc_param_lookup:
                # FIX: interpolate a plain list of the valid options; the
                # original embedded `dict_keys([...])` in the error message.
                raise ValueError(
                    f"If `default_fc_parameters` is passed as a "
                    f"string, "
                    f"it must be one of"
                    f" {list(fc_param_lookup)}, but found: "
                    f"{self.default_fc_parameters}"
                )
            else:
                fc_parameters = fc_param_lookup[self.default_fc_parameters]
        else:
            fc_parameters = self.default_fc_parameters
        extraction_params["default_fc_parameters"] = fc_parameters

        return extraction_params
Ejemplo n.º 7
0
    def test_all_minimal_features_in(self):
        """Every expected basic calculator must be present in MinimalFCParameters."""
        minimal_settings = MinimalFCParameters()

        expected_calculators = [
            "mean",
            "median",
            "minimum",
            "maximum",
            "length",
            "sum_values",
            "standard_deviation",
            "variance",
        ]
        for calculator_name in expected_calculators:
            self.assertIn(calculator_name, minimal_settings)
Ejemplo n.º 8
0
def test_param_sim_summ():
    """End-to-end check of the chunked inference graph: parameter sampling,
    simulation, tsfresh summary statistics, fixed-data mean and distances.

    NOTE(review): relies on module-level fixtures (dmin, dmax, simulator2,
    bound, core, ns, dask) defined elsewhere in this file/package.
    """
    # Sample 10 parameter points from a Latin hypercube over [dmin, dmax].
    lhd = LatinHypercube(dmin, dmax)
    n_points = 10
    lhd.generate_array(n_points)
    # Summary statistic: tsfresh minimal feature set per trajectory.
    summ = lambda x: generate_tsfresh_features(x, MinimalFCParameters())
    # Build the lazy (dask) computation graph; 10 points in chunks of 2
    # yields 5 chunks, which the assertions below check.
    graph_dict = core.get_graph_chunked(param_func=lhd.draw,
                                        sim_func=simulator2,
                                        summaries_func=summ,
                                        batch_size=n_points,
                                        chunk_size=2)
    assert len(
        graph_dict["parameters"]) == 5, "Core test failed, dimensions mismatch"
    assert len(graph_dict["trajectories"]
               ) == 5, "Core test failed, dimensions mismatch"
    assert len(
        graph_dict["summarystats"]) == 5, "Core test failed, expected None"

    # Materialize all three graph branches at once.
    params, sim, summaries = dask.compute(graph_dict["parameters"],
                                          graph_dict["trajectories"],
                                          graph_dict["summarystats"])

    sim = np.asarray(sim)
    params = np.asarray(params)
    summaries = np.asarray(summaries)

    # Shapes: (chunks, chunk_size, ...) — presumably 2 species x 101
    # timepoints per trajectory and 16 tsfresh features; TODO confirm
    # against simulator2's output.
    assert params.shape == (5, 2, 5), "Core test failed, dimensions mismatch"
    assert sim.shape == (5, 2, 1, 2,
                         101), "Core test failed, dimensions mismatch"
    assert summaries.shape == (5, 2, 1,
                               16), "Core test failed, dimensions mismatch"

    # Generate reference ("observed") data at the true parameter point.
    fixed_data = np.asarray([simulator2(bound) for p in range(10)])
    print(fixed_data.shape)
    fixed_data = fixed_data.reshape(10, 2, 101)

    # Mean summary statistic of the fixed data, computed chunk-wise.
    fixed_mean = core.get_fixed_mean(fixed_data, summ, chunk_size=2)

    m, = dask.compute(fixed_mean)
    m = np.asarray(m)
    assert m.shape == (1, 16), "Core test failed, dimensions mismatch"

    # Distance of each simulated summary to the fixed-data mean.
    dist_class = ns.NaiveSquaredDistance()

    dist_func = lambda x: dist_class.compute(x, m)

    dist = core.get_distance(dist_func, graph_dict["summarystats"])

    assert len(dist) == 5, "Core test failed, dimesnion mismatch"

    dist_res, = dask.compute(dist)
    dist_res = np.asarray(dist_res)

    assert dist_res.shape == (5, 2, 1,
                              16), "Core test failed, dimension mismatch"
Ejemplo n.º 9
0
    def test_extraction_runs_through(self):
        """Extraction with the minimal settings yields exactly the eight minimal features per id."""
        minimal_settings = MinimalFCParameters()

        sample = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]],
                              columns=["id", "time", "kind", "value"])

        result = extract_features(sample,
                                  default_fc_parameters=minimal_settings,
                                  column_kind="kind", column_value="value",
                                  column_sort="time", column_id="id")

        expected_columns = ["0__median", "0__standard_deviation", "0__sum_values",
                            "0__maximum", "0__variance", "0__minimum", "0__mean",
                            "0__length"]
        six.assertCountEqual(self, result.columns, expected_columns)
        six.assertCountEqual(self, result.index, [0, 1])
Ejemplo n.º 10
0
    def test_feature_extraction(self):
        """Pivoting the long-format dask output gives one row per id and one column per minimal feature."""
        pandas_frame = pd.DataFrame({"my_id": [1, 1, 1, 2, 2, 2], "my_kind": ["a"]*6,
                                     "my_value": [1, 2, 3, 4, 5, 6]})
        dask_frame = dd.from_pandas(pandas_frame, chunksize=3)
        grouped = dask_frame.groupby(["my_id", "my_kind"])

        long_features = dask_feature_extraction_on_chunk(
            grouped,
            column_id="my_id",
            column_kind="my_kind",
            column_value="my_value",
            column_sort=None,
            default_fc_parameters=MinimalFCParameters())

        # pivot_table on dask needs a categorical 'variable' column.
        long_features = long_features.categorize(columns=["variable"])
        long_features = long_features.reset_index(drop=True)

        wide_table = long_features.pivot_table(index="my_id", columns="variable",
                                               values="value", aggfunc="sum")
        wide_table = wide_table.compute()

        self.assertEqual(len(wide_table.columns), len(MinimalFCParameters()))
        self.assertEqual(len(wide_table), 2)
Ejemplo n.º 11
0
    def test_feature_extraction(self):
        """dask_feature_extraction_on_chunk returns long format: one row per (id, feature) pair."""
        pandas_frame = pd.DataFrame({
            "my_id": [1, 1, 1, 2, 2, 2],
            "my_kind": ["a"] * 6,
            "my_value": [1, 2, 3, 4, 5, 6]
        })
        dask_frame = dd.from_pandas(pandas_frame, chunksize=3)
        grouped = dask_frame.groupby(["my_id", "my_kind"])

        extracted = dask_feature_extraction_on_chunk(
            grouped,
            column_id="my_id",
            column_kind="my_kind",
            column_value="my_value",
            column_sort=None,
            default_fc_parameters=MinimalFCParameters()).compute()

        # Long format: id / variable / value columns, one row per feature per id.
        self.assertEqual(sorted(extracted.columns),
                         ["my_id", "value", "variable"])
        self.assertEqual(len(extracted), 2 * len(MinimalFCParameters()))
Ejemplo n.º 12
0
def extract_sub_window(df_x, y, window, start_index, lag, fc_parameters=None, n_jobs=-1):
    """Extract relevant tsfresh features for a single rolling window.

    Keyword arguments:
    df_x          -- input time-series frame (rolled via get_rolling_timeseries)
    y             -- target series, trimmed to the rows that have a full window
    window        -- (window_start, window_end) tuple
    start_index   -- first row index of the rolling transformation
    lag           -- forecast lag between window end and target
    fc_parameters -- tsfresh settings object; None selects MinimalFCParameters
    n_jobs        -- worker processes; -1 means all available cores

    Returns the relevant-feature frame with '_<start>_<end>' suffixed columns.
    """
    from tsfresh import extract_relevant_features

    # BUG FIX: the default was `MinimalFCParameters()` evaluated once at import
    # time — a shared mutable default (the settings object is a dict). Use the
    # None sentinel and build a fresh instance per call instead.
    if fc_parameters is None:
        fc_parameters = MinimalFCParameters()

    window_start, window_end = window
    sub_df_x = get_rolling_timeseries(df_x, start_index, lag, window_end-window_start)
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
    print('Remove non target values...')
    # Drop targets that fall before the first complete window.
    y = y.iloc[start_index + lag:]
    print('Extracting features...')
    features = extract_relevant_features(sub_df_x, y, column_id="window_id", column_sort="timestamp", column_value=None,
                                         default_fc_parameters=fc_parameters, n_jobs=n_jobs)
    features = features.add_suffix(f"_{window_start}_{window_end}")
    return features
Ejemplo n.º 13
0
def extract_sub_windows(df_x, df_y, window_array, lag, fc_parameters=None, n_jobs=-1):
    """Extract tsfresh features for several rolling windows and inner-join them.

    Keyword arguments:
    df_x          -- input time-series frame; gains a synthetic 'timestamp' column
    df_y          -- target series; trimmed so every row has a complete window
    window_array  -- iterable of "start-end" strings describing each window
    lag           -- forecast lag applied to the targets
    fc_parameters -- tsfresh settings object; None selects MinimalFCParameters
    n_jobs        -- worker processes forwarded to extract_sub_window

    Returns the merged feature frame (inner join on window_id across windows).
    """
    # BUG FIX: the default was `MinimalFCParameters()` evaluated once at import
    # time — a shared mutable default; use the None sentinel instead.
    if fc_parameters is None:
        fc_parameters = MinimalFCParameters()

    df_x['timestamp'] = list(range(len(df_x)))

    # Parse "start-end" strings into an (n_windows, 2) int array.
    split_func = lambda x: list(map(int, x.split("-")))
    windows = np.array(list(map(split_func, window_array)))
    max_end = max(windows[:, 1])

    # Only rows with a complete longest window plus lag have a usable target.
    y = df_y.iloc[max_end + lag:]
    y = y.reset_index(drop=True)
    y.index.name = 'window_id'
    features = [
        extract_sub_window(df_x.copy(), y.copy(), window, max_end - (window[1] - window[0]), lag, fc_parameters, n_jobs)
        for window in windows]
    features = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='inner'),
                      features)
    # BUG FIX: the original computed the merged frame but never returned it
    # (the function implicitly returned None).
    return features
Ejemplo n.º 14
0
    def _extract_features(self, devices, trial_id):
        """Extract tsfresh features for one trial, caching the result on disk.

        The cache file name encodes the trial id, window size, feature set
        and the motion mode (self.motion: True / 'only' / falsy). On a cache
        hit the pickled feature frame is returned directly; otherwise the
        wrist device's selected channels are windowed, features are extracted
        and imputed, and the result is pickled before being returned.
        """
        if self.motion == True:
            pickle_path = FEATURE_CACHE + 'X{}-{}-{}.pickle'.format(
                trial_id, self.window_size, self.feature_type)
        elif self.motion == 'only':
            pickle_path = FEATURE_CACHE + 'X{}-{}-{}-motion-only.pickle'.format(
                trial_id, self.window_size, self.feature_type)
        else:
            pickle_path = FEATURE_CACHE + 'X{}-{}-{}-no-motion.pickle'.format(
                trial_id, self.window_size, self.feature_type)

        # BUG FIX: the original used pickle.load(open(...)) / pickle.dump(...,
        # open(...)), leaking file handles; use context managers instead.
        if os.path.isfile(pickle_path):
            with open(pickle_path, "rb") as cache_file:
                return pickle.load(cache_file)

        wrist_device = devices[0]
        # Channel selection mirrors the cache-path branches above.
        if self.motion == True:
            input_columns = ['red', 'ir', 'gyro', 'accel']
        elif self.motion == 'only':
            input_columns = ['gyro', 'accel']
        else:
            input_columns = ['red', 'ir']
        X_raw = wrist_device[input_columns]

        X_windowed = self._windowize_tsfresh(X_raw)

        if self.feature_type == 'efficient':
            features = EfficientFCParameters()
        elif self.feature_type == 'comprehensive':
            features = ComprehensiveFCParameters()
        elif self.feature_type == 'minimal':
            features = MinimalFCParameters()
        else:
            raise RuntimeError("Invalid feature type")
        print("Extracting features for trial " + str(trial_id))
        X = extract_features(X_windowed,
                             column_id='id',
                             column_sort='time',
                             n_jobs=N_JOBS,
                             default_fc_parameters=features)
        # tsfresh's impute fills NaN/inf in place before the frame is cached.
        impute(X)
        with open(pickle_path, "wb") as cache_file:
            pickle.dump(X, cache_file)
        return X
Ejemplo n.º 15
0
    def __init__(self, features='minimal', corrcoef=False, use_logger=False):
        """Configure the tsfresh summary-statistics object.

        Keyword arguments:
        features   -- 'minimal', 'full', or an explicit tsfresh settings object
        corrcoef   -- stored flag (used elsewhere in this class)
        use_logger -- forwarded to the parent summary base class
        """
        self.name = 'SummariesTSFRESH'
        super(SummariesTSFRESH, self).__init__(self.name,
                                               use_logger=use_logger)

        # FIX: `type(features) is str` replaced with the idiomatic isinstance
        # check (also accepts str subclasses).
        if isinstance(features, str):
            allowed_str = ['minimal', 'full']
            # NOTE(review): assert is stripped under `python -O`; kept to
            # preserve the exception type callers may rely on.
            assert features in allowed_str, "{0} is not recognized, supported sets are 'minimal' and 'full'".format(
                features)
            if features == 'minimal':
                self.features = MinimalFCParameters()
                # 'length' is removed from the minimal set before use.
                self.features.pop('length')
            else:
                self.features = EfficientFCParameters()
        else:
            # Caller supplied an explicit settings object; use it as-is.
            self.features = features

        self.corrcoef = corrcoef

        self.summaries_names = _get_tsfresh_features_names(self.features)
import os

import h5py
import numpy as np
import pandas as pd

from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters

# Chunked extraction of tsfresh minimal features from an HDF5 file of
# simulated series, appending each chunk's features to an output HDF5 table.
if __name__ == "__main__":

    savefile = "tsfresh_features.h5"
    datafile = "simSeriesData.h5"
    chunckLength = 100

    with h5py.File(datafile, "r") as f:
        # BUG FIX: numpy and pandas were used below (np.tile, pd.DataFrame, ...)
        # but never imported; imports added above.
        n = f["deltas"].shape[0] // chunckLength

        for i in range(n):
            print(f"Chunck: {i}")
            # Repeat the shared time axis once per series in this chunk and
            # flatten the chunk's series values into one long column.
            ts = np.tile(f["t"], chunckLength)
            deltas = np.reshape(f["deltas"][i*chunckLength:(i+1)*chunckLength], (-1,))
            ids = np.repeat(range(i*chunckLength, (i+1)*chunckLength), f["deltas"].shape[1])

            data = pd.DataFrame({'id': ids, 'time': ts, 'y': deltas})

            features = extract_features(data, column_id="id", column_sort="time",
                                        default_fc_parameters=MinimalFCParameters(), n_jobs=15)

            # Append this chunk's feature rows to the same compressed HDF5 table.
            features.to_hdf(savefile, 'table', mode='a', complevel=9,
                            complib='zlib', format='table', append=True)
Ejemplo n.º 17
0
import seaborn as sns
import pandas as pd
from tsfresh.feature_extraction import extract_features, feature_calculators
#from tsfresh import extract_features
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
from tsfresh.feature_extraction.settings import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from sklearn.ensemble import AdaBoostRegressor
from tsfresh.utilities.dataframe_functions import impute
import warnings
warnings.filterwarnings('ignore')

#%matplotlib inline
#%load_ext autoreload
#%autoreload 2
# Shared tsfresh feature-calculation settings used by feature_extraction() below.
settings_time = MinimalFCParameters()


#settings_time
def feature_extraction(df):
    """Extract tsfresh features from df using the module-level settings_time.

    Keyword arguments:
    df -- long-format frame with at least 'id' and 'value' columns,
          as required by tsfresh's extract_features

    Returns:
        pandas.core.frame.DataFrame of extracted features.
    """
    # Cleanup: removed an unused local `import pandas as pd` and
    # commented-out post-processing experiments.
    X_tsfresh = extract_features(df,
                                 column_id="id",
                                 column_value="value",
                                 default_fc_parameters=settings_time)
    return X_tsfresh
Ejemplo n.º 18
0
# Module-level fixtures for the ABC inference test below.
# NOTE(review): `bound`, `toggle_model`, `NumPySSASolver`, `fe`,
# `uniform_prior` and `naive_squared` are defined elsewhere — not visible here.

# Prior bounds: half to double the true parameter values.
true_params = np.array(bound)
dmin = true_params * 0.5
dmax = true_params * 2.0

uni_prior = uniform_prior.UniformPrior(dmin, dmax)

# "Observed" data: 100 stochastic trajectories of the toggle model.
fixed_data = toggle_model.run(solver=NumPySSASolver,
                              number_of_trajectories=100,
                              show_labels=False)

# reshape data to (N,S,T)
fixed_data = np.asarray([x.T for x in fixed_data])
# and remove timepoints
fixed_data = fixed_data[:, 1:, :]

# Summary statistic: tsfresh minimal feature set per trajectory.
summ_func = lambda x: fe.generate_tsfresh_features(x, MinimalFCParameters())

# NOTE(review): shadows the common `ns` abbreviation — this is the distance
# object, not a namespace.
ns = naive_squared.NaiveSquaredDistance()


def test_abc_functional():
    abc = ABC(fixed_data,
              sim=simulator2,
              prior_function=uni_prior,
              summaries_function=summ_func,
              distance_function=ns)

    abc.compute_fixed_mean(chunk_size=2)

    # run in multiprocessing mode
    res = abc.infer(num_samples=30, batch_size=10, chunk_size=2)