Beispiel #1
0
    def _get_extraction_params(self):
        """Assemble the keyword arguments passed to tsfresh feature extraction.

        Starts from tsfresh's own defaults, overrides them with any
        user-supplied attributes of the same name, and resolves the
        convenience strings ("minimal"/"efficient"/"comprehensive") for
        ``default_fc_parameters`` into the corresponding tsfresh
        parameter objects.

        Returns
        -------
        dict
            Keyword arguments suitable for ``tsfresh.extract_features``.

        Raises
        ------
        ValueError
            If ``default_fc_parameters`` is a string but not one of the
            recognized convenience names.
        """
        # make n_jobs compatible with scikit-learn
        self.n_jobs = check_n_jobs(self.n_jobs)

        # lazy imports to avoid a hard dependency on tsfresh
        from tsfresh.defaults import (
            CHUNKSIZE,
            DISABLE_PROGRESSBAR,
            N_PROCESSES,
            PROFILING,
            PROFILING_FILENAME,
            PROFILING_SORTING,
            SHOW_WARNINGS,
        )
        from tsfresh.feature_extraction.settings import (
            ComprehensiveFCParameters,
            EfficientFCParameters,
            MinimalFCParameters,
        )
        from tsfresh.utilities.dataframe_functions import impute

        # Set defaults from tsfresh
        extraction_params = {
            "kind_to_fc_parameters": self.kind_to_fc_parameters,
            "n_jobs": N_PROCESSES,
            "chunksize": CHUNKSIZE,
            "show_warnings": SHOW_WARNINGS,
            "disable_progressbar": DISABLE_PROGRESSBAR,
            "impute_function": impute,
            "profiling_sorting": PROFILING_SORTING,
            "profiling_filename": PROFILING_FILENAME,
            "profile": PROFILING,
        }

        # Replace defaults with user defined parameters (only when set)
        for name in extraction_params:
            value = getattr(self, name, None)
            if value is not None:
                extraction_params[name] = value

        # Convert convenience string arguments to tsfresh parameter classes
        fc_param_lookup = {
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters(),
            "comprehensive": ComprehensiveFCParameters(),
        }
        if isinstance(self.default_fc_parameters, str):
            if self.default_fc_parameters not in fc_param_lookup:
                # BUG FIX: the original interpolated `fc_param_lookup.keys()`,
                # which renders as "dict_keys([...])" in the error message.
                raise ValueError(
                    f"If `default_fc_parameters` is passed as a string, it "
                    f"must be one of {list(fc_param_lookup)}, but found: "
                    f"{self.default_fc_parameters}"
                )
            fc_parameters = fc_param_lookup[self.default_fc_parameters]
        else:
            fc_parameters = self.default_fc_parameters
        extraction_params["default_fc_parameters"] = fc_parameters

        return extraction_params
def extractFeatures(dataSetToExtractFrom, feature_settings="minimal"):
    """ Extracts features of the given dataset and returns a new dataset of features only.

    Keyword arguments:
    dataSetToExtractFrom     -- Dataset (type: pandas.core.frame.DataFrame)
    feature_settings         -- Feature extraction parameter (type: string, options: 'minimal','maximal', 'findBest')

    Returns:
        pandas.core.frame.DataFrame
    """

    # Drop the metadata columns so only signal channels reach tsfresh.
    dataset_for_extraction = dataSetToExtractFrom.drop(
        columns=['label', 'hand', 'annotator'])

    # Map the convenience string onto the matching tsfresh parameter class;
    # unknown values fall back to the minimal set with a warning.
    settings_lookup = {
        "minimal": MinimalFCParameters,
        "maximal": ComprehensiveFCParameters,
        "findBest": EfficientFCParameters,
    }
    settings_cls = settings_lookup.get(feature_settings)
    if settings_cls is None:
        settings_cls = MinimalFCParameters
        print('Given value for feature_parameter not valid! Minimal feature set is used instead.')
    extractedFeatures = settings_cls()

    extracted_featureset = extract_features(dataset_for_extraction, column_id="punch_id",
                                            column_sort="timestamp", impute_function=impute, default_fc_parameters=extractedFeatures)
    return extracted_featureset
Beispiel #3
0
def get_ts_features(X: Union[np.ndarray, torch.Tensor],
                    y: Union[None, np.ndarray, torch.Tensor] = None,
                    features: Union[str, dict] = 'min',
                    n_jobs: Optional[int] = None,
                    **kwargs):
    """
    Compute tsfresh features for a batch of time series.

    Args:
        X: np.array or torch.Tensor of shape [samples, dimensions, timesteps].
        y: Not required for unlabeled data. Otherwise, you need to pass it.
        features: 'min', 'efficient', 'all', or a dictionary. Be aware that 'efficient' and 'all' may require substantial memory and time.
        n_jobs: number of workers; defaults to all available cpus.
        **kwargs: forwarded to ``tsfresh.extract_features``. May include
            'default_fc_parameters', which then takes precedence over
            ``features``.

    Returns:
        DataFrame of extracted features, with target column(s) appended
        when ``y`` is provided.
    """
    df = to_tsfresh_df(X)
    n_jobs = ifnone(n_jobs, defaults.cpus)
    if 'default_fc_parameters' in kwargs:
        # BUG FIX: the original read an undefined local
        # (`default_fc_parameters = default_fc_parameters`), raising
        # UnboundLocalError, and would also have passed the keyword twice to
        # extract_features. Pop it so it is forwarded exactly once.
        default_fc_parameters = kwargs.pop('default_fc_parameters')
    elif features == 'min':
        default_fc_parameters = MinimalFCParameters()
    elif features == 'efficient':
        default_fc_parameters = EfficientFCParameters()
    elif features == 'all':
        default_fc_parameters = ComprehensiveFCParameters()
    else:
        default_fc_parameters = None
    df = tsfresh.extract_features(df,
                                  column_id="id",
                                  n_jobs=n_jobs,
                                  default_fc_parameters=default_fc_parameters,
                                  **kwargs)
    if y is not None:
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        for i in range(y.shape[-1]):
            # Single target keeps the plain 'target' name for compatibility.
            df['target' if y.shape[-1] == 1 else f'target_{i}'] = y[:, i]
    return df
Beispiel #4
0
    def test_extraction_runs_through(self):
        """Smoke test: extraction in 'kind' long format yields one row per id."""
        rfs = EfficientFCParameters()
        data = pd.DataFrame([[0, 0, 0, 0], [1, 0, 0, 0]], columns=["id", "time", "kind", "value"])

        extracted_features = extract_features(data, default_fc_parameters=rfs,
                                              column_kind="kind", column_value="value",
                                              column_sort="time", column_id="id")

        # unittest.TestCase provides assertCountEqual natively on Python 3;
        # no need to route through the six compatibility shim.
        self.assertCountEqual(extracted_features.index, [0, 1])
Beispiel #5
0
    def __init__(self, features='minimal', corrcoef=False, use_logger=False):
        """Initialize the TSFRESH summary-statistics object.

        Parameters
        ----------
        features : str or dict
            Either 'minimal'/'full' (mapped to tsfresh parameter sets) or a
            tsfresh-style feature dictionary used as-is.
        corrcoef : bool
            Whether pairwise correlation coefficients are added as summaries.
        use_logger : bool
            Forwarded to the base class.
        """
        self.name = 'SummariesTSFRESH'
        super(SummariesTSFRESH, self).__init__(self.name,
                                               use_logger=use_logger)

        # isinstance instead of `type(...) is str` so str subclasses work too.
        if isinstance(features, str):
            allowed_str = ['minimal', 'full']
            assert features in allowed_str, "{0} is not recognized, supported sets are 'minimal' and 'full'".format(
                features)
            if features == 'minimal':
                self.features = MinimalFCParameters()
                # 'length' is constant for fixed-length series; drop it.
                self.features.pop('length')
            else:
                self.features = EfficientFCParameters()
        else:
            self.features = features

        self.corrcoef = corrcoef

        self.summaries_names = _get_tsfresh_features_names(self.features)
Beispiel #6
0
    def test_contains_all_non_high_comp_cost_features(self):
        """
        Test that by default a EfficientFCParameters object should be set up to calculate all features defined
        in tsfresh.feature_extraction.feature_calculators that do not have the attribute "high_comp_cost"
        """
        efficient_params = EfficientFCParameters()
        # Every calculator that has an fctype and is not flagged as expensive
        # is expected to be present in the default efficient parameter set.
        expected_calculators = [
            name
            for name, func in feature_calculators.__dict__.items()
            if hasattr(func, "fctype") and not hasattr(func, "high_comp_cost")
        ]

        for name in expected_calculators:
            self.assertIn(
                name,
                efficient_params,
                msg='Default EfficientFCParameters object does not setup calculation of {}'.format(name),
            )
Beispiel #7
0
def get_potential_auxiliary_tasks(x, y, n_jobs=4):
    """Fit a tsfresh RelevantFeatureAugmenter on the given data and return it."""
    timeseries_df = collect_tsfresh_dataframe(x)
    transform_df = make_tsfresh_transform_dataframe(x)

    # We use EfficientFCParameters here for computational performance reasons.
    # ComprehensiveFCParameters may be used when performance is not critical.
    # ComprehensiveFCParameters was used in
    # "Distantly Supervised Multitask Learning in Critical Care".
    augmenter = RelevantFeatureAugmenter(
        column_id="id",
        column_value="value",
        n_jobs=n_jobs,
        filter_only_tsfresh_features=True,
        show_warnings=False,
        default_fc_parameters=EfficientFCParameters(),
    )

    augmenter.set_timeseries_container(timeseries_df)
    augmenter.fit(X=transform_df, y=y)
    return augmenter
Beispiel #8
0
    def _extract_features(self, devices, trial_id):
        """Extract (and cache) tsfresh features for one trial.

        Results are pickled under FEATURE_CACHE keyed by trial id, window
        size, feature type and motion setting; a cached pickle is returned
        directly on subsequent calls.

        Raises
        ------
        RuntimeError
            If ``self.feature_type`` is not one of
            'efficient'/'comprehensive'/'minimal'.
        """
        # NOTE: self.motion appears tri-valued (True / 'only' / other),
        # hence the explicit `== True` comparisons are kept.
        if self.motion == True:
            pickle_path = FEATURE_CACHE + 'X{}-{}-{}.pickle'.format(
                trial_id, self.window_size, self.feature_type)
        elif self.motion == 'only':
            pickle_path = FEATURE_CACHE + 'X{}-{}-{}-motion-only.pickle'.format(
                trial_id, self.window_size, self.feature_type)
        else:
            pickle_path = FEATURE_CACHE + 'X{}-{}-{}-no-motion.pickle'.format(
                trial_id, self.window_size, self.feature_type)

        if os.path.isfile(pickle_path):
            # BUG FIX: use a context manager so the cache file handle is
            # closed deterministically (the original leaked open files).
            with open(pickle_path, "rb") as cache_file:
                return pickle.load(cache_file)

        wrist_device = devices[0]
        if self.motion == True:
            input_columns = ['red', 'ir', 'gyro', 'accel']
        elif self.motion == 'only':
            input_columns = ['gyro', 'accel']
        else:
            input_columns = ['red', 'ir']
        X_raw = wrist_device[input_columns]

        X_windowed = self._windowize_tsfresh(X_raw)

        if self.feature_type == 'efficient':
            features = EfficientFCParameters()
        elif self.feature_type == 'comprehensive':
            features = ComprehensiveFCParameters()
        elif self.feature_type == 'minimal':
            features = MinimalFCParameters()
        else:
            raise RuntimeError("Invalid feature type")
        print("Extracting features for trial " + str(trial_id))
        X = extract_features(X_windowed,
                             column_id='id',
                             column_sort='time',
                             n_jobs=N_JOBS,
                             default_fc_parameters=features)
        impute(X)
        # Write the cache atomically-ish via a context-managed handle.
        with open(pickle_path, "wb") as cache_file:
            pickle.dump(X, cache_file)
        return X
def split_into_train_test_out_tsfresh(data, in_num):
    """
    Build a tsfresh forecasting frame from ``data`` and extract a hand-picked
    set of features for one-step-ahead prediction.

    Parameters
    ----------
    data : array-like
        Raw univariate time series (kind "price").
    in_num : int
        Maximum timeshift (rolling window length) for the forecasting frame.

    Returns
    -------
    tuple
        (x_train, y_train, x_test): all windows but the last as training
        features/targets, and the last window's features as the test point.
    """

    data1 = np.roll(data, -1)  # roll the data once

    # make the dataframe using Tsfresh package
    df_shift_small, y_train = make_forecasting_frame(data1,
                                                     kind="price",
                                                     max_timeshift=in_num,
                                                     rolling_direction=1)

    # create the tsfresh features for every rolling window
    result = extract_features(df_shift_small,
                              column_id="id",
                              column_sort="time",
                              column_value="value",
                              impute_function=impute,
                              show_warnings=False,
                              disable_progressbar=False,
                              n_jobs=5,
                              chunksize=1,
                              default_fc_parameters=EfficientFCParameters())

    #result_without_zero = result.loc[:, (result != 0).any(axis=0)]
    # hand-picked subset of tsfresh output columns kept for modelling
    columl_list = [
        #            'value__absolute_sum_of_changes',
        # =============================================================================
        # =============================================================================
        'value__agg_autocorrelation__f_agg_"mean"',
        'value__agg_autocorrelation__f_agg_"median"',
        'value__agg_autocorrelation__f_agg_"var"',
        'value__autocorrelation__lag_0',
        'value__autocorrelation__lag_1',
        'value__autocorrelation__lag_2',
        'value__binned_entropy__max_bins_10',
        # =============================================================================
        # =============================================================================
        #                     'value__cid_ce__normalize_False',
        #                     'value__cid_ce__normalize_True',
        #                     'value__count_above_mean',
        #                     'value__count_below_mean',
        #                     'value__fft_aggregated__aggtype_"centroid"',
        'value__fft_aggregated__aggtype_"variance"',
        'value__fft_coefficient__coeff_0__attr_"abs"',
        'value__fft_coefficient__coeff_0__attr_"real"',
        'value__fft_coefficient__coeff_1__attr_"abs"',
        'value__fft_coefficient__coeff_1__attr_"angle"',
        'value__fft_coefficient__coeff_1__attr_"imag"',
        'value__fft_coefficient__coeff_1__attr_"real"',
        'value__first_location_of_maximum',
        #=============================================================================
        # =============================================================================
        'value__large_standard_deviation__r_0.05',
        'value__large_standard_deviation__r_0.1',
        'value__large_standard_deviation__r_0.15000000000000002',
        'value__large_standard_deviation__r_0.2',
        'value__large_standard_deviation__r_0.25',
        #                       'value__large_standard_deviation__r_0.30000000000000004',
        #                       'value__large_standard_deviation__r_0.35000000000000003',
        #                       'value__large_standard_deviation__r_0.4',
        #                       'value__large_standard_deviation__r_0.45',
        # =============================================================================
        # =============================================================================
        'value__linear_trend__attr_"intercept"',
        'value__linear_trend__attr_"pvalue"',
        'value__linear_trend__attr_"rvalue"',
        'value__linear_trend__attr_"slope"',
        'value__longest_strike_above_mean',
        'value__longest_strike_below_mean',
        'value__max_langevin_fixed_point__m_3__r_30',
        'value__maximum',
        'value__mean',
        'value__mean_abs_change',
        'value__mean_change',
        'value__median',
        'value__minimum',
        'value__number_cwt_peaks__n_5',
        'value__partial_autocorrelation__lag_0',
        'value__partial_autocorrelation__lag_1',
        'value__partial_autocorrelation__lag_2',
        'value__standard_deviation',
        'value__sum_values',
        'value__variance'
    ]
    # keep only the selected feature columns
    result_without_zero = result[columl_list]

    # last row is the out-of-sample test point; the rest is training data
    x_train = result_without_zero[:-1]
    x_test = result_without_zero[-1:]
    y_train = y_train[:-1]

    return x_train, y_train, x_test
Beispiel #10
0
def test_generate_tsfresh_features():
    # Smoke test: feature generation runs on random two-sample input.
    signals = np.random.randn(2, 100)
    fc_params = EfficientFCParameters()
    _ = generate_tsfresh_features(signals, fc_params)
Beispiel #11
0
import tensorflow as tf
# Fail fast if no GPU is visible: the extraction runs below are expected to
# execute on a GPU runtime (e.g. Colab).
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

def extract_product_features(df, fc_parameter, destination):
  """Extract tsfresh features per product (sitc_id) and export each to CSV.

  Args:
    df: trade dataframe with columns sitc_id, export_val, year, country.
    fc_parameter: a tsfresh FC-parameters instance selecting the feature set.
    destination: output location. NOTE(review): currently unused — CSVs are
      written to the working directory; confirm and wire this up.

  Returns:
    One dataframe with the features of all products concatenated.
  """
  features_product = []
  extraction_method = fc_parameter.__class__.__name__
  for p in df.sitc_id.unique():
    product = df[df.sitc_id == p]
    p_features = extract_features(
      product[["export_val", "year", "country"]],
      column_id="country",
      column_sort="year",
      column_value=None, column_kind=None,
      chunksize=None,
      default_fc_parameters=fc_parameter
      )
    features_product.append(p_features)
    p_features.to_csv(f"{p}_{extraction_method}_expval.csv")
    print(f'Extracted features for {p}: \n {features_product}')
  product_features = pd.concat(features_product)
  # BUG FIX: return the concatenated frame for ALL products — the original
  # returned p_features, i.e. only the last product processed.
  return product_features

# NOTE(review): `%timeit` is IPython magic — alone on a line like this it
# times nothing; it would need to prefix the statement being measured.
%timeit
destination_1 =f'{PATH}/efficient_parameters'
destination_2 = f'{PATH}/comprehensive_parameters'
# Run the extraction once with the efficient and once with the comprehensive
# tsfresh parameter set.
fc_parameters=[EfficientFCParameters(),ComprehensiveFCParameters()]
extract_product_features(trade_dframe,fc_parameters[0],destination_1)
extract_product_features(trade_dframe,fc_parameters[1],destination_2)
Beispiel #12
0
# NOTE(review): this value is overwritten by the pd.concat below — dead store?
features_data = np.column_stack((lab_ver_features, lab_hor_features))

# Create TSFresh features for each projected dimension,
# and stack both dimensions horizontally:
lab_ver_for_tsf = ts_fresh.convert_signals_for_ts_fresh(
    sub_lab_ver_proj, "ver")
# NOTE(review): the vertical axis uses ComprehensiveFCParameters while the
# horizontal axis below uses EfficientFCParameters — confirm the asymmetry
# is intentional.
lab_ver_tsf_features = extract_features(
    lab_ver_for_tsf,
    default_fc_parameters=ComprehensiveFCParameters(),
    column_id="signal_id",
    column_sort="time")
lab_hor_for_tsf = ts_fresh.convert_signals_for_ts_fresh(
    sub_lab_hor_proj, "hor")
lab_hor_tsf_features = extract_features(
    lab_hor_for_tsf,
    default_fc_parameters=EfficientFCParameters(),
    column_id="signal_id",
    column_sort="time")
# Final feature matrix: both tsfresh feature sets plus the hand-crafted
# vertical/horizontal features, stacked column-wise.
features_data = pd.concat([
    lab_ver_tsf_features, lab_hor_tsf_features,
    pd.DataFrame(lab_ver_features),
    pd.DataFrame(lab_hor_features)
],
                          axis=1)
'''
Prepare the data for the classification process:
'''


def create_labels(symptom_name, tags_data, condition_vector, binarize=True):
    if symptom_name == 'tremor':
Beispiel #13
0
class SummariesTSFRESH(SummaryBase):
    """
    Class for computing features/statistics on time series data.
    An ensemble of different statistics from TSFRESH are supported.
    """
    def __init__(self, features='minimal', corrcoef=False, use_logger=False):
        """Set up the feature set used for summary computation.

        Parameters
        ----------
        features : str or dict
            'minimal' or 'full' to select a tsfresh parameter set, or a
            tsfresh-style feature dictionary used as-is.
        corrcoef : bool
            If True, pairwise Pearson correlations between species are
            appended to the summaries.
        use_logger : bool
            Forwarded to SummaryBase.
        """
        self.name = 'SummariesTSFRESH'
        super(SummariesTSFRESH, self).__init__(self.name,
                                               use_logger=use_logger)

        # isinstance instead of `type(...) is str` so str subclasses work too.
        if isinstance(features, str):
            allowed_str = ['minimal', 'full']
            assert features in allowed_str, "{0} is not recognized, supported sets are 'minimal' and 'full'".format(
                features)
            if features == 'minimal':
                self.features = MinimalFCParameters()
                # 'length' is constant for fixed-length series; drop it.
                self.features.pop('length')
            else:
                self.features = EfficientFCParameters()
        else:
            self.features = features

        self.corrcoef = corrcoef

        self.summaries_names = _get_tsfresh_features_names(self.features)

    def _compute_tsfresh(self, point):
        """
        Computes features for one point (time series).

        Parameters
        ----------
        point : ndarray
            trajectory of shape n_timepoints x 1

        Returns
        -------
        list
            list of generated features
        """
        return generate_tsfresh_features(point, features=self.features)

    def _compute_corrcoef(self, x, y):
        """
        Computes the Pearson correlation coefficient between two trajectories.

        Parameters
        ----------
        x : ndarray
            Trajectory of shape n_timepoints x 1
        y : ndarray
            Trajectory of shape n_timepoints x 1

        Returns
        -------
        list
            single-element list holding the correlation coefficient
        """
        return [np.corrcoef(x, y)[0, 1]]

    def compute(self, point):
        """
        Compute summary statistics for a batch of trajectories.

        Parameters
        ----------
        point : array-like
            Trajectories of shape (n_points, n_species, n_timepoints).

        Returns
        -------
        ndarray
            Row vector of tsfresh summaries averaged over the n_points axis;
            when ``corrcoef`` is enabled, mean pairwise species correlations
            are appended.
        """
        point = np.asarray(point)
        assert len(
            point.shape
        ) == 3, "required input shape is (n_points, n_species, n_timepoints)"
        tsfresh_summaries = self._compute_tsfresh(point)
        tsfresh_summaries = np.asarray(tsfresh_summaries)
        # Average per-point feature vectors into a single row.
        tsfresh_summaries = np.mean(tsfresh_summaries, axis=0, keepdims=True)
        if self.corrcoef:
            assert point.shape[
                1] > 1, "corrcoef = True can only be used if the n_species > 1"
            corrcoef_summaries = []
            species_indices = range(point.shape[1])
            for trajectory in point:
                corr = []
                # Correlation for every unordered pair of species.
                for pair in combinations(species_indices, 2):
                    corr.append(
                        self._compute_corrcoef(trajectory[pair[0]],
                                               trajectory[pair[1]])[0])
                corrcoef_summaries.append(corr)
            corrcoef_summaries = np.asarray(corrcoef_summaries)
            corrcoef_summaries = np.mean(corrcoef_summaries,
                                         axis=0,
                                         keepdims=True)
            return np.hstack((tsfresh_summaries, corrcoef_summaries))
        else:
            return tsfresh_summaries
Beispiel #14
0
def test_generate_tsfresh_features():
    # Two random 2-species series of length 100 should each yield 1500
    # efficient-parameter features.
    signals = np.random.randn(2, 2, 100)
    fc_params = EfficientFCParameters()
    result = generate_tsfresh_features(signals, fc_params)
    assert result.shape == (2, 1500)