Example #1
def reshape_data_tsfresh(seq_dataset, n_classes, n_steps, settings):
    """
    Transform sequences dataset into dataset of features
    """
    len_data = seq_dataset.shape[0]
    data_divided = []
    for i in range(n_classes):
        data_divided.append(seq_dataset[:, :, i].reshape(-1))
    to_extract = []
    for i in range(n_classes):
        ids = np.arange(len_data).repeat(n_steps)
        tmp = np.vstack((ids, data_divided[i]))
        tmp = tmp.T
        to_extract.append(pd.DataFrame(data=tmp, columns=["id", "value"]))
    tfs = []
    # parameters of tsfresh features extraction
    if settings == "complete":
        settings = ComprehensiveFCParameters()
    elif settings == "efficient":
        settings = EfficientFCParameters()
    elif settings == "minimal":
        settings = MinimalFCParameters()
    for i in range(n_classes):
        tf = tsfresh.extract_features(
            to_extract[i], column_id="id", default_fc_parameters=settings
        )
        tfs.append(tf)
    data_feat = pd.concat(
        [tfs[i].reindex(tfs[0].index) for i in range(n_classes)], axis=1
    )
    print(data_feat.shape)
    data_feat.fillna(0, inplace=True)
    data_feat.replace([np.inf, -np.inf], 0, inplace=True)
    data_tensor = torch.from_numpy(data_feat.values).float()
    return data_tensor
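A minimal usage sketch for the function above (not part of the original example): it assumes the module imports numpy, pandas, torch, tsfresh and the tsfresh FCParameters classes, and the shapes below are chosen purely for illustration.

import numpy as np
import pandas as pd
import torch
import tsfresh
from tsfresh.feature_extraction import (
    ComprehensiveFCParameters,
    EfficientFCParameters,
    MinimalFCParameters,
)

# Hypothetical input: 20 sequences, 50 time steps, 2 channels ("classes").
seq_dataset = np.random.randn(20, 50, 2)
features = reshape_data_tsfresh(seq_dataset, n_classes=2, n_steps=50, settings="minimal")
print(features.shape)  # torch.Size([20, 2 * number_of_features_per_channel])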
Example #2
 def _extract_tsfresh_features(self, X):
     X_df = self._convert_to_df(X)
     X_df_no_nans = X_df.dropna()
     if self.extraction_type == "minimal":
         extraction_setting = MinimalFCParameters()
     elif self.extraction_type == "efficient":
         extraction_setting = EfficientFCParameters()
     elif self.extraction_type == "all":
         extraction_setting = ComprehensiveFCParameters()
     else:
         raise ValueError(
             f"{self.extraction_type} is not a supported feature extraction option. Please choose one from "
             f"the following options: [minimal, efficient, all]."
         )
     # Extract time series features from the dataframe
     # Replace any ``NaNs`` and ``infs`` in the extracted features with median/extreme values for that column
     tsfresh_features = extract_features(
         X_df_no_nans,
         default_fc_parameters=extraction_setting,
         column_id="id",
         column_sort="time",
         impute_function=impute,
     )
     # If X_df.dropna() dropped some observations entirely (i.e., due to all NaNs),
     # impute each tsfresh feature for those observations with the median of that tsfresh feature
     tsfresh_features_imputed = impute(tsfresh_features.reindex(pd.RangeIndex(X_df["id"].max() + 1)))
     return tsfresh_features_imputed, X_df
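For reference, tsfresh's impute (used twice above) works in place, column by column. A rough sketch of its behaviour on toy data, not taken from the original source:

import numpy as np
import pandas as pd
from tsfresh.utilities.dataframe_functions import impute

toy = pd.DataFrame({"feat": [1.0, np.nan, np.inf, -np.inf, 3.0]})
impute(toy)  # NaN -> column median, +inf -> column max, -inf -> column min (from finite values)
print(toy["feat"].tolist())  # [1.0, 2.0, 3.0, 1.0, 3.0]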
Example #3
    def test_gen_global_feature_multi_id(self):
        dates = pd.date_range('1/1/2019', periods=8)
        data = np.random.randn(8, 3)
        df = pd.DataFrame({"datetime": dates, "values": data[:, 0],
                           "A": data[:, 1], "B": data[:, 2],
                           "id": ["00"]*4+["01"]*4})
        from tsfresh.feature_extraction import ComprehensiveFCParameters
        from tsfresh.feature_extraction import MinimalFCParameters
        from tsfresh.feature_extraction import EfficientFCParameters
        for params in [ComprehensiveFCParameters(),
                       MinimalFCParameters(),
                       EfficientFCParameters()]:
            output_df, _ = generate_global_features(input_df=df,
                                                    column_id="id",
                                                    column_sort="datetime",
                                                    default_fc_parameters=params)

            assert "datetime" in output_df.columns
            assert "values" in output_df.columns
            assert "A" in output_df.columns
            assert "B" in output_df.columns
            assert "id" in output_df.columns

            for col in output_df.columns:
                if col in ["datetime", "values", "A", "B", "id"]:
                    continue
                assert len(set(output_df[output_df["id"] == "00"][col])) == 1
                assert len(set(output_df[output_df["id"] == "01"][col])) == 1
                assert output_df[output_df["id"] == "00"][col].isna().sum() == 0
                assert output_df[output_df["id"] == "01"][col].isna().sum() == 0
Example #4
def featurize_set(ids, fc_params=None):
    if fc_params is None:
        fc_params = EfficientFCParameters()
    X_df = pd.DataFrame()
    for id in tqdm(ids):
        X_df = pd.concat([X_df, featurize_audio(id, fc_params)])
    return X_df
def main():
    files = pd.read_excel(
        '/home/velaraptor/Downloads/Raw Data 10yrs (2018).xlsx', header=1)
    files = files.fillna(0)
    groups = files.groupby('Name')
    forecast_df = []
    for name, group in tqdm.tqdm(groups):
        if len(group) > 1:
            group.index = group.Year
            df_shift, y = make_forecasting_frame(group["FantPt"],
                                                 kind=name,
                                                 max_timeshift=10,
                                                 rolling_direction=1)
            forecast_df.append(df_shift)

    features_df = []
    for sample in tqdm.tqdm(forecast_df):
        X = extract_features(sample,
                             column_id="id",
                             column_sort="time",
                             column_value="value",
                             impute_function=impute,
                             show_warnings=False,
                             disable_progressbar=True,
                             default_fc_parameters=EfficientFCParameters())
        X = X.reset_index()
        X.loc[:, 'Name'] = sample['kind']
        features_df.append(X)
    features_time_series = pd.concat(features_df)
    features_time_series.to_csv('features_time_series.csv', index=False)
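make_forecasting_frame, used above to build one forecasting problem per player, restacks a single series into windowed id/time/value rows plus a target series. A small sketch with toy values (not from the original source):

import pandas as pd
from tsfresh.utilities.dataframe_functions import make_forecasting_frame

series = pd.Series([10.0, 12.0, 11.0, 15.0])
df_shift, y = make_forecasting_frame(series, kind="demo", max_timeshift=2, rolling_direction=1)
# df_shift is a long frame with id/kind/time/value columns, one rolling window per target;
# y maps each window id to the value to be forecast for it.
print(df_shift.head())
print(y)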
Example #6
def get_tsfresh_features(df):
    """Calculate different aggregates/descriptive statistics, using tsfresh,
    of the some of the more informative raw timeseries.

    Parameters:
    -----------
    - df: pd.DataFrame
        the raw (timeseries) data containing the categorical features

    Returns:
    --------
    - ts_features: pd.DataFrame
        a DataFrame with each record a process, containing the features
        based on the binary-valued timeseries
    """

    # We only keep the feature extraction functions that are not too
    # computationally expensive & that do not return too many values
    extraction_settings = EfficientFCParameters()
    filtered_funcs = [
        'abs_energy', 'mean_abs_change', 'mean_change', 'skewness', 'kurtosis',
        'absolute_sum_of_changes', 'longest_strike_below_mean',
        'longest_strike_above_mean', 'count_above_mean', 'count_below_mean',
        'last_location_of_maximum', 'first_location_of_maximum',
        'last_location_of_minimum', 'first_location_of_minimum',
        'percentage_of_reoccurring_datapoints_to_all_datapoints',
        'percentage_of_reoccurring_values_to_all_values',
        'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points',
        'ratio_value_number_to_time_series_length', 'cid_ce',
        'symmetry_looking', 'large_standard_deviation', 'quantile',
        'autocorrelation', 'number_peaks', 'binned_entropy',
        'index_mass_quantile', 'linear_trend', 'number_crossing_m',
        'augmented_dickey_fuller', 'number_cwt_peaks', 'agg_autocorrelation',
        'spkt_welch_density', 'friedrich_coefficients',
        'max_langevin_fixed_point', 'c3', 'ar_coefficient',
        'mean_second_derivative_central', 'ratio_beyond_r_sigma',
        'energy_ratio_by_chunks', 'partial_autocorrelation', 'fft_aggregated',
        'time_reversal_asymmetry_statistic', 'range_count'
    ]
    filtered_settings = {}
    for func in filtered_funcs:
        filtered_settings[func] = extraction_settings[func]

    # Extract the features
    ts_features = extract_features(df[[
        'process_id', 'timestamp', 'return_turbidity', 'return_flow',
        'supply_flow', 'target_value', 'flow_diff'
    ]],
                                   column_id='process_id',
                                   column_sort="timestamp",
                                   column_kind=None,
                                   column_value=None,
                                   impute_function=impute,
                                   default_fc_parameters=filtered_settings,
                                   show_warnings=False,
                                   disable_progressbar=True)

    return ts_features
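The tsfresh settings objects are plain dictionaries mapping calculator names to parameter lists, which is why the function above can subset EfficientFCParameters by key. A quick illustration (not from the original source):

from tsfresh.feature_extraction import EfficientFCParameters

settings = EfficientFCParameters()
print(len(settings))                    # number of feature calculators
print(settings["autocorrelation"][:2])  # parameterised calculators map to lists of dicts, e.g. [{'lag': 0}, {'lag': 1}]
small = {name: settings[name] for name in ["abs_energy", "skewness", "quantile"]}
# `small` can be passed directly as default_fc_parameters to extract_features.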
def add_tsfresh_day(new_data, data, tsfresh_features, columns):

    # The dictionary containing the features that we want to extract and the setting for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):

        all_days = []
        for day in range(len(data[participant])):

            # We only take the columns that we are interested in
            sub_data = data[participant][day].loc[data[participant][day]
                                                  ['variable'].isin(columns)]

            # Drop all nan values
            sub_data = sub_data.dropna(axis=0)

            # If a column is missing we add a row with that column and a 0.
            # If a column contains nan values we do the same
            for col in columns:
                if col not in sub_data['variable'].values:
                    new_row = sub_data.iloc[0].copy(deep=True)
                    new_row['variable'] = col
                    new_row['value'] = 0
                    sub_data = pd.concat([sub_data, new_row.to_frame().T],
                                         ignore_index=True)

            from tsfresh.utilities.dataframe_functions import impute_dataframe_zero
            # Extract features for every variable still left in the dataframe
            extracted = extract_features(sub_data,
                                         default_fc_parameters=settings,
                                         column_id='variable',
                                         column_sort='time_seconds',
                                         column_value='value')

            # We do not want multiple rows in the case of multiple variables, so we reshape
            # the result into a single row. We also rename the columns so that we know what
            # kind of features they are
            extracted = extracted.stack()
            extracted.index = extracted.index.map('{0[1]}_{0[0]}_day'.format)
            extracted = extracted.to_frame().T

            # Add the extracted features to a list
            all_days.append(extracted)

        # Concat the days to make a new dataframe and reset the index to prevent conflicts
        all_days = pd.concat(all_days, axis=0).reset_index(drop=True)

        # Add the new features to the data
        new_data[participant] = pd.concat([new_data[participant], all_days],
                                          axis=1)

    return new_data
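The stack/rename step in add_tsfresh_day collapses the per-variable feature matrix into a single row whose column names encode both the variable and the feature. A toy illustration of that trick (values assumed, not from the original source):

import pandas as pd

# Pretend extract_features returned one row for the variable "mood" with two features.
extracted = pd.DataFrame({"value__mean": [1.5], "value__maximum": [4.0]}, index=["mood"])
stacked = extracted.stack()
stacked.index = stacked.index.map('{0[1]}_{0[0]}_day'.format)
print(sorted(stacked.to_frame().T.columns))
# ['value__maximum_mood_day', 'value__mean_mood_day']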
def create_agg_tsfresh(x_train, y_train, x_val, y_val, input_path, size=None):

    y_train = pd.DataFrame(y_train).idxmax(axis=1)
    y_val = pd.DataFrame(y_val).idxmax(axis=1)
    if os.path.exists(input_path + 'agg_train.csv') and os.path.exists(
            input_path + 'agg_val.csv') and size is None:

        x_train_filtered = pd.read_csv(input_path + 'agg_train.csv',
                                       index_col=0)
        x_val_filtered = pd.read_csv(input_path + 'agg_val.csv', index_col=0)

        x_train_filtered = x_train_filtered.loc[:, x_train_filtered.var() != 0]
        x_val_filtered = x_val_filtered[x_train_filtered.columns]

    else:
        x_train_df = df_from_3d_np(x_train)
        x_val_df = df_from_3d_np(x_val)

        x_train_df = x_train_df.fillna(0)
        x_val_df = x_val_df.fillna(0)
        # start_time = time.time()
        x_train_extracted = extract_features(
            x_train_df,
            column_id='index',
            column_sort='time',
            default_fc_parameters=EfficientFCParameters())
        # duration = time.time() - start_time
        # print(f'feature extraction {duration}')
        if 'mts_archive' in input_path:
            x_train_sel = select_features(x_train_extracted, y_train, n_jobs=0)

            # if not enough features, take larger set
            if x_train_sel.shape[1] < 300:
                X_best = SelectKBest(f_classif,
                                     k='all').fit(x_train_extracted, y_train)
                ufs_scores = pd.DataFrame(X_best.scores_,
                                          index=x_train_extracted.columns,
                                          columns=['score']).sort_values(
                                              by=['score'], ascending=False)
                x_train_sel = x_train_extracted[ufs_scores.iloc[:300].index]

            x_train_extracted = x_train_sel

        x_train_extracted = x_train_extracted.dropna(axis='columns')

        x_train_extracted.to_csv(input_path + 'agg_train.csv')
        y_train.to_csv(input_path + 'y_train.csv')

        x_val_filtered = pd.read_csv(input_path + 'agg_val.csv', index_col=0)

        x_train_filtered = x_train_extracted.loc[:,
                                                 x_train_extracted.var() != 0]
        x_val_filtered = x_val_filtered[x_train_filtered.columns]

        y_val.to_csv(input_path + 'y_test.csv')

    return x_train_filtered, y_train, x_val_filtered, y_val
    def features(x: pd.Series) -> pd.DataFrame:
        data = pd.DataFrame(dtype=np.float64)

        data['x'] = x
        data['id'] = 1

        df = extract_features(data,
                              column_id='id',
                              default_fc_parameters=EfficientFCParameters())

        return df
 def relevance(self, X, y):
     from tsfresh.feature_extraction import EfficientFCParameters
     features_extracted = tsfresh.extract_features(
         X,
         column_id="id",
         default_fc_parameters=EfficientFCParameters(),
         disable_progressbar=True)
     tsfresh.utilities.dataframe_functions.impute(features_extracted)
     relevance_features = tsfresh.feature_selection.relevance.calculate_relevance_table(
         features_extracted, y)
     return features_extracted, relevance_features
Example #11
    def gen_global_feature(self, settings="comprehensive", full_settings=None):
        '''
        Generate per-time-series feature for each time series.
        This method will be implemented by tsfresh.

        :param settings: str or dict. If a string is set, then it must be one of "comprehensive",
               "minimal" and "efficient". If a dict is set, then it should follow the instruction
               for default_fc_parameters in tsfresh. The value defaults to "comprehensive".
        :param full_settings: dict. It should follow the instruction for kind_to_fc_parameters in
               tsfresh. The value defaults to None.

        :return: the tsdataset instance.

        '''
        if full_settings is not None:
            self.df = generate_global_features(
                input_df=self.df,
                column_id=self.id_col,
                column_sort=self.dt_col,
                kind_to_fc_parameters=full_settings)
            return self

        from tsfresh.feature_extraction import ComprehensiveFCParameters,\
            MinimalFCParameters, EfficientFCParameters
        default_params = {
            "comprehensive": ComprehensiveFCParameters(),
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters()
        }

        if isinstance(settings, str):
            assert settings in ["comprehensive", "minimal", "efficient"], \
                f"settings str should be one of \"comprehensive\", \"minimal\", \"efficient\"\
                    , but found {settings}."

            default_fc_parameters = default_params[settings]
        else:
            default_fc_parameters = settings

        self.df,\
            additional_feature =\
            generate_global_features(input_df=self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     default_fc_parameters=default_fc_parameters)

        self.feature_col += additional_feature

        return self
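Besides the three string presets, settings may be a dict in tsfresh's default_fc_parameters format. A hedged sketch of such a dict (the calculator names and parameters are standard tsfresh ones; the gen_global_feature call below is hypothetical):

custom_settings = {
    "mean": None,                                 # calculators without parameters map to None
    "autocorrelation": [{"lag": 1}, {"lag": 2}],  # parameterised calculators map to lists of dicts
    "quantile": [{"q": 0.25}, {"q": 0.75}],
}
# tsdataset.gen_global_feature(settings=custom_settings)  # hypothetical TSDataset instance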
def add_tsfresh_participant(data, tsfresh_features, columns, k):

    # The dictionary containing the features that we want to extract and the setting for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):

        # First we add the necessary columns
        data[participant]['id'] = 0
        data[participant]['index'] = data[participant].index

        # We create the rolled time series, which also creates new ids; note that setting
        # max_timeshift to None means that it takes the maximal possible length
        rolled_series = roll_time_series(data[participant],
                                         column_id='id',
                                         column_sort='index',
                                         max_timeshift=k)

        all_features = []
        for column in columns:
            # We extract the features for every element of the time series, which returns a dataframe
            # with the same number of rows as the original dataframe but a different number of columns
            extracted = extract_features(rolled_series,
                                         default_fc_parameters=settings,
                                         column_id='id',
                                         column_sort='index',
                                         column_value=column)

            # We need to reset the indexes as they have been changed and add them to our list of features
            all_features.append(extracted.reset_index(drop=True))

        # Add all the features together
        extracted = pd.concat(all_features, axis=1)

        # We drop the columns that we previously created because we do not want them in the data
        del data[participant]['id']  # note that you can also use df.drop here
        del data[participant]['index']

        data[participant] = pd.concat([data[participant], extracted], axis=1)

    return data
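roll_time_series, used above to build the per-participant windows, expands each row into the rolling window that ends at it. A toy sketch of its output (values assumed, not from the original source):

import pandas as pd
from tsfresh.utilities.dataframe_functions import roll_time_series

toy = pd.DataFrame({"id": [0, 0, 0, 0], "index": [0, 1, 2, 3], "value": [1.0, 2.0, 3.0, 4.0]})
rolled = roll_time_series(toy, column_id="id", column_sort="index", max_timeshift=2)
# Each original timestamp becomes the end point of a window of at most max_timeshift + 1 rows;
# the window an observation belongs to is encoded in the new "id" column.
print(rolled)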
Example #13
def features_generator(path_to_file):
    signals = pd.read_csv(path_to_file)
    seg = int(path_to_file.split('/')[-1].split('.')[0])
    signals['segment_id'] = seg
    
    sel = signals.fillna(0).astype(bool).sum(axis=0) / 60001 > 0.5
    signals = signals.fillna(0).loc[:,sel]

    extracted_features = extract_features(signals.iloc[:,:], 
                                          column_id = 'segment_id', 
                                          default_fc_parameters=EfficientFCParameters(),
                                          n_jobs = 0,
                                          disable_progressbar = True,
                                          chunksize = None
                                         )
    return extracted_features
Example #14
    def extract(self, signal):
        df = pd.DataFrame(signal.reshape(-1, 1))
        df['time'] = np.arange(len(df), dtype=int)
        df['id'] = 1

        features = extract_features(
            df,
            column_id="id",
            column_sort="time",
            default_fc_parameters=EfficientFCParameters())

        results = {}
        values, names = features.iloc[0, :].values, features.columns
        for name, value in zip(names, values):
            results[self.__class__.__name__ + name] = value
        return results
def compute_tsfresh_features(x, save_path, nb_splits=8, which_set='training'):
    print('Processing %s set...' % (which_set))
    n = x.shape[0]
    split_breaks = [int(n / nb_splits) * i for i in range(nb_splits)] + [n]
    for i in range(nb_splits):
        start = split_breaks[i]
        stop = split_breaks[i + 1]
        print('Number of rows being processed:', stop - start)
        features = extract_features(TSFormatting().transform(x.iloc[start:stop]),
                                    column_id='id', column_sort='time',
                                    default_fc_parameters=EfficientFCParameters())
        features['neuron_id'] = x.iloc[start:stop]['neuron_id']
        if (i == 0):
            features.to_csv(save_path, mode='w', header=True, index=True)
        else:
            features.to_csv(save_path, mode='a', header=False, index=True)
        del features
Example #16
    def transform_ts(start, end, file):
        train_columns = pq.read_schema(
            file).names  # List with all column names to test
        # print(train_columns)
        X = pd.DataFrame(data=None)

        for i in train_columns[start:end]:
            df_signal = pq.read_pandas(file, columns=[i]).to_pandas()
            # turn parquet to dataframe of one single signal
            # print("Shape of signal data {}".format(df_signal.shape))

            sig = np.ravel(df_signal.iloc[:, 0].to_numpy())  # turn to numpy
            t = df_signal.index.to_numpy()  # turn time to numpy

            x_dn = de_noising(high_pass_filter(sig))
            x_deleted = delete_repeat(x_dn)
            x_deleted_cond = (x_deleted < 99998)
            x_deleted = x_deleted[x_deleted_cond]
            print(x_deleted.shape)
            t_deleted = t[x_deleted_cond]

            # Generating New Time Series Features from signal
            master_train = pd.DataFrame({
                0: x_deleted,
                1: np.repeat(i, x_deleted.shape[0]),
                2: t_deleted
            })
            # print("Shape of master train data {}".format(master_train.shape))
            # master_train.to_csv('output/master_train.csv')

            extraction_settings = EfficientFCParameters()
            X_signal = extract_features(
                master_train,
                column_id=1,
                column_sort=2,
                impute_function=impute,
                default_fc_parameters=extraction_settings)

            print("Number of extracted features in {}: {}.".format(
                i, X_signal.shape[1]))
            X = pd.concat([X, X_signal])  # DataFrame.append was removed in pandas 2.0

        return X
Example #17
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import l1_min_c
from pathlib import Path

print('Reading data...')
wav_files = glob.glob('sounds/kick/*.wav') + glob.glob(
    'sounds/snare/*.wav') + glob.glob('sounds/tom/*.wav')
all_audio = pd.concat([audio_to_dataframe(path) for path in wav_files])
all_labels = pd.Series(np.repeat(['kick', 'snare', 'tom'], 25),
                       index=wav_files)
all_audio.head()

regenerate_tsfresh = True
if regenerate_tsfresh:
    print('Generating tsfresh data...')
    settings = EfficientFCParameters()
    audio_tsfresh = extract_relevant_features(all_audio,
                                              all_labels,
                                              column_id='file_id',
                                              column_sort='time_id',
                                              default_fc_parameters=settings)
else:
    print('Reading tsfresh data...')
    all_labels = pd.read_pickle('pkl/drum_tsfresh_labels.pkl')
    audio_tsfresh = pd.read_pickle('pkl/drum_tsfresh.pkl')

print('Running logistic regression CV...')
print('Started CV %s' % datetime.now())
cs = l1_min_c(audio_tsfresh, all_labels, loss='log') * np.logspace(0, 7, 16)
cv_result = LogisticRegressionCV(Cs=cs,
                                 penalty='l1',
Example #18
from zoo.chronos.data.utils.feature import generate_dt_features, generate_global_features
from zoo.chronos.data.utils.impute import impute_timeseries_dataframe
from zoo.chronos.data.utils.deduplicate import deduplicate_timeseries_dataframe
from zoo.chronos.data.utils.roll import roll_timeseries_dataframe
from zoo.chronos.data.utils.scale import unscale_timeseries_numpy
from zoo.chronos.data.utils.resample import resample_timeseries_dataframe
from zoo.chronos.data.utils.split import split_timeseries_dataframe

from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters,\
    MinimalFCParameters, EfficientFCParameters
DEFAULT_PARAMS = {"comprehensive": ComprehensiveFCParameters(),
                  "minimal": MinimalFCParameters(),
                  "efficient": EfficientFCParameters()}

_DEFAULT_ID_COL_NAME = "id"
_DEFAULT_ID_PLACEHOLDER = "0"


class TSDataset:
    def __init__(self, data, **schema):
        '''
        TSDataset is an abstract of time series dataset.
        Cascade call is supported for most of the transform methods.
        '''
        self.df = data
        self.id_col = schema["id_col"]
        self.dt_col = schema["dt_col"]
        self.feature_col = schema["feature_col"].copy()
Example #19
def create_test_features_profile(json_file):
    filename = os.path.basename(json_file)
    metric = filename.replace('.mirage.redis.24h.json', '')
    metric_data_dir = os.path.dirname(json_file)
    anomaly_json = json_file
    ts_csv = '%s.test.echo.tsfresh.input.csv' % (json_file)
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'
    if os.path.isfile(t_fname_out):
        return t_fname_out
    start = timer()
    with open(anomaly_json, 'r') as f:
        raw_timeseries = f.read()
    # Convert the timeseries to csv
    try:
        timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
        del raw_timeseries
        timeseries = literal_eval(timeseries_array_str)
        del timeseries_array_str
    except:
        print('error :: could not literal_eval %s' % anomaly_json)
        print(traceback.format_exc())
        return False
    datapoints = timeseries
    del timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)

    for ts, value in converted:
        # print('%s,%s' % (str(int(ts)), str(value)))
        utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
        with open(ts_csv, 'a') as fh:
            fh.write(utc_ts_line)
    del converted

    df = pd.read_csv(ts_csv, delimiter=',', header=None, names=['metric', 'timestamp', 'value'])
#    print('DataFrame created with %s' % ts_csv)
    df.columns = ['metric', 'timestamp', 'value']

    # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
    # tsf_settings = ReasonableFeatureExtractionSettings()
    # Disable tqdm progress bar
    # tsf_settings.disable_progressbar = True

    df_features = extract_features(
        # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
        # df, column_id='metric', column_sort='timestamp', column_kind=None,
        # column_value=None, feature_extraction_settings=tsf_settings)
        df, default_fc_parameters=EfficientFCParameters(),
        column_id='metric', column_sort='timestamp', column_kind=None,
        column_value=None, disable_progressbar=True)
    del df
#    print('features extracted from %s data' % ts_csv)
    # write to disk
    fname_out = fname_in + '.features.csv'
    # Transpose
    df_t = df_features.transpose()
#    print('features transposed')
    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    df_t.to_csv(t_fname_out)
    del df_t
    # Calculate the count and sum of the features values
    df_sum = pd.read_csv(
        t_fname_out, delimiter=',', header=0,
        names=['feature_name', 'value'])
    df_sum.columns = ['feature_name', 'value']
    df_sum['feature_name'] = df_sum['feature_name'].astype(str)
    df_sum['value'] = df_sum['value'].astype(float)

    features_count = len(df_sum['value'])
    features_sum = df_sum['value'].sum()
    del df_sum
#    print('features saved to %s' % (fname_out))
#    print('transposed features saved to %s' % (t_fname_out))
    return t_fname_out
Example #20
def calculate_features_other_minmax(use_file, i_json_file, metric):

    fp_id = 'testing.feature2484'
    base_name = metric
    metric_timestamp = 'none'

    not_anomalous = False
    minmax_not_anomalous = False
    minmax = 0
    minmax_check = True

    with open(use_file, 'r') as f:
        raw_timeseries = f.read()
    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    del raw_timeseries
    anomalous_timeseries = literal_eval(timeseries_array_str)
    anomalous_ts_values_count = len(anomalous_timeseries)

    with open(i_json_file, 'r') as f:
        fp_raw_timeseries = f.read()
    # Convert the timeseries to csv
    fp_timeseries_array_str = str(fp_raw_timeseries).replace('(', '[').replace(')', ']')
    del fp_raw_timeseries
    fp_id_metric_ts = literal_eval(fp_timeseries_array_str)
    fp_id_metric_ts_values_count = len(fp_id_metric_ts)

    try:
        range_tolerance = settings.IONOSPHERE_MINMAX_SCALING_RANGE_TOLERANCE
    except:
        range_tolerance = 0.15
    range_tolerance_percentage = range_tolerance * 100
    check_range = False
    range_similar = False
    if fp_id_metric_ts:
        if anomalous_ts_values_count > 0:
            check_range = True
    lower_range_similar = False
    upper_range_similar = False

    min_fp_value = None
    min_anomalous_value = None
    max_fp_value = None
    max_anomalous_value = None

    if check_range:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            min_fp_value = min(minmax_fp_values)
            max_fp_value = max(minmax_fp_values)
        except:
            min_fp_value = False
            max_fp_value = False
        try:
            minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
            min_anomalous_value = min(minmax_anomalous_values)
            max_anomalous_value = max(minmax_anomalous_values)
        except:
            min_anomalous_value = False
            max_anomalous_value = False
        lower_range_not_same = True
        try:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_not_same = False
                lower_range_similar = True
                print('min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(min_fp_value), str(min_anomalous_value)))
        except:
            lower_range_not_same = True
        if min_fp_value and min_anomalous_value and lower_range_not_same:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_similar = True
                print('min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(min_fp_value), str(min_anomalous_value)))
            else:
                lower_min_fp_value = int(min_fp_value - (min_fp_value * range_tolerance))
                upper_min_fp_value = int(min_fp_value + (min_fp_value * range_tolerance))
                if int(min_anomalous_value) in range(lower_min_fp_value, upper_min_fp_value):
                    lower_range_similar = True
                    print('min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other' % (
                        str(min_fp_value),
                        str(min_anomalous_value),
                        str(range_tolerance_percentage)))
        if not lower_range_similar:
            print('lower range of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar' % (
                str(min_fp_value), str(min_anomalous_value)))
        upper_range_not_same = True
        try:
            if int(max_fp_value) == int(max_anomalous_value):
                upper_range_not_same = False
                upper_range_similar = True
                print('max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(max_fp_value), str(max_anomalous_value)))
        except:
            upper_range_not_same = True
        if max_fp_value and max_anomalous_value and lower_range_similar and upper_range_not_same:
            # @added 20180717 - Task #2446: Optimize Ionosphere
            #                   Feature #2404: Ionosphere - fluid approximation
            # On low values such as 1 and 2, the range_tolerance
            # should be adjusted to account for the very small
            # range. TODO
            lower_max_fp_value = int(max_fp_value - (max_fp_value * range_tolerance))
            upper_max_fp_value = int(max_fp_value + (max_fp_value * range_tolerance))
            if int(max_anomalous_value) in range(lower_max_fp_value, upper_max_fp_value):
                upper_range_similar = True
                print('max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other' % (
                    str(max_fp_value), str(max_anomalous_value),
                    str(range_tolerance_percentage)))
            else:
                print('max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar' % (
                    str(max_fp_value), str(max_anomalous_value)))
    if lower_range_similar and upper_range_similar:
        range_similar = True
    else:
        print('the ranges of fp_id_metric_ts and anomalous_timeseries differ significantly Min-Max scaling will be skipped')

    minmax_fp_ts = []
    # if fp_id_metric_ts:
    if range_similar:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            x_np = np.asarray(minmax_fp_values)
            # Min-Max scaling
            np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
            for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                minmax_fp_ts.append([ts[0], v])
            print('minmax_fp_ts list populated with the minmax scaled time series with %s data points' % str(len(minmax_fp_ts)))
            del minmax_fp_values
        except:
            print('error :: could not minmax scale fp id %s time series for %s' % (str(fp_id), str(base_name)))
        if not minmax_fp_ts:
            print('error :: minmax_fp_ts list not populated')

    minmax_anomalous_ts = []
    if minmax_fp_ts:
        # Only process if they are approximately the same length
        minmax_fp_ts_values_count = len(minmax_fp_ts)
        if minmax_fp_ts_values_count - anomalous_ts_values_count in range(-14, 14):
            try:
                minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
                x_np = np.asarray(minmax_anomalous_values)
                # Min-Max scaling
                np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
                for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                    minmax_anomalous_ts.append([ts[0], v])
                del anomalous_timeseries
                del minmax_anomalous_values
            except:
                print('error :: could not minmax scale current time series anomalous_timeseries for %s - %s' % (str(fp_id), str(base_name)))
            if len(minmax_anomalous_ts) > 0:
                print('minmax_anomalous_ts is populated with %s data points' % str(len(minmax_anomalous_ts)))
            else:
                print('error :: minmax_anomalous_ts is not populated')
        else:
            print('minmax scaled check will be skipped - anomalous_ts_values_count is %s and minmax_fp_ts is %s' % (str(anomalous_ts_values_count), str(minmax_fp_ts_values_count)))

    minmax_fp_ts_csv = '%s/fpid.%s.%s.minmax_fp_ts.tsfresh.input.std.csv' % (
        settings.SKYLINE_TMP_DIR, str(fp_id), base_name)
    if os.path.isfile(minmax_fp_ts_csv):
        os.remove(minmax_fp_ts_csv)
    minmax_fp_fname_out = minmax_fp_ts_csv + '.transposed.csv'
    if os.path.isfile(minmax_fp_fname_out):
        os.remove(minmax_fp_fname_out)
    anomalous_ts_csv = '%s/%s.%s.minmax_anomalous_ts.tsfresh.std.csv' % (
        settings.SKYLINE_TMP_DIR, metric_timestamp, base_name)
    if os.path.isfile(anomalous_ts_csv):
        os.remove(anomalous_ts_csv)
    anomalous_fp_fname_out = anomalous_ts_csv + '.transposed.csv'
    if os.path.isfile(anomalous_fp_fname_out):
        os.remove(anomalous_fp_fname_out)

    tsf_settings = ReasonableFeatureExtractionSettings()
    tsf_settings.disable_progressbar = True
    minmax_fp_features_sum = None
    minmax_anomalous_features_sum = None
    # initialise the count so the later check cannot raise a NameError if extraction fails
    minmax_fp_features_count = 0
    if minmax_anomalous_ts and minmax_fp_ts:
        if not os.path.isfile(minmax_fp_ts_csv):
            datapoints = minmax_fp_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                try:
                    utc_ts_line = '%s,%s,%s\n' % (base_name, str(int(ts)), str(value))
                    with open(minmax_fp_ts_csv, 'a') as fh:
                        fh.write(utc_ts_line)
                except:
                    print('error :: could not write to file %s' % (str(minmax_fp_ts_csv)))
            del converted
        else:
            print('file found %s, using for data' % minmax_fp_ts_csv)

        if not os.path.isfile(minmax_fp_ts_csv):
            print('error :: file not found %s' % minmax_fp_ts_csv)
        else:
            print('file exists to create the minmax_fp_ts data frame from - %s' % minmax_fp_ts_csv)

        try:
            df = pd.read_csv(minmax_fp_ts_csv, delimiter=',', header=None, names=['metric', 'timestamp', 'value'])
            df.columns = ['metric', 'timestamp', 'value']
        except:
            print('error :: failed to created data frame from %s' % (str(minmax_fp_ts_csv)))
        try:
            df_features = extract_features(
                # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
                # df, column_id='metric', column_sort='timestamp', column_kind=None,
                # column_value=None, feature_extraction_settings=tsf_settings)
                df, default_fc_parameters=EfficientFCParameters(),
                column_id='metric', column_sort='timestamp', column_kind=None,
                column_value=None, disable_progressbar=True)
        except:
            print('error :: failed to created df_features from %s' % (str(minmax_fp_ts_csv)))
        # Create transposed features csv
        if not os.path.isfile(minmax_fp_fname_out):
            # Transpose
            df_t = df_features.transpose()
            df_t.to_csv(minmax_fp_fname_out)

        try:
            # Calculate the count and sum of the features values
            df_sum = pd.read_csv(
                minmax_fp_fname_out, delimiter=',', header=0,
                names=['feature_name', 'value'])
            df_sum.columns = ['feature_name', 'value']
            df_sum['feature_name'] = df_sum['feature_name'].astype(str)
            df_sum['value'] = df_sum['value'].astype(float)
            minmax_fp_features_count = len(df_sum['value'])
            minmax_fp_features_sum = df_sum['value'].sum()
            print('minmax_fp_ts - features_count: %s, features_sum: %s' % (str(minmax_fp_features_count), str(minmax_fp_features_sum)))
            del df_sum
        except:
            print('error :: failed to created df_sum from %s' % (str(minmax_fp_fname_out)))

        if minmax_fp_features_count > 0:
            print('debug :: minmax_fp_features_count of the minmax_fp_ts is %s' % str(minmax_fp_features_count))
        else:
            print('error :: minmax_fp_features_count is %s' % str(minmax_fp_features_count))

        if not os.path.isfile(anomalous_ts_csv):
            datapoints = minmax_anomalous_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                utc_ts_line = '%s,%s,%s\n' % (base_name, str(int(ts)), str(value))
                with open(anomalous_ts_csv, 'a') as fh:
                    fh.write(utc_ts_line)
            del converted

        df = pd.read_csv(anomalous_ts_csv, delimiter=',', header=None, names=['metric', 'timestamp', 'value'])
        df.columns = ['metric', 'timestamp', 'value']
        df_features_current = extract_features(
            # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
            # df, column_id='metric', column_sort='timestamp', column_kind=None,
            # column_value=None, feature_extraction_settings=tsf_settings)
            df, default_fc_parameters=EfficientFCParameters(),
            column_id='metric', column_sort='timestamp', column_kind=None,
            column_value=None, disable_progressbar=True)
        del df

        # Create transposed features csv
        if not os.path.isfile(anomalous_fp_fname_out):
            # Transpose
            df_t = df_features_current.transpose()
            df_t.to_csv(anomalous_fp_fname_out)
            del df_t
            del df_features_current
        # Calculate the count and sum of the features values
        df_sum_2 = pd.read_csv(
            anomalous_fp_fname_out, delimiter=',', header=0,
            names=['feature_name', 'value'])
        df_sum_2.columns = ['feature_name', 'value']
        df_sum_2['feature_name'] = df_sum_2['feature_name'].astype(str)
        df_sum_2['value'] = df_sum_2['value'].astype(float)
        minmax_anomalous_features_count = len(df_sum_2['value'])
        minmax_anomalous_features_sum = df_sum_2['value'].sum()
        print('minmax_anomalous_ts - minmax_anomalous_features_count: %s, minmax_anomalous_features_sum: %s' % (
            str(minmax_anomalous_features_count),
            str(minmax_anomalous_features_sum)))
        del df_sum_2
        del minmax_anomalous_ts

    percent_different = 100
    if minmax_fp_features_sum and minmax_anomalous_features_sum:
        percent_different = None
        try:
            fp_sum_array = [minmax_fp_features_sum]
            calc_sum_array = [minmax_anomalous_features_sum]
            percent_different = 100
            sums_array = np.array([minmax_fp_features_sum, minmax_anomalous_features_sum], dtype=float)
            calc_percent_different = np.diff(sums_array) / sums_array[:-1] * 100.
            percent_different = calc_percent_different[0]
            print('percent_different between minmax scaled features sums - %s' % str(percent_different))
        except:
            print('error :: failed to calculate percent_different from minmax scaled features sums')

        if percent_different:
            almost_equal = None
            try:
                np.testing.assert_array_almost_equal(fp_sum_array, calc_sum_array)
                almost_equal = True
            except:
                almost_equal = False

            if almost_equal:
                minmax_not_anomalous = True
                print('minmax scaled common features sums are almost equal, not anomalous')

            # if diff_in_sums <= 1%:
            if percent_different < 0:
                new_pdiff = percent_different * -1
                percent_different = new_pdiff

            # @modified 20190321
            # if percent_different < (settings.IONOSPHERE_FEATURES_PERCENT_SIMILAR + 1):
            if percent_different < IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR:
                minmax_not_anomalous = True
                # log
                print('not anomalous - minmax scaled features profile match - %s - %s' % (base_name, str(minmax_not_anomalous)))
                print(
                    'minmax scaled calculated features sum are within %s percent of fp_id %s with %s, not anomalous' %
                    (str(IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR),
                        str(fp_id), str(percent_different)))
            if minmax_not_anomalous:
                not_anomalous = True
                minmax = 1
                # Created time series resources for graphing in
                # the matched page

    try:
        clean_file = anomalous_ts_csv
        if os.path.isfile(anomalous_ts_csv):
            os.remove(anomalous_ts_csv)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_ts_csv file to clean up')
    try:
        clean_file = anomalous_fp_fname_out
        if os.path.isfile(anomalous_fp_fname_out):
            os.remove(anomalous_fp_fname_out)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_fp_fname_out file to clean up')
    return not_anomalous
Example #21
import numpy as np
import pandas as pd

from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table

from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings('ignore')

# TODO: Make a dict from EfficientFCParameters with faster features
extraction_settings = EfficientFCParameters()
filtered_funcs = [
    'abs_energy', 'mean_abs_change', 'mean_change', 'skewness', 'kurtosis',
    'absolute_sum_of_changes', 'longest_strike_below_mean',
    'longest_strike_above_mean', 'count_above_mean', 'count_below_mean',
    'last_location_of_maximum', 'first_location_of_maximum',
    'last_location_of_minimum', 'first_location_of_minimum',
    'percentage_of_reoccurring_datapoints_to_all_datapoints',
    'percentage_of_reoccurring_values_to_all_values',
    'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points',
    'ratio_value_number_to_time_series_length', 'cid_ce', 'symmetry_looking',
    'large_standard_deviation', 'quantile', 'autocorrelation', 'number_peaks',
    'binned_entropy', 'index_mass_quantile', 'linear_trend',
    'number_crossing_m', 'augmented_dickey_fuller', 'number_cwt_peaks',
    'agg_autocorrelation', 'spkt_welch_density', 'friedrich_coefficients',
    'max_langevin_fixed_point', 'c3', 'ar_coefficient',
Example #22
    def predict(
        self,
        forecast_length: int,
        future_regressor=[],
        just_point_forecast: bool = False,
    ):
        """Generates forecast data immediately following dates of index supplied to .fit()

        Args:
            forecast_length (int): Number of periods of data to forecast ahead
            regressor (numpy.Array): additional regressor
            just_point_forecast (bool): If True, return a pandas.DataFrame of just point forecasts

        Returns:
            Either a PredictionObject of forecasts and metadata, or
            if just_point_forecast == True, a dataframe of point forecasts
        """
        if not _has_tsfresh:
            raise ImportError("Package tsfresh is required")
        # num_subsamples = 10
        predictStartTime = datetime.datetime.now()

        # from tsfresh import extract_features
        from tsfresh.utilities.dataframe_functions import make_forecasting_frame

        # from sklearn.ensemble import AdaBoostRegressor
        from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute

        # from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters

        max_timeshift = 10
        regression_model = 'Adaboost'
        feature_selection = None

        max_timeshift = self.max_timeshift
        regression_model = self.regression_model
        feature_selection = self.feature_selection

        sktraindata = self.df_train.copy()

        X = pd.DataFrame()
        y = pd.DataFrame()
        counter = 0
        for column in sktraindata.columns:
            df_shift, current_y = make_forecasting_frame(
                sktraindata[column],
                kind="time_series",
                max_timeshift=max_timeshift,
                rolling_direction=1,
            )
            # disable_progressbar = True MinimalFCParameters EfficientFCParameters
            current_X = extract_features(
                df_shift,
                column_id="id",
                column_sort="time",
                column_value="value",
                impute_function=tsfresh_impute,
                show_warnings=False,
                default_fc_parameters=EfficientFCParameters(),
                n_jobs=1,
            )  #
            current_X["feature_last_value"] = current_y.shift(1)
            current_X.rename(columns=lambda x: str(counter) + '_' + x,
                             inplace=True)

            X = pd.concat([X, current_X], axis=1)
            y = pd.concat([y, current_y], axis=1)
            counter += 1

        # drop constant features
        X = X.loc[:, X.apply(pd.Series.nunique) != 1]
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(0)
        y = y.fillna(method='ffill').fillna(method='bfill')

        if feature_selection == 'Variance':
            from sklearn.feature_selection import VarianceThreshold

            sel = VarianceThreshold(threshold=(0.15))
            X = pd.DataFrame(sel.fit_transform(X))
        if feature_selection == 'Percentile':
            from sklearn.feature_selection import SelectPercentile, chi2

            X = pd.DataFrame(
                SelectPercentile(chi2, percentile=20).fit_transform(
                    X, y[y.columns[0]]))
        if feature_selection == 'DecisionTree':
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.feature_selection import SelectFromModel

            clf = DecisionTreeRegressor()
            clf = clf.fit(X, y)
            model = SelectFromModel(clf, prefit=True)

            X = model.transform(X)
        if feature_selection == 'Lasso':
            from sklearn.linear_model import MultiTaskLasso
            from sklearn.feature_selection import SelectFromModel

            clf = MultiTaskLasso(max_iter=2000)
            clf = clf.fit(X, y)
            model = SelectFromModel(clf, prefit=True)

            X = model.transform(X)
        """
         decisionTreeList = X.columns[model.get_support()]
         LassoList = X.columns[model.get_support()]
         
         feature_list = decisionTreeList.to_list()
         set([x for x in feature_list if feature_list.count(x) > 1])
         from collections import Counter
         repeat_features = Counter(feature_list)
         repeat_features = repeat_features.most_common(20)
        """

        # Drop first line
        X = X.iloc[1:, ]
        y = y.iloc[1:]

        y = y.fillna(method='ffill').fillna(method='bfill')

        index = self.create_forecast_index(forecast_length=forecast_length)

        if regression_model == 'ElasticNet':
            from sklearn.linear_model import MultiTaskElasticNet

            regr = MultiTaskElasticNet(alpha=1.0,
                                       random_state=self.random_seed)
        elif regression_model == 'DecisionTree':
            from sklearn.tree import DecisionTreeRegressor

            regr = DecisionTreeRegressor(random_state=self.random_seed)
        elif regression_model == 'MLP':
            from sklearn.neural_network import MLPRegressor

            # relu/tanh lbfgs/adam layer_sizes (100) (10)
            regr = MLPRegressor(
                hidden_layer_sizes=(10, 25, 10),
                verbose=self.verbose_bool,
                max_iter=200,
                activation='tanh',
                solver='lbfgs',
                random_state=self.random_seed,
            )
        elif regression_model == 'KNN':
            from sklearn.multioutput import MultiOutputRegressor
            from sklearn.neighbors import KNeighborsRegressor

            regr = MultiOutputRegressor(
                KNeighborsRegressor())  # KNeighborsRegressor takes no random_state argument
        elif regression_model == 'Adaboost':
            from sklearn.multioutput import MultiOutputRegressor
            from sklearn.ensemble import AdaBoostRegressor

            regr = MultiOutputRegressor(AdaBoostRegressor(
                n_estimators=200))  # , random_state=self.random_seed))
        else:
            regression_model = 'RandomForest'
            from sklearn.ensemble import RandomForestRegressor

            regr = RandomForestRegressor(random_state=self.random_seed,
                                         n_estimators=1000,
                                         verbose=self.verbose)

        regr.fit(X, y)

        combined_index = self.df_train.index.append(index)
        forecast = pd.DataFrame()
        sktraindata.columns = [x for x in range(len(sktraindata.columns))]

        for x in range(forecast_length):
            x_dat = pd.DataFrame()
            y_dat = pd.DataFrame()
            counter = 0
            for column in sktraindata.columns:
                df_shift, current_y = make_forecasting_frame(
                    sktraindata.tail(max_timeshift)[column],
                    kind="time_series",
                    max_timeshift=max_timeshift,
                    rolling_direction=1,
                )
                # disable_progressbar = True MinimalFCParameters EfficientFCParameters
                current_X = extract_features(
                    df_shift,
                    column_id="id",
                    column_sort="time",
                    column_value="value",
                    impute_function=tsfresh_impute,
                    show_warnings=False,
                    n_jobs=1,
                    default_fc_parameters=EfficientFCParameters(),
                )  # default_fc_parameters=MinimalFCParameters(),
                current_X["feature_last_value"] = current_y.shift(1)

                current_X.rename(columns=lambda x: str(counter) + '_' + x,
                                 inplace=True)

                x_dat = pd.concat([x_dat, current_X], axis=1)
                y_dat = pd.concat([y_dat, current_y], axis=1)
                counter += 1

            x_dat = x_dat[X.columns]
            rfPred = pd.DataFrame(regr.predict(x_dat.tail(1).values))

            forecast = pd.concat([forecast, rfPred], axis=0, ignore_index=True)
            sktraindata = pd.concat([sktraindata, rfPred],
                                    axis=0,
                                    ignore_index=True)
            sktraindata.index = combined_index[:len(sktraindata.index)]

        forecast.columns = self.column_names
        forecast.index = index

        if just_point_forecast:
            return forecast
        else:
            upper_forecast, lower_forecast = Point_to_Probability(
                self.df_train,
                forecast,
                prediction_interval=self.prediction_interval)

            predict_runtime = datetime.datetime.now() - predictStartTime
            prediction = PredictionObject(
                model_name=self.name,
                forecast_length=forecast_length,
                forecast_index=forecast.index,
                forecast_columns=forecast.columns,
                lower_forecast=lower_forecast,
                forecast=forecast,
                upper_forecast=upper_forecast,
                prediction_interval=self.prediction_interval,
                predict_runtime=predict_runtime,
                fit_runtime=self.fit_runtime,
                model_parameters=self.get_params(),
            )
            return prediction
def get_tsfresh_feat(df, colName=None):
    df = df.reset_index()
    df.columns = ['timestamp', colName]
    df['id'] = 0  # mandatory for tsfresh to group on
    ext_feat = extract_features(df,
                                column_id='id',
                                column_value=colName,
                                column_sort='timestamp',
                                default_fc_parameters=EfficientFCParameters(),
                                disable_progressbar=True)
    # Return the single extracted row as a flat array of feature values
    return ext_feat.values[0]
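# A minimal usage sketch for get_tsfresh_feat above (the synthetic series below is
# only for illustration):
import numpy as np
import pandas as pd

example_series = pd.Series(np.random.randn(120), name="sensor")
feat_vector = get_tsfresh_feat(example_series.to_frame(), colName="sensor")
# feat_vector is a flat numpy array with one value per EfficientFCParameters feature.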


# Process the first 20 minutes of the raw SCG signal and create a dataset with
# adaptive indexes and a corresponding time row to extract features from.
# Arguments, in order: start_hour, start_min, end_hour, end_min
new_id_number_train, bin_length_train, targets_startInx_train, train_df = preprocess(
    raw_value, train_start_hour, train_start_minute, train_end_hour, train_end_minute)

# Process the last 7 minutes of the raw SCG signal to predict future parameters.
new_id_number_test, bin_length_test, targets_startInx_test, test_df = preprocess(
    raw_value, test_start_hour, test_start_minute, test_end_hour, test_end_minute)

# Because of problems occurring during parallel processing, n_jobs is set to 0;
# the calculation is slower but works reliably.
train_extracted_features = extract_features(train_df, column_id="id", column_sort="time",
                                           default_fc_parameters=EfficientFCParameters(), n_jobs=0)
test_extracted_features = extract_features(test_df, column_id="id", column_sort="time",
                                          default_fc_parameters=EfficientFCParameters(), n_jobs=0)

# train_extracted_features.to_csv('train_final_features.csv',index = False)
#tt = pd.read_csv(r'C:\Users\Samane\Desktop\hw\20minFeature.xlsx')
# test_extracted_features.to_csv('test_final_features.csv',index = False)

train_features = train_extracted_features[features]
train_features_norm = data_segmenation_normalization1(train_features)
S, D, H, R, labels_train= label_modification_df(targets, targets_startInx_train, new_id_number_train, bin_length_train)

train_df, val_df, test_df = data_shaping(train_features_norm, S, D, H, R)

test_array = test_df.values
test_array = np.expand_dims(test_array, axis=0)
Exemple #25
0
    def gen_rolling_feature(self,
                            window_size,
                            settings="comprehensive",
                            full_settings=None,
                            n_jobs=1):
        '''
        Generate rolling aggregation features for each sample.
        This method is implemented with tsfresh.
        Make sure that the specified column name does not contain '__'.
        A minimal sketch of the underlying rolling extraction is shown after this method.

        TODO: relationship with scale should be figured out.

        :param window_size: int, generate features according to rolling windows of this size.
        :param settings: str or dict. If a string is set, it must be one of "comprehensive",
               "minimal" and "efficient". If a dict is set, it should follow the instructions
               for default_fc_parameters in tsfresh. Defaults to "comprehensive".
        :param full_settings: dict. It should follow the instructions for kind_to_fc_parameters
               in tsfresh. Defaults to None.
        :param n_jobs: int. The number of processes to use for parallelization.

        :return: the tsdataset instance.
        '''
        from tsfresh.utilities.dataframe_functions import roll_time_series
        from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
        from tsfresh import extract_features
        from tsfresh.feature_extraction import ComprehensiveFCParameters, \
            MinimalFCParameters, EfficientFCParameters

        DEFAULT_PARAMS = {
            "comprehensive": ComprehensiveFCParameters(),
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters()
        }

        assert not self._has_generate_agg_feature,\
            "Only one of gen_global_feature and gen_rolling_feature should be called."
        if isinstance(settings, str):
            assert settings in ['comprehensive', 'minimal', 'efficient'], \
                "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
                f", but found {settings}."
            default_fc_parameters = DEFAULT_PARAMS[settings]
        else:
            default_fc_parameters = settings

        assert window_size < self.df.groupby(self.id_col).size().min() + 1, "gen_rolling_feature "\
            "should have a window_size smaller than shortest time series length."
        df_rolled = roll_time_series(self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     max_timeshift=window_size - 1,
                                     min_timeshift=window_size - 1,
                                     n_jobs=n_jobs)
        if not full_settings:
            self.roll_feature_df = extract_features(
                df_rolled,
                column_id=self.id_col,
                column_sort=self.dt_col,
                default_fc_parameters=default_fc_parameters,
                n_jobs=n_jobs)
        else:
            self.roll_feature_df = extract_features(
                df_rolled,
                column_id=self.id_col,
                column_sort=self.dt_col,
                kind_to_fc_parameters=full_settings,
                n_jobs=n_jobs)
        impute_tsfresh(self.roll_feature_df)

        self.feature_col += list(self.roll_feature_df.columns)
        self.roll_additional_feature = list(self.roll_feature_df.columns)
        self._has_generate_agg_feature = True
        return self
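# A minimal sketch of the roll_time_series -> extract_features pipeline that
# gen_rolling_feature wraps above (the toy dataframe, column names and window size
# are illustrative):
import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series, impute

toy_df = pd.DataFrame({
    "id": ["a"] * 10,
    "time": pd.date_range("2021-01-01", periods=10),
    "value": np.random.randn(10),
})
window_size = 4
rolled = roll_time_series(toy_df, column_id="id", column_sort="time",
                          max_timeshift=window_size - 1,
                          min_timeshift=window_size - 1, n_jobs=1)
roll_features = extract_features(rolled, column_id="id", column_sort="time",
                                 default_fc_parameters=MinimalFCParameters(),
                                 disable_progressbar=True, n_jobs=1)
impute(roll_features)  # replace NaN/inf in place, as gen_rolling_feature does
# roll_features has one row per full window, indexed by (id, window end time).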
Exemple #26
0
def calculate_features_profile(current_skyline_app, timestamp, metric,
                               context):
    """
    Calculates a tsfresh features profile from a training data set

    :param timestamp: the timestamp of metric anomaly with training data
    :type timestamp: str
    :param metric: the base_name of the metric
    :type metric: str
    :param context: the context
    :type context: str

    :return: (features_profile_csv_file_path, successful, fp_created, fp_id, fail_msg, traceback_format_exc, calc_time)
    :rtype: (str, boolean, boolean, int, str, str, str)
    """

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    base_name = str(metric)

    # @added 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
    # Set a default log_context, just in case it is not set if something is
    # added in the future
    log_context = 'unknown'

    if context == 'training_data':
        log_context = 'training data'
    if context == 'features_profiles':
        log_context = 'features profile data'
    if context == 'ionosphere':
        log_context = 'ionosphere'
    # @added 20170114 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        log_context = 'ionosphere :: learn'

    # TODO
    # @added 20190314 - Feature #2484: FULL_DURATION feature profiles
    # Here we add the bifurcation to also create a features
    # profile at FULL_DURATION for all Mirage metrics, with a
    # view to increasing the number of matches trained metrics
    # achieve by also allowing for the creation and comparison of
    # the FULL_DURATION features profiles as well.
    # How, I am not yet certain, but it needs to tie up with this Feature in:
    # skyline/ionosphere/ionosphere.py
    # skyline/webapp/webapp.py
    if context == 'ionosphere_echo':
        log_context = 'ionosphere :: echo'
    if context == 'ionosphere_echo_check':
        log_context = 'ionosphere :: echo check'

    current_logger.info('%s feature profile creation requested for %s at %s' %
                        (log_context, base_name, timestamp))

    timeseries_dir = base_name.replace('.', '/')
    if context == 'training_data' or context == 'ionosphere':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                        timestamp, timeseries_dir)

        # @added 20200813 - Feature #3670: IONOSPHERE_CUSTOM_KEEP_TRAINING_TIMESERIES_FOR
        if context == 'training_data':
            metric_data_dir_does_not_exist = False
            if not os.path.exists(metric_data_dir):
                metric_data_dir_does_not_exist = True
            if IONOSPHERE_CUSTOM_KEEP_TRAINING_TIMESERIES_FOR and metric_data_dir_does_not_exist:
                try:
                    historical_data, metric_data_dir = historical_data_dir_exists(
                        current_skyline_app, metric_data_dir)
                    if historical_data:
                        current_logger.info(
                            'create_features_profile :: using historical training data - %s'
                            % metric_data_dir)
                except:
                    trace = traceback.format_exc()
                    current_logger.error(trace)
                    fail_msg = 'error :: create_features_profile :: failed to determine whether this is historical training data'
                    current_logger.error('%s' % fail_msg)
                    if context == 'training_data':
                        # Raise to webapp, I believe, to provide traceback to the user in the UI
                        raise
                    else:
                        return False, False, False, fail_msg, trace

    if context == 'features_profiles':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_PROFILES_FOLDER,
                                        timeseries_dir, timestamp)

    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_LEARN_FOLDER,
                                        timestamp, timeseries_dir)

    # @added 20190327 - Feature #2484: FULL_DURATION feature profiles
    # Added ionosphere_echo and ionosphere_echo_check
    if context == 'ionosphere_echo' or context == 'ionosphere_echo_check':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                        timestamp, timeseries_dir)

    features_profile_created_file = '%s/%s.%s.fp.created.txt' % (
        metric_data_dir, str(timestamp), base_name)

    features_profile_details_file = '%s/%s.%s.fp.details.txt' % (
        metric_data_dir, str(timestamp), base_name)

    # @added 20190327 - Feature #2484: FULL_DURATION feature profiles
    if context == 'ionosphere_echo_check':
        features_profile_created_file = '%s/%s.%s.echo.fp.created.txt' % (
            metric_data_dir, str(timestamp), base_name)
        features_profile_details_file = '%s/%s.%s.echo.fp.details.txt' % (
            metric_data_dir, str(timestamp), base_name)

    # @added 20170108 - Feature #1842: Ionosphere - Graphite now graphs
    # Added metric_check_file; ts_full_duration needs to be determined and
    # added to the features_profile_details_file, as it was not added here on
    # 20170104 when it was added to the webapp and ionosphere
    metric_var_filename = '%s.txt' % str(base_name)
    anomaly_check_file = '%s/%s' % (metric_data_dir, metric_var_filename)
    ts_full_duration = int(settings.FULL_DURATION)
    if os.path.isfile(anomaly_check_file):
        # Read the details file
        with open(anomaly_check_file, 'r') as f:
            anomaly_details = f.readlines()
            for i, line in enumerate(anomaly_details):
                if 'full_duration' in line:
                    _ts_full_duration = '%s' % str(line).split("'", 2)
                    full_duration_array = literal_eval(_ts_full_duration)
                    ts_full_duration = str(int(full_duration_array[1]))

    anomaly_json = '%s/%s.json' % (metric_data_dir, base_name)

    # @added 20190327 - Feature #2484: FULL_DURATION feature profiles
    if context == 'ionosphere_echo' or context == 'ionosphere_echo_check':
        ts_full_duration = str(settings.FULL_DURATION)
        full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)
        anomaly_json = '%s/%s.mirage.redis.%sh.json' % (
            metric_data_dir, base_name, str(full_duration_in_hours))

    ts_csv = '%s/%s.tsfresh.input.csv' % (metric_data_dir, base_name)
    # @added 20190327 - Feature #2484: FULL_DURATION feature profiles
    if context == 'ionosphere_echo_check':
        ts_csv = '%s/%s.echo.tsfresh.input.csv' % (metric_data_dir, base_name)

#    anomaly_json = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.json'
#    ts_csv = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.tsfresh.input.csv'
# This is simply to stay in line with tsfresh naming conventions in their
# docs and examples
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'

    fp_id = None
    f_calc = 'unknown'
    if os.path.isfile(features_profile_details_file):
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: features profile details file exists - %s' %
                            (log_context, features_profile_details_file))
        try:
            with open(features_profile_details_file, 'r') as f:
                fp_details_str = f.read()
            fp_details_array = literal_eval(fp_details_str)
            f_calc = ' (previously calculated by Ionosphere) - %s' % str(
                fp_details_array[2])
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.error('error: %s :: failed to read from %s' %
                                 (log_context, features_profile_details_file))
    else:
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info(
            '%s - OK no features profile details file exists - %s' %
            (log_context, features_profile_details_file))

    fp_created = None
    if os.path.isfile(features_profile_created_file):
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: features profile created file exists - %s' %
                            (log_context, features_profile_created_file))
        try:
            with open(features_profile_created_file, 'r') as f:
                fp_created_str = f.read()
            fp_created_array = literal_eval(fp_created_str)
            fp_id = fp_created_array[0]
            fp_created = True
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.error('error: %s :: failed to read fp_id from %s' %
                                 (log_context, features_profile_created_file))
    else:
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info(
            '%s :: OK no features profile created file exists - %s' %
            (log_context, features_profile_created_file))

    if os.path.isfile(t_fname_out):
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: transposed features already exist - %s' %
                            (log_context, t_fname_out))
        return str(
            t_fname_out), True, fp_created, fp_id, 'none', 'none', f_calc

    start = timer()
    raw_timeseries = []
    if os.path.isfile(anomaly_json):
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.error(
                'error: %s :: failed to read timeseries data from %s' %
                (log_context, anomaly_json))
            fail_msg = 'error: %s :: failed to read timeseries data from %s' % (
                log_context, anomaly_json)
            end = timer()
            return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc
    else:
        trace = 'none'
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error :: %s :: file not found - %s' % (log_context,
                                                           anomaly_json)
        current_logger.error(fail_msg)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(',
                                                       '[').replace(')', ']')
    del raw_timeseries
    timeseries = literal_eval(timeseries_array_str)

    datapoints = timeseries
    del timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    del datapoints

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)

    # Open the csv once and append each converted datapoint as metric,timestamp,value
    with open(ts_csv, 'a') as fh:
        for ts, value in converted:
            utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
            fh.write(utc_ts_line)

    del converted

    try:
        df = pd.read_csv(ts_csv,
                         delimiter=',',
                         header=None,
                         names=['metric', 'timestamp', 'value'])
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: DataFrame created with %s' %
                            (log_context, ts_csv))
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error: %s :: failed to create a pandas DataFrame with %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

# @added 20161207 - Task #1658: Patterning Skyline Ionosphere
# Converting the DataFrame types to suit MySQL data types
# For anyone in here, if you have done a code review of Skyline, there are
# a number of questions that arise from the decision to deviate from json or
# storing msgpack as BLOB etc.  tsfresh uses csv and we can get csv from Graphite
# etc.  Skyline should be able to handle csv.  As for how data is stored in
# MySQL, this was given considerable review and thought, given that Ionosphere
# and Skyline in general should not be limited to the domain of analyzing
# Graphite machine metrics, but should support other timeseries data sources too.
#    df['feature_name'] = df['feature_name'].astype(string)
#    df['value'] = df['value'].astype(float)

# Test the DataFrame
    try:
        df_created = df.head()
        del df_created
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error: %s :: failed to read the pandas DataFrame created with %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    df.columns = ['metric', 'timestamp', 'value']

    start_feature_extraction = timer()
    # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
    # Added log_context to report the context
    current_logger.info('%s :: starting extract_features with %s' %
                        (log_context, str(TSFRESH_VERSION)))
    df_features = False
    try:
        # @modified 20161226 - Bug #1822: tsfresh extract_features process stalling
        # Changed to use the new ReasonableFeatureExtractionSettings that was
        # introduced in tsfresh-0.4.0 to exclude the computationally high cost
        # of extracting features from very static timeseries that has little to
        # no variation in the values, which results in features taking up to
        # almost 600 seconds to calculate on a timeseries of length 10075
        # (168h - 1 datapoint per 60s)
        # In terms of inline feature calculation, always exclude
        # high_comp_cost features.
        # df_features = extract_features(df, column_id='metric', column_sort='timestamp', column_kind=None, column_value=None)
        # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
        # tsf_settings = ReasonableFeatureExtractionSettings()
        # >>> from tsfresh.feature_extraction import extract_features, EfficientFCParameters
        # >>> extract_features(df, default_fc_parameters=EfficientFCParameters())

        # Disable tqdm progress bar
        # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
        # tsf_settings.disable_progressbar = True

        df_features = extract_features(
            # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
            # df, column_id='metric', column_sort='timestamp', column_kind=None,
            # column_value=None, feature_extraction_settings=tsf_settings)
            df,
            default_fc_parameters=EfficientFCParameters(),
            column_id='metric',
            column_sort='timestamp',
            column_kind=None,
            column_value=None,
            disable_progressbar=True)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: features extracted from %s data' %
                            (log_context, ts_csv))
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error: %s :: extracting features with tsfresh from - %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        end_feature_extraction = timer()
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info(
            '%s :: feature extraction failed in %.6f seconds' %
            (log_context, (end_feature_extraction - start_feature_extraction)))
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    end_feature_extraction = timer()
    feature_extraction_time = end_feature_extraction - start_feature_extraction
    # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
    # Added log_context to report the context
    current_logger.info('%s :: feature extraction took %.6f seconds' %
                        (log_context, feature_extraction_time))

    del df

    # write to disk
    fname_out = fname_in + '.features.csv'
    # df_features.to_csv(fname_out)

    # Transpose
    df_t = False
    try:
        df_t = df_features.transpose()
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: features transposed' % log_context)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error :: %s :: transposing tsfresh features from - %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    del df_features

    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    try:
        df_t.to_csv(t_fname_out)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error :: %s :: saving transposed tsfresh features from - %s' % (
            log_context, ts_csv)
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Added log_context to report the context
            current_logger.info('%s :: removed %s' % (log_context, ts_csv))
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    del df_t

    # Calculate the count and sum of the features values
    df_sum = False
    try:
        df_sum = pd.read_csv(t_fname_out,
                             delimiter=',',
                             header=0,
                             names=['feature_name', 'value'])
        df_sum.columns = ['feature_name', 'value']
        df_sum['feature_name'] = df_sum['feature_name'].astype(str)
        df_sum['value'] = df_sum['value'].astype(float)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.error(
            'error :: %s :: failed to create Dataframe to sum' % log_context)
    try:
        features_count = len(df_sum['value'])
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.error(
            'error :: %s :: failed to count number of features, set to 0' %
            log_context)
        features_count = 0
    try:
        features_sum = df_sum['value'].sum()
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.error(
            'error :: %s :: failed to sum feature values, set to 0' %
            log_context)
        features_sum = 0

    end = timer()

    # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
    # Added log_context to report the context
    current_logger.info('%s :: features saved to %s' %
                        (log_context, fname_out))
    current_logger.info('%s :: transposed features saved to %s' %
                        (log_context, t_fname_out))
    total_calc_time = '%.6f' % (end - start)
    calc_time = '%.6f' % (feature_extraction_time)
    current_logger.info('%s :: total feature profile completed in %s seconds' %
                        (log_context, str(total_calc_time)))

    # Create a features profile details file
    try:
        # @modified 20170108 - Feature #1842: Ionosphere - Graphite now graphs
        # Added the ts_full_duration here as it was not added here on 20170104
        # when it was added to the webapp and ionosphere
        data = '[%s, \'%s\', %s, %s, %s, %s]' % (
            str(int(time.time())), str(tsfresh_version), str(calc_time),
            str(features_count), str(features_sum), str(ts_full_duration))
        write_data_to_file(current_skyline_app, features_profile_details_file,
                           'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        fail_msg = 'error :: %s :: failed to write %s' % (
            log_context, features_profile_details_file)
        current_logger.error('%s' % fail_msg)

    del df_sum

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)
        # @modified 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
        # Added log_context to report the context
        current_logger.info('%s :: removed the created csv - %s' %
                            (log_context, ts_csv))

    # @added 20170112 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace
    # Ionosphere learn needs Redis works sets, but this was moved to
    # ionosphere_backend.py and learn.py not done here

    return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', str(
        calc_time)
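# A hypothetical call sketch for calculate_features_profile above (the app name,
# timestamp and metric below are made-up values; real runs depend on Skyline's
# settings and data directories):
# t_fname_out, success, fp_created, fp_id, fail_msg, trace, calc_time = \
#     calculate_features_profile('webapp', '1480104000',
#                                'stats.statsd.graphiteStats.calculationtime',
#                                'training_data')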
        "ch57": chanels_data[57, :],
        "ch58": chanels_data[58, :],
        "ch59": chanels_data[59, :],
        "ch60": chanels_data[60, :],
        "ch61": chanels_data[61, :],
        "ch62": chanels_data[62, :],
        "ch63": chanels_data[63, :]
    }

    #d = {'ID': pd.Series(ids), 'time': pd.Series(time),'x': pd.Series(full_data_matrix[i])}
    df = pd.DataFrame(d)
    extracted_features[i, :] = extract_features(
        df,
        column_id="id",
        column_sort="time",
        default_fc_parameters=EfficientFCParameters())

################################################Quick test
#Normalize data
full_normalized_array = preprocessing.scale(extracted_features)  #normalize

################PCA AND VARIANCE EXPLAINED
pca = PCA(svd_solver='auto')  #PCA with all components
pca.fit(full_normalized_array)
pca_cumsum = np.cumsum(pca.explained_variance_ratio_) * 100

plt.figure()
plt.plot(pca_cumsum)
plt.grid()
plt.ylabel('% Variance Explained')
plt.xlabel('# of Features')
    def compute(self, data, features=EfficientFCParameters()):
        self.features = features
        feature_values = generate_tsfresh_features(data, features)
        return feature_values.reshape(1, feature_values.size)
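# generate_tsfresh_features is not shown in this snippet; a plausible minimal
# implementation (an assumption, not the original helper) could flatten a single
# series into a 1-D vector of tsfresh feature values:
def generate_tsfresh_features_sketch(data, features):
    import pandas as pd
    from tsfresh import extract_features
    df = pd.DataFrame({"id": 0, "time": range(len(data)), "value": data})
    extracted = extract_features(df, column_id="id", column_sort="time",
                                 column_value="value",
                                 default_fc_parameters=features,
                                 disable_progressbar=True, n_jobs=1)
    return extracted.values.flatten()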
Exemple #29
0
    def gen_global_feature(self,
                           settings="comprehensive",
                           full_settings=None,
                           n_jobs=1):
        '''
        Generate global (per-time-series) features for each time series.
        This method is implemented with tsfresh.
        Make sure that the specified column name does not contain '__'.

        TODO: relationship with scale should be figured out.

        :param settings: str or dict. If a string is set, it must be one of "comprehensive",
               "minimal" and "efficient". If a dict is set, it should follow the instructions
               for default_fc_parameters in tsfresh (a sketch of such a custom dict follows
               this method). Defaults to "comprehensive".
        :param full_settings: dict. It should follow the instructions for kind_to_fc_parameters
               in tsfresh. Defaults to None.
        :param n_jobs: int. The number of processes to use for parallelization.

        :return: the tsdataset instance.
        '''
        from tsfresh import extract_features
        from tsfresh.feature_extraction import ComprehensiveFCParameters, \
            MinimalFCParameters, EfficientFCParameters

        DEFAULT_PARAMS = {
            "comprehensive": ComprehensiveFCParameters(),
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters()
        }

        assert not self._has_generate_agg_feature, \
            "Only one of gen_global_feature and gen_rolling_feature should be called."
        if full_settings is not None:
            self.df,\
                addtional_feature =\
                generate_global_features(input_df=self.df,
                                         column_id=self.id_col,
                                         column_sort=self.dt_col,
                                         kind_to_fc_parameters=full_settings,
                                         n_jobs=n_jobs)
            self.feature_col += addtional_feature
            return self

        if isinstance(settings, str):
            assert settings in ['comprehensive', 'minimal', 'efficient'], \
                "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
                f", but found {settings}."
            default_fc_parameters = DEFAULT_PARAMS[settings]
        else:
            default_fc_parameters = settings

        self.df,\
            addtional_feature =\
            generate_global_features(input_df=self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     default_fc_parameters=default_fc_parameters,
                                     n_jobs=n_jobs)

        self.feature_col += addtional_feature
        self._has_generate_agg_feature = True
        return self
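# The settings argument above can also be a tsfresh-style fc_parameters dict instead
# of one of the named presets; a minimal sketch of such a dict (the chosen
# calculators and lags are illustrative):
custom_fc_parameters = {
    "mean": None,                                  # calculators without parameters map to None
    "standard_deviation": None,
    "autocorrelation": [{"lag": 1}, {"lag": 2}],   # parametrised calculators take a list of dicts
}
# e.g. tsdataset.gen_global_feature(settings=custom_fc_parameters, n_jobs=1)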
Exemple #30
0

nobs = np.size(full_data_matrix,0)
ntime = np.size(full_data_matrix,1)

extracted_features = np.zeros((nobs,788))

time = np.arange(ntime)

for i in range(nobs):
    print(i)
    ids = np.repeat(i,ntime)
    
    d = {'ID': pd.Series(ids), 'time': pd.Series(time),'x': pd.Series(full_data_matrix[i])}
    df = pd.DataFrame(d)
    extracted_features[i,:] = extract_features(df, column_id="ID", column_sort="time", default_fc_parameters=EfficientFCParameters())



################################################Quick test
#Normalize data
full_normalized_array = preprocessing.scale(extracted_features)#normalize

################PCA AND VARIANCE EXPLAINED
pca = PCA(svd_solver='auto')#PCA with all components
pca.fit(full_normalized_array)
pca_cumsum = np.cumsum(pca.explained_variance_ratio_)*100

plt.figure()
plt.plot(pca_cumsum)
plt.grid()
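# A small follow-up sketch: picking the number of components that explain, say, 95%
# of the variance from the cumulative curve above (the threshold is arbitrary and
# only for illustration; it assumes the curve actually reaches 95%):
n_components_95 = int(np.argmax(pca_cumsum >= 95) + 1)
print('Components for 95% explained variance:', n_components_95)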