def test_extraction_null_as_column_name(self):
    df1 = pd.DataFrame(data={0: range(10), 1: np.repeat([0, 1], 5), 2: np.repeat([0, 1, 2, 3, 4], 2)})
    X1 = extract_features(df1, column_id=1, column_sort=2)
    self.assertEqual(len(X1), 2)

    df2 = pd.DataFrame(data={1: range(10), 0: np.repeat([0, 1], 5), 2: np.repeat([0, 1, 2, 3, 4], 2)})
    X2 = extract_features(df2, column_id=0, column_sort=2)
    self.assertEqual(len(X2), 2)

    df3 = pd.DataFrame(data={0: range(10), 2: np.repeat([0, 1], 5), 1: np.repeat([0, 1, 2, 3, 4], 2)})
    X3 = extract_features(df3, column_id=2, column_sort=1)
    self.assertEqual(len(X3), 2)
def test_functional_equality(self):
    """
    `extract_relevant_features` should be equivalent to running `extract_features` with impute
    followed by `select_features`: it should select the same relevant features, and the values
    of these features should be identical.
    :return:
    """
    df, y = self.create_test_data_sample_with_target()

    relevant_features = extract_relevant_features(df, y, column_id='id', column_value='val',
                                                  column_kind='kind', column_sort='sort')

    extracted_features = extract_features(df, column_id='id', column_value='val',
                                          column_kind='kind', column_sort='sort',
                                          impute_function=impute)
    selected_features = select_features(extracted_features, y)

    self.assertEqual(
        set(relevant_features.columns), set(selected_features.columns),
        "Should select the same columns:\n\t{}\n\nvs.\n\n\t{}".format(relevant_features.columns,
                                                                      selected_features.columns))

    relevant_columns = relevant_features.columns
    relevant_index = relevant_features.index
    self.assertTrue(
        relevant_features.equals(selected_features.loc[relevant_index][relevant_columns]),
        "Should calculate the same feature values")
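# A minimal, self-contained sketch of the equivalence checked above, on toy data instead of the
# test helper. This is an illustration under assumed synthetic data, not part of the test suite.
import numpy as np
import pandas as pd
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute

rng = np.random.default_rng(0)
n_ids, n_points = 10, 20
toy_df = pd.DataFrame({
    "id": np.repeat(np.arange(n_ids), n_points),
    "time": np.tile(np.arange(n_points), n_ids),
    "value": rng.normal(size=n_ids * n_points),
})
toy_y = pd.Series(rng.integers(0, 2, size=n_ids), index=np.arange(n_ids))

# One-step API ...
relevant = extract_relevant_features(toy_df, toy_y, column_id="id", column_sort="time")

# ... should select the same columns as extract -> impute -> select done manually.
X = extract_features(toy_df, column_id="id", column_sort="time", impute_function=impute)
selected = select_features(X, toy_y)

assert set(relevant.columns) == set(selected.columns)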
def test_extract_feature(self):
    ts = pd.DataFrame({
        'id': np.array(['a', 'a', 'a', 'b', 'b', 'b']),
        'time': np.array([0, 1, 2, 0, 1, 2]),
        'x': np.array([3, 4, 5, 7, 8, 10])
    })
    extracted_features = extract_features(ts, column_id='id', column_sort='time')
    self.assertEqual(2, len(extracted_features))
def ts_extract(df, features):
    logging.getLogger('distributed.utils_perf').setLevel(logging.CRITICAL)
    # participant = file.split("_")[0]
    # video = file.split("_")[1]
    dd = df.copy()
    dd['id'] = dd.index
    extracted_features = extract_features(dd, column_id="id", column_sort="time",
                                          default_fc_parameters=features,
                                          distributor=MapDistributor())
    impute(extracted_features)
    return extracted_features
def ts_feature_extraction(dataframe):
    """
    Gets 5 transformed features from the 794 features extracted by tsfresh
    :param dataframe: A pandas dataframe
    :return: A pandas dataframe
    """
    features = extract_features(
        dataframe,
        column_id='period',
        column_sort='date',
        column_value='value',
    )
    features = rm_const_cols(features)
    return pca_transformation(features)
def generate_global_features(input_df,
                             column_id,
                             column_sort,
                             default_fc_parameters=None,
                             kind_to_fc_parameters=None):
    '''
    Generate global features with tsfresh.
    :param input_df: input dataframe.
    :param column_id: id column name
    :param column_sort: time column name
    :param default_fc_parameters: same as tsfresh.
    :param kind_to_fc_parameters: same as tsfresh.
    :return: a new input_df that contains all generated features.
    '''
    if kind_to_fc_parameters is not None:
        global_feature = extract_features(input_df,
                                          column_id=column_id,
                                          column_sort=column_sort,
                                          kind_to_fc_parameters=kind_to_fc_parameters)
    else:
        global_feature = extract_features(input_df,
                                          column_id=column_id,
                                          column_sort=column_sort,
                                          default_fc_parameters=default_fc_parameters)
    res_df = input_df.copy()
    id_list = list(np.unique(input_df[column_id]))
    additional_feature = []
    for col_name in global_feature.columns:
        # any feature that cannot be extracted will be dropped
        if global_feature[col_name].isna().sum() > 0:
            continue
        # the constant feature value is broadcast to every row of the corresponding univariate time series
        for id_name in id_list:
            res_df.loc[input_df[column_id] == id_name, col_name] = global_feature.loc[id_name][col_name]
        additional_feature.append(col_name)
    return res_df, additional_feature
def make_extra_ts_featurs(train, meta_train):
    feats = []
    feats2 = []
    feats3 = []
    for chk in tqdm_chunks(meta_train.object_id.unique(), 1000):
        slc = train[get_membership_mask(train[OBJECT_ID], set(chk))]

        extracted_features = extract_features(slc, EXTRA_FLUX_PARS,
                                              column_value='flux',
                                              disable_progressbar=True,
                                              **TSKW)
        feats.append(extracted_features)

        extracted_features2 = extract_features(slc, FC_PASSBAND_V2,
                                               column_value='flux',
                                               column_kind='passband',
                                               disable_progressbar=True,
                                               **TSKW)
        feats2.append(extracted_features2)

        extracted_features3 = extract_features(slc,
                                               column_value='flux_by_flux_ratio_sq',
                                               column_kind='passband',
                                               disable_progressbar=True,
                                               **TSKW)
        feats3.append(extracted_features3)

    new_feat_df = pd.concat(feats)
    new_feat_df2 = pd.concat(feats2)
    new_feat_df3 = pd.concat(feats3)
    catted = pd.concat([
        new_feat_df,
        new_feat_df2,
        new_feat_df3.add_prefix('flux_by_flux_ratio_sq')
    ], axis=1)
    return catted
def test_pandas_no_pivot(self):
    df = self.df

    X = extract_features(df, column_id="my_id", column_sort="time",
                         column_kind="dimension", column_value="value",
                         pivot=False,
                         default_fc_parameters=MinimalFCParameters())
    X = pd.DataFrame(X, columns=["my_id", "variable", "value"])
    self.assertIn("1__mean", X["variable"].values)
    self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], 5.516e-05, 4)
    self.assertEqual(X.shape, (100*20, 3))

    X = extract_features(df, column_id="my_id", column_sort="time",
                         column_kind="dimension",
                         pivot=False,
                         default_fc_parameters=MinimalFCParameters())
    X = pd.DataFrame(X, columns=["my_id", "variable", "value"])
    self.assertIn("1__mean", X["variable"].values)
    self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "1__mean")]["value"].iloc[0], 5.516e-05, 4)
    self.assertEqual(X.shape, (100*20, 3))

    X = extract_features(df.drop(columns=["dimension"]), column_id="my_id", column_sort="time",
                         pivot=False,
                         default_fc_parameters=MinimalFCParameters())
    X = pd.DataFrame(X, columns=["my_id", "variable", "value"])
    self.assertIn("value__mean", X["variable"].values)
    self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], 5.516e-05, 4)
    self.assertEqual(X.shape, (100*10, 3))

    X = extract_features(df.drop(columns=["dimension", "time"]), column_id="my_id",
                         pivot=False,
                         default_fc_parameters=MinimalFCParameters())
    X = pd.DataFrame(X, columns=["my_id", "variable", "value"])
    self.assertIn("value__mean", X["variable"].values)
    self.assertAlmostEqual(X[(X["my_id"] == "5") & (X["variable"] == "value__mean")]["value"].iloc[0], 5.516e-05, 4)
    self.assertEqual(X.shape, (100*10, 3))
def tsfresh_extract_features():
    train_df_list = []
    for file_name in os.listdir(train_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(train_path, file_name))
            train_df_list.append(df)

    test_df_list = []
    for file_name in os.listdir(test_path):
        if file_name.endswith('.csv'):
            df = pd.read_csv(os.path.join(test_path, file_name))
            test_df_list.append(df)

    train_df = pd.concat(train_df_list)
    test_df = pd.concat(test_df_list)
    train_df['time'] = pd.to_datetime(train_df['time'], format='%m%d %H:%M:%S')
    test_df['time'] = pd.to_datetime(test_df['time'], format='%m%d %H:%M:%S')

    all_df = pd.concat([train_df, test_df], sort=False)
    df = all_df.drop(columns=['type'])
    extracted_df = extract_features(df, column_id='渔船ID', column_sort='time',
                                    n_jobs=8, kind_to_fc_parameters=fc_parameters_v1)

    train_df = extracted_df.iloc[:len(train_df_list)]
    test_df = extracted_df.iloc[len(train_df_list):]

    y = []
    for name, group in all_df.groupby('渔船ID'):
        y.append(group.iloc[0]['type'])
    y_train = y[:train_df.shape[0]]

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    # impute(train_df)
    # filtered_train_df = select_features(train_df, y_train)
    # filtered_test_df = test_df[filtered_train_df.columns]

    train_df['type'] = le.inverse_transform(y_train)

    if not os.path.exists('./feature'):
        os.makedirs('./feature')
    train_df.to_csv('./feature/train.csv')
    test_df.to_csv('./feature/test.csv')

    return train_df, test_df
def ts_feature_extraction(dataframe, num_jobs=0):
    """
    Gets 5 transformed features from the 794 features extracted by tsfresh
    :param dataframe: A pandas dataframe
    :param num_jobs: integer, number of parallel processes in tsfresh
    :return: A pandas dataframe
    """
    features = extract_features(dataframe,
                                column_id='period',
                                column_sort='date',
                                column_value='value',
                                n_jobs=num_jobs)
    features = rm_const_cols(features)
    return pca_transformation(features)
def pd_ts_tsfresh_features(df: pd.DataFrame, cols: list = None, pars: dict = None):
    from tsfresh import extract_relevant_features, extract_features
    from tsfresh.utilities.dataframe_functions import roll_time_series

    # copy to avoid mutating the caller's dataframe when adding helper columns
    single_row_df = df[cols].copy()
    single_row_df["time"] = range(0, len(single_row_df.index))

    id_col = pars.get("id_col", "id")
    if "id_col" not in pars:
        single_row_df["id"] = 1

    X_feat = extract_features(single_row_df, column_id=id_col, column_sort='time')
    return X_feat, X_feat.columns.to_list()
def test_features_on_btc():
    df = pd.DataFrame({
        "id": [1, 1, 1, 1, 2, 2],
        "time": [1, 2, 3, 4, 8, 9],
        "x": [1, 2, 3, 4, 10, 11],
        "y": [5, 6, 7, 8, 12, 13],
    })
    df_rolled = roll_time_series(df, column_id="id", column_sort="time")
    assert df_rolled['id'].nunique() == 6

    df_features = extract_features(df_rolled, column_id="id", column_sort="time")
    assert df_features.shape[0] == 6
def extract_features(self, ts, column_id='id', impute_function=impute,
                     default_fc_parameters=ComprehensiveFCParameters(),
                     show_warnings=False, profile=False):
    '''Extract all possible features from ts using tsfresh's extract_features method'''
    return extract_features(ts, column_id=column_id,
                            impute_function=impute_function,
                            default_fc_parameters=default_fc_parameters,
                            n_jobs=self.n_jobs,
                            show_warnings=show_warnings,
                            profile=profile)
def get_features(file_name, count):
    csv_data = pd.read_csv(file_name)
    timeseries = csv_data.iloc[:, :-1]
    print('start getfeatures...')

    # extract the full feature set
    extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
    impute(extracted_features)

    print('start save ...')
    extracted_features.to_csv('tsfresh_extractedFeatures' + str(count) + '.csv')
    print(str(count) + ' end')
def tsfresh_calculator(timeseries, column_id, column_sort, cleanup):
    from tsfresh import extract_features
    from tsfresh import extract_relevant_features

    if cleanup == "Yes":
        # note: extract_relevant_features additionally requires a target vector y
        # as its second argument; it is not provided here
        extracted_features = extract_relevant_features(timeseries, column_id=column_id,
                                                       column_sort=column_sort)
    else:
        extracted_features = extract_features(timeseries, column_id=column_id,
                                              column_sort=column_sort)
    return extracted_features
def _extract_features(self, data_frame):
    df_rolled = roll_time_series(
        data_frame,
        column_id=self.column_id,
        column_sort=self.time_stamp,
        max_timeshift=self.memory,
    )
    extracted_minimal = tsfresh.extract_features(
        df_rolled,
        column_id=self.column_id,
        column_sort=self.time_stamp,
        default_fc_parameters=tsfresh.feature_extraction.MinimalFCParameters(),
    )
    extracted_index_based = tsfresh.extract_features(
        df_rolled,
        column_id=self.column_id,
        column_sort=self.time_stamp,
        default_fc_parameters=tsfresh.feature_extraction.settings.IndexBasedFCParameters(),
    )
    extracted_features = pd.concat([extracted_minimal, extracted_index_based], axis=1)
    del extracted_minimal
    del extracted_index_based
    gc.collect()

    extracted_features[np.isnan(extracted_features)] = 0.0
    extracted_features[np.isinf(extracted_features)] = 0.0
    return extracted_features
def add_tsfresh_participant(data, tsfresh_features, columns, k):
    # The dictionary containing the features that we want to extract and the settings for those features
    if tsfresh_features == 'minimal':
        settings = MinimalFCParameters()
    elif tsfresh_features == 'efficient':
        settings = EfficientFCParameters()
    elif tsfresh_features == 'comprehensive':
        settings = ComprehensiveFCParameters()
    else:
        settings = MinimalFCParameters()

    for participant in range(len(data)):
        # First we add the necessary columns
        data[participant]['id'] = 0
        data[participant]['index'] = data[participant].index

        # We create the rolled time series, which also creates new ids; note that setting max_timeshift
        # to None means that it takes the maximal possible lengths
        rolled_series = roll_time_series(data[participant], column_id='id', column_sort='index', max_timeshift=k)

        all_features = []
        for column in columns:
            # We extract the features for every element of the time series, which returns a dataframe with
            # the same number of rows as the original dataframe but a different number of columns
            extracted = extract_features(rolled_series, default_fc_parameters=settings,
                                         column_id='id', column_sort='index', column_value=column)
            # We need to reset the indexes as they have been changed and add them to our list of features
            all_features.append(extracted.reset_index(drop=True))

        # Add all the features together
        extracted = pd.concat(all_features, axis=1)

        # We drop the columns that we previously created because we do not want them in the data
        del data[participant]['id']  # note that you can also use df.drop here
        del data[participant]['index']

        data[participant] = pd.concat([data[participant], extracted], axis=1)

    return data
def get_features(file_name, count):
    csv_data = pd.read_csv(file_name)
    timeseries = csv_data.iloc[:, :-1]
    del timeseries['Unnamed: 0']

    y = csv_data[['id', 'y']]
    y = handle_y(y)
    print(timeseries)
    print(y)
    print('start getfeatures...')

    # extract the full feature set
    extracted_features = extract_features(timeseries, column_id="id", column_sort="time")
    impute(extracted_features)
    extracted_features.to_csv('tsfresh_extractedFeatures' + str(count) + '.csv')
    print(str(count) + ' end')
def extract_select_inference_features(self, data, args=None):
    """
    Extract-Select features

    Only extract the specific features passed via args; we want the same features as in training.
    https://stackoverflow.com/questions/50426458/retrieve-specific-features-by-using-tsfresh-in-python

    :param data: pandas.DataFrame
    :param args:
    :return: pandas.DataFrame
    """
    X = extract_features(data,
                         column_id=args[0],
                         n_jobs=args[1],
                         chunksize=args[2],
                         kind_to_fc_parameters=args[3])
    X = impute(X)
    return X
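# Hedged sketch of how the kind_to_fc_parameters passed in args[3] above is typically built:
# tsfresh can reconstruct extraction settings from the feature column names that survived
# selection during training. `selected_columns` is a hypothetical example list.
from tsfresh.feature_extraction.settings import from_columns

selected_columns = [
    "x__mean",
    "x__autocorrelation__lag_3",
    "y__maximum",
]
kind_to_fc_parameters = from_columns(selected_columns)
# roughly: {"x": {"mean": None, "autocorrelation": [{"lag": 3}]}, "y": {"maximum": None}}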
def extract_features(self, data, args=None):
    """
    Extract features

    :param data: pandas.DataFrame
    :param args:
    :return: pandas.DataFrame
    """
    print(args[0])
    print(args[1])
    print(args[2])
    X = extract_features(data, column_id=args[0], n_jobs=args[1], chunksize=args[2])
    X = impute(X)
    return X
def predict_gamma_for_timeseries(self, timeseries_df):
    '''
    Predict the best gamma based on time series properties
    '''
    # the library requires an id column, but all ids are the same for our timeseries,
    # so we add a dummy id column
    timeseries_df['dummy_col'] = 'dummy'
    try:
        features_df = extract_features(timeseries_df.rename(columns={self.id_col: 'value'}),
                                       column_id='dummy_col',
                                       column_sort=self.timestamp_col,
                                       disable_progressbar=True)[selected_features].fillna(0)
        features_df = features_df.replace(np.inf, 0)
        features_df = features_df.replace(-np.inf, 0)
        gamma = self.metadata.estimator.predict(features_df)[0]
    except Exception:
        gamma = 1.0
    return gamma
def extract(window):
    # Get all unique patients so we can pull first 24 hours of data
    pats = insheet.sort_values('mrn_csn_pair')['mrn_csn_pair'].unique()

    first = insheet[(insheet['timestamp'] < window) & (insheet['timestamp'] >= 0)]
    first = first.sort_values('mrn_csn_pair').reset_index(drop=True)

    extracted_flowsheet = tsfresh.extract_features(first,
                                                   column_id='mrn_csn_pair',
                                                   column_sort='timestamp',
                                                   column_kind='measure',
                                                   column_value='value',
                                                   n_jobs=8)
    # Drop features that are only NaN
    extracted_flowsheet = extracted_flowsheet.dropna(axis=1, how='all')
    tsfresh.utilities.dataframe_functions.impute(extracted_flowsheet)

    # Add back the mrn_csn_pair
    extracted_flowsheet.insert(0, 'mrn_csn_pair', pats)

    return extracted_flowsheet.reset_index(drop=True)
def tsfresh_features(signal_df, channels):
    '''
    Calculate features of a sensor signal using the TSFresh package.

    :param signal_df: dataframe housing sensor signals
    :param channels: channels of the sensor signal for which to calculate TSFresh features
    :return: dataframe of calculated features for each sensor channel
    '''
    # copy the channel slice so adding the id column does not modify the caller's dataframe
    signal_df = signal_df[channels].copy()
    signal_df.loc[:, 'id'] = 1

    tsfresh_df = tsf.extract_features(signal_df, column_id='id', disable_progressbar=True)
    return tsfresh_df.reset_index(drop=True)
def extract_features(data_windows: DataFrame, features: List[Feature]) -> Dict[Feature, DataFrame]:
    settings = {
        key: ComprehensiveFCParameters()[key]
        for key in [str(feature.value).lower() for feature in features]
    }
    extracted: DataFrame = tsfresh.extract_features(
        data_windows,
        column_id="id",
        default_fc_parameters=settings,
        disable_progressbar=True)

    result = {}
    for feature_index in range(len(features)):
        feature = features[feature_index]
        result[feature] = extracted.iloc[:, [feature_index]]
    return result
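# Hedged aside: default_fc_parameters is just a mapping from calculator name to its parameter
# list (or None for parameter-free calculators), so a restricted setting can also be written by
# hand instead of slicing ComprehensiveFCParameters() as above. The call is commented out and
# assumes a `data_windows` frame in the same format as the function above.
hand_built_settings = {
    "mean": None,                                  # parameter-free calculator
    "maximum": None,
    "autocorrelation": [{"lag": 1}, {"lag": 2}],   # parameterised calculator
}
# tsfresh.extract_features(data_windows, column_id="id",
#                          default_fc_parameters=hand_built_settings)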
def extractFeatures(rawData):
    print("\nSetting extraction settings")
    extraction_settings = ComprehensiveFCParameters()

    print("Before extracting features")
    X = extract_features(rawData,
                         column_id='id',
                         column_value=None,
                         column_kind=None,
                         impute_function=impute,
                         default_fc_parameters=extraction_settings)
    print("After extracting features")
    print("Number of extracted features: {}.".format(X.shape[1]))
    print("\nShape of X: ")
    print(X.shape)
    return X
def transform(self, X, y=None):
    """Transform X.

    Parameters
    ----------
    X : pd.DataFrame
        nested pandas DataFrame of shape [n_samples, n_columns]
    y : pd.Series, optional (default=None)

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame
    """
    # input checks
    self.check_is_fitted()
    X = check_X(X, coerce_to_pandas=True)

    # tsfresh requires a unique index and returns values only for unique index values
    if X.index.nunique() < X.shape[0]:
        warn(
            "tsfresh requires a unique index, but found "
            "non-unique. To avoid this warning, please make sure the index of X "
            "contains only unique values."
        )
        X = X.reset_index(drop=True)

    Xt = from_nested_to_long(X)

    # lazy imports to avoid hard dependency
    from tsfresh import extract_features

    extraction_params = self._get_extraction_params()
    Xt = extract_features(
        Xt,
        column_id="index",
        column_value="value",
        column_kind="column",
        column_sort="time_index",
        **extraction_params,
    )

    # When using the long input format, tsfresh seems to sort the index;
    # here we make sure we return the dataframe in the same order as the input data
    return Xt.reindex(X.index)
def transform(self, X, y=None):
    feats = extract_features(
        X,
        column_id=self.column_id,
        column_sort=self.column_sort,
        chunksize=self.chunk_size,
        default_fc_parameters=self.default_fc_parameters,
        n_jobs=self.n_jobs,
    )
    # Rename columns to allow use with LightGBM, which doesn't like "-", "."
    feats = feats.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "_", x))
    # Grab the datetime index out of the tuple multi-index that tsfresh uses
    return feats.set_index(feats.index.map(lambda x: x[1]), drop=True)
def compute_tsfresh_features(x, save_path, nb_splits=8, which_set='training'):
    print('Processing %s set...' % (which_set))
    n = x.shape[0]
    split_breaks = [int(n / nb_splits) * i for i in range(nb_splits)] + [n]
    for i in range(nb_splits):
        start = split_breaks[i]
        stop = split_breaks[i + 1]
        print('Number of rows being processed:', stop - start)
        features = extract_features(TSFormatting().transform(x.iloc[start:stop]),
                                    column_id='id', column_sort='time',
                                    default_fc_parameters=EfficientFCParameters())
        features['neuron_id'] = x.iloc[start:stop]['neuron_id']
        if i == 0:
            features.to_csv(save_path, mode='w', header=True, index=True)
        else:
            features.to_csv(save_path, mode='a', header=False, index=True)
        del features
def run(self):
    raw: RawData = self.load("raw")
    df = pd.melt(
        raw.sales_train_validation,
        id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
        var_name="d",
        value_name="sales",
    )
    tsfresh_df = extract_features(df[["id", "d", "sales"]], column_id="id", column_sort="d")
    self.dump(tsfresh_df)
def time_series_analyis(data):
    '''
    Perform time series analysis on the provided dataset.
    The column 'stopped' is removed because it holds nominal values.
    '''
    rm_colm = ['stopped']
    df = data[data.columns.difference(rm_colm)]
    extracted_features = extract_features(df, column_id='animal_id', column_sort='time')
    impute(extracted_features)
    return extracted_features
def jacky_feature(path='./data/data_odiginal/test', file_format='xls', num_files=40, Trainable=True):
    all_files = glob.glob(path + "/*.xls")
    # print(all_files)
    ans = []
    file_list = []
    for file in all_files:
        print(file)
        df = pd.read_excel(file, header=None)
        if Trainable:
            ans_ = df.iloc[-1, 0]
            ans.append(ans_)
            df = df[:-1]
            file_list.append(df)
        else:
            file_list.append(df)

    df_con = pd.concat(file_list, ignore_index=True)
    df_con = df_con.astype('float32')
    df_con.columns = ['1st', '2nd', '3rd', '4th']
    df_con['id'] = pd.Series(np.repeat(np.arange(num_files), 7500), index=df_con.index)
    df_con['time'] = pd.Series(np.tile(np.arange(7500), num_files), index=df_con.index)

    f = []
    for file in os.listdir(path):
        if file.endswith(file_format):
            print(file)
            f.append(file)

    min_max_scaler = MinMaxScaler()
    df_con[['1st', '2nd', '3rd', '4th']] = min_max_scaler.fit_transform(df_con[['1st', '2nd', '3rd', '4th']])

    print('feature extract.')
    df_feature = extract_features(df_con, column_id='id', column_sort='time')

    features_filtered = ['id',
                         '2nd__fft_coefficient__coeff_76__attr_"imag"',
                         '3rd__fft_coefficient__coeff_24__attr_"real"',
                         '2nd__fft_coefficient__coeff_94__attr_"real"',
                         '2nd__partial_autocorrelation__lag_3',
                         '3rd__fft_coefficient__coeff_62__attr_"abs"',
                         '2nd__fft_coefficient__coeff_99__attr_"imag"',
                         '3rd__fft_coefficient__coeff_57__attr_"real"',
                         '2nd__fft_coefficient__coeff_96__attr_"real"',
                         '1st__energy_ratio_by_chunks__num_segments_10__segment_focus_1',
                         '2nd__fft_coefficient__coeff_11__attr_"angle"',
                         '3rd__fft_coefficient__coeff_73__attr_"imag"',
                         '1st__fft_coefficient__coeff_41__attr_"imag"',
                         '2nd__fft_coefficient__coeff_81__attr_"real"']
    print('feature select.')
    df_feature = df_feature[features_filtered]
    df_feature = df_feature.iloc[:, 1:]

    print(f)
    return df_feature, f, ans
def extract(self, data):
    assert isinstance(data, pd.DataFrame)
    # assert that data have no missing values
    assert not pd.isnull(data).values.any(), 'data should not contain missing values.'

    log.debug('Running Global feature extractor ..')
    gfe_start_time = time.time()

    # setting time series features to extract or use default
    # fc_parameters = MinimalFCParameters()
    # fc_parameters = EfficientFCParameters()
    # fc_parameters = ComprehensiveFCParameters()

    # feature extraction
    design_matrix = extract_features(
        data,
        default_fc_parameters=self._fc_parameters,
        column_id='batch_id',
        column_sort='end_time_stamp',
        column_kind='metric_id',
        column_value='sensor_value',
        n_jobs=self._num_of_cores_to_use)

    # impute: use a builtin tsfresh method that replaces NaN with median and -inf
    # [+inf] with min [max] in a columnwise fashion (and in place).
    # If a column does not contain finite values at all, it is filled with zeros.
    # Also, all columns will be guaranteed to be of type np.float64
    # (this can also be done by passing impute_function=impute to extract_features())
    impute(design_matrix)

    # TODO: assert that none of the columns was filled with zeros
    # TODO: think about feature selection as well (extract_relevant_features), see:
    # https://github.com/blue-yonder/tsfresh/blob/master/notebooks/robot_failure_example.ipynb
    # note though that this may be problematic for real-time ts anomaly detection

    gfe_end_time = time.time()
    gfe_duration = round((gfe_end_time - gfe_start_time) / 60, 2)
    log.debug('Done running Global feature extractor [Total time: {} mins.].'.format(gfe_duration))

    return design_matrix
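# Tiny standalone illustration (an assumption-free toy check, not part of the extractor above)
# of the impute() behaviour described in the comments: NaN -> column median,
# +inf -> column max of finite values, -inf -> column min of finite values, in place.
import numpy as np
import pandas as pd
from tsfresh.utilities.dataframe_functions import impute

m = pd.DataFrame({"f1": [1.0, np.nan, 3.0], "f2": [np.inf, 2.0, -np.inf]})
impute(m)   # modifies m in place and casts columns to float64
print(m)    # f1: NaN -> 2.0 (median); f2: both infs -> 2.0 (only finite value)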
def test_local_dask_cluster_extraction(self):
    Distributor = LocalDaskDistributor(n_workers=1)

    df = self.create_test_data_sample()
    extracted_features = extract_features(df, column_id="id", column_sort="sort",
                                          column_kind="kind", column_value="val",
                                          distributor=Distributor)

    self.assertIsInstance(extracted_features, pd.DataFrame)
    self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77])))
    self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017])))
    self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167])))
    self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695])))
    self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1])))
    self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483])))
    self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75])))
    self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0])))
def ts_all_features(data):
    """
    Perform time series analysis on record data.
    Removes the column 'stopped' as it has nominal values.

    :param data: pandas DataFrame, containing preprocessed movement records and features.
    :return: pandas DataFrame, containing the extracted tsfresh features for each id.
    """
    rm_colm = ['stopped']
    df = data[data.columns.difference(rm_colm)]
    time_series_features = tsfresh.extract_features(df, column_id='animal_id', column_sort='time')
    tsfresh.utilities.dataframe_functions.impute(time_series_features)
    return time_series_features
def fit(self, data, labels):
    feats = tsfresh.extract_features(data, column_id='level_0', column_sort='level_1',
                                     default_fc_parameters=self.def_settings,
                                     distributor=self.distributor)
    tsfresh.utilities.dataframe_functions.impute(feats)  # Remove NaNs, if any

    relevant_feats = tsfresh.select_features(feats, labels, fdr_level=1e-15)
    self.relevant_features = relevant_feats.columns
    self.settings = tsfresh.feature_extraction.settings.from_columns(relevant_feats)

    clf = RandomForestClassifier(n_estimators=40)
    clf.fit(relevant_feats, labels)
    self.classifier = clf
    self.trained = True
def predict(self, data):
    if not self.trained:
        assert self.architecture is not None, 'No classifier selected and no fit performed.'
        filename = os.path.join(os.path.dirname(__file__), 'classifiers', self.architecture + '.pkl')
        with open(filename, 'rb') as f:
            arch = pickle.load(f)
        clf = arch['clf']
        settings = arch['settings']
        relevant_features = arch['relevant_features']
    else:
        clf = self.classifier
        settings = self.settings
        relevant_features = self.relevant_features

    features = tsfresh.extract_features(data, column_id='level_0', column_sort='level_1',
                                        default_fc_parameters=settings['0'])
    return clf.predict(features[relevant_features])
def test_extraction_runs_through(self):
    df = extract_features(self.X[self.X.id < 3], column_id="id", column_sort="time")

    six.assertCountEqual(self, df.index.values, [1, 2])
    self.assertGreater(len(df), 0)
def main(console_args=None):
    parser = argparse.ArgumentParser(description="Extract features from time series stored in a CSV file and "
                                                 "write them back into another CSV file. The time series in the CSV "
                                                 "file should either have one of the dataframe-formats described in "
                                                 "http://tsfresh.readthedocs.io/en/latest/text/data_formats.html, "
                                                 "which means you have to supply the --csv-with-headers flag, "
                                                 "or should be in the form "
                                                 "[time series 1 values ..., time series 2 values ...], "
                                                 "in which case you should not add the --csv-with-headers flag. "
                                                 "The CSV is expected to be space-separated.")
    parser.add_argument("input_file_name", help="File name of the input CSV file to read in.")
    parser.add_argument("--output-file-name", default=None,
                        help="File name of the output CSV file to write to. "
                             "Defaults to input_file_name.features.csv")
    parser.add_argument("--column-sort", default=None,
                        help="Column name to be used to sort the rows. "
                             "Only available when --csv-with-headers is enabled.")
    parser.add_argument("--column-kind", default=None,
                        help="Column name where the kind column can be found. "
                             "Only available when --csv-with-headers is enabled.")
    parser.add_argument("--column-value", default=None,
                        help="Column name where the values can be found. "
                             "Only available when --csv-with-headers is enabled.")
    parser.add_argument("--column-id", default=None,
                        help="Column name where the ids can be found. "
                             "Only available when --csv-with-headers is enabled.")
    parser.add_argument('--csv-with-headers', action='store_true', help="")

    print(console_args)

    args = parser.parse_args(console_args)

    if (args.column_id or args.column_kind or args.column_sort or args.column_value) and (not args.csv_with_headers):
        raise AttributeError("You can only pass in column-value, column-kind, column-id or column-sort if "
                             "--csv-with-headers is enabled.")

    if args.csv_with_headers:
        column_kind = args.column_kind
        column_sort = args.column_sort
        column_value = args.column_value
        column_id = args.column_id
        header = 0
    else:
        column_kind = None
        column_sort = "time"
        column_value = "value"
        column_id = "id"
        header = None

    # Read in the CSV file
    input_file_name = args.input_file_name
    df = pd.read_csv(input_file_name, delim_whitespace=True, header=header)

    if not args.csv_with_headers:
        df = _preprocess(df)

    df_features = extract_features(df, column_kind=column_kind, column_sort=column_sort,
                                   column_value=column_value, column_id=column_id)

    # re-cast index from float to int
    df_features.index = df_features.index.astype('int')

    # write to disk
    default_out_file_name = os.path.splitext(input_file_name)[0] + '.features.csv'
    output_file_name = args.output_file_name or default_out_file_name
    df_features.to_csv(output_file_name)