def get_stats(pi, file):
    """Build a single-row feature DataFrame, indexed by player name, from a
    player-info dict.

    :param pi: dict with 'gain_series' (keys 'gain', 'n_games', 'date' as
        millisecond epoch timestamps) and 'rake_series' (keys 'n_games',
        'roi', 'rake', 'game_value') -- schema assumed from usage here,
        confirm against the caller.
    :param file: source file name; the last 4 characters (presumably a
        3-letter extension plus dot -- TODO confirm) are stripped to obtain
        the player name.
    :return: one-row DataFrame concatenating gain, games-per-day, ROI, rake,
        games and custom features.
    """
    player_name = file[:-4]
    gain = [float(i) for i in pi['gain_series']['gain']]
    n_games = pi['gain_series']['n_games']
    # Timestamps arrive in milliseconds; fromtimestamp yields naive
    # local-time datetimes, but only their differences are used below.
    dates = [
        datetime.fromtimestamp(t / 1000) for t in pi['gain_series']['date']
    ]
    date_diff = np.diff(dates)
    games_diff = np.diff(n_games)
    # Games per day = games in interval / interval length in seconds * 86400.
    # safe_division presumably guards zero-length intervals -- verify.
    games_per_day = [
        safe_division(g, d.days * 86400 + d.seconds) * 86400
        for g, d in zip(games_diff, date_diff)
    ]
    features_gain = tsfel.time_series_features_extractor(CFG, gain, fs=1, verbose=0)[TSFEL_COLS]
    features_gain.columns = COLS_GAIN
    features_gain.index = [player_name]
    features_gain['GAMES_PER_STEP'] = games_diff[0]
    features_gpd = tsfel.time_series_features_extractor(CFG, games_per_day, fs=1, verbose=0)[TSFEL_COLS]
    features_gpd.columns = COLS_GPD
    features_gpd.index = [player_name]
    roi, rake, games = approximate(pi['rake_series']['n_games'],
                                   pi['rake_series']['roi'],
                                   pi['rake_series']['rake'],
                                   pi['rake_series']['game_value'],
                                   STANDARD_RANGE)
    features_roi = pd.DataFrame(roi, index=[player_name])
    features_roi.columns = COLS_ROI
    features_rake = pd.DataFrame(rake, index=[player_name])
    features_rake.columns = COLS_RAKE
    # FIX: np.NaN was removed in NumPy 2.0 (use np.nan). The original
    # fillna(np.NaN) was a no-op (it replaced NaN with NaN), so it is
    # dropped; np.nansum already ignores NaNs.
    features_rake['TOTAL_RAKE'] = np.nansum(features_rake)
    features_games = pd.DataFrame(games, index=[player_name])
    features_games.columns = COLS_GAMES
    custom = extract_features(pi, player_name)
    return pd.concat([
        features_gain, features_gpd, features_roi, features_rake,
        features_games, custom
    ], axis=1)
def auto_feature_engineering(input_df: pd.DataFrame, cols: List[str],
                             window_size: int = 5) -> pd.DataFrame:
    """
    Automated feature engineering wrapper using TSFEL package for features
    generated based on temporal metrics.

    :param input_df: Input data frame containing the columns in *cols*.
    :param cols: Column names to extract features from, one at a time; each
        column's features are prefixed with the column name.
    :param window_size: Size of the sliding window passed to TSFEL.
    :return: X data -- one feature frame per column, concatenated on axis=1.
    """
    cfg_file = tsfel.get_features_by_domain("temporal")
    X_df_list = []
    for col in cols:
        # BUGFIX: the original passed the whole input_df for every column,
        # producing identical feature sets that differed only in prefix.
        # Extract from the single column instead.
        current_df = tsfel.time_series_features_extractor(
            cfg_file,
            input_df[col],
            fs=50,  # NOTE(review): hard-coded sampling rate -- confirm
            window_splitter=True,
            window_size=window_size,
        )
        current_df = current_df.add_prefix(col)
        X_df_list.append(current_df)
    X = pd.concat(X_df_list, axis=1)
    return X
def tsfel_calculator(x):
    """Extract the complete TSFEL feature set (all domains) from *x*."""
    import tsfel

    # Default configuration covers every available feature domain.
    config = tsfel.get_features_by_domain()
    return tsfel.time_series_features_extractor(config, x)
# Load the 'display' table from the dissertation SQLite database.
df = pd.read_sql_table('display', 'sqlite:///dissertation.db')
df_engineering = df.copy()
#df_engineering['id']=np.repeat(range(1,4033),360)
# First 1,209,600 rows; at the implied sampling rate this spans 20 weeks
# (presumably 1 row per 10 s -- TODO confirm).
df_20weeks = df_engineering[:1209600]  # 20 weeks
#df_tsfresh=df_engineering[['time','id','kWh']]
df_tsfel = df_20weeks[['kWh']]
df_hour = df_20weeks[['kWh']] * 1000  # convert kwh to wh
df_hour.rename(columns={'kWh': 'Wh'}, inplace=True)
import tsfel
#from tsfresh import extract_relevant_features
# Full TSFEL configuration: all domains, all features.
cfg = tsfel.get_features_by_domain()
# Daily windows: 8640 samples per window.
X_train = tsfel.time_series_features_extractor(cfg, df_tsfel,
                                               window_splitter=True,
                                               window_size=8640)  # one day
print(list(X_train.columns))
# Keep a hand-picked subset of the extracted features.
X_train = X_train[[
    '0_Absolute energy', '0_Mean', '0_Max', '0_Standard deviation',
    '0_FFT mean coefficient_0', '0_Spectral kurtosis', '0_Skewness',
    '0_Zero crossing rate'
]]
# Hourly windows on the Wh series: 360 samples per window.
X_hour = tsfel.time_series_features_extractor(cfg, df_hour,
                                              window_splitter=True,
                                              window_size=360)  # one hour
# NOTE(review): this selection list is cut off in this view and continues
# past it.
X_hour = X_hour[[
    '0_Absolute energy', '0_Mean', '0_Max', '0_Standard deviation',
    '0_FFT mean coefficient_0', '0_Spectral kurtosis', '0_Skewness',
# IMPORT
df = pd.read_csv('Data kategorier.csv', skiprows=2,
                 index_col=['READ_TIME_Date'])
df.index = pd.to_datetime(df.index)
# Columns that are category averages, excluding the house-/apartment-
# specific ('huser'/'lejl') series.
categories = df.iloc[1:, :].dropna(axis=1).columns[df.iloc[1:, :].dropna(
    axis=1).columns.isin(
        k for k in df.columns if 'Average' in k and 'huser' not in k
        and 'lejl' not in k)]

# Min-max scaling of the category columns to [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
df[categories] = scaler.fit_transform(df[categories])

# BUGFIX: the original called time_series_features_extractor (and
# correlated_features) *before* `cfg_file` and `serie` were defined, which
# raises NameError at runtime. The configuration is now created first and
# the stray pre-loop calls are removed.
cfg_file = tsfel.get_features_by_domain()
# NOTE(review): `list = []` in the original shadowed the builtin; kept as a
# plain accumulator under a safe name in case later (off-screen) code needs
# it -- verify downstream usage.
feature_frames = []
for category in categories:
    serie = df[category].dropna()
    # 30-day windows (720 hours) at fs=24 samples/day -- assumes hourly
    # data; TODO confirm.
    X_train = tsfel.time_series_features_extractor(cfg_file, serie, fs=24,
                                                   window_splitter=True,
                                                   window_size=720)
    feature_frames.append(X_train)
# Highly-correlated features of the most recent extraction (moved after the
# loop; in the original this ran before X_train existed).
corr_features = tsfel.correlated_features(X_train)
# Split the series into ~`tam`-sized windows and process each one.
tam_janela = np.round(len(series) / tam)
series_split = np.array_split(series, tam_janela)
dataset = pd.DataFrame()
cont = 0
for window in series_split[atual:]:
    cont += 1
    # Progress counter (total number of windows in the log key).
    wandb.log({f"Window (tot. {len(series_split)})": cont})
    # Augmented Dickey-Fuller stationarity test on the current window.
    adf_result = adfuller(window)
    ADF = pd.DataFrame([adf_result[0]])
    ADF_pvalue = pd.DataFrame([adf_result[1]])
    wandb.log({"ADF_pvalue": adf_result[1]})
    # Search the candidate (p, d, q) orders for this window.
    best_config = pd.DataFrame([evaluate_models(window, p_values, d_values,
                                                q_values)])
    best_config.columns = ["p", "d", "q"]
    window = window.reset_index(drop=False)
    # TSFEL features for the window, appended to the running CSV together
    # with the file id, window size, ADF results and the best (p, d, q).
    features = tsfel.time_series_features_extractor(cfg, window, verbose=10)
    features = pd.concat(
        [
            pd.DataFrame([file_]),
            pd.DataFrame([tam]),
            ADF,
            ADF_pvalue,
            features,
            best_config,
        ],
        axis=1,
        ignore_index=False,
    )
    features.to_csv(output_file, mode='a', header=False)
def makeDataset(self, signals=False,strategy="ad-hoc"):
    """Build (X, Y, Id) arrays from the cycle CSV files in self.files_list.

    :param signals: list of signal/column names to read from each CSV.
        NOTE(review): the ad-hoc path appends to this list in place.
    :param strategy: "ad-hoc", a feature-spec list, "tsfel-all",
        "tsfel-all-corr", "tsfel-statistical", "tsfel-temporal" or "vest".
    :return: np.array(X), np.array(Y), np.array(Id).
    """
    def adhoc(strategy):
        # Hand-crafted features. NOTE: reads `chunk` and appends to `w`
        # from the enclosing loop's scope (closure), and mutates `signals`.
        self.features = strategy
        if(strategy=="ad-hoc"):
            self.features = self.signalsToFeatures(signals)
        featuresSpec = []
        for var in self.features:
            #print (var)
            # Spec string: "<signal> <mean|std|percentile> [<derivative>]";
            # a missing third token means "not a derivative".
            x = var.split(" ")
            if len(x) == 2:
                x.append(False)
            else:
                x[2] = True
            featuresSpec.append(x)
            signals.append(x[0])
        for feature in featuresSpec:
            var = feature[0]
            isDerivate = feature[2]
            featureType = feature[1]
            if(isDerivate == True):
                # Feature computed on the first difference of the signal.
                temp = ""
                if(self.includeFiltering == True):
                    temp = self.filterOutliers(chunk[var].diff(), self.n)
                else:
                    temp = chunk[var].diff()
                if featureType == "mean":
                    w.append(np.nanmean(temp))
                elif featureType == "std":
                    w.append(np.nanstd(temp))
                else:
                    # Any other token is interpreted as a percentile.
                    w.append(np.nanpercentile(temp,int(featureType)))
            else:
                # Feature computed on the raw signal.
                temp = ""
                if(self.includeFiltering == True):
                    temp = self.filterOutliers(chunk[var], self.n)
                else:
                    temp = chunk[var]
                if featureType == "mean":
                    w.append(np.nanmean(temp))
                elif featureType == "std":
                    w.append(np.nanstd(temp))
                else:
                    w.append(np.nanpercentile(temp,int(featureType)))
        return w
    Id = []
    X = []
    Y = []
    for j, row in self.files_list.iterrows():
        file = row["cycleName"]
        df = pd.read_csv(self.fpathCameoFiles + file, encoding ="latin1", usecols = signals)
        # Split into windows of minuteWindow minutes; assumes 1 row per
        # second -- TODO confirm sampling rate.
        chunks = self.makeChunks(df, int(df.shape[0]/(self.minuteWindow*60)))
        for chunk in chunks:
            # Backfill missing values before feature extraction.
            chunk = chunk.fillna(method='bfill')
            #return chunk,"a","a"
            w = []
            if(strategy=="ad-hoc" or type(strategy) == list):
                w = adhoc(strategy)
            elif(strategy=="tsfel-all" or strategy=="tsfel-all-corr"):
                cfg = tsfel.get_features_by_domain()
                w = tsfel.time_series_features_extractor(cfg, chunk)
                self.features = list(w.columns)
            elif(strategy=="tsfel-statistical"):
                cfg = tsfel.get_features_by_domain('statistical')
                w = tsfel.time_series_features_extractor(cfg, chunk)
                self.features = list(w.columns)
            elif(strategy=="tsfel-temporal"):
                cfg = tsfel.get_features_by_domain('temporal')
                w = tsfel.time_series_features_extractor(cfg, chunk)
                self.features = list(w.columns)
            elif(strategy=="vest"):
                # NOTE(review): VEST features are computed on the whole
                # file `df`, not on `chunk` -- confirm this is intended.
                model = BivariateVEST()
                features = model.extract_features(df, pairwise_transformations=False, summary_operators=SUMMARY_OPERATIONS_SMALL)
                w = multivariate_feature_extraction(df, apply_transform_operators=False, summary_operators=SUMMARY_OPERATIONS_SMALL)
                self.features = list(w.columns)
            X.append(w)
            Id.append(file)
            Y.append(row['label'])
    if(strategy=="tsfel-all-corr"):
        # Drop highly-correlated TSFEL features across the whole dataset.
        dataset = pd.concat(X)
        to_drop = tsfel.correlated_features(dataset)
        dataset = dataset.drop(to_drop,axis=1)
        self.features = dataset.columns
        X = dataset
    return np.array(X), np.array(Y), np.array(Id)
from datetime import datetime
from features import extract_features, FEATURES

DATASET_DIR = './dataset'
# Bucket edges used when approximating the ROI/rake/games series.
STANDARD_RANGE = [
    0, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10, 25, 50, 75, 100, 250, 500, 750,
    1000
]
COLS_ROI = [f'ROI_{n}' for n in STANDARD_RANGE] + ['ROI_Other']
COLS_RAKE = [f'RAKE_{n}' for n in STANDARD_RANGE] + ['RAKE_Other']
COLS_GAMES = [f'GAMES_{n}' for n in STANDARD_RANGE] + ['GAMES_Other']
# Run TSFEL once on a short dummy series just to learn the feature-column
# names it produces.
CFG = tsfel.get_features_by_domain()
TSFEL_COLS = [
    str(c) for c in tsfel.time_series_features_extractor(
        CFG, np.random.rand(10), fs=1).columns
]
# TSFEL prefixes columns with the series index ('0_...'); strip that prefix
# and replace it with a semantic one.
COLS_GAIN = [f'GAIN_{c[2:]}' for c in TSFEL_COLS]
COLS_GPD = [f'GPD_{c[2:]}' for c in TSFEL_COLS]
CUSTOM = list(FEATURES.keys()) + ['TOTAL_RAKE', 'GAMES_PER_STEP']
# Matches every run of characters that is not a digit or a dot.
COIN_FINDER = re.compile(r'[^\d.]+')
# File names directly inside DATASET_DIR (top level only).
files = next(os.walk(DATASET_DIR))[2]


def diff(ts):
    """Return the forward differences of *ts* (length len(ts) - 1)."""
    return [b - a for a, b in zip(ts, ts[1:])]
print(t)
# Output paths for the TSFEL features of each split, keyed by the current
# truncation length t.
path1_s = folderRealData + '/' + dataset + '/' + dataset + 'test_features_' + str(
    t) + '.tsv'
path2_s = folderRealData + '/' + dataset + '/' + dataset + 'val_features_' + str(
    t) + '.tsv'
path3_s = folderRealData + '/' + dataset + '/' + dataset + 'train_features_' + str(
    t) + '.tsv'
path4_s = folderRealData + '/' + dataset + '/' + dataset + 'ep_features_' + str(
    t) + '.tsv'
# NOTE(review): this try block's except clause lies past this view.
try:
    if not (os.path.exists(path1_s)):
        # Features over the first t samples of every test series.
        d1_X_feat = tsfel.time_series_features_extractor(cfg_file,
                                                         d1_X[:, :t],
                                                         fs=100)
        d1_X_feat.columns = [
            'f' + str(ti) for ti in range(d1_X_feat.shape[1])
        ]
        # Replace infinities with NaN, then impute NaNs with column medians
        # before writing the TSV (no header, no index).
        d1_X_feat.replace([np.inf, -np.inf], np.nan, inplace=True)
        d1_X_feat = d1_X_feat.fillna(d1_X_feat.median())
        d1_X_feat.to_csv(path1_s, sep='\t', header=None, index=False)
    if not (os.path.exists(path2_s)):
        # Same pipeline for the validation split (continues past this view).
        d2_X_feat = tsfel.time_series_features_extractor(cfg_file,
                                                         d2_X[:, :t],
                                                         fs=100)
        d2_X_feat.columns = [
            'f' + str(ti) for ti in range(d2_X_feat.shape[1])
        ]
for date_range, yw in zip(date_ranges_w, yws):
    # Restrict every source frame to the current date window and drop
    # metadata columns.
    df_list = []
    for df in ori_df_list:
        print(date_range[0], date_range[1])
        filter_df = utils.filter_date(df, device_id, date_range[0],
                                      date_range[1]).drop(
                                          columns=['T', 'ParticipantId',
                                                   'FileCreationTime',
                                                   'DeviceId'])
        print(filter_df.head())
        df_list.append(filter_df)
    X_train_list = []
    col_list = []
    pre_X_train = None
    # Hoisted out of the loop: the configuration never changes per frame.
    cfg = tsfel.get_features_by_domain()
    for df in df_list:
        try:
            X_train = tsfel.time_series_features_extractor(cfg, df)
            X_train_list.append(X_train)
            col_list.append(X_train.columns)
            pre_X_train = X_train
        # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        except Exception:
            # Best-effort fallback to the previous successful extraction
            # (None if nothing has succeeded yet).
            X_train_list.append(pre_X_train)
            continue
    if len(col_list) != 0:
        # Keep only the feature columns present in every extraction.
        prev_list = col_list[0]
        for col in col_list[1:]:
            prev_list = set(prev_list).intersection(col)
        extracted_X_train = []
        for X_train in X_train_list:
            # FIX: guard the None placeholders appended on failure; the
            # original would crash indexing them.
            if X_train is None:
                continue
            # FIX: pandas column selection requires a list, not a set.
            new_train = X_train[list(prev_list)]
# Feature configurations: from JSON files, by domain, and from spreadsheet
# sheets (tsfel.extract_sheet).
settings0 = json.load(open(tsfel_path_json))
settings1 = json.load(open(personal_path_json))
settings2 = tsfel.get_features_by_domain('statistical')
settings3 = tsfel.get_features_by_domain('temporal')
settings4 = tsfel.get_features_by_domain('spectral')
settings5 = tsfel.get_features_by_domain()
settings6 = tsfel.extract_sheet('Features')
settings7 = tsfel.extract_sheet('Features_test', path_json=personal_path_json)

# Signal processing: resample/merge the sensor streams, then window them.
data_new = tsfel.merge_time_series(sensor_data, resample_rate, time_unit)
windows = tsfel.signal_window_splitter(data_new, window_size, overlap)

# time_series_features_extractor: on pre-split windows (spectral and
# temporal configs) and on the raw merged series with internal windowing
# (statistical config).
features0 = tsfel.time_series_features_extractor(settings4, windows,
                                                 fs=resample_rate)
features1 = tsfel.time_series_features_extractor(settings2, data_new,
                                                 fs=resample_rate,
                                                 window_size=70, overlap=0.5)
features2 = tsfel.time_series_features_extractor(settings3, windows,
                                                 fs=resample_rate)

# Dataset features extractor over a whole directory tree.
# NOTE(review): this call is cut off in this view and continues past it.
data = tsfel.dataset_features_extractor(main_directory, settings1,
                                        search_criteria=search_criteria,
                                        time_unit=time_unit,
# t-SNE scatter of the pivoted consumption data, colored by cluster label.
plt.figure()
plt.scatter(
    results_tsne[:, 0],
    results_tsne[:, 1],
    c=df_uci_pivot.index.get_level_values('cluster'),
    cmap=cmap,
    alpha=0.6,
)
# Week-of-year labels ('%W') for both frames.
df_uci_pivot['week'] = pd.to_datetime(
    df_uci_pivot.index.get_level_values(0)).strftime('%W')
df['week'] = df.index.strftime('%W')
# Daily mean across columns, then a 50-sample rolling mean.
dailymean = df_uci_pivot.iloc[0:-1].mean(axis=1)
df_uci_pivot['rollingmean'] = dailymean.dropna().rolling(window=50).mean()
plt.figure()
plt.scatter(pd.to_datetime(df_uci_pivot.index.get_level_values(0)),
            df_uci_pivot.rollingmean,
            c=df_uci_pivot.index.get_level_values('cluster'),
            cmap=cmap, alpha=0.6)
ax.set_xticks(np.arange(1, 25))
ax.set_ylabel('kiloWatts')
plt.title('30 days moving average consumption')
plt.annotate('', xy=(0, 1), xytext=(0, 1), fontsize=10)
ax.legend()
# January-only line per group.
# BUGFIX: 'AvgInClasses.[name]' was a syntax error, and the original
# called the Axes returned by lineplot as a function
# ('(subplots=True, legend=False)'), which raises TypeError.
for name in groups:
    sb.lineplot(AvgInClasses.index[AvgInClasses.index.month == 1],
                AvgInClasses[name][AvgInClasses.index.month == 1])
cfg = tsfel.get_features_by_domain()
# Extract features.
# BUGFIX: the first argument must be the feature configuration dict, not
# len(AvgInClasses).
Features = tsfel.time_series_features_extractor(cfg, AvgInClasses)
pl.rc('font', family='serif', serif='Times') #%% #Don't forget to change this PATH path = 'csv/' dataframeempty = pd.DataFrame() W = [] for csv_path in glob(path + 'Residential_*.csv'): df = pd.read_csv(csv_path) df.dropna(inplace=True) df.drop(['date'], axis=1, inplace=True) # Retrieves a pre-defined feature configuration file to extract all available features cfg = tsfel.get_features_by_domain() # Extract features X = tsfel.time_series_features_extractor(cfg, df) X['File Name'] = csv_path dataframeempty = dataframeempty.append(X) cfgx = tsfel.get_features_by_domain(domain="spectral") cfgs = tsfel.get_features_by_domain(domain="statistical") cfgt = tsfel.get_features_by_domain(domain="temporal") x = tsfel.time_series_features_extractor(cfg, df, verbose=0) xx = tsfel.time_series_features_extractor(cfgx, df, verbose=0) xs = tsfel.time_series_features_extractor(cfgs, df, verbose=0) xt = tsfel.time_series_features_extractor(cfgt, df, verbose=0) x = pd.concat([ xs, xt,