Example #1
def get_stats(pi, file):
    """Build a one-row feature frame for a single player file."""
    player_name = file[:-4]  # drop the 4-character file extension

    gain = [float(i) for i in pi['gain_series']['gain']]
    n_games = pi['gain_series']['n_games']
    dates = [
        datetime.fromtimestamp(t / 1000) for t in pi['gain_series']['date']
    ]
    date_diff = np.diff(dates)
    games_diff = np.diff(n_games)
    games_per_day = [
        safe_division(g, d.days * 86400 + d.seconds) * 86400
        for g, d in zip(games_diff, date_diff)
    ]

    features_gain = tsfel.time_series_features_extractor(CFG,
                                                         gain,
                                                         fs=1,
                                                         verbose=0)[TSFEL_COLS]
    features_gain.columns = COLS_GAIN
    features_gain.index = [player_name]
    features_gain['GAMES_PER_STEP'] = games_diff[0]

    features_gpd = tsfel.time_series_features_extractor(CFG,
                                                        games_per_day,
                                                        fs=1,
                                                        verbose=0)[TSFEL_COLS]
    features_gpd.columns = COLS_GPD
    features_gpd.index = [player_name]

    roi, rake, games = approximate(pi['rake_series']['n_games'],
                                   pi['rake_series']['roi'],
                                   pi['rake_series']['rake'],
                                   pi['rake_series']['game_value'],
                                   STANDARD_RANGE)

    features_roi = pd.DataFrame(roi, index=[player_name])
    features_roi.columns = COLS_ROI
    features_rake = pd.DataFrame(rake, index=[player_name])
    features_rake.columns = COLS_RAKE
    features_rake['TOTAL_RAKE'] = np.nansum(features_rake.to_numpy())
    features_games = pd.DataFrame(games, index=[player_name])
    features_games.columns = COLS_GAMES

    custom = extract_features(pi, player_name)

    return pd.concat([
        features_gain, features_gpd, features_roi, features_rake,
        features_games, custom
    ],
                     axis=1)
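
# Note: Example #1 depends on project helpers that are not shown here
# (safe_division, approximate, extract_features). A minimal sketch of what
# safe_division presumably does -- an assumption, not the project's actual
# implementation:
def safe_division(numerator, denominator):
    # Return 0 instead of raising when the elapsed time is zero.
    return numerator / denominator if denominator else 0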
Example #2
def auto_feature_engineering(input_df: pd.DataFrame,
                             cols: List[str],
                             window_size: int = 5) -> pd.DataFrame:
    """ Automated feature engineering wrapper using TSFEL 
    package for features generated based on temporal metrics
    
    :input_df: Input data
    :window_size: Size of the window
    :return: X data
    """
    cfg_file = tsfel.get_features_by_domain("temporal")
    X_df_list = []
    for col in cols:
        current_df = tsfel.time_series_features_extractor(
            cfg_file,
            input_df[col],  # extract features from this column only
            fs=50,
            window_splitter=True,
            window_size=window_size,
        )
        current_df = current_df.add_prefix(col)
        X_df_list.append(current_df)

    X = pd.concat(X_df_list, axis=1)
    return X
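
# A minimal usage sketch for auto_feature_engineering on a synthetic two-column
# DataFrame; the column names, signal length and window size are illustrative
# assumptions, not part of the original project:
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo_df = pd.DataFrame({
    "acc_x": rng.standard_normal(500),
    "acc_y": rng.standard_normal(500),
})
X_demo = auto_feature_engineering(demo_df, cols=["acc_x", "acc_y"],
                                  window_size=100)
print(X_demo.shape)  # one row per window, temporal features prefixed per column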
Example #3
def tsfel_calculator(x):

    import tsfel

    # Instantiate the calculation configuration (all feature domains)
    cfg_file = tsfel.get_features_by_domain()

    # Produce the feature calculations
    extracted_features = tsfel.time_series_features_extractor(cfg_file, x)

    return extracted_features
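
# A minimal usage sketch for tsfel_calculator on a synthetic signal; the
# signal itself is an illustrative assumption:
import numpy as np

demo_signal = np.sin(np.linspace(0, 10 * np.pi, 500))
demo_features = tsfel_calculator(demo_signal)
print(demo_features.shape)  # a single row of features for the whole signal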
Example #4
df = pd.read_sql_table('display', 'sqlite:///dissertation.db')

df_engineering = df.copy()
#df_engineering['id']=np.repeat(range(1,4033),360)
df_20weeks = df_engineering[:1209600]  # 20 weeks
#df_tsfresh=df_engineering[['time','id','kWh']]
df_tsfel = df_20weeks[['kWh']]

df_hour = df_20weeks[['kWh']] * 1000  # convert kWh to Wh
df_hour.rename(columns={'kWh': 'Wh'}, inplace=True)

import tsfel
#from tsfresh import extract_relevant_features
cfg = tsfel.get_features_by_domain()
X_train = tsfel.time_series_features_extractor(cfg,
                                               df_tsfel,
                                               window_splitter=True,
                                               window_size=8640)  # one day
print(list(X_train.columns))
X_train = X_train[[
    '0_Absolute energy', '0_Mean', '0_Max', '0_Standard deviation',
    '0_FFT mean coefficient_0', '0_Spectral kurtosis', '0_Skewness',
    '0_Zero crossing rate'
]]

X_hour = tsfel.time_series_features_extractor(cfg,
                                              df_hour,
                                              window_splitter=True,
                                              window_size=360)  # one hour
X_hour = X_hour[[
    '0_Absolute energy', '0_Mean', '0_Max', '0_Standard deviation',
    '0_FFT mean coefficient_0', '0_Spectral kurtosis', '0_Skewness',
    '0_Zero crossing rate'
]]
Example #5
# IMPORT
import pandas as pd
import tsfel
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('Data kategorier.csv',
                 skiprows=2,
                 index_col=['READ_TIME_Date'])
df.index = pd.to_datetime(df.index)

categories = [
    k for k in df.iloc[1:, :].dropna(axis=1).columns
    if 'Average' in k and 'huser' not in k and 'lejl' not in k
]
# Min-max scaling
scaler = MinMaxScaler(feature_range=(0, 1))
df[categories] = scaler.fit_transform(df[categories])

cfg_file = tsfel.get_features_by_domain()
feature_list = []
for category in categories:
    serie = df[category].dropna()
    X_train = tsfel.time_series_features_extractor(cfg_file,
                                                   serie,
                                                   fs=24,
                                                   window_splitter=True,
                                                   window_size=720)
    # Drop highly correlated features before collecting the result
    corr_features = tsfel.correlated_features(X_train)
    feature_list.append(X_train.drop(corr_features, axis=1))
Example #6
tam_janela = np.round(len(series) / tam)

series_split = np.array_split(series, tam_janela)
dataset = pd.DataFrame()
cont = 0
#with tqdm(total=len(series_split[atual:])) as pbar:
for i in series_split[atual:]:
    cont = cont + 1
    wandb.log({"Window (tot. " + str(len(series_split)) + ")": cont})
    result = adfuller(i)
    ADF = pd.DataFrame([result[0]])
    ADF_pvalue = pd.DataFrame([result[1]])
    wandb.log({"ADF_pvalue": result[1]})
    best_config = evaluate_models(i, p_values, d_values, q_values)
    best_config = pd.DataFrame([best_config])
    best_config.columns = ["p", "d", "q"]
    #try:
    i = i.reset_index(drop=False)
    features = tsfel.time_series_features_extractor(cfg, i, verbose=10)
    features = pd.concat([
        pd.DataFrame([file_]),
        pd.DataFrame([tam]), ADF, ADF_pvalue, features, best_config
    ],
                         axis=1,
                         ignore_index=False)
    #except:
    #    print(i.values)
    #    continue
    features.to_csv(output_file, mode='a', header=False)
    #pbar.update(1)
Example #7
    def makeDataset(self, signals=False, strategy="ad-hoc"):
    
        def adhoc(strategy):
            
            
            self.features = strategy
            if(strategy=="ad-hoc"):
                self.features = self.signalsToFeatures(signals)            

            featuresSpec = [] 
            for var in self.features:
                #print (var)
                x = var.split(" ")
                if len(x) == 2:
                    x.append(False)
                else:
                    x[2] = True
                featuresSpec.append(x)
                signals.append(x[0])            

            for feature in featuresSpec:
                var = feature[0]
                isDerivate = feature[2]
                featureType = feature[1]

                if(isDerivate == True):
                    temp = ""
                    if(self.includeFiltering == True):
                        temp = self.filterOutliers(chunk[var].diff(), self.n)
                    else:
                        temp = chunk[var].diff()                   

                    if featureType == "mean":
                        w.append(np.nanmean(temp))
                    elif featureType == "std":
                        w.append(np.nanstd(temp))
                    else:
                        w.append(np.nanpercentile(temp,int(featureType)))
                else:
                    temp = ""
                    if(self.includeFiltering == True):
                        temp = self.filterOutliers(chunk[var], self.n)
                    else:
                        temp = chunk[var]

                    if featureType == "mean":
                        w.append(np.nanmean(temp))
                    elif featureType == "std":
                        w.append(np.nanstd(temp))
                    else:
                        w.append(np.nanpercentile(temp,int(featureType)))
                            
            return w
        
        
        Id = []
        X = []
        Y = []

        for j, row in self.files_list.iterrows():            
            file = row["cycleName"]
            df = pd.read_csv(self.fpathCameoFiles + file, encoding="latin1", usecols=signals)
            
            chunks = self.makeChunks(df, int(df.shape[0]/(self.minuteWindow*60)))
            
            for chunk in chunks:
                chunk = chunk.fillna(method='bfill')
                #return chunk,"a","a"
                w = []
                if(strategy=="ad-hoc" or type(strategy) == list): w = adhoc(strategy)
                elif(strategy=="tsfel-all" or strategy=="tsfel-all-corr"): 
                    cfg = tsfel.get_features_by_domain()
                    w = tsfel.time_series_features_extractor(cfg, chunk)   
                    self.features = list(w.columns)
                elif(strategy=="tsfel-statistical"): 
                    cfg = tsfel.get_features_by_domain('statistical')
                    w = tsfel.time_series_features_extractor(cfg, chunk)  
                    self.features = list(w.columns)
                elif(strategy=="tsfel-temporal"): 
                    cfg = tsfel.get_features_by_domain('temporal')
                    w = tsfel.time_series_features_extractor(cfg, chunk)  
                    self.features = list(w.columns)
                elif(strategy=="vest"): 
                    model = BivariateVEST()
                    features = model.extract_features(df, pairwise_transformations=False, summary_operators=SUMMARY_OPERATIONS_SMALL)
                    w = multivariate_feature_extraction(df, apply_transform_operators=False, summary_operators=SUMMARY_OPERATIONS_SMALL)    
                    self.features = list(w.columns)
                
                X.append(w)
                Id.append(file)
                Y.append(row['label'])        
        
        
        if(strategy=="tsfel-all-corr"):
            dataset = pd.concat(X)
            to_drop = tsfel.correlated_features(dataset)
            dataset = dataset.drop(to_drop,axis=1)
            self.features = dataset.columns
            X = dataset
                
            
        return np.array(X), np.array(Y), np.array(Id)
Example #8
import os
import re
from datetime import datetime

import numpy as np
import tsfel

from features import extract_features, FEATURES

DATASET_DIR = './dataset'
STANDARD_RANGE = [
    0, 0.25, 0.5, 0.75, 1, 2.5, 5, 7.5, 10, 25, 50, 75, 100, 250, 500, 750,
    1000
]
COLS_ROI = ['ROI_{}'.format(n) for n in STANDARD_RANGE] + ['ROI_Other']
COLS_RAKE = ['RAKE_{}'.format(n) for n in STANDARD_RANGE] + ['RAKE_Other']
COLS_GAMES = ['GAMES_{}'.format(n) for n in STANDARD_RANGE] + ['GAMES_Other']

CFG = tsfel.get_features_by_domain()
TSFEL_COLS = [
    '{}'.format(n) for n in tsfel.time_series_features_extractor(
        CFG, np.random.rand(10), fs=1).columns
]
COLS_GAIN = ['GAIN_{}'.format(n[2:]) for n in TSFEL_COLS]
COLS_GPD = ['GPD_{}'.format(n[2:]) for n in TSFEL_COLS]
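
# Note on the n[2:] slice above: TSFEL names single-signal features with a
# "0_" prefix (e.g. '0_Mean', as seen in Example #4), so stripping the first
# two characters leaves the bare feature name. An illustrative check:
_example_col = '0_Mean'
print('GAIN_{}'.format(_example_col[2:]))  # -> GAIN_Mean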

CUSTOM = list(FEATURES.keys()) + ['TOTAL_RAKE', 'GAMES_PER_STEP']

COIN_FINDER = re.compile(r'[^\d.]+')

files = [f for f in os.walk(DATASET_DIR)][0][2]


def diff(ts):
    return [t_1 - t_0 for t_0, t_1 in zip(ts[:-1], ts[1:])]

Example #9
        print(t)

        path1_s = folderRealData + '/' + dataset + '/' + dataset + 'test_features_' + str(
            t) + '.tsv'
        path2_s = folderRealData + '/' + dataset + '/' + dataset + 'val_features_' + str(
            t) + '.tsv'
        path3_s = folderRealData + '/' + dataset + '/' + dataset + 'train_features_' + str(
            t) + '.tsv'
        path4_s = folderRealData + '/' + dataset + '/' + dataset + 'ep_features_' + str(
            t) + '.tsv'

        try:

            if not (os.path.exists(path1_s)):
                d1_X_feat = tsfel.time_series_features_extractor(cfg_file,
                                                                 d1_X[:, :t],
                                                                 fs=100)
                d1_X_feat.columns = [
                    'f' + str(ti) for ti in range(d1_X_feat.shape[1])
                ]
                d1_X_feat.replace([np.inf, -np.inf], np.nan, inplace=True)
                d1_X_feat = d1_X_feat.fillna(d1_X_feat.median())
                d1_X_feat.to_csv(path1_s, sep='\t', header=None, index=False)

            if not (os.path.exists(path2_s)):
                d2_X_feat = tsfel.time_series_features_extractor(cfg_file,
                                                                 d2_X[:, :t],
                                                                 fs=100)
                d2_X_feat.columns = [
                    'f' + str(ti) for ti in range(d2_X_feat.shape[1])
                ]
Example #10
        for date_range, yw in zip(date_ranges_w, yws):
            df_list = []
            for df in ori_df_list:
                print(date_range[0], date_range[1])
                filter_df = utils.filter_date(df, device_id, date_range[0], date_range[1]).drop(columns=['T', 'ParticipantId', 'FileCreationTime', 'DeviceId'])
                print(filter_df.head())
                df_list.append(filter_df)

            X_train_list = []
            col_list = []
            pre_X_train = None
            for i in range(len(df_list)):
                df = df_list[i]
                try:
                    cfg = tsfel.get_features_by_domain()
                    X_train = tsfel.time_series_features_extractor(cfg, df)
                    X_train_list.append(X_train)
                    col_names = X_train.columns
                    col_list.append(col_names)
                    pre_X_train = X_train
                except:
                    X_train_list.append(pre_X_train)
                    continue
            if len(col_list) != 0:
                prev_list = col_list[0]
                for col in col_list[1:]:
                    prev_list = set(prev_list).intersection(col)

                extracted_X_train = []
                for X_train in X_train_list:
                    new_train = X_train[list(prev_list)]
Example #11
settings0 = json.load(open(tsfel_path_json))
settings1 = json.load(open(personal_path_json))
settings2 = tsfel.get_features_by_domain('statistical')
settings3 = tsfel.get_features_by_domain('temporal')
settings4 = tsfel.get_features_by_domain('spectral')
settings5 = tsfel.get_features_by_domain()
settings6 = tsfel.extract_sheet('Features')
settings7 = tsfel.extract_sheet('Features_test', path_json=personal_path_json)

# Signal processing
data_new = tsfel.merge_time_series(sensor_data, resample_rate, time_unit)
windows = tsfel.signal_window_splitter(data_new, window_size, overlap)

# time_series_features_extractor
features0 = tsfel.time_series_features_extractor(settings4,
                                                 windows,
                                                 fs=resample_rate)
features1 = tsfel.time_series_features_extractor(settings2,
                                                 data_new,
                                                 fs=resample_rate,
                                                 window_size=70,
                                                 overlap=0.5)
features2 = tsfel.time_series_features_extractor(settings3,
                                                 windows,
                                                 fs=resample_rate)
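
# Example #11 above shows two windowing paths: pre-splitting with
# signal_window_splitter (features0, features2) and handing window_size/overlap
# to the extractor directly (features1). A minimal sketch of the pre-split path
# on a synthetic signal; the window length, overlap and sampling rate used here
# are illustrative assumptions:
import numpy as np

demo_sig = np.random.randn(1000)
demo_cfg = tsfel.get_features_by_domain('statistical')
demo_windows = tsfel.signal_window_splitter(demo_sig, window_size=100, overlap=0.5)
demo_feats = tsfel.time_series_features_extractor(demo_cfg, demo_windows, fs=100)
# demo_feats holds one row of statistical features per window.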

# Dataset features extractor
data = tsfel.dataset_features_extractor(main_directory,
                                        settings1,
                                        search_criteria=search_criteria,
                                        time_unit=time_unit,
Example #12
plt.figure()
plt.scatter(results_tsne[:,0], results_tsne[:,1],
    c=df_uci_pivot.index.get_level_values('cluster'),
    cmap=cmap,
    alpha=0.6,
    )


df_uci_pivot['week'] = pd.to_datetime(df_uci_pivot.index.get_level_values(0)).strftime('%W')
df['week'] = df.index.strftime('%W')
dailymean = df_uci_pivot.iloc[0:-1].mean(axis=1)
df_uci_pivot['rollingmean'] = dailymean.dropna().rolling(window=50).mean()

fig, ax = plt.subplots()
#plt.plot(df.week[1:],rolling_mean,label='30 days moving average consumption')
ax.scatter(pd.to_datetime(df_uci_pivot.index.get_level_values(0)),
           df_uci_pivot.rollingmean,
           c=df_uci_pivot.index.get_level_values('cluster'),
           cmap=cmap,
           alpha=0.6)
ax.set_xticks(np.arange(1, 25))
ax.set_ylabel('kiloWatts')
ax.set_title('30 days moving average consumption')
ax.annotate('', xy=(0, 1), xytext=(0, 1), fontsize=10)
ax.legend()


for name in groups:
    sb.lineplot(x=AvgInClasses.index[AvgInClasses.index.month == 1],
                y=AvgInClasses[name][AvgInClasses.index.month == 1])

cfg = tsfel.get_features_by_domain()
# Extract features
Features = tsfel.time_series_features_extractor(cfg, AvgInClasses)
pl.rc('font', family='serif', serif='Times')

#%%
#Don't forget to change this PATH
path = 'csv/'
dataframeempty = pd.DataFrame()
W = []
for csv_path in glob(path + 'Residential_*.csv'):
    df = pd.read_csv(csv_path)
    df.dropna(inplace=True)
    df.drop(['date'], axis=1, inplace=True)
    # Retrieves a pre-defined feature configuration file to extract all available features
    cfg = tsfel.get_features_by_domain()

    # Extract features
    X = tsfel.time_series_features_extractor(cfg, df)
    X['File Name'] = csv_path
    dataframeempty = pd.concat([dataframeempty, X])

    cfgx = tsfel.get_features_by_domain(domain="spectral")
    cfgs = tsfel.get_features_by_domain(domain="statistical")
    cfgt = tsfel.get_features_by_domain(domain="temporal")

    x = tsfel.time_series_features_extractor(cfg, df, verbose=0)
    xx = tsfel.time_series_features_extractor(cfgx, df, verbose=0)
    xs = tsfel.time_series_features_extractor(cfgs, df, verbose=0)
    xt = tsfel.time_series_features_extractor(cfgt, df, verbose=0)

    x = pd.concat([
        xs,
        xt,