"n": 2 }, { "n": 3 }], "autocorrelation": [{ "lag": 2 }, { "lag": 3 }] # "value_count": #"large_standard_deviation": [{"r": 0.05}, {"r": 0.1}] } # For convenience, three dictionaries are predefined and can be used right away # ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters # MinimalFCParameters is used by default else: extraction_settings = MinimalFCParameters() extracted_features = extract_features( dataframe, column_id=REF_COLUMN, column_sort=TIME_COLUMN, default_fc_parameters=extraction_settings) extracted_features[REF_COLUMN] = extracted_features.index # ------------------------------------------------------------- # Transfer data to the next tasks # dataframe_id = compress_and_transfer_dataframe(dataframe) print("dataframe id (out): ", dataframe_id) resultMetadata.put("task.name", __file__)
from zoo.chronos.data.utils.feature import generate_dt_features, generate_global_features from zoo.chronos.data.utils.impute import impute_timeseries_dataframe from zoo.chronos.data.utils.deduplicate import deduplicate_timeseries_dataframe from zoo.chronos.data.utils.roll import roll_timeseries_dataframe from zoo.chronos.data.utils.scale import unscale_timeseries_numpy from zoo.chronos.data.utils.resample import resample_timeseries_dataframe from zoo.chronos.data.utils.split import split_timeseries_dataframe from tsfresh.utilities.dataframe_functions import roll_time_series from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh from tsfresh import extract_features from tsfresh.feature_extraction import ComprehensiveFCParameters,\ MinimalFCParameters, EfficientFCParameters DEFAULT_PARAMS = {"comprehensive": ComprehensiveFCParameters(), "minimal": MinimalFCParameters(), "efficient": EfficientFCParameters()} _DEFAULT_ID_COL_NAME = "id" _DEFAULT_ID_PLACEHOLDER = "0" class TSDataset: def __init__(self, data, **schema): ''' TSDataset is an abstract of time series dataset. Cascade call is supported for most of the transform methods. ''' self.df = data self.id_col = schema["id_col"] self.dt_col = schema["dt_col"]
def gen_global_feature(self, settings="comprehensive", full_settings=None, n_jobs=1):
    '''
    Generate per-time-series features for each time series.
    This method is implemented with tsfresh.
    Make sure that the specified column name does not contain '__'.

    TODO: relationship with scale should be figured out.

    :param settings: str or dict. If a string is set, then it must be one of
           "comprehensive", "minimal" and "efficient". If a dict is set, then it
           should follow the instruction for default_fc_parameters in tsfresh.
           The value is defaulted to "comprehensive".
    :param full_settings: dict. It should follow the instruction for
           kind_to_fc_parameters in tsfresh. The value is defaulted to None.
           When provided, it takes precedence over `settings`.
    :param n_jobs: int. The number of processes to use for parallelization.

    :return: the tsdataset instance.
    '''
    # Local import: tsfresh is heavy, only pay for it when the feature is used.
    from tsfresh.feature_extraction import ComprehensiveFCParameters, \
        MinimalFCParameters, EfficientFCParameters
    default_params = {
        "comprehensive": ComprehensiveFCParameters(),
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters()
    }

    assert not self._has_generate_agg_feature, \
        "Only one of gen_global_feature and gen_rolling_feature should be called."
    if full_settings is not None:
        self.df,\
            additional_feature =\
            generate_global_features(input_df=self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     kind_to_fc_parameters=full_settings,
                                     n_jobs=n_jobs)
        self.feature_col += additional_feature
        # Bug fix: this branch previously returned without marking the flag,
        # which let gen_rolling_feature be called afterwards despite the
        # mutual-exclusion assert. Keep it consistent with the branch below.
        self._has_generate_agg_feature = True
        return self

    if isinstance(settings, str):
        assert settings in ['comprehensive', 'minimal', 'efficient'], \
            "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
            f", but found {settings}."
        default_fc_parameters = default_params[settings]
    else:
        # A dict is passed straight through as tsfresh default_fc_parameters.
        default_fc_parameters = settings

    self.df,\
        additional_feature =\
        generate_global_features(input_df=self.df,
                                 column_id=self.id_col,
                                 column_sort=self.dt_col,
                                 default_fc_parameters=default_fc_parameters,
                                 n_jobs=n_jobs)
    self.feature_col += additional_feature
    self._has_generate_agg_feature = True
    return self
def gen_rolling_feature(self, window_size, settings="comprehensive", full_settings=None, n_jobs=1):
    '''
    Generate aggregation features over a rolling window for each sample.
    Implemented on top of tsfresh; the specified column names must not
    contain '__'.

    TODO: relationship with scale should be figured out.

    :param window_size: int, the rolling window length used for feature
           generation; must be shorter than the shortest time series.
    :param settings: str or dict. A string must be one of "comprehensive",
           "minimal" and "efficient"; a dict follows tsfresh's
           default_fc_parameters format. Defaults to "comprehensive".
    :param full_settings: dict following tsfresh's kind_to_fc_parameters
           format. Defaults to None; when set it overrides `settings`.
    :param n_jobs: int. The number of processes to use for parallelization.

    :return: the tsdataset instance.
    '''
    # Lazily import tsfresh so it is only required when this method is used.
    from tsfresh.utilities.dataframe_functions import roll_time_series
    from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
    from tsfresh import extract_features
    from tsfresh.feature_extraction import ComprehensiveFCParameters, \
        MinimalFCParameters, EfficientFCParameters

    preset_params = {
        "comprehensive": ComprehensiveFCParameters(),
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters(),
    }

    assert not self._has_generate_agg_feature,\
        "Only one of gen_global_feature and gen_rolling_feature should be called."

    if isinstance(settings, str):
        assert settings in ['comprehensive', 'minimal', 'efficient'], \
            "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
            f", but found {settings}."
        default_fc_parameters = preset_params[settings]
    else:
        default_fc_parameters = settings

    shortest_series_len = self.df.groupby(self.id_col).size().min()
    assert window_size < shortest_series_len + 1, "gen_rolling_feature "\
        "should have a window_size smaller than shortest time series length."

    rolled = roll_time_series(self.df,
                              column_id=self.id_col,
                              column_sort=self.dt_col,
                              max_timeshift=window_size - 1,
                              min_timeshift=window_size - 1,
                              n_jobs=n_jobs)

    # Select the tsfresh parameter set: kind_to_fc_parameters wins when given.
    if full_settings:
        fc_kwargs = {"kind_to_fc_parameters": full_settings}
    else:
        fc_kwargs = {"default_fc_parameters": default_fc_parameters}
    self.roll_feature_df = extract_features(rolled,
                                            column_id=self.id_col,
                                            column_sort=self.dt_col,
                                            n_jobs=n_jobs,
                                            **fc_kwargs)
    # In-place cleanup of the extracted feature frame.
    impute_tsfresh(self.roll_feature_df)

    self.feature_col += list(self.roll_feature_df.columns)
    self.roll_additional_feature = list(self.roll_feature_df.columns)
    self._has_generate_agg_feature = True
    return self
import pandas as pd
import datetime
from tsfresh.feature_extraction import extract_features, EfficientFCParameters, MinimalFCParameters, ComprehensiveFCParameters

# Feature-extraction preset shared by every extraction run in this script.
# Swap the comment to trade speed for feature coverage.
# settings = EfficientFCParameters()
settings = MinimalFCParameters()
# settings = ComprehensiveFCParameters()


def run_click():
    """Extract tsfresh features from the click log for two overlapping
    date windows (Aug-Oct and Sep-Nov) and pickle each result.

    Errors are reported and swallowed so a failure in one window does not
    abort the whole batch.
    """
    t_click = pd.read_csv('../data/t_click.csv')
    try:
        t_click_8_10 = t_click[t_click['click_time'] < '2016-11-01']
        t_click_9_11 = t_click[t_click['click_time'] > '2016-08-31']
        extracted_features_click = extract_features(
            t_click_8_10, column_id="uid", column_sort="click_time",
            default_fc_parameters=settings)
        extracted_features_click.to_pickle('extracted_features_click_8_10.pickle')
        extracted_features_click = extract_features(
            t_click_9_11, column_id="uid", column_sort="click_time",
            default_fc_parameters=settings)
        extracted_features_click.to_pickle('extracted_features_click_9_11.pickle')
    except Exception as e:
        print(e)


def run_loan():
    """Extract tsfresh features from the loan log for the same two date
    windows as ``run_click`` and pickle each result.
    """
    t_loan = pd.read_csv('../data/t_loan.csv')
    try:
        t_loan_8_10 = t_loan[t_loan['loan_time'] < '2016-11-01']
        t_loan_9_11 = t_loan[t_loan['loan_time'] > '2016-08-31']
        # Consistency fix: pass the module-level preset like run_click does;
        # previously this fell back to tsfresh's (comprehensive) default,
        # producing a different feature set for loans than for clicks.
        extracted_features_loan = extract_features(
            t_loan_8_10, column_id="uid", column_sort="loan_time",
            default_fc_parameters=settings)
        extracted_features_loan.to_pickle('extracted_features_loan_8_10.pickle')
        extracted_features_loan = extract_features(
            t_loan_9_11, column_id="uid", column_sort="loan_time",
            default_fc_parameters=settings)
        extracted_features_loan.to_pickle('extracted_features_loan_9_11.pickle')
    except Exception as e:
        print(e)
#%% Imports
import numpy as np
import pandas as pd
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction import MinimalFCParameters

#%% Load data
# One row per sample; the columns are named by `labels` below.
data = np.load('../data/dataset.npy')
labels = ['test_num', 'location', 'x', 'y', 'z', 't', 'weight']
columns = {name: data[:, idx] for idx, name in enumerate(labels)}
columns['test_num'] = columns['test_num'].astype(int)
dataset = pd.DataFrame(columns)

# Target labels, one per test_num.
y = pd.Series(np.load('../data/y.npy').astype(int))

#%% Access the data method
def getTest(dataset, num):
    """Return the rows of `dataset` that belong to test number `num`."""
    return dataset[dataset['test_num'] == num]

#%% Extract relevant features
# Keep only the features that tsfresh deems statistically relevant for `y`.
features = extract_relevant_features(dataset,
                                     y,
                                     column_id='test_num',
                                     column_sort='t',
                                     default_fc_parameters=MinimalFCParameters())
features.to_pickle('../data/features.pkl')
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from xgboost import XGBClassifier from voting import VotingClassifier MODEL_PATH = './Models/' MODEL_NAMES = ['./Models/KNeighborsClassifier_model.pkl', './Models/DecisionTreeClassifier_model.pkl', './Models/RandomForestClassifier_model.pkl', './Models/AdaBoostClassifier_model.pkl', './Models/GradientBoostingClassifier_model.pkl', './Models/GaussianNB_model.pkl', './Models/LinearDiscriminantAnalysis_model.pkl', './Models/XGBClassifier_model.pkl'] TSFRESH_SETTINGS = MinimalFCParameters() RAW_COLUMNS = ['attention', 'meditation', 'delta', 'theta', 'lowAlpha', 'highAlpha', 'lowBeta', 'highBeta', 'lowGamma', 'highGamma'] COLUMNS = ['delta', 'theta', 'lowAlpha', 'highAlpha', 'lowBeta', 'highBeta', 'lowGamma', 'highGamma'] NEW_COLUMNS = ['id', 'time', 'delta', 'theta', 'lowAlpha', 'highAlpha', 'lowBeta', 'highBeta', 'lowGamma', 'highGamma'] SEQ_SIZE = 8 class EmotionML(object): def __init__(self): self.models = None self.raw = None # raw data from json.loads self.data = None # pd dataframe self.cleaned = None # data after cleaning self.sequences = None self.MLInput = None def load_data(self, json_data):
# Apply the same feature-calculator settings to every passband "kind".
# Keys are the stringified passband ids that `column_kind='passband'`
# produces (floats rendered as strings).
# NOTE(review): `sub_pb`, `fc_parameters`, `impute` and `sub_meta` are
# defined earlier in the file (outside this view) — presumably sub_pb is a
# long-format light-curve frame; verify against the surrounding code.
kind_to_fc_parameters = {
    '0.0': fc_parameters,
    '1.0': fc_parameters,
    '2.0': fc_parameters,
    '3.0': fc_parameters,
    '4.0': fc_parameters,
    '5.0': fc_parameters
}
# Baseline extraction with tsfresh's minimal preset, one feature column
# group per passband kind.
train_features4 = extract_features(
    sub_pb,
    column_id='object_id',
    column_value='flux',
    column_sort='mjd',
    column_kind='passband',
    default_fc_parameters=MinimalFCParameters())
# Clean the extracted frame in place (tsfresh's impute presumably replaces
# NaN/inf with finite values — confirm which `impute` is in scope).
impute(train_features4)
# Second extraction using the custom per-kind settings above.
train_features5 = extract_features(
    sub_pb,
    column_id='object_id',
    column_value='flux',
    column_sort='mjd',
    column_kind='passband',
    default_fc_parameters=fc_parameters,
    kind_to_fc_parameters=kind_to_fc_parameters)
impute(train_features5)
#Task5 model pruning
# Training matrix/labels for the pruning experiment; `target` column comes
# from the metadata frame.
training5_X = train_features5
training5_Y = sub_meta['target']
def preprocess_data(raw_features, labels):
    """
    Preprocesses the data by:
    1. Removing duplicate labels and joining both dataframes on bookingID
    2. Unflatten trips by making each trip one row (all data points for that
       column is put into a list sorted by time)
    3. Computing aggregate features from raw features using tsfresh and
       joining to combined dataframe
    4. Computing derivatives of certain columns and their aggregates
    5. Returning time series features and aggregate features as separate
       dataframes

    Warning: this takes a while (10+ mins on all the given data)

    Returns:
        pd.DataFrame: bookingIDs
        pd.DataFrame: labels
        pd.DataFrame: aggregate_features
        pd.DataFrame: timeseries_features
    """
    # drop all trips with duplicate labels since there are only 26 of them
    deduplicated_labels = labels.drop_duplicates(
        subset='bookingID',
        keep=False).set_index(keys=['bookingID']).sort_index()
    filtered_features = raw_features[raw_features.bookingID.isin(
        deduplicated_labels.index)]

    # make each time series into a list in its row.
    # Fix: use filtered_features here and below — the original computed it
    # but kept operating on raw_features; the later inner joins discarded
    # the extra bookingIDs anyway, so the output is identical, but this
    # avoids expensive tsfresh work on dropped trips.
    timeseries_features = filtered_features.sort_values(
        by='second').groupby('bookingID').agg(list).reset_index()
    combined = timeseries_features.join(deduplicated_labels,
                                        on='bookingID',
                                        how='inner')

    # tsfresh gives features which are aggregates of the time series,
    # e.g. mean, median, standard deviation, etc
    extracted_features = extract_features(
        filtered_features,
        column_id="bookingID",
        column_sort="second",
        default_fc_parameters=MinimalFCParameters())
    combined = combined.merge(extracted_features,
                              left_on="bookingID",
                              right_index=True,
                              how="inner")

    # get derivatives such as jerk (second order derivative of velocity);
    # dict order matters — assign() adds columns in this order.
    derivative_sources = {
        'jerk_x': 'acceleration_x',
        'jerk_y': 'acceleration_y',
        'jerk_z': 'acceleration_z',
        'gyro_accel_x': 'gyro_x',
        'gyro_accel_y': 'gyro_y',
        'gyro_accel_z': 'gyro_z',
        'gps_accel': 'Speed',
    }
    combined = combined.assign(**{
        derived: get_derivative_list(combined, source)
        for derived, source in derivative_sources.items()
    })

    # compute aggregates for derivative columns. The loop replaces ~35
    # copy-pasted statements; groups and stat order reproduce the original
    # column order exactly (all stats over jerk axes, then gyro, then gps).
    derivative_groups = [
        ['jerk_x', 'jerk_y', 'jerk_z'],
        ['gyro_accel_x', 'gyro_accel_y', 'gyro_accel_z'],
        ['gps_accel'],
    ]
    stats = [('min', min), ('max', max), ('mean', mean),
             ('median', median), ('stdev', stdev)]
    aggregate_features_columns = []
    for group in derivative_groups:
        for stat_name, stat_fn in stats:
            for col in group:
                agg_col = '{}_{}'.format(stat_name, col)
                combined[agg_col] = np.vectorize(stat_fn)(combined[col])
                aggregate_features_columns.append(agg_col)

    # prepare output
    booking_id = combined['bookingID']
    labels = combined['label']
    final_timeseries_features = combined.drop(
        ['bookingID', 'label', 'second'] + list(extracted_features.columns) +
        aggregate_features_columns,
        axis=1)
    aggregate_features = combined[list(extracted_features.columns) +
                                  aggregate_features_columns]
    return booking_id, labels, aggregate_features, final_timeseries_features