"n": 2
        }, {
            "n": 3
        }],
        "autocorrelation": [{
            "lag": 2
        }, {
            "lag": 3
        }]
        # "value_count": #"large_standard_deviation": [{"r": 0.05}, {"r": 0.1}]
    }
# For convenience, three dictionaries are predefined and can be used right away
# ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters
# MinimalFCParameters is used by default
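# A hedged illustration (not part of the original script): any of the
# predefined settings objects could be assigned above instead, e.g.
#     extraction_settings = EfficientFCParameters()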
else:
    extraction_settings = MinimalFCParameters()

extracted_features = extract_features(
    dataframe,
    column_id=REF_COLUMN,
    column_sort=TIME_COLUMN,
    default_fc_parameters=extraction_settings)
extracted_features[REF_COLUMN] = extracted_features.index

# -------------------------------------------------------------
# Transfer data to the next tasks
#
dataframe_id = compress_and_transfer_dataframe(dataframe)
print("dataframe id (out): ", dataframe_id)

resultMetadata.put("task.name", __file__)
Code Example #2
from zoo.chronos.data.utils.feature import generate_dt_features, generate_global_features
from zoo.chronos.data.utils.impute import impute_timeseries_dataframe
from zoo.chronos.data.utils.deduplicate import deduplicate_timeseries_dataframe
from zoo.chronos.data.utils.roll import roll_timeseries_dataframe
from zoo.chronos.data.utils.scale import unscale_timeseries_numpy
from zoo.chronos.data.utils.resample import resample_timeseries_dataframe
from zoo.chronos.data.utils.split import split_timeseries_dataframe

from tsfresh.utilities.dataframe_functions import roll_time_series
from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters,\
    MinimalFCParameters, EfficientFCParameters
DEFAULT_PARAMS = {"comprehensive": ComprehensiveFCParameters(),
                  "minimal": MinimalFCParameters(),
                  "efficient": EfficientFCParameters()}

_DEFAULT_ID_COL_NAME = "id"
_DEFAULT_ID_PLACEHOLDER = "0"


class TSDataset:
    def __init__(self, data, **schema):
        '''
        TSDataset is an abstraction of a time series dataset.
        Cascading calls are supported for most of the transform methods.
        '''
        self.df = data
        self.id_col = schema["id_col"]
        self.dt_col = schema["dt_col"]
Code Example #3
File: tsdataset.py  Project: EmiCareOfCell44/BigDL
    def gen_global_feature(self,
                           settings="comprehensive",
                           full_settings=None,
                           n_jobs=1):
        '''
        Generate global features for each time series.
        This method is implemented with tsfresh.
        Make sure that the specified column name does not contain '__'.

        TODO: relationship with scale should be figured out.

        :param settings: str or dict. If a string, it must be one of "comprehensive",
               "minimal" and "efficient". If a dict, it should follow the instructions
               for default_fc_parameters in tsfresh. Defaults to "comprehensive".
        :param full_settings: dict. It should follow the instructions for
               kind_to_fc_parameters in tsfresh. Defaults to None.
        :param n_jobs: int. The number of processes to use for parallelization.

        :return: the tsdataset instance.
        '''
        from tsfresh import extract_features
        from tsfresh.feature_extraction import ComprehensiveFCParameters, \
            MinimalFCParameters, EfficientFCParameters

        DEFAULT_PARAMS = {
            "comprehensive": ComprehensiveFCParameters(),
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters()
        }

        assert not self._has_generate_agg_feature, \
            "Only one of gen_global_feature and gen_rolling_feature should be called."
        if full_settings is not None:
            self.df, additional_feature = \
                generate_global_features(input_df=self.df,
                                         column_id=self.id_col,
                                         column_sort=self.dt_col,
                                         kind_to_fc_parameters=full_settings,
                                         n_jobs=n_jobs)
            self.feature_col += additional_feature
            # Mark aggregation features as generated so gen_rolling_feature
            # cannot be called afterwards (matching the assert above).
            self._has_generate_agg_feature = True
            return self

        if isinstance(settings, str):
            assert settings in ['comprehensive', 'minimal', 'efficient'], \
                "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
                f", but found {settings}."
            default_fc_parameters = DEFAULT_PARAMS[settings]
        else:
            default_fc_parameters = settings

        self.df, additional_feature = \
            generate_global_features(input_df=self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     default_fc_parameters=default_fc_parameters,
                                     n_jobs=n_jobs)

        self.feature_col += additional_feature
        self._has_generate_agg_feature = True
        return self
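For orientation, a minimal usage sketch of gen_global_feature. The from_pandas constructor and the column names below are assumptions for illustration, not part of the snippet above:

import pandas as pd

# Hypothetical long-format input: one row per observation.
df = pd.DataFrame({"id": ["a"] * 30,
                   "datetime": pd.date_range("2021-01-01", periods=30),
                   "value": list(range(30))})
tsdata = TSDataset.from_pandas(df, id_col="id", dt_col="datetime",
                               target_col="value")
# Appends the tsfresh global features (here: the "minimal" set) as new
# columns and returns the same TSDataset instance for cascading calls.
tsdata.gen_global_feature(settings="minimal", n_jobs=1)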
Code Example #4
File: tsdataset.py  Project: EmiCareOfCell44/BigDL
    def gen_rolling_feature(self,
                            window_size,
                            settings="comprehensive",
                            full_settings=None,
                            n_jobs=1):
        '''
        Generate aggregation features for each sample.
        This method is implemented with tsfresh.
        Make sure that the specified column name does not contain '__'.

        TODO: relationship with scale should be figured out.

        :param window_size: int. The rolling window size; features are generated
               from each rolled window.
        :param settings: str or dict. If a string, it must be one of "comprehensive",
               "minimal" and "efficient". If a dict, it should follow the instructions
               for default_fc_parameters in tsfresh. Defaults to "comprehensive".
        :param full_settings: dict. It should follow the instructions for
               kind_to_fc_parameters in tsfresh. Defaults to None.
        :param n_jobs: int. The number of processes to use for parallelization.

        :return: the tsdataset instance.
        '''
        from tsfresh.utilities.dataframe_functions import roll_time_series
        from tsfresh.utilities.dataframe_functions import impute as impute_tsfresh
        from tsfresh import extract_features
        from tsfresh.feature_extraction import ComprehensiveFCParameters, \
            MinimalFCParameters, EfficientFCParameters

        DEFAULT_PARAMS = {
            "comprehensive": ComprehensiveFCParameters(),
            "minimal": MinimalFCParameters(),
            "efficient": EfficientFCParameters()
        }

        assert not self._has_generate_agg_feature,\
            "Only one of gen_global_feature and gen_rolling_feature should be called."
        if isinstance(settings, str):
            assert settings in ['comprehensive', 'minimal', 'efficient'], \
                "settings str should be one of 'comprehensive', 'minimal', 'efficient'"\
                f", but found {settings}."
            default_fc_parameters = DEFAULT_PARAMS[settings]
        else:
            default_fc_parameters = settings

        assert window_size < self.df.groupby(self.id_col).size().min() + 1, \
            "gen_rolling_feature should have a window_size no larger than the "\
            "shortest time series length."
        df_rolled = roll_time_series(self.df,
                                     column_id=self.id_col,
                                     column_sort=self.dt_col,
                                     max_timeshift=window_size - 1,
                                     min_timeshift=window_size - 1,
                                     n_jobs=n_jobs)
        if not full_settings:
            self.roll_feature_df = extract_features(
                df_rolled,
                column_id=self.id_col,
                column_sort=self.dt_col,
                default_fc_parameters=default_fc_parameters,
                n_jobs=n_jobs)
        else:
            self.roll_feature_df = extract_features(
                df_rolled,
                column_id=self.id_col,
                column_sort=self.dt_col,
                kind_to_fc_parameters=full_settings,
                n_jobs=n_jobs)
        impute_tsfresh(self.roll_feature_df)

        self.feature_col += list(self.roll_feature_df.columns)
        self.roll_additional_feature = list(self.roll_feature_df.columns)
        self._has_generate_agg_feature = True
        return self
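A hedged usage sketch, continuing the hypothetical tsdata from the previous example:

# Rolls each series into windows of length 4, extracts the minimal tsfresh
# feature set per window, and stores the result in roll_feature_df.
tsdata.gen_rolling_feature(window_size=4, settings="minimal", n_jobs=1)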
Code Example #5
import pandas as pd
import datetime
from tsfresh.feature_extraction import (extract_features, EfficientFCParameters,
                                        MinimalFCParameters,
                                        ComprehensiveFCParameters)

# settings = EfficientFCParameters()
settings = MinimalFCParameters()
# settings = ComprehensiveFCParameters()

def run_click():
    t_click = pd.read_csv('../data/t_click.csv')
    try:
        t_click_8_10 = t_click[t_click['click_time'] < '2016-11-01']
        t_click_9_11 = t_click[t_click['click_time'] > '2016-08-31']
        extracted_features_click = extract_features(
            t_click_8_10, column_id="uid", column_sort="click_time",
            default_fc_parameters=settings)
        extracted_features_click.to_pickle('extracted_features_click_8_10.pickle')
        extracted_features_click = extract_features(
            t_click_9_11, column_id="uid", column_sort="click_time",
            default_fc_parameters=settings)
        extracted_features_click.to_pickle('extracted_features_click_9_11.pickle')
    except Exception as e:
        print(e)

def run_loan():
    t_loan = pd.read_csv('../data/t_loan.csv')
    try:
        t_loan_8_10 = t_loan[t_loan['loan_time'] < '2016-11-01']
        t_loan_9_11 = t_loan[t_loan['loan_time'] > '2016-08-31']
        extracted_features_loan = extract_features(t_loan_8_10, column_id="uid", column_sort="loan_time")
        extracted_features_loan.to_pickle('extracted_features_loan_8_10.pickle')
        extracted_features_loan = extract_features(t_loan_9_11, column_id="uid", column_sort="loan_time")
        extracted_features_loan.to_pickle('extracted_features_loan_9_11.pickle')
    except Exception as e:
        print(e)
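As a self-contained illustration of the long-format input extract_features expects (synthetic rows standing in for t_click.csv, which is not shown here):

import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

# One row per observation; column_id groups rows into series and
# column_sort orders them within each series.
demo = pd.DataFrame({
    "uid": [1, 1, 1, 2, 2, 2],
    "click_time": list(pd.date_range("2016-09-01", periods=3)) * 2,
    "value": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
})
demo_features = extract_features(demo, column_id="uid",
                                 column_sort="click_time",
                                 default_fc_parameters=MinimalFCParameters())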
Code Example #6
#%% Imports
import numpy as np
import pandas as pd
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction import MinimalFCParameters


#%% Load data
data = np.load('../data/dataset.npy')
labels = ['test_num', 'location', 'x', 'y', 'z', 't', 'weight']
dataset = pd.DataFrame(data, columns=labels)
dataset['test_num'] = dataset['test_num'].astype(int)
y = np.load('../data/y.npy')
y = pd.Series(y.astype(int))

#%% Access the data method
def getTest(dataset, num):
    return dataset[dataset['test_num'] == num]

#%% Extract relevant features

features = extract_relevant_features(dataset, y, column_id='test_num',
                                     column_sort='t',
                                     default_fc_parameters=MinimalFCParameters())
features.to_pickle('../data/features.pkl')
Code Example #7
from tsfresh.feature_extraction import MinimalFCParameters  # used by TSFRESH_SETTINGS below
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier

from voting import VotingClassifier

MODEL_PATH = './Models/'
MODEL_NAMES = ['./Models/KNeighborsClassifier_model.pkl',
               './Models/DecisionTreeClassifier_model.pkl',
               './Models/RandomForestClassifier_model.pkl',
               './Models/AdaBoostClassifier_model.pkl',
               './Models/GradientBoostingClassifier_model.pkl',
               './Models/GaussianNB_model.pkl',
               './Models/LinearDiscriminantAnalysis_model.pkl',
               './Models/XGBClassifier_model.pkl']

TSFRESH_SETTINGS = MinimalFCParameters()
RAW_COLUMNS = ['attention', 'meditation', 'delta', 'theta', 'lowAlpha', 'highAlpha', 'lowBeta', 'highBeta', 'lowGamma', 'highGamma']
COLUMNS = ['delta', 'theta', 'lowAlpha', 'highAlpha', 'lowBeta', 'highBeta', 'lowGamma', 'highGamma']
NEW_COLUMNS = ['id', 'time', 'delta', 'theta', 'lowAlpha', 'highAlpha', 'lowBeta', 'highBeta', 'lowGamma', 'highGamma']
SEQ_SIZE = 8

class EmotionML(object): 
    def __init__(self): 
        self.models = None
        self.raw = None         # raw data from json.loads
        self.data = None        # pd dataframe
        self.cleaned = None     # data after cleaning
        self.sequences = None
        self.MLInput = None

    def load_data(self, json_data):
        ...  # body truncated in the source listing
Code Example #8
    kind_to_fc_parameters = {
        '0.0': fc_parameters,
        '1.0': fc_parameters,
        '2.0': fc_parameters,
        '3.0': fc_parameters,
        '4.0': fc_parameters,
        '5.0': fc_parameters
    }

    train_features4 = extract_features(
        sub_pb,
        column_id='object_id',
        column_value='flux',
        column_sort='mjd',
        column_kind='passband',
        default_fc_parameters=MinimalFCParameters())
    impute(train_features4)

    train_features5 = extract_features(
        sub_pb,
        column_id='object_id',
        column_value='flux',
        column_sort='mjd',
        column_kind='passband',
        default_fc_parameters=fc_parameters,
        kind_to_fc_parameters=kind_to_fc_parameters)
    impute(train_features5)

    # Task 5: model pruning
    training5_X = train_features5
    training5_Y = sub_meta['target']
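A minimal sketch of the kind_to_fc_parameters idea shown above: per-kind settings override the default settings for that kind. The data and the per-kind choices below are illustrative assumptions:

import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

long_df = pd.DataFrame({
    "object_id": [1, 1, 1, 1],
    "mjd": [0, 1, 0, 1],
    "passband": ["0.0", "0.0", "1.0", "1.0"],
    "flux": [1.0, 2.0, 3.0, 4.0],
})
feats = extract_features(
    long_df,
    column_id="object_id",
    column_sort="mjd",
    column_kind="passband",
    column_value="flux",
    # Kind "0.0" gets the minimal feature set; kind "1.0" only "maximum".
    kind_to_fc_parameters={"0.0": MinimalFCParameters(),
                           "1.0": {"maximum": None}})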
Code Example #9
File: datautils.py  Project: williamhaw/grab_safety
def preprocess_data(raw_features, labels):
    """

	Preprocesses the data by:
		1. Removing duplicate labels and joining both dataframes on bookingID
		2. Unflatten trips by making each trip one row (all data points for that column is put into a list sorted by time)
		3. Computing aggregate features from raw features using tsfresh and joining to combined dataframe
		4. Computing derivatives of certain columns and their aggregates
		5. Returning time series features and aggregate features as separate dataframes

	Warning: this takes a while (10+ mins on all the given data)
	

	Returns:
	pd.DataFrame: bookingIDs
	pd.DataFrame: labels
	pd.DataFrame: aggregate_features
	pd.DataFrame: timeseries_features

	"""

    # drop all trips with duplicate labels since there are only 26 of them
    deduplicated_labels = labels.drop_duplicates(
        subset='bookingID',
        keep=False).set_index(keys=['bookingID']).sort_index()
    filtered_features = raw_features[raw_features.bookingID.isin(
        deduplicated_labels.index)]

    # make each time series into a list in its own row
    timeseries_features = raw_features.sort_values(
        by='second').groupby('bookingID').agg(list).reset_index()
    combined = timeseries_features.join(deduplicated_labels,
                                        on='bookingID',
                                        how='inner')

    # tsfresh gives features which are aggregates of the time series,
    # e.g. mean, median, standard deviation, etc.
    extracted_features = extract_features(
        raw_features,
        column_id="bookingID",
        column_sort="second",
        default_fc_parameters=MinimalFCParameters())
    combined = combined.merge(extracted_features,
                              left_on="bookingID",
                              right_index=True,
                              how="inner")

    # get derivatives such as jerk (second order derivative of velocity)
    combined = combined.assign(
        jerk_x=get_derivative_list(combined, 'acceleration_x'),
        jerk_y=get_derivative_list(combined, 'acceleration_y'),
        jerk_z=get_derivative_list(combined, 'acceleration_z'),
        gyro_accel_x=get_derivative_list(combined, 'gyro_x'),
        gyro_accel_y=get_derivative_list(combined, 'gyro_y'),
        gyro_accel_z=get_derivative_list(combined, 'gyro_z'),
        gps_accel=get_derivative_list(combined, 'Speed'))

    # compute aggregates for the derivative columns
    derivative_columns = ['jerk_x', 'jerk_y', 'jerk_z',
                          'gyro_accel_x', 'gyro_accel_y', 'gyro_accel_z',
                          'gps_accel']
    aggregations = [('min', min), ('max', max), ('mean', mean),
                    ('median', median), ('stdev', stdev)]
    for column in derivative_columns:
        for agg_name, agg_func in aggregations:
            combined[agg_name + '_' + column] = np.vectorize(agg_func)(
                combined[column])

    # prepare output
    booking_id = combined['bookingID']
    labels = combined['label']
    aggregate_features_columns = [
        'min_jerk_x', 'min_jerk_y', 'min_jerk_z', 'max_jerk_x', 'max_jerk_y',
        'max_jerk_z', 'mean_jerk_x', 'mean_jerk_y', 'mean_jerk_z',
        'median_jerk_x', 'median_jerk_y', 'median_jerk_z', 'stdev_jerk_x',
        'stdev_jerk_y', 'stdev_jerk_z', 'min_gyro_accel_x', 'min_gyro_accel_y',
        'min_gyro_accel_z', 'max_gyro_accel_x', 'max_gyro_accel_y',
        'max_gyro_accel_z', 'mean_gyro_accel_x', 'mean_gyro_accel_y',
        'mean_gyro_accel_z', 'median_gyro_accel_x', 'median_gyro_accel_y',
        'median_gyro_accel_z', 'stdev_gyro_accel_x', 'stdev_gyro_accel_y',
        'stdev_gyro_accel_z', 'min_gps_accel', 'max_gps_accel',
        'mean_gps_accel', 'median_gps_accel', 'stdev_gps_accel'
    ]

    final_timeseries_features = combined.drop(
        ['bookingID', 'label', 'second'] + list(extracted_features.columns) +
        aggregate_features_columns,
        axis=1)
    aggregate_features = combined[list(extracted_features.columns) +
                                  aggregate_features_columns]

    return booking_id, labels, aggregate_features, final_timeseries_features
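A hedged driver sketch for preprocess_data; the CSV paths are assumptions, and get_derivative_list plus the mean/median/stdev helpers are assumed to be defined elsewhere in datautils.py:

import pandas as pd

raw_features = pd.read_csv('features.csv')  # hypothetical telematics readings
labels = pd.read_csv('labels.csv')          # hypothetical bookingID -> label map
booking_id, y, aggregate_features, timeseries_features = \
    preprocess_data(raw_features, labels)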