Example 1
    def run(self):
        input_file = self._get_input_targets("data.csv")[0]

        with input_file.open("r") as f:
            df = pd.read_csv(f)

        start_time = time()
        extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
                         default_fc_parameters=self.feature_parameter,
                         disable_progressbar=True)
        end_time = time()

        single_parameter_name = list(self.feature_parameter.keys())[0]
        single_parameter_params = self.feature_parameter[single_parameter_name]

        result_json = {
            "time": end_time - start_time,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "feature": single_parameter_name,
            "number_parameters": len(single_parameter_params) if single_parameter_params else 0,
            "time_series_length": int((df["id"] == 0).sum()),
            "try_number": self.try_number,
        }

        with self._get_output_target("result.json").open("w") as f:
            json.dump(result_json, f)
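
The benchmark above times a single extract_features call. A minimal, self-contained sketch of the same call pattern (hypothetical toy data; the literal {"mean": None} stands in for self.feature_parameter):

import pandas as pd
from tsfresh import extract_features

# Toy long-format input: two series (id 0 and 1), ten points each.
toy = pd.DataFrame({
    "id": [0] * 10 + [1] * 10,
    "time": list(range(10)) * 2,
    "value": [float(v) for v in range(20)],
})

# One parameter-free calculator; mirrors the benchmarked call above.
feats = extract_features(toy, column_id="id", column_sort="time",
                         default_fc_parameters={"mean": None},
                         disable_progressbar=True)
print(feats)  # one row per id, one "value__mean" column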
Example 2
    def encode(self, column_data):
        """
        Encode a column data into time series

        :param column_data: a list of time series data, e.g. ['91.0 92.0 93.0 94.0', '92.0 93.0 94.0 95.0', ...]
        :return: a torch.FloatTensor
        """

        ret = []
        default_fc_parameters = MinimalFCParameters()
        all_values = []

        for i, values in enumerate(column_data):
            if values is None:
                values = [0] * self.max_series_len
            elif isinstance(values, list):
                values = list(map(float, values))
            else:
                values = list(map(float, values.split(' ')))

            all_values.append(values)
            df = pd.DataFrame({
                'main_feature': values,
                'id': [1] * len(values)
            })

            try:
                features = extract_features(
                    df,
                    column_id='id',
                    disable_progressbar=True,
                    default_fc_parameters=default_fc_parameters,
                    n_jobs=self.n_jobs)
            except Exception:
                # Parallel extraction can fail in some environments;
                # fall back to a single job and retry.
                self.n_jobs = 1
                features = extract_features(
                    df,
                    column_id='id',
                    disable_progressbar=True,
                    default_fc_parameters=default_fc_parameters,
                    n_jobs=self.n_jobs)

            features.fillna(value=0, inplace=True)

            features = list(features.iloc[0])
            ret.append(features)

        for i, values in enumerate(all_values):
            while len(values) < self.max_series_len:
                values.append(0)

            encoded_values = self.numerical_encoder.encode(values)

            encoded_numbers_list = []
            for pair in encoded_values.tolist():
                encoded_numbers_list.extend(pair)

            ret[i].extend(encoded_numbers_list)

        return self._pytorch_wrapper(ret)
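
MinimalFCParameters() above restricts extraction to a handful of cheap calculators, which keeps the per-row extract_features call fast. A quick way to inspect what it computes (the exact key set varies across tsfresh versions):

from tsfresh.feature_extraction import MinimalFCParameters

# MinimalFCParameters behaves like a dict of calculator names, e.g.
# 'sum_values', 'median', 'mean', 'length', 'standard_deviation',
# 'variance', 'maximum', 'minimum' in most versions.
print(sorted(MinimalFCParameters()))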
Example 3
def featurize(df, df_meta, aggs, fcp):
    """
    Extracting Features from train set
    Features from olivier's kernel
    very smart and powerful feature that is generously given here
    https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538
    per passband features with tsfresh library. fft features added to capture periodicity
    https://www.kaggle.com/c/PLAsTiCC-2018/discussion/70346#415506
    """

    df = process_flux(df)

    agg_df = df.groupby(OBJECT_ID).agg(aggs)
    agg_df.columns = [
        '{}_{}'.format(k, agg) for k in aggs.keys() for agg in aggs[k]
    ]
    agg_df = process_flux_agg(agg_df)
    df.sort_values('mjd', inplace=True)
    default_params = dict(column_id=OBJECT_ID,
                          disable_progressbar=True,
                          column_sort='mjd')
    # Add more features with tsfresh
    agg_df_ts_flux_passband = extract_features(
        df,
        column_kind='passband',
        column_value='flux',
        default_fc_parameters=fcp['flux_passband'],
        **default_params)

    agg_df_ts_flux = extract_features(df,
                                      column_value='flux',
                                      default_fc_parameters=fcp['flux'],
                                      **default_params)

    agg_df_ts_flux_by_flux_ratio_sq = extract_features(
        df,
        column_value='flux_by_flux_ratio_sq',
        default_fc_parameters=fcp['flux_by_flux_ratio_sq'],
        **default_params)

    # https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538
    # dt[detected==1, mjd_diff:=max(mjd)-min(mjd), by=object_id]
    df_det = df[df['detected'] == 1].copy()
    agg_df_mjd = extract_features(df_det,
                                  column_id=OBJECT_ID,
                                  column_value='mjd',
                                  default_fc_parameters=fcp['mjd'],
                                  disable_progressbar=True)
    agg_df_mjd['mjd_diff_det'] = agg_df_mjd[
        'mjd__maximum'].values - agg_df_mjd['mjd__minimum'].values
    del agg_df_mjd['mjd__maximum'], agg_df_mjd['mjd__minimum']
    agg_df_ts = pd.concat([
        agg_df, agg_df_ts_flux_passband, agg_df_ts_flux,
        agg_df_ts_flux_by_flux_ratio_sq, agg_df_mjd
    ],
                          axis=1).rename_axis(OBJECT_ID).reset_index()

    result = agg_df_ts.merge(right=df_meta, how='left', on=OBJECT_ID)
    return result
def diff_featurize(diff_df, df_meta, diff_aggs, fcp, n_jobs=36):
    """
    Extracting Features from train set
    Features from olivier's kernel
    very smart and powerful feature that is generously given here https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538
    per passband features with tsfresh library. fft features added to capture periodicity https://www.kaggle.com/c/PLAsTiCC-2018/discussion/70346#415506
    """

    # df = train.copy()

    # diff_df = process_flux(diff_df)

    diff_agg_df = diff_df.groupby('object_id').agg(diff_aggs)
    diff_agg_df.columns = [
        '{}_{}'.format(k, agg) for k in diff_aggs.keys()
        for agg in diff_aggs[k]
    ]
    # diff_agg_df = process_flux_agg(diff_agg_df) # new feature to play with tsfresh

    # Add more features with tsfresh
    diff_agg_df_ts_flux_passband = extract_features(
        diff_df,
        column_id='object_id',
        column_sort='mjd',
        column_kind='passband',
        column_value='flux_diff',
        default_fc_parameters=fcp['flux_passband'],
        n_jobs=n_jobs)

    diff_agg_df_ts_flux = extract_features(diff_df,
                                           column_id='object_id',
                                           column_value='flux_diff',
                                           default_fc_parameters=fcp['flux'],
                                           n_jobs=n_jobs)

    diff_agg_df_ts_flux_passband.index.rename('object_id', inplace=True)
    diff_agg_df_ts_flux_passband.columns = [
        column + '_diff' for column in diff_agg_df_ts_flux_passband.columns
    ]
    diff_agg_df_ts_flux.index.rename('object_id', inplace=True)
    # agg_df_ts_flux_by_flux_ratio_sq.index.rename('object_id', inplace=True)
    # agg_df_mjd.index.rename('object_id', inplace=True)
    diff_agg_df_ts = pd.concat(
        [
            diff_agg_df,
            diff_agg_df_ts_flux_passband,
            diff_agg_df_ts_flux,
            # agg_df_ts_flux_by_flux_ratio_sq,
            # agg_df_mjd
        ],
        axis=1).reset_index()

    # result = agg_df_ts.merge(right=df_meta, how='left', on='object_id')
    result = diff_agg_df_ts
    return result
Example 5
def run_loan():
    t_loan = pd.read_csv('../data/t_loan.csv')
    try:
        t_loan_8_10 = t_loan[t_loan['loan_time'] < '2016-11-01']
        t_loan_9_11 = t_loan[t_loan['loan_time'] > '2016-08-31']
        extracted_features_loan = extract_features(t_loan_8_10, column_id="uid", column_sort="loan_time")
        extracted_features_loan.to_pickle('extracted_features_loan_8_10.pickle')
        extracted_features_loan = extract_features(t_loan_9_11, column_id="uid", column_sort="loan_time")
        extracted_features_loan.to_pickle('extracted_features_loan_9_11.pickle')
    except Exception as e:
        print(e)
Example 6
def run_click():
    t_click = pd.read_csv('../data/t_click.csv')
    try:
        t_click_8_10 = t_click[t_click['click_time'] < '2016-11-01']
        t_click_9_11 = t_click[t_click['click_time'] > '2016-08-31']
        extracted_features_click = extract_features(t_click_8_10, column_id="uid", column_sort="click_time", default_fc_parameters=settings)
        extracted_features_click.to_pickle('extracted_features_click_8_10.pickle')
        extracted_features_click = extract_features(t_click_9_11, column_id="uid", column_sort="click_time", default_fc_parameters=settings)
        extracted_features_click.to_pickle('extracted_features_click_9_11.pickle')
    except Exception as e:
        print(e)
Example 7
def agg_engineer_features_merge(df, df_meta, aggs, feature_spec):

    df = process_flux(df)
    aggregate = df.groupby('object_id').agg(aggs)
    aggregate.columns = [
        '{}_{}'.format(k, agg) for k in aggs.keys() for agg in aggs[k]
    ]

    # Extract some features (mentioned in report), mainly moments, ranges, means, and extrema of flux, flux ratios and passband data
    aggregate = process_flux_agg(aggregate)
    aggregate_ts_flux_passband = extract_features(
        df,
        column_id='object_id',
        column_sort='mjd',
        column_kind='passband',
        column_value='flux',
        default_fc_parameters=feature_spec['flux_passband'])

    aggregate_ts_flux = extract_features(
        df,
        column_id='object_id',
        column_value='flux',
        default_fc_parameters=feature_spec['flux'])

    aggregate_ts_flux_by_flux_ratio_sq = extract_features(
        df,
        column_id='object_id',
        column_value='flux_by_flux_ratio_sq',
        default_fc_parameters=feature_spec['flux_by_flux_ratio_sq'])

    # This feature was suggested here and testified as being very useful https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538
    df_det = df[df['detected'] == 1].copy()
    aggregate_mjd = extract_features(df_det,
                                     column_id='object_id',
                                     column_value='mjd',
                                     default_fc_parameters=feature_spec['mjd'])

    aggregate_mjd['mjd_diff_det'] = aggregate_mjd[
        'mjd__maximum'].values - aggregate_mjd['mjd__minimum'].values
    del aggregate_mjd['mjd__maximum'], aggregate_mjd['mjd__minimum']

    # Bring all of the generated features together in one data frame before merging it with metadata and returning
    aggregate_ts_flux_passband.index.rename('object_id', inplace=True)
    aggregate_ts_flux.index.rename('object_id', inplace=True)
    aggregate_ts_flux_by_flux_ratio_sq.index.rename('object_id', inplace=True)
    aggregate_mjd.index.rename('object_id', inplace=True)
    aggregate_ts = pd.concat([
        aggregate, aggregate_ts_flux_passband, aggregate_ts_flux,
        aggregate_ts_flux_by_flux_ratio_sq, aggregate_mjd
    ],
                             axis=1).reset_index()
    return aggregate_ts.merge(right=df_meta, how='left', on='object_id')
Example 8
def featurize(df):
    # METHOD I
    # Create fourier transform coefficients here.
    # Fft coefficient is meant to capture periodicity
    # Features to compute with tsfresh library.
    fcp = {
        'fft_coefficient': [
            {'coeff': 0, 'attr': 'abs'},
            {'coeff': 1, 'attr': 'abs'},
        ],
        'kurtosis': None,
        'skewness': None,
    }

    agg_df_ts = extract_features(df,
                                 column_id='object_id',
                                 column_sort='mjd',
                                 column_kind='passband',
                                 column_value='flux',
                                 default_fc_parameters=fcp,
                                 n_jobs=cores)

    # METHOD III
    # Find bursts decay rate based on detected == 1
    # Get mjd_diff_det which is the difference of mjd where detected == 1
    # Taken from https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69696#410538
    df_det = df[df['detected'] == 1].copy()

    agg_df_mjd = extract_features(df_det,
                                  column_id='object_id',
                                  column_value='mjd',
                                  default_fc_parameters={
                                      'maximum': None,
                                      'minimum': None,
                                      'mean_abs_change': None
                                  },
                                  n_jobs=4)
    agg_df_mjd['mjd_diff_det'] = agg_df_mjd['mjd__maximum'] - agg_df_mjd[
        'mjd__minimum']
    del agg_df_mjd['mjd__maximum'], agg_df_mjd['mjd__minimum']
    agg_df_ts = pd.merge(agg_df_ts, agg_df_mjd, on='id')

    # tsfresh returns a dataframe with an index name='id'
    agg_df_ts.index.rename('object_id', inplace=True)

    return agg_df_ts
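
Passing column_kind='passband' and column_value='flux', as the PLAsTiCC snippets here do, requires the input in long ("stacked") format: one row per observation. A hypothetical sketch of the expected layout and the resulting per-kind columns:

import pandas as pd
from tsfresh import extract_features

# One row per (object, epoch, band) observation; the values are made up.
lc = pd.DataFrame({
    "object_id": [615, 615, 615, 615],
    "mjd": [59750.4, 59750.5, 59751.4, 59751.5],
    "passband": [0, 1, 0, 1],
    "flux": [-544.8, -816.4, -471.4, -388.9],
})

# One feature set per (object_id, passband); columns are prefixed with
# the kind, e.g. '0__maximum' and '1__maximum'.
feats = extract_features(lc, column_id="object_id", column_sort="mjd",
                         column_kind="passband", column_value="flux",
                         default_fc_parameters={"maximum": None},
                         disable_progressbar=True)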
Example 9
def featurize(df, df_meta, aggs, fcp, n_jobs=6):
    """
    Extracting Features from train set
    per passband features with tsfresh library. fft features added to capture periodicity https://www.kaggle.com/c/PLAsTiCC-2018/discussion/70346#415506
    """
    df = process_flux(df)
    agg_df = df.groupby('object_id').agg(aggs)
    agg_df.columns = [ '{}_{}'.format(k, agg) for k in aggs.keys() for agg in aggs[k]]
    agg_df = process_flux_agg(agg_df) # new feature to play with tsfresh

    # Add more features with
    agg_df_ts_flux_passband = extract_features(df, 
                                               column_id='object_id', 
                                               column_sort='mjd', 
                                               column_kind='passband', 
                                               column_value='flux', 
                                               default_fc_parameters=fcp['flux_passband'], n_jobs=n_jobs)

    agg_df_ts_flux = extract_features(df, 
                                      column_id='object_id', 
                                      column_value='flux', 
                                      default_fc_parameters=fcp['flux'], n_jobs=n_jobs)

    agg_df_ts_flux_by_flux_ratio_sq = extract_features(df, 
                                      column_id='object_id', 
                                      column_value='flux_by_flux_ratio_sq', 
                                      default_fc_parameters=fcp['flux_by_flux_ratio_sq'], n_jobs=n_jobs)

    # Add smart feature 
    # dt[detected==1, mjd_diff:=max(mjd)-min(mjd), by=object_id]
    df_det = df[df['detected'] == 1].copy()
    agg_df_mjd = extract_features(df_det, 
                                  column_id='object_id', 
                                  column_value='mjd', 
                                  default_fc_parameters=fcp['mjd'], n_jobs=n_jobs)
    agg_df_mjd['mjd_diff_det'] = agg_df_mjd['mjd__maximum'].values - agg_df_mjd['mjd__minimum'].values
    del agg_df_mjd['mjd__maximum'], agg_df_mjd['mjd__minimum']
    
    agg_df_ts_flux_passband.index.rename('object_id', inplace=True) 
    agg_df_ts_flux.index.rename('object_id', inplace=True) 
    agg_df_ts_flux_by_flux_ratio_sq.index.rename('object_id', inplace=True) 
    agg_df_mjd.index.rename('object_id', inplace=True)      
    agg_df_ts = pd.concat([agg_df, 
                           agg_df_ts_flux_passband, 
                           agg_df_ts_flux, 
                           agg_df_ts_flux_by_flux_ratio_sq, 
                           agg_df_mjd], axis=1).reset_index()
    result = agg_df_ts.merge(right=df_meta, how='left', on='object_id')
    return result
def create_features_by_tsfresh(path, dataset, years, features):
    data = dataset.copy()
    data = data[features + ['time', 'id']]
    data_rolled = roll_time_series(data,
                                   column_id="id",
                                   column_sort="time",
                                   max_timeshift=7 * 24,
                                   n_jobs=8)
    features = extract_features(data_rolled,
                                column_id="id",
                                column_sort="time",
                                n_jobs=8)
    impute(features)
    print(features.shape)
    features.to_csv(path + '/modified_data_after_feature_extraction/')

    AQI = get_raw_AQI_data(path, years)
    AQI_data = pd.Series(data=AQI['AQI'].values,
                         index=features.index,
                         name='AQI')
    print(AQI_data.shape)
    selected_features = select_features(features, AQI_data)
    print(selected_features.shape)
    # features.drop('ID', axis=1, inplace=True)
    selected_features.index = range(selected_features.shape[0])
    return selected_features
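
For reference, a minimal sketch of what the roll_time_series step above does to the input ids (toy data; in recent tsfresh versions the rolled ids are (original_id, window_end) tuples, so extract_features then yields one feature row per window):

import pandas as pd
from tsfresh.utilities.dataframe_functions import roll_time_series

df = pd.DataFrame({"id": [1] * 5, "time": range(5),
                   "x": [1.0, 2.0, 3.0, 4.0, 5.0]})
rolled = roll_time_series(df, column_id="id", column_sort="time",
                          max_timeshift=2, n_jobs=0)
print(rolled["id"].unique())  # e.g. (1, 0), (1, 1), ..., (1, 4)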
Example 11
def add_improved_feats(mock_tr, feat_df):
    X = extract_features(mock_tr, default_fc_parameters=fcp_improved,
                         column_id=OBJECT_ID, profile=True,
                         column_kind='passband',
                         column_sort='mjd',
                         column_value='flux', disable_progressbar=True).rename_axis(OBJECT_ID)
    return feat_df.join(X)
def main():
    files = pd.read_excel(
        '/home/velaraptor/Downloads/Raw Data 10yrs (2018).xlsx', header=1)
    files = files.fillna(0)
    groups = files.groupby('Name')
    forecast_df = []
    for name, group in tqdm.tqdm(groups):
        if len(group) > 1:
            group.index = group.Year
            df_shift, y = make_forecasting_frame(group["FantPt"],
                                                 kind=name,
                                                 max_timeshift=10,
                                                 rolling_direction=1)
            forecast_df.append(df_shift)

    features_df = []
    for sample in tqdm.tqdm(forecast_df):
        X = extract_features(sample,
                             column_id="id",
                             column_sort="time",
                             column_value="value",
                             impute_function=impute,
                             show_warnings=False,
                             disable_progressbar=True,
                             default_fc_parameters=EfficientFCParameters())
        X = X.reset_index()
        X.loc[:, 'Name'] = sample['kind']
        features_df.append(X)
    features_time_series = pd.concat(features_df)
    features_time_series.to_csv('features_time_series.csv', index=False)
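
make_forecasting_frame, used in the loop above, turns a single series into a rolled long-format frame plus an aligned one-step-ahead target. A toy sketch (hypothetical values):

import pandas as pd
from tsfresh.utilities.dataframe_functions import make_forecasting_frame

x = pd.Series([10.0, 12.0, 11.0, 13.0], index=range(4))
df_shift, y = make_forecasting_frame(x, kind="FantPt",
                                     max_timeshift=3, rolling_direction=1)
# df_shift is long-format input for extract_features; y holds the value
# to be predicted from the window that ends just before it.
print(df_shift.head())
print(y.head())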
Example 13
def extract_relevant_features(timeseries_container,
                              y,
                              X=None,
                              feature_extraction_settings=None,
                              feature_selection_settings=None,
                              column_id=None,
                              column_sort=None,
                              column_kind=None,
                              column_value=None):
    """
    High level convenience function to extract time series features from `timeseries_container`, returning the
    feature matrix `X`, possibly augmented with the features found to be relevant to the target vector `y`.

    For more details see the documentation of :func:`~tsfresh.feature_extraction.extraction.extract_features` and
    :func:`~tsfresh.feature_selection.selection.select_features`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: See parameter `timeseries_container` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param y: See parameter `y` in :func:`~tsfresh.feature_selection.selection.select_features`
    :param X: See parameter `X` in :func:`~tsfresh.feature_selection.selection.select_features`
    :param column_id: See parameter `column_id` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param column_sort: See parameter `column_sort` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param column_kind: See parameter `column_kind` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param column_value: See parameter `column_value` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param feature_extraction_settings: See parameter `feature_extraction_settings` in :func:`~tsfresh.feature_extraction.extraction.extract_features`
    :param feature_selection_settings: See parameter `feature_selection_settings` in :func:`~tsfresh.feature_selection.selection.select_features`

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    if X is not None:
        timeseries_container = restrict_input_to_index(timeseries_container,
                                                       column_id, X.index)

    if feature_extraction_settings is None:
        feature_extraction_settings = FeatureExtractionSettings()
        feature_extraction_settings.IMPUTE = impute

    X_ext = extract_features(
        timeseries_container,
        feature_extraction_settings=feature_extraction_settings,
        column_id=column_id,
        column_sort=column_sort,
        column_kind=column_kind,
        column_value=column_value)
    X_sel = select_features(
        X_ext, y, feature_selection_settings=feature_selection_settings)

    if X is None:
        X = X_sel
    else:
        X = pd.merge(X, X_sel, left_index=True, right_index=True, how="left")

    return X
    def extract(self, use_features=None):
        # Avoid a mutable default argument; None means "select features here".
        use_features = list(use_features) if use_features is not None else []
        x = self.__x_data_frame()
        y = self.__y_series()

        settings = ReasonableFeatureExtractionSettings()
        extracted_features = extract_features(x, column_id='id',
                                              feature_extraction_settings=settings)
        if len(use_features) == 0:
            impute(extracted_features)
            features_filtered = select_features(extracted_features, y)
            use_features = features_filtered.keys()
        else:
            features_filtered = extracted_features[use_features]

        keys = features_filtered.keys()
        timeseries = []
        for index, row in features_filtered.iterrows():
            values = []
            for key in keys:
                if key == 'id':
                    continue

                value = row[key]
                values.append(value)

            timeseries.append(Timeseries([values]))

        return timeseries, use_features
def test_primitive(entityset, df, parameters, primitive):
    expected = extract_features(
        timeseries_container=df,
        column_id='session_id',
        default_fc_parameters=parameters,
    )

    base = entityset['transactions']['amount']

    if primitive.name == 'linear_trend_timewise':
        base = [base, entityset['transactions']['transaction_time']]

    feature = ft.Feature(
        base=base,
        parent_entity=entityset['sessions'],
        primitive=primitive,
    )

    actual = ft.calculate_feature_matrix(
        features=[feature],
        entityset=entityset,
    )

    assert_almost_equal(
        actual=actual.values,
        desired=expected.values,
    )
Example 16
def f1005_tsfresh_flux_per_passband(input: Input, **kw):
    fcp = {
        'fft_coefficient': [
            {'coeff': 0, 'attr': 'abs'},
            {'coeff': 1, 'attr': 'abs'},
        ],
        'kurtosis': None,
        'skewness': None,
    }

    dfs = []
    for i in tqdm(range(30)):
        lc = common.load_partial_lightcurve(i)
        partial = extract_features(lc,
                                   column_id='object_id',
                                   column_sort='mjd',
                                   column_kind='passband',
                                   column_value='flux',
                                   default_fc_parameters=fcp,
                                   n_jobs=config.TSFRESH_N_JOBS)
        dfs.append(partial.reset_index())
        gc.collect()

    return extract_features_postproc(pd.concat(dfs))
def create_df_participant(df):
    # Drop NaN
    df = df.dropna()
    # Targets for each sliding window (id)
    targets = df["target"].groupby(level="id").agg(return_unique).tolist()
    # Reconstruct the component table (public API instead of the private _drop_axis)
    df = df.drop("target", axis=1)
    # Create id
    idx_slice = df.index.tolist()  # idx_slice contains ids and samples
    idx_list = [el[0] for el in idx_slice]  # get only the ids
    # Get targets
    cols = df.columns
    flat_cols = ["".join(comp) for comp in cols]
    # "Flat" dataframe
    df_feat = pd.DataFrame(df.values, columns=flat_cols)
    df_feat["id"] = idx_list

    # Feature extraction
    extracted_features = extract_features(
        df_feat,
        column_id="id",
        column_sort="Recording timestamp",
        default_fc_parameters=MinimalFCParameters())

    # Impute the extracted features and attach the window targets
    impute(extracted_features)
    extracted_features["target"] = targets

    return extracted_features.reset_index(drop=False)
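
impute, applied above before attaching the targets, fixes up non-finite values in place: per column, NaN becomes the median, +inf the maximum, and -inf the minimum of the finite entries. A quick check:

import numpy as np
import pandas as pd
from tsfresh.utilities.dataframe_functions import impute

df = pd.DataFrame({"a": [1.0, np.nan, np.inf, -np.inf]})
impute(df)
print(df["a"].tolist())  # all 1.0 here, the only finite value in the column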
Example 18
def test_primitive(entityset, df, parameters, primitive):
    expected = extract_features(
        timeseries_container=df,
        column_id="session_id",
        default_fc_parameters=parameters,
    )

    base = ft.Feature(entityset["transactions"].ww["amount"])

    if primitive.name == "linear_trend_timewise":
        base = [base, ft.Feature(entityset["transactions"].ww["transaction_time"])]

    feature = ft.Feature(
        base=base,
        parent_dataframe_name="sessions",
        primitive=primitive,
    )

    actual = ft.calculate_feature_matrix(
        features=[feature],
        entityset=entityset,
    )

    assert_almost_equal(
        actual=actual.values,
        desired=expected.values,
    )
Example 19
    def get_series_features(self, bandstructure, spin=Spin.up, n_valence_bands=5, n_conduction_bands=5):
        bands_as_df = self._get_bands_as_df(bandstructure,
                                            spin=spin,
                                            n_valence_bands=n_valence_bands,
                                            n_conduction_bands=n_conduction_bands)
        features = extract_features(
            bands_as_df, default_fc_parameters=FC_PARAMETERS, column_id="id", column_sort="k")
        return features.values[0], features.columns.tolist()
Example 20
def add_acor_feat(mock_tr, feat_df):
    acor_params = {'partial_autocorrelation': [{'lag': 1}, ]}
    X = extract_features(mock_tr, default_fc_parameters=acor_params,
                         column_id=OBJECT_ID, profile=True,
                         column_sort='mjd',
                         column_value='flux', disable_progressbar=True).rename_axis(OBJECT_ID)
    X.columns = ['efficient_flux__partial_autocorrelation__lag_1']  # HACK
    return feat_df.join(X)
    def transform(self, df):
        df_stacked = convert(df)
        extracted_features = extract_features(
            df_stacked,
            column_id="id",
            column_kind="kind",
            column_value="value",
            default_fc_parameters=MinimalFCParameters())
        self.extracted_features = impute(extracted_features)
        return self.extracted_features.values.tolist()
Example 22
    def get_series_features(self, dos, dos_type='total', orbital_type='s', e_min=-5.0, e_max=5, spin=Spin.up):
        if dos_type == 'total':
            dos_as_df = self._get_total_dos_as_df(dos, e_min=e_min, e_max=e_max, spin=spin)
        elif dos_type == 'spd':
            dos_as_df = self._get_spd_dos_as_df(dos, orbital_type=orbital_type, e_min=e_min, e_max=e_max)
        else:
            raise ValueError('unknown dos type')
        features = extract_features(
            dos_as_df, default_fc_parameters=FC_PARAMETERS, column_id="id", column_sort="k")
        return features.values[0], features.columns.tolist()
Example 23
    def run(self):
        input_file = self._get_input_targets("data.csv")[0]

        with input_file.open("r") as f:
            df = pd.read_csv(f)

        start_time = time()
        extract_features(df, column_id="id", column_sort="time", n_jobs=self.n_jobs,
                         disable_progressbar=True)
        end_time = time()

        result_json = {
            "time": end_time - start_time,
            "n_ids": self.num_ids,
            "n_jobs": self.n_jobs,
            "time_series_length": int((df["id"] == 0).sum()),
        }

        with self._get_output_target("result.json").open("w") as f:
            json.dump(result_json, f)
Example 24
def generate_valid_features(record_dir, df_series):
    valid_features = extract_features(
        df_series, column_id='id', column_sort='time', show_warnings=False
    ).replace([numpy.inf, -numpy.inf], numpy.nan).dropna(axis=1, how='any')
    excel_writer = pandas.ExcelWriter(
        os.path.join(record_dir, 'valid_features.xlsx'))
    df_valid_features = pandas.DataFrame(data=valid_features)
    df_valid_features.to_excel(excel_writer)
    excel_writer.save()

    print('Number of valid features: %d' % (df_valid_features.shape[1] - 1))
    return valid_features
Example 25
    def features(x: pd.Series) -> pd.DataFrame:
        data = pd.DataFrame(dtype=np.float64)

        data['x'] = x
        data['id'] = 1

        df = extract_features(data,
                              column_id='id',
                              default_fc_parameters=EfficientFCParameters())

        return df
    def transform(self, X_train, y_train=None, **fit_params):
        X_features = extract_features(
            X_train,
            column_id=self.column_id,
            column_sort=self.column_sort,
            column_value=self.column_value,
            default_fc_parameters=self.extraction_settings,
            disable_progressbar=True)

        impute(X_features)
        return X_features
Example 27
def feature_extraction(df):
    import pandas as pd
    X_tsfresh = extract_features(df,
                                 column_id="id",
                                 column_value="value",
                                 default_fc_parameters=settings_time)

    return X_tsfresh
def handle_message(msg):

    if not isinstance(msg.key(), dict):
        logger.warning("Key is missing or not a dict. Ignoring message.")
        return
    elif not isinstance(msg.value(), dict):
        logger.warning("Value is missing or not a dict. Ignoring message.")
        return

    try:
        time_begin = time.time()

        timeseries = pd.melt(
            pd.DataFrame.from_dict(msg.value(),
                                   orient='index').transpose()).dropna()
        timeseries['group_id'] = 0

        if timeseries.isnull().sum().sum() > 0:
            logger.warning("at least one field of timeseries is null")
            return

        X = extract_features(
            timeseries,
            column_id='group_id',
            column_kind="variable",
            column_value="value",
            kind_to_fc_parameters=settings.from_columns(fc_parameters))

        if X.isnull().sum().sum() > 0:
            logger.warning("at least one field of extracted features is null")
            return

        kritisch = ml_model.predict(pca_model.transform(X))[0]

        time_end = time.time()

        start_prediction_interval = time.localtime(msg.key()['timestamp_end'] /
                                                   1000)
        end_prediction_interval = time.localtime(msg.key()['timestamp_end'] /
                                                 1000 + 60 * 5)

        print("Prediction for interval",
              time.strftime("%H:%M:%S", start_prediction_interval), "to",
              time.strftime("%H:%M:%S", end_prediction_interval), ":",
              "kritisch" if kritisch else "unkritisch")

        if SHOW_CALCULATION_TIME == 1:
            print("time for calculation", round(time_end - time_begin, 5),
                  "seconds")

    except Exception as e:
        logger.exception(e)
        consumer.stop()
Example 29
def run_order():
    t_order = pd.read_csv('../data/t_order.csv')
    try:
        t_order = t_order.fillna(0)
        t_order_8_10 = t_order[t_order['buy_time'] < '2016-11-01']
        t_order_9_11 = t_order[t_order['buy_time'] > '2016-08-31']
        extracted_features_order = extract_features(t_order_8_10, column_id="uid", column_sort="buy_time", default_fc_parameters=settings)
        extracted_features_order.to_pickle('extracted_features_order_8_10.pickle')
        # extracted_features_order = extract_features(t_order_9_11, column_id="uid", column_sort="buy_time",default_fc_parameters=settings)
        # extracted_features_order.to_pickle('extracted_features_order_9_11.pickle')
    except Exception as e:
        print(e)
Esempio n. 30
0
def aggregate(df, output_path, drop_oid=True):
    """
    df = pd.read_pickle('../data/train_log.pkl').head(999)
    """

    df['flux_ratio_sq'] = np.power(df['flux'] / df['flux_err'], 2.0)
    df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']

    pt = pd.pivot_table(df, index=['object_id'], aggfunc=num_aggregations)

    pt.columns = pd.Index([f'{e[0]}_{e[1]}' for e in pt.columns.tolist()])

    # max / min, max - min, (max - min)/mean
    col_max = [c for c in pt.columns if c.endswith('_max')]
    for c in col_max:
        pt[f'{c}-d-min'] = pt[c] / pt[c.replace('_max', '_min')]
        pt[f'{c}-m-min'] = pt[c] - pt[c.replace('_max', '_min')]
        try:
            pt[f'{c}-m-min-d-mean'] = pt[f'{c}-m-min'] / pt[c.replace(
                '_max', '_mean')]
        except Exception:
            # No matching *_mean aggregate for this column; skip the ratio.
            pass

    # std / mean
    col_std = [c for c in pt.columns if c.endswith('_std')]
    for c in col_std:
        pt[f'{c}-d-mean'] = pt[c] / pt[c.replace('_std', '_mean')]

    agg_df_ts = extract_features(df,
                                 column_id='object_id',
                                 column_sort='mjd',
                                 column_kind='passband',
                                 column_value='flux',
                                 default_fc_parameters=fcp,
                                 n_jobs=4)
    agg_df_ts.index.name = 'object_id'

    pt = pd.concat([pt, agg_df_ts], axis=1)

    if usecols is not None:
        col = [c for c in pt.columns if c not in usecols]
        pt.drop(col, axis=1, inplace=True)

    if drop_oid:
        pt.reset_index(drop=True, inplace=True)
    else:
        pt.reset_index(inplace=True)
    pt.add_prefix(PREF + '_').to_pickle(output_path)

    return
Example 31
    def transform(self, X):
        """
        Add the features calculated using the timeseries_container and add them to the corresponding rows in the input
        pandas.DataFrame X.

        To save computing time, you should only include those time series in the container that you
        actually need. You can set the timeseries container with the method :func:`set_timeseries_container`.

        :param X: the DataFrame to which the calculated timeseries features will be added. This is *not* the
               dataframe with the timeseries itself.
        :type X: pandas.DataFrame

        :return: The input DataFrame, but with added features.
        :rtype: pandas.DataFrame
        """
        if self.timeseries_container is None:
            raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

        # Extract only features for the IDs in X.index
        timeseries_container_X = restrict_input_to_index(self.timeseries_container, self.column_id, X.index)

        extracted_features = extract_features(timeseries_container_X,
                                              default_fc_parameters=self.default_fc_parameters,
                                              kind_to_fc_parameters=self.kind_to_fc_parameters,
                                              column_id=self.column_id, column_sort=self.column_sort,
                                              column_kind=self.column_kind, column_value=self.column_value,
                                              chunksize=self.chunksize,
                                              n_jobs=self.n_jobs, show_warnings=self.show_warnings,
                                              disable_progressbar=self.disable_progressbar,
                                              impute_function=self.impute_function,
                                              profile=self.profile,
                                              profiling_filename=self.profiling_filename,
                                              profiling_sorting=self.profiling_sorting)

        X = pd.merge(X, extracted_features, left_index=True, right_index=True, how="left")

        return X
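
This transform matches tsfresh's FeatureAugmenter. A minimal usage sketch with toy data (assuming a recent tsfresh):

import pandas as pd
from tsfresh.transformers import FeatureAugmenter
from tsfresh.feature_extraction import MinimalFCParameters

# X has one row per id; the raw time series live in a separate container.
X = pd.DataFrame(index=[1, 2])
ts = pd.DataFrame({"id": [1, 1, 2, 2], "time": [0, 1, 0, 1],
                   "x": [1.0, 2.0, 3.0, 5.0]})

aug = FeatureAugmenter(column_id="id", column_sort="time",
                       default_fc_parameters=MinimalFCParameters(),
                       disable_progressbar=True)
aug.set_timeseries_container(ts)
X_aug = aug.transform(X)  # X plus one column per extracted feature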
Example 32
def extract_relevant_features(timeseries_container, y, X=None,
                              default_fc_parameters=None,
                              kind_to_fc_parameters=None,
                              column_id=None, column_sort=None, column_kind=None, column_value=None,
                              show_warnings=defaults.SHOW_WARNINGS,
                              disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                              profile=defaults.PROFILING,
                              profiling_filename=defaults.PROFILING_FILENAME,
                              profiling_sorting=defaults.PROFILING_SORTING,
                              test_for_binary_target_binary_feature=defaults.TEST_FOR_BINARY_TARGET_BINARY_FEATURE,
                              test_for_binary_target_real_feature=defaults.TEST_FOR_BINARY_TARGET_REAL_FEATURE,
                              test_for_real_target_binary_feature=defaults.TEST_FOR_REAL_TARGET_BINARY_FEATURE,
                              test_for_real_target_real_feature=defaults.TEST_FOR_REAL_TARGET_REAL_FEATURE,
                              fdr_level=defaults.FDR_LEVEL,
                              hypotheses_independent=defaults.HYPOTHESES_INDEPENDENT,
                              n_jobs=defaults.N_PROCESSES,
                              chunksize=defaults.CHUNKSIZE,
                              ml_task='auto'):
    """
    High level convenience function to extract time series features from `timeseries_container`, returning the
    feature matrix `X`, possibly augmented with the features found to be relevant to the target vector `y`.

    For more details see the documentation of :func:`~tsfresh.feature_extraction.extraction.extract_features` and
    :func:`~tsfresh.feature_selection.selection.select_features`.

    Examples
    ========

    >>> from tsfresh.examples import load_robot_execution_failures
    >>> from tsfresh import extract_relevant_features
    >>> df, y = load_robot_execution_failures()
    >>> X = extract_relevant_features(df, y, column_id='id', column_sort='time')

    :param timeseries_container: The pandas.DataFrame with the time series to compute the features for, or a
            dictionary of pandas.DataFrames.
            See :func:`~tsfresh.feature_extraction.extraction.extract_features`.

    :param X: A DataFrame containing additional features
    :type X: pandas.DataFrame

    :param y: The target vector
    :type y: pandas.Series

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_sort: The name of the sort column.
    :type column_sort: str

    :param column_kind: The name of the column keeping record on the kind of the value.
    :type column_kind: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param chunksize: The size of one chunk for the parallelisation
    :type chunksize: None or int

    :param n_jobs: The number of processes to use for parallelization. If zero, no parallelization is used.
    :type n_jobs: int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param profile: Turn on profiling during feature extraction
    :type profile: bool

    :param profiling_sorting: How to sort the profiling results (see the documentation of the profiling package for
           more information)
    :type profiling_sorting: basestring

    :param profiling_filename: Where to save the profiling results.
    :type profiling_filename: basestring

    :param test_for_binary_target_binary_feature: Which test to be used for binary target, binary feature (currently unused)
    :type test_for_binary_target_binary_feature: str

    :param test_for_binary_target_real_feature: Which test to be used for binary target, real feature
    :type test_for_binary_target_real_feature: str

    :param test_for_real_target_binary_feature: Which test to be used for real target, binary feature (currently unused)
    :type test_for_real_target_binary_feature: str

    :param test_for_real_target_real_feature: Which test to be used for real target, real feature (currently unused)
    :type test_for_real_target_real_feature: str

    :param fdr_level: The FDR level that should be respected, this is the theoretical expected percentage of irrelevant
                      features among all created features.
    :type fdr_level: float

    :param hypotheses_independent: Can the significance of the features be assumed to be independent?
                                   Normally, this should be set to False as the features are never
                                   independent (e.g. mean and median)
    :type hypotheses_independent: bool

    :param ml_task: The intended machine learning task. Either `'classification'`, `'regression'` or `'auto'`.
                    Defaults to `'auto'`, meaning the intended task is inferred from `y`.
                    If `y` has a boolean, integer or object dtype, the task is assumed to be classification,
                    else regression.
    :type ml_task: str

    :return: Feature matrix X, possibly extended with relevant time series features.
    """
    if X is not None:
        timeseries_container = restrict_input_to_index(timeseries_container, column_id, X.index)

    X_ext = extract_features(timeseries_container,
                             default_fc_parameters=default_fc_parameters,
                             kind_to_fc_parameters=kind_to_fc_parameters,
                             show_warnings=show_warnings,
                             disable_progressbar=disable_progressbar,
                             profile=profile,
                             profiling_filename=profiling_filename,
                             profiling_sorting=profiling_sorting,
                             n_jobs=n_jobs,
                             column_id=column_id, column_sort=column_sort,
                             column_kind=column_kind, column_value=column_value,
                             impute_function=impute)

    X_sel = select_features(X_ext, y,
                            test_for_binary_target_binary_feature=test_for_binary_target_binary_feature,
                            test_for_binary_target_real_feature=test_for_binary_target_real_feature,
                            test_for_real_target_binary_feature=test_for_real_target_binary_feature,
                            test_for_real_target_real_feature=test_for_real_target_real_feature,
                            fdr_level=fdr_level, hypotheses_independent=hypotheses_independent,
                            n_jobs=n_jobs,
                            chunksize=chunksize,
                            ml_task=ml_task)

    if X is None:
        X = X_sel
    else:
        X = pd.merge(X, X_sel, left_index=True, right_index=True, how="left")

    return X