def preprocessing_select(data, dataset, day_len, all_feat, features):
    """
    Create the training, validation and testing folds for a specific subset of features, after samples creation:
    features selection -> splitting -> cleaning (2nd) -> standardization
    :param data: samples created after the first cleaning
    :param dataset: name of the dataset, e.g. "idiab"
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param all_feat: list of all the available features
    :param features: features to be used by the models during the processing phase
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    to_drop = [ele for ele in all_feat if ele not in features]
    for col in data.columns:
        for ele in to_drop:
            if ele in col:
                data = data.drop(col, axis=1)
                break
    train, valid, test = split(data, day_len, misc.datasets.datasets[dataset]["n_days_test"], cs.cv)
    if "idiab" in dataset or "ohio" in dataset:
        [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
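# The substring test above ("ele in col") is what makes the selection work: after
# samples creation, each feature is expanded into one column per history step, so
# dropping a feature means dropping every column containing its name. A minimal,
# hedged sketch on toy data; the "glucose_0"-style column names are an assumption
# about what create_samples produces, not taken from the code.
def _demo_feature_selection():
    import pandas as pd
    data = pd.DataFrame(columns=["glucose_0", "glucose_1", "CHO_0", "steps_0", "y"])
    all_feat, features = ["glucose", "CHO", "steps"], ["glucose", "CHO"]
    to_drop = [ele for ele in all_feat if ele not in features]
    # keep a column only if no dropped feature name appears inside it
    kept = [col for col in data.columns if not any(ele in col for ele in to_drop)]
    return data[kept]  # keeps glucose_0, glucose_1, CHO_0 and y; drops steps_0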
def preprocessing_idiab_double_y(dataset, subject, ph, hist, day_len, n_days_test):
    """
    IDIAB dataset preprocessing pipeline with two target columns (y_0 and y_1):
    loading -> remove anomalies -> resample -> remove last day -> samples creation -> cleaning (1st) ->
    splitting -> cleaning (2nd) -> standardization
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_idiab(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)
    # data = create_samples(data, ph, hist, day_len)
    data = create_samples_double_y(data, ph, hist, day_len)
    # TODO: too many missing data => need to interpolate y_0 using y_1
    data = fill_nans(data, day_len, n_days_test)
    data = fill_y_prev(data)
    train, valid, test = split(data, day_len, misc.datasets.datasets[dataset]["n_days_test"], cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    print(test[0].shape)
    return train, valid, test, scalers
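# The TODO above suggests recovering missing y_0 values from y_1. If the two
# targets are consecutive points of the same glucose series (so that
# y_0[t] == y_1[t-1]), a missing y_0 can be copied from the previous row's y_1,
# which is assumed to be the idea behind fill_y_prev. A toy sketch of that
# relation; the shifted-target assumption is illustrative, not taken from the code.
def _demo_fill_y_prev():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({"y_0": [100.0, np.nan, 140.0],
                       "y_1": [120.0, 140.0, 160.0]})
    df["y_0"] = df["y_0"].fillna(df["y_1"].shift(1))  # y_0[t] <- y_1[t-1]
    return df  # y_0 becomes [100, 120, 140]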
def preprocessing_t1dms(dataset, subject, ph, hist, day_len, n_days_test):
    """
    T1DMS dataset preprocessing pipeline (valid for adults, adolescents and children):
    loading -> scaling -> resampling -> samples creation -> splitting -> standardization
    :param dataset: name of the dataset, e.g. "t1dms"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 1440 (1440/1)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_t1dms(dataset, subject, day_len)
    data = scaling_T1DMS(data)
    data = resample(data, cs.freq)
    data = create_samples(data, ph, hist, day_len)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
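# Every pipeline in this module returns one fitted scaler per cross-validation
# fold so that model outputs can later be mapped back to the original glucose
# units. A hedged toy sketch of that contract, using sklearn's StandardScaler
# as a stand-in; the project's standardize() may be implemented differently.
def _demo_standardize_fold():
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    train = np.array([[100.0], [150.0], [200.0]])
    test = np.array([[120.0], [180.0]])
    scaler = StandardScaler().fit(train)                    # fit on the training fold only
    train_s, test_s = scaler.transform(train), scaler.transform(test)
    return scaler.inverse_transform(test_s)                 # recovers the original units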
def preprocessing_ohio(dataset, subject, ph, hist, day_len, n_days_test):
    """
    OhioT1DM dataset preprocessing pipeline:
    loading -> samples creation -> cleaning (1st) -> splitting -> cleaning (2nd) -> standardization
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "ohio"
    :param subject: id of the subject, e.g. "559"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_ohio(dataset, subject)
    data = resample(data, cs.freq)
    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
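# The two cleaning stages around the split follow the docstring above: fill_nans
# is assumed to interpolate short gaps once, before splitting (cheaper than doing
# it per fold), and remove_nans then drops the samples whose gaps could not be
# filled. A toy pandas sketch of that idea; the helpers' real implementations
# (gap limits, interpolation method) may differ.
def _demo_fill_then_remove_nans():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({"glucose": [110.0, np.nan, 130.0, np.nan]})
    # 1st cleaning: linearly fill gaps of at most one sample, inside the series
    df["glucose"] = df["glucose"].interpolate(limit=1, limit_area="inside")
    # 2nd cleaning: drop the samples that are still incomplete
    return df.dropna()  # rows: 110, 120 (interpolated), 130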
def preprocessing_idiab(dataset, subject, ph, hist, day_len, n_days_test):
    """
    IDIAB dataset preprocessing pipeline:
    loading -> remove anomalies -> resample -> remove last day -> samples creation -> cleaning (1st) ->
    features selection -> splitting -> cleaning (2nd) -> standardization
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    printd("Preprocessing " + dataset + subject + "...")
    data = load(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)
    # data["CHO"] = CPB(data, cs.C_bio, cs.t_max)
    # data["insulin"] = IOB(data, cs.K_DIA)
    # data["steps"] = AOB(data, cs.k_s)
    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)
    to_drop = ["calories", "heartrate", "mets", "steps"]
    for col in data.columns:
        for ele in to_drop:
            if ele in col:
                data = data.drop(col, axis=1)
                break
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    print(test[0].shape)
    return train, valid, test, scalers
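# The commented-out lines above hint at physiologically inspired features:
# carbs, insulin and activity "on board" (CPB, IOB, AOB). A hedged sketch of
# insulin-on-board as an exponential decay of past doses; the decay model and
# the k_dia value are illustrative assumptions, not the project's IOB().
def _demo_iob(insulin, k_dia=0.02):
    import numpy as np
    insulin = np.asarray(insulin, dtype=float)
    # each past dose contributes exp(-k_dia * age) of its amount at time t
    kernel = np.exp(-k_dia * np.arange(len(insulin)))
    return np.convolve(insulin, kernel)[: len(insulin)]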