Example #1
def preprocessing_idiab(dataset, subject, ph, hist, day_len, n_days_test):
    """
    IDIAB dataset preprocessing pipeline:
    loading -> anomaly removal -> resampling -> last-day removal -> samples creation -> cleaning (1st)
    -> splitting -> cleaning (2nd) -> standardization

    The first cleaning pass is done before splitting to speed up the preprocessing.

    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by the sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_idiab(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)
    # data = create_samples(data, ph, hist, day_len)
    data = create_samples_double_y(data, ph, hist, day_len)  # TODO: too much missing data => interpolate y_0 using y_1
    data = fill_nans(data, day_len, n_days_test)
    data = fill_y_prev(data)
    # note: the split uses the dataset-level "n_days_test" configuration, not the n_days_test argument
    train, valid, test = split(data, day_len, misc.datasets.datasets[dataset]["n_days_test"], cs.cv)
    train, valid, test = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    print(test[0].shape)
    return train, valid, test, scalers
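A minimal call sketch (the subject id and the n_days_test value are illustrative, and the cs/misc configuration modules are assumed to be set up as in the rest of the project):

train, valid, test, scalers = preprocessing_idiab(
    dataset="idiab", subject="1", ph=30, hist=60, day_len=288, n_days_test=5
)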
Example #2
def preprocessing_full(dataset, subject, ph, hist, day_len, all_feat):
    """
    Full dataset samples creation pipeline:
    loading -> feature selection -> anomaly removal / scaling -> resampling -> last-day removal
    -> engineered features (CPB/IOB/AOB) -> samples creation -> cleaning (1st)

    The first cleaning pass is done before splitting to speed up the preprocessing.

    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by the sampling frequency, e.g. 288 (1440/5)
    :param all_feat: list of feature names to keep; other non-glucose features are dropped
    :return: dataframe of samples
    """
    data = load(dataset, subject)

    features = [
        feature for feature in list(data.columns)
        if feature not in ["datetime", "glucose"]
    ]
    to_drop = [feature for feature in features if feature not in all_feat]
    data = data.drop(to_drop, axis=1)

    if "idiab" in dataset:
        data = remove_anomalies(data)
    if "t1dms" in dataset:
        data = scaling_t1dms(data)

    data = resample(data, cs.freq)

    if "idiab" in dataset:
        data = remove_last_day(data)

    if "CPB" in all_feat:
        data["CPB"] = cpb(data, cs.C_bio, cs.t_max, True)
    if "IOB" in all_feat:
        data["IOB"] = iob(data, cs.K_DIA, True)
    if "AOB" in all_feat:
        data["AOB"] = aob(data, cs.k_s, True)

    data = create_samples(data, ph, hist, day_len)
    n_days_test = misc.datasets.datasets[dataset]["n_days_test"]

    if "idiab" in dataset or "ohio" in dataset:
        data = fill_nans(data, day_len, n_days_test)

    return data
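A hedged usage sketch; the feature names in all_feat below are illustrative and must match columns of the loaded dataframe (CPB is computed above when requested):

samples = preprocessing_full(
    dataset="idiab", subject="1", ph=30, hist=60, day_len=288,
    all_feat=["CHO", "insulin", "CPB"],
)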
Example #3
def preprocessing_t1dms(dataset, subject, ph, hist, day_len, n_days_test):
    """
    T1DMS dataset preprocessing pipeline (valid for adults, adolescents, and children):
    loading -> scaling -> resampling -> samples creation -> splitting -> standardization

    :param dataset: name of the dataset, e.g. "t1dms"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by the sampling frequency, e.g. 1440 (1440/1)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_t1dms(dataset, subject, day_len)
    data = scaling_T1DMS(data)
    data = resample(data, cs.freq)
    data = create_samples(data, ph, hist, day_len)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
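Here day_len is 1440 because T1DMS is sampled every minute (the 1440/1 in the docstring); n_days_test=5 is only an illustrative value:

train, valid, test, scalers = preprocessing_t1dms(
    dataset="t1dms", subject="1", ph=30, hist=60, day_len=1440, n_days_test=5
)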
Example #4
def preprocessing_ohio(dataset, subject, ph, hist, day_len, n_days_test):
    """
    OhioT1DM dataset preprocessing pipeline:
    loading -> resampling -> samples creation -> cleaning (1st) -> splitting -> cleaning (2nd) -> standardization

    The first cleaning pass is done before splitting to speed up the preprocessing.

    :param dataset: name of the dataset, e.g. "ohio"
    :param subject: id of the subject, e.g. "559"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by the sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_ohio(dataset, subject)
    data = resample(data, cs.freq)
    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    train, valid, test = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
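The returned objects are parallel lists with one entry per cross-validation fold (including, per the docstring, one scaler per fold), so they are typically consumed together; a sketch:

for fold_idx, (train_fold, valid_fold, test_fold, scaler) in enumerate(zip(train, valid, test, scalers)):
    # each fold comes with its own scaler (one scaler per fold, as documented)
    print(fold_idx, train_fold.shape, valid_fold.shape, test_fold.shape)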
Example #5
def preprocessing_idiab(dataset, subject, ph, hist, day_len, n_days_test):
    """
    IDIAB dataset preprocessing pipeline:
    loading -> anomaly removal -> resampling -> last-day removal -> samples creation -> cleaning (1st)
    -> feature selection -> splitting -> cleaning (2nd) -> standardization

    The first cleaning pass is done before splitting to speed up the preprocessing.

    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    printd("Preprocessing " + dataset + subject + "...")
    data = load(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)
    # data["CHO"] = CPB(data, cs.C_bio, cs.t_max)
    # data["insulin"] = IOB(data, cs.K_DIA)
    # data["steps"] = AOB(data, cs.k_s)
    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)
    # drop the activity-tracker features (any column whose name contains one of these substrings)
    to_drop = ["calories", "heartrate", "mets", "steps"]
    for col in data.columns:
        for ele in to_drop:
            if ele in col:
                data = data.drop(col, axis=1)
                break

    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    print(test[0].shape)
    return train, valid, test, scalers
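Since each dataset has a dedicated pipeline with the same signature, a small dispatcher is a natural way to route calls; a sketch using only the functions shown in these examples:

PIPELINES = {
    "idiab": preprocessing_idiab,
    "ohio": preprocessing_ohio,
    "t1dms": preprocessing_t1dms,
}

def preprocessing(dataset, subject, ph, hist, day_len, n_days_test):
    # all dataset-specific pipelines share the same signature
    return PIPELINES[dataset](dataset, subject, ph, hist, day_len, n_days_test)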