def preprocessing_select(data, dataset, day_len, all_feat, features):
    """
    Create the training, validation and testing folds for a specific subset of features, after samples creation:
    features selection -> splitting -> cleaning (2nd) -> standardization
    :param data: samples created after the first cleaning
    :param dataset: name of the dataset, e.g. "idiab"
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param all_feat: list of all the available features
    :param features: features to be used by the models during the processing phase
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    to_drop = [ele for ele in all_feat if ele not in features]
    for col in data.columns:
        for ele in to_drop:
            if ele in col:
                data = data.drop(col, axis=1)
                break
    train, valid, test = split(data, day_len, misc.datasets.datasets[dataset]["n_days_test"], cs.cv)
    if "idiab" in dataset or "ohio" in dataset:
        [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
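# The substring test above ("ele in col") is what makes the selection work: after
# samples creation, each feature is expanded into one column per history step, so
# dropping a feature means dropping every column containing its name. A minimal,
# hedged sketch on toy data; the "glucose_0"-style column names are an assumption
# about what create_samples produces, not taken from the code.
def _demo_feature_selection():
    import pandas as pd
    data = pd.DataFrame(columns=["glucose_0", "glucose_1", "CHO_0", "steps_0", "y"])
    all_feat, features = ["glucose", "CHO", "steps"], ["glucose", "CHO"]
    to_drop = [ele for ele in all_feat if ele not in features]
    # keep a column only if no dropped feature name appears inside it
    kept = [col for col in data.columns if not any(ele in col for ele in to_drop)]
    return data[kept]  # keeps glucose_0, glucose_1, CHO_0 and y; drops steps_0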
def preprocessing_idiab_double_y(dataset, subject, ph, hist, day_len, n_days_test):
    """
    IDIAB dataset preprocessing pipeline with two target columns (y_0 and y_1):
    loading -> remove anomalies -> resample -> remove last day -> samples creation -> cleaning (1st) ->
    splitting -> cleaning (2nd) -> standardization
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_idiab(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)
    # data = create_samples(data, ph, hist, day_len)
    data = create_samples_double_y(data, ph, hist, day_len)
    # TODO: too many missing data => need to interpolate y_0 using y_1
    data = fill_nans(data, day_len, n_days_test)
    data = fill_y_prev(data)
    train, valid, test = split(data, day_len, misc.datasets.datasets[dataset]["n_days_test"], cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    print(test[0].shape)
    return train, valid, test, scalers
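# The TODO above suggests recovering missing y_0 values from y_1. If the two
# targets are consecutive points of the same glucose series (so that
# y_0[t] == y_1[t-1]), a missing y_0 can be copied from the previous row's y_1,
# which is assumed to be the idea behind fill_y_prev. A toy sketch of that
# relation; the shifted-target assumption is illustrative, not taken from the code.
def _demo_fill_y_prev():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({"y_0": [100.0, np.nan, 140.0],
                       "y_1": [120.0, 140.0, 160.0]})
    df["y_0"] = df["y_0"].fillna(df["y_1"].shift(1))  # y_0[t] <- y_1[t-1]
    return df  # y_0 becomes [100, 120, 140]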
def preprocessing_t1dms(dataset, subject, ph, hist, day_len, n_days_test):
    """
    T1DMS dataset preprocessing pipeline (valid for adults, adolescents and children):
    loading -> scaling -> resampling -> samples creation -> splitting -> standardization
    :param dataset: name of the dataset, e.g. "t1dms"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 1440 (1440/1)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_t1dms(dataset, subject, day_len)
    data = scaling_T1DMS(data)
    data = resample(data, cs.freq)
    data = create_samples(data, ph, hist, day_len)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
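# Every pipeline in this module returns one fitted scaler per cross-validation
# fold so that model outputs can later be mapped back to the original glucose
# units. A hedged toy sketch of that contract, using sklearn's StandardScaler
# as a stand-in; the project's standardize() may be implemented differently.
def _demo_standardize_fold():
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    train = np.array([[100.0], [150.0], [200.0]])
    test = np.array([[120.0], [180.0]])
    scaler = StandardScaler().fit(train)                    # fit on the training fold only
    train_s, test_s = scaler.transform(train), scaler.transform(test)
    return scaler.inverse_transform(test_s)                 # recovers the original units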
def preprocessing_ohio(dataset, subject, ph, hist, day_len, n_days_test):
    """
    OhioT1DM dataset preprocessing pipeline:
    loading -> samples creation -> cleaning (1st) -> splitting -> cleaning (2nd) -> standardization
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "ohio"
    :param subject: id of the subject, e.g. "559"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    data = load_ohio(dataset, subject)
    data = resample(data, cs.freq)
    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    return train, valid, test, scalers
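# The two cleaning stages around the split follow the docstring above: fill_nans
# is assumed to interpolate short gaps once, before splitting (cheaper than doing
# it per fold), and remove_nans then drops the samples whose gaps could not be
# filled. A toy pandas sketch of that idea; the helpers' real implementations
# (gap limits, interpolation method) may differ.
def _demo_fill_then_remove_nans():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({"glucose": [110.0, np.nan, 130.0, np.nan]})
    # 1st cleaning: linearly fill gaps of at most one sample, inside the series
    df["glucose"] = df["glucose"].interpolate(limit=1, limit_area="inside")
    # 2nd cleaning: drop the samples that are still incomplete
    return df.dropna()  # rows: 110, 120 (interpolated), 130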
def preprocessing_idiab(dataset, subject, ph, hist, day_len, n_days_test):
    """
    IDIAB dataset preprocessing pipeline:
    loading -> remove anomalies -> resample -> remove last day -> samples creation -> cleaning (1st) ->
    features selection -> splitting -> cleaning (2nd) -> standardization
    First cleaning is done before splitting to speed up the preprocessing
    :param dataset: name of the dataset, e.g. "idiab"
    :param subject: id of the subject, e.g. "1"
    :param ph: prediction horizon, e.g. 30
    :param hist: history length, e.g. 60
    :param day_len: length of a day normalized by sampling frequency, e.g. 288 (1440/5)
    :param n_days_test: number of days in the testing set
    :return: training folds, validation folds, testing folds, list of scalers (one per fold)
    """
    printd("Preprocessing " + dataset + subject + "...")
    data = load(dataset, subject)
    data = remove_anomalies(data)
    data = resample(data, cs.freq)
    data = remove_last_day(data)
    # data["CHO"] = CPB(data, cs.C_bio, cs.t_max)
    # data["insulin"] = IOB(data, cs.K_DIA)
    # data["steps"] = AOB(data, cs.k_s)
    data = create_samples(data, ph, hist, day_len)
    data = fill_nans(data, day_len, n_days_test)
    to_drop = ["calories", "heartrate", "mets", "steps"]
    for col in data.columns:
        for ele in to_drop:
            if ele in col:
                data = data.drop(col, axis=1)
                break
    train, valid, test = split(data, day_len, n_days_test, cs.cv)
    [train, valid, test] = [remove_nans(set_) for set_ in [train, valid, test]]
    train, valid, test, scalers = standardize(train, valid, test)
    print(test[0].shape)
    return train, valid, test, scalers
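# The commented-out lines above hint at physiologically inspired features:
# carbs, insulin and activity "on board" (CPB, IOB, AOB). A hedged sketch of
# insulin-on-board as an exponential decay of past doses; the decay model and
# the k_dia value are illustrative assumptions, not the project's IOB().
def _demo_iob(insulin, k_dia=0.02):
    import numpy as np
    insulin = np.asarray(insulin, dtype=float)
    # each past dose contributes exp(-k_dia * age) of its amount at time t
    kernel = np.exp(-k_dia * np.arange(len(insulin)))
    return np.convolve(insulin, kernel)[: len(insulin)]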