def pred_CV(goal_datum, data_sources, n_folds=1, bootstrap=False, lin_reg_intercept=False):
    r"""Cross-validate the linear-regression predictor against a gold standard.

    For every fold, coefficients are fitted on the training indices with
    ``lin_reg`` (through ``tt.train_on_filter``) and predictions for the test
    indices are produced with ``lin_pred`` (through ``tt.test_on_filter``).
    The per-fold predictions are concatenated in fold order.

    Parameters
    ----------
    goal_datum : dictionary
        The gold standard data source (dependent variable). Its
        ``['data']['times']`` entry is copied unchanged into the result.
    data_sources : list
        Data sources to evaluate; each one is a dictionary acting as an
        independent variable of the regression.
    n_folds : integer
        Number of folds for k-fold cross-validation. Any value that is not
        greater than 1 yields a single "fold" that trains and tests on every
        index (i.e. an in-sample fit, no hold-out).
    bootstrap : boolean
        When true, the training indices of each fold are resampled with
        ``np.random.choice`` (same size as the original training set,
        sampling with replacement by default).
    lin_reg_intercept : boolean
        Forwarded to the training step to control whether the regression
        includes an intercept.

    Returns
    -------
    pred_series_CV : dictionary
        ``{'data': {'values': ..., 'times': ...}}`` where ``values`` is the
        concatenation (axis 0) of the per-fold predictions.
    """
    if n_folds > 1:
        splitter = KFold(n_folds)
        folds = list(splitter.split(range(dm.length(goal_datum))))
    else:
        # Degenerate case: one fold whose train and test sets are both "everything".
        all_indices = range(dm.length(goal_datum))
        folds = [(all_indices, all_indices)]

    fold_predictions = []
    for train_idx, test_idx in folds:
        if bootstrap:
            train_idx = np.random.choice(train_idx, len(train_idx))
        train_filter = tt.index_to_filter(train_idx)
        test_filter = tt.index_to_filter(test_idx)

        coefficients = tt.train_on_filter(
            lin_reg, goal_datum, data_sources, train_filter,
            lin_reg_intercept=lin_reg_intercept)
        predictor = partial(lin_pred, coefficients=coefficients)
        fold_predictions.append(
            tt.test_on_filter(predictor, data_sources, test_filter))

    return {
        'data': {
            'values': np.concatenate(fold_predictions, axis=0),
            # NOTE(review): assumes the concatenated folds line up with the
            # gold standard's time axis -- confirm against tt.test_on_filter.
            'times': goal_datum['data']['times'],
        }
    }
def EED(data_sources, alg, h, auto_reset=False, len_outbreak=8):
    r"""Generalized Early Event Detection (EED) with a pluggable update rule
    and an optional automatic reset.

    Parameters
    ----------
    data_sources : list
        Data sources to evaluate. Each one is a dictionary whose
        ``['data']['values']`` entry holds that source's series.
    alg : callable
        The update rule of the detection method (e.g. MEWMA, cCUSUM, MCUSUM
        with their parameters already bound). It is called as
        ``alg(S_prev, Y_t)`` and must return ``(S_next, E_next)``.
    h : number
        Alarm threshold. An alarm is raised at every step where ``E >= h``.
    auto_reset : boolean, optional
        If true, the S statistic is zeroed for the next step whenever the
        latest E exceeds ``h`` while the recent E values show a significantly
        (p < 0.05) negative linear trend. The default is False.
    len_outbreak : integer, optional
        The length of the outbreak; half of it (rounded) is the number of
        most recent E values used for the trend test. The default is 8.

    Returns
    -------
    alarm : array (boolean data type)
        Indicates at which time steps the EED method signals.
    S : list
        The updated S statistic returned by ``alg`` at each time step.
    E : list
        The test statistic E returned by ``alg`` at each time step.
    """
    n_steps = dm.length(data_sources[0])

    # Build the (sources x time) matrix once up front; it does not change
    # between time steps, so rebuilding it inside the loop is wasted work.
    observations = np.array([d['data']['values'] for d in data_sources])

    # Only needed (and only evaluated) when the reset logic is active.
    # NOTE: a value of 0 makes ``E[-0:]`` cover the whole history.
    len_regression = int(round(len_outbreak / 2)) if auto_reset else 0

    S, E = [], []
    S_prev = np.zeros(len(data_sources))
    for t in range(n_steps):
        Y_t = observations[:, t]  # one observation per data source at step t
        S_next, E_next = alg(S_prev, Y_t)
        S.append(S_next)
        E.append(E_next)
        S_prev = S_next

        if auto_reset:
            recent_E = E[-len_regression:]
            # NOTE(review): during the first steps the window holds fewer
            # points than len_regression -- confirm linregress copes with
            # such short inputs for the SciPy version in use.
            slope, _, _, p_value, _ = stats.linregress(
                range(len(recent_E)), recent_E)
            reset = (E[-1] > h) * (slope < 0) * (p_value < 0.05)
            # Multiplying by a boolean zeroes the statistic on reset and
            # leaves it untouched otherwise.
            S_prev = (not reset) * S_prev

    alarm = (np.array(E) >= h)
    return alarm, S, E
def cross_validate(train_func, test_func, goal_datum, data_sources, num_folds=1):
    r"""Run a generic k-fold cross-validation of a train/test function pair.

    Parameters
    ----------
    train_func : callable
        Training function handed to ``tt.train_on_filter`` together with the
        gold standard, the data sources and the fold's training filter.
        Whatever it returns is passed on to ``test_func``.
    test_func : callable
        Prediction function. It is bound with ``train_params=<training
        result>`` and handed to ``tt.test_on_filter`` together with the data
        sources and the fold's test filter.
    goal_datum : dictionary
        The gold standard data source.
    data_sources : list
        Data sources to evaluate; each one is a dictionary.
    num_folds : integer
        Number of folds. Any value that is not greater than 1 yields a single
        "fold" that trains and tests on every index, matching the behaviour
        of the other cross-validation helpers in this file (``KFold`` itself
        requires at least two splits).

    Returns
    -------
    pred_series_CV : ndarray
        The per-fold predictions concatenated along axis 0, in fold order.
    """
    n_obs = dm.length(goal_datum)
    if num_folds > 1:
        folds = list(KFold(num_folds).split(range(n_obs)))
    else:
        all_indices = range(n_obs)
        folds = [(all_indices, all_indices)]

    fold_predictions = []
    for train, test in folds:
        train_f, test_f = tt.index_to_filter(train), tt.index_to_filter(test)
        params = tt.train_on_filter(train_func, goal_datum, data_sources, train_f)
        fold_predictions.append(
            tt.test_on_filter(
                partial(test_func, train_params=params), data_sources, test_f))

    return np.concatenate(fold_predictions, axis=0)
def pred_CV_candidate(goal_datum, data_sources, threshold, l, h, auto_reset=False, len_outbreak=8, n_folds=1, bootstrap=False):
    r"""Cross-validated train/test routine for early event detection.

    For every fold, the null distribution is estimated on the training
    indices with ``null_dist`` (bound to ``threshold``); the resulting
    ``mu_0`` and ``inv_Sigma_0`` parameterise a MEWMA update rule that is
    run through ``EED`` on the test indices.

    Parameters
    ----------
    goal_datum : dictionary
        The gold standard data source. Its ``['data']['times']`` entry is
        copied unchanged into the returned alarm dictionary.
    data_sources : list
        Data sources to evaluate; each one is a dictionary.
    threshold : float
        Event baseline forwarded to ``null_dist``. Values above the threshold
        count as an event, values at or below it as no event.
    l : float
        The smoothing parameter forwarded to MEWMA.
    h : number
        Alarm threshold forwarded to ``EED``; below it no alarm is triggered.
    auto_reset : boolean
        Whether the updated S statistic should be reset (forwarded to
        ``EED``). The default is False.
    len_outbreak : integer
        The length of the outbreak (forwarded to ``EED``). The default is 8.
    n_folds : integer
        Number of folds for k-fold cross-validation. Any value that is not
        greater than 1 yields a single "fold" that trains and tests on every
        index.
    bootstrap : boolean
        When true, the training indices of each fold are resampled with
        ``np.random.choice`` (same size as the original training set,
        sampling with replacement by default).

    Returns
    -------
    alarm_CV : dictionary
        ``{'data': {'values': ..., 'times': ...}}`` where ``values`` is the
        concatenation of the per-fold alarm arrays.
    S_CV : ndarray
        The per-fold S statistics concatenated along axis 0.
    E_CV : ndarray
        The per-fold E test statistics concatenated along axis 0.
    """
    if n_folds > 1:
        splitter = KFold(n_folds)
        folds = list(splitter.split(range(dm.length(goal_datum))))
    else:
        # Degenerate case: one fold whose train and test sets are both "everything".
        all_indices = range(dm.length(goal_datum))
        folds = [(all_indices, all_indices)]

    fold_alarms = []
    fold_S = []
    fold_E = []
    for train_idx, test_idx in folds:
        if bootstrap:
            train_idx = np.random.choice(train_idx, len(train_idx))
        train_filter = tt.index_to_filter(train_idx)
        test_filter = tt.index_to_filter(test_idx)

        # Estimate the in-control (null) distribution on the training part.
        fit_null = partial(null_dist, threshold=threshold)
        _, mu_0, inv_Sigma_0 = tt.train_on_filter(
            fit_null, goal_datum, data_sources, train_filter)

        # Detector for this fold: EED driven by a MEWMA update rule.
        mewma_update = partial(MEWMA, mu_0=mu_0, inv_Sigma_0=inv_Sigma_0, l=l)
        detector = partial(EED, alg=mewma_update, h=h,
                           auto_reset=auto_reset, len_outbreak=len_outbreak)

        alarm, S, E = tt.test_on_filter(detector, data_sources, test_filter)
        fold_alarms.append(alarm)
        fold_S.append(S)
        fold_E.append(E)

    S_CV = np.concatenate(fold_S, axis=0)
    E_CV = np.concatenate(fold_E, axis=0)
    alarm_CV = {
        'data': {
            'values': np.concatenate(fold_alarms, axis=0),
            # NOTE(review): assumes the concatenated folds line up with the
            # gold standard's time axis -- confirm against tt.test_on_filter.
            'times': goal_datum['data']['times'],
        }
    }
    return alarm_CV, S_CV, E_CV