Example #1
def pred_CV(goal_datum,
            data_sources,
            n_folds=1,
            bootstrap=False,
            lin_reg_intercept=False):
    r"""Test the performance of the linear regression model using
    cross-validation.

    Parameters
    ----------
    goal_datum : dictionary
        The gold standard data source, which is the dependent variable in the
        linear regression.
    data_sources : list
        List of data sources that will be evaluated. Each data source is a
        dictionary, which is an independent variable in the linear
        regression model.
    n_folds : integer
        Number of folds in k-fold cross-validation.
    bootstrap : boolean
        Whether or not to use bootstrap data sets, which are created by
        sampling with replacement and have the same size as the original
        training dataset.
    lin_reg_intercept : boolean
        Whether to include an intercept in the linear regression (and
        prediction).

    Returns
    -------
    pred_series_CV : dictionary
        Gold standard values predicted from independent variables (data
        sources).

    """
    if n_folds > 1:
        # kf = KFold(dm.length(goal_datum), n_folds)
        kf_p = KFold(n_folds)
        kf = list(kf_p.split(range(dm.length(goal_datum))))
    else:
        # With a single fold, train and test on the entire series.
        v = range(dm.length(goal_datum))
        kf = [(v, v)]
    pred_series_CV_list = []
    for train, test in kf:
        if bootstrap:
            train = np.random.choice(train, len(train))
        train_f, test_f = tt.index_to_filter(train), tt.index_to_filter(test)
        coef = tt.train_on_filter(lin_reg,
                                  goal_datum,
                                  data_sources,
                                  train_f,
                                  lin_reg_intercept=lin_reg_intercept)
        pred_series = tt.test_on_filter(partial(lin_pred, coefficients=coef),
                                        data_sources, test_f)
        pred_series_CV_list.append(pred_series)
    pred_series_CV = {}
    pred_series_CV['data'] = {}
    pred_series_CV['data']['values'] = np.concatenate(pred_series_CV_list,
                                                      axis=0)
    pred_series_CV['data']['times'] = goal_datum['data']['times']
    return pred_series_CV
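
The dm, tt, lin_reg and lin_pred helpers above come from the surrounding project and are not shown in these examples. Purely as a minimal, self-contained sketch of the same out-of-fold prediction idea, assuming scikit-learn and synthetic stand-in data:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))                 # three stand-in data sources
y = X @ np.array([0.5, -1.0, 2.0]) + rng.normal(scale=0.1, size=100)

# Out-of-fold predictions: train on each fold's complement, predict its fold.
pred = np.empty_like(y)
for train_idx, test_idx in KFold(n_splits=5).split(X):
    model = LinearRegression(fit_intercept=False).fit(X[train_idx], y[train_idx])
    pred[test_idx] = model.predict(X[test_idx])

# pred_CV wraps such predictions in the project's dictionary convention:
# {'data': {'values': ..., 'times': goal_datum['data']['times']}}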
Example #2
def EED(data_sources, alg, h, auto_reset=False, len_outbreak=8):
    r"""Generalized Early Event Detection (EED) method with a chosen update
    method and an optional reset.

    Parameters
    ----------
    data_sources : list
        List of data sources that will be evaluated. Each data source is a
        dictionary.
    alg : callable
        An early event detection update function, e.g. MEWMA, cCUSUM or
        MCUSUM, called as alg(S_prev, Y_t).
    h : number
        A value below which an alarm will not be triggered (no outbreak).
    auto_reset : boolean, optional
        Whether the updated S statistic should be reset automatically. The
        default is False.
    len_outbreak : integer, optional
        The length of the outbreak. The default length is 8.

    Returns
    -------
    alarm : array (boolean data type)
        Indicates at which time steps the EED method signals an alarm.
    S : list
        Elements in the list are arrays; each element is the updated S
        statistic at one time step.
    E : list
        Elements in the list are the EED method test statistic E values, one
        per time step.

    """
    S, E = [], []
    S_prev = np.zeros(len(data_sources))
    # Stack the data source series once: rows are sources, columns are times.
    series = np.array([d['data']['values'] for d in data_sources])
    for t in range(dm.length(data_sources[0])):
        Y_t = series[:, t]
        S_next, E_next = alg(S_prev, Y_t)
        S.append(S_next)
        E.append(E_next)
        S_prev = S_next
        if auto_reset:
            # Fit a short linear trend to the most recent test statistics.
            len_regression = int(round(len_outbreak / 2))
            slope, _, _, p_value, _ = stats.linregress(
                range(len(E[-len_regression:])), E[-len_regression:])
            # Reset S once the statistic is above the threshold but trending
            # significantly downward (the outbreak is tailing off).
            reset = (E[-1] > h) * (slope < 0) * (p_value < 0.05)
            S_prev = (not reset) * S_prev
    alarm = (np.array(E) >= h)
    return alarm, S, E
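
The MEWMA, cCUSUM and MCUSUM update functions are defined elsewhere in the project. To illustrate the alg(S_prev, Y_t) interface that EED expects, here is a rough MEWMA-style sketch with assumed details; it is not the project's actual implementation:

import numpy as np
from functools import partial

def mewma_update(S_prev, Y_t, mu_0, inv_Sigma_0, l):
    # Exponentially weighted deviation of the observation from the null mean.
    S_next = l * (Y_t - mu_0) + (1 - l) * S_prev
    # Quadratic-form test statistic against the null (inverse) covariance.
    E_next = float(S_next @ inv_Sigma_0 @ S_next)
    return S_next, E_next

# Plugged into EED the same way MEWMA is used in Example #4:
# alg = partial(mewma_update, mu_0=mu_0, inv_Sigma_0=inv_Sigma_0, l=0.3)
# alarm, S, E = EED(data_sources, alg, h=4.0)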
Example #3
def cross_validate(train_func,
                   test_func,
                   goal_datum,
                   data_sources,
                   num_folds=1):
    r"""Cross-validated train/test loop: fit train_func on each training fold,
    apply test_func with the trained parameters to the test fold, and
    concatenate the per-fold predictions."""
    # kf = KFold(dm.length(goal_datum), n_folds)
    kf_p = KFold(num_folds)
    kf = list(kf_p.split(range(dm.length(goal_datum))))

    pred_series_CV_list = []
    for train, test in kf:
        train_f, test_f = tt.index_to_filter(train), tt.index_to_filter(test)
        params = tt.train_on_filter(train_func, goal_datum, data_sources,
                                    train_f)
        pred_series_CV = tt.test_on_filter(
            partial(test_func, train_params=params), data_sources, test_f)
        pred_series_CV_list.append(pred_series_CV)
    pred_series_CV = np.concatenate(pred_series_CV_list, axis=0)
    return pred_series_CV
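
A self-contained sketch of the same generic train/test loop, with a hypothetical mean predictor standing in for train_func and test_func (the dm and tt helpers are project-specific):

import numpy as np
from sklearn.model_selection import KFold

def train_mean(y_train):
    # "Training" here is just estimating the mean of the training fold.
    return y_train.mean()

def test_mean(n_test, train_params):
    # "Testing" predicts that constant for every held-out point.
    return np.full(n_test, train_params)

y = np.arange(20, dtype=float)
preds = []
for train_idx, test_idx in KFold(n_splits=4).split(y):
    params = train_mean(y[train_idx])
    preds.append(test_mean(len(test_idx), params))
pred_series = np.concatenate(preds, axis=0)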
Example #4
def pred_CV_candidate(goal_datum,
                      data_sources,
                      threshold,
                      l,
                      h,
                      auto_reset=False,
                      len_outbreak=8,
                      n_folds=1,
                      bootstrap=False):
    r"""Cross-validated train/test function for early detection.

    Parameters
    ----------
    goal_datum : dictionary
        The gold standard data source.
    data_sources : list
        List of data sources that will be evaluated. Each data source is a
        dictionary.
    threshold : float
        Event baseline: values above the threshold indicate an event; values
        at or below it do not.
    l : float
        The smoothing parameter of the MEWMA EED method.
    h : number
        A value below which an alarm will not be triggered (no event).
    auto_reset : boolean
        Whether the updated S statistic should be reset automatically. The
        default is False.
    len_outbreak : integer
        The length of the outbreak. The default length is 8.
    n_folds : integer
        Number of folds in k-fold cross-validation.
    bootstrap : boolean
        Whether or not to use bootstrap data sets, which are created by
        sampling with replacement and have the same size as the original
        training dataset.

    Returns
    -------
    alarm_CV : dictionary
        Contains the dates and whether or not an alarm is triggered at each
        date.
    S_CV : ndarray
        Updated S statistics at each time step, concatenated across folds.
    E_CV : array
        EED method test statistic E values at each time step, concatenated
        across folds.

    """
    if n_folds > 1:
        # kf = KFold(dm.length(goal_datum), n_folds)
        kf_p = KFold(n_folds)
        kf = list(kf_p.split(range(dm.length(goal_datum))))
    else:
        v = range(dm.length(goal_datum))
        kf = [(v, v)]
    alarm_CV_list, S_CV_list, E_CV_list = [], [], []
    for train, test in kf:
        if bootstrap:
            train = np.random.choice(train, len(train))
        train_f, test_f = tt.index_to_filter(train), tt.index_to_filter(test)
        # Estimate the null (non-event) mean and inverse covariance on the
        # training fold.
        _, mu_0, inv_Sigma_0 = tt.train_on_filter(
            partial(null_dist, threshold=threshold), goal_datum, data_sources,
            train_f)
        # Build a MEWMA-based EED detector from the fitted null distribution
        # and run it on the test fold.
        EED_MEWMA = partial(EED,
                            alg=partial(MEWMA,
                                        mu_0=mu_0,
                                        inv_Sigma_0=inv_Sigma_0,
                                        l=l),
                            h=h,
                            auto_reset=auto_reset,
                            len_outbreak=len_outbreak)
        alarm, S, E = tt.test_on_filter(EED_MEWMA, data_sources, test_f)
        alarm_CV_list.append(alarm)
        S_CV_list.append(S)
        E_CV_list.append(E)
    S_CV = np.concatenate(S_CV_list, axis=0)
    E_CV = np.concatenate(E_CV_list, axis=0)
    alarm_CV = {}
    alarm_CV['data'] = {}
    alarm_CV['data']['values'] = np.concatenate(alarm_CV_list, axis=0)
    alarm_CV['data']['times'] = goal_datum['data']['times']
    return alarm_CV, S_CV, E_CV
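
null_dist is not shown in these examples. As a loose, self-contained sketch of the kind of baseline it appears to fit (a mean mu_0 and inverse covariance inv_Sigma_0 estimated over non-event time points), with all names and details assumed rather than taken from the project:

import numpy as np

def null_dist_sketch(goal_values, source_matrix, threshold):
    # goal_values: shape (T,) gold standard; source_matrix: (n_sources, T).
    quiet = goal_values <= threshold              # non-event time points
    baseline = source_matrix[:, quiet]
    mu_0 = baseline.mean(axis=1)
    inv_Sigma_0 = np.linalg.pinv(np.atleast_2d(np.cov(baseline)))
    # First element is a placeholder to mirror the three-value unpacking above.
    return None, mu_0, inv_Sigma_0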