Example #1
# Imports inferred from usage in this snippet (the ParCorr import path may
# differ between tigramite versions):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tigramite.independence_tests import ParCorr


def df_data_remove_z(df_data,
                     z,
                     keys=None,
                     standardize: bool = True,
                     plot: bool = True):
    '''Regress the influence of z out of the columns in keys.

    Parameters
    ----------
    df_data : pd.DataFrame
        DataFrame containing timeseries, with a (split, time) MultiIndex.
    z : str or list
        Variable(s) whose influence will be removed from the columns in keys.
    keys : list, optional
        Columns to compute residuals for. The default is all columns except
        z, 'TrainIsTrue' and 'RV_mask'.
    standardize : bool, optional
        Standardize each timeseries before regression. The default is True.
    plot : bool, optional
        Plot original and residual timeseries. The default is True.

    Returns
    -------
    df_new : pd.DataFrame
        DataFrame with the residual timeseries. If plot is True, the tuple
        (df_new, fig) is returned instead.
    '''
    method = ParCorr()

    if isinstance(z, str):
        z = [z]
    if keys is None:
        discard = ['TrainIsTrue', 'RV_mask'] + z
        keys = [k for k in df_data.columns if k not in discard]

    npstore = np.zeros(shape=(len(keys), df_data.index.levels[0].size,
                              df_data.index.levels[1].size))
    for i, orig in enumerate(keys):
        # Create a fake (X, Y) two-column format, as required by
        # _get_single_residuals.
        dfxy = df_data[[orig]].merge(
            df_data[[orig]].rename({orig: 'copy'}, axis=1),
            left_index=True,
            right_index=True)
        # Append Z timeseries
        dfxyz = dfxy.merge(df_data[z], left_index=True, right_index=True)

        for s in df_data.index.levels[0]:
            dfxyz_s = dfxyz.loc[s].copy()
            if dfxyz_s[orig].isna().all():
                npstore[i, s, :] = dfxyz_s[orig].values  # fill in all nans
            else:
                npstore[i, s, :] = method._get_single_residuals(
                    np.moveaxis(dfxyz_s.values, 0, 1),
                    0,
                    standardize=standardize)

    df_new = pd.DataFrame(np.moveaxis(npstore, 0, 2).reshape(-1, len(keys)),
                          index=df_data.index,
                          columns=keys)
    if plot:
        fig, axes = plt.subplots(len(keys),
                                 1,
                                 figsize=(10, 2.5 * len(keys)),
                                 sharex=True)
        if len(keys) == 1:
            axes = [axes]
        for i, k in enumerate(keys):
            df_data[k].loc[0].plot(ax=axes[i],
                                   label=f'{k} original',
                                   legend=False,
                                   color='green',
                                   lw=1,
                                   alpha=.8)
            df_new[k].loc[0].plot(ax=axes[i],
                                  label=f"{', '.join(z)} regressed out",
                                  legend=False,
                                  color='blue',
                                  lw=1)
            axes[i].legend()
        out = (df_new, fig)
    else:
        out = df_new
    return out
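
A minimal usage sketch for Example #1 (an illustration, not part of the
original snippet; the toy column names 'target' and 'z' are made up, and
tigramite's ParCorr must be importable for the call to work):

import numpy as np
import pandas as pd

# Toy data: one split (index level 0) and 100 timesteps (level 1).
rng = np.random.RandomState(42)
idx = pd.MultiIndex.from_product([[0], pd.date_range('2000-01-01', periods=100)])
z_ts = rng.randn(100)
df_data = pd.DataFrame({'target': 0.8 * z_ts + 0.5 * rng.randn(100),
                        'z': z_ts},
                       index=idx)

# Remove the linear influence of 'z' from every other column (here: 'target').
df_res, fig = df_data_remove_z(df_data, z='z', plot=True)
print(df_res.head())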
Example #2
# Imports inferred from usage, as in Example #1: numpy as np, pandas as pd,
# matplotlib.pyplot as plt, and tigramite's ParCorr.
def df_data_remove_z(df_data,
                     z_keys,
                     lag_z=[0],
                     keys=None,
                     standardize: bool = True,
                     plot: bool = True):
    '''Regress the influence of (lagged) z_keys out of the columns in keys.

    Parameters
    ----------
    df_data : pd.DataFrame
        DataFrame containing timeseries. A plain index is wrapped in a fake
        (split, time) MultiIndex.
    z_keys : str or list
        Variable(s) whose influence will be removed from the columns in keys.
    lag_z : int or list, optional
        Lag(s), in timesteps, at which z_keys are regressed out. The default
        is [0].
    keys : list, optional
        Columns to compute residuals for. The default is all columns except
        z_keys, 'TrainIsTrue' and 'RV_mask'.
    standardize : bool, optional
        Standardize each timeseries before regression. The default is True.
    plot : bool, optional
        Plot original and residual timeseries. The default is True.

    Returns
    -------
    df_new : pd.DataFrame
        DataFrame with the residual timeseries. If plot is True, the tuple
        (df_new, fig) is returned instead.
    '''
    method = ParCorr()

    if isinstance(z_keys, str):
        z_keys = [z_keys]
    if keys is None:
        discard = ['TrainIsTrue', 'RV_mask'] + z_keys
        keys = [k for k in df_data.columns if k not in discard]
    if not hasattr(df_data.index, 'levels'):
        # Create a fake MultiIndex to match the standard data format.
        df_data.index = pd.MultiIndex.from_product([[0], df_data.index])

    if isinstance(lag_z, int):
        lag_z = [lag_z]

    max_lag = max(lag_z)
    dates = df_data.index.levels[1]
    df_z = df_data[z_keys]
    zlist = []
    if 0 in lag_z:
        zlist = [df_z.loc[pd.IndexSlice[:, dates[max_lag:]], :]]
    for lag in lag_z:
        if lag != 0:
            zlist.append(df_z.loc[pd.IndexSlice[:, dates[max_lag - lag:-lag]], :])
    # update df_data to account for lags (first dates have no lag):
    df_data = df_data.loc[pd.IndexSlice[:, dates[max_lag:]], :]
    # align index dates for easy merging
    for d_ in zlist:
        d_.index = df_data.index
    df_z = pd.concat(zlist, axis=1).loc[pd.IndexSlice[:, dates[max_lag:]], :]
    # update columns with lag indication
    df_z.columns = [
        f'{c[0]}_lag{c[1]}'
        for c in np.array(np.meshgrid(z_keys, lag_z)).T.reshape(-1, 2)
    ]

    npstore = np.zeros(shape=(len(keys), df_data.index.levels[0].size,
                              dates[max_lag:].size))
    for i, orig in enumerate(keys):
        # Create a fake (X, Y) two-column format, as required by
        # _get_single_residuals.
        dfxy = df_data[[orig]].merge(
            df_data[[orig]].rename({orig: 'copy'}, axis=1),
            left_index=True,
            right_index=True)
        # Append Z timeseries
        dfxyz = dfxy.merge(df_z, left_index=True, right_index=True)

        for s in df_data.index.levels[0]:
            dfxyz_s = dfxyz.loc[s].copy()
            if dfxyz_s[orig].isna().all():
                npstore[i, s, :] = dfxyz_s[orig].values  # fill in all nans
            else:
                npstore[i, s, :] = method._get_single_residuals(
                    np.moveaxis(dfxyz_s.values, 0, 1),
                    0,
                    standardize=standardize,
                    return_means=True)[0]

    df_new = pd.DataFrame(np.moveaxis(npstore, 0, 2).reshape(-1, len(keys)),
                          index=df_data.index,
                          columns=keys)
    if plot:
        fig, axes = plt.subplots(len(keys),
                                 1,
                                 figsize=(10, 2.5 * len(keys)),
                                 sharex=True)
        if len(keys) == 1:
            axes = [axes]
        for i, k in enumerate(keys):
            df_data[k].loc[0].plot(ax=axes[i],
                                   label=f'{k} original',
                                   legend=False,
                                   color='green',
                                   lw=1,
                                   alpha=.8)
            cols = ', '.join(c.replace('_', ' ') for c in df_z.columns)
            df_new[k].loc[0].plot(ax=axes[i],
                                  label=cols + ' regressed out',
                                  legend=False,
                                  color='blue',
                                  lw=1)
            axes[i].legend()
        out = (df_new, fig)
    else:
        out = df_new
    return out
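
Compared to Example #1, this variant also accepts a plain (non-MultiIndex)
DataFrame and can regress out z at one or more lags. A usage sketch, reusing
the toy df_data built in the sketch above (illustrative only):

# Regress out 'z' at lags 0 and 1 timestep; the first max(lag_z) dates are
# dropped because no lagged z is available for them.
df_res, fig = df_data_remove_z(df_data, z_keys='z', lag_z=[0, 1], plot=True)
print(df_res.head())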
# Inferred context (not shown in the snippet): numpy, tigramite.data_processing
# as pp, and tigramite's ParCorr, GPDC, CMIknn, CMIsymb, _construct_array and
# _par_corr_to_cmi are assumed to be imported.
verbosity = 0  # module-level verbosity used by the _construct_array calls below


class TestCondInd():  # unittest.TestCase base class intentionally commented out

    def setUp(self):

        auto = 0.6
        coeff = 0.6
        T = 1000
        numpy.random.seed(42)
        # True graph
        links_coeffs = {
            0: [((0, -1), auto)],
            1: [((1, -1), auto), ((0, -1), coeff)],
            2: [((2, -1), auto), ((1, -1), coeff)]
        }

        self.data, self.true_parents_coeffs = pp.var_process(links_coeffs, T=T)
        T, N = self.data.shape

        self.ci_par_corr = ParCorr(use_mask=False,
                                   mask_type=None,
                                   significance='analytic',
                                   fixed_thres=None,
                                   sig_samples=10000,
                                   sig_blocklength=3,
                                   confidence='analytic',
                                   conf_lev=0.9,
                                   conf_samples=10000,
                                   conf_blocklength=1,
                                   recycle_residuals=False,
                                   verbosity=0)

        self.ci_gpdc = GPDC(significance='analytic',
                            sig_samples=1000,
                            sig_blocklength=1,
                            confidence='bootstrap',
                            conf_lev=0.9,
                            conf_samples=100,
                            conf_blocklength=None,
                            use_mask=False,
                            mask_type='y',
                            recycle_residuals=False,
                            verbosity=0)

    def test_construct_array(self):

        data = numpy.array([[0, 10, 20, 30], [1, 11, 21, 31], [2, 12, 22, 32],
                            [3, 13, 23, 33], [4, 14, 24, 34], [5, 15, 25, 35],
                            [6, 16, 26, 36]])
        data_mask = numpy.array(
            [[0, 1, 1, 0], [0, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 1],
             [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
            dtype='bool')

        X = [(1, -1)]
        Y = [(0, 0)]
        Z = [(0, -1), (1, -2), (2, 0)]

        tau_max = 2

        # No masking
        res = _construct_array(X=X,
                               Y=Y,
                               Z=Z,
                               tau_max=tau_max,
                               use_mask=False,
                               data=data,
                               mask=data_mask,
                               missing_flag=None,
                               mask_type=None,
                               verbosity=verbosity)
        print(res[0])
        numpy.testing.assert_almost_equal(
            res[0],
            numpy.array([[13, 14, 15], [4, 5, 6], [3, 4, 5], [12, 13, 14],
                         [24, 25, 26]]))
        numpy.testing.assert_almost_equal(res[1], numpy.array([0, 1, 2, 2, 2]))

        # masking y
        res = _construct_array(X=X,
                               Y=Y,
                               Z=Z,
                               tau_max=tau_max,
                               use_mask=True,
                               data=data,
                               mask=data_mask,
                               mask_type=['y'],
                               verbosity=verbosity)
        print(res[0])

        numpy.testing.assert_almost_equal(
            res[0],
            numpy.array([[13, 14, 15], [4, 5, 6], [3, 4, 5], [12, 13, 14],
                         [24, 25, 26]]))

        numpy.testing.assert_almost_equal(res[1], numpy.array([0, 1, 2, 2, 2]))

        # masking all
        res = _construct_array(X=X,
                               Y=Y,
                               Z=Z,
                               tau_max=tau_max,
                               use_mask=True,
                               data=data,
                               mask=data_mask,
                               mask_type=['x', 'y', 'z'],
                               verbosity=verbosity)
        print(res[0])

        numpy.testing.assert_almost_equal(
            res[0],
            numpy.array([[13, 14, 15], [4, 5, 6], [3, 4, 5], [12, 13, 14],
                         [24, 25, 26]]))

        numpy.testing.assert_almost_equal(res[1], numpy.array([0, 1, 2, 2, 2]))

    def test_missing_values(self):

        data = numpy.array([
            [0, 10, 20, 30],
            [1, 11, 21, 31],
            [2, 12, 22, 32],
            [3, 13, 999, 33],
            [4, 14, 24, 34],
            [5, 15, 25, 35],
            [6, 16, 26, 36],
        ])
        data_mask = numpy.array(
            [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0],
             [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
            dtype='bool')

        X = [(1, -2)]
        Y = [(0, 0)]
        Z = [(2, -1)]

        tau_max = 1

        # Missing values
        res = _construct_array(X=X,
                               Y=Y,
                               Z=Z,
                               tau_max=tau_max,
                               use_mask=False,
                               data=data,
                               mask=data_mask,
                               missing_flag=999,
                               mask_type=['y'],
                               verbosity=verbosity)

        # print res[0]
        numpy.testing.assert_almost_equal(
            res[0], numpy.array([[10, 14], [2, 6], [21, 25]]))

    def test_bootstrap_vs_analytic_confidence_parcorr(self):

        cov = numpy.array([[1., 0.3], [0.3, 1.]])
        array = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                 cov=cov,
                                                 size=150).T

        val = numpy.corrcoef(array)[0, 1]
        # print val
        dim, T = array.shape
        xyz = numpy.array([0, 1])

        conf_ana = self.ci_par_corr.get_analytic_confidence(
            df=T - dim, value=val, conf_lev=self.ci_par_corr.conf_lev)

        conf_boots = self.ci_par_corr.get_bootstrap_confidence(
            array,
            xyz,
            dependence_measure=self.ci_par_corr.get_dependence_measure,
            conf_samples=self.ci_par_corr.conf_samples,
            conf_blocklength=self.ci_par_corr.conf_blocklength,
            conf_lev=self.ci_par_corr.conf_lev,
        )

        print(conf_ana)
        print(conf_boots)

        numpy.testing.assert_allclose(numpy.array(conf_ana),
                                      numpy.array(conf_boots),
                                      atol=0.01)

    def test_shuffle_vs_analytic_significance_parcorr(self):

        cov = numpy.array([[1., 0.04], [0.04, 1.]])
        array = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                 cov=cov,
                                                 size=250).T
        # array = numpy.random.randn(3, 10)
        val = numpy.corrcoef(array)[0, 1]
        # print val
        dim, T = array.shape
        xyz = numpy.array([0, 1])

        pval_ana = self.ci_par_corr.get_analytic_significance(value=val,
                                                              T=T,
                                                              dim=dim)

        pval_shuffle = self.ci_par_corr.get_shuffle_significance(
            array, xyz, val)
        # Adjust p-value for two-sided measures

        print(pval_ana)
        print(pval_shuffle)

        numpy.testing.assert_allclose(numpy.array(pval_ana),
                                      numpy.array(pval_shuffle),
                                      atol=0.01)

    def test__parcorr_get_single_residuals(self):

        target_var = 0  #numpy.array([True, False, False, False])
        true_residual = numpy.random.randn(4, 1000)

        array = numpy.copy(true_residual)

        array[0] += 0.5 * array[2:].sum(axis=0)

        est_residual = self.ci_par_corr._get_single_residuals(
            array, target_var, standardize=False, return_means=False)

        # print est_residual[:10]
        # print true_residual[0, :10]
        numpy.testing.assert_allclose(est_residual,
                                      true_residual[0],
                                      atol=0.01)

    def test_par_corr(self):

        val_ana = 0.6
        T = 1000
        array = numpy.random.randn(5, T)

        cov = numpy.array([[1., val_ana], [val_ana, 1.]])
        array[:2, :] = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                        cov=cov,
                                                        size=T).T

        # Generate some confounding
        array[0] += 0.5 * array[2:].sum(axis=0)
        array[1] += 0.7 * array[2:].sum(axis=0)

        # print numpy.corrcoef(array)[0,1]
        # print val
        dim, T = array.shape
        xyz = numpy.array([0, 1, 2, 2, 2])

        val_est = self.ci_par_corr.get_dependence_measure(array, xyz)

        print(val_est)
        print(val_ana)

        numpy.testing.assert_allclose(numpy.array(val_ana),
                                      numpy.array(val_est),
                                      atol=0.02)

    def test__gpdc_get_single_residuals(self):

        ci_test = self.ci_gpdc
        # ci_test = self.ci_par_corr

        c = .3
        T = 1000

        numpy.random.seed(42)

        def func(x):
            return x * (1. - 4. * x**0 * numpy.exp(-x**2 / 2.))

        array = numpy.random.randn(3, T)
        array[1] += c * func(array[2])  #.sum(axis=0)
        xyz = numpy.array([0, 1] + [2 for i in range(array.shape[0] - 2)])

        target_var = 1

        dim, T = array.shape
        # array -= array.mean(axis=1).reshape(dim, 1)
        c_std = c  #/array[1].std()
        # array /= array.std(axis=1).reshape(dim, 1)
        array_orig = numpy.copy(array)

        (est_residual, pred) = ci_test._get_single_residuals(array,
                                                             target_var,
                                                             standardize=False,
                                                             return_means=True)

        # Testing that in the center the fit is good
        center = numpy.where(numpy.abs(array_orig[2]) < .7)[0]
        print(pred[center][:10].round(2))
        print((c_std * func(array_orig[2][center]))[:10].round(2))
        numpy.testing.assert_allclose(pred[center],
                                      c_std * func(array_orig[2][center]),
                                      atol=0.2)

    def plot__gpdc_get_single_residuals(self):

        #######
        ci_test = self.ci_gpdc
        # ci_test = self.ci_par_corr

        a = 0.
        c = .3
        T = 500
        # Each key refers to a variable and the incoming links are supplied as a
        # list of format [((driver, lag), coeff), ...]
        links_coeffs = {
            0: [((0, -1), a)],
            1: [((1, -1), a), ((0, -1), c)],
        }

        numpy.random.seed(42)
        data, true_parents_neighbors = pp.var_process(links_coeffs,
                                                      use='inv_inno_cov',
                                                      T=T)
        dataframe = pp.DataFrame(data)
        ci_test.set_dataframe(dataframe)

        # ci_test.set_tau_max(1)

        # X=[(1, -1)]
        # Y=[(1, 0)]
        # Z=[(0, -1)] + [(1, -tau) for tau in range(1, 2)]
        # array, xyz, XYZ = ci_test.get_array(X, Y, Z,
        #     verbosity=0)]
        # ci_test.run_test(X, Y, Z,)
        def func(x):
            return x * (1. - 4. * x**0 * numpy.exp(-x**2 / 2.))

        true_residual = numpy.random.randn(3, T)
        array = numpy.copy(true_residual)
        array[1] += c * func(array[2])  #.sum(axis=0)
        xyz = numpy.array([0, 1] + [2 for i in range(array.shape[0] - 2)])

        print('xyz ', xyz, numpy.where(xyz == 1))
        target_var = 1

        dim, T = array.shape
        # array -= array.mean(axis=1).reshape(dim, 1)
        c_std = c  #/array[1].std()
        # array /= array.std(axis=1).reshape(dim, 1)
        array_orig = numpy.copy(array)

        from matplotlib import pyplot
        (est_residual, pred) = ci_test._get_single_residuals(array,
                                                             target_var,
                                                             standardize=False,
                                                             return_means=True)
        (resid_, pred_parcorr) = self.ci_par_corr._get_single_residuals(
            array, target_var, standardize=False, return_means=True)

        fig = pyplot.figure()
        ax = fig.add_subplot(111)
        ax.scatter(array_orig[2], array_orig[1])
        ax.scatter(array_orig[2], pred, color='red')
        ax.scatter(array_orig[2], pred_parcorr, color='green')
        ax.plot(numpy.sort(array_orig[2]),
                c_std * func(numpy.sort(array_orig[2])),
                color='black')

        pyplot.savefig('/home/jakobrunge/test/gpdctest.pdf')

    def test_shuffle_vs_analytic_significance_gpdc(self):

        cov = numpy.array([[1., 0.2], [0.2, 1.]])
        array = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                 cov=cov,
                                                 size=245).T

        dim, T = array.shape
        xyz = numpy.array([0, 1])

        val = self.ci_gpdc.get_dependence_measure(array, xyz)

        pval_ana = self.ci_gpdc.get_analytic_significance(value=val,
                                                          T=T,
                                                          dim=dim)

        pval_shuffle = self.ci_gpdc.get_shuffle_significance(array, xyz, val)

        print(pval_ana)
        print(pval_shuffle)

        numpy.testing.assert_allclose(numpy.array(pval_ana),
                                      numpy.array(pval_shuffle),
                                      atol=0.05)

    # Near-zero dependence variant of the shuffle vs analytic GPDC test.
    def test_shuffle_vs_analytic_significance_gpdc_weak(self):

        cov = numpy.array([[1., 0.01], [0.01, 1.]])
        array = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                 cov=cov,
                                                 size=300).T

        dim, T = array.shape
        xyz = numpy.array([0, 1])

        val = self.ci_gpdc.get_dependence_measure(array, xyz)

        pval_ana = self.ci_gpdc.get_analytic_significance(value=val,
                                                          T=T,
                                                          dim=dim)

        pval_shuffle = self.ci_gpdc.get_shuffle_significance(array, xyz, val)
        print(pval_ana)
        print(pval_shuffle)

        numpy.testing.assert_allclose(numpy.array(pval_ana),
                                      numpy.array(pval_shuffle),
                                      atol=0.05)

    def test_cmi_knn(self):

        ci_cmi_knn = CMIknn(use_mask=False,
                            mask_type=None,
                            significance='shuffle_test',
                            fixed_thres=None,
                            sig_samples=10000,
                            sig_blocklength=3,
                            knn=10,
                            confidence='bootstrap',
                            conf_lev=0.9,
                            conf_samples=10000,
                            conf_blocklength=1,
                            verbosity=0)

        # ci_cmi_knn._trafo2uniform(self, x)

        val_ana = 0.6
        T = 10000
        numpy.random.seed(42)
        array = numpy.random.randn(5, T)

        cov = numpy.array([[1., val_ana], [val_ana, 1.]])
        array[:2, :] = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                        cov=cov,
                                                        size=T).T

        # Generate some confounding
        if len(array) > 2:
            array[0] += 0.5 * array[2:].sum(axis=0)
            array[1] += 0.7 * array[2:].sum(axis=0)

        # print numpy.corrcoef(array)[0,1]
        # print val
        dim, T = array.shape
        xyz = numpy.array([0, 1, 2, 2, 2])

        val_est = ci_cmi_knn.get_dependence_measure(array, xyz)

        print(val_est)
        print(_par_corr_to_cmi(val_ana))

        numpy.testing.assert_allclose(numpy.array(_par_corr_to_cmi(val_ana)),
                                      numpy.array(val_est),
                                      atol=0.02)

    def test_trafo2uniform(self):

        T = 1000
        # numpy.random.seed(None)
        array = numpy.random.randn(2, T)

        bins = 10

        uniform = self.ci_gpdc._trafo2uniform(array)
        # print uniform

        # import matplotlib
        # from matplotlib import pylab
        for i in range(array.shape[0]):
            print(uniform[i].shape)
            hist, edges = numpy.histogram(uniform[i], bins=bins, density=True)
            # pylab.figure()
            # pylab.hist(uniform[i], color='grey', alpha=0.3)
            # pylab.hist(array[i], alpha=0.3)
            # pylab.show()
            print(hist / float(bins))  # , edges
            numpy.testing.assert_allclose(numpy.ones(bins) / float(bins),
                                          hist / float(bins),
                                          atol=0.01)

    def test_cmi_symb(self):

        ci_cmi_symb = CMIsymb(use_mask=False,
                              mask_type=None,
                              significance='shuffle_test',
                              fixed_thres=None,
                              sig_samples=10000,
                              sig_blocklength=3,
                              confidence='bootstrap',
                              conf_lev=0.9,
                              conf_samples=10000,
                              conf_blocklength=1,
                              verbosity=0)

        val_ana = 0.6
        T = 100000
        numpy.random.seed(None)
        array = numpy.random.randn(3, T)

        cov = numpy.array([[1., val_ana], [val_ana, 1.]])
        array[:2, :] = numpy.random.multivariate_normal(mean=numpy.zeros(2),
                                                        cov=cov,
                                                        size=T).T

        # Generate some confounding
        if len(array) > 2:
            array[0] += 0.5 * array[2:].sum(axis=0)
            array[1] += 0.7 * array[2:].sum(axis=0)

        # Transform to symbolic data
        array = pp.quantile_bin_array(array.T, bins=16).T

        dim, T = array.shape
        xyz = numpy.array([0, 1, 2, 2, 2])

        val_est = ci_cmi_symb.get_dependence_measure(array, xyz)

        print(val_est)
        print(_par_corr_to_cmi(val_ana))

        numpy.testing.assert_allclose(numpy.array(_par_corr_to_cmi(val_ana)),
                                      numpy.array(val_est),
                                      atol=0.02)
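
Because the unittest.TestCase base class is commented out above, unittest and
pytest will not discover these methods automatically. A minimal manual driver
(a sketch under that assumption, not part of the original file):

if __name__ == '__main__':
    tests = TestCondInd()
    tests.setUp()
    tests.test_construct_array()
    tests.test_par_corr()
    tests.test__parcorr_get_single_residuals()
    print('selected tests passed')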