Example #1
0
    def __init__(self, Y, X, sub=None, v=False, lsq=0, title=None):
        """
        Fit the model X to the dependent variable Y (fixed effects only) and
        store the sums of squares, degrees of freedom and mean squares.

        Parameters
        ----------
        Y : var
            Dependent variable (converted with ``asvar``).
        X : model
            Model (converted with ``asmodel``).
        sub : None | index
            If not None, ``Y`` and ``X`` are indexed with ``sub`` before
            fitting.
        v : bool
            Not used by this constructor.
        lsq : int
            Least squares fitter: 1 -> ``_leastsq`` (after Fox); any other
            value -> ``numpy.linalg.lstsq``.
        title : None | str
            Stored as ``self.title``.

        Notes
        -----
        ``self.values`` and ``self._residuals`` are only set when ``lsq == 1``.
        """
        self.title = title
        self.results = {}  # will store results

        # prepare input
        Y = asvar(Y)
        X = asmodel(X)  # .sorted()
        if sub is not None:
            Y = Y[sub]
            X = X[sub]

        assert Y.N == X.N
        assert X.df_error > 0

        # fit
        if lsq == 1:
            # estimate least squares approximation
            beta = _leastsq(Y.x, X.full)
            # per-column contributions to the estimate
            values = self.values = beta * X.full
            Y_est = values.sum(1)
            self._residuals = residuals = Y.x - Y_est
            SS_res = np.sum(residuals ** 2)
            # compare with a tolerance: exact float equality would log a
            # warning for harmless rounding differences
            if not np.allclose(Y.mu, Y_est.mean()):
                logging.warning("Y.mu=%s != Y_est.mean()=%s",
                                Y.mu, Y_est.mean())
        else:
            # use numpy
            beta, SS_res, rank, s = np.linalg.lstsq(X.full, Y.x)
            if len(SS_res) == 1:
                SS_res = SS_res[0]
            else:
                # lstsq does not report the residual (e.g. for a
                # rank-deficient design); derive it from the fitted
                # coefficients instead of assuming a perfect fit
                SS_res = np.sum((Y.x - np.dot(X.full, beta)) ** 2)

        # SS total
        SS_total = self.SS_total = np.sum((Y.x - Y.mu) ** 2)
        df_total = self.df_total = X.df_total
        self.MS_total = SS_total / df_total
        # SS residuals
        self.SS_res = SS_res
        df_res = self.df_res = X.df_error
        self.MS_res = SS_res / df_res
        # SS explained
        SS_model = self.SS_model = SS_total - SS_res
        df_model = self.df_model = X.df
        self.MS_model = SS_model / df_model

        # store stuff
        self.Y = Y
        self.X = X
        self.beta = beta
Example #2
0
    def __init__(self, Y, X, norm=None, sub=None, ds=None,
                 contours={.05: (.8, .2, .0), .01: (1., .6, .0), .001: (1., 1., .0)},
                 tp=.1, samples=1000, replacement=False,
                 tstart=None, tstop=None, close_time=0, pmax=1):
        """
        Correlate Y with X and build a cluster distribution from resampled
        versions of Y.

        Parameters
        ----------
        Y : ndvar
            Dependent variable.
        X : continuous
            The continuous predictor variable.
        norm : None | categorial
            If not None, Y is z-scored separately within each cell of
            ``norm`` before resampling.
        sub, ds
            Passed on to ``asndvar`` and ``asvar``.
        contours : dict
            Not used by this constructor.
        tp : scalar
            Tail probability of the t distribution from which the r
            threshold handed to ``cluster_dist`` is derived.
        samples : int
            Number of samples; passed to ``cluster_dist`` (as ``N``) and to
            ``_resample``.
        replacement : bool
            Passed to ``_resample``.
        tstart, tstop, close_time, pmax
            Passed to ``cluster_dist``.
        """
        Y = asndvar(Y, sub=sub, ds=ds)
        X = asvar(X, sub=sub, ds=ds)

        name = "%s corr %s" % (Y.name, X.name)
        self.name = name

        # convert the t threshold for tail probability tp into an r threshold
        # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#Inference
        n_cases = len(X)
        self.n = n_cases
        dof = n_cases - 2
        t_threshold = scipy.stats.distributions.t.isf(tp, dof)
        r_threshold = t_threshold / np.sqrt(dof + t_threshold ** 2)

        colorspace = _cs.Colorspace(cmap=_cs.cm_xpolar, vmax=1, vmin=-1)
        # the distribution is set up from Y before Y is copied and flattened
        cdist = cluster_dist(Y, N=samples,
                             t_upper=r_threshold, t_lower=-r_threshold,
                             tstart=tstart, tstop=tstop,
                             close_time=close_time, unit='r', pmax=pmax,
                             name=name, cs=colorspace)

        # normalization is done before the permutation b/c we are interested
        # in the variance associated with each subject for the z-scoring.
        Y_flat = Y.copy()
        Y_flat.x = Y_flat.x.reshape((n_cases, -1))
        if norm is not None:
            for cell in norm.cells:
                in_cell = (norm == cell)
                Y_flat.x[in_cell] = scipy.stats.mstats.zscore(Y_flat.x[in_cell])

        # centered predictor, stored for self._corr
        x = X.x.reshape((n_cases, -1))
        x_mean = np.mean(x)
        if np.isnan(x_mean):
            raise ValueError("np.mean(x) is nan")
        self.x = x - x_mean

        # resampled data first, then the original data
        for _, Y_resampled in _resample(Y_flat, replacement=replacement,
                                        samples=samples):
            cdist.add_perm(self._corr(Y_resampled))

        cdist.add_original(self._corr(Y_flat))

        r_map = cdist.P
        self.r_map = r_map
        self.all = [[r_map] + cdist.clusters]
        self.clusters = cdist
Example #3
0
    def __init__(self, Y, X, match=None, sub=None,
                 samples=1000, replacement=True,
                 title="Bootstrapped Pairwise Tests"):
        """
        Pairwise t-tests between the cells of X on matched data, with a
        resampling distribution of the maximum absolute t value.

        Parameters
        ----------
        Y : var
            Dependent variable (converted with ``asvar``).
        X : factor
            Factor whose cells are compared pairwise (converted with
            ``asfactor``).
        match : factor
            Factor identifying the matched units. Required: without it the
            test is not implemented.
        sub : None | index
            Passed to ``asvar`` / ``asfactor``; also used to index ``match``.
        samples : int
            Number of resamples drawn with ``_resample``.
        replacement : bool
            Passed to ``_resample``.
        title : str
            Stored as ``self.title``.

        Raises
        ------
        NotImplementedError
            If ``match`` is not provided (raised before any resampling).
        ValueError
            If X has fewer than two cells, so that there is nothing to
            compare.
        """
        Y = asvar(Y, sub)
        X = asfactor(X, sub)
        assert len(Y) == len(X), "dataset length mismatch"

        if match:
            if sub is not None:
                match = match[sub]
            assert len(match) == len(Y), "dataset length mismatch"

        # fail fast: only the matched design is implemented, so do not spend
        # time on resampling before rejecting the call
        if not match:
            raise NotImplementedError("pairwise tests require `match`")

        cells = X.cells
        n_groups = len(cells)
        if n_groups < 2:
            raise ValueError("X needs at least two cells for pairwise "
                             "comparisons (got %i)" % n_groups)

        # prepare data container
        # row 0 holds the original data, rows 1.. hold the resampled data
        resampled = np.empty((samples + 1, len(Y)))
        resampled[0] = Y.x
        for i, Y_resampled in _resample(Y, unit=match, samples=samples,
                                        replacement=replacement):
            resampled[i + 1] = Y_resampled.x
        self.resampled = resampled

        # if there are several values per X%match cell, take the average
        # T: indexes to transform Y.x to [X%match, value]-array
        match_cell_ids = match.cells
        group_size = len(match_cell_ids)
        T = None
        cell_pairs = itertools.product(cells, match_cell_ids)
        for i, (X_cell, match_cell) in enumerate(cell_pairs):
            source_indexes = np.where((X == X_cell) * (match == match_cell))[0]
            if T is None:
                # allocated on the first cell; every other cell must have
                # the same number of source values
                n_cells = n_groups * group_size
                T = np.empty((n_cells, len(source_indexes)), dtype=int)
            T[i, :] = source_indexes

        if T.shape[1] == 1:
            T = T[:, 0]
            ordered = resampled[:, T]
        else:
            ordered = resampled[:, T].mean(axis=2)
        self.ordered = ordered

        # t-tests
        n_comparisons = sum(range(n_groups))
        t = np.empty((samples + 1, n_comparisons))
        comp_names = []
        one_group = np.arange(group_size)
        groups = [one_group + i * group_size for i in range(n_groups)]
        for i, (g1, g2) in enumerate(itertools.combinations(range(n_groups), 2)):
            diffs = ordered[:, groups[g1]] - ordered[:, groups[g2]]
            # t statistic of the differences, one value per row of `ordered`
            t[:, i] = (np.mean(diffs, axis=1) * np.sqrt(group_size)
                       / np.std(diffs, axis=1, ddof=1))
            comp_names.append(' - '.join((cells[g1], cells[g2])))

        # NOTE: holds the differences of the last comparison only
        self.diffs = diffs
        # largest absolute t across comparisons for each resample
        self.t_resampled = np.max(np.abs(t[1:]), axis=1)
        # t values of the original data
        self.t = t = t[0]

        self._Y = Y
        self._X = X
        self._group_names = cells
        self._group_data = np.array([ordered[0, g] for g in groups])
        self._group_size = group_size
        self._df = group_size - 1
        self._match = match
        self._n_samples = samples
        self._replacement = replacement
        self._comp_names = comp_names
        self._p_parametric = self.test_param(t)
        self._p_boot = self.test_boot(t)
        self.title = title
Example #4
0
    def __init__(self, Y, X, sub=None, title=None, empty=True, ems=None, lsq=0, showall=False):
        """
        Build an ANOVA table for the linear model X fitted to Y.

        Mixed effects models require full model specification so that E(MS)
        can be estimated according to Hopkins (1976). If the model is fully
        specified (no error degrees of freedom), a Hopkins E(MS) table is
        used to determine the error terms; otherwise random factors are
        treated as fixed factors.

        kwargs
        ------
        sub:    index applied to Y and X before fitting (None = use all)
        title:  stored as self.title
        empty:  include rows without F-Tests (True/False); not used by this
                constructor
        ems:    display source of E(MS) for F-Tests (True/False; None = use
                default); stored as self.show_ems
        lsq:    least square fitter passed on to lm
                = 0 -> numpy.linalg.lstsq
                = 1 -> after Fox
        showall: show SS, df and MS for effects without F test


        TODO
        ----
          - sort model
          - reuse lms which are used repeatedly
          - provide threshold for including interaction effects when testing
            lower level effects


        Problem with unbalanced models
        ------------------------------
          - The SS of Effects which do not include the between-subject factor
            are higher than in SPSS
          - The SS of effects which include the between-subject factor agree
            with SPSS

        """
        # prepare kwargs
        Y = asvar(Y)
        X = asmodel(X)
        if sub is not None:
            Y = Y[sub]
            X = X[sub]

        assert Y.N == X.N

        # save args
        self.Y = Y
        self.X = X
        self.title = title
        self.show_ems = ems
        self._log = []

        # decide which E(MS) model to use: no error df -> mixed effects
        df_error = X.df_error
        if not df_error >= 0:
            raise ValueError("Model Overdetermined")
        mixed = (df_error == 0)
        self._log.append("%s effects model" % ("Mixed" if mixed else "Fixed"))

        if lsq == 0:
            self._log.append("\n (np lsq)")
        elif lsq == 1:
            self._log.append("(my lsq)")

        # test_table rows: (effect, lm1, lm0, MS_e, df_e)
        # results_table rows: (name, SS, df, MS, F, p)
        test_table = []
        results_table = []

        if len(X.effects) == 1:
            self._log.append("single factor model")
            lm0 = lm(Y, X, lsq=lsq)
            model_stats = (lm0.SS_model, lm0.df_model, lm0.MS_model)
            F, p = lm0.F_test()
            results_table.append((X.name,) + model_stats + (F, p))
            results_table.append(("Residuals", lm0.SS_res, lm0.df_res,
                                  lm0.MS_res, None, None))
        else:
            if not mixed:
                # fixed effects: the residuals of the full model are the
                # error term for every effect
                full_lm = lm(Y, X, lsq=lsq)
                SS_e, MS_e, df_e = full_lm.SS_res, full_lm.MS_res, full_lm.df_res

            for e_test in X.effects:
                name = e_test.name
                skip = False

                # model 0: all other effects which are not of higher order
                # than the tested effect
                effects = [e for e in X.effects
                           if e is not e_test and not _is_higher_order(e, e_test)]
                model0 = model(*effects)

                if e_test.df > model0.df_error:
                    skip = "overspecified"
                else:
                    lm0 = lm(Y, model0, lsq=lsq)

                    # model 1: model 0 plus the tested effect
                    effects.append(e_test)
                    model1 = model(*effects)
                    lm1 = lm(Y, model1, lsq=lsq) if model1.df_error > 0 else None

                    if mixed:
                        # find E(MS)
                        EMS_effects = []
                        for e in X.effects:
                            if e is e_test:
                                continue
                            covered = all([(f in e_test or f.random)
                                           for f in e.factors])
                            if covered and all([(f in e or e.nestedin(f))
                                                for f in e_test.factors]):
                                EMS_effects.append(e)

                        if EMS_effects:
                            lm_EMS = lm(Y, model(*EMS_effects), lsq=lsq)
                            MS_e = lm_EMS.MS_model
                            df_e = lm_EMS.df_model
                        else:
                            if showall:
                                # report the effect without an F test
                                if lm1 is None:
                                    SS, df = lm0.SS_res, lm0.df_res
                                else:
                                    SS = lm0.SS_res - lm1.SS_res
                                    df = lm0.df_res - lm1.df_res
                                results_table.append((name, SS, df, SS / df,
                                                      None, None))
                            skip = "no Hopkins E(MS)"

                if skip:
                    self._log.append("SKIPPING: %s (%s)" % (e_test.name, skip))
                    continue

                test_table.append((e_test, lm1, lm0, MS_e, df_e))
                SS, df, MS, F, p = incremental_F_test(lm1, lm0, MS_e=MS_e, df_e=df_e)
                results_table.append((name, SS, df, MS, F, p))

            if not mixed:
                results_table.append(("Residuals", SS_e, df_e, MS_e, None, None))

        self._test_table = test_table
        self._results_table = results_table
Example #5
0
    def __init__(self, Y, X, norm=None, sub=None, ds=None,
                 contours={.05: (.8, .2, .0), .01: (1., .6, .0), .001: (1., 1., .0)}):
        """
        Pearson correlation between the predictor X and each element of Y
        across cases.

        Parameters
        ----------
        Y : ndvar
            Dependent variable; needs a case dimension.
        X : continuous
            The continuous predictor variable.
        norm : None | categorial
            If not None, Y is z-scored separately within each cell of
            ``norm`` before the correlation is computed.
        sub, ds
            Passed on to ``asndvar`` and ``asvar``.
        contours : dict {p: color}
            For every p-value the corresponding r threshold is computed and
            registered (for +r and -r) in the colorspace of the result. The
            dict is only read, never modified.

        Raises
        ------
        ValueError
            If Y has no case dimension, or if the mean of X is nan.

        Notes
        -----
        The input data of Y and X are not modified. Covariance and standard
        deviations use the same (n - 1) normalization, so that r is a proper
        correlation coefficient.
        """
        Y = asndvar(Y, sub=sub, ds=ds)
        X = asvar(X, sub=sub, ds=ds)

        if not Y.has_case:
            msg = ("Dependent variable needs case dimension")
            raise ValueError(msg)

        y = Y.x.reshape((len(Y), -1))
        if norm is not None:
            y = y.copy()
            for cell in norm.cells:
                idx = (norm == cell)
                y[idx] = scipy.stats.mstats.zscore(y[idx])

        n = len(X)
        x = X.x.reshape((n, -1))

        # covariance
        m_x = np.mean(x)
        if np.isnan(m_x):
            raise ValueError("np.mean(x) is nan")
        # x and y can be views of the input arrays: center out-of-place so
        # that the caller's data stay untouched (this also works for
        # integer data)
        x = x - m_x
        y = y - np.mean(y, axis=0)
        cov = np.sum(x * y, axis=0) / (n - 1)

        # correlation; ddof=1 matches the (n - 1) used for the covariance
        r = cov / (np.std(x, axis=0, ddof=1) * np.std(y, axis=0, ddof=1))

        # p-value calculation
        # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#Inference
        pcont = {}
        r_ps = {}
        df = n - 2
        for p, color in contours.items():
            t = scipy.stats.distributions.t.isf(p, df)
            r_p = t / np.sqrt(df + t ** 2)
            pcont[r_p] = color
            r_ps[r_p] = p
            pcont[-r_p] = color
            r_ps[-r_p] = p

        # wrap r into an ndvar without the case dimension
        dims = Y.dims[1:]
        shape = Y.x.shape[1:]
        properties = Y.properties.copy()
        cs = _cs.Colorspace(cmap=_cs.cm_xpolar, vmax=1, vmin=-1, contours=pcont, ps=r_ps)
        properties['colorspace'] = cs
        r = ndvar(r.reshape(shape), dims=dims, properties=properties)

        # store results
        self.name = "%s corr %s" % (Y.name, X.name)
        self.r = r
        self.all = r
Example #6
0
    def __init__(self, Y, X, sub=None, title=None, empty=True, ems=None, showall=False, ds=None):
        """
        Fit a univariate ANOVA model.

        Mixed effects models require full model specification so that E(MS)
        can be estimated according to Hopkins (1976).

        Parameters
        ----------
        Y : var
            Dependent variable.
        X : model
            Model to fit to Y.
        sub : None | index
            Passed on to ``asvar`` and ``asmodel``.
        title : None | str
            Stored as ``self.title``.
        empty : bool
            Include rows without F-Tests; not used by this constructor.
        ems : bool | None
            Display source of E(MS) for F-Tests (None = use default); stored
            as ``self.show_ems``.
        showall : bool
            Show SS, df and MS for effects without F test; not used by this
            constructor.
        ds : None | dataset
            Passed on to ``asvar`` and ``asmodel``.

        Raises
        ------
        ValueError
            If Y and X differ in length, or if the error df of X is negative.
        NotImplementedError
            If X has error df > 0 and contains random effects.
        """
        #  TODO:
        #    - sort model
        #    - reuse lms which are used repeatedly
        #    - provide threshold for including interaction effects when
        #      testing lower level effects
        #
        #  Problem with unbalanced models
        #  ------------------------------
        #    - The SS of Effects which do not include the between-subject
        #      factor are higher than in SPSS
        #    - The SS of effects which include the between-subject factor
        #      agree with SPSS

        # prepare kwargs
        Y = asvar(Y, sub=sub, ds=ds)
        X = asmodel(X, sub=sub, ds=ds)
        if len(Y) != len(X):
            raise ValueError("Y and X must describe same number of cases")

        # save args
        self.Y = Y
        self.X = X
        self.title = title
        self.show_ems = ems
        self._log = []

        # decide which E(MS) model to use: no error df -> mixed effects
        df_error = X.df_error
        if not df_error >= 0:
            raise ValueError("Model Overdetermined")
        mixed = (df_error == 0)
        if not mixed and hasrandom(X):
            err = "Models containing random effects need to be fully specified."
            raise NotImplementedError(err)
        self._log.append("Using %s effects model" % ("Mixed" if mixed else "Fixed"))

        # parallel lists: one test result and one name per tested effect
        self.F_tests = []
        self.names = []

        if len(X.effects) == 1:
            self._log.append("single factor model")
            single_lm = lm(Y, X)
            self.F_tests.append(single_lm)
            self.names.append(X.name)
            self.residuals = single_lm.SS_res, single_lm.df_res, single_lm.MS_res
            return

        if not mixed:
            # fixed effects: the residuals of the full model are the error
            # term for every effect (mixed models use Hopkins E(MS) instead)
            full_lm = lm(Y, X)
            SS_e, MS_e, df_e = full_lm.SS_res, full_lm.MS_res, full_lm.df_res

        for e_test in X.effects:
            name = e_test.name
            skip = False

            # model 0: all other effects which are not of higher order than
            # the tested effect
            effects = [e for e in X.effects
                       if e is not e_test and not is_higher_order(e, e_test)]
            model0 = model(*effects)

            if e_test.df > model0.df_error:
                skip = "overspecified"
            else:
                lm0 = lm(Y, model0)

                # model 1: model 0 plus the tested effect
                effects.append(e_test)
                model1 = model(*effects)
                lm1 = lm(Y, model1) if model1.df_error > 0 else None

                if mixed:
                    # find E(MS)
                    EMS_effects = _find_hopkins_ems(e_test, X)
                    if len(EMS_effects) > 0:
                        lm_EMS = lm(Y, model(*EMS_effects))
                        MS_e = lm_EMS.MS_model
                        df_e = lm_EMS.df_model
                    else:
                        # no error term available: log the effect's SS, df
                        # and MS instead of testing it
                        if lm1 is None:
                            SS, df = lm0.SS_res, lm0.df_res
                        else:
                            SS = lm0.SS_res - lm1.SS_res
                            df = lm0.df_res - lm1.df_res
                        MS = SS / df
                        skip = ("no Hopkins E(MS); SS=%.2f, df=%i, MS=%.2f"
                                % (SS, df, MS))

            if skip:
                self._log.append("SKIPPING: %s (%s)" % (e_test.name, skip))
                continue

            res = incremental_F_test(lm1, lm0, MS_e=MS_e, df_e=df_e, name=name)
            self.F_tests.append(res)
            self.names.append(name)

        if not mixed:
            self.residuals = SS_e, df_e, MS_e
Example #7
0
    def __init__(self, Y, X, sub=None):
        """
        Fit the model X to the dependent variable Y.

        Parameters
        ----------
        Y : var
            Dependent variable.
        X : model
            Model.
        sub : None | index
            Only use part of the data

        Raises
        ------
        ValueError
            If the module-level ``_lm_lsq`` setting is neither 0 nor 1.

        Notes
        -----
        The fitter is selected by the module-level ``_lm_lsq`` setting
        (0 -> ``lstsq``, 1 -> ``X.fit``, after Fox). ``self._residuals`` is
        only set when ``_lm_lsq == 1``.
        """
        # prepare input
        Y = asvar(Y)
        X = asmodel(X)  # .sorted()
        if sub is not None:
            Y = Y[sub]
            X = X[sub]

        assert len(Y) == len(X)
        assert X.df_error > 0

        # fit
        if _lm_lsq == 0:  # use scipy (faster)
            beta, SS_res, _, _ = lstsq(X.full, Y.x)
            if np.size(SS_res) == 0:
                # lstsq returns an empty residual for rank-deficient
                # designs; derive it from the fitted coefficients so that
                # the SS/MS values below stay meaningful
                SS_res = np.sum((Y.x - np.dot(X.full, beta)) ** 2)
        elif _lm_lsq == 1:  # Fox
            # estimate least squares approximation
            beta = X.fit(Y)
            # estimate
            values = beta * X.full
            Y_est = values.sum(1)
            self._residuals = residuals = Y.x - Y_est
            SS_res = np.sum(residuals ** 2)
            # compare with a tolerance: exact float equality would log a
            # warning for harmless rounding differences
            if not np.allclose(Y.mean(), Y_est.mean()):
                logging.warning("Y.mean() != Y_est.mean() (%s vs %s)",
                                Y.mean(), Y_est.mean())
        else:
            raise ValueError("_lm_lsq must be 0 or 1, got %r" % (_lm_lsq,))

        # SS total
        SS_total = self.SS_total = np.sum((Y.x - Y.mean()) ** 2)
        df_total = self.df_total = X.df_total
        self.MS_total = SS_total / df_total

        # SS residuals
        self.SS_res = SS_res
        df_res = self.df_res = X.df_error
        self.MS_res = SS_res / df_res

        # SS explained
        SS_model = self.SS = self.SS_model = SS_total - SS_res
        df_model = self.df = self.df_model = X.df
        self.MS_model = self.MS = SS_model / df_model

        # store stuff
        self.Y = Y
        self.X = X
        self.sub = sub
        self.beta = beta