Ejemplo n.º 1
0
    def __init__(self, X):
        """
        X : model
            Model which will be fitted to the data.

        """
        # prepare input
        self.X = X = asmodel(X)
        self.n_cases = len(X)
        if not isbalanced(X):
            raise NotImplementedError("Unbalanced models")
        self.X_ = X.full

        self.full_model = fm = X.df_error == 0
        if fm:
            self.E_MS = hopkins_ems(X)
        elif hasrandom(X):
            err = "Models containing random effects need to be fully " "specified."
            raise NotImplementedError(err)

        self.max_len = int(2 ** _max_array_size // X.df ** 2)

        if _lmf_lsq == 0:
            pass
        elif _lmf_lsq == 1:
            self.Xsinv = X.Xsinv
        else:
            raise ValueError("version")
Ejemplo n.º 2
0
    def __init__(self, Y, X, sub=None, v=False, lsq=0, title=None):
        """
        Y: dependent variable
        X: model
        v: print some intermediate results for inspection
        
        lsq: 0 = numpy lsq
             1 = my least sq (Fox)
        
        performs an anova/ancova based on residuals. Fixed effects only
        """
        self.title = title
        self.results = {}  # will store results
        # prepare input
        Y = asvar(Y)
        X = asmodel(X)  # .sorted()
        if sub is not None:
            Y = Y[sub]
            X = X[sub]

        assert Y.N == X.N
        assert X.df_error > 0

        # fit
        if lsq == 1:
            # estimate least squares approximation
            beta = _leastsq(Y.x, X.full)
            # estimate
            values = self.values = beta * X.full
            Y_est = values.sum(1)
            self._residuals = residuals = Y.x - Y_est
            SS_res = np.sum(residuals ** 2)
            if not Y.mu == Y_est.mean():
                logging.warning("Y.mu=%s != Y_est.mean()=%s" % (Y.mu, Y_est.mean()))
        else:
            # use numpy
            beta, SS_res, rank, s = np.linalg.lstsq(X.full, Y.x)
            if len(SS_res) == 1:
                SS_res = SS_res[0]
            else:
                SS_res = 0

        # SS total
        SS_total = self.SS_total = np.sum((Y.x - Y.mu) ** 2)
        df_total = self.df_total = X.df_total
        MS_total = self.MS_total = SS_total / df_total
        # SS residuals
        self.SS_res = SS_res
        df_res = self.df_res = X.df_error
        MS_res = self.MS_res = SS_res / df_res
        # SS explained
        #        SS_model = self.SS_model = np.sum((Y_est - Y.mu)**2)
        SS_model = self.SS_model = SS_total - SS_res
        df_model = self.df_model = X.df
        MS_model = self.MS_model = SS_model / df_model

        # store stuff
        self.Y = Y
        self.X = X
        self.beta = beta
Ejemplo n.º 3
0
def ancova(Y, factorial_model, covariate, interaction=None, sub=None, v=True, empty=True, ems=None):
    """
    OBSOLETE
    
    args
    ----
    Y: dependent variable
    factorial model:
    covariate:
    
    
    kwargs
    ------
    interaction: term from the factorial model to check for interaction with
                 the covariate
    v=True: display more information
    **anova_kwargs: ems, empty
    
    
    Based on
    --------
    Exercise to STATISTICS: AN INTRODUCTION USING R
    http://www.bio.ic.ac.uk/research/crawley/statistics/exercises/R6Ancova.pdf
    """
    assert isvar(covariate)
    anova_kwargs = {"empty": empty, "ems": ems}
    if sub != None:
        Y = Y[sub]
        factorial_model = factorial_model[sub]
        covariate = covariate[sub]
        if interaction != None:
            interaction = interaction[sub]
    # if interaction: assert type(interaction) in [factor]
    factorial_model = asmodel(factorial_model)
    a1 = lm(Y, factorial_model)
    if v:
        print a1.anova(title="MODEL 1", **anova_kwargs)
        print "\n"
    a2 = lm(Y, factorial_model + covariate)
    if v:
        print a2.anova(title="MODEL 2: Main Effect Covariate", **anova_kwargs)
        print "\n"
    print 'Model with "%s" Covariate > without Covariate' % covariate.name
    print comparelm(a1, a2)

    if interaction:
        logging.debug("%s / %s" % (covariate.name, interaction.name))
        logging.debug("%s" % (covariate.__div__))
        i_effect = covariate.__div__(interaction)
        #        i_effect = covariate / interaction
        a3 = lm(Y, factorial_model + i_effect)
        if v:
            print "\n"
            print a3.anova(title="MODEL 3: Interaction")
        # compare
        print '\n"%s"x"%s" Interaction > No Covariate:' % (covariate.name, interaction.name)
        print comparelm(a1, a3)
        print '\n"%s"x"%s" Interaction > Main Effect:' % (covariate.name, interaction.name)
        print comparelm(a2, a3)
Ejemplo n.º 4
0
 def Ysub(self, effects):
     "return Y after subtracting the SS explained by the given effects"
     # FIXME: Ysub
     effects = asmodel(effects)
     Yout = deepcopy(self.Y.x)
     for e in effects.effects:
         Yout -= self.values[:, self.indexes[e.name]].sum(1)
     return Yout
Ejemplo n.º 5
0
 def __init__(self, X, v=False, title=False):
     self.title = title
     self.results = {}  # will store results
     # prepare input
     X = asmodel(X)
     self.X = X
     # X inverse
     X_ = X.full
     self.Xinv = np.matrix(X_).I.A  # alternative still used for map
     self.Xsinv = np.dot(np.matrix(np.dot(X_.T, X_)).I.A, X_.T)
     # E MS
     self.E_ms = _hopkins_ems(X)
     self.df_res = X.df_error
Ejemplo n.º 6
0
    def __init__(self, Y='MEG', X='condition', sub=None, ds=None,
                 p=.05, contours={.01: '.5', .001: '0'}):
        self.name = "anova"
        Y = self.Y = asndvar(Y, sub=sub, ds=ds)
        X = self.X = asmodel(X, sub=sub, ds=ds)

        fitter = _glm.lm_fitter(X)

        properties = Y.properties.copy()
        properties['colorspace'] = _cs.get_sig(p=p, contours=contours)
        kwargs = dict(dims=Y.dims[1:], properties=properties)

        self.all = []
        self.p_maps = {}
        for e, _, Ps in fitter.map(Y.x):
            name = e.name
            P = ndvar(Ps, name=name, **kwargs)
            self.all.append(P)
            self.p_maps[e] = P
Ejemplo n.º 7
0
    def __init__(self, Y, X, sub=None, title=None, empty=True, ems=None, lsq=0, showall=False):
        """
        Returns an ANOVA table for the linear model. Mixed effects models require 
        full model specification so that E(MS) can be estimated according to 
        Hopkins (1976)
        
        Random effects: If the model is fully specified, a Hopkins E(MS) table 
        is used to determine error terms in the mixed effects model. Otherwise,
        random factors are treated as fixed factors.
        
        kwargs
        ------
        empty:  include rows without F-Tests (True/False)
        ems:  display source of E(MS) for F-Tests (True/False; None = use default)
        lsq:  least square fitter
                = 0 -> numpy.linalg.lstsq 
                = 1 -> after Fox
        showall: show SS, df and MS for effects without F test
        
        
        TODO
        ----
          - sort model
          - reuse lms which are used repeatedly
          - provide threshold for including interaction effects when testing lower 
            level effects
        
        
        Problem with unbalanced models
        ------------------------------
          - The SS of Effects which do not include the between-subject factor are 
            higher than in SPSS
          - The SS of effects which include the between-subject factor agree with 
            SPSS
        
        """
        # prepare kwargs
        Y = asvar(Y)
        X = asmodel(X)

        if sub is not None:
            Y = Y[sub]
            X = X[sub]

        assert Y.N == X.N

        # save args
        self.Y = Y
        self.X = X
        self.title = title
        self.show_ems = ems

        self._log = []

        # decide which E(MS) model to use
        if X.df_error == 0:
            rfx = 1
            fx_desc = "Mixed"
        elif X.df_error > 0:
            rfx = 0
            fx_desc = "Fixed"
        else:
            raise ValueError("Model Overdetermined")
        self._log.append("%s effects model" % fx_desc)

        if lsq == 1:
            self._log.append("(my lsq)")
        elif lsq == 0:
            self._log.append("\n (np lsq)")

        # create testing table:
        # list of (effect, lm, lm_comp, lm_EMS)
        test_table = []
        #
        # list of (name, SS, df, MS, F, p)
        results_table = []

        if len(X.effects) == 1:
            self._log.append("single factor model")
            lm0 = lm(Y, X, lsq=lsq)
            SS = lm0.SS_model
            df = lm0.df_model
            MS = lm0.MS_model
            F, p = lm0.F_test()
            results_table.append((X.name, SS, df, MS, F, p))
            results_table.append(("Residuals", lm0.SS_res, lm0.df_res, lm0.MS_res, None, None))
        else:
            if not rfx:
                full_lm = lm(Y, X, lsq=lsq)
                SS_e = full_lm.SS_res
                MS_e = full_lm.MS_res
                df_e = full_lm.df_res

            for e_test in X.effects:
                skip = False
                name = e_test.name

                # find model 0
                effects = []
                excluded_e = []
                for e in X.effects:
                    # determine whether e_test
                    if e is e_test:
                        pass
                    else:
                        if _is_higher_order(e, e_test):
                            excluded_e.append(e)
                        else:
                            effects.append(e)

                model0 = model(*effects)
                if e_test.df > model0.df_error:
                    skip = "overspecified"
                else:
                    lm0 = lm(Y, model0, lsq=lsq)

                    # find model 1
                    effects.append(e_test)
                    model1 = model(*effects)
                    if model1.df_error > 0:
                        lm1 = lm(Y, model1, lsq=lsq)
                    else:
                        lm1 = None

                    if rfx:
                        # find E(MS)
                        EMS_effects = []
                        for e in X.effects:
                            if e is e_test:
                                pass
                            elif all([(f in e_test or f.random) for f in e.factors]):
                                if all([(f in e or e.nestedin(f)) for f in e_test.factors]):
                                    EMS_effects.append(e)
                        if len(EMS_effects) > 0:
                            lm_EMS = lm(Y, model(*EMS_effects), lsq=lsq)
                            MS_e = lm_EMS.MS_model
                            df_e = lm_EMS.df_model
                        else:
                            if showall:
                                if lm1 is None:
                                    SS = lm0.SS_res
                                    df = lm0.df_res
                                else:
                                    SS = lm0.SS_res - lm1.SS_res
                                    df = lm0.df_res - lm1.df_res
                                MS = SS / df
                                results_table.append((name, SS, df, MS, None, None))
                            skip = "no Hopkins E(MS)"

                if skip:
                    self._log.append("SKIPPING: %s (%s)" % (e_test.name, skip))
                else:
                    test_table.append((e_test, lm1, lm0, MS_e, df_e))
                    SS, df, MS, F, p = incremental_F_test(lm1, lm0, MS_e=MS_e, df_e=df_e)
                    results_table.append((name, SS, df, MS, F, p))
            if not rfx:
                results_table.append(("Residuals", SS_e, df_e, MS_e, None, None))
        self._test_table = test_table
        self._results_table = results_table
Ejemplo n.º 8
0
    def __init__(self, Y, X, t=.1, samples=1000, replacement=False,
                 tstart=None, tstop=None, close_time=0,
                 pmax=1, sub=None, ds=None,
                 ):
        """

        Arguments
        ---------

        Y : ndvar
            Measurements (dependent variable)

        X : categorial
            Model

        t : scalar
            Threshold (uncorrected p-value) to use for finding clusters

        samples : int
            Number of samples to estimate parameter distributions

        replacement : bool
            whether random samples should be drawn with replacement or
            without

        tstart, tstop : None | scalar
            Time window for clusters.
            **None**: use the whole epoch;
            **scalar**: use only a part of the epoch

            .. Note:: implementation is not optimal: F-values are still
                computed but ignored.

        close_time : scalar
            Close gaps in clusters that are smaller than this interval. Assumes
            that Y is a uniform time series.

        sub : index
            Apply analysis to a subset of cases in Y, X

        pmax : scalar <= 1
            Maximum cluster p-values to keep cluster.


        .. FIXME:: connectivity for >2 dimensional data. Currently, adjacent
            samples are connected.

        """
        Y = self.Y = asndvar(Y, sub=sub, ds=ds)
        X = self.X = asmodel(X, sub=sub, ds=ds)
        lm = _glm.lm_fitter(X)

        # get F-thresholds from p-threshold
        tF = {}
        if lm.full_model:
            for e in lm.E_MS:
                effects_d = lm.E_MS[e]
                if effects_d:
                    df_d = sum(ed.df for ed in effects_d)
                    tF[e] = scipy.stats.distributions.f.isf(t, e.df, df_d)
        else:
            df_d = X.df_error
            tF = {e: scipy.stats.distributions.f.isf(t, e.df, df_d) for e in X.effects}

        # Estimate statistic distributions from permuted Ys
        kwargs = dict(tstart=tstart, tstop=tstop, close_time=close_time, unit='F')
        dists = {e: cluster_dist(Y, samples, tF[e], name=e.name, **kwargs) for e in tF}
        self.cluster_dists = dists
        for _, Yrs in _resample(Y, replacement=replacement, samples=samples):
            for e, F in lm.map(Yrs.x, p=False):
                dists[e].add_perm(F)

        # Find clusters in the actual data
        test0 = lm.map(Y.x, p=False)
        self.effects = []
        self.clusters = {}
        self.F_maps = {}
        for e, F in test0:
            self.effects.append(e)
            dist = dists[e]
            dist.add_original(F)
            self.clusters[e] = dist
            self.F_maps[e] = dist.P

        self.name = "ANOVA Permutation Cluster Test"
        self.tF = tF

        self.all = [[self.F_maps[e]] + self.clusters[e].clusters
                    for e in self.X.effects if e in self.F_maps]
Ejemplo n.º 9
0
    def __init__(self, Y, X, sub=None, title=None, empty=True, ems=None, showall=False, ds=None):
        """
        Fits a univariate ANOVA model.

        Mixed effects models require full model specification so that E(MS)
        can be estimated according to Hopkins (1976)


        Parameters
        ----------
        Y : var
            dependent variable
        X : model
            Model to fit to Y
        empty : bool
            include rows without F-Tests (True/False)
        ems : bool | None
            display source of E(MS) for F-Tests (True/False; None = use default)
        lsq : int
            least square fitter to use;
            0 -> scipy.linalg.lstsq
            1 -> after Fox
        showall : bool
            show SS, df and MS for effects without F test
        """
        #  TODO:
        #         - sort model
        #          - reuse lms which are used repeatedly
        #          - provide threshold for including interaction effects when testing lower
        #            level effects
        #
        #        Problem with unbalanced models
        #        ------------------------------
        #          - The SS of Effects which do not include the between-subject factor are
        #            higher than in SPSS
        #          - The SS of effects which include the between-subject factor agree with
        #            SPSS

        # prepare kwargs
        Y = asvar(Y, sub=sub, ds=ds)
        X = asmodel(X, sub=sub, ds=ds)

        if len(Y) != len(X):
            raise ValueError("Y and X must describe same number of cases")

        # save args
        self.Y = Y
        self.X = X
        self.title = title
        self.show_ems = ems
        self._log = []

        # decide which E(MS) model to use
        if X.df_error == 0:
            rfx = 1
            fx_desc = "Mixed"
        elif X.df_error > 0:
            if hasrandom(X):
                err = "Models containing random effects need to be fully " "specified."
                raise NotImplementedError(err)
            rfx = 0
            fx_desc = "Fixed"
        else:
            raise ValueError("Model Overdetermined")
        self._log.append("Using %s effects model" % fx_desc)

        # list of (name, SS, df, MS, F, p)
        self.F_tests = []
        self.names = []

        if len(X.effects) == 1:
            self._log.append("single factor model")
            lm1 = lm(Y, X)
            self.F_tests.append(lm1)
            self.names.append(X.name)
            self.residuals = lm1.SS_res, lm1.df_res, lm1.MS_res
        else:
            if rfx:
                pass  # <- Hopkins
            else:
                full_lm = lm(Y, X)
                SS_e = full_lm.SS_res
                MS_e = full_lm.MS_res
                df_e = full_lm.df_res

            for e_test in X.effects:
                skip = False
                name = e_test.name

                # find model 0
                effects = []
                excluded_e = []
                for e in X.effects:
                    # determine whether e_test
                    if e is e_test:
                        pass
                    else:
                        if is_higher_order(e, e_test):
                            excluded_e.append(e)
                        else:
                            effects.append(e)

                model0 = model(*effects)
                if e_test.df > model0.df_error:
                    skip = "overspecified"
                else:
                    lm0 = lm(Y, model0)

                    # find model 1
                    effects.append(e_test)
                    model1 = model(*effects)
                    if model1.df_error > 0:
                        lm1 = lm(Y, model1)
                    else:
                        lm1 = None

                    if rfx:
                        # find E(MS)
                        EMS_effects = _find_hopkins_ems(e_test, X)

                        if len(EMS_effects) > 0:
                            lm_EMS = lm(Y, model(*EMS_effects))
                            MS_e = lm_EMS.MS_model
                            df_e = lm_EMS.df_model
                        else:
                            if lm1 is None:
                                SS = lm0.SS_res
                                df = lm0.df_res
                            else:
                                SS = lm0.SS_res - lm1.SS_res
                                df = lm0.df_res - lm1.df_res
                            MS = SS / df
                            skip = "no Hopkins E(MS); SS=%.2f, df=%i, " "MS=%.2f" % (SS, df, MS)

                if skip:
                    self._log.append("SKIPPING: %s (%s)" % (e_test.name, skip))
                else:
                    res = incremental_F_test(lm1, lm0, MS_e=MS_e, df_e=df_e, name=name)
                    self.F_tests.append(res)
                    self.names.append(name)
            if not rfx:
                self.residuals = SS_e, df_e, MS_e
Ejemplo n.º 10
0
    def __init__(self, Y, X, sub=None):
        """
        Fit the model X to the dependent variable Y.

        Parameters
        ----------
        Y : var
            Dependent variable.
        X : model
            Model.
        sub : None | index
            Only use part of the data
        """
        # prepare input
        Y = asvar(Y)
        X = asmodel(X)  # .sorted()
        if sub is not None:
            Y = Y[sub]
            X = X[sub]

        assert len(Y) == len(X)
        assert X.df_error > 0

        # fit
        if _lm_lsq == 0:  # use scipy (faster)
            beta, SS_res, _, _ = lstsq(X.full, Y.x)
        elif _lm_lsq == 1:  # Fox
            # estimate least squares approximation
            beta = X.fit(Y)
            # estimate
            values = beta * X.full
            Y_est = values.sum(1)
            self._residuals = residuals = Y.x - Y_est
            SS_res = np.sum(residuals ** 2)
            if not Y.mean() == Y_est.mean():
                msg = "Y.mean() != Y_est.mean() (%s vs " "%s)" % (Y.mean(), Y_est.mean())
                logging.warning(msg)
        else:
            raise ValueError

        # SS total
        SS_total = self.SS_total = np.sum((Y.x - Y.mean()) ** 2)
        df_total = self.df_total = X.df_total
        self.MS_total = SS_total / df_total

        # SS residuals
        self.SS_res = SS_res
        df_res = self.df_res = X.df_error
        self.MS_res = SS_res / df_res

        # SS explained
        #        SS_model = self.SS_model = np.sum((Y_est - Y.mean())**2)
        SS_model = self.SS = self.SS_model = SS_total - SS_res
        df_model = self.df = self.df_model = X.df
        self.MS_model = self.MS = SS_model / df_model

        # store stuff
        self.Y = Y
        self.X = X
        self.sub = sub
        self.beta = beta
Ejemplo n.º 11
0
def data(Y, X=None, match=None, cov=[], sub=None, fmt=None, labels=True, 
          showcase=True):
    """
    return a textab.table (printed as tsv table by default)
    
    parameters
    ----------
    Y: variable to display (can be model with several dependents)

    X: categories defining cells (factorial model)

    match: factor to match values on and return repeated-measures table

    cov: covariate to report (WARNING: only works with match, where each value
         on the matching variable corresponds with one value in the covariate)

    sub: boolean array specifying which values to include (generate e.g. 
         with 'sub=T==[1,2]')

    fmt: Format string  
            
    labels: display labels for nominal variables (otherwise display codes)

    """
    if hasattr(Y, '_items'): # dataframe
        Y = Y._items
    Y = _data.asmodel(Y)
    if _data.isfactor(cov) or _data.isvar(cov):
        cov = [cov]
    
    data = []
    names_yname = [] # names including Yi.name for matched table headers
    ynames = [] # names of Yi for independent measures table headers
    within_list = []
    for Yi in Y.effects:
        _data, datalabels, names, _within = _data._split_Y(Yi, X, match=match, 
                                                     sub=sub, datalabels=match)
        data += _data
        names_yname += ['({c})'.format(c=n) for n in names]
        ynames.append(Yi.name)
        within_list.append(_within)
    within = within_list[0]
    assert all([w==within for w in within_list])
    
    # table
    n_dependents = len(Y.effects)
    n_cells = int(len(data) / n_dependents)
    if within:
        n, k = len(data[0]), len(data)
        table = textab.Table('l' * (k + showcase + len(cov)))
        
        # header line 1
        if showcase:
            table.cell(match.name)
            case_labels = datalabels[0]
            assert all([np.all(case_labels==l) for l in datalabels[1:]])
        for i in range(n_dependents):
            for name in names:        
                table.cell(name.replace(' ','_'))
        for c in cov:
            table.cell(c.name)
        
        # header line 2
        if n_dependents > 1:
            if showcase:
                table.cell()
            for name in ynames:
                [table.cell('(%s)'%name) for i in range(n_cells)]
            for c in cov:
                table.cell()
        
        # body
        table.midrule()
        for i in range(n):
            case = case_labels[i]
            if showcase:
                table.cell(case)
            for j in range(k):
                table.cell(data[j][i], fmt=fmt)
            # covariates
            indexes = match==case
            for c in cov:
                # test it's all the same values
                case_cov = c[indexes]
                if len(np.unique(case_cov.x)) != 1: 
                    msg = 'covariate for case "%s" has several values'%case
                    raise ValueError(msg)
                # get value
                first_i = np.nonzero(indexes)[0][0]
                cov_value = c[first_i]
                if _data.isfactor(c) and labels:
                    cov_value = c.cells[cov_value]
                table.cell(cov_value, fmt=fmt)
    else:
        table = textab.Table('l'*(1 + n_dependents))
        table.cell(X.name)
        [table.cell(y) for y in ynames]
        table.midrule()
        # data is now sorted: (cell_i within dependent_i)
        # sort data as (X-cell, dependent_i)
        data_sorted = []
        for i_cell in range(n_cells):
            data_sorted.append([data[i_dep*n_cells + i_cell] for i_dep in \
                               range(n_dependents)])
        # table
        for name, cell_data in zip(names, data_sorted):
            for i in range(len(cell_data[0])):
                table.cell(name)
                for dep_data in cell_data:
                    v = dep_data[i]
                    table.cell(v, fmt=fmt)
    return table