def __init__(self, Y, X, sub=None, v=False, lsq=0, title=None):
    """
    Fit the model X to the dependent variable Y (fixed effects only) and
    store the sums of squares needed for an anova/ancova based on residuals.

    Y : dependent variable
    X : model
    sub : None | index
        If not None, both Y and X are indexed with ``sub`` before fitting.
    v : print some intermediate results for inspection (not referenced in
        this constructor)
    lsq : least squares fitter
        1 = my least sq (Fox), using ``_leastsq``;
        any other value = ``np.linalg.lstsq``
    title : stored as ``self.title``

    Sets ``SS_total``, ``df_total``, ``MS_total``, ``SS_res``, ``df_res``,
    ``MS_res``, ``SS_model``, ``df_model``, ``MS_model``, ``Y``, ``X`` and
    ``beta``. ``values`` and ``_residuals`` are only set when ``lsq == 1``.
    """
    self.title = title
    self.results = {}  # will store results

    # prepare input
    Y = asvar(Y)
    X = asmodel(X)  # .sorted()
    if sub is not None:
        Y = Y[sub]
        X = X[sub]

    assert Y.N == X.N
    assert X.df_error > 0

    # fit
    if lsq == 1:
        # estimate least squares approximation
        beta = _leastsq(Y.x, X.full)
        # estimate
        values = self.values = beta * X.full
        Y_est = values.sum(1)
        self._residuals = residuals = Y.x - Y_est
        SS_res = np.sum(residuals ** 2)
        # sanity check; compare with a tolerance because exact float
        # equality would produce warnings from rounding error alone
        if not np.allclose(Y.mu, Y_est.mean()):
            logging.warning("Y.mu=%s != Y_est.mean()=%s",
                            Y.mu, Y_est.mean())
    else:
        # use numpy
        beta, SS_res, _, _ = np.linalg.lstsq(X.full, Y.x)
        if len(SS_res) == 1:
            SS_res = SS_res[0]
        else:
            # numpy does not return a single residual sum here (it returns
            # an empty array e.g. for rank-deficient designs); an SS of 0
            # would pretend a perfect fit, so compute it from the solution
            SS_res = np.sum((Y.x - np.dot(X.full, beta)) ** 2)

    # SS total
    SS_total = self.SS_total = np.sum((Y.x - Y.mu) ** 2)
    df_total = self.df_total = X.df_total
    self.MS_total = SS_total / df_total

    # SS residuals
    self.SS_res = SS_res
    df_res = self.df_res = X.df_error
    self.MS_res = SS_res / df_res

    # SS explained, obtained as total minus residual SS
    SS_model = self.SS_model = SS_total - SS_res
    df_model = self.df_model = X.df
    self.MS_model = SS_model / df_model

    # store stuff
    self.Y = Y
    self.X = X
    self.beta = beta
def __init__(self, Y, X, norm=None, sub=None, ds=None,
             contours={.05: (.8, .2, .0), .01: (1., .6, .0),
                       .001: (1., 1., .0)},
             tp=.1, samples=1000, replacement=False,
             tstart=None, tstop=None, close_time=0, pmax=1):
    """
    Cluster test for the correlation between Y and X, using a
    distribution built from resampled versions of Y.

    Y : ndvar
        Dependent variable.
    X : continuous
        The continuous predictor variable.
    norm : None | categorial
        Categories within which Y is normalized (z-scored).
    sub, ds :
        Passed on to ``asndvar`` / ``asvar``.
    contours :
        Not referenced in this constructor.
    tp : scalar
        Tail probability; converted to the r threshold (+/-) that is
        handed to ``cluster_dist``.
    samples, replacement :
        Passed on to ``_resample`` (``samples`` also to ``cluster_dist``).
    tstart, tstop, close_time, pmax :
        Passed on to ``cluster_dist``.
    """
    Y = asndvar(Y, sub=sub, ds=ds)
    X = asvar(X, sub=sub, ds=ds)
    name = "%s corr %s" % (Y.name, X.name)
    self.name = name

    # r threshold derived from the t distribution
    # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#Inference
    n_cases = len(X)
    self.n = n_cases
    dof = n_cases - 2
    t_crit = scipy.stats.distributions.t.isf(tp, dof)
    r_crit = t_crit / np.sqrt(dof + t_crit ** 2)

    colorspace = _cs.Colorspace(cmap=_cs.cm_xpolar, vmax=1, vmin=-1)
    cdist = cluster_dist(Y, N=samples, t_upper=r_crit, t_lower=-r_crit,
                         tstart=tstart, tstop=tstop,
                         close_time=close_time, unit='r', pmax=pmax,
                         name=name, cs=colorspace)

    # normalization is done before the permutation b/c we are interested
    # in the variance associated with each subject for the z-scoring.
    Y = Y.copy()
    Y.x = Y.x.reshape((n_cases, -1))
    if norm is not None:
        for cell in norm.cells:
            in_cell = (norm == cell)
            Y.x[in_cell] = scipy.stats.mstats.zscore(Y.x[in_cell])

    # centered predictor, kept on the instance for self._corr
    predictor = X.x.reshape((n_cases, -1))
    predictor_mean = np.mean(predictor)
    if np.isnan(predictor_mean):
        raise ValueError("np.mean(x) is nan")
    self.x = predictor - predictor_mean

    # resampled maps first, then the map for the original data
    for _, Y_resampled in _resample(Y, replacement=replacement,
                                    samples=samples):
        cdist.add_perm(self._corr(Y_resampled))
    cdist.add_original(self._corr(Y))

    self.r_map = cdist.P
    self.all = [[self.r_map] + cdist.clusters]
    self.clusters = cdist
def __init__(self, Y, X, match=None, sub=None, samples=1000,
             replacement=True, title="Bootstrapped Pairwise Tests"):
    """
    Pairwise related-measures t-tests between the cells of X, evaluated
    parametrically and against a distribution of resampled data.

    Y : dependent variable (converted with ``asvar``)
    X : factor whose cells are compared (converted with ``asfactor``)
    match : factor
        Unit for matching observations across the cells of X. Only the
        matched design is implemented; ``match=None`` raises
        NotImplementedError.
    sub : None | index
        Applied to Y, X and match.
    samples : int
        Number of resampled data sets.
    replacement : bool
        Passed on to ``_resample``.
    title : stored as ``self.title``

    Raises
    ------
    NotImplementedError
        If ``match`` is None.
    ValueError
        If X has fewer than two cells (no pair to compare).
    """
    Y = asvar(Y, sub)
    X = asfactor(X, sub)
    assert len(Y) == len(X), "dataset length mismatch"

    # Test identity rather than truthiness of the factor object, and fail
    # before the (potentially expensive) resampling is done.
    if match is None:
        raise NotImplementedError
    if sub is not None:
        match = match[sub]
    assert len(match) == len(Y), "dataset length mismatch"

    cells = X.cells
    n_groups = len(cells)
    if n_groups < 2:
        raise ValueError("X needs at least two cells for pairwise tests")

    # prepare data container; row 0 holds the original data, the
    # remaining rows the resampled data
    resampled = np.empty((samples + 1, len(Y)))
    resampled[0] = Y.x
    for i, Y_resampled in _resample(Y, unit=match, samples=samples,
                                    replacement=replacement):
        resampled[i + 1] = Y_resampled.x
    self.resampled = resampled

    # if there are several values per X%match cell, take the average
    # T: indexes to transform Y.x to [X%match, value]-array
    match_cell_ids = match.cells
    group_size = len(match_cell_ids)
    n_cells = n_groups * group_size
    T = None
    row = 0
    for X_cell in cells:
        for match_cell in match_cell_ids:
            source_indexes = np.where((X == X_cell) *
                                      (match == match_cell))[0]
            if T is None:
                # the width is taken from the first cell; every other
                # cell has to contain the same number of values
                T = np.empty((n_cells, len(source_indexes)), dtype=int)
            T[row, :] = source_indexes
            row += 1

    if T.shape[1] == 1:
        T = T[:, 0]
        ordered = resampled[:, T]
    else:
        ordered = resampled[:, T].mean(axis=2)
    self.ordered = ordered

    # t-tests: one column per pair of X cells, one row per data set
    n_comparisons = sum(range(n_groups))
    t = np.empty((samples + 1, n_comparisons))
    comp_names = []
    one_group = np.arange(group_size)
    groups = [one_group + i * group_size for i in range(n_groups)]
    pairs = itertools.combinations(range(n_groups), 2)
    for i, (g1, g2) in enumerate(pairs):
        diffs = ordered[:, groups[g1]] - ordered[:, groups[g2]]
        t[:, i] = (np.mean(diffs, axis=1) * np.sqrt(group_size) /
                   np.std(diffs, axis=1, ddof=1))
        comp_names.append(' - '.join((cells[g1], cells[g2])))

    self.diffs = diffs  # differences of the last comparison only
    # largest absolute t across comparisons for each resampled data set
    self.t_resampled = np.max(np.abs(t[1:]), axis=1)
    self.t = t = t[0]

    self._Y = Y
    self._X = X
    self._group_names = cells
    self._group_data = np.array([ordered[0, g] for g in groups])
    self._group_size = group_size
    self._df = group_size - 1
    self._match = match
    self._n_samples = samples
    self._replacement = replacement
    self._comp_names = comp_names
    # these run last because they are methods of the instance and may
    # read the attributes assigned above
    self._p_parametric = self.test_param(t)
    self._p_boot = self.test_boot(t)
    self.title = title
def __init__(self, Y, X, sub=None, title=None, empty=True, ems=None, lsq=0,
             showall=False):
    """
    Compute the ANOVA table for the linear model X fitted to Y.

    Mixed effects models require full model specification so that E(MS)
    can be estimated according to Hopkins (1976).

    Random effects: if the model is fully specified, a Hopkins E(MS) table
    is used to determine error terms in the mixed effects model.
    Otherwise, random factors are treated as fixed factors.

    kwargs
    ------
    sub: index applied to both Y and X (None = use all cases)
    title: stored as ``self.title``
    empty: include rows without F-Tests (True/False); not referenced in
        this constructor
    ems: display source of E(MS) for F-Tests (True/False; None = use
        default); stored as ``self.show_ems``
    lsq: least square fitter, passed on to ``lm``
        = 0 -> numpy.linalg.lstsq
        = 1 -> after Fox
    showall: show SS, df and MS for effects without F test

    TODO
    ----
    - sort model
    - reuse lms which are used repeatedly
    - provide threshold for including interaction effects when testing
      lower level effects

    Problem with unbalanced models
    ------------------------------
    - The SS of Effects which do not include the between-subject factor
      are higher than in SPSS
    - The SS of effects which include the between-subject factor agree
      with SPSS
    """
    # prepare kwargs
    Y = asvar(Y)
    X = asmodel(X)
    if sub is not None:
        Y = Y[sub]
        X = X[sub]
    assert Y.N == X.N

    # save args
    self.Y = Y
    self.X = X
    self.title = title
    self.show_ems = ems
    self._log = []

    # decide which E(MS) model to use
    if X.df_error == 0:
        rfx, fx_desc = 1, "Mixed"
    elif X.df_error > 0:
        rfx, fx_desc = 0, "Fixed"
    else:
        raise ValueError("Model Overdetermined")
    self._log.append("%s effects model" % fx_desc)

    if lsq == 1:
        self._log.append("(my lsq)")
    elif lsq == 0:
        self._log.append("\n (np lsq)")

    # test_table: list of (effect, lm, lm_comp, MS_e, df_e)
    test_table = []
    # results_table: list of (name, SS, df, MS, F, p)
    results_table = []

    if len(X.effects) == 1:
        self._log.append("single factor model")
        only_lm = lm(Y, X, lsq=lsq)
        F, p = only_lm.F_test()
        results_table.append((X.name, only_lm.SS_model, only_lm.df_model,
                              only_lm.MS_model, F, p))
        results_table.append(("Residuals", only_lm.SS_res, only_lm.df_res,
                              only_lm.MS_res, None, None))
    else:
        if not rfx:
            # fixed effects: every test uses the full model's residuals
            full_lm = lm(Y, X, lsq=lsq)
            SS_e = full_lm.SS_res
            MS_e = full_lm.MS_res
            df_e = full_lm.df_res

        for e_test in X.effects:
            skip = False
            name = e_test.name

            # model 0: all other effects that are not of higher order
            # than the tested effect
            effects = [e for e in X.effects
                       if e is not e_test
                       and not _is_higher_order(e, e_test)]
            model0 = model(*effects)

            if e_test.df > model0.df_error:
                skip = "overspecified"
            else:
                lm0 = lm(Y, model0, lsq=lsq)

                # model 1: model 0 plus the tested effect
                effects.append(e_test)
                model1 = model(*effects)
                lm1 = lm(Y, model1, lsq=lsq) if model1.df_error > 0 else None

                if rfx:
                    # find E(MS)
                    EMS_effects = [
                        e for e in X.effects
                        if e is not e_test
                        and all([(f in e_test or f.random)
                                 for f in e.factors])
                        and all([(f in e or e.nestedin(f))
                                 for f in e_test.factors])
                    ]
                    if EMS_effects:
                        lm_EMS = lm(Y, model(*EMS_effects), lsq=lsq)
                        MS_e = lm_EMS.MS_model
                        df_e = lm_EMS.df_model
                    else:
                        if showall:
                            # report the effect without an F test
                            if lm1 is None:
                                SS = lm0.SS_res
                                df = lm0.df_res
                            else:
                                SS = lm0.SS_res - lm1.SS_res
                                df = lm0.df_res - lm1.df_res
                            MS = SS / df
                            results_table.append((name, SS, df, MS,
                                                  None, None))
                        skip = "no Hopkins E(MS)"

            if skip:
                self._log.append("SKIPPING: %s (%s)" % (e_test.name, skip))
            else:
                test_table.append((e_test, lm1, lm0, MS_e, df_e))
                SS, df, MS, F, p = incremental_F_test(lm1, lm0, MS_e=MS_e,
                                                      df_e=df_e)
                results_table.append((name, SS, df, MS, F, p))

        if not rfx:
            results_table.append(("Residuals", SS_e, df_e, MS_e,
                                  None, None))

    self._test_table = test_table
    self._results_table = results_table
def __init__(self, Y, X, norm=None, sub=None, ds=None,
             contours={.05: (.8, .2, .0), .01: (1., .6, .0),
                       .001: (1., 1., .0)}):
    """
    Pearson correlation of X with every data point of Y across cases.

    Y : ndvar
        Dependent variable (needs a case dimension).
    X : continuous
        The continuous predictor variable.
    norm : None | categorial
        Categories within which Y is normalized (z-scored).
    sub, ds :
        Passed on to ``asndvar`` / ``asvar``.
    contours : dict {p: color}
        For each p, the corresponding positive and negative r thresholds
        are added as contours to the colorspace of the result.

    The r map is stored as an ndvar in ``self.r`` and ``self.all``.

    Raises
    ------
    ValueError
        If Y has no case dimension, or if the mean of X is nan.
    """
    Y = asndvar(Y, sub=sub, ds=ds)
    X = asvar(X, sub=sub, ds=ds)
    if not Y.has_case:
        msg = ("Dependent variable needs case dimension")
        raise ValueError(msg)

    y = Y.x.reshape((len(Y), -1))
    if norm is not None:
        y = y.copy()
        for cell in norm.cells:
            idx = (norm == cell)
            y[idx] = scipy.stats.mstats.zscore(y[idx])

    n = len(X)
    x = X.x.reshape((n, -1))

    # covariance
    m_x = np.mean(x)
    if np.isnan(m_x):
        raise ValueError("np.mean(x) is nan")
    # Center out of place: reshape() can return views, so in-place
    # subtraction could modify the arrays backing X and Y (and would fail
    # for integer data).
    x = x - m_x
    y = y - np.mean(y, axis=0)
    cov = np.sum(x * y, axis=0) / (n - 1)

    # correlation; ddof=1 so that the standard deviations use the same
    # n - 1 normalization as the covariance (with the default ddof=0, r is
    # inflated by n / (n - 1) and can exceed 1)
    r = cov / (np.std(x, axis=0, ddof=1) * np.std(y, axis=0, ddof=1))

    # p-value calculation
    # http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#Inference
    pcont = {}
    r_ps = {}
    df = n - 2
    for p, color in contours.items():
        t = scipy.stats.distributions.t.isf(p, df)
        r_p = t / np.sqrt(df + t ** 2)
        pcont[r_p] = color
        r_ps[r_p] = p
        pcont[-r_p] = color
        r_ps[-r_p] = p

    # wrap the r map in an ndvar without the case dimension
    dims = Y.dims[1:]
    shape = Y.x.shape[1:]
    properties = Y.properties.copy()
    cs = _cs.Colorspace(cmap=_cs.cm_xpolar, vmax=1, vmin=-1,
                        contours=pcont, ps=r_ps)
    properties['colorspace'] = cs
    r = ndvar(r.reshape(shape), dims=dims, properties=properties)

    # store results
    self.name = "%s corr %s" % (Y.name, X.name)
    self.r = r
    self.all = r
def __init__(self, Y, X, sub=None, title=None, empty=True, ems=None,
             showall=False, ds=None):
    """
    Fits a univariate ANOVA model.

    Mixed effects models require full model specification so that E(MS)
    can be estimated according to Hopkins (1976)

    Parameters
    ----------
    Y : var
        dependent variable
    X : model
        Model to fit to Y
    sub, ds :
        Passed on to ``asvar`` / ``asmodel``.
    title :
        Stored as ``self.title``.
    empty : bool
        include rows without F-Tests (True/False); not referenced in this
        constructor
    ems : bool | None
        display source of E(MS) for F-Tests (True/False; None = use
        default); stored as ``self.show_ems``
    showall : bool
        show SS, df and MS for effects without F test; not referenced in
        this constructor
    """
    # TODO:
    #  - sort model
    #  - reuse lms which are used repeatedly
    #  - provide threshold for including interaction effects when testing
    #    lower level effects
    #
    # Problem with unbalanced models
    # ------------------------------
    #  - The SS of Effects which do not include the between-subject factor
    #    are higher than in SPSS
    #  - The SS of effects which include the between-subject factor agree
    #    with SPSS

    # prepare kwargs
    Y = asvar(Y, sub=sub, ds=ds)
    X = asmodel(X, sub=sub, ds=ds)
    if len(Y) != len(X):
        raise ValueError("Y and X must describe same number of cases")

    # save args
    self.Y = Y
    self.X = X
    self.title = title
    self.show_ems = ems
    self._log = []

    # decide which E(MS) model to use
    if X.df_error == 0:
        rfx, fx_desc = 1, "Mixed"
    elif X.df_error > 0:
        if hasrandom(X):
            err = ("Models containing random effects need to be fully "
                   "specified.")
            raise NotImplementedError(err)
        rfx, fx_desc = 0, "Fixed"
    else:
        raise ValueError("Model Overdetermined")
    self._log.append("Using %s effects model" % fx_desc)

    # one F test result and one name per tested effect
    self.F_tests = []
    self.names = []

    if len(X.effects) == 1:
        self._log.append("single factor model")
        single_lm = lm(Y, X)
        self.F_tests.append(single_lm)
        self.names.append(X.name)
        self.residuals = (single_lm.SS_res, single_lm.df_res,
                          single_lm.MS_res)
    else:
        if not rfx:
            # fixed effects: every test uses the full model's residuals
            # (mixed models get their error terms per effect, see below)
            full_lm = lm(Y, X)
            SS_e = full_lm.SS_res
            MS_e = full_lm.MS_res
            df_e = full_lm.df_res

        for e_test in X.effects:
            skip = False
            name = e_test.name

            # model 0: all other effects that are not of higher order
            # than the tested effect
            effects = [e for e in X.effects
                       if e is not e_test
                       and not is_higher_order(e, e_test)]
            model0 = model(*effects)

            if e_test.df > model0.df_error:
                skip = "overspecified"
            else:
                lm0 = lm(Y, model0)

                # model 1: model 0 plus the tested effect
                effects.append(e_test)
                model1 = model(*effects)
                lm1 = lm(Y, model1) if model1.df_error > 0 else None

                if rfx:
                    # find E(MS)
                    EMS_effects = _find_hopkins_ems(e_test, X)
                    if len(EMS_effects) > 0:
                        lm_EMS = lm(Y, model(*EMS_effects))
                        MS_e = lm_EMS.MS_model
                        df_e = lm_EMS.df_model
                    else:
                        # no error term: report SS, df and MS in the log
                        if lm1 is None:
                            SS = lm0.SS_res
                            df = lm0.df_res
                        else:
                            SS = lm0.SS_res - lm1.SS_res
                            df = lm0.df_res - lm1.df_res
                        MS = SS / df
                        skip = ("no Hopkins E(MS); SS=%.2f, df=%i, "
                                "MS=%.2f" % (SS, df, MS))

            if skip:
                self._log.append("SKIPPING: %s (%s)" % (e_test.name, skip))
            else:
                res = incremental_F_test(lm1, lm0, MS_e=MS_e, df_e=df_e,
                                         name=name)
                self.F_tests.append(res)
                self.names.append(name)

        if not rfx:
            self.residuals = SS_e, df_e, MS_e
def __init__(self, Y, X, sub=None):
    """
    Fit the model X to the dependent variable Y.

    Parameters
    ----------
    Y : var
        Dependent variable.
    X : model
        Model.
    sub : None | index
        Only use part of the data

    Notes
    -----
    The fitter is selected by the module-level ``_lm_lsq``: 0 uses
    ``lstsq``, 1 uses ``X.fit`` (Fox). ``_residuals`` is only set for
    ``_lm_lsq == 1``.

    Raises
    ------
    ValueError
        If ``_lm_lsq`` is neither 0 nor 1.
    """
    # prepare input
    Y = asvar(Y)
    X = asmodel(X)  # .sorted()
    if sub is not None:
        Y = Y[sub]
        X = X[sub]

    assert len(Y) == len(X)
    assert X.df_error > 0

    # fit
    if _lm_lsq == 0:
        # use lstsq (presumably scipy's, per the original "faster" note)
        beta, SS_res, _, _ = lstsq(X.full, Y.x)
        if np.size(SS_res) == 0:
            # lstsq returned no residual sum (an empty array, e.g. for a
            # rank-deficient design); without this the empty array would
            # propagate into every SS / MS attribute below
            SS_res = np.sum((Y.x - np.dot(X.full, beta)) ** 2)
    elif _lm_lsq == 1:  # Fox
        # estimate least squares approximation
        beta = X.fit(Y)
        # estimate
        values = beta * X.full
        Y_est = values.sum(1)
        self._residuals = residuals = Y.x - Y_est
        SS_res = np.sum(residuals ** 2)
        # sanity check; compare with a tolerance because exact float
        # equality would produce warnings from rounding error alone
        if not np.allclose(Y.mean(), Y_est.mean()):
            msg = ("Y.mean() != Y_est.mean() (%s vs "
                   "%s)" % (Y.mean(), Y_est.mean()))
            logging.warning(msg)
    else:
        raise ValueError("_lm_lsq must be 0 or 1, got %r" % (_lm_lsq,))

    # SS total
    SS_total = self.SS_total = np.sum((Y.x - Y.mean()) ** 2)
    df_total = self.df_total = X.df_total
    self.MS_total = SS_total / df_total

    # SS residuals
    self.SS_res = SS_res
    df_res = self.df_res = X.df_error
    self.MS_res = SS_res / df_res

    # SS explained, obtained as total minus residual SS
    SS_model = self.SS = self.SS_model = SS_total - SS_res
    df_model = self.df = self.df_model = X.df
    self.MS_model = self.MS = SS_model / df_model

    # store stuff
    self.Y = Y
    self.X = X
    self.sub = sub
    self.beta = beta