Ejemplo n.º 1
0
def ancova(Y, factorial_model, covariate, interaction=None, sub=None, v=True, empty=True, ems=None):
    """
    OBSOLETE

    Fit a sequence of linear models (without covariate, with the covariate
    as main effect and, optionally, with a covariate-by-factor interaction)
    and print the model comparisons. Nothing is returned; all results are
    printed.

    args
    ----
    Y: dependent variable
    factorial model: model (or something ``asmodel`` accepts) without the
                     covariate
    covariate: covariate to add to the model; must satisfy ``isvar``


    kwargs
    ------
    interaction: term from the factorial model to check for interaction with
                 the covariate
    sub: index applied to Y, factorial_model, covariate (and interaction, if
         given) before fitting
    v=True: display more information (the anova table of each model)
    **anova_kwargs: ems, empty (forwarded to the anova tables of model 1
                    and model 2)


    Based on
    --------
    Exercise to STATISTICS: AN INTRODUCTION USING R
    http://www.bio.ic.ac.uk/research/crawley/statistics/exercises/R6Ancova.pdf
    """
    assert isvar(covariate)
    anova_kwargs = {"empty": empty, "ems": ems}
    # Identity checks: ``!= None`` / truth-testing a data object may be
    # evaluated element-wise by array-like types.
    has_interaction = interaction is not None
    if sub is not None:
        Y = Y[sub]
        factorial_model = factorial_model[sub]
        covariate = covariate[sub]
        if has_interaction:
            interaction = interaction[sub]
    factorial_model = asmodel(factorial_model)

    # Model 1: factorial model only
    a1 = lm(Y, factorial_model)
    if v:
        print(a1.anova(title="MODEL 1", **anova_kwargs))
        print("\n")

    # Model 2: factorial model plus covariate main effect
    a2 = lm(Y, factorial_model + covariate)
    if v:
        print(a2.anova(title="MODEL 2: Main Effect Covariate", **anova_kwargs))
        print("\n")
    print('Model with "%s" Covariate > without Covariate' % covariate.name)
    print(comparelm(a1, a2))

    if has_interaction:
        logging.debug("%s / %s", covariate.name, interaction.name)
        logging.debug("%s", covariate.__div__)
        # The covariate-by-factor effect is built through an explicit
        # ``__div__`` call, as in the original code (the operator form
        # ``covariate / interaction`` was left disabled there).
        i_effect = covariate.__div__(interaction)
        # Model 3: factorial model plus the interaction effect
        a3 = lm(Y, factorial_model + i_effect)
        if v:
            print("\n")
            # NOTE(review): unlike models 1 and 2, anova_kwargs are not
            # forwarded here -- confirm whether that is intentional.
            print(a3.anova(title="MODEL 3: Interaction"))
        # compare
        print('\n"%s"x"%s" Interaction > No Covariate:' % (covariate.name, interaction.name))
        print(comparelm(a1, a3))
        print('\n"%s"x"%s" Interaction > Main Effect:' % (covariate.name, interaction.name))
        print(comparelm(a2, a3))
Ejemplo n.º 2
0
 def __str__(self):
     """
     Return a text summary listing the names of the stored items.

     Items in ``self._factors`` are reported under "Variables" or "Factors"
     (each list sorted by name); if the instance has a ``_stats``
     attribute, the names of its values are appended under "SEGMENTS".
     """
     variable_names = sorted(item.name for item in self._factors.values()
                             if _vsl.isvar(item))
     factor_names = sorted(item.name for item in self._factors.values()
                           if _vsl.isfactor(item))

     summary = 'Variables:\n' + ', '.join(variable_names)
     summary = summary + '\nFactors:\n' + ', '.join(factor_names)
     if hasattr(self, '_stats'):
         segment_names = [segment.name for segment in self._stats.values()]
         summary = summary + '\n\nSEGMENTS:\n' + ', '.join(segment_names)
     return summary
Ejemplo n.º 3
0
def _resample(Y, unit=None, replacement=True, samples=1000):
    """
    Generator function to resample a dependent variable (Y) multiple times

    Y: variable to resample; anything for which ``isvar`` is false is first
       converted with ``var(Y)``
    unit: factor specifying unit of measurement (e.g. subject). If unit is
          specified, resampling proceeds by first resampling the categories of
          unit (with or without replacement) and then shuffling the values
          within units (no replacement).
    replacement: whether random samples should be drawn with replacement or
                 without
    samples: number of samples to yield

    The same output object is yielded on every iteration, with its data
    updated for the new sample; consume each sample before requesting the
    next one.

    """
    if isvar(Y):
        Yout = Y.copy('_resampled')
    else:
        Y = var(Y)
        Yout = var(Y.copy(), name="Y resampled")

    if unit:
        ct = celltable(Y, unit)
        unit_data = ct.get_data(out=list)
        unit_indexes = ct.data_indexes.values()
        x_out = Yout.x

        if replacement:
            n = len(ct.indexes)
            for sample in xrange(samples):
                # pick, with replacement, which unit supplies the data for
                # each target unit
                source_ids = np.random.randint(n, size=n)
                for index, source_index in zip(unit_indexes, source_ids):
                    data = unit_data[source_index]
                    np.random.shuffle(data)  # shuffle within unit, in place
                    x_out[index] = data
                yield Yout

        else:
            for sample in xrange(samples):
                # permute the units, then shuffle values within each unit
                random.shuffle(unit_data)
                for index, data in zip(unit_indexes, unit_data):
                    np.random.shuffle(data)
                    x_out[index] = data
                yield Yout

    else:
        if replacement:
            N = Y.N
            for i in xrange(samples):
                # Draw N indices (not a single scalar index) so that the
                # resampled data has as many values as the original.
                index = np.random.randint(N, size=N)
                Yout.x = Y.x[index]
                yield Yout
        else:
            for i in xrange(samples):
                # permutation: keeps shuffling the output data in place
                np.random.shuffle(Yout.x)
                yield Yout
Ejemplo n.º 4
0
def _resample(Y, unit=None, replacement=True, samples=1000):
    """
    Generator function to resample a dependent variable (Y) multiple times

    Y : var | ndvar
        Variable which is to be resampled. One copy of ``Y`` is created up
        front and that same object is yielded (with updated data) in each
        iteration, so each sample has to be consumed before the next one is
        requested.

    unit : categorial
        factor specifying unit of measurement (e.g. subject). If unit is
        specified, resampling proceeds by first resampling the categories of
        unit (with or without replacement) and then shuffling the values
        within units (no replacement).

    replacement : bool
        whether random samples should be drawn with replacement or
        without

    samples : int
        number of samples to yield

    Yields ``(i, Yout)`` tuples, where ``i`` is the sample index.

    Raises ``ValueError`` for an ndvar without cases and ``TypeError`` if
    ``Y`` is neither a var nor an ndvar.

    """
    if isvar(Y):
        pass
    elif isndvar(Y):
        if not Y.has_case:
            raise ValueError("Need ndvar with cases")
    else:
        raise TypeError("need var or ndvar")

    Yout = Y.copy('{name}_resampled')

    if unit:  # not implemented
        ct = celltable(Y, unit)
        unit_data = ct.get_data(out=list)
        unit_indexes = ct.data_indexes.values()
        x_out = Yout.x

        if replacement:
            n = len(ct.indexes)
            for i in xrange(samples):
                # pick, with replacement, which unit supplies the data for
                # each target unit
                source_ids = np.random.randint(n, size=n)
                for index, source_index in zip(unit_indexes, source_ids):
                    data = unit_data[source_index]
                    np.random.shuffle(data)  # shuffle within unit, in place
                    x_out[index] = data
                yield i, Yout

        else:
            for i in xrange(samples):
                # permute the units, then shuffle values within each unit
                random.shuffle(unit_data)
                for index, data in zip(unit_indexes, unit_data):
                    np.random.shuffle(data)
                    x_out[index] = data
                yield i, Yout
    else:  # OK
        if replacement:
            N = len(Y)
            for i in xrange(samples):
                # N indices drawn from [0, N). ``randint(N, N)`` would mean
                # low=N, high=N, which is an empty range and raises.
                index = np.random.randint(N, size=N)
                Yout.x = Y.x[index]
                yield i, Yout
        else:  # OK
            for i in xrange(samples):
                # permutation: keeps shuffling the output data in place
                np.random.shuffle(Yout.x)
                yield i, Yout
Ejemplo n.º 5
0
def correlations(Y, Xs, cat=None, levels=[.05, .01, .001], diff=None, sub=None,
         pmax=None, nan=True):  # , match=None):
    """
    Build a table of the correlations of ``Y`` with one or more variables.

    :arg var Y: first variable
    :arg Xs: second variable, or a list of variables; one table entry (or
        one entry per category) is added for each of them
    :arg cat: categorial grouping; if given, correlations are added
        separately for each cell in ``cat.cells``
    :arg list levels: significance levels to mark
    :arg diff: not implemented; any value other than ``None`` raises
        ``NotImplementedError``
    :arg sub: index used to restrict ``Y``, ``Xs`` (and ``cat``, if it is a
        model or factor) to a subset of the data
    :arg pmax: (None) if set, a closing note "all other p>pmax" is added
    :arg nan: forwarded to the row-building helper; if ``False`` and NANs
        were counted on the table, the count is mentioned in the closing
        note
    :rtype: Table

    """
    levels = np.array(levels)

    # a single variable is treated like a one-element list
    if isvar(Xs):
        Xs = [Xs]

    # restrict all inputs to the requested subset
    if sub is not None:
        Y = Y[sub]
        Xs = [X[sub] for X in Xs]
        if ismodel(cat) or isfactor(cat):
            cat = cat[sub]

    if diff is not None:
        raise NotImplementedError

    # header: one extra column when results are split by category
    by_category = cat is not None
    if by_category:
        assert iscategorial(cat)
        header = ('Variable', 'Category', 'r', 'p', 'n')
    else:
        header = ('Variable', 'r', 'p', 'n')
    table = fmtxt.Table('l' * len(header))
    table.cells(*header)

    table.midrule()
    table.title("Correlations with %s" % (Y.name))

    # NAN counter stored on the table object
    table._my_nan_count = 0

    for X in Xs:
        if not by_category:
            _corr_to_table(table, Y, X, cat, levels, pmax=pmax, nan=nan)
            continue

        # show the variable name only until the first row for it was added
        show_name = True
        for cell in cat.cells:
            rows_before = len(table)
            cell_index = (cat == cell)
            _corr_to_table(table, Y, X, cell_index, levels, pmax=pmax, nan=nan,
                           printXname=show_name, label=cell_label(cell))
            if len(table) > rows_before:
                show_name = False

    # closing note, e.g. "(all other p>0.05, 3 NANs)"
    notes = []
    if pmax is not None:
        notes.append('all other p>{p}'.format(p=pmax))
    if nan is False and table._my_nan_count > 0:
        notes.append('%s NANs' % table._my_nan_count)
    if notes:
        table.cell("(%s)" % ', '.join(notes))
    return table
Ejemplo n.º 6
0
def data(Y, X=None, match=None, cov=[], sub=None, fmt=None, labels=True,
          showcase=True):
    """
    return a textab.table (printed as tsv table by default)

    parameters
    ----------
    Y: variable to display (can be model with several dependents)

    X: categories defining cells (factorial model)

    match: factor to match values on and return repeated-measures table

    cov: covariate to report (WARNING: only works with match, where each value
         on the matching variable corresponds with one value in the covariate).
         A single factor/var or a list of them; the default list is never
         mutated.

    sub: boolean array specifying which values to include (generate e.g.
         with 'sub=T==[1,2]')

    fmt: Format string

    labels: display labels for nominal variables (otherwise display codes)

    showcase: add a leading column with the case labels (repeated-measures
              table only)

    raises
    ------
    ValueError if a covariate has more than one distinct value for a case.

    """
    if hasattr(Y, '_items'):  # dataframe
        Y = Y._items
    Y = _data.asmodel(Y)
    if _data.isfactor(cov) or _data.isvar(cov):
        cov = [cov]

    # Collect the data columns of all dependent variables. The loop locals
    # must not be named ``_data``: assigning to that name anywhere in the
    # function makes it local and breaks every use of the ``_data`` module
    # in this function (UnboundLocalError on the first one).
    columns = []
    ynames = []  # names of Yi for independent measures table headers
    within_list = []
    for Yi in Y.effects:
        yi_columns, datalabels, names, yi_within = _data._split_Y(
            Yi, X, match=match, sub=sub, datalabels=match)
        columns += yi_columns
        ynames.append(Yi.name)
        within_list.append(yi_within)
    within = within_list[0]
    assert all(w == within for w in within_list)

    # table
    n_dependents = len(Y.effects)
    n_cells = len(columns) // n_dependents
    if within:
        n, k = len(columns[0]), len(columns)
        table = textab.Table('l' * (k + showcase + len(cov)))

        # The case labels are needed for the table body and the covariate
        # lookup even when the case column itself is not shown.
        case_labels = datalabels[0]
        assert all(np.all(case_labels == l) for l in datalabels[1:])

        # header line 1
        if showcase:
            table.cell(match.name)
        for _ in range(n_dependents):
            for name in names:
                table.cell(name.replace(' ', '_'))
        for c in cov:
            table.cell(c.name)

        # header line 2
        if n_dependents > 1:
            if showcase:
                table.cell()
            for name in ynames:
                for _ in range(n_cells):
                    table.cell('(%s)' % name)
            for c in cov:
                table.cell()

        # body
        table.midrule()
        for i in range(n):
            case = case_labels[i]
            if showcase:
                table.cell(case)
            for j in range(k):
                table.cell(columns[j][i], fmt=fmt)
            # covariates
            indexes = match == case
            for c in cov:
                # test it's all the same values
                case_cov = c[indexes]
                if len(np.unique(case_cov.x)) != 1:
                    msg = 'covariate for case "%s" has several values' % case
                    raise ValueError(msg)
                # get value
                first_i = np.nonzero(indexes)[0][0]
                cov_value = c[first_i]
                if _data.isfactor(c) and labels:
                    cov_value = c.cells[cov_value]
                table.cell(cov_value, fmt=fmt)
    else:
        table = textab.Table('l' * (1 + n_dependents))
        table.cell(X.name)
        for yname in ynames:
            table.cell(yname)
        table.midrule()
        # columns is now sorted: (cell_i within dependent_i)
        # sort data as (X-cell, dependent_i)
        data_sorted = []
        for i_cell in range(n_cells):
            data_sorted.append([columns[i_dep * n_cells + i_cell]
                                for i_dep in range(n_dependents)])
        # table
        for name, cell_data in zip(names, data_sorted):
            for i in range(len(cell_data[0])):
                table.cell(name)
                for dep_data in cell_data:
                    table.cell(dep_data[i], fmt=fmt)
    return table