Example 1
def bootstrap(factor_names, model, run_count):
    """Create a minimal starting design that is non-singular."""
    md = ModelDesc.from_formula(model)
    model_size = len(md.rhs_termlist)
    if run_count == 0:
        run_count = model_size
    if model_size > run_count:
        raise ValueError("Can't build a design of size {} "
                         "for a model of rank {}. "
                         "Model: '{}'".format(run_count, model_size, model))

    factor_count = len(factor_names)
    x0 = np.zeros(factor_count)
    # add high/low bounds to constraint matrix
    constraint_matrix = np.zeros((factor_count * 2, factor_count))
    bounds = np.zeros(factor_count * 2)
    c = 0
    for f in range(factor_count):
        constraint_matrix[c][f] = -1
        bounds[c] = 1
        c += 1
        constraint_matrix[c][f] = 1
        bounds[c] = 1
        c += 1

    start_points = hit_and_run(x0, constraint_matrix, bounds, run_count)

    d = pd.DataFrame(start_points, columns=factor_names)
    X = dmatrix(model, d)

    return (d, X)
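The paired rows encode the box constraints -1 <= x_f <= +1 as A x <= b, and hit_and_run then samples run_count points from inside that box. A minimal usage sketch under those assumptions (hit_and_run is project-internal; the factor names and model below are made up):

import numpy as np
import pandas as pd
from patsy import ModelDesc, dmatrix

# run_count=0 lets bootstrap() choose the smallest non-singular size,
# i.e. one run per model term (here: intercept + A + B + C = 4 runs).
d, X = bootstrap(["A", "B", "C"], "A + B + C", run_count=0)
assert X.shape[0] >= X.shape[1]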
Example 2
    def _package_attrs(self, attrs):
        # Sometimes features are retrieved from wrapper (stargazer does this),
        # other times from the actual result (statsmodels' summary_col does
        # this), so we'll have both.
        rres = RRegressionResults()

        # Use patsy to extract the target variable:
        fobj = ModelDesc.from_formula(self.formula)
        rres.target = fobj.lhs_termlist[0].name()
        rres.model = self

        # We need to hijack this rather than subclassing because stargazer does
        # not use "isinstance()" but "type()":
        wrap = RegressionResultsWrapper(rres)

        # All items except "params" are @cache_readonly and need to be
        # deleted first and then redefined:
        for attr in attrs:
            if attr not in ('params', ):
                if hasattr(rres, attr):
                    delattr(rres, attr)
            setattr(rres, attr, attrs[attr])
            setattr(wrap, attr, attrs[attr])
            self._debug("Set {} to {}".format(attr, attrs[attr]))

        rres.__class__ = RegressionResults
        return wrap
Example 3
def add_predictors(base_formula, extra_predictors):
    desc = ModelDesc.from_formula(base_formula)
    # Using LookupFactor here ensures that everything will work correctly even
    # if one of the column names in extra_columns is named like "weight.in.kg"
    # or "sys.exit()" or "LittleBobbyTables()".
    desc.rhs_termlist += [Term([LookupFactor(p)]) for p in extra_predictors]
    return desc
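A sketch of the effect with illustrative names: because each extra predictor is wrapped in a LookupFactor, patsy treats it as a literal column lookup rather than Python code to evaluate.

from patsy import ModelDesc, Term, LookupFactor

desc = add_predictors("y ~ x", ["weight.in.kg", "sys.exit()"])
print(desc.describe())
# y ~ x + weight.in.kg + sys.exit()  -- both appended as plain column names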
Example 4
def _parse_formula(formula_str, include_intercept=False):
    """ Wrap some extra functionality into Patsy formula parse """

    _form = ModelDesc.from_formula(formula_str)

    # patsy includes an intercept by default; discard it on the RHS
    if not include_intercept:
        _form.rhs_termlist = [
            t for t in _form.rhs_termlist if len(t.factors) > 0
        ]

    # print(_form.lhs_termlist)
    # _categoricals = [t for t in _form.lhs_termlist
    #                  if t.startswith("C(") and t.endswith(")")]
    # _task = 'classify' if len(_categoricals) > 0 else 'regress'

    # if len(_categoricals) != len(_form.lhs_termlist):
    #     raise ValueError(f"Mixed targets detected in {formula_str}. "
    #                      "Specify all categoricals "
    #                      "using the C(...) syntax or all continuous.")

    # _num_classes = None if _task == 'classify' else len(_form.lhs_termlist)
    _num_classes = 2
    _task = 'classify'

    return _form, _task, _num_classes
Example 5
def _drop_intercept(formula, add_intercept):
    """Drop the intercept from formula if not add_intercept"""
    if not add_intercept:
        if not isinstance(formula, ModelDesc):
            formula = ModelDesc.from_formula(formula)
        if INTERCEPT in formula.rhs_termlist:
            formula.rhs_termlist.remove(INTERCEPT)
        return formula
    return formula
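This works because patsy's INTERCEPT is simply the term with no factors, which from_formula() adds to the RHS by default. A minimal sketch of that equivalence:

from patsy import INTERCEPT, ModelDesc, Term

assert INTERCEPT == Term([])            # the intercept is the empty term
desc = ModelDesc.from_formula("y ~ x")
assert INTERCEPT in desc.rhs_termlist   # added by default
desc.rhs_termlist.remove(INTERCEPT)
print(desc.describe())                  # y ~ 0 + x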
Example 6
    def get_factor_names(self, level=1):
        """
        Gets the factors in a model which correspond to a certain level:
        1 : pure factors
        2 : 2-factor interactions and quadratic terms
        3 : 3-factor interactions and cubic terms
        4 : etc.
        """
        spec = ModelDesc.from_formula(self._model_spec)
        return [term.name() for term in spec.rhs_termlist
                if len(term.factors) == level]
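A quick sketch of the level filtering on a made-up spec: level 1 selects the main effects, level 2 the two-factor interactions.

from patsy import ModelDesc

spec = ModelDesc.from_formula("y ~ (a + b + c)**2")
print([t.name() for t in spec.rhs_termlist if len(t.factors) == 1])
# ['a', 'b', 'c']
print([t.name() for t in spec.rhs_termlist if len(t.factors) == 2])
# ['a:b', 'a:c', 'b:c']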
Example 7
    @classmethod
    def from_r_object(cls, rsum, ci=None, debug=False):
        """
        Reconstruct a model from an rpy2 summary object, and optionally its
        confidence intervals.
        These can be easily saved in R with
            save(objname, file=file_name)
        and loaded in Python via rpy2 with
            r['load'](file_name)['objname']

        Parameters
        ----------
        rsum : R object
            R summary of a fitted model.
            Typically produced with "summary(fitted)" (in R).
        ci : R object
            Confidence intervals of the fitted model
            Typically produced with "confint(fitted)" (in R).
        debug : bool, default False
            If True, print debug messages.
        """

        d_res = cls._r_as_dict(None, rsum)

        if 'terms' not in d_res:
            msg = ("Interpreting R objects inside Python is only supported "
                   "for a few estimators. More will work using "
                   "RModel.from_rdata() directly.")
            raise NotImplementedError(msg)

        formula = str(d_res['terms']).splitlines()[0]

        # We want to create a fake dataset, and we use patsy to get the list
        # of variables. We are actually creating columns for interactions and
        # functions too... but identifying them would be overkill at the
        # moment.
        fobj = ModelDesc.from_formula(formula)
        varnames = [t.name()
                    for t in fobj.rhs_termlist + fobj.lhs_termlist][1:]

        # We need to pass some pd.DataFrame to from_formula() below - but it
        # doesn't seem to be actually used.
        data = pd.DataFrame(-1, index=[0], columns=[0])

        # Creating the OLS object and only then hijacking it lets us take
        # full advantage of statsmodels' machinery:
        mod = OLS.from_formula(formula, data)
        mod.__class__ = RModel
        # This is now an RModel:
        mod._initialize(debug=debug)

        attrs = mod._inspect_R(rsum, ci=ci)
        wrap = mod._package_attrs(attrs)

        return wrap
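The [1:] slice relies on the intercept being the first term in rhs_termlist; a sketch with an illustrative formula:

from patsy import ModelDesc

fobj = ModelDesc.from_formula("y ~ x1 + x2")
print([t.name() for t in fobj.rhs_termlist + fobj.lhs_termlist])
# ['Intercept', 'x1', 'x2', 'y'] -- [1:] keeps only the real variables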
Example 8
def _parse_formula(formula):
    # head off patsy errors
    if ';' in formula or formula.strip()[0].isdigit():
        metric = formula.split('~')[0].strip()
    else:
        metric = None

    # use patsy to parse formula
    model_desc = ModelDesc.from_formula(formula)
    group_columns = set()
    for t in model_desc.rhs_termlist:
        for i in t.factors:
            group_columns.add(i.name())
    if metric is None:
        metric = model_desc.lhs_termlist[0].name()
    return metric, group_columns
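A sketch of the round trip with illustrative column names:

metric, group_columns = _parse_formula("shannon ~ group + site")
print(metric)         # 'shannon'
print(group_columns)  # {'group', 'site'}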
Example 9
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            column = metadata.get_column(i.name())
            if column.has_missing_values():
                raise ValueError(
                    'adonis requires metadata columns with no '
                    'NaN values (missing values in column `%s`).' %
                    (column.name, ))

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = [
            'run_adonis.R', dm_fp, md_fp, formula,
            str(permutations),
            str(n_jobs), results_fp
        ]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
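A sketch of which metadata columns the validation loop visits (names illustrative): interaction terms contribute their component factors, so each referenced column is checked.

from patsy import ModelDesc

terms = ModelDesc.from_formula("body_site * subject")
checked = {f.name() for t in terms.rhs_termlist for f in t.factors}
print(checked)  # {'body_site', 'subject'}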
Example 10
def _parse_formula(formula):
    # head off patsy errors
    if '~' not in formula:
        raise ValueError('Formula not valid: missing tilde.\n'
                         'Enter a valid formula in format "y ~ model".')
    if ';' in formula or formula.strip()[0].isdigit():
        metric = formula.split('~')[0].strip()
    else:
        metric = None

    # use patsy to parse formula
    model_desc = ModelDesc.from_formula(formula)
    group_columns = set()
    for t in model_desc.rhs_termlist:
        for i in t.factors:
            group_columns.add(i.name())
    if metric is None:
        metric = model_desc.lhs_termlist[0].name()
    return metric, group_columns
Example 11
def parse_formula(f_str):

    patsy_formula = ModelDesc.from_formula(f_str)

    tokenize = patsy_formula.lhs_termlist

    valid_tokenizers = list()
    for term in tokenize:
        for e in term.factors:
            code = e.code
            if code in _VALID_TOKENIZERS:
                valid_tokenizers.append(code)

    if len(valid_tokenizers) == 0:
        tokenize.insert(0, Term([EvalFactor(_DEFAULT_TOKENIZER)]))
    if len(valid_tokenizers) > 1:
        raise RuntimeError("Multiple tokenizers found in formula\n"
                           f"Specify one from {' '.join(_VALID_TOKENIZERS)}")

    preprocess = [t for t in patsy_formula.rhs_termlist if len(t.factors) > 0]
    return tokenize, preprocess
Example 12
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
Example 13
# Now to try this in `patsy`.  
# 
# Steps:  
# 1. See how the model description is derived from the formula  
# 2. Build the design matrix that the formula specifies  
# 3. Use the design matrix in order to create the model in `scikit-learn`

# In[87]:

from patsy import ModelDesc, EvalEnvironment


# In[88]:

env = EvalEnvironment.capture()
predicted_lat_age_mtx = ModelDesc.from_formula('Predicted ~ Age_Calc * Case', env)


# In[89]:

predicted_lat_age_mtx


# In[90]:

from patsy import dmatrix


# In[91]:

design_mtx = dmatrix('Case * Age_Calc', subj_case_data)
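# Step 3 is not shown in the notebook; a minimal sketch of what it might look
# like (scikit-learn's LinearRegression is an assumption here, and the column
# name 'Predicted' is taken from the formula above):

# In[92]:

import numpy as np
from sklearn.linear_model import LinearRegression

# A patsy DesignMatrix is an ndarray subclass, so scikit-learn accepts it
# directly; fit_intercept=False because the design matrix already has one.
model = LinearRegression(fit_intercept=False)
model.fit(np.asarray(design_mtx), subj_case_data['Predicted'])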
Example 14
def mean_from_patsy_formula(formula, inputdict={}):
    """
    Create a mean function from a patsy formula

    This is the functional interface to creating a mean function from a patsy/R formula.
    The higher level function ``MeanFunction`` in the ``MeanFunction`` module is preferred
    over this one, but this potentially gives the user slightly more control as a
    patsy ``ModelDesc`` object can be passed directly, rather than giving a string.

    This method takes a string or a patsy ``ModelDesc`` object as input, plus an
    optional dictionary that maps strings to integer indices in the input data.
    The formula is then parsed with patsy, and the resulting individual terms are
    converted to mean functions and composed using the provided operations.

    The string formulas can be specified in several ways. The formula LHS is
    implicitly always ``"y = "`` or ``"y ~ "``, though these can also be provided
    explicitly (they are ignored in the conversion). The RHS may contain a set of
    terms combining the add, multiply, power, and call operations much in the same
    way that the operations would be entered as regular Python code. Parentheses
    are used to indicate precedence as well as the call operation, and square
    brackets indicate an indexing operation on the inputs. Inputs may be specified
    as either
    a string such as ``"x[0]"``, ``"inputs[0]"``, or a string that can be mapped to
    an integer index with the optional dictionary passed to the function. Any strings
    not representing operations or inputs as described above are interpreted as follows:
    if the string can be converted into a number, then it is interpreted as a
    ``ConstantMean`` fixed mean function object; otherwise it is assumed to represent
    a fitting coefficient. Note that this means many characters that do not represent
    operations within this mean function language but would not normally be considered
    as python variables will nonetheless be converted into fitting coefficients --
    it is up to the user to get this right.

    Expressions that are repeated or redundant will not be simplified beyond the parsing
    done by patsy, so the user should take care that the provided expression is sensible
    as a mean function and will not cause problems when fitting.

    Examples: ::

        >>> from mogp_emulator.formula import mean_from_patsy_formula
        >>> mf1 = mean_from_patsy_formula("x[0]")
        >>> print(mf1)
        c + c*x[0]
        >>> mf2 = mean_from_patsy_formula("a*b", {"a": 0, "b": 1})
        >>> print(mf2)
        c + c*x[0] + c*x[1] + c*x[0]*x[1]

    :param formula: string representing the desired mean function formula
                    or a patsy ``ModelDesc`` object
    :type formula: str or ModelDesc
    :param inputdict: dictionary used to map variables to input indices. Maps
                      strings to integer indices (must be non-negative). Optional,
                      default is ``{}``.
    :type inputdict: dict
    :returns: New subclass of ``MeanBase`` implementing the given formula
    :rtype: subclass of MeanBase (exact type will depend on the formula that is provided)
    """

    assert not no_patsy, "patsy must be installed to parse formulas using patsy"

    if isinstance(formula, str):
        model = ModelDesc.from_formula(formula)
    elif isinstance(formula, ModelDesc):
        model = formula
    else:
        raise ValueError("formula must be a string or a patsy ModelDesc object")

    model_terms = []

    for term in model.rhs_termlist:
        model_terms.append(_term_to_mean(term, inputdict))

    mf = model_terms.pop(0)

    assert issubclass(type(mf), MeanFunction.MeanBase)

    for term in model_terms:
        mf += term

    assert issubclass(type(mf), MeanFunction.MeanBase)

    return mf
Example 15
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from patsy import ModelDesc
from patsy import demo_data
from patsy import dmatrix, dmatrices
print(ModelDesc.from_formula("y ~ x").describe())
print(ModelDesc.from_formula("y ~ x + x + x").describe())
print(ModelDesc.from_formula("y ~ -1 + x").describe())
print(ModelDesc.from_formula("~ -1").describe())
print(ModelDesc.from_formula("y ~ a:b").describe())
print(ModelDesc.from_formula("y ~ a*b").describe())
print(ModelDesc.from_formula("y ~ (a + b + c + d) ** 2").describe())
print(ModelDesc.from_formula("y ~ (a + b)/(c + d)").describe())
print(
    ModelDesc.from_formula("np.log(x1 + x2) "
                           "+ (x + {6: x3, 8 + 1: x4}[3 * i])").describe())
# Sometimes it might be easier to read if you put the processed formula back
# into formula notation using ModelDesc.describe():

desc = ModelDesc.from_formula("y ~ (a + b + c + d) ** 2")
print(desc.describe())

data = demo_data("a", "b", "x1", "x2")
mat = dmatrix("x1:x2 + a:b + b + x1:a:b + a + x2:a:x1", data)
print(mat.design_info.term_names)

data = demo_data("a", "b", "y")

mat1 = dmatrices("y ~ 0 + a:b", data)[1]
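The pair returned by dmatrices() carries the same metadata as dmatrix(); a small check on the no-intercept request above:

print(mat1.design_info.term_names)  # ['a:b'] -- only the one requested term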
Example 16
    def __str__(self):
        spec = ModelDesc.from_formula(self._model_spec)
        return spec.describe()
Example 17
def lm(
    model_spec: str,
    data: pd.DataFrame,
    name: Optional[str] = None,
    alias_threshold: Optional[float] = 0.995,
) -> Model:
    """
    Create a linear model.
    """
    def find_aliases(model, model_desc, threshold_correlation=0.995):
        """
        Finds columns which are exactly correlated, or correlated at or above
        the level of `threshold_correlation`.
        Returns a dictionary of aliasing and a list of columns to keep.

        The columns to keep will be in the order checked. Perhaps this can be
        improved.
        For example if AB = CD, then return AB to keep.
        For example if A = BCD, then return A, and not the BCD column to keep.
        """
        has_variation = model.exog.std(axis=0) > np.sqrt(np.finfo(float).eps)

        # np.dot(model.exog.T, model.exog)/model.exog.shape[0]
        # Drop columns which do not have any variation
        corrcoef = np.corrcoef(model.exog[:, has_variation].T)  # , ddof=0)

        # Snippet of code here is from the NumPy "corrcoef" function. Adapted.
        c = np.cov(model.exog.T, None, rowvar=True)
        dot_product = model.exog.T @ model.exog
        try:
            d = np.diag(c)
        except ValueError:
            # scalar covariance
            # nan if incorrect value (nan, inf, 0), 1 otherwise
            return c / c
        stddev = np.sqrt(d.real)

        aliasing = defaultdict(list)
        terms = model_desc.rhs_termlist
        drop_columns = []
        counter = -1
        corrcoef = c.copy()
        for idx, check in enumerate(has_variation):
            if check:
                counter += 1

                for j, stddev_value in enumerate(stddev):
                    if stddev_value == 0:
                        pass
                    else:
                        corrcoef[idx, j] = c[idx, j] / stddev[idx] / stddev[j]

                # corrcoef = c / stddev[idx, None]
                # corrcoef = corrcoef / stddev[None, idx]

                candidates = [
                    i for i, val in enumerate(np.abs(corrcoef[idx, :]))
                    if (val > threshold_correlation)
                ]
                signs = [np.sign(j) for j in corrcoef[idx, :]]
            else:
                # Columns with no variation
                candidates = [
                    i for i, j in enumerate(has_variation)
                    if (j <= threshold_correlation)
                ]

            # Track the correlation signs
            signs = [np.sign(j) for j in dot_product[idx, :]]

            # Now drop out the candidates with the longest word lengths
            alias_len = [(len(terms[i].factors), i) for i in candidates]
            alias_len.sort(reverse=True)
            for entry in alias_len[0:-1]:
                drop_columns.append(entry[1])

            for col in candidates:
                if col == idx:
                    # It is of course perfectly correlated with itself
                    pass
                else:

                    aliases = [t.name() for t in terms[col].factors]
                    if len(aliases) == 0:
                        aliases = ["Intercept"]

                    key = tuple([t.name() for t in terms[idx].factors])
                    if len(key) == 0:
                        key = ("Intercept", )

                    if signs[col] > 0:
                        aliases.insert(0, "+")
                    if signs[col] < 0:
                        aliases.insert(0, "-")
                    aliasing[key].append(aliases)

        # Sort the aliases in length:
        for key, val in aliasing.items():
            alias_len = [(len(i), i) if i[1] != "Intercept" else (1e5, i)
                         for i in val]
            alias_len.sort()
            aliasing[key] = [i[1] for i in alias_len]

        return aliasing, list(set(drop_columns))

    pre_model = smf.ols(model_spec, data=data)
    model_description = ModelDesc.from_formula(model_spec)
    aliasing, drop_columns = find_aliases(
        pre_model, model_description, threshold_correlation=alias_threshold)
    drop_column_names = [pre_model.data.xnames[i] for i in drop_columns]

    post_model = smf.ols(model_spec, data=data, drop_cols=drop_column_names)

    name = name or data.pi_title
    out = Model(
        OLS_instance=post_model.fit(),
        model_spec=model_spec,
        aliasing=aliasing,
        name=name,
    )
    out.data = data

    return out
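The alias detection ultimately reduces to flagging pairs of design columns whose absolute correlation exceeds the threshold; a stripped-down sketch of that core test, independent of the statsmodels machinery above:

import numpy as np

# Two columns that are perfectly (negatively) correlated, i.e. aliased:
X = np.array([[1.0, -1.0], [2.0, -2.0], [3.0, -3.0]])
corr = np.corrcoef(X.T)
print(abs(corr[0, 1]) > 0.995)  # True -- column 1 aliases column 0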
Example 18
    def get_response_name(self):
        spec = ModelDesc.from_formula(self._model_spec)
        return spec.lhs_termlist[0].name()
Example 19
def guess_and_check_eq(sims, info):
    errors, obs = [], "noisy"
    for index, row in sims.iterrows():
        T = row['T']
        gt = row['std']**2 / T - row['eps']**2 / T
        error = np.mean((row[obs] - gt)**2)
        errors.append(error)
    sims['errors'] = errors
    sims['inv_T'] = 1 / sims['T']
    sims['inv_D'] = 1 / sims['D']
    sims['inv_T2'] = 1 / sims['T']**2
    sims['inv_D2'] = 1 / sims['D']**2
    sims['sqrt_T'] = 1 / np.sqrt(sims['T'])
    sims['sqrt_D'] = 1 / np.sqrt(sims['D'])
    sims['std2'] = sims['std']**2
    sims['eps2'] = sims['eps']**2
    sims['eps2_div_T'] = sims['eps2'] / (sims['T'] * sims['D'])**2
    sims['std2_div_T'] = sims['std2'] / (sims['T'] * sims['D'])**2
    sims['eps2_std2'] = sims['std2'] * sims['eps2'] / sims['T']**2
    sims['sum_eps2_std2_div_T'] = sims['eps2'] + sims['std2'] / sims['T']**2
    sims['exp_sum_eps2_std2_div_T'] = np.exp(
        (sims['eps2_div_T'] + sims['std2_div_T']))
    sims['sum_std2_prod_eps2_std2_div_T'] = sims['std2'] * (
        sims['eps2_div_T'] + sims['std2_div_T'])
    sims['div_D_prod_eps2_std2_div_T'] = (sims['eps2_div_T'] +
                                          sims['std2_div_T']) / sims['D']
    sims['eps2_prod_eps2_std2_div_T'] = (sims['eps2_div_T'] +
                                         sims['std2_div_T']) * sims['eps2']
    sims['std2_prod_eps2_std2_div_T'] = (sims['eps2_div_T'] +
                                         sims['std2_div_T']) * sims['std2']
    # sims['exp_eps2_std2_div_T'] = np.exp( ( sims['eps2'] + sims['std2'] ) / sims['T']**2)
    # sims = sims[sims['std'] > 0]
    # sims = sims[sims['T'] > 3]
    # sims = sims[sims['eps'] > 0]
    # model_desc = "errors ~ std2_div_T * eps2_div_T * sum_eps2_std2_div_T * T * D - D - T"
    model_desc = "errors ~ eps2_div_T * std2_div_T * inv_T2 * inv_D2 * std2_prod_eps2_std2_div_T * eps2_prod_eps2_std2_div_T"
    # model_desc = "errors ~ eps2_div_T * std2_div_T * inv_T * inv_D * exp_sum_eps2_std2_div_T"
    y, X = dmatrices(model_desc, sims)
    desc = ModelDesc.from_formula(model_desc)
    print(desc.describe())
    reg = linear_model.LinearRegression()
    #reg = linear_model.Ridge(alpha=1000)
    # reg = linear_model.Lasso(alpha=.1,max_iter=10000)
    reg.fit(X, y)
    print(reg.coef_)
    print(reg.intercept_)

    b = reg.intercept_
    A = reg.coef_
    print(X.shape, A.shape)
    preds = reg.predict(X)
    # preds_2 = X @ A.T + b
    # print(np.mean((preds - preds_2)**2))
    mse = np.mean((y - preds)**2)
    pp = pprint.PrettyPrinter(indent=4)

    results = {}
    for v, k in zip(reg.coef_[0], str(desc.describe()).split(" + ")):
        name = str(int(np.log(abs(v)) / np.log(10)))
        if not (name in results.keys()): results[name] = []
        results[name].append(k)
    pp.pprint(results)

    fixed = {"T": 10, "D": 100}
    fsims = sims
    for name, value in fixed.items():
        fsims = fsims[fsims[name] == value]
    noisy = fsims['noisy'].mean()
    std = fsims['std'].mean()
    eps = fsims['eps'].mean()
    # print(noisy,std,eps)
    print("MSE ~= %2.3e" % mse)
Example 20
def _epoch_spans(recspan_intern_table, data_set, rerp_specs, eval_env):
    rerp_infos = []
    rerp_names = set()
    spans = []
    design_offset = 0
    expanded_design_offset = 0
    data_format = data_set.data_format
    for rerp_idx, rerp_spec in enumerate(rerp_specs):
        start_offset = data_format.ms_to_samples(rerp_spec.start_time)
        # Offsets are half open: [start, stop)
        # But, it's more intuitive for times to be closed: [start, stop]
        # So we interpret the user times as a closed interval, and add 1
        # sample when converting to offsets.
        stop_offset = 1 + data_format.ms_to_samples(rerp_spec.stop_time)
        if start_offset >= stop_offset:
            raise ValueError("Epochs must be >1 sample long!")
        event_set = data_set.events.find(rerp_spec.event_query)
        # Tricky bit: the spec gives a RHS-only formula, but really we have an
        # implicit LHS (determined by the event_query). This makes things
        # complicated when it comes to e.g. keeping track of which items
        # survived NA removal, determining the number of rows in an
        # intercept-only formula, etc. Really we want patsy to just treat all
        # this stuff the same way as it normally handles a LHS~RHS
        # formula. So, we convert our RHS formula into a LHS~RHS formula,
        # using a special LHS that represents each event by a placeholder
        # integer!
        desc = ModelDesc.from_formula(rerp_spec.formula, eval_env)
        if desc.lhs_termlist:
            raise ValueError("Formula cannot have a left-hand side")
        desc.lhs_termlist = [Term([_ArangeFactor(len(event_set))])]
        fake_lhs, design = dmatrices(desc, event_set)
        surviving_event_idxes = np.asarray(fake_lhs, dtype=int).ravel()
        design_row_idxes = np.empty(len(event_set))
        design_row_idxes.fill(-1)
        design_row_idxes[surviving_event_idxes] = np.arange(design.shape[0])
        # Now design_row_idxes[i] is -1 if event i was thrown out, and
        # otherwise gives the row in 'design' which refers to event 'i'.
        for i in range(len(event_set)):
            event = event_set[i]
            # -1 for non-existent
            design_row_idx = design_row_idxes[i]
            recspan = (event.recording, event.span_id)
            recspan_intern = recspan_intern_table[recspan]
            epoch_start = start_offset + event.start_idx
            epoch_stop = stop_offset + event.start_idx
            if design_row_idx == -1:
                design_row = None
            else:
                design_row = design[design_row_idx, :]
            epoch = _Epoch(epoch_start, epoch_stop - epoch_start, design_row,
                           design_offset, expanded_design_offset, rerp_idx, [])
            if design_row is None:
                # Event thrown out due to missing predictors; this
                # makes its whole epoch into an artifact -- but if overlap
                # correction is disabled, then this artifact only affects
                # this epoch, not anything else. (We still want to treat
                # it as an artifact though so we get proper accounting at
                # the end.)
                epoch.intrinsic_artifacts.append("_MISSING_PREDICTOR")
            spans.append(
                DataSpan((recspan_intern, epoch_start),
                         (recspan_intern, epoch_stop), epoch, None))
        if rerp_spec.name in rerp_names:
            raise ValueError("name %r used for two different sub-analyses" %
                             (rerp_spec.name, ))
        rerp_names.add(rerp_spec.name)
        rerp_info = {
            "spec": rerp_spec,
            "design_info": design.design_info,
            "start_offset": start_offset,
            "stop_offset": stop_offset,
            "design_offset": design_offset,
            "expanded_design_offset": expanded_design_offset,
            "total_epochs": len(event_set),
            "epochs_with_data": 0,
            "epochs_with_artifacts": 0,
        }
        rerp_infos.append(rerp_info)
        design_offset += design.shape[1]
        epoch_samples = stop_offset - start_offset
        expanded_design_offset += epoch_samples * design.shape[1]

    return rerp_infos, spans, design_offset, expanded_design_offset
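The placeholder-LHS trick can be sketched in isolation: an LHS that evaluates to the row indices makes it possible to recover which rows survived patsy's NA handling (a plain column is used here instead of the project's _ArangeFactor):

import numpy as np
import pandas as pd
from patsy import dmatrices

events = pd.DataFrame({"x": [1.0, np.nan, 3.0, 4.0]})
events["_row"] = np.arange(len(events))
fake_lhs, design = dmatrices("_row ~ x", events)
print(np.asarray(fake_lhs, dtype=int).ravel())  # [0 2 3] -- row 1 was dropped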