Example #1
0
    def _package_attrs(self, attrs):
        """
        Bundle the values in ``attrs`` into a statsmodels-style results wrapper.

        Each entry is assigned both to the underlying results object and to
        the wrapper around it, because some consumers read from the wrapper
        (stargazer) and others from the wrapped result (statsmodels'
        summary_col).

        Parameters
        ----------
        attrs : mapping
            Attribute names mapped to the values to expose (iterated by key
            and indexed by that key).

        Returns
        -------
        RegressionResultsWrapper
            Wrapper around the populated results object.
        """
        results = RRegressionResults()

        # patsy tells us the name of the dependent variable:
        parsed = ModelDesc.from_formula(self.formula)
        results.target = parsed.lhs_termlist[0].name()
        results.model = self

        # Wrapping instead of subclassing is required because stargazer
        # checks "type()" rather than "isinstance()":
        wrapper = RegressionResultsWrapper(results)

        for name in attrs:
            value = attrs[name]
            # Everything but "params" is a @cache_readonly attribute, which
            # must be deleted before it can be assigned again:
            if name != 'params' and hasattr(results, name):
                delattr(results, name)
            for holder in (results, wrapper):
                setattr(holder, name, value)
            self._debug("Set {} to {}".format(name, value))

        # Only now switch to the class that downstream tools expect:
        results.__class__ = RegressionResults
        return wrapper
Example #2
0
    def _prune(self, fit, p_max):
        """
        If the fit contains statistically insignificant parameters, remove them.
        Returns a pruned fit where all parameters have p-values of the t-statistic below p_max

        The intercept is never pruned, whatever its p-value.

        Parameters
        ----------
        fit: fm.ols fit object
            Can contain insignificant parameters
        p_max : float
            Maximum allowed probability of the t-statistic

        Returns
        -------
        fit: fm.ols fit object
            Won't contain any insignificant parameters

        """

        def insignificant_parameters(current_fit):
            """Return the non-intercept parameter names with p-value > p_max."""
            pvalues = current_fit.pvalues
            names = pvalues.where(pvalues > p_max).dropna().index.tolist()
            # Filter instead of list.remove(): 'Intercept' is only in the list
            # when the intercept itself is insignificant, and remove() raises
            # ValueError when it is absent.
            return [name for name in names if name != 'Intercept']

        # Work on copies of the term lists so the incoming fit is untouched.
        model_desc = ModelDesc(
            fit.model.formula.lhs_termlist[:], fit.model.formula.rhs_termlist[:])
        to_prune = insignificant_parameters(fit)

        while to_prune:
            # Drop one parameter at a time and refit: removing a term changes
            # the p-values of all the remaining ones.
            model_desc.rhs_termlist.remove(Term([LookupFactor(to_prune[0])]))
            fit = fm.ols(model_desc, data=self.data_frame).fit()
            to_prune = insignificant_parameters(fit)

        return fit
Example #3
0
def bootstrap(factor_names, model, run_count):
    """Create a minimal starting design that is non-singular.

    Parameters
    ----------
    factor_names : sequence
        Names of the factors; used as the columns of the returned frame.
    model : str
        patsy formula describing the model.
    run_count : int
        Number of runs to generate; 0 means "as many as the model has
        right-hand-side terms".

    Returns
    -------
    tuple
        ``(d, X)``: the DataFrame of starting points and the design matrix
        patsy builds from it for ``model``.

    Raises
    ------
    ValueError
        If the model has more right-hand-side terms than ``run_count``.
    """
    term_count = len(ModelDesc.from_formula(model).rhs_termlist)
    if run_count == 0:
        run_count = term_count
    if term_count > run_count:
        raise ValueError("Can't build a design of size {} "
                         "for a model of rank {}. "
                         "Model: '{}'".format(run_count, term_count, model))

    n_factors = len(factor_names)
    origin = np.zeros(n_factors)

    # Two constraint rows per factor: row 2k carries -1 and row 2k+1 carries
    # +1 in column k, each with a bound of 1 (presumably encoding
    # -1 <= x_k <= 1 -- depends on hit_and_run's convention).
    columns = np.arange(n_factors)
    constraints = np.zeros((2 * n_factors, n_factors))
    constraints[2 * columns, columns] = -1
    constraints[2 * columns + 1, columns] = 1
    limits = np.ones(2 * n_factors)

    points = hit_and_run(origin, constraints, limits, run_count)

    design = pd.DataFrame(points, columns=factor_names)
    return (design, dmatrix(model, design))
Example #4
0
def _build_targets(formula, data):
    """Evaluate the left-hand side of ``formula`` on ``data`` as a flat array.

    Only the LHS terms are kept (the right-hand side is left empty) and the
    resulting matrix is flattened into a one-dimensional numpy array.
    """
    lhs_only = ModelDesc(formula.lhs_termlist, [])
    targets, _unused_rhs = dmatrices(lhs_only, data)
    return np.array(np.ravel(targets))
Example #5
0
def add_predictors(base_formula, extra_predictors):
    """Parse ``base_formula`` and append one RHS term per extra predictor.

    Returns the resulting ModelDesc.
    """
    desc = ModelDesc.from_formula(base_formula)
    for predictor in extra_predictors:
        # LookupFactor looks the name up verbatim instead of evaluating it as
        # code, so names such as "weight.in.kg", "sys.exit()" or
        # "LittleBobbyTables()" are handled correctly.
        desc.rhs_termlist.append(Term([LookupFactor(predictor)]))
    return desc
Example #6
0
def _parse_formula(formula_str, include_intercept=False):
    """Parse a patsy formula, optionally discarding the RHS intercept.

    Parameters
    ----------
    formula_str : str
        Formula in patsy syntax.
    include_intercept : bool, default False
        When False, the intercept term is removed from the right-hand side.

    Returns
    -------
    tuple
        ``(form, task, num_classes)`` where ``form`` is the parsed ModelDesc.
        ``task`` and ``num_classes`` are currently fixed to ``'classify'``
        and ``2``.
    """

    _form = ModelDesc.from_formula(formula_str)

    # patsy (by default) includes an intercept, which is the one term that has
    # no factors. Test for that directly: comparing the factor *count* with
    # patsy's INTERCEPT term object is never equal, so nothing was dropped.
    if not include_intercept:
        _form.rhs_termlist = [t for t in _form.rhs_termlist if t.factors]

    # TODO: infer the task from the LHS terms (all C(...) -> 'classify',
    # otherwise 'regress') and reject formulas mixing both kinds.
    _num_classes = 2
    _task = 'classify'

    return _form, _task, _num_classes
Example #7
0
    def _do_analysis_no_cross_validation(self):
        """
        Forward selection on BIC: extends self._list_of_fits and sets self._fit

        Starting from an intercept-only model, each round tries to add every
        remaining candidate variable to the last accepted model and keeps the
        one with the lowest BIC, pruned of insignificant parameters.
        """
        lhs = [Term([LookupFactor(self.y)])]

        # One single-factor term per candidate variable still to be tried.
        remaining = {}
        for name in self.list_of_x:
            remaining[name] = Term([LookupFactor(name)])

        # The first model is just the mean: an empty term is the intercept.
        baseline = ModelDesc(lhs, [Term([])])
        self._list_of_fits.append(fm.ols(baseline, data=self.df).fit())

        while remaining:
            # The reference is the model accepted in the previous round; it is
            # also the initial "best" that a candidate has to beat.
            previous = self._list_of_fits[-1]
            winner, winner_bic = previous, previous.bic
            for name, term in remaining.items():
                candidate_desc = ModelDesc(
                    lhs, previous.model.formula.rhs_termlist + [term])
                candidate = fm.ols(candidate_desc, data=self.df).fit()
                if candidate.bic < winner_bic:
                    winner, winner_bic, chosen = candidate, candidate.bic, name

            # A better fit may still hold insignificant parameters: remove
            # them and estimate again.
            winner = self._prune(winner, p_max=self.p_max)

            # Stop as soon as the model has as many terms as the reference.
            if len(winner.model.formula.rhs_termlist) == len(
                    previous.model.formula.rhs_termlist):
                break
            self._list_of_fits.append(winner)
            remaining.pop(chosen)

        self._fit = self._list_of_fits[-1]
Example #8
0
def _drop_intercept(formula, add_intercept):
    """Drop the intercept from formula if not add_intercept"""
    if not add_intercept:
        if not isinstance(formula, ModelDesc):
            formula = ModelDesc.from_formula(formula)
        if INTERCEPT in formula.rhs_termlist:
            formula.rhs_termlist.remove(INTERCEPT)
        return formula
    return formula
Example #9
0
def _drop_intercept(formula, add_intercept):
    """Drop the intercept from formula if not add_intercept"""
    if not add_intercept:
        if not isinstance(formula, ModelDesc):
            formula = ModelDesc.from_formula(formula)
        if INTERCEPT in formula.rhs_termlist:
            formula.rhs_termlist.remove(INTERCEPT)
        return formula
    return formula
Example #10
0
def _variable_name(variable):
    '''Return variable['name'] for a {'name': ...} dict, else the value itself.'''
    # The isinstance check matters: "'name' in some_string" is a substring
    # test and would wrongly match plain column names such as "surname".
    if isinstance(variable, dict) and 'name' in variable:
        return variable['name']
    return variable


def create_patsy_model(dependent_variable, independent_variables, transformations=None, interactions=None):
    '''
    Construct and return patsy formula (object representation)

    Parameters
    ----------
    dependent_variable : str or dict
        Column name, or a dict holding it under the key 'name'.
    independent_variables : list
        Each item is a column name, a {'name': ...} dict, or a list of those
        (a list becomes one multi-factor term).
    transformations : dict, optional
        Maps a column name to a key of ``transformation_to_format_string``.
        For 'square' the untransformed column is added as well.
    interactions : list of lists, optional
        Each inner list (names or {'name': ...} dicts) becomes one
        multi-factor term.

    Returns
    -------
    ModelDesc
        The response on the left-hand side; an intercept followed by the
        predictor and interaction terms on the right-hand side.
    '''
    transformations = transformations or {}
    interactions = interactions or []

    # 1) Handling passing in [{'name': X}] vs [X], item by item so that mixed
    #    or empty inputs are handled too.
    lhs_var = _variable_name(dependent_variable)
    rhs_vars = [
        [_variable_name(x) for x in iv] if isinstance(iv, list) else _variable_name(iv)
        for iv in independent_variables
    ]
    rhs_interactions = [
        [_variable_name(term) for term in interaction]
        for interaction in interactions
    ]

    # 2) Constructing model
    lhs = [Term([LookupFactor(lhs_var)])]

    rhs = [Term([])]  # the empty term is the intercept
    for rhs_var in rhs_vars:
        if isinstance(rhs_var, list):
            rhs.append(Term([LookupFactor(term) for term in rhs_var]))
        elif rhs_var in transformations:
            transformation = transformations[rhs_var]
            if transformation == 'square':
                # keep the linear term next to the squared one
                rhs.append(Term([LookupFactor(rhs_var)]))
            format_string = transformation_to_format_string[transformation]
            rhs.append(Term([EvalFactor(format_string.format(rhs_var))]))
        else:
            rhs.append(Term([LookupFactor(rhs_var)]))

    rhs.extend(
        Term([LookupFactor(term) for term in interaction])
        for interaction in rhs_interactions
    )

    return ModelDesc(lhs, rhs)
Example #11
0
    def from_r_object(cls, rsum, ci=None, debug=False):
        """
        Reconstruct a model from an rpy2 summary object, and optionally its
        confidence intervals.
        These can be easily saved in R with
            save(objname, file=file_name)
        and loaded in Python via rpy2 with
            r['load'](file_name)['objname']

        Parameters
        ----------
        rsum : R object
            R summary of a fitted model.
            Typically produced with "summary(fitted)" (in R).
        ci : R object
            Confidence intervals of the fitted model
            Typically produced with "confint(fitted)" (in R).
        debug : bool, default False
            If True, print debug messages.

        Returns
        -------
        The object produced by ``_package_attrs()`` for the attributes
        extracted from ``rsum`` (and ``ci``).

        Raises
        ------
        NotImplementedError
            If the summary, converted to a dict, has no 'terms' entry.
        """

        d_res = cls._r_as_dict(None, rsum)

        if 'terms' not in d_res:
            msg = ("Interpreting r objects inside Python is only supported "
                   "for few estimators. More will work using "
                   "RModel.from_rdata() directly.")
            raise NotImplementedError(msg)

        # The formula is the first line of the textual form of 'terms'.
        formula = str(d_res['terms']).splitlines()[0]

        # We need to pass some pd.DataFrame to from_formula() below - but it
        # doesn't seem to be actually used.
        data = pd.DataFrame(-1, index=[0], columns=[0])

        # Creating the OLS object and only then hijacking it allows us to best
        # profit of statsmodels' machinery:
        mod = OLS.from_formula(formula, data)
        mod.__class__ = RModel
        # This is now an RModel:
        mod._initialize(debug=debug)

        attrs = mod._inspect_R(rsum, ci=ci)
        return mod._package_attrs(attrs)
Example #12
0
 def get_factor_names(self, level=1):
     """
     List the names of the model terms made of exactly ``level`` factors.

     The model specification is parsed with patsy, and a right-hand-side
     term is reported when its number of factors equals ``level``:
     1 : single-factor terms
     2 : terms combining two factors
     3 : terms combining three factors
     4 : etc
     """
     names = []
     for term in ModelDesc.from_formula(self._model_spec).rhs_termlist:
         if len(term.factors) == level:
             names.append(term.name())
     return names
Example #13
0
    def _prune(self, fit, p_max):
        """
        If the fit contains statistically insignificant parameters, remove them.
        Returns a pruned fit where all parameters have p-values of the t-statistic below p_max

        The intercept is never pruned, whatever its p-value.

        Parameters
        ----------
        fit: fm.ols fit object
            Can contain insignificant parameters
        p_max : float
            Maximum allowed probability of the t-statistic

        Returns
        -------
        fit: fm.ols fit object
            Won't contain any insignificant parameters

        """
        def remove_from_model_desc(x, model_desc):
            """
            Return a model_desc without x
            """

            rhs_termlist = []
            for t in model_desc.rhs_termlist:
                if not t.factors:
                    # intercept, add anyway
                    rhs_termlist.append(t)
                elif not x == t.factors[0]._varname:
                    # this is not the term with x
                    rhs_termlist.append(t)

            md = ModelDesc(model_desc.lhs_termlist, rhs_termlist)
            return md

        def insignificant_parameters(current_fit):
            """
            Return the non-intercept parameter names with p-value > p_max
            """
            pvalues = current_fit.pvalues
            names = pvalues.where(pvalues > p_max).dropna().index.tolist()
            # An explicit filter replaces the former bare "except: pass"
            # around list.remove(), which hid every kind of error.
            return [name for name in names if name != 'Intercept']

        # Work on copies of the term lists so the incoming fit is untouched.
        corrected_model_desc = ModelDesc(fit.model.formula.lhs_termlist[:],
                                         fit.model.formula.rhs_termlist[:])
        pars_to_prune = insignificant_parameters(fit)
        while pars_to_prune:
            # Drop one parameter at a time and refit: removing a term changes
            # the p-values of all the remaining ones.
            corrected_model_desc = remove_from_model_desc(
                pars_to_prune[0], corrected_model_desc)
            fit = fm.ols(corrected_model_desc, data=self.df).fit()
            pars_to_prune = insignificant_parameters(fit)
        return fit
Example #14
0
def dict_to_model_desc(dictionary):
    """Build a patsy ModelDesc from its dictionary representation.

    ``dictionary['lhs_termlist'][0]`` names the response. Each entry of
    ``dictionary['rhs_termlist']`` names one predictor, the empty string
    standing for the intercept.
    """
    response = Term([LookupFactor(dictionary['lhs_termlist'][0])])
    predictors = [
        Term([]) if name == '' else Term([LookupFactor(name)])
        for name in dictionary['rhs_termlist']
    ]
    return ModelDesc([response], predictors)
Example #15
0
    def _modeldesc_from_dict(self, d):
        """Build a patsy ModelDesc from the dictionary ``d``.

        ``d['lhs_termlist'][0]`` names the response. Each entry of
        ``d['rhs_termlist']`` names one predictor, the empty string standing
        for the intercept.
        """
        lhs = [Term([LookupFactor(d['lhs_termlist'][0])])]

        def as_term(name):
            # An empty name denotes the (factor-less) intercept term.
            return Term([]) if name == '' else Term([LookupFactor(name)])

        rhs = [as_term(name) for name in d['rhs_termlist']]
        return ModelDesc(lhs, rhs)
Example #16
0
def _parse_formula(formula):
    # head off patsy errors
    if ';' in formula or formula.strip()[0].isdigit():
        metric = formula.split('~')[0].strip()
    else:
        metric = None

    # use patsy to parse formula
    model_desc = ModelDesc.from_formula(formula)
    group_columns = set()
    for t in model_desc.rhs_termlist:
        for i in t.factors:
            group_columns.add(i.name())
    if metric is None:
        metric = model_desc.lhs_termlist[0].name()
    return metric, group_columns
Example #17
0
        def remove_from_model_desc(x, model_desc):
            """
            Build a new model_desc whose right-hand side lacks the term for x

            The intercept (the term without factors) is always kept; any
            other term is kept unless the variable name of its first factor
            equals x.
            """
            kept_terms = [
                term for term in model_desc.rhs_termlist
                if not term.factors or not x == term.factors[0]._varname
            ]
            return ModelDesc(model_desc.lhs_termlist, kept_terms)
Example #18
0
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    """Run the adonis R script on a distance matrix and render its results.

    The metadata is restricted to, and ordered like, the ids of the distance
    matrix. Every factor used on the right-hand side of ``formula`` must be a
    metadata column without missing values. The table written by
    ``run_adonis.R`` is rendered as HTML into ``output_dir``.

    Raises
    ------
    ValueError
        If a metadata column used in the formula has missing values.
    """
    # The metadata has to cover every sample of the distance matrix.
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))

    # Keep only the samples of the distance matrix, in the same order.
    ordered_md = metadata.to_dataframe().reindex(dm_ids)
    ordered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(ordered_md)

    # Validate formula: each factor must be a complete metadata column.
    for term in ModelDesc.from_formula(formula).rhs_termlist:
        for factor in term.factors:
            column = metadata.get_column(factor.name())
            if not column.has_missing_values():
                continue
            raise ValueError(
                'adonis requires metadata columns with no '
                'NaN values (missing values in column `%s`.)' %
                (column.name, ))

    # Run adonis: the inputs go to a scratch directory, the result table
    # straight into the output directory.
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as scratch_dir:
        dm_fp = os.path.join(scratch_dir, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(scratch_dir, 'md.tsv')
        metadata.save(md_fp)
        _run_command(['run_adonis.R', dm_fp, md_fp, formula,
                      str(permutations), str(n_jobs), results_fp])

    # Visualize results
    results_html = q2templates.df_to_html(pd.read_csv(results_fp, sep='\t'))
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results_html})
Example #19
0
def _parse_formula(formula):
    # head off patsy errors
    if '~' not in formula:
        raise ValueError('Formula not valid: missing tilde.\n'
                         'Enter a valid formula in format "y ~ model".')
    if ';' in formula or formula.strip()[0].isdigit():
        metric = formula.split('~')[0].strip()
    else:
        metric = None

    # use patsy to parse formula
    model_desc = ModelDesc.from_formula(formula)
    group_columns = set()
    for t in model_desc.rhs_termlist:
        for i in t.factors:
            group_columns.add(i.name())
    if metric is None:
        metric = model_desc.lhs_termlist[0].name()
    return metric, group_columns
Example #20
0
def build_model_desc(snps, no_interactions):
    """
    Creates the model description (formula)
    :param snps: The selected snp labels
    :param no_interactions: If true, only main effects are included;
        if false, a term for every pair of distinct snps is added as well
    :return: The model description (right-hand side only)
    """
    factors = [EvalFactor(label) for label in snps]

    x_terms = []
    for position, factor in enumerate(factors):
        # Main effect
        x_terms.append(Term([factor]))
        if no_interactions:
            continue
        # Interaction effects with every snp that comes later in the list
        x_terms.extend(
            Term([factor, partner]) for partner in factors[position + 1:])

    return ModelDesc([], x_terms)
Example #21
0
def parse_formula(f_str):
    """Split a formula into its tokenizer terms and its preprocessing terms.

    The left-hand side terms are the tokenizers: when none of their factors
    is a known tokenizer, the default one is put in front. The right-hand
    side terms that have at least one factor are the preprocessing steps.

    Raises RuntimeError when more than one known tokenizer is present.
    """
    patsy_formula = ModelDesc.from_formula(f_str)
    tokenize = patsy_formula.lhs_termlist

    found_tokenizers = [
        factor.code
        for term in tokenize
        for factor in term.factors
        if factor.code in _VALID_TOKENIZERS
    ]

    if not found_tokenizers:
        tokenize.insert(0, Term([EvalFactor(_DEFAULT_TOKENIZER)]))
    elif len(found_tokenizers) > 1:
        raise RuntimeError("Multiple tokenizers found in formula\n"
                           f"Specify one from {' '.join(_VALID_TOKENIZERS)}")

    # Terms without factors (the intercept) are not preprocessing steps.
    preprocess = [term for term in patsy_formula.rhs_termlist if term.factors]
    return tokenize, preprocess
Example #22
0
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    """Run the adonis R script on a distance matrix and render its results.

    The metadata is restricted to, and ordered like, the ids of the distance
    matrix. Every factor used on the right-hand side of ``formula`` is looked
    up as a metadata column. The table written by ``run_adonis.R`` is
    rendered as HTML into ``output_dir``.

    ``n_jobs`` is a count (its default is the integer 1) and is converted to
    text only when the command line is built, hence the ``int`` annotation.
    """
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula: the lookup is done only for the error it raises when
    # a factor is not a metadata column; the returned column is not used.
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis: the inputs go to a temporary directory, the result table
    # straight into the output directory.
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
Example #23
0
    def _do_analysis_cross_validation(self):
        """
        Find the best model (fit) based on cross-validation (leave one out)

        Starting from the intercept-only model, each round tries to add every
        remaining candidate variable to the last accepted model and keeps the
        one with the lowest leave-one-out error, provided it improves on the
        current error. Results are stored in self._list_of_fits,
        self.list_of_cverrors and self._fit.
        """
        # NOTE(review): the condition only admits data sets of fewer than 15
        # rows (one fit per row and per candidate is made below), so the
        # message now states that limit instead of the contradictory
        # "Minimum 15 datapoints" -- confirm the intended limit.
        assert len(self.data_frame) < 15, \
            "Cross-validation is only supported for fewer than 15 datapoints"

        def leave_one_out_error(desc):
            """Mean absolute prediction error of desc, leaving out one row at a time."""
            errors = []
            for i in self.data_frame.index:
                # fit without row i, then predict row i and store the error
                data_frame_ = self.data_frame.drop(i, axis=0)
                fit = fm.ols(desc, data=data_frame_).fit()
                cross_prediction = self._predict(
                    fit=fit, data_frame=self.data_frame.loc[[i], :])
                errors.append(
                    cross_prediction['predicted'] - cross_prediction[self.dependent_var])
            return np.mean(np.abs(np.array(errors)))

        # initialization: first model is the mean, but compute cv correctly.
        response_term = [Term([LookupFactor(self.dependent_var)])]
        model_desc = ModelDesc(response_term, [Term([])])
        initial_cverror = leave_one_out_error(model_desc)

        self._list_of_fits = [fm.ols(model_desc, data=self.data_frame).fit()]
        self.list_of_cverrors = [initial_cverror]

        # try to improve the model until no improvements can be found
        all_model_terms_dict = {x: Term([LookupFactor(x)])
                                for x in self.list_of_x}
        while all_model_terms_dict:
            # try each x in all_exog and overwrite if we find a better one
            # at the end of iteration (and not earlier), save the best of the iteration
            better_model_found = False
            best = {
                "fit": self._list_of_fits[-1],
                "cverror": self.list_of_cverrors[-1]
            }
            for value, term in all_model_terms_dict.items():
                model_desc = ModelDesc(
                    response_term, self._list_of_fits[-1].model.formula.rhs_termlist + [term])
                # cross_validation, currently only implemented for monthly data
                cverror = leave_one_out_error(model_desc)
                # compare the model with the current fit
                if cverror < best['cverror']:
                    # better model, keep it
                    # first, reidentify using all the datapoints
                    best['fit'] = fm.ols(
                        model_desc, data=self.data_frame).fit()
                    best['cverror'] = cverror
                    better_model_found = True
                    best_val = value

            if not better_model_found:
                # if we did not find a better model, exit
                break

            self._list_of_fits.append(best['fit'])
            self.list_of_cverrors.append(best['cverror'])

            # next iteration with the found exog removed
            all_model_terms_dict.pop(best_val)

        self._fit = self._list_of_fits[-1]
Example #24
0
# Run the data preparation script in this namespace. execfile() exists only
# in Python 2; exec(compile(...)) does the same and also runs on Python 3.
with open("code/03-DataPrep.py") as _prep_script:
    exec(compile(_prep_script.read(), "code/03-DataPrep.py", "exec"))

from patsy import dmatrices, ModelDesc, Term, LookupFactor
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.feature_selection import SelectFromModel
import numpy as np

# model 1 - logistic regression with L1 regularization
# 'rating' is the response; every name in orgfeatures becomes a predictor.
formula = ModelDesc([Term([LookupFactor('rating')])], [Term([LookupFactor(c)]) for c in orgfeatures])

y, x = dmatrices(formula, rawdf, return_type="dataframe")
y = y.values.flatten()

# The solver is named explicitly, as for the CV model below: liblinear
# supports the L1 penalty, whereas the default solver of recent scikit-learn
# releases rejects it.
logreg = linear_model.LogisticRegression(C=0.1, penalty='l1', tol=0.01,
                                         solver='liblinear')

logreg.fit(x, y)
scores = cross_val_score(logreg, x, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Features that keep a non-zero coefficient under the L1 penalty.
coeffdf = pd.DataFrame({'feature': x.columns, 'coeff': np.transpose(logreg.coef_).flatten()})
nflist = coeffdf[coeffdf.coeff != 0].feature.values.tolist()
print(len(nflist))

# feature selection using best model from cross validation and get the best features
fslogreg = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear')
fslogreg.fit(x, y)
Example #25
0
def _epoch_spans(recspan_intern_table, data_set, rerp_specs, eval_env):
    """Build one epoch span per matching event for every rERP specification.

    For each spec the events selected by its query are run through its
    formula; each event yields a DataSpan carrying an _Epoch with the event's
    design row (None when the event did not survive design-matrix
    construction).

    Returns
    -------
    tuple
        ``(rerp_infos, spans, design_offset, expanded_design_offset)``: one
        info dict per spec, the list of DataSpan objects, and the running
        totals of design columns and of design columns times epoch samples.

    Raises
    ------
    ValueError
        If a spec's stop time does not lie after its start time, if its
        formula has a left-hand side, or if two specs share a name.
    """
    rerp_infos = []
    rerp_names = set()
    spans = []
    design_offset = 0
    expanded_design_offset = 0
    data_format = data_set.data_format
    for rerp_idx, rerp_spec in enumerate(rerp_specs):
        start_offset = data_format.ms_to_samples(rerp_spec.start_time)
        # Offsets are half open: [start, stop)
        # But, it's more intuitive for times to be closed: [start, stop]
        # So we interpret the user times as a closed interval, and add 1
        # sample when converting to offsets.
        stop_offset = 1 + data_format.ms_to_samples(rerp_spec.stop_time)
        if start_offset >= stop_offset:
            raise ValueError("Epochs must be >1 sample long!")
        event_set = data_set.events.find(rerp_spec.event_query)
        # Tricky bit: the user specifies a RHS-only formula, but really we
        # have an implicit LHS (determined by the event_query). This makes
        # things complicated when it comes to e.g. keeping track of which
        # items survived NA removal, determining the number of rows in an
        # intercept-only formula, etc. Really we want patsy to just treat all
        # this stuff the same way as it normally handles a LHS~RHS
        # formula. So, we convert our RHS formula into a LHS~RHS formula,
        # using a special LHS that represents each event by a placeholder
        # integer!
        desc = ModelDesc.from_formula(rerp_spec.formula, eval_env)
        if desc.lhs_termlist:
            raise ValueError("Formula cannot have a left-hand side")
        desc.lhs_termlist = [Term([_ArangeFactor(len(event_set))])]
        fake_lhs, design = dmatrices(desc, event_set)
        surviving_event_idxes = np.asarray(fake_lhs, dtype=int).ravel()
        # Integer dtype: the entries are used to index 'design' below, and
        # NumPy does not accept floating point indices.
        design_row_idxes = np.empty(len(event_set), dtype=int)
        design_row_idxes.fill(-1)
        design_row_idxes[surviving_event_idxes] = np.arange(design.shape[0])
        # Now design_row_idxes[i] is -1 if event i was thrown out, and
        # otherwise gives the row in 'design' which refers to event 'i'.
        for i in range(len(event_set)):
            event = event_set[i]
            # -1 for non-existent
            design_row_idx = design_row_idxes[i]
            recspan = (event.recording, event.span_id)
            recspan_intern = recspan_intern_table[recspan]
            epoch_start = start_offset + event.start_idx
            epoch_stop = stop_offset + event.start_idx
            if design_row_idx == -1:
                design_row = None
            else:
                design_row = design[design_row_idx, :]
            epoch = _Epoch(epoch_start, epoch_stop - epoch_start, design_row,
                           design_offset, expanded_design_offset, rerp_idx, [])
            if design_row is None:
                # Event thrown out due to missing predictors; this
                # makes its whole epoch into an artifact -- but if overlap
                # correction is disabled, then this artifact only affects
                # this epoch, not anything else. (We still want to treat
                # it as an artifact though so we get proper accounting at
                # the end.)
                epoch.intrinsic_artifacts.append("_MISSING_PREDICTOR")
            spans.append(
                DataSpan((recspan_intern, epoch_start),
                         (recspan_intern, epoch_stop), epoch, None))
        if rerp_spec.name in rerp_names:
            raise ValueError("name %r used for two different sub-analyses" %
                             (rerp_spec.name, ))
        rerp_names.add(rerp_spec.name)
        rerp_info = {
            "spec": rerp_spec,
            "design_info": design.design_info,
            "start_offset": start_offset,
            "stop_offset": stop_offset,
            "design_offset": design_offset,
            "expanded_design_offset": expanded_design_offset,
            "total_epochs": len(event_set),
            "epochs_with_data": 0,
            "epochs_with_artifacts": 0,
        }
        rerp_infos.append(rerp_info)
        # The next spec's columns start after this one's, both in the plain
        # design and in the one expanded by the number of epoch samples.
        design_offset += design.shape[1]
        epoch_samples = stop_offset - start_offset
        expanded_design_offset += epoch_samples * design.shape[1]

    return rerp_infos, spans, design_offset, expanded_design_offset
Example #26
0
 def __str__(self):
     """Return patsy's textual description of the model specification."""
     return ModelDesc.from_formula(self._model_spec).describe()
Example #27
0
    chist = chist[Yhist_gpvars + ['HISTBIN', wgtvar]].groupby(
        Yhist_gpvars + ['HISTBIN'], as_index=False).aggregate(np.sum)
    return chist


if __name__ == "__main__":
    ## Build the regression formula
    # Categorical variables are the keys of cfg.flevels.
    catvars = list(cfg.flevels.keys())
    # File paths are read from a JSON file in the working directory.
    with open("fpaths.json") as fpj:
        FPATHS = json.load(fpj)
    # Numeric predictors, written as patsy code.
    numvar_evals = ["I(YEAR - 2000)", "INCTOT99"]
    # One treatment-coded C(...) expression per categorical variable, with
    # the levels taken from cfg.flevels.
    catvar_evals = [
        "C(" + cv + ", Treatment, levels=cfg.flevels['" + cv + "'])"
        for cv in catvars
    ]
    # Right-hand-side-only description: numeric terms first, then the
    # categorical ones.
    desc = ModelDesc([], [Term([EvalFactor(v)]) for v in numvar_evals])
    desc.rhs_termlist += [Term([EvalFactor(v)]) for v in catvar_evals]
    # Interactions
    interact_order = 2
    catvar_interact = ['SEX', 'AGECAT', 'RACE']
    print("Including all order-" + str(interact_order) +
          " interactions of the following variables:\n\t" +
          ", ".join(catvar_interact + numvar_evals))
    # Expressions taking part in the interactions: all numeric ones plus the
    # C(...) expressions of the selected categorical variables.
    interact_evals = numvar_evals + [
        catvar_evals[i] for i in [catvars.index(v) for v in catvar_interact]
    ]
    # One multi-factor term per combination of interact_order expressions.
    desc.rhs_termlist += [
        Term([EvalFactor(v) for v in list(comb)])
        for comb in combinations(interact_evals, interact_order)
    ]
    # 'implied decimals'
Example #28
0
def scatterfit(x,
               y,
               method='pearson',
               adjustVars=None,
               labelLookup=None,
               plotLine=True,
               annotateFit=True,
               annotatePoints=False,
               returnModel=False,
               lc='gray',
               **kwargs):
    """Scatter plot of x vs. y with a fitted line overlaid.

    Expects x and y as pd.Series but will accept arrays. The inputs are never
    modified: when both Series carry the same name, renamed copies are used
    internally.

    The covariate-unadjusted data are plotted on the current axes. When
    annotateFit is set, the unadjusted rho/p-value is written with textTL and,
    if adjustVars is non-empty, the adjusted rho/p-value with textTR.

    Parameters
    ----------
    x,y : ndarrays or pd.Series
        Unnamed inputs are labelled 'X' and 'Y'.
    method : string
        'pearson' draws a Gaussian GLM fit line, 'spearman' draws a line
        through the sorted values. The value is also passed to partialcorr.
    adjustVars : list or None
        Covariates; each is joined to the data (rows with any NaN are dropped)
        and passed to partialcorr for the adjusted statistics.
    labelLookup : dict or None
        Optional mapping from series name to axis label.
    plotLine : bool
        Draw the fitted line.
    annotateFit : bool
        Write p, rho and n on the axes.
    annotatePoints : bool
        Label every point with its index value.
    returnModel : bool
        Return the model object (see Returns).
    lc : matplotlib color
        Color of the fitted line.
    kwargs : additional keyword arguments
        Passed to the plot function for the data points. 'mec', 'mfc' and 'ms'
        default to 'k', 'k' and 5.

    Returns
    -------
    model : statsmodels GLM object or None
        Only returned when returnModel is True. It is the sm.GLM instance that
        was fitted for the line, or None when no GLM was fitted (i.e. unless
        method == 'pearson' and plotLine is True)."""

    adjustVars = [] if adjustVars is None else adjustVars
    labelLookup = {} if labelLookup is None else labelLookup

    # Default marker styling unless the caller supplied their own.
    kwargs.setdefault('mec', 'k')
    kwargs.setdefault('mfc', 'k')
    kwargs.setdefault('ms', 5)

    # Try to force X and Y into pandas.Series objects.
    if not isinstance(x, pd.Series):
        x = pd.Series(x, name='X')
    if not isinstance(y, pd.Series):
        y = pd.Series(y, name='Y')

    # Column labels must exist and differ so both series can share a frame.
    xlab = 'X' if x.name is None else x.name
    ylab = 'Y' if y.name is None else y.name
    if xlab == ylab:
        ylab = 'y_' + str(ylab)
        xlab = 'x_' + str(xlab)
    # rename() returns a new Series, so the caller's objects keep their names
    # (and x and y may even be the very same object).
    if x.name != xlab:
        x = x.rename(xlab)
    if y.name != ylab:
        y = y.rename(ylab)

    tmpDf = pd.concat((
        x,
        y,
    ), axis=1, join='inner')
    for av in adjustVars:
        tmpDf = pd.concat((tmpDf, pd.DataFrame(av)), axis=1)
    # Drop any row with a NaN in any column.
    tmpDf = tmpDf.dropna(axis=0, how='any')

    ax = plt.gca()
    ax.set_xmargin(0.2)
    ax.set_ymargin(0.2)

    unrho, unp = partialcorr(tmpDf[xlab], tmpDf[ylab], method=method)

    # Stays None unless the pearson branch below fits a GLM, so that
    # returnModel=True never hits an unbound local.
    model = None

    # Print unadjusted AND adjusted rho/pvalues.
    # Plot unadjusted data with fit though...
    if method == 'spearman' and plotLine:
        # Monotone line through the sorted values, descending for rho <= 0.
        if unrho > 0:
            plt.plot(sorted(tmpDf[xlab]), sorted(tmpDf[ylab]), '-', color=lc)
        else:
            plt.plot(sorted(tmpDf[xlab]),
                     sorted(tmpDf[ylab], reverse=True),
                     '-',
                     color=lc)
    elif method == 'pearson' and plotLine:
        # y ~ 1 + x, built from terms so arbitrary column names work.
        formula_like = ModelDesc(
            [Term([LookupFactor(ylab)])],
            [Term([]), Term([LookupFactor(xlab)])])

        Y, X = dmatrices(formula_like, data=tmpDf, return_type='dataframe')
        model = sm.GLM(Y, X, family=sm.families.Gaussian())
        results = model.fit()
        # The line is drawn between the fitted values at min(x) and max(x);
        # idxmin/idxmax return index labels, hence label-based .loc.
        mnmxi = [tmpDf[xlab].idxmin(), tmpDf[xlab].idxmax()]
        plt.plot(tmpDf[xlab].loc[mnmxi],
                 results.fittedvalues.loc[mnmxi],
                 '-',
                 color=lc)

    plt.plot(tmpDf[xlab], tmpDf[ylab], 'o', **kwargs)

    if annotatePoints:
        annotationParams = dict(xytext=(0, 5),
                                textcoords='offset points',
                                size='medium')
        for xi, yi, lab in zip(tmpDf[xlab], tmpDf[ylab], tmpDf.index):
            plt.annotate(lab, xy=(xi, yi), **annotationParams)

    if annotateFit:
        if unp > 0.001:
            s = 'p = %1.3f\nrho = %1.2f\nn = %d' % (unp, unrho, tmpDf.shape[0])
        else:
            s = 'p = %1.1e\nrho = %1.2f\nn = %d' % (unp, unrho, tmpDf.shape[0])
        textTL(ax, s, color='black')

        if len(adjustVars) > 0:
            rho, p = partialcorr(tmpDf[xlab],
                                 tmpDf[ylab],
                                 adjust=adjustVars,
                                 method=method)
            if p > 0.001:
                s = 'adj-p = %1.3f\nadj-rho = %1.2f\nn = %d' % (p, rho,
                                                                tmpDf.shape[0])
            else:
                s = 'adj-p = %1.1e\nadj-rho = %1.2f\nn = %d' % (p, rho,
                                                                tmpDf.shape[0])

            textTR(ax, s, color='red')

    plt.xlabel(labelLookup.get(xlab, xlab))
    plt.ylabel(labelLookup.get(ylab, ylab))
    if returnModel:
        return model
Example #29
0
 def get_response_name(self):
     """Return the name of the first left-hand-side term of the formula."""
     lhs_terms = ModelDesc.from_formula(self._model_spec).lhs_termlist
     return lhs_terms[0].name()
Example #30
0
def _group_model(spreadsheet=None,
                 contrastdicts=None,
                 variabledicts=None,
                 subjects=None):
    """Build group-level regressors and contrasts from a covariate spreadsheet.

    Parameters
    ----------
    spreadsheet
        Passed to ``loadspreadsheet`` to obtain the raw data frame.
    contrastdicts : iterable of dict
        Contrast specifications. ``"type": "infer"`` entries add a model term
        made of the names in ``"variable"``. ``"type": "t"`` entries define a
        t contrast on the single name in ``"variable"``, with per-level
        weights in ``"values"`` and a label in ``"name"``.
    variabledicts : iterable of dict
        Column specifications with ``"name"`` and ``"type"`` (``"id"``,
        ``"continuous"`` or ``"categorical"``). An ``"id"`` entry is required.
    subjects : list
        Subject ids; selects the rows to keep and fixes their order.

    Returns
    -------
    regressors : dict
        Design-matrix column name -> list of values, one per subject.
    contrasts : list
        ``(name, "T", column_names, weights)`` entries and, for multi-row
        contrasts, ``(name, "F", [t contrasts])`` entries.
    contrast_names : list of str
        Names of the contrasts of interest (the component t contrasts of an
        F contrast are not listed).

    When the design has at least as many columns as rows, an intercept-only
    design with a single "mean" contrast is returned instead.

    Raises
    ------
    AssertionError
        If ``variabledicts`` contains no entry of type ``"id"``.
    """
    rawdataframe = loadspreadsheet(spreadsheet)

    id_column = None
    for variabledict in variabledicts:
        if variabledict["type"] == "id":
            id_column = variabledict["name"]
            break

    assert id_column is not None, "Missing id column, cannot specify model"

    rawdataframe[id_column] = pd.Series(rawdataframe[id_column], dtype=str)
    if all(str(subject_id).startswith("sub-")
           for subject_id in rawdataframe[id_column]):  # for bids
        rawdataframe[id_column] = [
            str(subject_id).replace("sub-", "")
            for subject_id in rawdataframe[id_column]
        ]
    rawdataframe = rawdataframe.set_index(id_column)

    continuous_columns = []
    categorical_columns = []
    columns_in_order = []
    for variabledict in variabledicts:
        if variabledict["type"] == "continuous":
            continuous_columns.append(variabledict["name"])
            columns_in_order.append(variabledict["name"])
        elif variabledict["type"] == "categorical":
            categorical_columns.append(variabledict["name"])
            columns_in_order.append(variabledict["name"])

    # separate
    continuous = rawdataframe[continuous_columns]
    categorical = rawdataframe[categorical_columns]

    # only keep subjects that are in this analysis
    # also sets order
    continuous = continuous.loc[subjects, :]
    categorical = categorical.loc[subjects, :]

    # Demean continuous for flameo
    continuous -= continuous.mean()

    # replace np.nan by 0 for demeaned_continuous file and regression models
    continuous = continuous.replace({np.nan: 0})

    # change type first to string then to category
    categorical = categorical.astype(str)
    categorical = categorical.astype("category")

    # merge
    dataframe = categorical.join(continuous, how="outer").loc[subjects, :]

    # maintain order
    dataframe = dataframe[columns_in_order]

    # remove zero variance columns
    columns_var_gt_0 = dataframe.apply(pd.Series.nunique) > 1
    dataframe = dataframe.loc[:, columns_var_gt_0]

    # don't need to specify lhs
    lhs = []

    # generate rhs
    rhs = [Term([])]  # force intercept
    for contrastdict in contrastdicts:
        if contrastdict["type"] == "infer":
            # for every term in the model a contrast of type infer needs to be specified
            rhs.append(
                Term([LookupFactor(name)
                      for name in contrastdict["variable"]]))

    # specify patsy design matrix
    modelDesc = ModelDesc(lhs, rhs)
    dmat = dmatrix(modelDesc, dataframe, return_type="dataframe")
    _check_multicollinearity(dmat)

    # prepare lsmeans
    # Reference grid: every combination of categorical levels, with numeric
    # columns pinned to 0.0 (their mean after demeaning).
    uniqueValuesForCategorical = [(0.0, ) if pd.api.types.is_numeric_dtype(
        dataframe[f].dtype) else dataframe[f].unique()
                                  for f in dataframe.columns]
    grid = pd.DataFrame(list(product(*uniqueValuesForCategorical)),
                        columns=dataframe.columns)
    refDmat = dmatrix(dmat.design_info, grid, return_type="dataframe")

    # data frame to store contrasts
    contrastMats = []

    # One contrast matrix per model term, constraining all of the term's
    # columns to zero (one row per column of the term).
    for field, columnslice in dmat.design_info.term_name_slices.items():
        constraint = {
            column: 0
            for column in dmat.design_info.column_names[columnslice]
        }
        contrast = dmat.design_info.linear_constraint(constraint)
        assert np.all(contrast.variable_names == dmat.columns)
        contrastMat = pd.DataFrame(contrast.coefs, columns=dmat.columns)
        contrastMats.append((field, contrastMat))

    for contrastdict in contrastdicts:
        if contrastdict["type"] == "t":
            (variable, ) = contrastdict["variable"]
            variableLevels = dataframe[variable].unique()
            # Generate the lsmeans matrix where there is one row for each
            # factor level. Each row is a contrast vector.
            # This contrast vector corresponds to the mean of the dependent
            # variable at the factor level.
            # For example, we would have one row that calculates the mean
            # for patients, and one for controls.
            lsmeans = pd.DataFrame(index=variableLevels, columns=dmat.columns)
            for level in variableLevels:
                lsmeans.loc[level, :] = refDmat.loc[grid[variable] ==
                                                    level, :].mean()
            valueDict = contrastdict["values"]
            names = [
                name for name in valueDict.keys() if name in variableLevels
            ]
            values = [valueDict[name] for name in names]
            # If we wish to test the mean of each group against zero,
            # we can simply use these contrasts and be done.
            # To test a linear hypothesis such as patient-control=0,
            # which is expressed here as {"patient":1, "control":-1},
            # we translate it to a contrast vector by taking the linear
            # combination of the lsmeans contrasts.
            contrastVector = lsmeans.loc[names, :].mul(values, axis=0).sum()
            contrastMat = pd.DataFrame([contrastVector], columns=dmat.columns)
            contrastMats.append((contrastdict["name"], contrastMat))

    npts, nevs = dmat.shape

    if nevs >= npts:
        logger.warning("Reverting to simple intercept only design. \n"
                       f"nevs ({nevs}) >= npts ({npts})")
        return (
            {
                "intercept": [1.0] * len(subjects)
            },
            [["mean", "T", ["intercept"], [1]]],
            ["mean"],
        )

    regressors = {d: dmat[d].tolist() for d in dmat.columns}
    contrasts = []
    contrast_names = []

    for contrastName, contrastMat in contrastMats:  # t contrasts
        if contrastMat.shape[0] == 1:
            # Take the single row explicitly. ``squeeze()`` would collapse a
            # 1x1 matrix (e.g. an intercept-only design) to a bare scalar,
            # which has no ``keys()``.
            contrastVec = contrastMat.iloc[0]
            contrasts.append((contrastName, "T", list(contrastVec.keys()),
                              list(contrastVec)))

            contrast_names.append(contrastName)

    for contrastName, contrastMat in contrastMats:  # f contrasts
        if contrastMat.shape[0] > 1:

            tcontrasts = []  # an f contrast consists of multiple t contrasts
            for i, contrastVec in contrastMat.iterrows():
                tname = f"{contrastName}_{i:d}"
                tcontrasts.append(
                    (tname, "T", list(contrastVec.keys()), list(contrastVec)))

            contrasts.extend(tcontrasts)  # add t contrasts to the model
            contrasts.append(
                (contrastName, "F", tcontrasts))  # then add the f contrast

            contrast_names.append(
                contrastName)  # we only care about the f contrast

    return regressors, contrasts, contrast_names
Example #31
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from patsy import ModelDesc
from patsy import demo_data
from patsy import dmatrix, dmatrices
# Parse a series of formulas and print how patsy describes each of them.
for formula in (
        "y ~ x",
        "y ~ x + x + x",
        "y ~ -1 + x",
        "~ -1",
        "y ~ a:b",
        "y ~ a*b",
        "y ~ (a + b + c + d) ** 2",
        "y ~ (a + b)/(c + d)",
        "np.log(x1 + x2) "
        "+ (x + {6: x3, 8 + 1: x4}[3 * i])",
):
    print(ModelDesc.from_formula(formula).describe())

# A processed formula can be easier to read once it is put back into formula
# notation with ModelDesc.describe():
desc = ModelDesc.from_formula("y ~ (a + b + c + d) ** 2")
print(desc.describe())

# Show the term names patsy records for a formula with mixed interactions.
data = demo_data("a", "b", "x1", "x2")
mat = dmatrix("x1:x2 + a:b + b + x1:a:b + a + x2:a:x1", data)
print(mat.design_info.term_names)

data = demo_data("a", "b", "y")

# dmatrices returns an (lhs, rhs) pair; keep only the right-hand-side matrix.
mat1 = dmatrices("y ~ 0 + a:b", data)[1]
# Now to try this in `patsy`.
#
# Steps:
# 1. See how the model description is derived from the formula
# 2. Build the design matrix that the formula specifies
# 3. Use the design matrix in order to create the model in `scikit-learn`
#
# The "# In[NN]:" markers below are cell boundaries left over from a notebook
# export; the cells rely on being run in this order.

# In[87]:

from patsy import ModelDesc, EvalEnvironment


# In[88]:

# Step 1: capture the current namespace and parse the formula into a
# ModelDesc (terms only; no data is involved yet).
# NOTE(review): recent patsy releases document ModelDesc.from_formula as
# taking only the formula -- confirm the installed version accepts the
# extra environment argument.
env = EvalEnvironment.capture()
predicted_lat_age_mtx = ModelDesc.from_formula('Predicted ~ Age_Calc * Case', env)


# In[89]:

# Bare expression: a notebook displays the value; as a script it does nothing.
predicted_lat_age_mtx


# In[90]:

from patsy import dmatrix


# In[91]:

# Step 2: build the right-hand-side design matrix (no response in the formula).
# NOTE(review): subj_case_data is not defined anywhere in the visible code --
# presumably a DataFrame created in an earlier cell; verify.
design_mtx = dmatrix('Case * Age_Calc', subj_case_data)
Example #33
0
def lm(
    model_spec: str,
    data: pd.DataFrame,
    name: Optional[str] = None,
    alias_threshold: Optional[float] = 0.995,
) -> Model:
    """
    Create a linear model.

    An OLS model is first built from `model_spec` and `data` to detect aliased
    (highly correlated) design columns. The model is then rebuilt with the
    redundant columns dropped, fitted, and wrapped in a `Model`.

    Parameters
    ----------
    model_spec : str
        Formula passed to `smf.ols` and `ModelDesc.from_formula`.
    data : pd.DataFrame
        Data for the model. Its `pi_title` attribute is used as the model name
        when `name` is not given.
    name : str, optional
        Name stored on the returned `Model`.
    alias_threshold : float
        Absolute correlation above which two columns are treated as aliases.

    Returns
    -------
    Model
        Wraps the fitted OLS results, the formula and the aliasing mapping;
        `data` is attached as the `data` attribute.
    """
    def find_aliases(model, model_desc, threshold_correlation=0.995):
        """
        Finds columns which are exactly correlated, or up to at least a level
        of `threshold_correlation`.
        Returns a dictionary of aliasing and a list of columns to keep.

        The columns to keep will be in the order checked. Perhaps this can be
        improved.
        For example if AB = CD, then return AB to keep.
        For example if A = BCD, then return A, and not the BCD column to keep.

        NOTE(review): column positions of `model.exog` are used to index
        `model_desc.rhs_termlist`, which presumes one design column per term;
        confirm this holds for the formulas in use (e.g. categorical terms).
        """
        has_variation = model.exog.std(axis=0) > np.sqrt(np.finfo(float).eps)

        # Covariance between design columns. ``np.cov`` squeezes its result,
        # so a single-column design yields a 0-d array; force it back to 2-D
        # so that the indexing below works for any number of columns.
        c = np.atleast_2d(np.cov(model.exog.T, None, rowvar=True))
        dot_product = model.exog.T @ model.exog
        stddev = np.sqrt(np.diag(c).real)
        # Columns whose standard deviation is exactly zero keep their raw
        # covariance entry instead of being divided by zero.
        nonzero_std = stddev != 0

        aliasing = defaultdict(list)
        terms = model_desc.rhs_termlist
        drop_columns = []
        corrcoef = c.copy()
        for idx, check in enumerate(has_variation):
            if check:
                # Correlation of column `idx` with every other column.
                # Adapted from the NumPy "corrcoef" function.
                corrcoef[idx, nonzero_std] = (
                    c[idx, nonzero_std] / stddev[idx] / stddev[nonzero_std])

                candidates = [
                    i for i, val in enumerate(np.abs(corrcoef[idx, :]))
                    if (val > threshold_correlation)
                ]
            else:
                # Columns with no variation: `j` is a boolean flag, so for a
                # threshold in [0, 1) this selects exactly the columns that
                # also have no variation.
                candidates = [
                    i for i, j in enumerate(has_variation)
                    if (j <= threshold_correlation)
                ]

            # Track the correlation signs
            signs = [np.sign(j) for j in dot_product[idx, :]]

            # Now drop out the candidates with the longest word lengths
            alias_len = [(len(terms[i].factors), i) for i in candidates]
            alias_len.sort(reverse=True)
            for entry in alias_len[0:-1]:
                drop_columns.append(entry[1])

            for col in candidates:
                if col == idx:
                    # It is of course perfectly correlated with itself
                    continue

                aliases = [t.name() for t in terms[col].factors]
                if len(aliases) == 0:
                    aliases = ["Intercept"]

                key = tuple(t.name() for t in terms[idx].factors)
                if len(key) == 0:
                    key = ("Intercept", )

                if signs[col] > 0:
                    aliases.insert(0, "+")
                if signs[col] < 0:
                    aliases.insert(0, "-")
                aliasing[key].append(aliases)

        # Sort the aliases in length:
        for key, val in aliasing.items():
            alias_len = [(len(i), i) if i[1] != "Intercept" else (1e5, i)
                         for i in val]
            alias_len.sort()
            aliasing[key] = [i[1] for i in alias_len]

        return aliasing, list(set(drop_columns))

    pre_model = smf.ols(model_spec, data=data)
    model_description = ModelDesc.from_formula(model_spec)
    aliasing, drop_columns = find_aliases(
        pre_model, model_description, threshold_correlation=alias_threshold)
    drop_column_names = [pre_model.data.xnames[i] for i in drop_columns]

    post_model = smf.ols(model_spec, data=data, drop_cols=drop_column_names)

    name = name or data.pi_title
    out = Model(
        OLS_instance=post_model.fit(),
        model_spec=model_spec,
        aliasing=aliasing,
        name=name,
    )
    out.data = data

    return out