예제 #1
0
파일: models.py 프로젝트: mdekauwe/bambi
    def set_priors(self, priors=None, fixed=None, random=None):
        '''
        Replace the priors of terms that are already part of the model.

        Args:
            priors (dict): Term-specific priors. Each key is a term name, or
                a tuple of term names that should all receive the same
                prior; each value is the new prior (a Prior instance, or an
                int or float that scales the default priors).
            fixed (Prior, int, float, str): prior specification assigned to
                every term in self.fixed_terms.
            random (Prior, int, float, str): prior specification assigned to
                every term in self.random_terms.

        Raises:
            ValueError: if a name given in `priors` is not in self.terms.
        '''

        # Gather all assignments up front; entries from `priors` are added
        # last, so they override the blanket `fixed`/`random` values.
        assignments = {}

        if fixed is not None:
            assignments.update(dict.fromkeys(self.fixed_terms.keys(), fixed))

        if random is not None:
            assignments.update(dict.fromkeys(self.random_terms.keys(), random))

        if priors is not None:
            for key, new_prior in priors.items():
                for term_name in listify(key):
                    if term_name in self.terms:
                        assignments[term_name] = new_prior
                        continue
                    raise ValueError("The model contains no term with "
                                     "the name '%s'." % term_name)

        # Nothing is written to the terms until every name has been checked.
        for term_name, new_prior in assignments.items():
            self.terms[term_name].prior = new_prior
예제 #2
0
파일: models.py 프로젝트: camenpihor/bambi
    def _set_priors(self, priors=None, fixed=None, random=None, match_derived_names=True):
        """Internal counterpart of set_priors(); accepts the same arguments.

        Runs during Model.build().

        Raises:
            ValueError: if a name in `priors` is not a key of self.terms and
                either match_derived_names is False or
                self._match_derived_terms() returns None for it.
        """

        # name -> prior assignments, collected before any term is touched
        pending = {}

        if fixed is not None:
            pending.update(dict.fromkeys(self.fixed_terms.keys(), fixed))

        if random is not None:
            pending.update(dict.fromkeys(self.random_terms.keys(), random))

        if priors is not None:
            for key, spec in priors.items():
                for name in listify(key):
                    not_found = "No terms in model match '%s'." % name
                    if name in self.terms:
                        pending[name] = spec
                        continue
                    # No exact match: try the terms derived from this name.
                    derived = self._match_derived_terms(name)
                    if match_derived_names and derived is not None:
                        for term in derived:
                            pending[term.name] = spec
                    else:
                        raise ValueError(not_found)

        for name, spec in pending.items():
            self.terms[name].prior = spec
예제 #3
0
    def set_priors(self,
                   priors=None,
                   fixed=None,
                   random=None,
                   match_derived_names=True):
        '''
        Replace the priors of terms that are already part of the model.

        Args:
            priors (dict): Term-specific priors. Each key is a term name, or
                a tuple of term names that should all receive the same
                prior; each value is the new prior (a Prior instance, or an
                int or float that scales the default priors).
            fixed (Prior, int, float, str): prior specification assigned to
                every term in self.fixed_terms.
            random (Prior, int, float, str): prior specification assigned to
                every term in self.random_terms.
            match_derived_names (bool): if True, a name without an exact
                match is also applied to the levels of random effects that
                were derived from the original specification with that
                name. For example, `priors={'condition|subject':0.5}` would
                apply the prior to the terms with names '1|subject',
                'condition[T.1]|subject', and so on. If False, an exact
                match is required.

        Raises:
            ValueError: if a name in `priors` has no exact match and derived
                matching is disabled or finds nothing.
        '''

        # name -> prior assignments, collected before any term is touched
        pending = {}

        if fixed is not None:
            pending.update(dict.fromkeys(self.fixed_terms.keys(), fixed))

        if random is not None:
            pending.update(dict.fromkeys(self.random_terms.keys(), random))

        if priors is not None:
            for key, spec in priors.items():
                for name in listify(key):
                    not_found = "No terms in model match '%s'." % name
                    if name in self.terms:
                        pending[name] = spec
                        continue
                    # No exact match: try the terms derived from this name.
                    derived = self._match_derived_terms(name)
                    if match_derived_names and derived is not None:
                        for term in derived:
                            pending[term.name] = spec
                    else:
                        raise ValueError(not_found)

        # Priors supplied here as Prior objects have auto-scaling turned off.
        for spec in pending.values():
            if isinstance(spec, Prior):
                spec._auto_scale = False

        for name in pending:
            self.terms[name].prior = pending[name]

        # Any supplied argument (even an empty dict) marks the model as
        # needing a rebuild.
        if any(arg is not None for arg in (fixed, random, priors)):
            self.built = False
예제 #4
0
    def _set_priors(self,
                    priors=None,
                    fixed=None,
                    random=None,
                    match_derived_names=True):
        '''
        Internal counterpart of set_priors(); accepts the same arguments.
        Runs during Model.build().

        Every collected prior is passed through self._prepare_prior()
        together with the term's type ('intercept', 'random' or 'fixed')
        before it is stored on the term.
        '''

        # name -> prior assignments, collected before any term is touched
        pending = {}

        if fixed is not None:
            pending.update(dict.fromkeys(self.fixed_terms.keys(), fixed))

        if random is not None:
            pending.update(dict.fromkeys(self.random_terms.keys(), random))

        if priors is not None:
            for key, spec in priors.items():
                for name in listify(key):
                    not_found = "No terms in model match '%s'." % name
                    if name in self.terms:
                        pending[name] = spec
                        continue
                    # No exact match: try the terms derived from this name.
                    derived = self._match_derived_terms(name)
                    if match_derived_names and derived is not None:
                        for term in derived:
                            pending[term.name] = spec
                    else:
                        raise ValueError(not_found)

        for name, spec in pending.items():
            # The intercept is identified by name alone; everything else is
            # classified by the term's `random` flag.
            if name == 'Intercept':
                kind = 'intercept'
            elif self.terms[name].random:
                kind = 'random'
            else:
                kind = 'fixed'
            self.terms[name].prior = self._prepare_prior(spec, kind)

        # Any supplied argument (even an empty dict) marks the model as
        # needing a rebuild.
        if any(arg is not None for arg in (fixed, random, priors)):
            self.built = False
예제 #5
0
    def _add(self,
             fixed=None,
             random=None,
             priors=None,
             family='gaussian',
             link=None,
             categorical=None,
             append=True):
        '''
        Internal version of add(), with the same arguments.
        Runs during Model.build()

        Builds patsy design matrices from self.clean_data and stores one
        Term per fixed-effect term and one RandomTerm per random-effect
        column in self.terms.

        Args:
            fixed (str): Optional fixed-effects formula. If it contains '~',
                the left-hand side is registered via self._add_y(). The
                'y[event] ~ x' form (for bernoulli models) passes new Y data
                that is 1 where y equals the event and 0 otherwise.
            random (str, list): Optional random-effects specification(s) of
                the form '[0|1 +] predictor|grouper' or '1|grouper'.
            priors (dict): Optional priors keyed by term name; the keys
                'fixed' and 'random' serve as fallbacks. Entries matched by
                term name are popped from this dict. None is treated as an
                empty dict.
            family, link: passed through to self._add_y().
            categorical (str, list): Names of columns to convert to the
                'category' dtype before the matrices are built.
            append (bool): Accepted for signature parity with add(); not
                used in this method.
        '''
        # Guard the default: every lookup below calls priors.pop()/get().
        if priors is None:
            priors = {}

        # use cleaned data with NAs removed (if user requested)
        data = self.clean_data
        # alter this pandas flag to avoid false positive SettingWithCopyWarnings
        # NOTE(review): `is_copy` appears to be deprecated/removed in recent
        # pandas releases -- confirm against the supported pandas version.
        data.is_copy = False

        # Explicitly convert columns to category if desired--though this
        # can also be done within the formula using C().
        if categorical is not None:
            data = data.copy()
            cats = listify(categorical)
            data[cats] = data[cats].apply(lambda x: x.astype('category'))

        if fixed is not None:
            if '~' in fixed:
                # check to see if formula is using the 'y[event] ~ x' syntax
                # (for bernoulli models). If so, chop it into groups:
                # 1 = 'y[event]', 2 = 'y', 3 = 'event', 4 = 'x'
                # If this syntax is not being used, event = None
                event = re.match(r'^((\S+)\[(\S+)\])\s*~(.*)$', fixed)
                if event is not None:
                    fixed = '{}~{}'.format(event.group(2), event.group(4))
                y, X = dmatrices(fixed, data=data, NA_action='raise')
                y_label = y.design_info.term_names[0]
                if event is not None:
                    # pass in new Y data that has 1 if y=event and 0 otherwise
                    y_data = y[:,
                               y.design_info.column_names.index(event.group(1)
                                                                )]
                    y_data = pd.DataFrame({event.group(3): y_data})
                    self._add_y(y_label, family=family, link=link, data=y_data)
                else:
                    # use Y as-is
                    self._add_y(y_label, family=family, link=link)
            else:
                X = dmatrix(fixed, data=data, NA_action='raise')

            # Loop over predictor terms
            for _name, _slice in X.design_info.term_name_slices.items():
                cols = X.design_info.column_names[_slice]
                term_data = pd.DataFrame(X[:, _slice], columns=cols)
                # A term-specific prior wins; otherwise use the 'fixed' one.
                prior = priors.pop(_name, priors.get('fixed', None))
                _type = 'intercept' if _name == 'Intercept' else 'fixed'
                prior = self._prepare_prior(prior, _type)
                self.terms[_name] = Term(_name, term_data, prior=prior)

        # Random effects
        if random is not None:

            random = listify(random)

            for f in random:

                f = f.strip()

                # Split specification into intercept, predictor, and grouper
                patt = r'^([01]+)*[\s\+]*([^\|]+)*\|(.*)'

                intcpt, pred, grpr = re.search(patt, f).groups()
                label = '{}|{}'.format(pred, grpr) if pred else grpr
                prior = priors.pop(label, priors.get('random', None))
                prior = self._prepare_prior(prior, 'random')

                # Treat all grouping variables as categoricals, regardless of
                # their dtype and what the user may have specified in the
                # 'categorical' argument.
                # (raw string: '\w' is not a valid str escape sequence)
                var_names = re.findall(r'(\w+)', grpr)
                for v in var_names:
                    if v in data.columns:
                        data.loc[:, v] = data.loc[:, v].astype('category')
                        self.clean_data.loc[:, v] = data.loc[:, v]

                # Default to including random intercepts
                intcpt = 1 if intcpt is None else int(intcpt)

                grpr_df = dmatrix('0+%s' % grpr,
                                  data,
                                  return_type='dataframe',
                                  NA_action='raise')

                # If there's no predictor, we must be adding random intercepts
                # NOTE(review): random intercepts are stored under
                # '1|' + grpr, so this check on the bare grouper name
                # presumably does not detect an existing intercept -- verify.
                if not pred and grpr not in self.terms:
                    name = '1|' + grpr
                    pred = np.ones((len(grpr_df), 1))
                    term = RandomTerm(name,
                                      grpr_df,
                                      pred,
                                      grpr_df.values,
                                      categorical=True,
                                      prior=prior)
                    self.terms[name] = term
                else:
                    pred_df = dmatrix('%s+%s' % (intcpt, pred),
                                      data,
                                      return_type='dataframe',
                                      NA_action='raise')
                    # determine value of the 'constant' attribute
                    const = np.atleast_2d(pred_df.T).T.sum(1).var() == 0

                    for col, i in pred_df.design_info.column_name_indexes.items(
                    ):
                        pred_data = pred_df.iloc[:, i]
                        lev_data = grpr_df.multiply(pred_data, axis=0)

                        # Also rename intercepts and skip if already added.
                        # This can happen if user specifies something like
                        # random=['1|school', 'student|school'].
                        if col == 'Intercept':
                            if grpr in self.terms:
                                continue
                            label = '1|%s' % grpr
                        else:
                            label = col + '|' + grpr

                        prior = priors.pop(label, priors.get('random', None))
                        prior = self._prepare_prior(prior, 'random')

                        # Categorical or continuous is determined from data
                        ld = lev_data.values
                        if ((ld == 0) | (ld == 1)).all():
                            lev_data = lev_data.astype(int)
                            cat = True
                        else:
                            cat = False

                        # Must be 2D later. Index the underlying ndarray:
                        # 2-D indexing directly on a Series is not supported
                        # by newer pandas.
                        pred_data = pred_data.values[:, None]
                        term = RandomTerm(label,
                                          lev_data,
                                          pred_data,
                                          grpr_df.values,
                                          categorical=cat,
                                          constant=const if const else None,
                                          prior=prior)
                        self.terms[label] = term
예제 #6
0
    def add(self,
            fixed=None,
            random=None,
            priors=None,
            family='gaussian',
            link=None,
            categorical=None,
            append=True):
        '''
        Register one or more terms given in an R-like formula syntax.

        The formulas are run through patsy once with the custom NA handler
        to screen them, any complete-case information is recorded in
        self.completes, and the arguments are queued in self.added_terms for
        _add(). The model is then flagged as not built.

        Args:
            fixed (str): Optional formula specification of fixed effects.
            random (list): Optional list-based specification of random
                effects.
            priors (dict): Optional priors for one or more terms. Keys are
                term names; values are Prior instances, or ints, floats, or
                strings that specify the width of the priors on a
                standardized scale. The dict is deep-copied before use.
            family (str, Family): The model family (analogous to the family
                object in R): either an instance of class priors.Family or
                the name of a family defined in the defaults loaded at Model
                initialization. Valid pre-defined families are 'gaussian',
                'bernoulli', 'poisson', and 't'.
            link (str): The model link function. Either a string (one of the
                options defined in the current backend; typically at least
                'identity', 'logit', 'inverse', and 'log'), or a callable
                that takes a 1D ndarray or theano tensor as the sole
                argument and returns one with the same shape.
            categorical (str, list): Name(s) of variables to treat as
                categorical. If None, the column dtypes of the DataFrame
                decide. Passing names explicitly is recommended when numeric
                columns should be categorical (e.g., random factors coded as
                numerical IDs).
            append (bool): if True, terms are appended to the existing model
                rather than replacing any existing terms, which allows the
                model to be specified in stages.
        '''
        data = self.data

        # Primitive prior values can later be replaced by Prior objects, so
        # never work on a dict the caller may reuse for another model.
        priors = {} if priors is None else deepcopy(priors)

        if not append:
            self.reset()

        # Optional explicit conversion to category (C() in the formula is
        # the alternative).
        if categorical is not None:
            data = data.copy()
            cat_cols = listify(categorical)
            data[cat_cols] = data[cat_cols].apply(
                lambda col: col.astype('category'))

        # Custom patsy.missing.NAAction class. Like patsy's drop/raise
        # defaults, but with a different message and a log of dropped rows.
        na_action = Custom_NA(dropna=self.dropna)

        # Screen the fixed-effects formula.
        if fixed is not None:
            if '~' not in fixed:
                dmatrix(fixed, data=data, NA_action=na_action)
            else:
                # Drop any bracketed part before handing it to patsy.
                stripped = re.sub(r'\[.+\]', '', fixed)
                dmatrices(stripped, data=data, NA_action=na_action)

        # Screen both sides of every random-effects specification.
        if random is not None:
            for spec in listify(random):
                for part in spec.split('|'):
                    dmatrix(part, data=data, NA_action=na_action)

        # Extend the running list of complete cases.
        if len(na_action.completes) > 0:
            self.completes.append(na_action.completes)

        # Arguments that _add() will be called with later.
        self.added_terms.append({
            'fixed': fixed,
            'random': random,
            'priors': priors,
            'family': family,
            'link': link,
            'categorical': categorical,
        })

        self.built = False
예제 #7
0
def test_listify():
    """listify() maps None to [], leaves a list equal, and wraps a string."""
    cases = [
        (None, []),
        ([1, 2, 3], [1, 2, 3]),
        ("giraffe", ["giraffe"]),
    ]
    for given, expected in cases:
        assert listify(given) == expected
예제 #8
0
def test_listify():
    '''listify() maps None to [], leaves a list equal, and wraps a string.'''
    expectations = (
        (None, []),
        ([1, 2, 3], [1, 2, 3]),
        ('giraffe', ['giraffe']),
    )
    for value, wanted in expectations:
        assert listify(value) == wanted
예제 #9
0
    def add(self,
            fixed=None,
            random=None,
            priors=None,
            family='gaussian',
            link=None,
            categorical=None,
            append=True):
        '''
        Adds one or more terms to the model via an R-like formula syntax.

        Design matrices are built with patsy from self.data; one Term per
        fixed-effect term and one RandomTerm per random-effect column are
        stored in self.terms, and the model is flagged as not built.

        Args:
            fixed (str): Optional formula specification of fixed effects.
            random (list): Optional list-based specification of random effects.
            priors (dict): Optional specification of priors for one or more
                terms. A dict where the keys are the names of terms in the
                model, and the values are either instances of class Prior or
                ints, floats, or strings that specify the width of the priors
                on a standardized scale. The keys 'fixed' and 'random' act
                as fallbacks for terms without an entry of their own. The
                dict is deep-copied, so the caller's object is not modified.
            family (str, Family): A specification of the model family
                (analogous to the family object in R). Either a string, or an
                instance of class priors.Family. If a string is passed, a
                family with the corresponding name must be defined in the
                defaults loaded at Model initialization. Valid pre-defined
                families are 'gaussian', 'bernoulli', 'poisson', and 't'.
            link (str): The model link function to use. Can be either a string
                (must be one of the options defined in the current backend;
                typically this will include at least 'identity', 'logit',
                'inverse', and 'log'), or a callable that takes a 1D ndarray
                or theano tensor as the sole argument and returns one with
                the same shape.
            categorical (str, list): The names of any variables to treat as
                categorical. Can be either a single variable name, or a list
                of names. If categorical is None, the data type of the columns
                in the DataFrame will be used to infer handling. In cases where
                numeric columns are to be treated as categoricals (e.g., random
                factors coded as numerical IDs), explicitly passing variable
                names via this argument is recommended.
            append (bool): if True, terms are appended to the existing model
                rather than replacing any existing terms. This allows
                formula-based specification of the model in stages.
        '''
        data = self.data

        # Primitive values (floats, strs) can be overwritten with Prior objects
        # so we need to make sure to copy first to avoid bad things happening
        # if user is re-using same prior dict in multiple models.
        if priors is None:
            priors = {}
        else:
            priors = deepcopy(priors)

        if not append:
            self.reset()

        # Explicitly convert columns to category if desired--though this
        # can also be done within the formula using C().
        if categorical is not None:
            data = data.copy()
            cats = listify(categorical)
            data[cats] = data[cats].apply(lambda x: x.astype('category'))

        if fixed is not None:
            if '~' in fixed:
                # check to see if formula is using the 'y[event] ~ x' syntax
                # (for bernoulli models). If so, chop it into groups:
                # 1 = 'y[event]', 2 = 'y', 3 = 'event', 4 = 'x'
                # If this syntax is not being used, event = None
                event = re.match(r'^((\S+)\[(\S+)\])\s*~(.*)$', fixed)
                if event is not None:
                    fixed = '{}~{}'.format(event.group(2), event.group(4))
                y, X = dmatrices(fixed, data=data, NA_action=Ignore_NA())
                y_label = y.design_info.term_names[0]
                if event is not None:
                    # pass in new Y data that has 1 if y=event and 0 otherwise
                    y_data = y[:,
                               y.design_info.column_names.index(event.group(1)
                                                                )]
                    y_data = pd.DataFrame({event.group(3): y_data})
                    self._add_y(y_label, family=family, link=link, data=y_data)
                else:
                    # use Y as-is
                    self._add_y(y_label, family=family, link=link)
            else:
                X = dmatrix(fixed, data=data, NA_action=Ignore_NA())

            # Loop over predictor terms
            for _name, _slice in X.design_info.term_name_slices.items():
                cols = X.design_info.column_names[_slice]
                term_data = pd.DataFrame(X[:, _slice], columns=cols)
                # A term-specific prior wins; otherwise use the 'fixed' one.
                prior = priors.pop(_name, priors.get('fixed', None))
                self.terms[_name] = Term(self, _name, term_data, prior=prior)

        # Random effects
        if random is not None:

            random = listify(random)

            for f in random:

                f = f.strip()

                # Split specification into intercept, predictor, and grouper
                patt = r'^([01]+)*[\s\+]*([^\|]+)*\|(.*)'

                intcpt, pred, grpr = re.search(patt, f).groups()
                label = '{}|{}'.format(pred, grpr) if pred else grpr
                prior = priors.pop(label, priors.get('random', None))

                # Treat all grouping variables as categoricals, regardless of
                # their dtype and what the user may have specified in the
                # 'categorical' argument.
                # (raw string: '\w' is not a valid str escape sequence)
                var_names = re.findall(r'(\w+)', grpr)
                for v in var_names:
                    if v in data.columns:
                        data[v] = data.loc[:, v].astype('category')
                        self.data[v] = data[v]

                # Default to including random intercepts
                intcpt = 1 if intcpt is None else int(intcpt)

                grpr_df = dmatrix('0+%s' % grpr,
                                  data,
                                  return_type='dataframe',
                                  NA_action=Ignore_NA())

                # If there's no predictor, we must be adding random intercepts
                # NOTE(review): random intercepts are stored under
                # '1|' + grpr, so this check on the bare grouper name
                # presumably does not detect an existing intercept -- verify.
                if not pred and grpr not in self.terms:
                    name = '1|' + grpr
                    pred = np.ones((len(grpr_df), 1))
                    term = RandomTerm(self,
                                      name,
                                      grpr_df,
                                      pred,
                                      grpr_df.values,
                                      categorical=True,
                                      prior=prior)
                    self.terms[name] = term
                else:
                    pred_df = dmatrix('%s+%s' % (intcpt, pred),
                                      data,
                                      return_type='dataframe',
                                      NA_action=Ignore_NA())
                    # determine value of the 'constant' attribute
                    const = np.atleast_2d(pred_df.T).T.sum(1).var() == 0

                    for col, i in pred_df.design_info.column_name_indexes.items(
                    ):
                        pred_data = pred_df.iloc[:, i]
                        lev_data = grpr_df.multiply(pred_data, axis=0)

                        # Also rename intercepts and skip if already added.
                        # This can happen if user specifies something like
                        # random=['1|school', 'student|school'].
                        if col == 'Intercept':
                            if grpr in self.terms:
                                continue
                            label = '1|%s' % grpr
                        else:
                            label = col + '|' + grpr

                        prior = priors.pop(label, priors.get('random', None))

                        # Categorical or continuous is determined from data
                        ld = lev_data.values
                        if ((ld == 0) | (ld == 1)).all():
                            lev_data = lev_data.astype(int)
                            cat = True
                        else:
                            cat = False

                        # Must be 2D later. Index the underlying ndarray:
                        # 2-D indexing directly on a Series is not supported
                        # by newer pandas.
                        pred_data = pred_data.values[:, None]
                        # Pass the prior looked up above; it was previously
                        # computed but never handed to the term.
                        term = RandomTerm(self,
                                          label,
                                          lev_data,
                                          pred_data,
                                          grpr_df.values,
                                          categorical=cat,
                                          constant=const if const else None,
                                          prior=prior)
                        self.terms[label] = term

        self.built = False
예제 #10
0
파일: models.py 프로젝트: mdekauwe/bambi
    def add_formula(self,
                    fixed=None,
                    random=None,
                    priors=None,
                    family='gaussian',
                    link=None,
                    categorical=None,
                    append=False):
        '''
        Adds one or more terms to the model via an R-like formula syntax.
        Args:
            fixed (str): Optional formula specification of fixed effects.
            random (list): Optional list-based specification of random effects.
            priors (dict): Optional specification of priors for one or more
                terms. A dict where the keys are the names of terms in the
                model, and the values are either instances of class Prior or
                ints, floats, or strings that specify the width of the priors
                on a standardized scale. The keys 'fixed' and 'random' act
                as fallbacks for every fixed/random term without an entry of
                its own. The caller's dict is not modified.
            family (str, Family): A specification of the model family
                (analogous to the family object in R). Either a string, or an
                instance of class priors.Family. If a string is passed, a
                family with the corresponding name must be defined in the
                defaults loaded at Model initialization. Valid pre-defined
                families are 'gaussian', 'binomial', 'poisson', and 't'.
            link (str): The model link function to use. Can be either a string
                (must be one of the options defined in the current backend;
                typically this will include at least 'identity', 'logit',
                'inverse', and 'exp'), or a callable that takes a 1D ndarray
                or theano tensor as the sole argument and returns one with
                the same shape.
            categorical (str, list): The names of any variables to treat as
                categorical. Can be either a single variable name, or a list
                of names. If categorical is None, the data type of the columns
                in the DataFrame will be used to infer handling. In cases where
                numeric columns are to be treated as categoricals (e.g., random
                factors coded as numerical IDs), explicitly passing variable
                names via this argument is recommended.
            append (bool): if True, terms are appended to the existing model
                rather than replacing any existing terms. This allows
                formula-based specification of the model in stages.
        Raises:
            ValueError: if a random term contains '*', '(' or ')'.
        '''
        data = self.data

        # Work on a shallow copy: entries are popped below and the caller's
        # dict must stay intact.
        priors = {} if priors is None else dict(priors)

        if not append:
            self.reset()

        if fixed is not None:
            # Explicitly convert columns to category if desired--though this
            # can also be done within the formula using C().
            if categorical is not None:
                data = data.copy()
                cats = listify(categorical)
                data[cats] = data[cats].apply(lambda x: x.astype('category'))

            if '~' in fixed:
                y, X = dmatrices(fixed, data=data)
                y_label = y.design_info.term_names[0]
                self.add_y(y_label, family=family, link=link)
            else:
                X = dmatrix(fixed, data=data)

            # Loop over predictor terms
            for _name, _slice in X.design_info.term_name_slices.items():
                cols = X.design_info.column_names[_slice]
                term_data = pd.DataFrame(X[:, _slice], columns=cols)
                # Use get() for the fallback: a nested pop() is evaluated
                # eagerly and would consume 'fixed' on the first term.
                prior = priors.pop(_name, priors.get('fixed'))
                self.add_term(_name, data=term_data, prior=prior)

        # Random effects
        if random is not None:
            random = listify(random)
            for f in random:
                f = f.strip()
                kwargs = {'random': True}
                if re.search(r'[\*\(\)]+', f):
                    raise ValueError("Random term '%s' contains an invalid "
                                     "character. Note that only the | and + "
                                     "operators are currently supported in "
                                     "random effects specifications." % f)

                # replace explicit intercept terms like '1|subj' with just 'subj'
                f = re.sub(r'^1\s*\|(.*)', r'\1', f).strip()

                # Split specification into intercept, predictor, and grouper
                patt = r'^([01]+)*[\s\+]*([^\|]+)\|*(.*)'
                intcpt, pred, grpr = re.search(patt, f).groups()
                label = '{}|{}'.format(pred, grpr) if grpr else pred

                # Default to including random intercepts
                if intcpt is None:
                    intcpt = 1
                intcpt = int(intcpt)

                # If there's no grouper, we must be adding random intercepts
                if not grpr:
                    kwargs.update(dict(categorical=True, drop_first=False))
                    variable = pred

                else:
                    # If we're adding slopes, add random intercepts as well,
                    # unless they were explicitly excluded
                    if intcpt and grpr not in self.terms:
                        self.add_term(variable=grpr,
                                      categorical=True,
                                      random=True,
                                      drop_first=False)
                    if self.data[pred].dtype.name in ['object', 'category']:
                        kwargs['categorical'] = True
                        if not intcpt:
                            kwargs['drop_first'] = False
                    variable, kwargs['over'] = pred, grpr

                # Same fallback rule as for fixed terms; the prior is now
                # actually forwarded to add_term().
                prior = priors.pop(label, priors.get('random'))
                self.add_term(variable=variable,
                              label=label,
                              prior=prior,
                              **kwargs)