def set_priors(self, priors=None, fixed=None, random=None):
    '''
    Assign new priors to terms that already exist in the model.

    Args:
        priors (dict): Maps term names to new priors. Each key is either a
            single term name or a tuple of names that all receive the same
            prior; each value is a Prior instance, or an int or float that
            scales the default prior.
        fixed (Prior, int, float, str): Prior specification applied to
            every fixed term currently included in the model.
        random (Prior, int, float, str): Prior specification applied to
            every random term currently included in the model.

    Raises:
        ValueError: If a name given in `priors` is not a term of the model.
    '''
    # Blanket specifications are collected first so that the per-term
    # entries in `priors` override them.
    pending = {}
    if fixed is not None:
        pending.update(dict.fromkeys(self.fixed_terms.keys(), fixed))
    if random is not None:
        pending.update(dict.fromkeys(self.random_terms.keys(), random))

    requested = priors.items() if priors is not None else ()
    for key, spec in requested:
        for term_name in listify(key):
            if term_name not in self.terms:
                raise ValueError("The model contains no term with "
                                 "the name '%s'." % term_name)
            pending[term_name] = spec

    # Nothing is written to the terms until every name has been validated.
    for term_name, spec in pending.items():
        self.terms[term_name].prior = spec
def _set_priors(self, priors=None, fixed=None, random=None,
                match_derived_names=True):
    """
    Internal version of set_priors(), with same arguments. Runs during
    Model.build().

    Args:
        priors (dict): Maps term names (or tuples of term names) to the
            prior that should be assigned to them.
        fixed: Prior specification assigned to every key of
            `self.fixed_terms`.
        random: Prior specification assigned to every key of
            `self.random_terms`.
        match_derived_names (bool): If True, a name that is not an exact
            key of `self.terms` is handed to `self._match_derived_terms()`
            and the prior is assigned to every term it returns. If False,
            only exact names are accepted.

    Raises:
        ValueError: If a name is not an exact term name and either
            `match_derived_names` is False or `_match_derived_terms()`
            returns None for it.
    """
    targets = {}
    if fixed is not None:
        targets.update({name: fixed for name in self.fixed_terms.keys()})
    if random is not None:
        targets.update({name: random for name in self.random_terms.keys()})

    if priors is not None:
        for k, prior in priors.items():
            for name in listify(k):
                # Exact names win; test membership on the mapping itself
                # instead of rebuilding a list of keys for every name.
                if name in self.terms:
                    targets[name] = prior
                    continue
                # Only consult the derived-name lookup when it is allowed.
                terms = (self._match_derived_terms(name)
                         if match_derived_names else None)
                if terms is None:
                    raise ValueError("No terms in model match '%s'." % name)
                for term in terms:
                    targets[term.name] = prior

    # Apply only after every requested name has been resolved.
    for name, prior in targets.items():
        self.terms[name].prior = prior
def set_priors(self, priors=None, fixed=None, random=None,
               match_derived_names=True):
    '''
    Set priors for one or more existing terms.

    Args:
        priors (dict): Dict of priors to update. Keys are names of terms
            to update; values are the new priors (either a Prior instance,
            or an int or float that scales the default priors). Note that
            a tuple can be passed as the key, in which case the same prior
            will be applied to all terms named in the tuple.
        fixed (Prior, int, float, str): a prior specification to apply to
            all fixed terms currently included in the model.
        random (Prior, int, float, str): a prior specification to apply to
            all random terms currently included in the model.
        match_derived_names (bool): if True, the specified prior(s) will be
            applied not only to terms that match the keyword exactly, but
            to the levels of random effects that were derived from the
            original specification with the passed name. For example,
            `priors={'condition|subject':0.5}` would apply the prior to the
            terms with names '1|subject', 'condition[T.1]|subject', and so
            on. If False, an exact match is required for the prior to be
            applied.

    Raises:
        ValueError: if a name is not an exact term name and either
            `match_derived_names` is False or
            `self._match_derived_terms()` returns None for it.
    '''
    targets = {}
    if fixed is not None:
        targets.update({name: fixed for name in self.fixed_terms.keys()})
    if random is not None:
        targets.update({name: random for name in self.random_terms.keys()})

    if priors is not None:
        for k, prior in priors.items():
            for name in listify(k):
                # Exact names win; test membership on the mapping itself
                # instead of rebuilding a list of keys for every name.
                if name in self.terms:
                    targets[name] = prior
                    continue
                # Only consult the derived-name lookup when it is allowed.
                terms = (self._match_derived_terms(name)
                         if match_derived_names else None)
                if terms is None:
                    raise ValueError("No terms in model match '%s'." % name)
                for t in terms:
                    targets[t.name] = prior

    # Explicit Prior instances get `_auto_scale` switched off (presumably
    # so user-supplied priors are not rescaled later -- confirm against
    # the Prior class).
    for prior in targets.values():
        if isinstance(prior, Prior):
            prior._auto_scale = False

    for name, prior in targets.items():
        self.terms[name].prior = prior

    # Any supplied argument marks the model as needing a rebuild, even if
    # it ended up matching no terms.
    if any(arg is not None for arg in (fixed, random, priors)):
        self.built = False
def _set_priors(self, priors=None, fixed=None, random=None,
                match_derived_names=True):
    '''
    Internal version of set_priors(), with same arguments. Runs during
    Model.build().

    Every selected prior is passed through `self._prepare_prior()` together
    with the term's type ('intercept', 'random' or 'fixed') before it is
    stored on the term.

    Args:
        priors (dict): Maps term names (or tuples of term names) to the
            prior that should be assigned to them.
        fixed: Prior specification assigned to every key of
            `self.fixed_terms`.
        random: Prior specification assigned to every key of
            `self.random_terms`.
        match_derived_names (bool): If True, a name that is not an exact
            key of `self.terms` is handed to `self._match_derived_terms()`
            and the prior is assigned to every term it returns. If False,
            only exact names are accepted.

    Raises:
        ValueError: If a name is not an exact term name and either
            `match_derived_names` is False or `_match_derived_terms()`
            returns None for it.
    '''
    targets = {}
    if fixed is not None:
        targets.update({name: fixed for name in self.fixed_terms.keys()})
    if random is not None:
        targets.update({name: random for name in self.random_terms.keys()})

    if priors is not None:
        for k, prior in priors.items():
            for name in listify(k):
                # Exact names win; test membership on the mapping itself
                # instead of rebuilding a list of keys for every name.
                if name in self.terms:
                    targets[name] = prior
                    continue
                # Only consult the derived-name lookup when it is allowed.
                terms = (self._match_derived_terms(name)
                         if match_derived_names else None)
                if terms is None:
                    raise ValueError("No terms in model match '%s'." % name)
                for t in terms:
                    targets[t.name] = prior

    for name, prior in targets.items():
        # The term named 'Intercept' is typed separately from the other
        # terms; everything else is split on the term's `random` flag.
        if name == 'Intercept':
            _type = 'intercept'
        elif self.terms[name].random:
            _type = 'random'
        else:
            _type = 'fixed'
        self.terms[name].prior = self._prepare_prior(prior, _type)

    # Any supplied argument marks the model as needing a rebuild.
    if any(arg is not None for arg in (fixed, random, priors)):
        self.built = False
def _add(self, fixed=None, random=None, priors=None, family='gaussian',
         link=None, categorical=None, append=True):
    '''
    Internal version of add(), with the same arguments. Runs during
    Model.build().

    Builds patsy design matrices from the formula specifications and stores
    the resulting Term / RandomTerm objects in `self.terms`.

    Args:
        fixed (str): Optional formula specification of fixed effects. May
            include a response ('y ~ x'), optionally written as
            'y[event] ~ x', in which case the response data is 1 where
            y equals event and 0 otherwise.
        random (str, list): Optional random effects specification(s). Each
            string is split into an optional leading intercept flag
            (0 or 1), an optional predictor, and the grouping expression
            that follows '|'.
        priors (dict): Optional priors keyed by term name. The keys 'fixed'
            and 'random' act as defaults for terms without their own entry.
            Entries are popped from this dict as terms are created.
        family (str, Family): Passed through to `self._add_y()`.
        link (str): Passed through to `self._add_y()`.
        categorical (str, list): Names of columns to convert to the
            'category' dtype before the design matrices are built.
        append (bool): Not used by this method.
    '''
    # `priors` is consumed with .pop()/.get() below, so a missing argument
    # must become an empty dict rather than None.
    if priors is None:
        priors = {}

    # use cleaned data with NAs removed (if user requested)
    data = self.clean_data
    # alter this pandas flag to avoid false positive SettingWithCopyWarnings
    data.is_copy = False

    # Explicitly convert columns to category if desired--though this
    # can also be done within the formula using C().
    if categorical is not None:
        data = data.copy()
        cats = listify(categorical)
        data[cats] = data[cats].apply(lambda x: x.astype('category'))

    if fixed is not None:
        if '~' in fixed:
            # check to see if formula is using the 'y[event] ~ x' syntax
            # (for bernoulli models). If so, chop it into groups:
            # 1 = 'y[event]', 2 = 'y', 3 = 'event', 4 = 'x'
            # If this syntax is not being used, event = None
            event = re.match(r'^((\S+)\[(\S+)\])\s*~(.*)$', fixed)
            if event is not None:
                fixed = '{}~{}'.format(event.group(2), event.group(4))
            y, X = dmatrices(fixed, data=data, NA_action='raise')
            y_label = y.design_info.term_names[0]
            if event is not None:
                # pass in new Y data that has 1 if y=event and 0 otherwise
                y_data = y[:, y.design_info.column_names.index(
                    event.group(1))]
                y_data = pd.DataFrame({event.group(3): y_data})
                self._add_y(y_label, family=family, link=link, data=y_data)
            else:
                # use Y as-is
                self._add_y(y_label, family=family, link=link)
        else:
            X = dmatrix(fixed, data=data, NA_action='raise')

        # Loop over predictor terms
        for _name, _slice in X.design_info.term_name_slices.items():
            cols = X.design_info.column_names[_slice]
            term_data = pd.DataFrame(X[:, _slice], columns=cols)
            # A term-specific prior wins; otherwise use the 'fixed' default.
            prior = priors.pop(_name, priors.get('fixed', None))
            _type = 'intercept' if _name == 'Intercept' else 'fixed'
            prior = self._prepare_prior(prior, _type)
            self.terms[_name] = Term(_name, term_data, prior=prior)

    # Random effects
    if random is not None:
        random = listify(random)
        for f in random:
            f = f.strip()
            # Split specification into intercept, predictor, and grouper
            patt = r'^([01]+)*[\s\+]*([^\|]+)*\|(.*)'
            intcpt, pred, grpr = re.search(patt, f).groups()
            label = '{}|{}'.format(pred, grpr) if pred else grpr
            prior = priors.pop(label, priors.get('random', None))
            prior = self._prepare_prior(prior, 'random')

            # Treat all grouping variables as categoricals, regardless of
            # their dtype and what the user may have specified in the
            # 'categorical' argument.
            var_names = re.findall(r'(\w+)', grpr)
            for v in var_names:
                if v in data.columns:
                    data.loc[:, v] = data.loc[:, v].astype('category')
                    self.clean_data.loc[:, v] = data.loc[:, v]

            # Default to including random intercepts
            intcpt = 1 if intcpt is None else int(intcpt)

            grpr_df = dmatrix('0+%s' % grpr, data, return_type='dataframe',
                              NA_action='raise')

            # If there's no predictor, we must be adding random intercepts
            # NOTE(review): random intercepts are stored under
            # '1|<grouper>', so this test on the bare grouper name looks
            # like it only fires when another term is named exactly like
            # the grouper -- confirm intent.
            if not pred and grpr not in self.terms:
                name = '1|' + grpr
                pred = np.ones((len(grpr_df), 1))
                term = RandomTerm(name, grpr_df, pred, grpr_df.values,
                                  categorical=True, prior=prior)
                self.terms[name] = term
            else:
                pred_df = dmatrix('%s+%s' % (intcpt, pred), data,
                                  return_type='dataframe',
                                  NA_action='raise')
                # determine value of the 'constant' attribute
                const = np.atleast_2d(pred_df.T).T.sum(1).var() == 0

                for col, i in \
                        pred_df.design_info.column_name_indexes.items():
                    pred_data = pred_df.iloc[:, i]
                    lev_data = grpr_df.multiply(pred_data, axis=0)

                    # Also rename intercepts and skip if already added.
                    # This can happen if user specifies something like
                    # random=['1|school', 'student|school'].
                    if col == 'Intercept':
                        # NOTE(review): same bare-grouper membership test
                        # as above -- confirm it matches the stored names.
                        if grpr in self.terms:
                            continue
                        label = '1|%s' % grpr
                    else:
                        label = col + '|' + grpr

                    prior = priors.pop(label, priors.get('random', None))
                    prior = self._prepare_prior(prior, 'random')

                    # Categorical or continuous is determined from data
                    ld = lev_data.values
                    if ((ld == 0) | (ld == 1)).all():
                        lev_data = lev_data.astype(int)
                        cat = True
                    else:
                        cat = False

                    # NOTE(review): 2-D indexing directly on a Series may
                    # be rejected by recent pandas releases -- confirm the
                    # supported pandas version.
                    pred_data = pred_data[:, None]  # Must be 2D later
                    term = RandomTerm(label, lev_data, pred_data,
                                      grpr_df.values, categorical=cat,
                                      constant=const if const else None,
                                      prior=prior)
                    self.terms[label] = term
def add(self, fixed=None, random=None, priors=None, family='gaussian',
        link=None, categorical=None, append=True):
    '''
    Adds one or more terms to the model via an R-like formula syntax.

    The formulas are only screened for missing values here; the arguments
    are recorded in `self.added_terms` and the model is marked as not
    built.

    Args:
        fixed (str): Optional formula specification of fixed effects.
        random (list): Optional list-based specification of random effects.
        priors (dict): Optional specification of priors for one or more
            terms. A dict where the keys are the names of terms in the
            model, and the values are either instances of class Prior or
            ints, floats, or strings that specify the width of the priors
            on a standardized scale.
        family (str, Family): A specification of the model family
            (analogous to the family object in R). Either a string, or an
            instance of class priors.Family. If a string is passed, a
            family with the corresponding name must be defined in the
            defaults loaded at Model initialization. Valid pre-defined
            families are 'gaussian', 'bernoulli', 'poisson', and 't'.
        link (str): The model link function to use. Can be either a string
            (must be one of the options defined in the current backend;
            typically this will include at least 'identity', 'logit',
            'inverse', and 'log'), or a callable that takes a 1D ndarray
            or theano tensor as the sole argument and returns one with the
            same shape.
        categorical (str, list): The names of any variables to treat as
            categorical. Can be either a single variable name, or a list
            of names. If categorical is None, the data type of the columns
            in the DataFrame will be used to infer handling. In cases where
            numeric columns are to be treated as categoricals (e.g., random
            factors coded as numerical IDs), explicitly passing variable
            names via this argument is recommended.
        append (bool): if True, terms are appended to the existing model
            rather than replacing any existing terms. This allows
            formula-based specification of the model in stages.
    '''
    data = self.data

    # Primitive values (floats, strs) can be overwritten with Prior objects
    # so we need to make sure to copy first to avoid bad things happening
    # if user is re-using same prior dict in multiple models.
    priors = {} if priors is None else deepcopy(priors)

    if not append:
        self.reset()

    # Explicitly convert columns to category if desired--though this
    # can also be done within the formula using C().
    if categorical is not None:
        data = data.copy()
        cats = listify(categorical)
        data[cats] = data[cats].apply(lambda x: x.astype('category'))

    # Custom patsy.missing.NAAction class. Similar to patsy drop/raise
    # defaults, but changes the raised message and logs any dropped rows
    NA_handler = Custom_NA(dropna=self.dropna)

    # screen fixed terms
    if fixed is not None:
        if '~' in fixed:
            # Drop a bracketed selector such as 'y[event]' from the
            # response only. Running the greedy pattern over the whole
            # formula would also delete everything between the first '['
            # and the last ']' on the right-hand side.
            lhs, rhs = fixed.split('~', 1)
            clean_fix = '~'.join((re.sub(r'\[.+\]', '', lhs), rhs))
            dmatrices(clean_fix, data=data, NA_action=NA_handler)
        else:
            dmatrix(fixed, data=data, NA_action=NA_handler)

    # screen random terms
    if random is not None:
        for term in listify(random):
            # Each side of 'predictor|grouper' is screened on its own.
            for side in term.split('|'):
                dmatrix(side, data=data, NA_action=NA_handler)

    # update the running list of complete cases
    if len(NA_handler.completes):
        self.completes.append(NA_handler.completes)

    # save arguments to pass to _add()
    args = {
        'fixed': fixed,
        'random': random,
        'priors': priors,
        'family': family,
        'link': link,
        'categorical': categorical,
    }
    self.added_terms.append(args)

    self.built = False
def test_listify():
    """Check listify() on None, on a list, and on a single string."""
    expectations = (
        (None, []),
        ([1, 2, 3], [1, 2, 3]),
        ("giraffe", ["giraffe"]),
    )
    for given, wanted in expectations:
        assert listify(given) == wanted
def test_listify():
    '''Check listify() on None, on a list, and on a single string.'''
    inputs = [None, [1, 2, 3], 'giraffe']
    outputs = [[], [1, 2, 3], ['giraffe']]
    for value, expected in zip(inputs, outputs):
        assert listify(value) == expected
def add(self, fixed=None, random=None, priors=None, family='gaussian',
        link=None, categorical=None, append=True):
    '''
    Adds one or more terms to the model via an R-like formula syntax.

    Args:
        fixed (str): Optional formula specification of fixed effects.
        random (list): Optional list-based specification of random effects.
        priors (dict): Optional specification of priors for one or more
            terms. A dict where the keys are the names of terms in the
            model, and the values are either instances of class Prior or
            ints, floats, or strings that specify the width of the priors
            on a standardized scale.
        family (str, Family): A specification of the model family
            (analogous to the family object in R). Either a string, or an
            instance of class priors.Family. If a string is passed, a
            family with the corresponding name must be defined in the
            defaults loaded at Model initialization. Valid pre-defined
            families are 'gaussian', 'bernoulli', 'poisson', and 't'.
        link (str): The model link function to use. Can be either a string
            (must be one of the options defined in the current backend;
            typically this will include at least 'identity', 'logit',
            'inverse', and 'log'), or a callable that takes a 1D ndarray
            or theano tensor as the sole argument and returns one with the
            same shape.
        categorical (str, list): The names of any variables to treat as
            categorical. Can be either a single variable name, or a list
            of names. If categorical is None, the data type of the columns
            in the DataFrame will be used to infer handling. In cases where
            numeric columns are to be treated as categoricals (e.g., random
            factors coded as numerical IDs), explicitly passing variable
            names via this argument is recommended.
        append (bool): if True, terms are appended to the existing model
            rather than replacing any existing terms. This allows
            formula-based specification of the model in stages.
    '''
    data = self.data

    # Primitive values (floats, strs) can be overwritten with Prior objects
    # so we need to make sure to copy first to avoid bad things happening
    # if user is re-using same prior dict in multiple models.
    priors = {} if priors is None else deepcopy(priors)

    if not append:
        self.reset()

    # Explicitly convert columns to category if desired--though this
    # can also be done within the formula using C().
    if categorical is not None:
        data = data.copy()
        cats = listify(categorical)
        data[cats] = data[cats].apply(lambda x: x.astype('category'))

    if fixed is not None:
        if '~' in fixed:
            # check to see if formula is using the 'y[event] ~ x' syntax
            # (for bernoulli models). If so, chop it into groups:
            # 1 = 'y[event]', 2 = 'y', 3 = 'event', 4 = 'x'
            # If this syntax is not being used, event = None
            event = re.match(r'^((\S+)\[(\S+)\])\s*~(.*)$', fixed)
            if event is not None:
                fixed = '{}~{}'.format(event.group(2), event.group(4))
            y, X = dmatrices(fixed, data=data, NA_action=Ignore_NA())
            y_label = y.design_info.term_names[0]
            if event is not None:
                # pass in new Y data that has 1 if y=event and 0 otherwise
                y_data = y[:, y.design_info.column_names.index(
                    event.group(1))]
                y_data = pd.DataFrame({event.group(3): y_data})
                self._add_y(y_label, family=family, link=link, data=y_data)
            else:
                # use Y as-is
                self._add_y(y_label, family=family, link=link)
        else:
            X = dmatrix(fixed, data=data, NA_action=Ignore_NA())

        # Loop over predictor terms
        for _name, _slice in X.design_info.term_name_slices.items():
            cols = X.design_info.column_names[_slice]
            term_data = pd.DataFrame(X[:, _slice], columns=cols)
            # A term-specific prior wins; otherwise use the 'fixed' default.
            prior = priors.pop(_name, priors.get('fixed', None))
            self.terms[_name] = Term(self, _name, term_data, prior=prior)

    # Random effects
    if random is not None:
        random = listify(random)
        for f in random:
            f = f.strip()
            # Split specification into intercept, predictor, and grouper
            patt = r'^([01]+)*[\s\+]*([^\|]+)*\|(.*)'
            intcpt, pred, grpr = re.search(patt, f).groups()
            label = '{}|{}'.format(pred, grpr) if pred else grpr
            prior = priors.pop(label, priors.get('random', None))

            # Treat all grouping variables as categoricals, regardless of
            # their dtype and what the user may have specified in the
            # 'categorical' argument.
            var_names = re.findall(r'(\w+)', grpr)
            for v in var_names:
                if v in data.columns:
                    data[v] = data.loc[:, v].astype('category')
                    self.data[v] = data[v]

            # Default to including random intercepts
            intcpt = 1 if intcpt is None else int(intcpt)

            grpr_df = dmatrix('0+%s' % grpr, data, return_type='dataframe',
                              NA_action=Ignore_NA())

            # If there's no predictor, we must be adding random intercepts
            # NOTE(review): random intercepts are stored under
            # '1|<grouper>', so this test on the bare grouper name looks
            # like it only fires when another term is named exactly like
            # the grouper -- confirm intent.
            if not pred and grpr not in self.terms:
                name = '1|' + grpr
                pred = np.ones((len(grpr_df), 1))
                term = RandomTerm(self, name, grpr_df, pred, grpr_df.values,
                                  categorical=True, prior=prior)
                self.terms[name] = term
            else:
                pred_df = dmatrix('%s+%s' % (intcpt, pred), data,
                                  return_type='dataframe',
                                  NA_action=Ignore_NA())
                # determine value of the 'constant' attribute
                const = np.atleast_2d(pred_df.T).T.sum(1).var() == 0

                for col, i in \
                        pred_df.design_info.column_name_indexes.items():
                    pred_data = pred_df.iloc[:, i]
                    lev_data = grpr_df.multiply(pred_data, axis=0)

                    # Also rename intercepts and skip if already added.
                    # This can happen if user specifies something like
                    # random=['1|school', 'student|school'].
                    if col == 'Intercept':
                        if grpr in self.terms:
                            continue
                        label = '1|%s' % grpr
                    else:
                        label = col + '|' + grpr

                    prior = priors.pop(label, priors.get('random', None))

                    # Categorical or continuous is determined from data
                    ld = lev_data.values
                    if ((ld == 0) | (ld == 1)).all():
                        lev_data = lev_data.astype(int)
                        cat = True
                    else:
                        cat = False

                    # NOTE(review): 2-D indexing directly on a Series may
                    # be rejected by recent pandas releases -- confirm the
                    # supported pandas version.
                    pred_data = pred_data[:, None]  # Must be 2D later
                    # Hand the looked-up prior to the term, as the
                    # intercept-only branch above does; previously it was
                    # popped from `priors` and then discarded.
                    term = RandomTerm(self, label, lev_data, pred_data,
                                      grpr_df.values, categorical=cat,
                                      constant=const if const else None,
                                      prior=prior)
                    self.terms[label] = term

    self.built = False
def add_formula(self, fixed=None, random=None, priors=None,
                family='gaussian', link=None, categorical=None,
                append=False):
    '''
    Adds one or more terms to the model via an R-like formula syntax.

    Args:
        fixed (str): Optional formula specification of fixed effects.
        random (list): Optional list-based specification of random effects.
        priors (dict): Optional specification of priors for one or more
            terms. A dict where the keys are the names of terms in the
            model, and the values are either instances of class Prior or
            ints, floats, or strings that specify the width of the priors
            on a standardized scale. The keys 'fixed' and 'random' act as
            defaults for terms without their own entry. The dict passed in
            is not modified.
        family (str, Family): A specification of the model family
            (analogous to the family object in R). Either a string, or an
            instance of class priors.Family. If a string is passed, a
            family with the corresponding name must be defined in the
            defaults loaded at Model initialization. Valid pre-defined
            families are 'gaussian', 'binomial', 'poisson', and 't'.
        link (str): The model link function to use. Can be either a string
            (must be one of the options defined in the current backend;
            typically this will include at least 'identity', 'logit',
            'inverse', and 'exp'), or a callable that takes a 1D ndarray
            or theano tensor as the sole argument and returns one with the
            same shape.
        categorical (str, list): The names of any variables to treat as
            categorical. Can be either a single variable name, or a list
            of names. If categorical is None, the data type of the columns
            in the DataFrame will be used to infer handling. In cases where
            numeric columns are to be treated as categoricals (e.g., random
            factors coded as numerical IDs), explicitly passing variable
            names via this argument is recommended.
        append (bool): if True, terms are appended to the existing model
            rather than replacing any existing terms. This allows
            formula-based specification of the model in stages.

    Raises:
        ValueError: if a random term contains '*', '(' or ')'.
    '''
    data = self.data

    # Entries are popped below as terms are created; work on a shallow
    # copy so the caller's dict is left untouched.
    priors = {} if priors is None else dict(priors)

    if not append:
        self.reset()

    if fixed is not None:
        # Explicitly convert columns to category if desired--though this
        # can also be done within the formula using C().
        if categorical is not None:
            data = data.copy()
            cats = listify(categorical)
            data[cats] = data[cats].apply(lambda x: x.astype('category'))

        if '~' in fixed:
            y, X = dmatrices(fixed, data=data)
            y_label = y.design_info.term_names[0]
            self.add_y(y_label, family=family, link=link)
        else:
            X = dmatrix(fixed, data=data)

        # Loop over predictor terms
        for _name, _slice in X.design_info.term_name_slices.items():
            cols = X.design_info.column_names[_slice]
            term_data = pd.DataFrame(X[:, _slice], columns=cols)
            # Read the blanket 'fixed' prior with .get(): popping it as
            # the default would remove it after the first term, leaving
            # all later terms without it.
            prior = priors.pop(_name, priors.get('fixed', None))
            self.add_term(_name, data=term_data, prior=prior)

    # Random effects
    if random is not None:
        random = listify(random)
        for f in random:
            f = f.strip()
            kwargs = {'random': True}
            if re.search(r'[\*\(\)]+', f):
                raise ValueError("Random term '%s' contains an invalid "
                                 "character. Note that only the | and + "
                                 "operators are currently supported in "
                                 "random effects specifications." % f)

            # replace explicit intercept terms like '1|subj' with just
            # 'subj'
            f = re.sub(r'^1\s*\|(.*)', r'\1', f).strip()

            # Split specification into intercept, predictor, and grouper
            patt = r'^([01]+)*[\s\+]*([^\|]+)\|*(.*)'
            intcpt, pred, grpr = re.search(patt, f).groups()
            label = '{}|{}'.format(pred, grpr) if grpr else pred

            # Default to including random intercepts
            if intcpt is None:
                intcpt = 1
            intcpt = int(intcpt)

            # If there's no grouper, we must be adding random intercepts
            if not grpr:
                kwargs.update(dict(categorical=True, drop_first=False))
                variable = pred
            else:
                # If we're adding slopes, add random intercepts as well,
                # unless they were explicitly excluded
                if intcpt and grpr not in self.terms:
                    self.add_term(variable=grpr, categorical=True,
                                  random=True, drop_first=False)
                if self.data[pred].dtype.name in ['object', 'category']:
                    kwargs['categorical'] = True
                    if not intcpt:
                        kwargs['drop_first'] = False
                variable, kwargs['over'] = pred, grpr

            # Same lookup rule as for fixed terms; the result is passed
            # on to add_term() instead of being dropped.
            prior = priors.pop(label, priors.get('random', None))
            self.add_term(variable=variable, label=label, prior=prior,
                          **kwargs)