def from_formula(cls, formula, data, window, weights=None, subset=None,
                 *args, **kwargs):
    """
    Create a model from a formula and a dataframe.

    Parameters
    ----------
    cls : type
        Model class to build; it is called as
        ``cls(endog, exog, *args, **kwargs)``.
    formula : str or formula object
        Formula handed to ``patsy.dmatrices``.
    data : DataFrame-like
        Data for the formula. It must support ``.loc`` indexing when
        `subset` is given.
    window : int
        Forwarded to the model constructor as the ``window`` keyword.
    weights : array_like, optional
        Forwarded to the model constructor as ``weights`` when not None.
    subset : array_like, optional
        Row selector applied as ``data.loc[subset]`` before the design
        matrices are built.
    *args
        Extra positional arguments for the model constructor.
    **kwargs
        Extra keyword arguments for the model constructor. Two keys are
        interpreted here:

        ``eval_env``
            Removed from `kwargs`. ``None`` means a stack depth of 2,
            ``-1`` means a clean ``EvalEnvironment({})``, an
            ``EvalEnvironment`` instance is used unchanged, and any other
            value is treated as a stack depth and incremented by one.
        ``missing``
            Defaults to ``'skip'`` and is forwarded to the constructor.

    Returns
    -------
    model
        The instance returned by `cls`, with ``formula`` and
        ``data.frame`` attributes attached.

    Raises
    ------
    ValueError
        If the left-hand side of the formula evaluates to more than one
        column.
    """
    # patsy is imported lazily, as in the original implementation.
    from patsy import EvalEnvironment, NAAction, dmatrices

    if subset is not None:
        data = data.loc[subset]

    eval_env = kwargs.pop('eval_env', None)
    if eval_env is None:
        eval_env = 2
    elif isinstance(eval_env, EvalEnvironment):
        # Already a concrete environment: there is no depth to adjust, and
        # ``eval_env += 1`` would raise TypeError on this object.
        pass
    elif eval_env == -1:
        eval_env = EvalEnvironment({})
    else:
        eval_env += 1  # we're going down the stack again

    missing = kwargs.get('missing', 'skip')

    # With an empty NA_types list patsy treats no value as missing, so rows
    # are neither dropped nor rejected here; missing data is left for the
    # model to handle through its ``missing`` keyword.
    na_action = NAAction(on_NA='raise', NA_types=[])
    result = dmatrices(formula, data, eval_env, return_type='dataframe',
                       NA_action=na_action)

    endog, exog = result
    if (endog.ndim > 1 and endog.shape[1] > 1) or endog.ndim > 2:
        raise ValueError('endog has evaluated to an array with multiple '
                         'columns that has shape {0}. This occurs when '
                         'the variable converted to endog is non-numeric'
                         ' (e.g., bool or str).'.format(endog.shape))

    kwargs.update({'missing': missing, 'window': window})
    if weights is not None:
        kwargs['weights'] = weights
    mod = cls(endog, exog, *args, **kwargs)
    mod.formula = formula
    # since we got a dataframe, attach the original
    mod.data.frame = data
    return mod
def infer_discrete_state_transition_from_training_data(is_non_local,
                                                       penalty=1e-5):
    """Estimate a 2x2 state transition matrix by penalized logistic regression.

    The current value of `is_non_local` is regressed on an intercept and
    its own value lagged by one step. The fitted model is then evaluated
    at a lagged value of 0 and of 1.

    Parameters
    ----------
    is_non_local : array-like
        State indicator over time. It must provide ``astype`` and be
        accepted by ``lagmat``.
    penalty : float, optional
        Penalty applied to every coefficient except the first, which is
        left unpenalized.

    Returns
    -------
    ndarray
        Row ``i`` is ``[1 - p_i, p_i]``, where ``p_i`` is the predicted
        probability of the non-local state given a lagged value of ``i``.
        NaN predictions are replaced by 0.0 before the rows are built.
    """
    current_state = is_non_local.astype(np.float64)
    previous_state = (
        lagmat(is_non_local, maxlag=1).astype(np.float64).squeeze())
    training_frame = pd.DataFrame({
        'is_non_local': current_state,
        'lagged_is_non_local': previous_state,
    })
    training_frame = training_frame.dropna()

    formula = 'is_non_local ~ 1 + lagged_is_non_local'
    response, design_matrix = dmatrices(formula, training_frame)

    # One penalty per design column; the first column is not penalized.
    n_coefficients = design_matrix.shape[1]
    coefficient_penalty = np.ones((n_coefficients, )) * penalty
    coefficient_penalty[0] = 0.0

    fit = penalized_IRLS(
        design_matrix,
        response,
        family=families.Binomial(),
        penalty=coefficient_penalty,
    )

    # Evaluate the fitted model at both possible lagged states.
    lagged_states = {
        'lagged_is_non_local': np.asarray([0, 1]),
    }
    prediction_matrices = build_design_matrices(
        [design_matrix.design_info],
        lagged_states,
        NA_action=NAAction(NA_types=[]),
    )
    linear_predictor = (
        prediction_matrices[0] @ np.squeeze(fit.coefficients))
    probability = families.Binomial().link.inverse(linear_predictor)
    probability[np.isnan(probability)] = 0.0

    return np.asarray(
        [[1 - probability[row], probability[row]] for row in (0, 1)])
def make_design_matrix_no_speed(lagged_is_replay, lagged_speed,
                                design_matrix):
    """Build a prediction design matrix with a constant replay indicator.

    Parameters
    ----------
    lagged_is_replay : scalar or array-like
        Value multiplied by an array of ones shaped like `lagged_speed`.
    lagged_speed : array-like
        Only its shape and dtype are used, through ``np.ones_like``; its
        values never enter the design matrix.
    design_matrix : object with a ``design_info`` attribute
        Its ``design_info`` defines the columns of the new matrix.

    Returns
    -------
    The first (and only requested) matrix returned by
    ``build_design_matrices``.
    """
    replay_column = lagged_is_replay * np.ones_like(lagged_speed)
    matrices = build_design_matrices(
        [design_matrix.design_info],
        {'lagged_is_replay': replay_column},
        NA_action=NAAction(NA_types=[]),
    )
    return matrices[0]
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Build the design matrices for a formula and report the missing mask.

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object
    depth : int
        Passed to ``dmatrices`` as its evaluation-environment argument.
    missing : str
        Passed to ``NAAction`` as ``on_NA``.

    Returns
    -------
    result
        Whatever ``dmatrices`` returns for ``return_type='dataframe'``
        (unpacked by callers as the endog and exog design matrices).
    missing_mask : array-like or None
        The ``missing_mask`` attribute of the NA action, or None when the
        attribute is absent or contains no truthy entry.

    Notes
    -----
    When `formula` is an instance of a class registered in
    ``formula_handler``, the registered handler itself is returned instead
    of the tuple described above.
    """
    # Rough attempt to handle other formula objects.
    # NOTE(review): this returns the registered handler without calling it;
    # confirm whether callers expect the handler's result instead.
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    # The dmatrices call is the same whether or not the inputs are pandas
    # objects; only the data argument depends on X being supplied.
    data = Y if X is None else (Y, X)
    result = dmatrices(formula, data, depth, return_type='dataframe',
                       NA_action=na_action)

    # if missing == 'raise' there's not missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    return result, missing_mask
def InitializeTransformers(df):
    """Fit the text vectorizers and the categorical design builder on `df`.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``titledescription``, ``locationfull``,
        ``SalaryNormalized``, ``ContractTime``, ``ContractType``,
        ``Company``, ``Category`` and ``SourceName``.

    Returns
    -------
    tuple
        ``(vect1, vect2, builder)``: a ``TfidfVectorizer`` fitted on
        ``titledescription``, a binary ``CountVectorizer`` fitted on
        ``locationfull``, and the ``design_info.builder`` of the
        right-hand-side design matrix.
    """
    vect1 = TfidfVectorizer(max_features=3000)
    vect1.fit([str(x) for x in df['titledescription'].values])

    # Fit on the frame that was passed in. The previous code read a global
    # named ``train`` here, ignoring the ``df`` argument.
    vect2 = CountVectorizer(binary=True)
    vect2.fit([str(x) for x in df['locationfull'].values])

    # Only the right-hand-side matrix is needed, for its design info.
    _, rhs = dmatrices(
        'SalaryNormalized ~ ContractTime + ContractType + Company + Category + SourceName',
        data=df,
        NA_action=NAAction(on_NA='drop', NA_types=[]))
    builder = rhs.design_info.builder
    return (vect1, vect2, builder)
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Build the design matrices for a formula from arrays.

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        A handler can be registered by adding a key-value pair to
        ``formula_handler``, where the key is the formula object class and
        the value is a function that returns endog, exog, formula object.
    depth : int
        Passed to ``dmatrices`` as its evaluation-environment argument.
    missing : str
        Passed to ``NAAction`` as ``on_NA``.

    Returns
    -------
    result
        Whatever ``dmatrices`` returns for ``return_type='dataframe'``.
    missing_mask : array-like or None
        The ``missing_mask`` attribute of the NA action, or None when the
        attribute is absent or contains no truthy entry.
    design_info
        ``design_info`` of the second element of `result`, or None when
        `result` has a single element.

    Notes
    -----
    When `formula` is an instance of a class registered in
    ``formula_handler``, the registered handler itself is returned instead
    of the tuple described above.
    """
    # Rough attempt to handle other formula objects.
    handled_types = tuple(iterkeys(formula_handler))
    if isinstance(formula, handled_types):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)
    data = Y if X is None else (Y, X)
    result = dmatrices(formula, data, depth, return_type='dataframe',
                       NA_action=na_action)

    # With missing == 'raise' the NA action carries no missing_mask.
    mask = getattr(na_action, 'missing_mask', None)
    missing_mask = mask if np.any(mask) else None

    # Detach the RHS design info from the DataFrame when an RHS exists.
    # NOTE: is there ever a case where we'd need LHS design_info?
    design_info = result[1].design_info if len(result) > 1 else None
    return result, missing_mask, design_info
def from_formula(cls, formula, data, window, weights=None, subset=None,
                 *args, **kwargs):
    """
    Create a model from a formula and a dataframe.

    Parameters
    ----------
    cls : type
        Model class to build; it is called as
        ``cls(endog, exog, *args, **kwargs)``.
    formula : str or formula object
        Formula handed to ``patsy.dmatrices``.
    data : DataFrame-like
        Data for the formula. It must support ``.loc`` indexing when
        `subset` is given.
    window : int
        Forwarded to the model constructor as the ``window`` keyword.
    weights : array_like, optional
        Forwarded to the model constructor as ``weights`` when not None.
    subset : array_like, optional
        Row selector applied as ``data.loc[subset]`` before the design
        matrices are built.
    *args
        Extra positional arguments for the model constructor.
    **kwargs
        Extra keyword arguments for the model constructor. Two keys are
        interpreted here:

        ``eval_env``
            Removed from `kwargs`. ``None`` means a stack depth of 2,
            ``-1`` means a clean ``EvalEnvironment({})``, an
            ``EvalEnvironment`` instance is used unchanged, and any other
            value is treated as a stack depth and incremented by one.
        ``missing``
            Defaults to ``"skip"`` and is forwarded to the constructor.

    Returns
    -------
    model
        The instance returned by `cls`, with ``formula`` and
        ``data.frame`` attributes attached.

    Raises
    ------
    ValueError
        If the left-hand side of the formula evaluates to more than one
        column.
    """
    # patsy is imported lazily, as in the original implementation.
    from patsy import EvalEnvironment, NAAction, dmatrices

    if subset is not None:
        data = data.loc[subset]

    eval_env = kwargs.pop("eval_env", None)
    if eval_env is None:
        eval_env = 2
    elif isinstance(eval_env, EvalEnvironment):
        # Already a concrete environment: there is no depth to adjust, and
        # ``eval_env += 1`` would raise TypeError on this object.
        pass
    elif eval_env == -1:
        eval_env = EvalEnvironment({})
    else:
        eval_env += 1  # we're going down the stack again

    missing = kwargs.get("missing", "skip")

    # With an empty NA_types list patsy treats no value as missing, so rows
    # are neither dropped nor rejected here; missing data is left for the
    # model to handle through its ``missing`` keyword.
    na_action = NAAction(on_NA="raise", NA_types=[])
    result = dmatrices(
        formula,
        data,
        eval_env,
        return_type="dataframe",
        NA_action=na_action,
    )

    endog, exog = result
    if (endog.ndim > 1 and endog.shape[1] > 1) or endog.ndim > 2:
        raise ValueError("endog has evaluated to an array with multiple "
                         "columns that has shape {0}. This occurs when "
                         "the variable converted to endog is non-numeric"
                         " (e.g., bool or str).".format(endog.shape))

    kwargs.update({"missing": missing, "window": window})
    if weights is not None:
        kwargs["weights"] = weights
    mod = cls(endog, exog, *args, **kwargs)
    mod.formula = formula
    # since we got a dataframe, attach the original
    mod.data.frame = data
    return mod