Ejemplo n.º 1
0
    def from_formula(cls, formula, data, window, weights=None, subset=None,
                     *args, **kwargs):
        """
        Create a model from a formula and a data container.

        Parameters
        ----------
        formula : str or generic Formula object
            The formula handed to ``patsy.dmatrices``.
        data : array_like
            The data handed to ``patsy.dmatrices``. It is indexed with
            ``data.loc[subset]`` when ``subset`` is given, so it must support
            ``.loc`` in that case.
        window : int
            Forwarded to ``cls`` as the ``window`` keyword.
        weights : array_like, optional
            Forwarded to ``cls`` as the ``weights`` keyword only when it is
            not None.
        subset : array_like, optional
            Row selector applied through ``data.loc`` before the design
            matrices are built.
        *args
            Extra positional arguments forwarded to ``cls``.
        **kwargs
            Extra keyword arguments forwarded to ``cls``. Two keys are
            inspected here:

            ``eval_env``
                Removed from ``kwargs`` before ``cls`` is called. ``None``
                (the default) becomes a depth of 2, ``-1`` becomes an empty
                ``patsy.EvalEnvironment``, a ``patsy.EvalEnvironment``
                instance is used as is, and anything else is treated as a
                stack depth and incremented by one.
            ``missing``
                Defaults to ``'skip'`` when absent and is always forwarded.

        Returns
        -------
        model
            The instance returned by ``cls(endog, exog, ...)``, with
            ``formula`` and ``data.frame`` attributes attached.

        Raises
        ------
        ValueError
            If the left-hand side of the formula evaluates to more than one
            column.
        """
        if subset is not None:
            data = data.loc[subset]
        eval_env = kwargs.pop('eval_env', None)
        if eval_env is None:
            eval_env = 2
        elif eval_env == -1:
            from patsy import EvalEnvironment
            eval_env = EvalEnvironment({})
        else:
            from patsy import EvalEnvironment
            # A ready-made EvalEnvironment is passed through untouched; only
            # a stack depth needs adjusting. Incrementing an EvalEnvironment
            # would raise TypeError.
            if not isinstance(eval_env, EvalEnvironment):
                eval_env += 1  # we're going down the stack again
        missing = kwargs.get('missing', 'skip')
        from patsy import dmatrices, NAAction
        # NA_types=[] tells patsy to treat no value as missing, so missing
        # data is left for the model's own ``missing`` handling.
        na_action = NAAction(on_NA='raise', NA_types=[])
        result = dmatrices(formula, data, eval_env, return_type='dataframe',
                           NA_action=na_action)

        endog, exog = result
        if (endog.ndim > 1 and endog.shape[1] > 1) or endog.ndim > 2:
            raise ValueError('endog has evaluated to an array with multiple '
                             'columns that has shape {0}. This occurs when '
                             'the variable converted to endog is non-numeric'
                             ' (e.g., bool or str).'.format(endog.shape))

        kwargs.update({'missing': missing,
                       'window': window})
        if weights is not None:
            kwargs['weights'] = weights
        mod = cls(endog, exog, *args, **kwargs)
        mod.formula = formula
        # since we got a dataframe, attach the original
        mod.data.frame = data
        return mod
def infer_discrete_state_transition_from_training_data(is_non_local,
                                                       penalty=1e-5):
    """Estimate a 2x2 transition matrix for the non-local state.

    A binomial model of ``is_non_local`` on its one-step lag is fitted with
    ``penalized_IRLS`` and then evaluated at lag values 0 and 1.

    Parameters
    ----------
    is_non_local : array_like
        Indicator series; must provide ``astype`` and be accepted by
        ``lagmat``.
    penalty : float, optional
        Penalty applied to every coefficient except the first design-matrix
        column (the ``1`` intercept term of the formula).

    Returns
    -------
    numpy.ndarray
        Array of the form ``[[1 - p0, p0], [1 - p1, p1]]`` where ``p0`` and
        ``p1`` are the predicted probabilities for a lagged value of 0 and 1
        respectively. NaN predictions are replaced by 0.0.
    """
    MODEL_FORMULA = 'is_non_local ~ 1 + lagged_is_non_local'

    current_state = is_non_local.astype(np.float64)
    previous_state = (
        lagmat(is_non_local, maxlag=1).astype(np.float64).squeeze())
    training_frame = pd.DataFrame({
        'is_non_local': current_state,
        'lagged_is_non_local': previous_state,
    })
    training_frame = training_frame.dropna()

    response, design_matrix = dmatrices(MODEL_FORMULA, training_frame)

    # One penalty weight per design column; the first column is left
    # unpenalized.
    n_columns = design_matrix.shape[1]
    penalty_weights = np.ones((n_columns, )) * penalty
    penalty_weights[0] = 0.0

    fit = penalized_IRLS(design_matrix,
                         response,
                         family=families.Binomial(),
                         penalty=penalty_weights)

    # Evaluate the fitted model at both possible lagged values.
    lag_grid = {'lagged_is_non_local': np.asarray([0, 1])}
    keep_all_rows = NAAction(NA_types=[])
    predict_design_matrix = build_design_matrices(
        [design_matrix.design_info], lag_grid, NA_action=keep_all_rows)[0]

    linear_predictor = predict_design_matrix @ np.squeeze(fit.coefficients)
    non_local_probability = families.Binomial().link.inverse(linear_predictor)

    nan_mask = np.isnan(non_local_probability)
    non_local_probability[nan_mask] = 0.0

    after_local = non_local_probability[0]
    after_non_local = non_local_probability[1]
    return np.asarray([[1 - after_local, after_local],
                       [1 - after_non_local, after_non_local]])
def make_design_matrix_no_speed(lagged_is_replay, lagged_speed, design_matrix):
    """Build a prediction design matrix that depends only on the replay lag.

    Parameters
    ----------
    lagged_is_replay : scalar or array_like
        Value multiplied against an array of ones shaped like
        ``lagged_speed``.
    lagged_speed : array_like
        Used only as the shape/dtype template for ``np.ones_like``; its
        values do not enter the result.
    design_matrix : patsy design matrix
        Source of the ``design_info`` used to rebuild the matrix.

    Returns
    -------
    The first (and only requested) matrix returned by
    ``build_design_matrices``.
    """
    replay_column = lagged_is_replay * np.ones_like(lagged_speed)
    predict_data = {'lagged_is_replay': replay_column}
    design_infos = [design_matrix.design_info]
    matrices = build_design_matrices(design_infos,
                                     predict_data,
                                     NA_action=NAAction(NA_types=[]))
    return matrices[0]
Ejemplo n.º 4
0
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Build the design matrices for a formula and report the missing-data mask.

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object
    depth : int or patsy.EvalEnvironment, optional
        Passed to ``dmatrices`` as its evaluation environment argument.
    missing : str, optional
        Passed to ``NAAction`` as ``on_NA``.

    Returns
    -------
    result : tuple
        Whatever ``dmatrices`` returns for ``return_type='dataframe'``.
    missing_mask : array-like or None
        The ``missing_mask`` attribute of the ``NAAction`` used, or None
        when that attribute is absent or has no truthy entry.

    Notes
    -----
    When ``formula`` is an instance of a class registered in
    ``formula_handler``, the registered value is returned directly and the
    two-element return described above does not apply.
    """
    # Minimal support for other formula objects.
    # NOTE(review): this returns the registered handler itself rather than
    # calling it; confirm that this is what callers expect.
    if isinstance(formula, tuple(iterkeys(formula_handler))):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    # The data argument is the (Y, X) pair when X is supplied, otherwise Y
    # alone. The call is otherwise the same for every input type.
    data = (Y, X) if X is not None else Y
    result = dmatrices(formula,
                       data,
                       depth,
                       return_type='dataframe',
                       NA_action=na_action)

    # if missing == 'raise' there is no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None
    return result, missing_mask
Ejemplo n.º 5
0
def InitializeTransformers(df):
    """Fit the text vectorizers and the patsy design builder on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame-like
        Must provide the columns ``titledescription`` and ``locationfull``,
        plus every column named in the salary formula below.

    Returns
    -------
    tuple
        ``(vect1, vect2, builder)``: a ``TfidfVectorizer`` fitted on
        ``titledescription``, a binary ``CountVectorizer`` fitted on
        ``locationfull``, and the ``design_info.builder`` of the right-hand
        side design matrix.
    """
    vect1 = TfidfVectorizer(max_features=3000)
    vect1.fit([str(x) for x in df['titledescription'].values])
    vect2 = CountVectorizer(binary=True)
    # Fit on the frame that was passed in, not on a module-level ``train``.
    vect2.fit([str(x) for x in df['locationfull'].values])
    # Only the right-hand side design matrix is needed for the builder.
    _, rhs = dmatrices('SalaryNormalized ~ ContractTime + ContractType + Company + Category + SourceName',
                       data=df, NA_action=NAAction(on_NA='drop', NA_types=[]))
    builder = rhs.design_info.builder
    return (vect1, vect2, builder)
Ejemplo n.º 6
0
def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
    """
    Build design matrices from data and a formula.

    Parameters
    ----------
    Y : array-like
        Either endog (the LHS) of a model specification or all of the data.
        Y must define __getitem__ for now.
    X : array-like
        Either exog or None. If all the data for the formula is provided in
        Y then you must explicitly set X to None.
    formula : str or patsy.model_desc
        You can pass a handler by import formula_handler and adding a
        key-value pair where the key is the formula object class and
        the value is a function that returns endog, exog, formula object
    depth : int or patsy.EvalEnvironment, optional
        Passed to ``dmatrices`` as its evaluation environment argument.
    missing : str, optional
        Passed to ``NAAction`` as ``on_NA``.

    Returns
    -------
    result : tuple
        The value returned by ``dmatrices`` with ``return_type='dataframe'``.
    missing_mask : array-like or None
        The ``missing_mask`` attribute of the ``NAAction`` used, or None
        when that attribute is absent or has no truthy entry.
    design_info : object or None
        ``result[1].design_info`` when ``result`` has more than one element,
        otherwise None.

    Notes
    -----
    When ``formula`` is an instance of a class registered in
    ``formula_handler``, the registered value is returned directly instead
    of the three-element tuple.
    """
    # Minimal support for other formula objects.
    handled_types = tuple(iterkeys(formula_handler))
    if isinstance(formula, handled_types):
        return formula_handler[type(formula)]

    na_action = NAAction(on_NA=missing)

    yxtup = Y if X is None else (Y, X)
    result = dmatrices(formula,
                       yxtup,
                       depth,
                       return_type='dataframe',
                       NA_action=na_action)

    # if missing == 'raise' there is no missing_mask
    missing_mask = getattr(na_action, 'missing_mask', None)
    if not np.any(missing_mask):
        missing_mask = None

    # Detach the RHS design_info from the DataFrame when a RHS design exists.
    # NOTE: is there ever a case where we'd need LHS design_info?
    design_info = result[1].design_info if len(result) > 1 else None
    return result, missing_mask, design_info
Ejemplo n.º 7
0
    def from_formula(cls,
                     formula,
                     data,
                     window,
                     weights=None,
                     subset=None,
                     *args,
                     **kwargs):
        """
        Create a model from a formula and a data container.

        Parameters
        ----------
        formula : str or generic Formula object
            The formula handed to ``patsy.dmatrices``.
        data : array_like
            The data handed to ``patsy.dmatrices``. It is indexed with
            ``data.loc[subset]`` when ``subset`` is given, so it must support
            ``.loc`` in that case.
        window : int
            Forwarded to ``cls`` as the ``window`` keyword.
        weights : array_like, optional
            Forwarded to ``cls`` as the ``weights`` keyword only when it is
            not None.
        subset : array_like, optional
            Row selector applied through ``data.loc`` before the design
            matrices are built.
        *args
            Extra positional arguments forwarded to ``cls``.
        **kwargs
            Extra keyword arguments forwarded to ``cls``. Two keys are
            inspected here:

            ``eval_env``
                Removed from ``kwargs`` before ``cls`` is called. ``None``
                (the default) becomes a depth of 2, ``-1`` becomes an empty
                ``patsy.EvalEnvironment``, a ``patsy.EvalEnvironment``
                instance is used as is, and anything else is treated as a
                stack depth and incremented by one.
            ``missing``
                Defaults to ``"skip"`` when absent and is always forwarded.

        Returns
        -------
        model
            The instance returned by ``cls(endog, exog, ...)``, with
            ``formula`` and ``data.frame`` attributes attached.

        Raises
        ------
        ValueError
            If the left-hand side of the formula evaluates to more than one
            column.
        """
        if subset is not None:
            data = data.loc[subset]
        eval_env = kwargs.pop("eval_env", None)
        if eval_env is None:
            eval_env = 2
        elif eval_env == -1:
            from patsy import EvalEnvironment

            eval_env = EvalEnvironment({})
        else:
            from patsy import EvalEnvironment

            # A ready-made EvalEnvironment is passed through untouched; only
            # a stack depth needs adjusting. Incrementing an EvalEnvironment
            # would raise TypeError.
            if not isinstance(eval_env, EvalEnvironment):
                eval_env += 1  # we're going down the stack again
        missing = kwargs.get("missing", "skip")
        from patsy import NAAction, dmatrices

        # NA_types=[] tells patsy to treat no value as missing, so missing
        # data is left for the model's own ``missing`` handling.
        na_action = NAAction(on_NA="raise", NA_types=[])
        result = dmatrices(
            formula,
            data,
            eval_env,
            return_type="dataframe",
            NA_action=na_action,
        )

        endog, exog = result
        if (endog.ndim > 1 and endog.shape[1] > 1) or endog.ndim > 2:
            raise ValueError("endog has evaluated to an array with multiple "
                             "columns that has shape {0}. This occurs when "
                             "the variable converted to endog is non-numeric"
                             " (e.g., bool or str).".format(endog.shape))

        kwargs.update({"missing": missing, "window": window})
        if weights is not None:
            kwargs["weights"] = weights
        mod = cls(endog, exog, *args, **kwargs)
        mod.formula = formula
        # since we got a dataframe, attach the original
        mod.data.frame = data
        return mod