Ejemplo n.º 1
0
def test_formula_missing_extra_arrays():
    """Exercise missing-value handling of extra arrays on the formula path.

    patsy (as of 0.3.0) cannot switch off its own missing-data handling, so
    formula input goes through ``handle_formula_data`` first and through the
    regular ``handle_data`` step afterwards (see 2083).  With NaNs present in
    both endog and exog, three kinds of ``weights`` are sent through that
    pipeline:

    * a 1-d array containing a NaN,
    * a 2-d array containing symmetric NaNs,
    * an array of the wrong length, which has to raise ``ValueError``.
    """
    np.random.seed(1)
    nobs = 10

    # The order of the random draws below is significant: the values are
    # determined by the seed set above.
    response = np.random.randn(nobs)
    response_nan = response.copy()
    response_nan[[2, 5]] = np.nan

    regressor = np.random.randn(nobs)
    regressor_nan = regressor.copy()
    regressor_nan[[1, 3]] = np.nan

    w = np.random.uniform(size=nobs)
    w_nan = w.copy()
    w_nan[[6]] = np.nan

    w_bad_length = np.random.randn(12)

    frame = pandas.DataFrame.from_dict({
        'y': response,
        'X': regressor,
        'y_missing': response_nan,
        'X_missing': regressor_nan,
        'weights': w,
        'weights_missing': w_nan,
    })
    frame['constant'] = 1

    formula = 'y_missing ~ X_missing'

    # --- case 1: 1-d weights with a NaN ----------------------------------
    # handle_formula_data is called directly from this function on purpose;
    # ``depth`` is relative to the calling frame.
    parsed = handle_formula_data(frame, None, formula, depth=2,
                                 missing='drop')
    (endog, exog), missing_idx, _ = parsed

    result_1d = sm_data.handle_data(endog, exog,
                                    missing_idx=missing_idx,
                                    missing='drop',
                                    weights=frame['weights_missing'])

    # Rows with a NaN in any column are expected to be gone.
    complete_rows = frame.dropna()
    assert_equal(complete_rows['y'].values, result_1d.endog)
    assert_equal(complete_rows[['constant', 'X']].values, result_1d.exog)
    assert_equal(complete_rows['weights'].values, result_1d.weights)

    # --- case 2: 2-d weights with symmetric NaNs -------------------------
    parsed = handle_formula_data(frame, None, formula, depth=2,
                                 missing='drop')
    (endog, exog), missing_idx, _ = parsed

    w_square = np.random.randn(nobs, nobs)
    w_square[[8, 7], [7, 8]] = np.nan  # symmetric missing values

    result_2d = sm_data.handle_data(endog, exog,
                                    missing_idx=missing_idx,
                                    missing='drop',
                                    weights=w_square)

    # Rows 1, 2, 3, 5 hold NaNs in endog/exog, rows 7, 8 in the weights.
    keep = [0, 4, 6, 9]
    assert_equal(frame.loc[keep, 'y'], result_2d.endog)
    assert_equal(frame.loc[keep, ['constant', 'X']], result_2d.exog)
    # Both the rows and the columns of the 2-d weights are subset.
    assert_equal(w_square[keep][:, keep], result_2d.weights)

    # --- case 3: weights of the wrong length -----------------------------
    parsed = handle_formula_data(frame, None, formula, depth=2,
                                 missing='drop')
    (endog, exog), missing_idx, _ = parsed

    assert_raises(ValueError, sm_data.handle_data, endog, exog,
                  missing_idx=missing_idx,
                  missing='drop',
                  weights=w_bad_length)
Ejemplo n.º 2
0
def test_formula_missing_extra_arrays():
    """Formula-path test for NaNs in extra arrays such as ``weights``.

    Because patsy cannot turn off its missing-data handling as of 0.3.0,
    formulas pass through a ``handle_formula_data`` step followed by the
    regular ``handle_data`` step (see 2083).  This test covers weights that
    contain NaNs (1-d and 2-d) while endog/exog also contain NaNs, and
    weights whose length does not match the data, which must raise
    ``ValueError``.
    """
    np.random.seed(1)

    # All seeded draws up front, in a fixed order; the NaN-carrying copies
    # are derived afterwards and do not consume random numbers.
    y_full = np.random.randn(10)
    x_full = np.random.randn(10)
    wts_full = np.random.uniform(size=10)
    wts_too_long = np.random.randn(12)

    y_holes = y_full.copy()
    y_holes[[2, 5]] = np.nan
    x_holes = x_full.copy()
    x_holes[[1, 3]] = np.nan
    wts_holes = wts_full.copy()
    wts_holes[[6]] = np.nan

    columns = {}
    columns['y'] = y_full
    columns['X'] = x_full
    columns['y_missing'] = y_holes
    columns['X_missing'] = x_holes
    columns['weights'] = wts_full
    columns['weights_missing'] = wts_holes
    df = pandas.DataFrame.from_dict(columns)
    df['constant'] = 1

    formula = 'y_missing ~ X_missing'

    # 1-d weights with one NaN: the result must match the fully observed rows.
    (endog, exog), missing_idx, _unused = handle_formula_data(
        df, None, formula, depth=2, missing='drop')
    base_kwargs = dict(missing_idx=missing_idx,
                       missing='drop',
                       weights=df['weights_missing'])
    handled = sm_data.handle_data(endog, exog, **base_kwargs)

    observed = df.dropna()
    expected_endog = observed['y'].values
    expected_exog = observed[['constant', 'X']].values
    expected_weights = observed['weights'].values
    assert_equal(expected_endog, handled.endog)
    assert_equal(expected_exog, handled.exog)
    assert_equal(expected_weights, handled.weights)

    # 2-d weights with NaNs placed symmetrically at (8, 7) and (7, 8).
    (endog, exog), missing_idx, _unused = handle_formula_data(
        df, None, formula, depth=2, missing='drop')
    wts_matrix = np.random.randn(10, 10)
    wts_matrix[[8, 7], [7, 8]] = np.nan  # symmetric missing values
    matrix_kwargs = dict(base_kwargs,
                         weights=wts_matrix,
                         missing_idx=missing_idx)
    handled_matrix = sm_data.handle_data(endog, exog, **matrix_kwargs)

    # Indices left after removing rows with NaNs in endog, exog or weights.
    surviving = [0, 4, 6, 9]
    assert_equal(df.loc[surviving, 'y'], handled_matrix.endog)
    assert_equal(df.loc[surviving, ['constant', 'X']], handled_matrix.exog)
    assert_equal(wts_matrix[surviving][:, surviving], handled_matrix.weights)

    # Weights with 12 entries against 10 observations must be rejected.
    (endog, exog), missing_idx, _unused = handle_formula_data(
        df, None, formula, depth=2, missing='drop')
    bad_kwargs = dict(matrix_kwargs,
                      weights=wts_too_long,
                      missing_idx=missing_idx)
    assert_raises(ValueError, sm_data.handle_data, endog, exog, **bad_kwargs)
Ejemplo n.º 3
0
def test_formula_missing_extra_arrays():
    """Check missing-value handling of extra arrays when going through formulas.

    patsy cannot turn off its missing-data handling as of 0.3.0, so formula
    input passes through a ``handle_formula_data`` step and then the regular
    ``handle_data`` step (see 2083).  The cases covered here are:

    * endog/exog have missing values and 1-d ``weights`` has a missing value;
    * endog/exog have missing values and 2-d ``weights`` has symmetric
      missing values;
    * ``weights`` has the wrong length, which must raise ``ValueError``.
    """
    np.random.seed(1)

    y = np.random.randn(10)
    y_missing = y.copy()
    y_missing[[2, 5]] = np.nan
    X = np.random.randn(10)
    X_missing = X.copy()
    X_missing[[1, 3]] = np.nan

    weights = np.random.uniform(size=10)
    weights_missing = weights.copy()
    weights_missing[[6]] = np.nan

    weights_wrong_size = np.random.randn(12)

    data = {
        "y": y,
        "X": X,
        "y_missing": y_missing,
        "X_missing": X_missing,
        "weights": weights,
        "weights_missing": weights_missing,
    }
    data = pandas.DataFrame.from_dict(data)
    data["constant"] = 1

    formula = "y_missing ~ X_missing"

    # NOTE(review): some statsmodels versions appear to return a third element
    # (design_info) from handle_formula_data; the starred target accepts both
    # the two- and three-element forms -- confirm against the installed
    # version.
    (endog, exog), missing_idx, *_ = handle_formula_data(
        data, None, formula, depth=2, missing="drop"
    )

    kwargs = {"missing_idx": missing_idx, "missing": "drop", "weights": data["weights_missing"]}

    model_data = sm_data.handle_data(endog, exog, **kwargs)
    # Rows with a NaN in any column should have been dropped.
    data_nona = data.dropna()
    assert_equal(data_nona["y"].values, model_data.endog)
    assert_equal(data_nona[["constant", "X"]].values, model_data.exog)
    assert_equal(data_nona["weights"].values, model_data.weights)

    (endog, exog), missing_idx, *_ = handle_formula_data(
        data, None, formula, depth=2, missing="drop"
    )
    weights_2d = np.random.randn(10, 10)
    weights_2d[[8, 7], [7, 8]] = np.nan  # symmetric missing values
    kwargs.update({"weights": weights_2d, "missing_idx": missing_idx})

    model_data2 = sm_data.handle_data(endog, exog, **kwargs)

    # Rows 1, 2, 3, 5 have NaNs in endog/exog; rows 7, 8 have NaNs in weights.
    good_idx = [0, 4, 6, 9]
    # Label-based selection via .loc; DataFrame.ix is deprecated and no longer
    # available in current pandas releases.
    assert_equal(data.loc[good_idx, "y"], model_data2.endog)
    assert_equal(data.loc[good_idx, ["constant", "X"]], model_data2.exog)
    # 2-d weights are expected to be subset along both axes.
    assert_equal(weights_2d[good_idx][:, good_idx], model_data2.weights)

    (endog, exog), missing_idx, *_ = handle_formula_data(
        data, None, formula, depth=2, missing="drop"
    )
    kwargs.update({"weights": weights_wrong_size, "missing_idx": missing_idx})
    assert_raises(ValueError, sm_data.handle_data, endog, exog, **kwargs)