Esempio n. 1
0
def test_stateful_transform():
    """Fitted center() must subtract the training mean, not the test mean."""
    train = patsy.demo_data("x1", "x2", "y")
    train['x1'][:] = 1  # mean of x1 in the training data becomes 1
    test = patsy.demo_data("x1", "x2", "y")
    test['x1'][:] = 0

    transformer = PatsyTransformer("center(x1) + x2")
    transformer.fit(train)
    transformed = transformer.transform(test)
    # every test value is 0, so 0 - training mean (1) == -1 in column 0
    assert_array_equal(transformed[:, 0], -1)
Esempio n. 2
0
def test_stateful_transform():
    """center(x1) must reuse the mean learned during fit() on new data."""
    data_train = patsy.demo_data("x1", "x2", "y")
    data_test = patsy.demo_data("x1", "x2", "y")
    data_train['x1'][:] = 1  # fit-time mean of x1 is therefore 1
    data_test['x1'][:] = 0

    est = PatsyTransformer("center(x1) + x2")
    est.fit(data_train)
    # column 0 holds centered x1: 0 minus the training mean of 1
    assert_array_equal(est.transform(data_test)[:, 0], -1)
Esempio n. 3
0
 def test_proflogit_with_patsy_demo_data_no_intercept(self):
     """
     Test on simple demo data from patsy w/o intercept.
     """
     # demo_data: returns a dict
     # categorical variables are returned as a list of strings.
     # Numerical data sampled from a normal distribution (fixed seed)
     rng = np.random.RandomState(42)
     data = patsy.demo_data("a", "b", "x1", "x2", nlevels=3)
     # Seeded RNG makes the binary target reproducible across runs.
     y = rng.randint(2, size=len(data["a"]))
     # dmatrix: to create the design matrix alone (no left-hand side)
     # "- 1" drops patsy's implicit intercept column from the design matrix.
     X = patsy.dmatrix("a + b + x1 + x2 - 1", data)
     pfl = ProfLogitCCP(
         rga_kws={
             "niter": 10,  # small iteration budget keeps the test fast
             "disp": False,
             "random_state": 42
         },
         intercept=False,
     )
     pfl.fit(X, y)
     # Expected coefficients/objective are regression values pinned from a
     # previous seeded, known-good run.
     npt.assert_array_almost_equal(
         pfl.rga.res.x,
         [0.27466536, 0.0, -0.24030505, 0.0, 0.0, -0.82215168, 0.0],
     )
     self.assertAlmostEqual(pfl.rga.res.fun, 12.310732234783764)
     empc_score = pfl.score(X, y)
     self.assertAlmostEqual(empc_score, 12.4444444445)
Esempio n. 4
0
 def test_proflogit_with_patsy_build_in_transformation_functions(self):
     """Test patsy build-in transformation functions."""
     # demo_data: returns a dict
     # Categorical variables are returned as a list of strings.
     # Numerical data sampled from a normal distribution (fixed seed)
     rng = np.random.RandomState(42)
     data = patsy.demo_data("a", "b", "x1", "x2", nlevels=3)
     # Seeded RNG makes the binary target reproducible across runs.
     y = rng.randint(2, size=len(data["a"]))
     # dmatrix: to create the design matrix alone (no left-hand side)
     # Important that `data` can be indexed like a Python dictionary,
     # e.g., `data[varname]`. It can also be a pandas.DataFrame
     # Strings and booleans are treated as categorical variables, where
     # the first level is the baseline.
     # standardize() is a patsy built-in stateful transform applied to the
     # numeric columns inside the formula itself.
     X = patsy.dmatrix(
         "a + b + standardize(x1) + standardize(x2)",
         data,
     )
     pfl = ProfLogitCCP(rga_kws={
         "niter": 10,  # small iteration budget keeps the test fast
         "disp": False,
         "random_state": 42
     }, )
     pfl.fit(X, y)
     # Expected values pinned from a previous seeded, known-good run.
     npt.assert_array_almost_equal(
         pfl.rga.res.x,
         [0.71321495, 0.0, -0.6815996, 0.0, 0.0, -0.92505635, 0.0],
     )
     self.assertAlmostEqual(pfl.rga.res.fun, 12.2837788495)
     empc_score = pfl.score(X, y)
     self.assertAlmostEqual(empc_score, 12.4444444445)
Esempio n. 5
0
def test_error_on_y_transform():
    """A formula with a left-hand side must be rejected by the transformer."""
    data = patsy.demo_data("x1", "x2", "x3", "y")
    est = PatsyTransformer("y ~ x1 + x2")
    expected = ("encountered outcome variables for a model"
                " that does not expect them")
    # both entry points must fail with the same patsy error
    for method in (est.fit, est.fit_transform):
        assert_raise_message(patsy.PatsyError, expected, method, data)
Esempio n. 6
0
def test_error_on_y_transform():
    """PatsyTransformer must raise when the formula names an outcome."""
    data = patsy.demo_data("x1", "x2", "x3", "y")
    est = PatsyTransformer("y ~ x1 + x2")
    expected_msg = (
        "encountered outcome variables for a model"
        " that does not expect them"
    )
    # fit and fit_transform go through the same formula validation
    assert_raise_message(patsy.PatsyError, expected_msg, est.fit, data)
    assert_raise_message(patsy.PatsyError, expected_msg,
                         est.fit_transform, data)
Esempio n. 7
0
 def test_proflogit_with_patsy_demo_data(self):
     """Test on simple categorical/numerical demo data from patsy."""
     # demo_data: returns a dict
     # Categorical variables are returned as a list of strings.
     # Numerical data sampled from a normal distribution (fixed seed)
     rng = np.random.RandomState(42)
     data = patsy.demo_data("a", "b", "x1", "x2", nlevels=3)
     # Seeded RNG makes the binary target reproducible across runs.
     y = rng.randint(2, size=len(data["a"]))
     # dmatrix: to create the design matrix alone (no left-hand side)
     # Important that `data` can be indexed like a Python dictionary,
     # e.g., `data[varname]`. It can also be a pandas.DataFrame
     X = patsy.dmatrix("a + b + x1 + x2", data)
     pfl = ProfLogitCCP(rga_kws={
         "niter": 10,  # small iteration budget keeps the test fast
         "disp": False,
         "random_state": 42
     }, )
     pfl.fit(X, y)
     # Expected values pinned from a previous seeded, known-good run.
     npt.assert_array_almost_equal(
         pfl.rga.res.x,
         [
             0.26843982,  # Intercept
             0.0,  # Categorical variable 'a' - level a2
             -0.21947001,  # Categorical variable 'a' - level a3
             0.12036944,  # Categorical variable 'b' - level b2
             0.0,  # Categorical variable 'b' - level b3
             -0.47514314,  # Numeric variable 'x1'
             -0.08812723,  # Numeric variable 'x2'
         ],
     )
     self.assertAlmostEqual(pfl.rga.res.fun, 12.3541334628)
     empc_score = pfl.score(X, y)
     self.assertAlmostEqual(empc_score, 12.4444444445)
Esempio n. 8
0
def test_stateful_transform_dataframe():
    """DataFrame output: centering must apply the mean seen at fit time."""
    train = pd.DataFrame(patsy.demo_data("x1", "x2", "y"))
    test = pd.DataFrame(patsy.demo_data("x1", "x2", "y"))
    train['x1'][:] = 1  # training mean of x1 is 1
    test['x1'][:] = 0

    est = PatsyTransformer("center(x1) + x2", return_type='dataframe')
    est.fit(train)
    result = est.transform(test)

    # the transformer must hand back a pandas DataFrame
    assert type(result) is pd.DataFrame

    # 0 - fit-time mean (1) == -1: training statistics were applied
    assert_array_equal(result['center(x1)'][:], -1)
Esempio n. 9
0
def test_stateful_model():
    """PatsyModel must apply fit-time centering when predicting."""
    train = patsy.demo_data("x1", "x2", "y")
    train['x1'][:] = 1  # mean of x1 during fit is 1
    test = patsy.demo_data("x1", "x2", "y")
    test['x1'][:] = 0

    model = PatsyModel(CheckingClassifier(), "y ~ center(x1) + x2")
    model.fit(train)

    def assert_train_mean_removed(X):
        # every test row becomes 0 - 1 == -1 in the centered column
        return np.all(X[:, 0] == -1)

    model.estimator_.check_X = assert_train_mean_removed
    # predict triggers the check on the freshly transformed test data
    model.predict(test)
Esempio n. 10
0
def test_stateful_transform_dataframe():
    """center() in dataframe mode keeps the statistics learned on fit data."""
    data_train = pd.DataFrame(patsy.demo_data("x1", "x2", "y"))
    data_train['x1'][:] = 1  # mean of x1 is 1 in the training frame
    data_test = pd.DataFrame(patsy.demo_data("x1", "x2", "y"))
    data_test['x1'][:] = 0

    transformer = PatsyTransformer("center(x1) + x2",
                                   return_type='dataframe')
    transformer.fit(data_train)
    data_trans = transformer.transform(data_test)

    # output container must be a DataFrame, not a plain array
    assert type(data_trans) is pd.DataFrame

    # the *training* mean (1), not the test mean (0), was subtracted
    assert_array_equal(data_trans['center(x1)'][:], -1)
Esempio n. 11
0
def test_stateful_model():
    """Prediction must reuse the center() statistics from training."""
    data_train = patsy.demo_data("x1", "x2", "y")
    data_train['x1'][:] = 1  # training mean of x1: 1
    data_test = patsy.demo_data("x1", "x2", "y")
    data_test['x1'][:] = 0

    est = PatsyModel(CheckingClassifier(), "y ~ center(x1) + x2")
    est.fit(data_train)

    # CheckingClassifier evaluates this on the design matrix at predict time;
    # 0 - training mean (1) == -1 proves fit-time centering was reused
    est.estimator_.check_X = lambda X: np.all(X[:, 0] == -1)
    est.predict(data_test)
Esempio n. 12
0
def test_intercept_transformer():
    """add_intercept controls whether a constant column is prepended."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    # default: only the two requested features, no intercept column
    no_icept = PatsyTransformer("x1 + x2")
    no_icept.fit(data)
    assert_equal(no_icept.transform(data).shape[1], 2)

    # add_intercept=True: leading all-ones column plus the two features
    with_icept = PatsyTransformer("x1 + x2", add_intercept=True)
    with_icept.fit(data)
    transformed = with_icept.transform(data)
    assert_array_equal(transformed[:, 0], 1)
    assert_equal(with_icept.transform(data).shape[1], 3)
Esempio n. 13
0
def test_intercept_transformer():
    """Check the intercept-column handling of PatsyTransformer."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    # without add_intercept, X holds just x1 and x2
    est = PatsyTransformer("x1 + x2")
    est.fit(data)
    n_cols = est.transform(data).shape[1]
    assert_equal(n_cols, 2)

    # with add_intercept, a constant first column appears
    est = PatsyTransformer("x1 + x2", add_intercept=True)
    est.fit(data)
    data_transformed = est.transform(data)
    assert_array_equal(data_transformed[:, 0], 1)
    assert_equal(est.transform(data).shape[1], 3)
Esempio n. 14
0
def test_scope_model():
    """Functions from the caller's scope must be usable inside formulas."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    def myfunc(x):
        # constant-valued transform that is easy to detect downstream
        out = np.ones_like(x)
        out.fill(42)
        return out

    def check_X(X):
        return np.all(X[:, 1] == 42)

    # CheckingClassifier raises unless check_X returns True, which proves
    # that myfunc was actually applied when the design matrix was built.
    est = PatsyModel(CheckingClassifier(check_X=check_X),
                     "y ~ x1 + myfunc(x2)")
    est.fit(data)
Esempio n. 15
0
def test_scope_transformer():
    """Locally defined functions are resolved inside transformer formulas."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    def myfunc(x):
        filled = np.ones_like(x)
        filled.fill(42)
        return filled

    # fit followed by a separate transform call
    est = PatsyTransformer("x1 + myfunc(x2)")
    est.fit(data)
    assert_array_equal(est.transform(data)[:, 1], 42)

    # fit_transform in a single step must behave the same
    est = PatsyTransformer("x1 + myfunc(x2)")
    assert_array_equal(est.fit_transform(data)[:, 1], 42)
Esempio n. 16
0
def test_scope_transformer():
    """Scope lookup of myfunc works for both transform code paths."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    def myfunc(x):
        result = np.ones_like(x)
        result.fill(42)
        return result

    # first the fit + transform path, then the fit_transform shortcut
    for separate_fit in (True, False):
        est = PatsyTransformer("x1 + myfunc(x2)")
        if separate_fit:
            est.fit(data)
            data_trans = est.transform(data)
        else:
            data_trans = est.fit_transform(data)
        assert_array_equal(data_trans[:, 1], 42)

    # the formula terms show up verbatim as feature names
    assert_equal(est.feature_names_, ["x1", "myfunc(x2)"])
Esempio n. 17
0
def test_scope_model():
    """PatsyModel resolves caller-scope functions and exposes feature names."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    def myfunc(x):
        constant = np.ones_like(x)
        constant.fill(42)
        return constant

    def check_X(X):
        # column 1 must carry the transformed (constant 42) values
        return np.all(X[:, 1] == 42)

    # fit only succeeds if myfunc was applied while building the matrix;
    # CheckingClassifier raises when check_X returns False
    est = PatsyModel(CheckingClassifier(check_X=check_X), "y ~ x1 + myfunc(x2)")
    est.fit(data)

    # feature names mirror the right-hand side of the formula
    assert_equal(est.feature_names_, ["x1", "myfunc(x2)"])
Esempio n. 18
0
def test_intercept_model():
    """add_intercept on PatsyModel adds a leading all-ones column."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    def check_X_no_intercept(X):
        return X.shape[1] == 2

    # default: the design matrix carries only x1 and x2
    est = PatsyModel(CheckingClassifier(check_X=check_X_no_intercept),
                     "y ~ x1 + x2")
    est.fit(data)
    est.predict(data)  # predict re-runs the check on freshly built X

    def check_X_intercept(X):
        # three columns, the first of which is constant one
        return X.shape[1] == 3 and np.all(X[:, 0] == 1)

    # add_intercept=True: an intercept column is prepended
    est = PatsyModel(CheckingClassifier(check_X=check_X_intercept),
                     "y ~ x1 + x2",
                     add_intercept=True)
    est.fit(data)
    est.predict(data)
Esempio n. 19
0
def test_intercept_model():
    """Design-matrix shape with and without an explicit intercept."""
    data = patsy.demo_data("x1", "x2", "x3", "y")

    # without an intercept, X must consist of exactly the two features
    est = PatsyModel(
        CheckingClassifier(check_X=lambda X: X.shape[1] == 2),
        "y ~ x1 + x2",
    )
    est.fit(data)
    est.predict(data)  # the check also applies to new data

    def check_X_intercept(X):
        has_three_cols = X.shape[1] == 3
        leading_ones = np.all(X[:, 0] == 1)
        return has_three_cols and leading_ones

    # with add_intercept=True, a constant column is placed first
    est = PatsyModel(
        CheckingClassifier(check_X=check_X_intercept),
        "y ~ x1 + x2",
        add_intercept=True,
    )
    est.fit(data)
    est.predict(data)
Esempio n. 20
0
def ser_types(ser):
    """Return the distinct Python type names of the values held in *ser*."""
    # iterate the Series values directly; the set collapses duplicates
    seen = {str(type(value)) for value in ser}
    return list(seen)


def df_coltypes(df):
    """Summarize each column's pandas dtype and its Python value types."""
    dtype_frame = pd.DataFrame(df.dtypes, columns=["PandaType"])
    pytype_frame = pd.DataFrame(df.apply(ser_types), columns=["PythonTypes"])
    # outer join on the column index keeps every column from either side
    merged = pd.merge(
        dtype_frame,
        pytype_frame,
        how="outer",
        left_index=True,
        right_index=True,
    )
    return merged


# Build a patsy demo DataFrame, summarize its column types, and write the
# data out as CSV in the user's home directory.
data = patsy.demo_data('city',
                       'state',
                       'population',
                       'xLocation',
                       'yLatitude',
                       min_rows=100)
df = pd.DataFrame(data)
x = df_coltypes(df)

# Renamed from `dir`, which shadowed the builtin of the same name.
home_dir = os.path.expanduser("~")
fpath = os.path.join(home_dir, "sample.csv")
df.to_csv(fpath)
print("done")
Esempio n. 21
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from patsy import demo_data
from LM import LM
import numpy as np

# Demo: fit the LM example model both by building the design matrix by
# hand and by letting patsy parse a formula string.
data = demo_data("x", "y", "a")
print(data["x"])
# Old and boring approach (but it still works):
X = np.column_stack(([1] * len(data["y"]), data["x"]))
print(X)
print(LM((data["y"], X)))
# Formula-based interface: the design matrix is built from "y ~ x".
m = LM("y ~ x", data)
print(m)
print(m.loglik(data))
# The log-likelihood can also be evaluated on new, out-of-sample data.
print(m.loglik({"x": [10, 20, 30], "y": [-1, -2, -3]}))
# Your users get support for categorical predictors for free:
print(LM("y ~ a", data))
# Arbitrary transforms (including numpy calls) work inside formulas.
print(LM("y ~ np.log(x ** 2)", data))
Esempio n. 22
0
# -*- coding: utf-8 -*-
import numpy as np
from patsy import dmatrices, dmatrix, demo_data
# Build demo data (column names with spaces are allowed) and split it
# into outcome and predictor design matrices with a two-sided formula.
data = demo_data('a', 'b', 'x1', 'x2', 'y', 'z column')
print(f'data:\n{data}')
# dmatrices returns (outcome, predictors) for "y ~ ..." formulas.
y, X = dmatrices("y ~ x1 + x2", data)
print(f'y={y}')
print(f"X={X}")

Esempio n. 23
0
import argparse
import patsy
import pandas as pd

# Read a whitespace-separated list of column names from a file, build a
# patsy demo dataset for them, and persist it in feather format.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--columns')
arg_parser.add_argument('--data')
args = arg_parser.parse_args()

with open(args.columns, 'r') as fh:
    columns = fh.read().split()

data = patsy.demo_data(*columns)

pd.DataFrame(data).to_feather(args.data)
Esempio n. 24
0
from patsy import dmatrix, demo_data

# demo of how patsy handles categorical variables

# Patsy notation is described here
# http://statsmodels.sourceforge.net/devel/example_formulas.html

# http://patsy.readthedocs.org/en/latest/categorical-coding.html
data = demo_data("a", nlevels=3)
# Default treatment coding: the first level (a1) is the baseline, so only
# dummy columns for a2 and a3 appear next to the intercept.
dmatrix("a", data)
'''
DesignMatrix with shape (6, 3)
  Intercept  a[T.a2]  a[T.a3]
          1        0        0
          1        1        0
          1        0        1
          1        0        0
          1        1        0
          1        0        1
  Terms:
    'Intercept' (column 0)
    'a' (columns 1:3)
    '''

data = demo_data("a", nlevels=3)
# "-1" removes the intercept; patsy then emits one indicator per level.
dmatrix("a-1", data)
'''
DesignMatrix with shape (6, 3)
  a[a1]  a[a2]  a[a3]
      1      0      0
      0      1      0
Esempio n. 25
0
print(ModelDesc.from_formula("y ~ x + x + x").describe())
print(ModelDesc.from_formula("y ~ -1 + x").describe())
print(ModelDesc.from_formula("~ -1").describe())
print(ModelDesc.from_formula("y ~ a:b").describe())
print(ModelDesc.from_formula("y ~ a*b").describe())
print(ModelDesc.from_formula("y ~ (a + b + c + d) ** 2").describe())
print(ModelDesc.from_formula("y ~ (a + b)/(c + d)").describe())
print(
    ModelDesc.from_formula("np.log(x1 + x2) "
                           "+ (x + {6: x3, 8 + 1: x4}[3 * i])").describe())
#Sometimes it might be easier to read if you put the processed formula back into formula notation using ModelDesc.describe():

desc = ModelDesc.from_formula("y ~ (a + b + c + d) ** 2")
print(desc.describe())

data = demo_data("a", "b", "x1", "x2")
mat = dmatrix("x1:x2 + a:b + b + x1:a:b + a + x2:a:x1", data)
print(mat.design_info.term_names)

data = demo_data("a", "b", "y")

mat1 = dmatrices("y ~ 0 + a:b", data)[1]

mat2 = dmatrices("y ~ 1 + a + b + a:b", data)[1]

np.linalg.matrix_rank(mat1)

print(np.linalg.matrix_rank(mat2))
print(np.linalg.matrix_rank(np.column_stack((mat1, mat2))))
print(mat1)
print(mat2)
Esempio n. 26
0
def generate_count_matrix(
    n_factors=1,
    n_replicates=4,
    n_features=1000,
    intercept_mean=4,
    intercept_std=2,
    coefficient_stds=0.4,
    size_factors=None,
    size_factors_std=0.1,
    dispersion_function=None,
):
    """
    Generate count matrix for groups of samples by sampling from a
    negative binomial distribution.

    Parameters
    ----------
    n_factors : int
        Number of categorical factors; each becomes one patsy demo column.
    n_replicates : int
        Target number of replicates per factor combination.
    n_features : int
        Number of features (rows of the returned count matrix).
    intercept_mean, intercept_std : float
        Normal-distribution parameters for the per-feature intercepts.
    coefficient_stds : float or sequence of float
        Standard deviation(s) of the per-factor coefficients; a scalar is
        broadcast to all factors.
    size_factors : array-like, optional
        Per-sample scaling factors; sampled from N(1, size_factors_std)
        when None.
    size_factors_std : float
        Std dev used when sampling size factors.
    dispersion_function : callable, optional
        Maps mean expression to dispersion; defaults to the module-level
        `_disp` helper.

    Returns
    -------
    (dnum, dcat) : tuple of pandas.DataFrame
        Count matrix (features x samples) and the sample annotation table.
    """
    import patsy

    # Broadcast a scalar std to one value per factor.
    if isinstance(coefficient_stds, (int, float)):
        coefficient_stds = [coefficient_stds] * n_factors

    if dispersion_function is None:
        dispersion_function = _disp

    # Build sample vs factors table
    dcat = pd.DataFrame(
        patsy.demo_data(*(list(string.ascii_lowercase[:n_factors]))))
    # Upper-case column and level names for readability of sample labels.
    dcat.columns = dcat.columns.str.upper()
    for col in dcat.columns:
        dcat[col] = dcat[col].str.upper()
    # Duplicate the base table to approximate n_replicates per group.
    # NOTE(review): ceil(n_replicates / 2) assumes demo_data already yields
    # two rows per level combination — confirm against patsy's output.
    if n_replicates > 1:
        dcat = (pd.concat([
            dcat for _ in range(int(np.ceil(n_replicates / 2)))
        ]).sort_values(dcat.columns.tolist()).reset_index(drop=True))
    # Sample names encode position and factor levels, e.g. "S01_AB".
    dcat.index = [
        "S{}_{}".format(str(i + 1).zfill(2), dcat.loc[i, :].sum())
        for i in dcat.index
    ]
    m_samples = dcat.shape[0]

    # make model design table
    design = np.asarray(
        patsy.dmatrix(
            "~ 1 + " + " + ".join(string.ascii_uppercase[:n_factors]), dcat))

    # get means
    # Columns of beta: intercept first, then one coefficient per factor.
    beta = np.asarray(
        [np.random.normal(intercept_mean, intercept_std, n_features)] +
        [np.random.normal(0, std, n_features) for std in coefficient_stds]).T

    if size_factors is None:
        size_factors = np.random.normal(1, size_factors_std, (m_samples, 1))

    # Expected counts: 2**(X @ beta) scaled by per-sample size factors.
    mean = (2**(design @ beta.T) * size_factors).T

    # now sample counts
    # Dispersion is derived from the non-intercept coefficients via the
    # supplied dispersion function, averaged per feature.
    dispersion = (1 / dispersion_function(2**(beta[:, 1:]))).mean(1).reshape(
        -1, 1)
    dnum = pd.DataFrame(np.random.negative_binomial(n=mean,
                                                    p=dispersion,
                                                    size=mean.shape),
                        columns=dcat.index)
    dcat.index.name = dnum.columns.name = "sample_name"
    return dnum, dcat
Esempio n. 27
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import patsy
from patsy import dmatrices, dmatrix, demo_data

# End-to-end patsy walkthrough: build demo data, create design matrices,
# and fit an ordinary least-squares model on the generated outcome.
data = demo_data("a", "b", "x1", "x2", "y", "z column")

print(data)

matrix = dmatrices("y ~ x1 + x2", data)
print(matrix)

# A two-sided formula yields (outcome, predictors) design matrices.
outcome, predictors = dmatrices("y ~ x1 + x2", data)
print(outcome)
print(predictors)
# Least squares on the patsy-built design matrix.
betas = np.linalg.lstsq(predictors, outcome, rcond=None)[0].ravel()
print(betas)
for name, beta in zip(predictors.design_info.column_names, betas):
    print(f"{name}: {beta}")

# One-sided formulas build the predictor matrix alone.
d = dmatrix("x1 + x2", data)
print(d)
d = dmatrix("x1 + x2 - 1", data)  # "- 1" removes the intercept column
print(d)
d = dmatrix("x1 + np.log(x2 + 10)", data)  # transforms evaluated in-formula
print(d)
new_x2 = data["x2"] * 100
# With no data argument, patsy resolves `new_x2` from the calling scope.
d = dmatrix("new_x2")
print(d)
Esempio n. 28
0
# Impute missing ages with the column median.
imputer_age = SimpleImputer(strategy="median")
# Fix: fit/apply the age imputer itself — the original reused
# `imputer_embark_town` here, leaving `imputer_age` created but unused.
titanic["age"] = imputer_age.fit_transform(titanic[["age"]])

msno.matrix(titanic)
plt.show()

print("==========================")
# The patsy package
# This section shows how to use patsy to select columns from a DataFrame
# or to derive new combined columns. For illustration, build an example
# DataFrame with patsy's demo_data() helper.
# demo_data() fills variables whose names start with "x" with random reals.

from patsy import demo_data

df = pd.DataFrame(demo_data("x1", "x2", "x3", "x4", "x5"))
df

from patsy import dmatrix

# "x1+0": select column x1 only, with no intercept column.
dmatrix("x1+0", data=df)

dmatrix("x1 + x2 + x3 + 0", data=df)

# dmatrix() can also apply mathematical transforms to a variable.
dmatrix("x1 + np.log(np.abs(x2))", df)


def ten_times(x):
    """Scale *x* by a factor of ten."""
    return x * 10