Example #1
0
def engine():
    """Build a four-state Engine over a synthetic table of mixed stattypes.

    The table is produced by ``tu.gen_data_table`` with an rng seeded at 10
    and is transposed before being handed to ``Engine``. The engine gets its
    own rng (seed 312) and is created with multiprocessing disabled.
    """
    stattypes = [
        'normal',
        'poisson',
        'bernoulli',
        'categorical(k=4)',
        'lognormal',
        'exponential',
        'beta',
        'geometric',
        'vonmises',
    ]
    cctypes, distargs = cu.parse_distargs(stattypes)

    # Only the data table is used here; the other two return values are
    # discarded.
    table, _Zv, _Zc = tu.gen_data_table(
        20, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(10))

    return Engine(
        table.T,
        cctypes=cctypes,
        distargs=distargs,
        num_states=4,
        rng=gu.gen_rng(312),
        multiprocess=False,
    )
def test_dependence_probability_pairwise():
    """Pairwise dependence matrices must mirror the forced assignment Zv.

    The engine is initialised with the generating ``Zv``, so for every state
    entry (i, j) of the pairwise matrix must equal ``Zv[i] == Zv[j]``. A
    second call restricted to two columns must return 2x2 matrices.
    """
    cctypes, distargs = cu.parse_distargs(['normal'] * 3)

    T, Zv, _Zc = tu.gen_data_table(
        10, [.5, .5], [[.25, .25, .5], [.3, .7]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(100))

    outputs = [0, 1, 2]
    n_cols = len(outputs)
    engine = Engine(
        T.T,
        outputs=outputs,
        cctypes=cctypes,
        num_states=4,
        distargs=distargs,
        Zv=dict(zip(outputs, Zv)),
        rng=gu.gen_rng(0),
    )

    # One square matrix per state, covering every output column.
    Ds = engine.dependence_probability_pairwise(multiprocess=0)
    assert len(Ds) == engine.num_states()
    assert all(np.shape(D) == (n_cols, n_cols) for D in Ds)

    # The outputs are unique, so a column's position in `outputs` is also its
    # index into Zv and into each matrix.
    for D in Ds:
        for i0, i1 in itertools.product(range(n_cols), repeat=2):
            same_view = Zv[i0] == Zv[i1]
            assert D[i0, i1] == same_view

    # Restricting to a subset of columns shrinks the matrices accordingly.
    Ds = engine.dependence_probability_pairwise(colnos=[0, 2], multiprocess=0)
    assert len(Ds) == engine.num_states()
    assert all(np.shape(D) == (2, 2) for D in Ds)
def state():
    """Build a State over a synthetic six-column table, all columns in view 0.

    The table comes from ``tu.gen_data_table`` (rng seeded at 0) and is
    transposed before being passed to ``State``. ``Zv`` maps every column
    index to 0, and the State receives its own rng seeded at 0.
    """
    # Set up the data generation
    cctypes, distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])
    T, _Zv, _Zc = tu.gen_data_table(30, [1], [[.25, .25, .5]],
                                    cctypes,
                                    distargs, [.95] * len(cctypes),
                                    rng=gu.gen_rng(0))
    T = T.T
    # Built with range() rather than xrange(): xrange exists only on Python 2,
    # while this produces the same {column: 0} dict on Python 2 and 3.
    Zv = dict.fromkeys(range(len(cctypes)), 0)
    s = State(T,
              cctypes=cctypes,
              distargs=distargs,
              Zv=Zv,
              rng=gu.gen_rng(0))
    return s
Example #4
0
def test_multiple_stattypes():
    '''Test cgpm statistical types are heuristically converted to Loom types.

    Runs Loom transitions on a mixed-stattype engine, checks that the mean
    log score goes up, round-trips the engine through its metadata (directly
    and via JSON), and checks that all copies share one loom path.
    '''
    stattypes = [
        'normal', 'poisson', 'bernoulli', 'categorical(k=4)', 'lognormal',
        'exponential', 'beta', 'geometric', 'vonmises'
    ]
    cctypes, distargs = cu.parse_distargs(stattypes)

    T, _Zv, _Zc = tu.gen_data_table(
        200, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(10))

    engine = Engine(
        T.T,
        cctypes=cctypes,
        distargs=distargs,
        rng=gu.gen_rng(15),
        num_states=16,
    )

    # Loom inference should raise the average log score.
    score_before = engine.logpdf_score()
    engine.transition_loom(N=5)
    score_after = engine.logpdf_score()
    assert numpy.mean(score_after) > numpy.mean(score_before)

    # Check serialization: the metadata names the module and attribute of the
    # factory used to rebuild the engine.
    metadata = engine.to_metadata()
    factory_module = importlib.import_module(metadata['factory'][0])
    builder = getattr(factory_module, metadata['factory'][1])
    engine2 = builder.from_metadata(metadata)

    # To JSON and back.
    serialized = json.dumps(engine.to_metadata())
    engine3 = builder.from_metadata(json.loads(serialized))

    # Assert all states in engine, engine2, and engine3 have same loom_path.
    loom_paths = [
        s._loom_path
        for e in [engine, engine2, engine3]
        for s in e.states
    ]
    assert all(p == loom_paths[0] for p in loom_paths)

    engine2.transition(S=5)
    depprobs = engine2.dependence_probability_pairwise()

    assert numpy.all(depprobs > 0.85)
Example #5
0
def get_engine():
    """Return a six-state Engine over synthetic data with a few missing cells.

    Cells (5, 0..3) and (8, 4) of the transposed table are set to nan before
    the engine is built, and two transition iterations are run before it is
    returned.
    """
    stattypes = [
        'normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises']
    cctypes, distargs = cu.parse_distargs(stattypes)

    data, _Zv, _Zc = tu.gen_data_table(
        20, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(0))
    data = data.T

    # Make some nan cells for evidence.
    data[5, 0:4] = np.nan
    data[8, 4] = np.nan

    engine = Engine(
        data,
        cctypes=cctypes,
        distargs=distargs,
        num_states=6,
        rng=gu.gen_rng(0),
    )
    engine.transition(N=2)
    return engine
Example #6
0
def state():
    """Build a State whose column 0 is modelled by a random forest.

    A four-column synthetic table is generated (rng seeded at 0), every
    column is assigned to view 0, column 0 is switched to the 'random_forest'
    cctype with k=5, and one transition over a fixed list of kernels is run
    before the State is returned.
    """
    cctypes, distargs = cu.parse_distargs(
        ['categorical(k=5)', 'normal', 'poisson', 'bernoulli'])
    T, _Zv, _Zc = tu.gen_data_table(50, [1], [[.33, .33, .34]],
                                    cctypes,
                                    distargs, [.95] * len(cctypes),
                                    rng=gu.gen_rng(0))
    # Built with range() rather than xrange(): xrange exists only on Python 2,
    # while this produces the same {column: 0} dict on Python 2 and 3.
    Zv = dict.fromkeys(range(len(cctypes)), 0)
    s = State(T.T,
              cctypes=cctypes,
              distargs=distargs,
              Zv=Zv,
              rng=gu.gen_rng(0))
    s.update_cctype(0, 'random_forest', distargs={'k': 5})
    # XXX Uncomment me for a bug!
    # state.update_cctype(1, 'linear_regression')
    kernels = [
        'rows', 'view_alphas', 'alpha', 'column_params', 'column_hypers'
    ]
    s.transition(N=1, kernels=kernels)
    return s
Example #7
0
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from cgpm.crosscat.engine import Engine
from cgpm.utils import config as cu

# Seed numpy's global RNG. The random draws below depend on this seed and on
# the order in which they are made, so do not reorder them.
np.random.seed(0)

# Number of rows in the synthetic dataset. N_STATES and N_ITERS are not used
# in this part of the script (presumably consumed by the inference step).
N_ROWS = 300
N_STATES = 12
N_ITERS = 100

# One categorical id column with N_ROWS categories followed by eight normal
# columns; parse_distargs splits the specs into cctypes and distargs.
cctypes = ['categorical(k={})'.format(N_ROWS)] + ['normal']*8
cctypes, distargs = cu.parse_distargs(cctypes)
# Display names, one per column of X (1 id + 4 + 4 = 9 columns).
column_names = ['id'] + ['one cluster']*4 + ['four cluster']*4

# id column: each row gets its own row number as identifier.
X = np.zeros((N_ROWS, 9))
X[:,0] = np.arange(N_ROWS)

# Four columns of one cluster from the standard normal.
X[:,1:5] = np.random.randn(N_ROWS, 4)

# Four columns of four clusters with unit variance. Each row draws one cluster
# label Z in {0,1,2,3}, shared by all four columns; the label is scaled by 4,
# so the cluster means are {0,4,8,12}.
Z = np.random.randint(4, size=(N_ROWS))
X[:,5:] = 4*np.reshape(np.repeat(Z,4), (len(Z),4)) + np.random.randn(N_ROWS, 4)

# Inference.
engine = Engine(
Example #8
0
import numpy as np

from cgpm.crosscat import lovecat
from cgpm.crosscat.engine import Engine
from cgpm.crosscat.state import State
from cgpm.utils import config as cu
from cgpm.utils import general as gu
from cgpm.utils import test as tu

# -- Global variables shared by all module functions.
# Module-level rng (seed 2); any function drawing from it advances the same
# shared stream.
rng = gu.gen_rng(2)

# Eight output indices, one per stattype parsed below.
outputs = range(8)
# parse_distargs splits specs such as 'categorical(k=8)' into the cctype name
# and its distargs.
cctypes, distargs = cu.parse_distargs([
    'normal', 'poisson', 'bernoulli', 'categorical(k=8)', 'lognormal',
    'categorical(k=4)', 'beta', 'vonmises'
])


def generate_dataset():
    # Set up the data generation, 20 rows by 8 cols, with some missing values.
    D, Zv, Zc = tu.gen_data_table(20, [1], [[.25, .25, .5]],
                                  cctypes,
                                  distargs, [.95] * len(cctypes),
                                  rng=gu.gen_rng(2))

    # Generate some missing entries in D.
    missing = rng.choice(range(D.shape[1]), size=(D.shape[0], 4), replace=True)
    for i, m in enumerate(missing):
        D[i, m] = np.nan
Example #9
0
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from cgpm.crosscat.engine import Engine
from cgpm.crosscat.state import State
from cgpm.utils import config as cu
from cgpm.utils import general as gu
from cgpm.utils import test as tu


# Stattypes of the seven synthetic columns; the trailing numbers are the
# column indices.
CCTYPES, DISTARGS = cu.parse_distargs([
    'normal',        # 0
    'poisson',       # 1
    'bernoulli',     # 2
    'lognormal',     # 3
    'exponential',   # 4
    'geometric',     # 5
    'vonmises'])     # 6


# Synthetic table shared by the tests in this module, generated once at import
# time with a fixed seed. The first argument is presumably the number of rows
# -- TODO confirm against tu.gen_data_table.
T, Zv, Zc = tu.gen_data_table(
    10, [1], [[.33, .33, .34]], CCTYPES, DISTARGS,
    [.95]*len(CCTYPES), rng=gu.gen_rng(0))
# Transposed so that columns can be sliced as T[:, a:b] below.
T = T.T


def test_incorporate_engine():
    engine = Engine(
        T[:,:2],
        cctypes=CCTYPES[:2],
Example #10
0
# limitations under the License.

import importlib
import pytest

from math import log

import numpy as np

from cgpm.regressions.ols import OrdinaryLeastSquares
from cgpm.utils import config as cu
from cgpm.utils import general as gu
from cgpm.utils import test as tu

# Stattypes of the nine synthetic columns; parse_distargs splits specs such as
# 'categorical(k=3)' into the cctype name and its distargs.
cctypes, distargs = cu.parse_distargs([
    'normal', 'categorical(k=3)', 'poisson', 'bernoulli', 'lognormal',
    'exponential', 'geometric', 'vonmises', 'normal'
])

# Synthetic table generated once at import time with a fixed seed. Note the
# sixth argument is .2 per column here rather than the .95 used elsewhere
# (presumably a cluster-separation setting -- TODO confirm).
T, Zv, Zc = tu.gen_data_table(100, [1], [[.33, .33, .34]],
                              cctypes,
                              distargs, [.2] * len(cctypes),
                              rng=gu.gen_rng(0))

# Transposed view of the generated table, used as the dataset below.
D = T.T
OLS_DISTARGS = {
    'inputs': {
        'stattypes':
        cctypes[1:],
        'statargs': [{
            'k': 3
        }] + [None] + [{
Example #11
0
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from cgpm.crosscat.state import State
from cgpm.utils import config as cu
from cgpm.utils import general as gu
from cgpm.utils import test as tu


# Stattypes of the eight synthetic columns; parse_distargs splits specs such
# as 'categorical(k=2)' into the cctype name and its distargs.
CCTYPES, DISTARGS = cu.parse_distargs([
    'normal',
    'poisson',
    'categorical(k=2)',
    'bernoulli',
    'lognormal',
    'exponential',
    'geometric',
    'vonmises'])

# Synthetic table shared by the tests in this module, generated once at import
# time with a fixed seed.
T, Zv, Zc = tu.gen_data_table(
    20, [1], [[.33, .33, .34]], CCTYPES, DISTARGS,
    [.95]*len(CCTYPES), rng=gu.gen_rng(0))

# Transposed before being handed to State.
T = T.T


def test_categorical_bernoulli():
    state = State(
        T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(0))
Example #12
0
def parse_schema(schema, dataframe):
    """Apply a schema to a dataframe, and return variables to construct State.

    Parameters
    ----------
    schema : list(tuple)
        A list of 3-tuples ('column', 'stattype', index). 'stattype' is either
        the name of a DistributionGpm or 'ignore' (the column is skipped).
        `index` is returned in `outputs` as the output index of the column.
        For categorical datatypes, it is permitted to specify the number of
        components distarg by 'categorical(k=7)' although make sure the number
        of components is correct; if unspecified, the number of components
        will be estimated from the dataset.
    dataframe : pd.DataFrame
        Dataframe containing the dataset to parse according to the schema. All
        missing values must be 'NA' or np.nan -- otherwise very bad things will
        happen. Note that 'NA' entries are replaced by np.nan IN PLACE, so the
        caller's dataframe is modified.

    Returns
    -------
    T : np.array
        Data matrix that gpmcc can ingest, one column per non-ignored schema
        entry.
    outputs : list
        The `index` of each non-ignored schema entry, in schema order.
    cctypes : list<str>
        List of cctype strings that gpmcc can ingest (any parenthesised
        arguments are stripped and moved into `distargs`).
    distargs : list<dict>
        Distargs for the cctypes, according to the schema.
    valmap : dict<str->dict>
        For Bernoulli or categorical columns, strings are converted to integer
        values in [0..k]. valmap['column'] gives the mapping from strings to
        integers for such columns. Needed for reference only, not for gpmcc.
    columns : list<str>
        List of column names, where columns[i] is the ith column of T. Needed
        for reference only, not for gpmcc.

    Raises
    ------
    AssertionError
        If a bernoulli column does not have exactly two distinct values, or a
        categorical column has more distinct values than the user-specified k.

    Example
    -------
    >>> dataframe = pd.read_csv('dataset.csv')
    >>> schema = [('id','ignore',0), ('age','normal',1),
    ...     ('gender','bernoulli',2), ('university','categorical(k=2)',3),
    ...     ('country','categorical',4)]
    >>> T, outputs, cctypes, distargs, valmap, columns = parse_schema(
    ...     schema, dataframe)

    -   T will be the dataset as np array.
    -   outputs = [1, 2, 3, 4]
    -   cctypes = ['normal', 'bernoulli', 'categorical', 'categorical']
    -   distargs = [None, None, {'k':2}, {'k':3}]
    -   valmap has one entry per bernoulli/categorical column, e.g.
            'university': {
                'mit': 0,
                'harvard': 1
                },
            'country': {
                'usa': 0,
                'nepal': 1,
                'lebanon': 2
                }
        where 'k' for 'country' has been extracted from the dataset.

    >>> S = cgpm.crosscat.state.State(
    ...     T, outputs=outputs, cctypes=cctypes, distargs=distargs)
    """
    dataframe.replace('NA', np.nan, inplace=True)
    D = []
    cctypes, distargs = [], []
    valmap = dict()
    columns = []
    outputs = []
    for column, stattype, index in schema:
        if stattype == 'ignore':
            continue
        X = dataframe[column]
        # Split e.g. 'categorical(k=7)' into 'categorical' and {'k': 7}; a bare
        # name such as 'normal' comes back unchanged with distarg None. The
        # stattype is what carries the arguments, not the column name.
        [cctype], [distarg] = cu.parse_distargs([stattype])
        columns.append(column)
        cctypes.append(cctype)
        outputs.append(index)
        distargs.append(distarg)
        # XXX Should check for is_numeric!
        if cctype in ['bernoulli', 'categorical']:
            mapping = build_valmap(X)
            X = X.replace(mapping)
            valmap[column] = mapping
            if cctype == 'bernoulli':
                assert len(mapping) == 2, \
                    'bernoulli column %r must have exactly 2 values, got %d' \
                    % (column, len(mapping))
            else:
                # Did user specify categorical mapping?
                k_user = None if distarg is None else distarg.get('k')
                if k_user is None:
                    # Estimate k from the data, keeping any other user args.
                    distargs[-1] = dict(distarg or {}, k=len(mapping))
                else:
                    assert len(mapping) <= k_user, \
                        'categorical column %r has %d values but k=%s' \
                        % (column, len(mapping), k_user)
        D.append(X)
    T = np.asarray(D).T
    assert len(cctypes) == len(distargs) == len(columns)
    assert len(columns) == T.shape[1]
    return T, outputs, cctypes, distargs, valmap, columns
Example #13
0
def test_serialize_composite_cgpm():
    """Serializing a State with hooked foreign cgpms must preserve them.

    A RandomForest (output 0) and a LinearRegression (output 1) are composed
    with a State over columns 2-5. After a to_metadata/from_metadata round
    trip the hooked cgpms must be present under the same tokens, be distinct
    objects, and report the same log scores. The same cgpms are then composed
    with an Engine, whose simulate/logpdf are smoke-tested, and the log scores
    of its first state must not fall far below those of the original State.
    """
    rng = gu.gen_rng(2)

    # Generate the data.
    cctypes, distargs = cu.parse_distargs([
        'categorical(k=3)',     # RandomForest          0
        'normal',               # LinearRegression      1
        'categorical(k=3)',     # GPMCC                 2
        'poisson',              # GPMCC                 3
        'normal',               # GPMCC                 4
        'lognormal'             # GPMCC                 5
        ])
    T, Zv, Zc = tu.gen_data_table(
        35, [.4, .6], [[.33, .33, .34], [.5, .5]],
        cctypes, distargs, [.2]*len(cctypes), rng=rng)
    D = np.transpose(T)

    # Create GPMCC.
    state = State(
        D[:,2:], outputs=[2,3,4,5], cctypes=cctypes[2:],
        distargs=distargs[2:], rng=rng)

    # Create a Forest.
    forest = RandomForest(
        outputs=[0],
        inputs=[1,2,3,4],
        distargs={
            'inputs': {
                'stattypes': [cctypes[i] for i in [1,2,3,4]],
                'statargs': [distargs[i] for i in [1,2,3,4]]},
            'k': distargs[0]['k']},
        rng=rng)

    # Create a Regression.
    linreg = LinearRegression(
        outputs=[1],
        inputs=[3,4,5],
        distargs={
            'inputs': {
                'stattypes': [cctypes[i] for i in [3,4,5]],
                'statargs': [distargs[i] for i in [3,4,5]]}},
        rng=rng)

    # Incorporate the data.
    def incorporate_data(cgpm, rowid, row):
        cgpm.incorporate(
            rowid,
            {i: row[i] for i in cgpm.outputs},
            {i: row[i] for i in cgpm.inputs},
        )
    for rowid, row in enumerate(D):
        incorporate_data(forest, rowid, row)
        incorporate_data(linreg, rowid, row)

    # Run state transitions.
    state.transition(N=10, progress=False)
    # Compose CGPMs, instructing State to run the transitions.
    token_forest = state.compose_cgpm(forest)
    token_linreg = state.compose_cgpm(linreg)
    state.transition_foreign(N=10, cols=[forest.outputs[0], linreg.outputs[0]])

    # Now run the serialization.
    metadata = state.to_metadata()
    state2 = State.from_metadata(metadata)

    # Check that the tokens are in state2.
    assert token_forest in state2.hooked_cgpms
    assert token_linreg in state2.hooked_cgpms

    # The hooked cgpms must be unique objects after serialize/deserialize.
    assert state.hooked_cgpms[token_forest] != state2.hooked_cgpms[token_forest]
    assert state.hooked_cgpms[token_linreg] != state2.hooked_cgpms[token_linreg]

    # Check that the log scores of the hooked cgpms agree.
    assert np.allclose(
        state.hooked_cgpms[token_forest].logpdf_score(),
        state2.hooked_cgpms[token_forest].logpdf_score())
    assert np.allclose(
        state.hooked_cgpms[token_linreg].logpdf_score(),
        state2.hooked_cgpms[token_linreg].logpdf_score())

    # Now run some tests for the engine.
    e = Engine(
        D[:,2:], outputs=[2,3,4,5], cctypes=cctypes[2:],
        distargs=distargs[2:], num_states=2, rng=rng)
    e.compose_cgpm([forest, forest], multiprocess=1)
    e.compose_cgpm([linreg, linreg], multiprocess=1)
    e.transition_foreign(N=1, cols=[forest.outputs[0], linreg.outputs[0]])
    e.dependence_probability(0,1)
    e.simulate(-1, [0,1], {2:1}, multiprocess=0)
    e.logpdf(-1, {1:1}, {2:1, 0:0}, multiprocess=0)

    state3 = e.get_state(0)

    # There is no guarantee that the logpdf score improves with inference, but
    # it should not reduce by more than a few nats.
    def check_logpdf_delta(before, after):
        # The drop is (before - after). The previous (after - before) form was
        # non-positive whenever the score dropped, so it could never fail.
        return before < after or (before - after) < 5
    # The result must be asserted; a bare call checks nothing.
    assert check_logpdf_delta(
        before=state.hooked_cgpms[token_forest].logpdf_score(),
        after=state3.hooked_cgpms[token_forest].logpdf_score())
    assert check_logpdf_delta(
        before=state.hooked_cgpms[token_linreg].logpdf_score(),
        after=state3.hooked_cgpms[token_linreg].logpdf_score())
def test_dependence_probability():
    """Dependence probabilities must follow Zv, including for hooked cgpms.

    A State and an Engine are both initialised with the generating ``Zv``.
    Two of their columns must be dependent exactly when they share a view.
    Foreign cgpms are then composed onto parents from two different views,
    and dependence must propagate along those parent links (including the
    chain 74 -> 9721 -> parent) and never across views.
    """
    cctypes, distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])

    T, Zv, _Zc = tu.gen_data_table(
        100, [.5, .5], [[.25, .25, .5], [.3, .7]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(100))

    T = T.T
    outputs = range(0, 12, 2)

    def depprob(model, col0, col1):
        # Collapse a State/Engine answer into one number via compute_depprob.
        return compute_depprob(model.dependence_probability(col0, col1))

    # Test for direct dependence for state and engine.
    s = State(
        T,
        outputs=outputs,
        cctypes=cctypes,
        distargs=distargs,
        Zv=dict(zip(outputs, Zv)),
        rng=gu.gen_rng(0),
    )

    e = Engine(
        T,
        outputs=outputs,
        cctypes=cctypes,
        distargs=distargs,
        Zv=dict(zip(outputs, Zv)),
        rng=gu.gen_rng(0),
    )

    # The outputs are unique, so a column's position in `outputs` is also its
    # index into Zv.
    for C in [s, e]:
        indexed = enumerate(outputs)
        for (i0, col0), (i1, col1) in itertools.product(indexed, repeat=2):
            assert depprob(C, col0, col1) == (Zv[i0] == Zv[i1])

    # Hook some cgpms into state.

    # XXX What if Zv has only one unique value? Hopefully not with this rng!
    uniques = list(set(Zv))
    parent_1 = [o for i, o in enumerate(outputs) if Zv[i] == uniques[0]]
    parent_2 = [o for i, o in enumerate(outputs) if Zv[i] == uniques[1]]

    c1 = BareBonesCGpm(outputs=[1821, 154], inputs=[parent_1[0]])
    c2 = BareBonesCGpm(outputs=[1721], inputs=[parent_2[0]])
    c3 = BareBonesCGpm(outputs=[9721], inputs=[parent_2[1]])
    c4 = BareBonesCGpm(outputs=[74], inputs=[9721])

    hooked = [1821, 154, 1721, 9721, 74]
    # Expected dependence of each hooked output on a parent from each view.
    expected_by_parents = [
        (parent_1, [1, 1, 0, 0, 0]),
        (parent_2, [0, 0, 1, 1, 1]),
    ]
    # Expected dependence between pairs of hooked outputs.
    expected_between_hooked = [
        (9721, 1721, 1),
        (1821, 154, 1),
        (74, 9721, 1),
        (74, 1721, 1),
        (1821, 1721, 0),
        (1821, 74, 0),
        (154, 74, 0),
    ]

    for i, C in enumerate([s, e]):
        # State takes a single cgpm; Engine takes a list of cgpms.
        for cgpm in (c1, c2, c3, c4):
            C.compose_cgpm(cgpm if i == 0 else [cgpm])

        # Between hooked cgpms and state parents.
        for parents, expectations in expected_by_parents:
            for p in parents:
                for col, expected in zip(hooked, expectations):
                    assert depprob(C, col, p) == expected

        # Between hooked cgpm.
        for col0, col1, expected in expected_between_hooked:
            assert depprob(C, col0, col1) == expected