Example 1
def read_param_csv(param_csv_path, **options):
    """Reads configs from a csv file.

    Args:
        param_csv_path: The path to a csv file with one line per config.
        options: Extra config parameters, passed as keyword arguments.

    Returns:
        A pair (header, configs), where:
        header is a list of parameters, and
        configs is a list of config dicts.
    """
    # This is a hack: to configure cross-validation from a config, we alias:
    #   num_parts = model_ensemble_size
    #   partid = seed
    num_parts = options.get('model_ensemble_size', 4)
    assert 'seed' not in options
    configs = []
    with csv_reader(param_csv_path) as reader:
        header = next(reader)
        assert 'seed' not in header
        assert 'model_ensemble_size' not in header
        for row in reader:
            if len(row) != len(header) or row[0].startswith('#'):
                continue
            for key, value in zip(header, row):
                options[key] = int(value)
            for partid in range(num_parts):
                configs.append(make_config(seed=partid, **options))
    return header, configs
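
A minimal usage sketch (the file name and CSV contents below are hypothetical; note that the loop forces every CSV value through int, so this format only suits integer-valued parameters):

# params.csv (hypothetical):
#   learning_init_epochs,model_num_clusters
#   5,32
#   10,64
header, configs = read_param_csv('params.csv', model_ensemble_size=4)
# Each data row expands into num_parts = 4 configs, one per
# cross-validation part, with seed aliased to the part id.
assert len(configs) == 2 * 4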
Example 2
def test_train_model(N, V, C, M, parallel):
    K = V * (V - 1) // 2  # number of candidate edges in the complete graph on V vertices
    config = make_config(model_num_clusters=M, learning_parallel=parallel)
    dataset = generate_dataset(num_rows=N, num_cols=V, num_cats=C)
    table = dataset['table']
    tree_prior = np.exp(np.random.random(K), dtype=np.float32)
    model = train_model(table, tree_prior, config)
    validate_model(table, model, config)
Example 3
def train(dataset_in, ensemble_out, **options):
    """Train a TreeCat ensemble model on imported data."""
    from treecat.training import train_ensemble
    dataset = pickle_load(dataset_in)
    table = dataset['table']
    tree_prior = dataset['schema']['tree_prior']
    config = make_config(**options)
    ensemble = train_ensemble(table, tree_prior, config)
    pickle_dump(ensemble, ensemble_out)
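
A hedged example of driving this from Python (file names are hypothetical; dataset_in must be a gzipped pickle holding a dict with 'table' and 'schema' keys, as the function reads):

train('dataset.pkz', 'ensemble.pkz',
      learning_init_epochs=5,
      model_num_clusters=32,
      model_ensemble_size=8)
ensemble = pickle_load('ensemble.pkz')  # the list of trained models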
Example 4
def serve(rows=100, cols=10, cats=4, tool='timers'):
    """Profile TreeCatServer on a random dataset.
    Available tools: timers, time, snakeviz, line_profiler, pdb
    """
    from treecat.generate import generate_model_file
    config = make_config()
    model_path = generate_model_file(rows, cols, cats)
    with tempdir() as dirname:
        config_path = os.path.join(dirname, 'config.pkz')
        pickle_dump(config, config_path)
        cmd = [FILE, 'serve_files', model_path, config_path, str(rows)]
        run_with_tool(cmd, tool, dirname)
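
For example, to profile the server under snakeviz on a larger synthetic dataset (argument values are illustrative):

serve(rows=1000, cols=20, cats=4, tool='snakeviz')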
Example 5
def test_train_ensemble(N, V, C, M):
    config = make_config(model_num_clusters=M)
    K = V * (V - 1) // 2
    dataset = generate_dataset(num_rows=N, num_cols=V, num_cats=C)
    table = dataset['table']
    tree_prior = np.exp(np.random.random(K), dtype=np.float32)
    ensemble = train_ensemble(table, tree_prior, config)

    assert len(ensemble) == config['model_ensemble_size']
    for sub_seed, model in enumerate(ensemble):
        sub_config = config.copy()
        sub_config['seed'] += sub_seed
        validate_model(table, model, sub_config)
Example 6
def generate_fake_ensemble(num_rows, num_cols, num_cats, num_components):
    dataset = generate_dataset(num_rows, num_cols, num_cats)
    ensemble = []
    config = make_config(model_num_clusters=num_components, seed=0)
    for sub_seed in range(3):
        sub_config = config.copy()
        sub_config['seed'] += sub_seed
        set_random_seed(sub_config['seed'])
        model = generate_fake_model(num_rows, num_cols, num_cats,
                                    num_components, dataset)
        model['config'] = sub_config
        ensemble.append(model)
    return ensemble
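
A usage sketch; the assertions follow from the loop above, which always builds three sub-models with seeds 0, 1, 2:

ensemble = generate_fake_ensemble(
    num_rows=20, num_cols=5, num_cats=4, num_components=7)
assert len(ensemble) == 3
assert [m['config']['seed'] for m in ensemble] == [0, 1, 2]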
Example 7
def test_recover_structure(V, C):
    set_random_seed(V + C * 10)
    N = 200
    M = 2 * C
    K = V * (V - 1) // 2
    tree_prior = np.zeros(K, np.float32)
    tree = generate_tree(num_cols=V)
    table = generate_clean_dataset(tree, num_rows=N, num_cats=C)['table']
    config = make_config(model_num_clusters=M)
    model = train_model(table, tree_prior, config)

    # Compute three types of edges.
    expected_edges = tree.get_edges()
    optimal_edges = estimate_tree(tree.complete_grid, model['edge_logits'])
    actual_edges = model['tree'].get_edges()

    # Print debugging information.
    feature_names = [str(v) for v in range(V)]
    root = '0'
    readable_data = np.zeros([N, V], np.int8)
    for v in range(V):
        beg, end = table.ragged_index[v:v + 2]
        readable_data[:, v] = table.data[:, beg:end].argmax(axis=1)
    with np_printoptions(precision=2, threshold=100, edgeitems=5):
        print('Expected:')
        print(print_tree(expected_edges, feature_names, root))
        print('Optimal:')
        print(print_tree(optimal_edges, feature_names, root))
        print('Actual:')
        print(print_tree(actual_edges, feature_names, root))
        print('Correlation:')
        print(np.corrcoef(readable_data.T))
        print('Edge logits:')
        print(triangular_to_square(tree.complete_grid, model['edge_logits']))
        print('Data:')
        print(readable_data)
        print('Feature Sufficient Statistics:')
        print(model['suffstats']['feat_ss'])
        print('Edge Sufficient Statistics:')
        print(model['suffstats']['edge_ss'])

    # Check agreement.
    assert actual_edges == optimal_edges, 'Error in sample_tree'
    assert actual_edges == expected_edges, 'Error in likelihood'
Example 8
def test_assignment_sampler_gof(N, V, C, M):
    config = make_config(model_num_clusters=M)
    K = V * (V - 1) // 2
    dataset = generate_dataset(num_rows=N, num_cols=V, num_cats=C)
    table = dataset['table']
    tree_prior = np.exp(np.random.random(K), dtype=np.float32)
    trainer = TreeCatTrainer(table, tree_prior, config)
    print('Data:')
    print(dataset['table'].data)

    # Add all rows.
    set_random_seed(1)
    for row_id in range(N):
        trainer.add_row(row_id)

    # Collect samples.
    num_samples = 500 * M**(N * V)
    counts = {}
    logprobs = {}
    for _ in range(num_samples):
        for row_id in range(N):
            # This is a single-site Gibbs sampler.
            trainer.remove_row(row_id)
            trainer.add_row(row_id)
        key = hash_assignments(trainer._assignments)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
            logprobs[key] = trainer.logprob()
    assert len(counts) == M**(N * V)

    # Check accuracy using Pearson's chi-squared test.
    keys = sorted(counts.keys())
    counts = np.array([counts[k] for k in keys], dtype=np.int32)
    probs = np.exp(np.array([logprobs[k] for k in keys]))
    probs /= probs.sum()
    print('Actual\tExpected\tAssignment')
    for count, prob, key in zip(counts, probs, keys):
        print('{:}\t{:0.1f}\t{}'.format(count, prob * num_samples, key))
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof
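
Because the sampler must visit all M**(N * V) assignment states, the test is only feasible for tiny inputs; a quick check of the arithmetic for one small parameterization:

# With N=2 rows, V=2 features, and M=2 clusters there are
# M ** (N * V) = 16 distinct assignments, so the test draws
# 500 * 16 = 8000 Gibbs sweeps before the chi-squared check.
N, V, M = 2, 2, 2
assert M ** (N * V) == 16
assert 500 * M ** (N * V) == 8000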
Example 9
def train(rows=100,
          cols=10,
          epochs=5,
          clusters=32,
          parallel=False,
          tool='timers'):
    """Profile TreeCatTrainer on a random dataset.
    Available tools: timers, time, snakeviz, line_profiler, pdb
    """
    from treecat.generate import generate_dataset_file
    config = make_config(learning_init_epochs=epochs,
                         model_num_clusters=clusters,
                         model_ensemble_size=1,
                         learning_parallel=parallel)
    dataset_path = generate_dataset_file(rows, cols)
    with tempdir() as dirname:
        config_path = os.path.join(dirname, 'config.pkz')
        pickle_dump(config, config_path)
        cmd = [FILE, 'train_files', dataset_path, config_path]
        run_with_tool(cmd, tool, dirname)
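
For instance, to line-profile a longer training run (values are illustrative):

train(rows=1000, cols=20, epochs=10, clusters=64, tool='line_profiler')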
Example 10
def generate_model_file(num_rows, num_cols, num_cats=4, rate=1.0):
    """Generate a random model.

    Returns:
        The path to a gzipped pickled model.
    """
    path = os.path.join(
        DATA, '{}-{}-{}-{:0.1f}.model.pkz'.format(num_rows, num_cols, num_cats,
                                                  rate))
    V = num_cols
    K = V * (V - 1) // 2
    if os.path.exists(path):
        return path
    print('Generating {}'.format(path))
    if not os.path.exists(DATA):
        os.makedirs(DATA)
    dataset_path = generate_dataset_file(num_rows, num_cols, num_cats, rate)
    dataset = pickle_load(dataset_path)
    table = dataset['table']
    tree_prior = np.zeros(K, dtype=np.float32)
    config = make_config(learning_init_epochs=5)
    model = train_model(table, tree_prior, config)
    pickle_dump(model, path)
    return path
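
Because the path encodes the arguments, repeated calls reuse the cached file. A usage sketch (the asserted keys follow the model structure shown in Example 12):

model_path = generate_model_file(100, 10)  # e.g. DATA/100-10-4-1.0.model.pkz
model = pickle_load(model_path)
assert 'tree' in model and 'suffstats' in model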
Example 11
import os
import shutil
import tempfile

import numpy as np
import pytest

from treecat.config import make_config
from treecat.format import fingerprint
from treecat.tables import TY_MULTINOMIAL
from treecat.tables import Table

TESTDATA = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'testdata')

TINY_CONFIG = make_config(
    learning_init_epochs=2,  #
    model_num_clusters=7,  #
    model_ensemble_size=3)

TINY_FEATURE_TYPES = np.array([TY_MULTINOMIAL] * 5, dtype=np.int8)
TINY_FEATURE_TYPES.flags.writeable = False

TINY_RAGGED_INDEX = np.array([0, 2, 4, 7, 10, 13], dtype=np.int32)
TINY_RAGGED_INDEX.flags.writeable = False

# The successive differences of TINY_RAGGED_INDEX (2, 2, 3, 3, 3) are the
# per-feature category counts, so each row of TINY_DATA has 13 one-hot columns.
TINY_DATA = np.array(
    [
        # f1 | f2  |   f3   |   f4   |   f5   |
        # ---+-----+--------+--------+--------+
        [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1],
        # (remaining rows are truncated in this snippet)
    ],
    dtype=np.int8)  # dtype is an assumption; int8 matches usage elsewhere
Example 12
def generate_clean_dataset(tree, num_rows, num_cats):
    """Generate a dataset whose structure should be easy to learn.

    This generates a highly correlated, uniformly distributed dataset with
    the given tree structure. It is useful for testing that structure
    learning can recover a known structure.

    Args:
        tree: A TreeStructure instance.
        num_rows: The number of rows in the generated dataset.
        num_cats: The number of categories in the generated categorical dataset.
            This will also be used for the number of latent classes.

    Returns:
        A dict with key 'table' and value a Table object.
    """
    assert isinstance(tree, TreeStructure)
    V = tree.num_vertices
    E = V - 1
    K = V * (V - 1) // 2
    C = num_cats
    M = num_cats
    config = make_config(model_num_clusters=M)
    ragged_index = np.arange(0, C * (V + 1), C, np.int32)
    ragged_index.flags.writeable = False

    # Create sufficient statistics that are ideal for structure learning:
    # Correlation should be high enough that (vertex,vertex) correlation can be
    # detected, but low enough that multi-hop correlation can be distinguished
    # from single-hop correlation.
    # Observations should have very low error rate.
    edge_precision = 1
    feat_precision = 100
    vert_ss = np.zeros((V, M), dtype=np.int32)
    edge_ss = np.zeros((E, M, M), dtype=np.int32)
    feat_ss = np.zeros((V * C, M), dtype=np.int32)
    meas_ss = np.zeros([V, M], np.int32)
    vert_ss[...] = edge_precision
    meas_ss[...] = feat_precision
    for e, v1, v2 in tree.tree_grid.T:
        edge_ss[e, :, :] = edge_precision * np.eye(M, dtype=np.int32)
    for v in range(V):
        beg, end = ragged_index[v:v + 2]
        feat_ss[beg:end, :] = feat_precision * np.eye(M, dtype=np.int32)
    model = {
        'config': config,
        'tree': tree,
        'edge_logits': np.zeros(K, np.float32),
        'suffstats': {
            'ragged_index': ragged_index,
            'vert_ss': vert_ss,
            'edge_ss': edge_ss,
            'feat_ss': feat_ss,
            'meas_ss': meas_ss,
        },
    }
    server = TreeCatServer(model)
    data = server.sample(num_rows, counts=np.ones(V, np.int8))
    data.flags.writeable = False
    feature_types = [TY_MULTINOMIAL] * V
    table = Table(feature_types, ragged_index, data)
    return {'table': table}
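
A usage sketch pairing this with generate_tree from Example 7 (parameter values are illustrative):

tree = generate_tree(num_cols=8)
table = generate_clean_dataset(tree, num_rows=200, num_cats=4)['table']
# One block of num_cats one-hot columns per feature:
assert table.ragged_index[-1] == 8 * 4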