# Imports added so the snippets below are self-contained; the module paths
# are assumptions (Client from BayesDB's client module, `eu` as the
# experiments' shared utility module), not confirmed by the source.
import random

import numpy

from bayesdb.client import Client
import experiment_utils as eu


def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]

    multinomial_categories = argin["multinomial_categories"]
    seed = argin["seed"]

    random.seed(seed)

    # TODO: use dha.csv
    ofilename = "reasonably_calibrated_ofile.csv"
    table_name = "reasonably_calibrated"

    argin["distargs"] = [{"K": multinomial_categories}] * num_cols
    argin["cctypes"] = ["multinomial"] * num_cols
    argin["separation"] = [argin["separation"]] * num_views

    T_array, structure = eu.gen_data(ofilename, argin, save_csv=True)

    filename, indices, col_names = eu.gen_missing_data_csv(ofilename, prop_missing, [])
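    # `indices` is assumed to be a pair of parallel index arrays
    # (row positions, column positions) naming the cells that were blanked
    # out; the loops below rely on that layout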

    # create a client
    client = Client()

    # calculate the empirical frequency of each category in each column
    frequencies = []
    for col in range(num_cols):
        frequencies.append(numpy.zeros(multinomial_categories))
    T_int = numpy.array(T_array, dtype=int)

    n_indices = len(indices[0])
    for i in range(n_indices):
        r = indices[0][i]
        c = indices[1][i]
        x = T_int[r, c]
        frequencies[c][x] += 1.0

    frequencies = [f / numpy.sum(f) for f in frequencies]
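    # each frequencies[c] is now the empirical distribution of the held-out
    # values in column c; numpy.bincount(vals, minlength=multinomial_categories)
    # would be an equivalent per-column formulation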

    # set up a dict for the different config data
    result = dict()

    # do analyses
    for config in ["cc", "crp", "nb"]:
        config_string = eu.config_map[config]
        table = table_name + "-" + config

        # drop old btable, create a new one with the new data and init models
        client("DROP BTABLE %s;" % table, yes=True)
        client("CREATE BTABLE %s FROM %s;" % (table, filename))
        client("INITIALIZE %i MODELS FOR %s %s;" % (num_chains, table, config_string))

        if ct_kernel == 1:
            client("ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;" % (table, num_iters))
        else:
            client("ANALYZE %s FOR %i ITERATIONS WAIT;" % (table, num_iters))

        # impute each held-out index and tally the imputed values
        results_config = []
        for col in range(num_cols):
            results_config.append(numpy.zeros(multinomial_categories))
        for col in range(num_cols):
            col_name = col_names[col]
            out = client(
                "INFER %s FROM %s WITH CONFIDENCE .95 WITH 1 SAMPLES;" % (col_name, table),
                pretty=False,
                pandas_output=False,
            )
            for i in range(n_indices):
                r = indices[0][i]
                c = indices[1][i]
                if c == col:
                    x = out[0]["data"][r][1]
                    results_config[c][int(x)] += 1.0

        results_config = [f / sum(f) for f in results_config]
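        # each results_config[c] is now the distribution of imputed values
        # over the held-out cells of column c, directly comparable to
        # frequencies[c] above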
        result[config] = results_config

    retval = dict()
    retval["actual_frequencies"] = frequencies
    retval["inferred_P_cc"] = result["cc"]
    retval["inferred_P_crp"] = result["crp"]
    retval["inferred_P_nb"] = result["nb"]
    retval["config"] = argin

    return retval
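

# A hypothetical driver for the calibration experiment above; every argin
# value here is an illustrative assumption, not a default from the source.
# (Each driver sketch in this listing belongs with the run_experiment
# definition directly above it.)
def _example_calibration_run():
    argin = {
        "num_iters": 200, "num_chains": 8, "num_rows": 100, "num_cols": 4,
        "num_views": 2, "num_clusters": 4, "prop_missing": 0.2,
        "separation": 0.9, "ct_kernel": 0, "multinomial_categories": 5,
        "seed": 448,
    }
    retval = run_experiment(argin)
    # compare retval["actual_frequencies"][c] against retval["inferred_P_cc"][c]
    # per column to assess calibration
    return retval

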
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    impute_samples = argin["impute_samples"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    filename = "exp_fills_in_ofile.csv"
    table_name = 'exp_fills_in'

    argin['cctypes'] = ['continuous']*num_cols
    argin['separation'] = [argin['separation']]*num_views

    eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv for each proportion of missing data
    all_filenames = []
    all_indices = []
    for p in prop_missing:
        data_filename, indices, col_names, extra = eu.gen_missing_data_csv(
            filename, p, [], True)
        all_indices.append(indices)
        all_filenames.append(data_filename)

    # get the starting table so we can calculate errors
    T_array = extra['array_filled']
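    # (`extra` holds the value from the final loop iteration; the filled
    # array is the original generated table, so it is assumed identical
    # for every p)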
    num_rows, num_cols = T_array.shape

    # create a client
    client = Client()

    # set up a dict for the different config data
    result = dict()
    result['cc'] = numpy.zeros(len(prop_missing))
    result['crp'] = numpy.zeros(len(prop_missing))
    result['nb'] = numpy.zeros(len(prop_missing))

    # do analyses
    for p in range(len(prop_missing)):
        this_indices = all_indices[p]
        this_filename = all_filenames[p]
        for config in ['cc', 'crp', 'nb']:
            config_string = eu.config_map[config]
            table = table_name + '-' + config

            # drop old btable, create a new one with the new data and init models
            client('DROP BTABLE %s;' % table, yes=True)
            client('CREATE BTABLE %s FROM %s;' % (table, this_filename))
            client('INITIALIZE %i MODELS FOR %s %s;' % (num_chains, table, config_string))

            if ct_kernel == 1:
                client('ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;' % (table, num_iters))
            else:
                client('ANALYZE %s FOR %i ITERATIONS WAIT;' % (table, num_iters))

            MSE = 0.0
            count = 0.0
            # impute each held-out index and accumulate the squared error
            for col in range(num_cols):
                col_name = col_names[col]
                # confidence is set to zero so that a value is always returned
                out = client(
                    'INFER %s FROM %s WITH CONFIDENCE %f WITH %i SAMPLES;' %
                    (col_name, table, 0, impute_samples),
                    pretty=False, pandas_output=False)

                data = out[0]['data']

                # calculate MSE over the held-out cells
                for row, tcol in zip(this_indices[0], this_indices[1]):
                    if tcol == col:
                        MSE += ( T_array[row,col] - data[row][1] )**2.0
                        count += 1.0

            result[config][p] = MSE / count
            print("error = %f" % result[config][p])

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['nb']
    retval['MSE_crp_mixture_indexer'] = result['crp']
    retval['MSE_crosscat_indexer'] = result['cc']
    retval['prop_missing'] = prop_missing
    retval['config'] = argin

    return retval
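

# A hypothetical driver for the fill-in experiment above; the argin values
# are illustrative assumptions (note prop_missing is a list here: the
# experiment reports one MSE per proportion).
def _example_fills_in_run():
    argin = {
        "num_iters": 100, "num_chains": 4, "num_rows": 100, "num_cols": 4,
        "num_views": 2, "num_clusters": 4, "prop_missing": [0.1, 0.25, 0.5],
        "impute_samples": 10, "separation": 0.9, "ct_kernel": 0, "seed": 448,
    }
    return run_experiment(argin)

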
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_runs = argin["num_runs"]
    prop_missing = argin["prop_missing"]
    confidence = argin["confidence"]
    seed = argin["seed"]

    n_queries = 2

    # random.seed(seed)  # left commented out in the original; `seed` is read above but unused

    # using dha, for now
    start_filename = "../data/dha.csv"
    table = 'exp_shrinks_with_iters'

    filename, indices, col_names = eu.gen_missing_data_csv(start_filename, prop_missing, [0])

    # get some random column pairs to do DEPENDENCE PROBABILITY queries on
    # don't do queries on the first column
    columns = range(1, len(col_names))
    column_queries = [random.sample(columns, 2) for _ in range(n_queries)]

    dependence_queries = []
    for q in column_queries:
        col_1 = col_names[q[0]].lower()
        col_2 = col_names[q[1]].lower()
        this_query = "SELECT DEPENDENCE PROBABILITY OF %s WITH %s FROM %s;" % (col_1, col_2, table)
        dependence_queries.append(this_query)

    # get some inference queries
    column_queries = random.sample(columns, n_queries)
    infer_queries = []
    for q in column_queries:
        col = col_names[q].lower()
        this_query = 'INFER %s FROM %s WITH CONFIDENCE %f;' % (col, table, confidence)
        infer_queries.append(this_query)

    # create a client
    client = Client()

    dependence_results = []
    inference_results = []
    for _ in range(num_runs):

        # drop old table, create new table, init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, filename))
        client('INITIALIZE %i MODELS FOR %s;' % (num_chains, table))

        dependence_results_run = numpy.zeros((n_queries, num_iters))
        inference_results_run = numpy.zeros((n_queries, num_iters))

        for i in range(num_iters):
            # analyze
            client('ANALYZE %s FOR 1 ITERATIONS;' % table)

            # dependence
            for q in range(n_queries):
                out_dep = client(dependence_queries[q], pretty=False, pandas_output=False)
                dep = out_dep[0]['data'][0][1]
                dependence_results_run[q, i] = dep

            # infer
            for q in range(n_queries):
                out_inf = client(infer_queries[q], pretty=False, pandas_output=False)
                prop = _get_prop_inferred(out_inf[0]['data'], indices, column_queries[q])
                inference_results_run[q, i] = prop

        dependence_results.append(dependence_results_run)
        inference_results.append(inference_results_run)

    # calculate mean and errors (dependence)
    dep_means = numpy.zeros((n_queries, num_iters))
    dep_error = numpy.zeros((n_queries, num_iters))

    for i in range(num_iters):
        X = numpy.zeros((n_queries, num_runs))
        for r in range(num_runs):
            X[:, r] = dependence_results[r][:, i]
        dep_means[:, i] = numpy.mean(X, axis=1)
        dep_error[:, i] = numpy.std(X, axis=1) / float(num_runs) ** .5
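    # (dep_error is the standard error of the mean across runs:
    # std / sqrt(num_runs); inf_error below is computed the same way)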


    # calculate mean and errors (infer)
    inf_means = numpy.zeros((n_queries, num_iters))
    inf_error = numpy.zeros((n_queries, num_iters))
    for i in range(num_iters):
        X = numpy.zeros((n_queries, num_runs))
        for r in range(num_runs):
            X[:, r] = inference_results[r][:, i]
        inf_means[:, i] = numpy.mean(X, axis=1)
        inf_error[:, i] = numpy.std(X, axis=1) / float(num_runs) ** .5

    result = dict()
    result['config'] = argin
    result['num_queries'] = n_queries
    result['iteration'] = list(range(1, num_iters + 1))
    result['dependence_probability_mean'] = dep_means
    result['dependence_probability_error'] = dep_error
    result['infer_means'] = inf_means
    result['infer_stderr'] = inf_error

    return result
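

# A hypothetical driver for the convergence experiment above; the argin
# values are illustrative assumptions. Note this experiment also relies on
# `_get_prop_inferred` and ../data/dha.csv from the original repository.
def _example_shrinks_run():
    argin = {
        "num_iters": 50, "num_chains": 8, "num_runs": 5,
        "prop_missing": 0.2, "confidence": 0.5, "seed": 448,
    }
    return run_experiment(argin)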