Example #1
def test_flights():
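    # Run a sequence of BQL commands (create/analyze a ptable, SELECT with
    # similarity, PROBABILITY, SIMULATE, ESTIMATE DEPENDENCE PROBABILITIES)
    # against the flight-data subset and print each result.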
    client = Client()

    cmds = []
    cmds.append('drop ptable jayt;')
    cmds.append('create ptable jayt from /home/sgeadmin/tabular_predDB/Examples/flight_data_subset.csv;')
    cmds.append('create 2 models for jayt;')
    cmds.append('analyze jayt for 1 iterations;')
    cmds.append('select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20;')
    cmds.append('select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0;')
    cmds.append('select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0 with respect to actualelapsedtime;')
    cmds.append('select dayofweek, actualelapsedtime, similarity to 0 with respect to actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0 with respect to actualelapsedtime, dayofweek;')
    cmds.append('select dayofweek, actualelapsedtime, similarity to 0 from jayt where distance > 800 limit 5;')
    cmds.append('select dayofweek, actualelapsedtime, arrtime, similarity to 0 with respect to arrtime from jayt where distance > 800 limit 5 order by similarity to 0 with respect to arrtime, dayofweek;')

    cmds.append('select probability(actualelapsedtime=200) from jayt where distance > 800 limit 20;')
    # cmds.append('select * from jayt limit 5;')
    #cmds.append('infer actualelapsedtime from jayt with confidence 0.8 limit 20;')

    cmds.append('simulate dayofweek, deptime, crsdeptime FROM jayt where dayofweek = 7 TIMES 3;')
    cmds.append('estimate dependence probabilities from jayt;')
    cmds.append('estimate dependence probabilities from jayt referencing actualelapsedtime limit 6 save to fz;')
    cmds.append('estimate dependence probabilities from jayt referencing actualelapsedtime with confidence 0.5;')
    #cmds.append('drop ptable jayt;')
    #cmds.append('estimate dependence probabilities from dan_kiva referencing activity limit 10 save to activity_z;')

    #cmds.append('select * from dha_small;')
    #cmds.append('select probability(mdcr_spnd_outp=1), probability(mdcr_spnd_outp=2), probability(mdcr_spnd_outp=3) from dha_small;')

    for cmd in cmds:
        print '>>> %s' % cmd
        result = client.execute(cmd, timing=True)
        print result
Example #2
def test_flights():
    client = Client()

    cmds = []
    cmds.append('drop ptable jayt;')
    cmds.append(
        'create ptable jayt from /home/sgeadmin/tabular_predDB/Examples/flight_data_subset.csv;'
    )
    cmds.append('create 2 models for jayt;')
    cmds.append('analyze jayt for 1 iterations;')
    cmds.append(
        'select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20;'
    )
    cmds.append(
        'select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0;'
    )
    cmds.append(
        'select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0 with respect to actualelapsedtime;'
    )
    cmds.append(
        'select dayofweek, actualelapsedtime, similarity to 0 with respect to actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0 with respect to actualelapsedtime, dayofweek;'
    )
    cmds.append(
        'select dayofweek, actualelapsedtime, similarity to 0 from jayt where distance > 800 limit 5;'
    )
    cmds.append(
        'select dayofweek, actualelapsedtime, arrtime, similarity to 0 with respect to arrtime from jayt where distance > 800 limit 5 order by similarity to 0 with respect to arrtime, dayofweek;'
    )

    cmds.append(
        'select probability(actualelapsedtime=200) from jayt where distance > 800 limit 20;'
    )
    # cmds.append('select * from jayt limit 5;')
    #cmds.append('infer actualelapsedtime from jayt with confidence 0.8 limit 20;')

    cmds.append(
        'simulate dayofweek, deptime, crsdeptime FROM jayt where dayofweek = 7 TIMES 3;'
    )
    cmds.append('estimate dependence probabilities from jayt;')
    cmds.append(
        'estimate dependence probabilities from jayt referencing actualelapsedtime limit 6 save to fz;'
    )
    cmds.append(
        'estimate dependence probabilities from jayt referencing actualelapsedtime with confidence 0.5;'
    )
    #cmds.append('drop ptable jayt;')
    #cmds.append('estimate dependence probabilities from dan_kiva referencing activity limit 10 save to activity_z;')

    #cmds.append('select * from dha_small;')
    #cmds.append('select probability(mdcr_spnd_outp=1), probability(mdcr_spnd_outp=2), probability(mdcr_spnd_outp=3) from dha_small;')

    for cmd in cmds:
        print '>>> %s' % cmd
        result = client.execute(cmd, timing=True)
        print result
Example #3
def test_dha_story_demo():
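    # Replay a scripted BQL session against the DHA dataset.  Run with the
    # 'record' argument to save results to a pickle; otherwise each result is
    # checked against the recorded baseline.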
    client = Client()

    tests_dir = os.path.split(os.path.realpath(__file__))[0]
    dha_csv_path = os.path.join(tests_dir, 'data/dha.csv')
    dha_samples_path = os.path.join(tests_dir, 'samples/dha_samples.pkl.gz')
    test_results_path = os.path.join(
        tests_dir, 'regression_test_output/dha_story_results_record.pkl')

    cmd_list = [
        'DROP BTABLE dha_demo;',
        'CREATE BTABLE dha_demo FROM %s;' % dha_csv_path,
        'IMPORT SAMPLES %s INTO dha_demo;' % dha_samples_path,
        'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo LIMIT 10;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING qual_score LIMIT 6;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING qual_score WITH CONFIDENCE 0.9;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING pymt_p_md_visit LIMIT 6;',
        #    'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\') LIMIT 10;',
        'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\', qual_score), ami_score  LIMIT 10;',
        'SELECT name, qual_score, ami_score,  pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\', pymt_p_visit_ratio), ttl_mdcr_spnd  LIMIT 10;',
        #    'SIMULATE name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo WHERE ami_score=95.0  TIMES 10;',
        #    'SIMULATE name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo WHERE ttl_mdcr_spnd=50000 TIMES 10;',
    ]

    dha_story_results = []
    if len(sys.argv) > 1 and sys.argv[1] == 'record':
        print 'Recording new dha_story_results to %s' % test_results_path
        record = True
    else:
        ## Testing
        dha_story_results = pickle.load(open(test_results_path, 'r'))
        record = False

    for i, cmd in enumerate(cmd_list):
        print cmd
        result = client.execute(cmd, timing=False, pretty=True)
        if record:
            dha_story_results.append(result)
        else:
            if type(result) == dict:
                for k, v in result.iteritems():
                    if isinstance(v, numpy.ndarray):
                        assert (v == dha_story_results[i][k]).all(), (
                            v, dha_story_results[i][k])
                    else:
                        assert v == dha_story_results[i][k], (
                            v, dha_story_results[i][k])
            else:
                #assert result == dha_story_results[i], (result, dha_story_results[i])
                pass

    if record:
        pickle.dump(dha_story_results, open(test_results_path, 'w'))
Example #4
def test_dha_story_demo():
    client = Client()

    tests_dir = os.path.split(os.path.realpath(__file__))[0]
    dha_csv_path = os.path.join(tests_dir, 'data/dha.csv')
    dha_samples_path = os.path.join(tests_dir, 'samples/dha_samples.pkl.gz')
    test_results_path = os.path.join(tests_dir, 'regression_test_output/dha_story_results_record.pkl')

    cmd_list = [
        'DROP BTABLE dha_demo;',
        'CREATE BTABLE dha_demo FROM %s;' % dha_csv_path,
        'IMPORT SAMPLES %s INTO dha_demo;' % dha_samples_path,
        'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo LIMIT 10;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING qual_score LIMIT 6;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING qual_score WITH CONFIDENCE 0.9;', 
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING pymt_p_md_visit LIMIT 6;',
    #    'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\') LIMIT 10;',
        'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\', qual_score), ami_score  LIMIT 10;',
        'SELECT name, qual_score, ami_score,  pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\', pymt_p_visit_ratio), ttl_mdcr_spnd  LIMIT 10;',
    #    'SIMULATE name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo WHERE ami_score=95.0  TIMES 10;',
    #    'SIMULATE name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo WHERE ttl_mdcr_spnd=50000 TIMES 10;',
    ]


    dha_story_results = []
    if len(sys.argv) > 1 and sys.argv[1] == 'record':
        print 'Recording new dha_story_results to %s' % test_results_path
        record = True
    else:
        ## Testing
        dha_story_results = pickle.load(open(test_results_path, 'r'))
        record = False

    for i, cmd in enumerate(cmd_list):
        print cmd
        result = client.execute(cmd, timing=False, pretty=True)
        if record:
            dha_story_results.append(result)
        else:
            if type(result) == dict:
                for k,v in result.iteritems():
                    if isinstance(v, numpy.ndarray):
                        assert (v == dha_story_results[i][k]).all(), (v, dha_story_results[i][k])
                    else:
                        assert v == dha_story_results[i][k], (v, dha_story_results[i][k])
            else:
                #assert result == dha_story_results[i], (result, dha_story_results[i])
                pass


    if record:
        pickle.dump(dha_story_results, open(test_results_path, 'w'))
Example #5
def setup_function(function):
  global test_tablenames, client, test_filenames
  test_tablenames = []
  test_filenames = []
  # Default upgrade_key_column is None, to let the user choose, but need to avoid
  # user input during testing, so for testing just create a new key column.
  client = Client(testing=True)
Example #6
def test_btable_list():
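  # Verify that 'list btables' reflects newly created and dropped btables, and
  # that the list persists across a fresh Client instance.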
  global client, test_filenames

  out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
  init_btable_count = len(out)

  test_tablename1 = create_dha()

  out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
  assert len(out) == 1 + init_btable_count
  assert test_tablename1 in out

  test_tablename2 = create_dha()

  out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
  assert len(out) == 2 + init_btable_count
  assert test_tablename1 in out
  assert test_tablename2 in out

  client('drop btable %s' % test_tablename1, yes=True, debug=True, pretty=False)

  out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
  assert len(out) == 1 + init_btable_count
  assert test_tablename1 not in out
  assert test_tablename2 in out

  ## test to make sure btable list is persisted
  del client
  client = Client()

  out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
  assert len(out) == 1 + init_btable_count
  assert test_tablename1 not in out
  assert test_tablename2 in out
Example #7
def run_experiment(argin):
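    # For each synthetic shape dataset (sinwave, x, ring, dots), build and
    # analyze a btable, SIMULATE x,y points from the fitted models, and return
    # the original and simulated data for comparison.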
    num_rows = argin["num_rows"]
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    ct_kernel = argin["ct_kernel"]
    datatype = argin["datatype"]

    # generate the data
    datasets = gen_shapetest_csvs(num_rows)

    client = Client()

    # drop tables
    print "Dropping tables."
    client('DROP BTABLE exp_sinwave;', yes=True)
    client('DROP BTABLE exp_x;', yes=True)
    client('DROP BTABLE exp_ring;', yes=True)
    client('DROP BTABLE exp_dots;', yes=True)

    data_out = dict()
    data_out['config'] = argin

    # recreate sin wave
    for shape in ["sinwave", "x", "ring", "dots"]:
        query_list = gen_base_queries(num_iters, num_chains, num_rows, shape,
                                      ct_kernel, datatype)
        for query in query_list:
            print query
            client(query)

        table = table_string[shape]
        datafile = table + '.csv'

        out = client('SIMULATE x,y FROM %s TIMES %i;' % (table, num_rows),
                     pretty=False)

        X_original = datasets[shape]
        X_inferred = numpy.array(out[0])

        # get the logps
        # latent_states = client.engine.persistence_layer.get_latent_states(table)
        # X_L_list = latent_states[0]
        # logps = [X_L['logp'] for X_L in X_L_list]

        # this_key = shape + "_logps"
        # data_out[this_key] = logps

        this_key = shape + "_inferred"
        data_out[this_key] = X_inferred

        this_key = shape + "_original"
        data_out[this_key] = X_original

    return data_out
Example #8
def run_example(name):
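    # Execute the named example's <name>_analysis.bql script through the client
    # and raise if any command reports an error.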
    # Default upgrade_key_column is None, to let the user choose, but need to avoid
    # user input during testing, so default will be to create a new key column.
    client = Client(testing=True)
    file_path = os.path.join('../../examples/%s/%s_analysis.bql' %
                             (name, name))
    results = client(open(file_path, 'r'),
                     yes=True,
                     pretty=False,
                     plots=False,
                     key_column=0)
    for r in results:
        if 'Error' in r or ('error' in r and r['error']):
            raise Exception(str(r))
Example #9
def run_command_line():
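    # Build a Client from an optional hostname[:port] argument and start the
    # interactive BQL prompt.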
    # Get command line arguments to specify hostname and port
    hostname = None
    port = None
    if len(sys.argv) > 1:
        # Treat the first argument as hostname[:port]
        input = sys.argv[1].split(':')
        hostname = input[0]
        if len(input) == 1:
            client = Client(hostname)
            print "Using hostname %s." % hostname
        elif len(input) == 2:
            port = int(input[1])
            client = Client(hostname, port)
            print "Using hostname %s, port %d" % (hostname, port)
        else:
            print "Run with 'python bql [hostname[:port]]'"
            return
    else:
        client = Client()

    print """Welcome to BayesDB. You may enter BQL commands directly into this prompt. Type 'help' for help, and 'quit' to quit."""
    app = BayesDBApp(client)
    app.cmdloop()
Example #10
def run_experiment(argin):
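    # "Haystack break" experiment: embed two dependent column pairs (the
    # needles) among an increasing number of distractor columns and record the
    # estimated dependence probabilities after each analysis iteration.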
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    max_cols = argin["max_cols"]
    rho = argin["rho"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    multimodal = argin["multimodal"]
    separation = argin["separation"]

    all_cols = max_cols + 4 # max_cols plus number of dependent columns

    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)
        numpy.random.seed(seed)

    # build full data file
    # generate column indices and header
    col_names = [ "col_%i" % i for i in range(all_cols)]

    Zv = [0,0,1,1] # our needles
    Zv.extend(range(2,all_cols-2))

    min_clusters = 3
    max_clusters = 10

    T_array = numpy.zeros( (num_rows, all_cols) )

    Sigma = numpy.array( [[1.0,rho],[rho,1.0]])
    mu = numpy.array([0,0])

    if multimodal:
        T = [[0]*all_cols]*num_rows
        Zv = [0,0,1,1] # our needles
        Zv.extend(range(2,all_cols-2))
        random.shuffle(Zv)

        num_views = max(Zv)+1

        separation = [separation]*num_views  # same separation for every view

        min_clusters = 4
        max_clusters = 5

        cluster_weights = []
        # generate weights. 
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append( [1.0/num_clusters]*num_clusters ) 

        # NOTE: data_mode and multinomial_categories are not read from argin in
        # this experiment; assume continuous columns, as in the non-multimodal branch.
        cctypes, distargs = eu.get_column_types('continuous', all_cols, 0)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights, separation, distargs=distargs)
        T_array = numpy.array(T)
    else:
        T_array[:, 0:1+1] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        T_array[:, 2:3+1] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        separation = .5
        for col in range(4, all_cols):
            num_clusters = random.randrange(min_clusters, max_clusters)+1
            for row in range(num_rows):
                k = random.randrange(num_clusters)
                T_array[row, col] = numpy.random.randn()+k*6*separation

        T = T_array.tolist()

    # save file to .csv
    exp_path = 'expdata/hb/'
    eu.make_folder(exp_path)
    filename = exp_path + "haystack_break_exp.csv"
    table = "haystack_break_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)
    # done building data file

    # get column step size (powers of two)
    num_steps = int( math.log(max_cols, 2) )-1
    step_size = [2**t for t in range(2, num_steps+1)]

    assert step_size[-1] <= max_cols

    if step_size[-1] < max_cols:
        step_size.append(max_cols)

    assert step_size[0] == 4 and step_size[-1] == max_cols

    # the needle column names
    needle_a_cols = (col_names[0],col_names[1])
    needle_b_cols = (col_names[2],col_names[3])

    result = dict()
    result['steps'] = []

    for num_distractor_columns in step_size:
        # create subdata
        T_sub = take_T_column_subset(T, range(4+num_distractor_columns) )
        subpath = exp_path+'d_'+str(num_distractor_columns)+'/'
        eu.make_folder(subpath)
        subfilename = subpath + "haystack_break_exp_" + str(num_distractor_columns) + ".csv"
        eu.list_to_csv(subfilename, T_sub)

        col_names_sub = T_sub[0]

        # generate queries
        queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols,
                            col_names_sub, table, num_indep_queries)
        num_queries = len(queries)

        dependence_probs = numpy.zeros( (num_iters+1, num_queries) )

        client = Client()

        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, subfilename))
        init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
        print init_string 
        client(init_string)
        client('SHOW DIAGNOSTICS FOR %s;' % table)

        # do the analyses
        for i in range(0,num_iters+1):
            if i > 0:
                if ct_kernel == 1:
                    client( 'ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table )
                else:
                    client( 'ANALYZE %s FOR 1 ITERATIONS WAIT;' % table )

            for q in range(num_queries):
                query = queries[q]
                out = client(query, pretty=False, pandas_output=False)
                dependence_probs[i,q] = out[0]['data'][0][1]

        subresult = dict()
        # store the queries in subresult
        subresult['query_col1'] = []
        subresult['query_col2'] = []
        subresult['dependence_probs'] = dependence_probs
        for pair in pairs:
            subresult['query_col1'].append(pair[0])
            subresult['query_col2'].append(pair[1])
        
        # for each query, get whether those columns were actually independent
        independent = [True]*num_queries
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]            
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False

        subresult['cols_independent'] = independent
        subresult['distractor_cols'] = num_distractor_columns
        result['steps'].append(subresult)
    
    result['config'] = argin
    result['data'] = T_array

    return result
Example #11
def run_experiment(argin):
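    # Compare CrossCat, CRP mixture, and naive Bayes model configurations on a
    # held-out row: track the estimated probability of each held-out value and
    # its squared error against the true density over analysis iterations.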
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    separation = argin["separation"]
    seed = argin["seed"]
    ct_kernel = argin["ct_kernel"]

    if seed > 0:
        random.seed(seed)

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    # have to generate synthetic data
    filename = "exp_estimate_joint_ofile.csv"
    table_name = 'exp_estimate_joint'

    # generate starting data
    T_o, structure = eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv with bottom row removed (held-out data)
    data_filename = 'exp_estimate_joint.csv'
    T_h = eu.gen_held_out_data(filename, data_filename, 1)

    # get the column names
    with open(filename, 'r') as f:
        csv_header = f.readline()
    col_names = csv_header.split(',')
    col_names[-1] = col_names[-1].strip()

    # set up a dict for the different config data
    result = dict()

    true_held_out_p = []
    for col in range(num_cols):
        x = T_o[-1, col]
        logp = eu.get_true_logp(numpy.array([x]), col, structure)
        true_held_out_p.append(numpy.exp(logp))

    # start a client
    client = Client()

    # do analyses
    for config in ['cc', 'crp', 'nb']:
        config_string = eu.config_map[config]
        table = table_name + '-' + config

        # drop old btable, create a new one with the new data and init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, data_filename))
        client('INITIALIZE %i MODELS FOR %s %s;' %
               (num_chains, table, config_string))

        these_ps = numpy.zeros(num_iters)
        these_ps_errors = numpy.zeros(num_iters)
        for i in range(num_iters):
            if ct_kernel == 1:
                client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' %
                       table)
            else:
                client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

            # query the probability of each held-out value and its squared
            # error against the true density
            mean_p = []
            mean_p_error = []
            for col in range(0, num_cols):
                col_name = col_names[col]
                x = T_o[-1, col]
                out = client('SELECT PROBABILITY OF %s=%f from %s;' %
                             (col_name, x, table),
                             pretty=False,
                             pandas_output=False)
                p = out[0]['data'][0][1]

                mean_p.append(p)
                mean_p_error.append((true_held_out_p[col] - p)**2.0)

            these_ps[i] = numpy.mean(mean_p)
            these_ps_errors[i] = numpy.mean(mean_p_error)

        key_str_p = 'mean_held_out_p_' + config
        key_str_error = 'mean_error_' + config
        result[key_str_p] = these_ps
        result[key_str_error] = these_ps_errors

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['mean_error_nb']
    retval['MSE_crp_mixture_indexer'] = result['mean_error_crp']
    retval['MSE_crosscat_indexer'] = result['mean_error_cc']

    retval['MEAN_P_naive_bayes_indexer'] = result['mean_held_out_p_nb']
    retval['MEAN_P_crp_mixture_indexer'] = result['mean_held_out_p_crp']
    retval['MEAN_P_crosscat_indexer'] = result['mean_held_out_p_cc']

    retval['config'] = argin

    return retval
Example #12
def run_experiment(argin):
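    # Needles-in-a-haystack experiment: optionally embed two dependent column
    # pairs among noise columns and track DEPENDENCE PROBABILITY estimates for
    # needle and independent column pairs over analysis iterations.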
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    with_id = argin["with_id"]
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    separation = argin["separation"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]

    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    # generate column indices and header
    col_names = ["col_%i" % i for i in range(num_cols)]

    if mixed_types and multinomial_categories > 0:
        data_mode = 'mixed'
    elif multinomial_categories > 0:
        data_mode = 'multinomial'
    else:
        data_mode = 'continuous'

    if needles:
        T = [[0] * num_cols] * num_rows
        Zv = [0, 0, 1, 1]  # our needles
        Zv.extend(range(2, num_cols - 2))
        # random.shuffle(Zv)

        num_views = max(Zv) + 1

        separation = [.95] * 2
        separation.extend([0.0] * (num_views - 2))

        min_clusters = 4
        max_clusters = 5

        cluster_weights = []
        # generate weights.
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                if independent_clusters:
                    num_clusters = random.randrange(min_clusters, max_clusters)
                else:
                    num_clusters = 1

            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        cctypes, distargs = eu.get_column_types(data_mode, num_cols,
                                                multinomial_categories)
        T, _ = sdg.gen_data(cctypes,
                            num_rows,
                            Zv,
                            cluster_weights,
                            separation,
                            distargs=distargs)
    else:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)

    # # prepend the row_id
    # if with_id:
    #     needle_a_cols = (1,2)
    #     needle_b_cols = (3,4)
    #     col_names.insert(0, 'ID')
    #     # TODO: ID type
    #     cctypes.insert(0,'continuous')
    #     # header = "ID,%s" % header
    #     if needles:
    #         Zv.insert(0, num_views)
    #     for row in range(num_rows):
    #         T[row].insert(0, row)
    # else:
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    # save file to .csv
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols,
                                                 col_names, table,
                                                 num_indep_queries)
    num_queries = len(queries)

    dependence_probs = numpy.zeros((num_iters, num_queries))

    client = Client()

    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    print init_string
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    # do the analyses
    for i in range(num_iters):
        if ct_kernel == 1:
            client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
        else:
            client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

        for q in range(num_queries):
            query = queries[q]
            out = client(query, pretty=False, pandas_output=False)
            dependence_probs[i, q] = out[0]['data'][0][1]

    result = dict()
    # store the queries in result
    result['query_col1'] = []
    result['query_col2'] = []
    result['dependence_probs'] = dependence_probs
    for pair in pairs:
        result['query_col1'].append(pair[0])
        result['query_col2'].append(pair[1])

    # for each query, get whether those columns were actually independent
    independent = [True] * num_queries
    if needles:
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False

    result['cols_independent'] = independent
    result['config'] = argin
    result['config']['data_mode'] = data_mode

    client('SHOW DIAGNOSTICS FOR %s;' % table)

    return result
Example #13
def run_experiment(argin):
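    # Hide a proportion of values in the DHA data, then over repeated runs
    # track how DEPENDENCE PROBABILITY estimates and the fraction of values
    # filled in by INFER change with analysis iterations, returning means and
    # standard errors across runs.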
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_runs = argin["num_runs"]
    prop_missing = argin["prop_missing"]
    confidence = argin["confidence"]
    seed = argin["seed"]

    n_queries = 2

    # random.seed(seed)

    # using dha, for now
    start_filename = "../data/dha.csv"
    table = 'exp_shrinks_with_iters'

    filename, indices, col_names = eu.gen_missing_data_csv(
        start_filename, prop_missing, [0])

    # get some random column pairs to do DEPENDENCE PROBABILITY queries on
    # don't do queries on the first column
    columns = range(1, len(col_names))
    column_queries = [random.sample(columns, 2) for _ in range(n_queries)]

    dependence_queries = []
    for q in column_queries:
        col_1 = col_names[q[0]].lower()
        col_2 = col_names[q[1]].lower()
        this_query = "SELECT DEPENDENCE PROBABILITY OF %s WITH %s FROM %s;" % (
            col_1, col_2, table)
        dependence_queries.append(this_query)

    # get some inference queries
    column_queries = random.sample(columns, n_queries)
    infer_queries = []
    for q in column_queries:
        col = col_names[q].lower()
        this_query = 'INFER %s FROM %s WITH CONFIDENCE %f;' % (col, table,
                                                               confidence)
        infer_queries.append(this_query)

    # create a client
    client = Client()

    dependence_results = []
    inference_results = []
    for _ in range(num_runs):

        # drop old table, create new table, init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, filename))
        client('INITIALIZE %i MODELS FOR %s;' % (num_chains, table))

        dependence_results_run = numpy.zeros((n_queries, num_iters))
        inference_results_run = numpy.zeros((n_queries, num_iters))

        for i in range(num_iters):
            # analyze
            client('ANALYZE %s FOR 1 ITERATIONS;' % (table))

            # dependence
            for q in range(n_queries):
                out_dep = client(dependence_queries[q],
                                 pretty=False,
                                 pandas_output=False)
                dep = out_dep[0]['data'][0][1]
                dependence_results_run[q, i] = dep

            # infer
            for q in range(n_queries):
                out_inf = client(infer_queries[q],
                                 pretty=False,
                                 pandas_output=False)
                prop = _get_prop_inferred(out_inf[0]['data'], indices,
                                          column_queries[q])
                inference_results_run[q, i] = prop

        dependence_results.append(dependence_results_run)
        inference_results.append(inference_results_run)

    # calculate mean and errors (dependence)
    dep_means = numpy.zeros((n_queries, num_iters))
    dep_error = numpy.zeros((n_queries, num_iters))

    for i in range(num_iters):
        X = numpy.zeros((n_queries, num_runs))
        for r in range(num_runs):
            X[:, r] = dependence_results[r][:, i]
        dep_means[:, i] = numpy.mean(X, axis=1)
        dep_error[:, i] = numpy.std(X, axis=1) / float(num_runs)**.5

    # calculate mean and errors (infer)
    inf_means = numpy.zeros((n_queries, num_iters))
    inf_error = numpy.zeros((n_queries, num_iters))
    for i in range(num_iters):
        X = numpy.zeros((n_queries, num_runs))
        for r in range(num_runs):
            X[:, r] = inference_results[r][:, i]

        inf_means[:, i] = numpy.mean(X, axis=1)
        inf_error[:, i] = numpy.std(X, axis=1) / float(num_runs)**.5

    result = dict()
    result['config'] = argin
    result['num_queries'] = n_queries
    result['iteration'] = range(1, num_iters + 1)
    result['dependence_probability_mean'] = dep_means
    result['dependence_probability_error'] = dep_error
    result['infer_means'] = inf_means
    result['infer_stderr'] = inf_error

    return result
Example #14
def run_experiment(argin):
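    # Generate continuous data, hide increasing proportions of values, and
    # compare the imputation MSE of CrossCat, CRP mixture, and naive Bayes
    # configurations using INFER with confidence 0.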
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    impute_samples = argin["impute_samples"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    filename = "exp_fills_in_ofile.csv"
    table_name = 'exp_fills_in'

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv
    all_filenames = []
    all_indices = []
    for p in prop_missing:
        data_filename, indices, col_names, extra = eu.gen_missing_data_csv(
            filename, p, [], True)
        all_indices.append(indices)
        all_filenames.append(data_filename)

    # get the starting table so we can calculate errors
    T_array = extra['array_filled']
    num_rows, num_cols = T_array.shape

    # create a client
    client = Client()

    # set up a dict for the different config data
    result = dict()
    result['cc'] = numpy.zeros(len(prop_missing))
    result['crp'] = numpy.zeros(len(prop_missing))
    result['nb'] = numpy.zeros(len(prop_missing))

    # do analyses
    for p in range(len(prop_missing)):
        this_indices = all_indices[p]
        this_filename = all_filenames[p]
        for config in ['cc', 'crp', 'nb']:
            config_string = eu.config_map[config]
            table = table_name + '-' + config

            # drop old btable, create a new one with the new data and init models
            client('DROP BTABLE %s;' % table, yes=True)
            client('CREATE BTABLE %s FROM %s;' % (table, this_filename))
            client('INITIALIZE %i MODELS FOR %s %s;' %
                   (num_chains, table, config_string))

            if ct_kernel == 1:
                client('ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;' %
                       (table, num_iters))
            else:
                client('ANALYZE %s FOR %i ITERATIONS WAIT;' %
                       (table, num_iters))

            MSE = 0.0
            count = 0.0
            # impute each index in indices and calculate the squared error
            for col in range(0, num_cols):
                col_name = col_names[col]
                # confidence is set to zero so that a value is always returned
                out = client(
                    'INFER %s from %s WITH CONFIDENCE %f WITH %i SAMPLES;' %
                    (col_name, table, 0, impute_samples),
                    pretty=False,
                    pandas_output=False)

                data = out[0]['data']

                # calculate MSE
                for row, tcol in zip(this_indices[0], this_indices[1]):
                    if tcol == col:
                        MSE += (T_array[row, col] - data[row][1])**2.0
                        count += 1.0

            result[config][p] = MSE / count
            print "error = %f" % result[config][p]

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['nb']
    retval['MSE_crp_mixture_indexer'] = result['crp']
    retval['MSE_crosscat_indexer'] = result['cc']
    retval['prop_missing'] = prop_missing
    retval['config'] = argin

    return retval
Example #15
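# End-to-end session: load a CSV into a btable, set column datatypes,
# initialize 20 models, and run 100 iterations of analysis.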
from bayesdb.client import Client
client = Client()
client('DROP BTABLE dialysisai;')
client('CREATE BTABLE dialysisai FROM learn_data.csv;')
client(
    'UPDATE DATATYPES FROM dialysisai SET PROG_DURATION=continuous, BLOOD_VOLUME=continuous, REAL_SYMPTOM_ID=ignore;'
)
client('CREATE 20 MODELS FOR dialysisai;')
client('ANALYZE dialysisai FOR 100 ITERATIONS;')

def run_example():
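    # Step through flights_analysis.bql interactively, pausing for <Enter>
    # before each command.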
    client = Client()
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, 'flights_analysis.bql')
    print "\nA series of BQL commands will be displayed. Hit <Enter> to execute the displayed command.\n"
    client(open(file_path, 'r'), wait=True)

def run_experiment(argin):
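    # Calibration experiment: generate multinomial data, hide a proportion of
    # values, and for each model configuration compare the category frequencies
    # recovered by INFER against the empirical frequencies of the hidden cells.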
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]

    multinomial_categories = argin["multinomial_categories"]
    seed = argin["seed"]

    random.seed(seed)

    # TODO: use dha.csv
    ofilename = "reasonably_calibrated_ofile.csv"
    table_name = 'reasonably_calibrated'

    argin['distargs'] = [{"K": multinomial_categories}] * num_cols
    argin['cctypes'] = ['multinomial'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    T_array, structure = eu.gen_data(ofilename, argin, save_csv=True)

    filename, indices, col_names = eu.gen_missing_data_csv(
        ofilename, prop_missing, [])

    # create a client
    client = Client()

    # calculate empirical frequency of each point
    frequencies = []
    for col in range(num_cols):
        frequencies.append(numpy.zeros(multinomial_categories))
    T_int = numpy.array(T_array, dtype=int)

    n_indices = len(indices[0])
    for i in range(n_indices):
        r = indices[0][i]
        c = indices[1][i]
        x = T_int[r, c]
        frequencies[c][x] += 1.0

    frequencies = [f / numpy.sum(f) for f in frequencies]

    # set up a dict for the different config data
    result = dict()

    # do analyses
    for config in ['cc', 'crp', 'nb']:
        config_string = eu.config_map[config]
        table = table_name + '-' + config

        # drop old btable, create a new one with the new data and init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, filename))
        client('INITIALIZE %i MODELS FOR %s %s;' %
               (num_chains, table, config_string))

        if ct_kernel == 1:
            client('ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;' %
                   (table, num_iters))
        else:
            client('ANALYZE %s FOR %i ITERATIONS WAIT;' % (table, num_iters))

        # infer each missing value and tally the inferred category counts per column
        results_config = []
        for col in range(num_cols):
            results_config.append(numpy.zeros(multinomial_categories))
        for col in range(num_cols):
            col_name = col_names[col]
            out = client(
                "INFER %s FROM %s WITH CONFIDENCE .95 WITH 1 SAMPLES;" %
                (col_name, table),
                pretty=False,
                pandas_output=False)
            for i in range(n_indices):
                r = indices[0][i]
                c = indices[1][i]
                if c == col:
                    x = out[0]['data'][r][1]
                    results_config[c][int(x)] += 1.0

        results_config = [f / sum(f) for f in results_config]
        result[config] = results_config

    retval = dict()
    retval['actual_frequencies'] = frequencies
    retval['inferred_P_cc'] = result['cc']
    retval['inferred_P_crp'] = result['crp']
    retval['inferred_P_nb'] = result['nb']
    retval['config'] = argin

    return retval