# NOTE: the imports below are assumptions reconstructed from usage in this
# section: `Client` issues the BQL queries, and `eu` provides the data
# helpers used here (gen_data, gen_missing_data_csv, config_map).
import random

import numpy

import experiment_utils as eu
from bayesdb.client import Client


# Experiment 1 (reasonably_calibrated): hold out multinomial cells, impute
# them under each model configuration, and compare the inferred category
# frequencies against the empirical frequencies of the held-out values.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    multinomial_categories = argin["multinomial_categories"]
    seed = argin["seed"]

    random.seed(seed)

    # TODO: use dha.csv
    ofilename = "reasonably_calibrated_ofile.csv"
    table_name = "reasonably_calibrated"

    argin["distargs"] = [{"K": multinomial_categories}] * num_cols
    argin["cctypes"] = ["multinomial"] * num_cols
    argin["separation"] = [argin["separation"]] * num_views

    T_array, structure = eu.gen_data(ofilename, argin, save_csv=True)
    filename, indices, col_names = eu.gen_missing_data_csv(
        ofilename, prop_missing, [])

    # create a client
    client = Client()

    # calculate the empirical frequency of each category among held-out cells
    frequencies = []
    for col in range(num_cols):
        frequencies.append(numpy.zeros(multinomial_categories))
    T_int = numpy.array(T_array, dtype=int)
    n_indices = len(indices[0])
    for i in range(n_indices):
        r = indices[0][i]
        c = indices[1][i]
        x = T_int[r, c]
        frequencies[c][x] += 1.0
    frequencies = [f / numpy.sum(f) for f in frequencies]

    # set up a dict for the different config data
    result = dict()

    # do analyses
    for config in ["cc", "crp", "nb"]:
        config_string = eu.config_map[config]
        table = table_name + "-" + config

        # drop old btable, create a new one with the new data and init models
        client("DROP BTABLE %s;" % table, yes=True)
        client("CREATE BTABLE %s FROM %s;" % (table, filename))
        client("INITIALIZE %i MODELS FOR %s %s;"
               % (num_chains, table, config_string))

        if ct_kernel == 1:
            client("ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;"
                   % (table, num_iters))
        else:
            client("ANALYZE %s FOR %i ITERATIONS WAIT;" % (table, num_iters))

        # impute each index in indices and tally the inferred categories
        results_config = []
        for col in range(num_cols):
            results_config.append(numpy.zeros(multinomial_categories))

        for col in range(num_cols):
            col_name = col_names[col]
            out = client(
                "INFER %s FROM %s WITH CONFIDENCE .95 WITH 1 SAMPLES;"
                % (col_name, table),
                pretty=False, pandas_output=False)
            for i in range(n_indices):
                r = indices[0][i]
                c = indices[1][i]
                if c == col:
                    x = out[0]["data"][r][1]
                    results_config[c][int(x)] += 1.0

        results_config = [f / sum(f) for f in results_config]
        result[config] = results_config

    retval = dict()
    retval["actual_frequencies"] = frequencies
    retval["inferred_P_cc"] = result["cc"]
    retval["inferred_P_crp"] = result["crp"]
    retval["inferred_P_nb"] = result["nb"]
    retval["config"] = argin

    return retval
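
# Minimal usage sketch for the calibration experiment. The values below are
# illustrative assumptions, not settings from any recorded run; each key
# mirrors an argin lookup in the function above.
example_calibration_config = {
    "num_iters": 200,             # ANALYZE iterations per btable
    "num_chains": 8,              # models initialized per btable
    "num_rows": 300,
    "num_cols": 4,
    "num_views": 2,
    "num_clusters": 4,
    "prop_missing": 0.1,          # fraction of cells held out for imputation
    "separation": 0.9,            # cluster separation passed to eu.gen_data
    "ct_kernel": 0,               # 1 selects the MH column-transition kernel
    "multinomial_categories": 5,
    "seed": 448,
}
# retval = run_experiment(example_calibration_config)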

# Experiment 2 (exp_fills_in): hold out increasing proportions of continuous
# cells, impute them under each model configuration, and record the mean
# squared error per missingness level.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    impute_samples = argin["impute_samples"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    filename = "exp_fills_in_ofile.csv"
    table_name = 'exp_fills_in'

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv for each missingness proportion
    all_filenames = []
    all_indices = []
    for p in prop_missing:
        data_filename, indices, col_names, extra = eu.gen_missing_data_csv(
            filename, p, [], True)
        all_indices.append(indices)
        all_filenames.append(data_filename)

    # get the starting table so we can calculate errors
    T_array = extra['array_filled']
    num_rows, num_cols = T_array.shape

    # create a client
    client = Client()

    # set up a dict for the different config data
    result = dict()
    result['cc'] = numpy.zeros(len(prop_missing))
    result['crp'] = numpy.zeros(len(prop_missing))
    result['nb'] = numpy.zeros(len(prop_missing))

    # do analyses
    for p in range(len(prop_missing)):
        this_indices = all_indices[p]
        this_filename = all_filenames[p]
        for config in ['cc', 'crp', 'nb']:
            config_string = eu.config_map[config]
            table = table_name + '-' + config

            # drop old btable, create a new one with the new data, init models
            client('DROP BTABLE %s;' % table, yes=True)
            client('CREATE BTABLE %s FROM %s;' % (table, this_filename))
            client('INITIALIZE %i MODELS FOR %s %s;'
                   % (num_chains, table, config_string))

            if ct_kernel == 1:
                client('ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;'
                       % (table, num_iters))
            else:
                client('ANALYZE %s FOR %i ITERATIONS WAIT;'
                       % (table, num_iters))

            MSE = 0.0
            count = 0.0

            # impute each index in indices and calculate the squared error
            for col in range(0, num_cols):
                col_name = col_names[col]
                # confidence is set to zero so that a value is always returned
                out = client(
                    'INFER %s FROM %s WITH CONFIDENCE %f WITH %i SAMPLES;'
                    % (col_name, table, 0, impute_samples),
                    pretty=False, pandas_output=False)
                data = out[0]['data']

                # accumulate squared error over this column's held-out cells
                for row, tcol in zip(this_indices[0], this_indices[1]):
                    if tcol == col:
                        MSE += (T_array[row, col] - data[row][1]) ** 2.0
                        count += 1.0

            result[config][p] = MSE / count
            print "error = %f" % result[config][p]

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['nb']
    retval['MSE_crp_mixture_indexer'] = result['crp']
    retval['MSE_crosscat_indexer'] = result['cc']
    retval['prop_missing'] = prop_missing
    retval['config'] = argin

    return retval
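
# Usage sketch for the fills-in experiment (illustrative assumed values).
# Unlike the calibration experiment, prop_missing is a list here: the result
# is one MSE per missingness level per indexer configuration.
example_fills_in_config = {
    "num_iters": 200,
    "num_chains": 8,
    "num_rows": 300,
    "num_cols": 4,
    "num_views": 2,
    "num_clusters": 4,
    "prop_missing": [0.1, 0.25, 0.5, 0.75],
    "impute_samples": 100,   # samples per INFER call
    "separation": 0.9,
    "ct_kernel": 0,
    "seed": 448,
}
# retval = run_experiment(example_fills_in_config)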

# Experiment 3 (exp_shrinks_with_iters): on the DHA data, track how
# DEPENDENCE PROBABILITY estimates and the proportion of cells INFER fills
# in evolve with ANALYZE iterations, averaged over repeated runs.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_runs = argin["num_runs"]
    prop_missing = argin["prop_missing"]
    confidence = argin["confidence"]
    seed = argin["seed"]

    n_queries = 2

    # NOTE: seeding is disabled here; `seed` is read but unused
    # random.seed(seed)

    # using dha, for now
    start_filename = "../data/dha.csv"
    table = 'exp_shrinks_with_iters'

    filename, indices, col_names = eu.gen_missing_data_csv(
        start_filename, prop_missing, [0])

    # get some random column pairs to do DEPENDENCE PROBABILITY queries on;
    # don't do queries on the first column
    columns = range(1, len(col_names))
    column_queries = [random.sample(columns, 2) for _ in range(n_queries)]

    dependence_queries = []
    for q in column_queries:
        col_1 = col_names[q[0]].lower()
        col_2 = col_names[q[1]].lower()
        this_query = "SELECT DEPENDENCE PROBABILITY OF %s WITH %s FROM %s;" \
            % (col_1, col_2, table)
        dependence_queries.append(this_query)

    # get some inference queries
    column_queries = random.sample(columns, n_queries)
    infer_queries = []
    for q in column_queries:
        col = col_names[q].lower()
        this_query = 'INFER %s FROM %s WITH CONFIDENCE %f;' \
            % (col, table, confidence)
        infer_queries.append(this_query)

    # create a client
    client = Client()

    dependence_results = []
    inference_results = []
    for _ in range(num_runs):
        # drop old table, create new table, init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, filename))
        client('INITIALIZE %i MODELS FOR %s;' % (num_chains, table))

        dependence_results_run = numpy.zeros((n_queries, num_iters))
        inference_results_run = numpy.zeros((n_queries, num_iters))

        for i in range(num_iters):
            # analyze one iteration at a time so we can query after each step
            client('ANALYZE %s FOR 1 ITERATIONS;' % table)

            # dependence
            for q in range(n_queries):
                out_dep = client(dependence_queries[q], pretty=False,
                                 pandas_output=False)
                dep = out_dep[0]['data'][0][1]
                dependence_results_run[q, i] = dep

            # infer
            for q in range(n_queries):
                out_inf = client(infer_queries[q], pretty=False,
                                 pandas_output=False)
                prop = _get_prop_inferred(out_inf[0]['data'], indices,
                                          column_queries[q])
                inference_results_run[q, i] = prop

        dependence_results.append(dependence_results_run)
        inference_results.append(inference_results_run)

    # calculate means and standard errors (dependence)
    dep_means = numpy.zeros((n_queries, num_iters))
    dep_error = numpy.zeros((n_queries, num_iters))
    for i in range(num_iters):
        X = numpy.zeros((n_queries, num_runs))
        for r in range(num_runs):
            X[:, r] = dependence_results[r][:, i]

        dep_means[:, i] = numpy.mean(X, axis=1)
        dep_error[:, i] = numpy.std(X, axis=1) / float(num_runs) ** .5

    # calculate means and standard errors (infer)
    inf_means = numpy.zeros((n_queries, num_iters))
    inf_error = numpy.zeros((n_queries, num_iters))
    for i in range(num_iters):
        X = numpy.zeros((n_queries, num_runs))
        for r in range(num_runs):
            X[:, r] = inference_results[r][:, i]

        inf_means[:, i] = numpy.mean(X, axis=1)
        inf_error[:, i] = numpy.std(X, axis=1) / float(num_runs) ** .5

    result = dict()
    result['config'] = argin
    result['num_queries'] = n_queries
    result['iteration'] = range(1, num_iters + 1)
    result['dependence_probability_mean'] = dep_means
    result['dependence_probability_error'] = dep_error
    result['infer_means'] = inf_means
    result['infer_stderr'] = inf_error

    return result
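
# `_get_prop_inferred` is called above but not defined in this section. The
# sketch below is a hypothetical reconstruction, NOT the original helper: it
# assumes `data` rows are (row_key, value) pairs ordered by row index (the
# same indexing convention as the MSE loop in exp_fills_in) and that a cell
# INFER declined to fill comes back as None or NaN. Under those assumptions
# it returns the fraction of held-out cells in `col` that received a value.
def _get_prop_inferred(data, indices, col):
    num_missing = 0.0
    num_inferred = 0.0
    for row, tcol in zip(indices[0], indices[1]):
        if tcol == col:
            num_missing += 1.0
            value = data[row][1]
            # value == value is False only for NaN
            if value is not None and value == value:
                num_inferred += 1.0
    if num_missing == 0:
        return 0.0
    return num_inferred / num_missing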