import random
import time

# Assumed to be defined elsewhere in this module: `engine` (the engine under
# test), the `data_utils` and `bql` helper modules, and the `test_tablenames`
# list used for teardown.


def create_dha(path='data/dha.csv'):
    test_tablename = 'dhatest' + str(int(time.time() * 1000000)) + \
        str(int(random.random() * 10000000))
    header, rows = data_utils.read_csv(path)
    create_btable_result = engine.create_btable(test_tablename, header, rows,
                                                key_column=0)
    # metadata = engine.persistence_layer.get_metadata(test_tablename)
    global test_tablenames
    test_tablenames.append(test_tablename)
    return test_tablename, create_btable_result
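
# Usage sketch: callers get back both the generated table name and the raw
# create_btable result; the name is what later engine and persistence-layer
# calls key off of, e.g.
#
#     test_tablename, _ = create_dha()
#     metadata = engine.persistence_layer.get_metadata(test_tablename)
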
def create_describe_btable(data_path='data/describe.csv',
                           codebook_path='data/describe_codebook.csv',
                           use_codebook=True):
    # TODO: refactor codebook generation to Engine, not Client
    test_tablename = 'describetest' + str(int(time.time() * 1000000)) + \
        str(int(random.random() * 10000000))
    if use_codebook:
        codebook_header, codebook_rows = data_utils.read_csv(codebook_path)
        codebook = dict()
        for codebook_row in codebook_rows:
            codebook[codebook_row[0]] = dict(
                zip(['short_name', 'description', 'value_map'],
                    codebook_row[1:]))
    else:
        codebook = None
    header, rows = data_utils.read_csv(data_path)
    create_btable_result = engine.create_btable(test_tablename, header, rows,
                                                key_column=0,
                                                codebook=codebook)
    global test_tablenames
    test_tablenames.append(test_tablename)
    return test_tablename, create_btable_result
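
# Codebook shape sketch (the column name below is hypothetical): each row of
# describe_codebook.csv is assumed to be
# [column_name, short_name, description, value_map], which the loop above
# turns into entries like
#
#     codebook['age'] = {'short_name': 'Age',
#                        'description': 'Age in years',
#                        'value_map': ''}
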
def test_subsampling():
    # Use the Kiva table, which has 10000 rows, instead of DHA.
    test_tablename = 'kivatest' + str(int(time.time() * 1000000)) + \
        str(int(random.random() * 10000000))
    global test_tablenames
    test_tablenames.append(test_tablename)

    path = 'data/kiva_small.csv'
    header, rows = data_utils.read_csv(path)
    num_rows = 4  # rows in kiva_small
    num_rows_subsample = 2
    # client('create btable %s from %s' % (test_tablename, path),
    #        debug=True, pretty=False)

    # Only analyze using some of the rows.
    engine.create_btable(test_tablename, header, rows,
                         subsample=num_rows_subsample, key_column=0)

    # Make sure select (using no models) works and returns every row,
    # not just the subsample.
    functions = bql.bql_statement.parseString(
        'select loan_id, loan_status from test', parseAll=True).functions
    whereclause = None
    limit = float('inf')
    order_by = False
    select_result = engine.select(test_tablename, functions, whereclause,
                                  limit, order_by, None)
    assert len(select_result['data']) == num_rows  # all rows in kiva_small

    # TODO: better testing to see what we can do before subsampling
    # (with partial models)
    num_models = 2
    iterations = 1
    engine.initialize_models(test_tablename, num_models)
    # analyze segfaults
    engine.analyze(test_tablename, model_indices='all',
                   iterations=iterations, background=False)
    print('analyzed')
    model_ids = engine.persistence_layer.get_model_ids(test_tablename)
    for i in range(num_models):
        model = engine.persistence_layer.get_models(test_tablename, i)
        assert model['iterations'] == iterations

    # Make sure normal queries work and return the correct number of rows.
    functions = bql.bql_statement.parseString(
        'select loan_id, predictive probability of loan_status from test',
        parseAll=True).functions
    whereclause = None
    limit = float('inf')
    order_by = False
    select_result = engine.select(test_tablename, functions, whereclause,
                                  limit, order_by, None)
    assert len(select_result['data']) == num_rows  # all rows in kiva_small
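
# A teardown sketch (assumptions: this suite runs under pytest and the engine
# exposes a `drop_btable` method; neither is shown in this section). Every
# helper above appends its generated table name to `test_tablenames`, so
# cleanup reduces to a single loop.
def teardown_function(function):
    global test_tablenames
    for tablename in test_tablenames:
        engine.drop_btable(tablename)
    test_tablenames = []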