def test_nonintegral_noindex():
    """Loading a frame with a custom row index into 't' must raise ValueError.

    The frame has four rows labelled 42, 78, 62 and 43, and is handed to
    `bayesdb_read_pandas_df` with neither `create` nor `index` arguments.
    """
    rows = [(1, 2, 'foo'), (4, 5, 6), (7, 8, 9), (10, 11, 12)]
    row_labels = [42, 78, 62, 43]
    with bayesdb_open() as bdb:
        df = pandas.DataFrame(rows, index=row_labels)
        with pytest.raises(ValueError):
            bayesdb_read_pandas_df(bdb, 't', df)
def test_get_metadata():
    """Check `crosscat_utils.get_metadata` before and after initialization.

    Expectations, in order:
      * with the generator created but no models initialized, the call
        raises BLE;
      * after initializing two models, asking for an unknown generator
        name raises ValueError;
      * asking for model 0 of the real generator returns a dict that
        contains the keys 'X_D' and 'X_L'.

    Side effect: sets BAYESDB_WIZARD_MODE=1 in the process environment.
    """
    import os

    table_name = "tmp_table"
    generator_name = "tmp_cc"
    pandas_df = get_test_df()

    os.environ["BAYESDB_WIZARD_MODE"] = "1"

    create_generator_bql = """
        create generator {} for {} using crosscat(guess(*))
    """.format(generator_name, table_name)
    initialize_bql = "INITIALIZE 2 MODELS FOR {}".format(generator_name)

    with bayeslite.bayesdb_open() as bdb:
        bayesdb_read_pandas_df(bdb, table_name, pandas_df, create=True)
        bdb.execute(create_generator_bql)

        # Nothing has been initialized yet, so the lookup must fail.
        with pytest.raises(BLE):
            crosscat_utils.get_metadata(bdb, generator_name, 0)

        bdb.execute(initialize_bql)

        # XXX from BayesLite: should be a BLE?
        with pytest.raises(ValueError):
            crosscat_utils.get_metadata(bdb, "Peter_Gabriel", 0)

        md = crosscat_utils.get_metadata(bdb, generator_name, 0)
        assert isinstance(md, dict)
        for required_key in ("X_D", "X_L"):
            assert required_key in md.keys()
def table_from_url(bdb, table_name, url):
    """Download a CSV document from `url` and load it into `table_name`.

    The response body is decoded as UTF-8 and every non-ASCII character is
    dropped before the text is parsed by pandas. The resulting frame is
    passed to `bayesdb_read_pandas_df` with `create=True, ifnotexists=True`.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing the error page as CSV
    # and silently loading garbage into the table.
    response.raise_for_status()
    ascii_text = response.content.decode('utf-8').encode('ascii', 'ignore')
    df = pd.read_csv(StringIO.StringIO(ascii_text))
    read_pandas.bayesdb_read_pandas_df(
        bdb, table_name, df, create=True, ifnotexists=True)
def prepare_bdb(bdb, samples, table):
    """Load `samples` into `table` and analyze a loom generator over it.

    Steps: build a DataFrame from `samples`, create `table` from it, create
    a population over the table with guessed stattypes, create a loom
    generator, initialize 4 models and analyze for 100 iterations.

    Args:
        bdb: open BayesDB handle to populate.
        samples: data accepted by `pd.DataFrame(data=...)`.
        table: unquoted name of the table to create; it is quoted with
            `bql_quote_name` before being spliced into BQL.
    """
    qt = bayeslite.bql_quote_name(table)
    dataframe = pd.DataFrame(data=samples)
    # Load into the table the caller asked for. A hard-coded 'data' here
    # only works when `table` happens to be 'data'; any other name makes
    # the CREATE POPULATION below refer to a table that was never created.
    read_pandas.bayesdb_read_pandas_df(bdb, table, dataframe, create=True)
    bdb.execute('''
        CREATE POPULATION FOR %s WITH SCHEMA (
            GUESS STATTYPES OF (*)
        )
    ''' % (qt, ))
    bdb.execute('CREATE GENERATOR FOR %s USING loom;' % (qt, ))
    bdb.execute('INITIALIZE 4 MODELS FOR %s;' % (qt, ))
    bdb.execute('ANALYZE %s FOR 100 ITERATIONS;' % (qt, ))
def prepare_bdb(bdb, samples, table):
    """Create `table` from `samples` and fit a loom generator on it.

    The samples are wrapped in a DataFrame and loaded into `table`; a
    population with guessed stattypes and a loom generator are then
    created for that same name, 4 models are initialized and 100 analysis
    iterations are run.

    Args:
        bdb: open BayesDB handle to populate.
        samples: data accepted by `pd.DataFrame(data=...)`.
        table: unquoted table name; quoted via `bql_quote_name` for BQL.
    """
    qt = bayeslite.bql_quote_name(table)
    dataframe = pd.DataFrame(data=samples)
    # The data must land in `table` itself: the statements below all refer
    # to `table`, so loading into a fixed 'data' table breaks every call
    # that passes a different name.
    read_pandas.bayesdb_read_pandas_df(bdb, table, dataframe, create=True)
    bdb.execute('''
        CREATE POPULATION FOR %s WITH SCHEMA (
            GUESS STATTYPES OF (*)
        )
    ''' % (qt,))
    bdb.execute('CREATE GENERATOR FOR %s USING loom;' % (qt,))
    bdb.execute('INITIALIZE 4 MODELS FOR %s;' % (qt,))
    bdb.execute('ANALYZE %s FOR 100 ITERATIONS;' % (qt,))
def do_test(bdb, t, df, index=None):
    """Exercise the create/ifnotexists handling of `bayesdb_read_pandas_df`.

    Sequence checked against a table `t` that must not exist on entry:
      1. reading without `create` raises ValueError;
      2. reading with `create=True, ifnotexists=False` succeeds and the
         table then holds one row per row of `df`;
      3. repeating step 2 raises ValueError and leaves the row count alone;
      4. reading with `create=True, ifnotexists=True` raises
         apsw.ConstraintError and leaves the row count alone.

    Args:
        bdb: open BayesDB handle.
        t: unquoted table name.
        df: pandas DataFrame to load.
        index: forwarded unchanged as the `index` keyword of every read.
    """
    qt = bql_quote_name(t)
    countem = 'select count(*) from %s' % (qt, )
    # Compare against the frame's real size everywhere, so the helper also
    # works for frames that do not have exactly four rows.
    expected_rows = len(df.index)

    assert not bayesdb_has_table(bdb, t)

    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb, t, df, index=index)

    bayesdb_read_pandas_df(bdb, t, df, create=True, ifnotexists=False,
        index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()

    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb, t, df, create=True, ifnotexists=False,
            index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()

    with pytest.raises(apsw.ConstraintError):
        bayesdb_read_pandas_df(bdb, t, df, create=True, ifnotexists=True,
            index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()
def df_to_table(df, tablename=None, **kwargs):
    """Return a new BayesDB with a single table with the data in `df`.

    `df` is a Pandas DataFrame.

    If `tablename` is not supplied, an arbitrary one will be chosen.

    `kwargs` are passed on to `bayesdb_open`.

    Returns a 2-tuple of the new BayesDB instance and the name of the new
    table. The caller owns the returned instance and must close it.

    If loading `df` fails, the freshly opened BayesDB is closed before the
    exception propagates.
    """
    bdb = bayesdb_open(**kwargs)
    try:
        if tablename is None:
            tablename = bdb.temp_table_name()
        bayesdb_read_pandas_df(bdb, tablename, df, create=True)
    except BaseException:
        # The handle has not been handed to anyone yet; without this the
        # caller has no way to release it after a failed load.
        bdb.close()
        raise
    return (bdb, tablename)
def draw_a_cc_state(filename):
    """Plot one state of a small synthetic generator and save it to `filename`.

    A 100x50 table is produced by `du.generate_clean_state` with a random
    seed, roughly a quarter of its cells are overwritten with NaN, and the
    result is loaded into a fresh BayesDB as table 'plottest'. A crosscat
    generator 'plottest_cc' is created, 4 models are initialized and
    analyzed for 10 iterations, and `draw_state` is called with index 0
    before the current matplotlib figure is saved.

    Side effects: sets BAYESDB_WIZARD_MODE=1 in the process environment,
    consumes numbers from the global `random` generator, and leaves the
    matplotlib figure open.
    """
    rng_seed = random.randrange(10000)
    num_rows = 100
    num_cols = 50
    num_splits = 5
    num_clusters = 5

    nan_prop = .25

    table_name = 'plottest'
    generator_name = 'plottest_cc'

    # generate some clustered data
    ccmd = du.generate_clean_state(rng_seed, num_clusters, num_cols, num_rows,
                                   num_splits)
    T, _M_c, _M_r, _X_L, _X_D = ccmd

    # Knock out about nan_prop of the cells so the plot shows missing data.
    for row in range(num_rows):
        for col in range(num_cols):
            if random.random() < nan_prop:
                T[row][col] = float('nan')

    input_df = pd.DataFrame(T, columns=['col_%i' % i for i in range(num_cols)])

    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    # Use the handle as a context manager so it is closed even when
    # analysis or plotting fails; it never escapes this function.
    with bayeslite.bayesdb_open() as bdb:
        bayesdb_read_pandas_df(bdb, table_name, input_df, create=True)
        bdb.execute('''
            create generator {} for {} using crosscat(guess(*))
        '''.format(generator_name, table_name))
        bdb.execute('initialize 4 models for {}'.format(generator_name))
        bdb.execute('analyze {} for 10 iterations wait'.format(generator_name))
        plt.figure(facecolor='white', tight_layout=False)
        # Reuse the names defined above rather than repeating the literals.
        draw_state(
            bdb, table_name, generator_name, 0,
            separator_width=1, separator_color=(0., 0., 1., 1.),
            short_names=False, nan_color=(1, .15, .25, 1.))
        plt.savefig(filename)
def draw_a_cc_state(filename):
    """Render a state plot for a synthetic table and write it to `filename`.

    Generates a 100-row, 50-column table with `du.generate_clean_state`
    (random seed), replaces about 25% of the cells with NaN, loads it into
    a fresh BayesDB as 'plottest', fits a crosscat generator 'plottest_cc'
    (4 models, 10 iterations), calls `draw_state` with index 0 and saves
    the current matplotlib figure.

    Side effects: sets BAYESDB_WIZARD_MODE=1 in the process environment,
    draws from the global `random` generator, and leaves the matplotlib
    figure open.
    """
    rng_seed = random.randrange(10000)
    num_rows = 100
    num_cols = 50
    num_splits = 5
    num_clusters = 5

    nan_prop = .25

    table_name = 'plottest'
    generator_name = 'plottest_cc'

    # generate some clustered data
    ccmd = du.generate_clean_state(rng_seed, num_clusters, num_cols, num_rows,
                                   num_splits)
    T, _M_c, _M_r, _X_L, _X_D = ccmd

    # Blank out a random subset of cells to exercise the NaN colouring.
    for row in range(num_rows):
        for col in range(num_cols):
            if random.random() < nan_prop:
                T[row][col] = float('nan')

    input_df = pd.DataFrame(T, columns=['col_%i' % i for i in range(num_cols)])

    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    # The handle is local to this function, so close it deterministically
    # (also on error) instead of leaving it to garbage collection.
    with bayeslite.bayesdb_open() as bdb:
        bayesdb_read_pandas_df(bdb, table_name, input_df, create=True)
        bdb.execute('''
            create generator {} for {} using crosscat(guess(*))
        '''.format(generator_name, table_name))
        bdb.execute('initialize 4 models for {}'.format(generator_name))
        bdb.execute('analyze {} for 10 iterations wait'.format(generator_name))
        plt.figure(facecolor='white', tight_layout=False)
        # Pass the variables, not re-typed literals, so the names cannot
        # drift apart from the ones used to create the table and generator.
        draw_state(bdb, table_name, generator_name, 0,
                   separator_width=1, separator_color=(0., 0., 1., 1.),
                   short_names=False, nan_color=(1, .15, .25, 1.))
        plt.savefig(filename)
def test_get_metadata():
    """Verify `crosscat_utils.get_metadata` error and success paths.

    The lookup must raise BLE while the generator has no initialized
    models, raise ValueError for a generator name that does not exist,
    and otherwise return a dict containing 'X_D' and 'X_L'.

    Side effect: sets BAYESDB_WIZARD_MODE=1 in the process environment.
    """
    import os

    table_name = 'tmp_table'
    generator_name = 'tmp_cc'
    pandas_df = get_test_df()

    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    def fetch(bdb, name):
        # Every lookup in this test asks for model number 0.
        return crosscat_utils.get_metadata(bdb, name, 0)

    with bayeslite.bayesdb_open() as bdb:
        bayesdb_read_pandas_df(bdb, table_name, pandas_df, create=True)
        create_bql = '''
            create generator {} for {} using crosscat(guess(*))
        '''.format(generator_name, table_name)
        bdb.execute(create_bql)

        # Before INITIALIZE the lookup is expected to fail.
        with pytest.raises(BLE):
            fetch(bdb, generator_name)

        bdb.execute('INITIALIZE 2 MODELS FOR {}'.format(generator_name))

        # XXX from BayesLite: should be a BLE?
        with pytest.raises(ValueError):
            fetch(bdb, 'Peter_Gabriel')

        md = fetch(bdb, generator_name)
        assert isinstance(md, dict)
        available = md.keys()
        assert 'X_D' in available
        assert 'X_L' in available
def do_test(bdb, t, df, index=None):
    """Check how `bayesdb_read_pandas_df` treats missing and existing tables.

    `t` must not exist on entry. The helper asserts that:
      * a read without `create` raises ValueError;
      * a read with `create=True, ifnotexists=False` loads every row of
        `df`;
      * a second such read raises ValueError;
      * a read with `create=True, ifnotexists=True` raises
        apsw.ConstraintError;
    and that neither failing read changes the row count.

    Args:
        bdb: open BayesDB handle.
        t: unquoted table name.
        df: pandas DataFrame to load.
        index: forwarded unchanged as the `index` keyword of every read.
    """
    qt = bql_quote_name(t)
    countem = "select count(*) from %s" % (qt,)
    # Derive the expected count from the frame instead of assuming four
    # rows, matching the check made right after the first successful load.
    nrows = len(df.index)

    assert not bayesdb_has_table(bdb, t)

    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb, t, df, index=index)

    bayesdb_read_pandas_df(
        bdb, t, df, create=True, ifnotexists=False, index=index)
    assert nrows == bdb.execute(countem).fetchvalue()

    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(
            bdb, t, df, create=True, ifnotexists=False, index=index)
    assert nrows == bdb.execute(countem).fetchvalue()

    with pytest.raises(apsw.ConstraintError):
        bayesdb_read_pandas_df(
            bdb, t, df, create=True, ifnotexists=True, index=index)
    assert nrows == bdb.execute(countem).fetchvalue()
# NOTE(review): the nesting below was reconstructed from a source whose
# whitespace was collapsed; this fragment also reads `l`, `i`, `liste_col`,
# `liste_desc` and `datareduce` from code outside the visible span.

# Replace every column name that also appears in liste_col by the entry at
# the same position in liste_desc. The list is edited in place; index()
# always resolves to the first occurrence of a name.
for element in liste_datar:
    if element in liste_col:
        liste_datar[liste_datar.index(element)] = liste_desc[liste_col.index(element)]
datareduce.columns = liste_datar
# Create the bdb file. The name is l[i] without its last four characters
# (presumably a file extension such as ".csv" -- confirm).
bdb = bayeslite.bayesdb_open("bdb/"+str(str(l[i][:-4]))+".bdb")
# Drop leftovers from an earlier run (generator first, then table) before
# loading the frame again.
bdbcontrib.query(bdb,'''drop generator if exists dfr_cc''')
bdbcontrib.query(bdb,'''drop table if exists dfr''')
bayesdb_read_pandas_df(bdb, "dfr", datareduce, create=True)
# quickstart is pointed at the same file path that was opened above.
test = quickstart(name='dfr', bdb_path="bdb/"+str(str(l[i][:-4]))+".bdb")
q = test.q
# Run the analysis; t ends up holding the elapsed wall-clock time in whole
# seconds.
import time
start_time = time.time()
test.analyze(models=30, iterations=70)
t = int(time.time() - start_time)
# Dependence-probability matrix.
# NOTE(review): "%g" is passed through literally here; presumably test.q
# substitutes the generator name for it -- confirm.
img = test.heatmap(test.q('''ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE COLUMNS OF %g'''))
ax = img.add_subplot(111)
# Place a legend below the axes, built from whatever labelled artists the
# new subplot reports.
handles, labels = ax.get_legend_handles_labels()
lgd = ax.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5,-0.1))
, 'LOAN_DEATH_YR2_RT', 'LOAN_COMP_ORIG_YR2' , 'NOLOAN_DEATH_YR2_RT', 'NOLOAN_COMP_ORIG_YR2' , 'NOLOAN_ENRL_ORIG_YR', 'COMPL_RPY_1YR_RT' , 'NONCOMPL_RPY_1YR_RT', 'LO_INC_RPY_1YR_RT' , 'MD_INC_RPY_1YR_RT', 'HI_INC_RPY_1YR_RT' , 'DEP_RPY_1YR_RT', 'IND_RPY_1YR_RT' , 'COMPL_RPY_3YR_RT' , 'NONCOMPL_RPY_3YR_RT','LO_INC_RPY_3YR_RT' , 'MD_INC_RPY_3YR_RT', 'HI_INC_RPY_3YR_RT' , 'DEP_RPY_3YR_RT', 'IND_RPY_3YR_RT' , 'COMPL_RPY_5YR_RT' , 'NONCOMPL_RPY_5YR_RT', 'LO_INC_RPY_5YR_RT' , 'MD_INC_RPY_5YR_RT', 'HI_INC_RPY_5YR_RT' , 'DEP_RPY_5YR_RT', 'IND_RPY_5YR_RT' ,'DEBT_MDN', 'GRAD_DEBT_MDN' ,'WDRAW_DEBT_MDN', 'LO_INC_DEBT_MDN' ,'MD_INC_DEBT_MDN', 'HI_INC_DEBT_MDN' ,'DEP_DEBT_MDN', 'IND_DEBT_MDN' ,'faminc', 'md_faminc' ,'mn_earn_wne_p10', 'md_earn_wne_p10' ,'pct10_earn_wne_p10', 'pct25_earn_wne_p10' ,'pct75_earn_wne_p10', 'pct90_earn_wne_p10']] bdb = bayeslite.bayesdb_open("df.bdb") bdbcontrib.query(bdb,'drop generator df_cc') bdbcontrib.query(bdb,'drop table df') bayesdb_read_pandas_df(bdb, "df", df, create=True) ed = quickstart(name='df', bdb_path='df.bdb') q = ed.q ed.analyze(models=32, minutes=1)
def test_nonintegral_noindex():
    """Expect ValueError when reading a custom-indexed frame into table 't'.

    The four-row frame carries the row labels 42, 78, 62 and 43 and is
    passed to `bayesdb_read_pandas_df` without `create` or `index`.
    """
    data = [(1, 2, 'foo'), (4, 5, 6), (7, 8, 9), (10, 11, 12)]
    with bayesdb_open() as bdb:
        frame = pandas.DataFrame(data, index=[42, 78, 62, 43])
        with pytest.raises(ValueError):
            bayesdb_read_pandas_df(bdb, 't', frame)
#L = [cols[cols.iloc[:,0]==c]['Study ID'].iloc[0] for c in liste_datar] L = [] for element in liste_datar: if element in liste_col: liste_datar[liste_datar.index(element)] = liste_desc[ liste_col.index(element)] datareduce.columns = liste_datar #creating the bdb file bdb = bayeslite.bayesdb_open("bdb/" + str(str(l[i][:-4])) + ".bdb") bdbcontrib.query(bdb, '''drop generator if exists dfr_cc''') bdbcontrib.query(bdb, '''drop table if exists dfr''') bayesdb_read_pandas_df(bdb, "dfr", datareduce, create=True) test = quickstart(name='dfr', bdb_path="bdb/" + str(str(l[i][:-4])) + ".bdb") q = test.q #run analysis import time start_time = time.time() test.analyze(models=30, iterations=70) t = int(time.time() - start_time) #Depprob matrix img = test.heatmap( test.q( '''ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE COLUMNS OF %g''')) ax = img.add_subplot(111)