Ejemplo n.º 1
0
def test_nonintegral_noindex():
    """Expect ValueError when loading a labelled frame with default options.

    The frame mixes a string into an otherwise integer column and carries
    the row labels 42, 78, 62, 43; it is read into table 't' of a fresh
    BayesDB without passing `create` or `index`.
    """
    rows = [(1, 2, 'foo'), (4, 5, 6), (7, 8, 9), (10, 11, 12)]
    row_labels = [42, 78, 62, 43]
    with bayesdb_open() as bdb:
        frame = pandas.DataFrame(rows, index=row_labels)
        with pytest.raises(ValueError):
            bayesdb_read_pandas_df(bdb, 't', frame)
Ejemplo n.º 2
0
def test_get_metadata():
    """Check crosscat_utils.get_metadata around model initialization.

    The call is expected to raise BLE while the generator has no models,
    to raise ValueError for a generator name that was never created, and
    to return a dict containing 'X_D' and 'X_L' once models exist.
    """
    import os

    table_name = "tmp_table"
    generator_name = "tmp_cc"
    pandas_df = get_test_df()

    os.environ["BAYESDB_WIZARD_MODE"] = "1"
    with bayeslite.bayesdb_open() as bdb:
        bayesdb_read_pandas_df(bdb, table_name, pandas_df, create=True)
        create_generator = """
            create generator {} for {} using crosscat(guess(*))
        """.format(generator_name, table_name)
        bdb.execute(create_generator)

        # The generator exists but no models have been initialized yet.
        with pytest.raises(BLE):
            crosscat_utils.get_metadata(bdb, generator_name, 0)

        bdb.execute("INITIALIZE 2 MODELS FOR {}".format(generator_name))

        with pytest.raises(ValueError):  # XXX from BayesLite: should be a BLE?
            crosscat_utils.get_metadata(bdb, "Peter_Gabriel", 0)

        metadata = crosscat_utils.get_metadata(bdb, generator_name, 0)
        assert isinstance(metadata, dict)
        for required_key in ("X_D", "X_L"):
            assert required_key in metadata.keys()
Ejemplo n.º 3
0
def table_from_url(bdb, table_name, url):
    """Download a CSV document from `url` and load it into `bdb`.

    The response body is decoded as UTF-8, stripped of non-ASCII
    characters, parsed with ``pd.read_csv`` and read into `table_name`
    with ``create=True, ifnotexists=True``.

    Raises ``requests.HTTPError`` when the server answers with an error
    status.
    """
    response = requests.get(url)
    # Fail loudly on 4xx/5xx rather than parsing an error page as CSV.
    response.raise_for_status()
    ascii_text = response.content.decode('utf-8').encode('ascii', 'ignore')
    df = pd.read_csv(StringIO.StringIO(ascii_text))
    read_pandas.bayesdb_read_pandas_df(bdb,
                                       table_name,
                                       df,
                                       create=True,
                                       ifnotexists=True)
def prepare_bdb(bdb, samples, table):
    """Load `samples` into `table` and build an analyzed loom generator.

    `samples` is passed to ``pd.DataFrame(data=...)``.  After loading, a
    population with guessed stattypes and a loom generator are created
    for `table`, four models are initialized and analysis runs for 100
    iterations.
    """
    qt = bayeslite.bql_quote_name(table)
    dataframe = pd.DataFrame(data=samples)
    # Load into the table that the statements below refer to.  The name
    # used to be hard-coded to 'data', which only worked when the caller
    # happened to pass table == 'data'.
    read_pandas.bayesdb_read_pandas_df(bdb, table, dataframe, create=True)

    bdb.execute('''
        CREATE POPULATION FOR %s WITH SCHEMA (
            GUESS STATTYPES OF (*)
        )
    ''' % (qt, ))
    bdb.execute('CREATE GENERATOR FOR %s USING loom;' % (qt, ))
    bdb.execute('INITIALIZE 4 MODELS FOR %s;' % (qt, ))
    bdb.execute('ANALYZE %s FOR 100 ITERATIONS;' % (qt, ))
def prepare_bdb(bdb, samples, table):
    """Populate `table` from `samples` and analyze a loom generator on it.

    `samples` is wrapped in a ``pd.DataFrame``.  The function then
    creates a population (stattypes guessed) and a loom generator for
    `table`, initializes four models and analyzes for 100 iterations.
    """
    qt = bayeslite.bql_quote_name(table)
    dataframe = pd.DataFrame(data=samples)
    # The frame must land in `table` itself: the population below is
    # created for `table`, so the former hard-coded 'data' target broke
    # every call with a different table name.
    read_pandas.bayesdb_read_pandas_df(bdb, table, dataframe, create=True)

    bdb.execute('''
        CREATE POPULATION FOR %s WITH SCHEMA (
            GUESS STATTYPES OF (*)
        )
    ''' % (qt,))
    bdb.execute('CREATE GENERATOR FOR %s USING loom;' % (qt,))
    bdb.execute('INITIALIZE 4 MODELS FOR %s;' % (qt,))
    bdb.execute('ANALYZE %s FOR 100 ITERATIONS;' % (qt,))
Ejemplo n.º 6
0
def do_test(bdb, t, df, index=None):
    """Exercise bayesdb_read_pandas_df on a table `t` that does not exist yet.

    Steps, in order:
    1. reading without ``create`` is expected to raise ValueError;
    2. reading with ``create=True`` must store one row per frame row;
    3. repeating that with ``ifnotexists=False`` is expected to raise
       ValueError and leave the row count unchanged;
    4. repeating it with ``ifnotexists=True`` is expected to raise
       apsw.ConstraintError and leave the row count unchanged.

    `index` is forwarded unchanged to every bayesdb_read_pandas_df call.
    """
    qt = bql_quote_name(t)
    countem = 'select count(*) from %s' % (qt, )
    # Every count check compares against the frame's own size, so the
    # helper is not tied to four-row frames.
    expected_rows = len(df.index)
    assert not bayesdb_has_table(bdb, t)

    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb, t, df, index=index)

    bayesdb_read_pandas_df(bdb,
                           t,
                           df,
                           create=True,
                           ifnotexists=False,
                           index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()

    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb,
                               t,
                               df,
                               create=True,
                               ifnotexists=False,
                               index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()

    with pytest.raises(apsw.ConstraintError):
        bayesdb_read_pandas_df(bdb,
                               t,
                               df,
                               create=True,
                               ifnotexists=True,
                               index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()
Ejemplo n.º 7
0
def df_to_table(df, tablename=None, **kwargs):
    """Open a new BayesDB and load `df` into it as a single table.

    Arguments:
    df -- the Pandas DataFrame to load.
    tablename -- name for the table; when None, the name comes from
        ``bdb.temp_table_name()``.
    kwargs -- forwarded to `bayesdb_open`.

    Returns the pair ``(bdb, tablename)``: the new BayesDB instance and
    the name of the table that was created.
    """
    bdb = bayesdb_open(**kwargs)
    chosen_name = bdb.temp_table_name() if tablename is None else tablename
    bayesdb_read_pandas_df(bdb, chosen_name, df, create=True)
    return bdb, chosen_name
Ejemplo n.º 8
0
def df_to_table(df, tablename=None, **kwargs):
    """Build a fresh BayesDB whose only table holds the rows of `df`.

    `df` is a Pandas DataFrame.  `tablename` names the new table; if it
    is None the database supplies one via ``temp_table_name()``.  Any
    extra keyword arguments go straight to `bayesdb_open`.

    The result is a 2-tuple: the BayesDB instance, then the table name.
    """
    bdb = bayesdb_open(**kwargs)
    name = tablename
    if name is None:
        name = bdb.temp_table_name()
    bayesdb_read_pandas_df(bdb, name, df, create=True)
    result = (bdb, name)
    return result
Ejemplo n.º 9
0
def draw_a_cc_state(filename):
    """Plot one crosscat state of synthetic data and save it to `filename`.

    A 100 x 50 clustered table is generated from a random seed, about a
    quarter of its cells are replaced by NaN, and the table is loaded
    into an in-memory BayesDB.  A crosscat generator with 4 models is
    analyzed for 10 iterations, and model 0 is drawn with `draw_state`.
    """
    rng_seed = random.randrange(10000)
    num_rows, num_cols = 100, 50
    num_splits = num_clusters = 5
    nan_prop = .25

    table_name = 'plottest'
    generator_name = 'plottest_cc'

    # generate some clustered data; only the table itself is used
    T, _M_c, _M_r, _X_L, _X_D = du.generate_clean_state(
        rng_seed, num_clusters, num_cols, num_rows, num_splits)

    # knock out a random subset of cells
    for r in range(num_rows):
        for c in range(num_cols):
            if random.random() < nan_prop:
                T[r][c] = float('nan')

    column_names = ['col_%i' % i for i in range(num_cols)]
    input_df = pd.DataFrame(T, columns=column_names)

    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    bdb = bayeslite.bayesdb_open()
    bayesdb_read_pandas_df(bdb, table_name, input_df, create=True)
    create_generator = '''
        create generator {} for {} using crosscat(guess(*))
    '''.format(generator_name, table_name)
    bdb.execute(create_generator)
    bdb.execute('initialize 4 models for {}'.format(generator_name))
    bdb.execute('analyze {} for 10 iterations wait'.format(generator_name))

    plt.figure(facecolor='white', tight_layout=False)
    blue = (0., 0., 1., 1.)
    reddish = (1, .15, .25, 1.)
    draw_state(bdb, table_name, generator_name, 0,
               separator_width=1, separator_color=blue,
               short_names=False, nan_color=reddish)
    plt.savefig(filename)
Ejemplo n.º 10
0
def draw_a_cc_state(filename):
    """Draw a crosscat state for generated data and write it to `filename`.

    The data is a 100-row, 50-column clustered table from
    ``du.generate_clean_state``.  Each cell is set to NaN with
    probability 0.25 before the table is loaded into a new BayesDB,
    modelled with crosscat (4 models, 10 iterations) and rendered for
    model 0.
    """
    seed = random.randrange(10000)
    num_rows = 100
    num_cols = 50
    num_splits = 5
    num_clusters = 5
    nan_prop = .25
    table_name = 'plottest'
    generator_name = 'plottest_cc'

    # generate some clustered data
    state = du.generate_clean_state(seed, num_clusters, num_cols, num_rows,
                                    num_splits)
    T, _M_c, _M_r, _X_L, _X_D = state

    # punch NaN holes into the table, cell by cell
    for row_idx in range(num_rows):
        for col_idx in range(num_cols):
            if random.random() < nan_prop:
                T[row_idx][col_idx] = float('nan')

    input_df = pd.DataFrame(
        T, columns=['col_%i' % i for i in range(num_cols)])

    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    bdb = bayeslite.bayesdb_open()
    bayesdb_read_pandas_df(bdb, table_name, input_df, create=True)
    bdb.execute('''
        create generator {} for {} using crosscat(guess(*))
    '''.format(generator_name, table_name))
    for statement in ('initialize 4 models for {}',
                      'analyze {} for 10 iterations wait'):
        bdb.execute(statement.format(generator_name))

    plt.figure(facecolor='white', tight_layout=False)
    draw_options = dict(separator_width=1,
                        separator_color=(0., 0., 1., 1.),
                        short_names=False,
                        nan_color=(1, .15, .25, 1.))
    draw_state(bdb, table_name, generator_name, 0, **draw_options)
    plt.savefig(filename)
Ejemplo n.º 11
0
def test_get_metadata():
    """Verify get_metadata's errors and its result once models exist.

    Before models are initialized the call is expected to raise BLE; for
    an unknown generator name it is expected to raise ValueError; after
    initialization it must return a dict with 'X_D' and 'X_L' keys.
    """
    import os

    table_name = 'tmp_table'
    generator_name = 'tmp_cc'
    pandas_df = get_test_df()

    os.environ['BAYESDB_WIZARD_MODE'] = '1'
    with bayeslite.bayesdb_open() as bdb:
        bayesdb_read_pandas_df(bdb, table_name, pandas_df, create=True)
        generator_bql = '''
            create generator {} for {} using crosscat(guess(*))
        '''.format(generator_name, table_name)
        bdb.execute(generator_bql)

        # No models have been initialized at this point.
        with pytest.raises(BLE):
            crosscat_utils.get_metadata(bdb, generator_name, 0)

        bdb.execute('INITIALIZE 2 MODELS FOR {}'.format(generator_name))

        with pytest.raises(ValueError):  # XXX from BayesLite: should be a BLE?
            crosscat_utils.get_metadata(bdb, 'Peter_Gabriel', 0)

        result = crosscat_utils.get_metadata(bdb, generator_name, 0)
        assert isinstance(result, dict)
        assert 'X_D' in result.keys()
        assert 'X_L' in result.keys()
Ejemplo n.º 12
0
def do_test(bdb, t, df, index=None):
    """Check bayesdb_read_pandas_df's create/ifnotexists handling for `t`.

    `t` must not exist when the helper is called.  Reading without
    ``create`` is expected to raise ValueError.  Reading with
    ``create=True`` must store ``len(df.index)`` rows.  Two further
    attempts on the now-existing table are expected to raise ValueError
    (``ifnotexists=False``) and apsw.ConstraintError
    (``ifnotexists=True``), each leaving the row count unchanged.

    `index` is forwarded unchanged to every bayesdb_read_pandas_df call.
    """
    qt = bql_quote_name(t)
    countem = "select count(*) from %s" % (qt,)
    # Compare against the frame's own length instead of a literal 4 so
    # the helper works for frames of any size.
    expected_rows = len(df.index)
    assert not bayesdb_has_table(bdb, t)

    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb, t, df, index=index)

    bayesdb_read_pandas_df(bdb, t, df, create=True, ifnotexists=False, index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()

    with pytest.raises(ValueError):
        bayesdb_read_pandas_df(bdb, t, df, create=True, ifnotexists=False, index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()

    with pytest.raises(apsw.ConstraintError):
        bayesdb_read_pandas_df(bdb, t, df, create=True, ifnotexists=True, index=index)
    assert expected_rows == bdb.execute(countem).fetchvalue()
Ejemplo n.º 13
0
    
    # Replace each entry of liste_datar that also occurs in liste_col with
    # the entry of liste_desc at the same position.  list.index() returns the
    # first match, so the first occurrence of a value is the one rewritten.
    for element in liste_datar:
      if element in liste_col:
        liste_datar[liste_datar.index(element)] = liste_desc[liste_col.index(element)]
    
    
    datareduce.columns = liste_datar
    
    
  # Create the .bdb file: the path is "bdb/" + l[i] without its last four
  # characters + ".bdb" (presumably stripping a file extension -- confirm).
  bdb = bayeslite.bayesdb_open("bdb/"+str(str(l[i][:-4]))+".bdb")
  # Drop any generator/table left in the file so the load below starts clean.
  bdbcontrib.query(bdb,'''drop generator if exists dfr_cc''')
  bdbcontrib.query(bdb,'''drop table if exists dfr''')
  bayesdb_read_pandas_df(bdb, "dfr", datareduce, create=True)
  # quickstart is pointed at the same file path and the 'dfr' table.
  test = quickstart(name='dfr', bdb_path="bdb/"+str(str(l[i][:-4]))+".bdb")
  q = test.q


  # Run the analysis; t is the elapsed wall-clock time in whole seconds.
  import time
  start_time = time.time()
  test.analyze(models=30, iterations=70)
  t = int(time.time() - start_time)
  # Dependence-probability matrix.
  # NOTE(review): '%g' is handed to test.q as-is; presumably quickstart
  # substitutes the generator name for it -- confirm.

  img = test.heatmap(test.q('''ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE COLUMNS OF %g'''))
  ax = img.add_subplot(111)
  handles, labels = ax.get_legend_handles_labels()
  # Place the legend below the axes, centred horizontally.
  lgd = ax.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5,-0.1))
Ejemplo n.º 14
0
               , 'LOAN_DEATH_YR2_RT', 'LOAN_COMP_ORIG_YR2'
               , 'NOLOAN_DEATH_YR2_RT', 'NOLOAN_COMP_ORIG_YR2'
               , 'NOLOAN_ENRL_ORIG_YR', 'COMPL_RPY_1YR_RT'
               , 'NONCOMPL_RPY_1YR_RT', 'LO_INC_RPY_1YR_RT'
               , 'MD_INC_RPY_1YR_RT', 'HI_INC_RPY_1YR_RT'
               , 'DEP_RPY_1YR_RT', 'IND_RPY_1YR_RT'
               , 'COMPL_RPY_3YR_RT'
               , 'NONCOMPL_RPY_3YR_RT','LO_INC_RPY_3YR_RT'
               , 'MD_INC_RPY_3YR_RT', 'HI_INC_RPY_3YR_RT'
               , 'DEP_RPY_3YR_RT', 'IND_RPY_3YR_RT'
               , 'COMPL_RPY_5YR_RT'
               , 'NONCOMPL_RPY_5YR_RT', 'LO_INC_RPY_5YR_RT'
               , 'MD_INC_RPY_5YR_RT', 'HI_INC_RPY_5YR_RT'
               , 'DEP_RPY_5YR_RT', 'IND_RPY_5YR_RT'
               ,'DEBT_MDN', 'GRAD_DEBT_MDN'
               ,'WDRAW_DEBT_MDN', 'LO_INC_DEBT_MDN'
               ,'MD_INC_DEBT_MDN', 'HI_INC_DEBT_MDN'
               ,'DEP_DEBT_MDN', 'IND_DEBT_MDN'
               ,'faminc', 'md_faminc'
               ,'mn_earn_wne_p10', 'md_earn_wne_p10'
               ,'pct10_earn_wne_p10', 'pct25_earn_wne_p10'
               ,'pct75_earn_wne_p10', 'pct90_earn_wne_p10']] 

# Open the on-disk database, remove what an earlier run may have left behind,
# then reload `df` and hand the file to quickstart for analysis.
bdb = bayeslite.bayesdb_open("df.bdb")
# IF EXISTS keeps the first run, when neither object exists yet, from failing.
bdbcontrib.query(bdb, 'drop generator if exists df_cc')
bdbcontrib.query(bdb, 'drop table if exists df')
bayesdb_read_pandas_df(bdb, "df", df, create=True)
ed = quickstart(name='df', bdb_path='df.bdb')
q = ed.q

ed.analyze(models=32, minutes=1)
Ejemplo n.º 15
0
def test_nonintegral_noindex():
    """Loading a labelled frame with default options must raise ValueError.

    The frame has four rows labelled 42, 78, 62 and 43, with the string
    'foo' in an otherwise integer column.  It is read into table 't'
    without `create` or `index`.
    """
    data = [(1, 2, 'foo'), (4, 5, 6), (7, 8, 9), (10, 11, 12)]
    with bayesdb_open() as bdb:
        df = pandas.DataFrame(data, index=[42, 78, 62, 43])
        expect_failure = pytest.raises(ValueError)
        with expect_failure:
            bayesdb_read_pandas_df(bdb, 't', df)
Ejemplo n.º 16
0
        #L = [cols[cols.iloc[:,0]==c]['Study ID'].iloc[0] for c in liste_datar]
        L = []

        # Swap every entry of liste_datar that is present in liste_col for
        # the liste_desc entry at the matching position.  list.index() finds
        # the first occurrence, so that is the slot that gets overwritten.
        for element in liste_datar:
            if element in liste_col:
                liste_datar[liste_datar.index(element)] = liste_desc[
                    liste_col.index(element)]

        datareduce.columns = liste_datar

    # Create the .bdb file: the path is "bdb/" + l[i] minus its last four
    # characters + ".bdb" (presumably stripping a file extension -- confirm).

    bdb = bayeslite.bayesdb_open("bdb/" + str(str(l[i][:-4])) + ".bdb")
    # Drop any earlier generator/table so the load below starts clean.
    bdbcontrib.query(bdb, '''drop generator if exists dfr_cc''')
    bdbcontrib.query(bdb, '''drop table if exists dfr''')
    bayesdb_read_pandas_df(bdb, "dfr", datareduce, create=True)
    # quickstart is pointed at the same file path and the 'dfr' table.
    test = quickstart(name='dfr',
                      bdb_path="bdb/" + str(str(l[i][:-4])) + ".bdb")
    q = test.q

    # Run the analysis; t is the elapsed wall-clock time in whole seconds.
    import time
    start_time = time.time()
    test.analyze(models=30, iterations=70)
    t = int(time.time() - start_time)
    # Dependence-probability matrix.
    # NOTE(review): '%g' is handed to test.q as-is; presumably quickstart
    # substitutes the generator name for it -- confirm.

    img = test.heatmap(
        test.q(
            '''ESTIMATE DEPENDENCE PROBABILITY FROM PAIRWISE COLUMNS OF %g'''))
    ax = img.add_subplot(111)