Example #1
 def initialize(self):
   if self.bdb:
     self.check_representation()
     return
   self.bdb = bayeslite.bayesdb_open(self.bdb_path)
   if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
     if self.df is not None:
       bayeslite.read_pandas.bayesdb_read_pandas_df(
         self.bdb, self.name, self.df, create=True, ifnotexists=True)
     elif self.csv_path:
       bayeslite.bayesdb_read_csv_file(
         self.bdb, self.name, self.csv_path,
         header=True, create=True, ifnotexists=True)
     else:
       tables = self.list_tables()
       metamodels = self.list_metamodels()
       if len(tables) + len(metamodels) == 0:
         raise BLE(ValueError("No data sources specified, and an empty bdb."))
       else:
         raise BLE(ValueError("The name of the population must be the same"
                              " as a table in the bdb, one of: " +
                              ", ".join(tables) +
                              "\nNote also that the bdb has the following"
                              " metamodels defined: " + ", ".join(metamodels)))
   self.generators = self.query('''SELECT * FROM bayesdb_generator''')
   if len(self.generators) == 0:
     size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
     assert 0 < size
     self.query('''
       CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')
   self.check_representation()
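
A minimal standalone sketch of the load-or-create pattern this initialize() implements, using hypothetical names (demo.bdb, demo_table, demo.csv); setting BAYESDB_WIZARD_MODE is assumed to be needed for GUESS(*) schemas, as in the test examples further down:

import os
import bayeslite
import bayeslite.core

# Hypothetical paths and table name, for illustration only.
BDB_PATH, TABLE, CSV_PATH = 'demo.bdb', 'demo_table', 'demo.csv'

# Some bayeslite versions gate GUESS(*) schemas behind wizard mode
# (see the later examples that set this variable).
os.environ['BAYESDB_WIZARD_MODE'] = '1'

bdb = bayeslite.bayesdb_open(BDB_PATH)
if not bayeslite.core.bayesdb_has_table(bdb, TABLE):
    # Load the CSV into a new table; ifnotexists=True makes this idempotent.
    bayeslite.bayesdb_read_csv_file(
        bdb, TABLE, CSV_PATH, header=True, create=True, ifnotexists=True)
# Guess a crosscat schema for the table (the generator name is hypothetical).
bdb.execute('CREATE GENERATOR %s_cc IF NOT EXISTS FOR %s USING crosscat(GUESS(*))'
            % (TABLE, TABLE))
bdb.close()
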
Example #2
def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.
    """
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            # n = 40 -> 40**2 -> 1600 rows total
            temp.write(_bigger_csv_data(40))
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # test N = 0
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', N=0
        )
        assert cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).shape == (0, 0)

        # test other values of N
        for N in [1, 2, 10, 20, 40]:
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=N, overwrite=True
            )
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')
            ).shape == (N**2, 3)
        # N too high should fail
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', N=41, overwrite=True
            )

        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).sort_values(by=['rowid0', 'rowid1'])
        parallel_sim.index = range(parallel_sim.shape[0])

        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')
        )

        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
def test_estimate_pairwise_similarity_long():
    """
    Tests larger queries that need to be broken into batch inserts of 500
    values each, as well as the N parameter.
    """
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            # n = 40 -> 40**2 -> 1600 rows total
            temp.write(_bigger_csv_data(40))
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=True)
        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # test N = 0
        parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc', N=0)
        assert cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')).shape == (0, 0)

        # test other values of N
        for N in [1, 2, 10, 20, 40]:
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  N=N,
                                                  overwrite=True)
            assert cursor_to_df(
                bdb.execute('SELECT * FROM t_similarity')).shape == (N**2, 3)
        # N too high should fail
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  N=41,
                                                  overwrite=True)

        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
        parallel_sim.index = range(parallel_sim.shape[0])

        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))

        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
Example #4
def test_cardinality(data, cols, cardinalities_expected):
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True,
                                            create=True)
            cards = bql_utils.cardinality(bdb, 't', cols)
            for c in cards:
                assert 2 == len(c)
                assert c[0] in ('id', 'one', 'two', 'three', 'four')
            assert cardinalities_expected == [c[1] for c in cards]
def test_cardinality(data, cols, cardinalities_expected):
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True,
                                            create=True)
            cards = bql_utils.cardinality(bdb, 't', cols)
            for col, count, expected_count in zip(
                    cards['name'], cards['distinct_count'],
                    cardinalities_expected):
                assert expected_count == count
                assert col in ('id', 'one', 'two', 'three', 'four')
                if cols is not None:
                    expected_col = cols.pop(0)
                    assert expected_col == col
            assert len(cards) == len(cardinalities_expected)
Example #6
def test_cardinality(data, cols, cardinalities_expected):
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=True)
            cards = bql_utils.cardinality(bdb, 't', cols)
            for col, count, expected_count in zip(cards['name'],
                                                  cards['distinct_count'],
                                                  cardinalities_expected):
                assert expected_count == count
                assert col in ('id', 'one', 'two', 'three', 'four')
                if cols is not None:
                    expected_col = cols.pop(0)
                    assert expected_col == col
            assert len(cards) == len(cardinalities_expected)
Example #7
def test_nullify_no_missing(data, value, num_nulls_expected):
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True,
                                            create=True)
            bql_utils.nullify(bdb, 't', value)

            c = bdb.execute('SELECT COUNT(*) FROM t WHERE one IS NULL;')
            assert c.fetchvalue() == num_nulls_expected[0]

            c = bdb.execute('SELECT COUNT(*) FROM t WHERE two IS NULL;')
            assert c.fetchvalue() == num_nulls_expected[1]

            c = bdb.execute('SELECT COUNT(*) FROM t WHERE three IS NULL;')
            assert c.fetchvalue() == num_nulls_expected[2]

            c = bdb.execute('SELECT COUNT(*) FROM t WHERE four IS NULL;')
            assert c.fetchvalue() == num_nulls_expected[3]
Example #8
 def initialize(self):
   if self.bdb:
     return
   self.bdb = bayeslite.bayesdb_open(self.bdb_path)
   if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
     if self.df is not None:
       bayeslite.read_pandas.bayesdb_read_pandas_df(
         self.bdb, self.name, self.df, create=True, ifnotexists=True)
     elif self.csv_path:
       bayeslite.bayesdb_read_csv_file(
         self.bdb, self.name, self.csv_path,
         header=True, create=True, ifnotexists=True)
     else:
       raise BLE(ValueError("No data sources specified, and an empty bdb."))
   self.generators = self.query('''SELECT * FROM bayesdb_generator''')
   if len(self.generators) == 0:
     size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
     assert 0 < size
     self.query('''
       CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')
Example #9
 def initialize(self):
     if self.bdb:
         self.check_representation()
         return
     self.bdb = bayeslite.bayesdb_open(self.bdb_path)
     if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
         if self.df is not None:
             bayeslite.read_pandas.bayesdb_read_pandas_df(self.bdb,
                                                          self.name,
                                                          self.df,
                                                          create=True,
                                                          ifnotexists=True)
         elif self.csv_path:
             bayeslite.bayesdb_read_csv_file(self.bdb,
                                             self.name,
                                             self.csv_path,
                                             header=True,
                                             create=True,
                                             ifnotexists=True)
         else:
             tables = self.list_tables()
             metamodels = self.list_metamodels()
             if len(tables) + len(metamodels) == 0:
                 raise BLE(
                     ValueError(
                         "No data sources specified, and an empty bdb."))
             else:
                 raise BLE(
                     ValueError(
                         "The name of the population must be the same"
                         " as a table in the bdb, one of: " +
                         ", ".join(tables) +
                         "\nNote also that the bdb has the following"
                         " metamodels defined: " + ", ".join(metamodels)))
     self.generators = self.query('''SELECT * FROM bayesdb_generator''')
     if len(self.generators) == 0:
         size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
         assert 0 < size
         self.query('''
     CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')
     self.check_representation()
Example #10
def test_nullify_no_missing(data, value, num_nulls_expected):
    with tempfile.NamedTemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        with bayeslite.bayesdb_open() as bdb:
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=True)
            bql_utils.nullify(bdb, 't', value)

            c = bdb.execute('SELECT COUNT(*) FROM t WHERE one IS NULL;')
            assert c.fetchvalue() == num_nulls_expected[0]

            c = bdb.execute('SELECT COUNT(*) FROM t WHERE two IS NULL;')
            assert c.fetchvalue() == num_nulls_expected[1]

            c = bdb.execute('SELECT COUNT(*) FROM t WHERE three IS NULL;')
            assert c.fetchvalue() == num_nulls_expected[2]

            c = bdb.execute('SELECT COUNT(*) FROM t WHERE four IS NULL;')
            assert c.fetchvalue() == num_nulls_expected[3]
Example #11
 def initialize(self):
   if self.bdb:
     return
   self.bdb = bayeslite.bayesdb_open(self.bdb_path)
   if not bayeslite.core.bayesdb_has_table(self.bdb, self.name):
     if self.df is not None:
       bayeslite.read_pandas.bayesdb_read_pandas_df(
         self.bdb, self.name, self.df, create=True, ifnotexists=True)
     elif self.csv_path:
       bayeslite.bayesdb_read_csv_file(
         self.bdb, self.name, self.csv_path,
         header=True, create=True, ifnotexists=True)
     else:
       raise ValueError("No data sources specified, and an empty bdb.")
   size = self.query('''SELECT COUNT(*) FROM %t''').ix[0, 0]
   assert 0 < size
   if "BAYESDB_WIZARD_MODE" in os.environ:
     old_wizmode = os.environ["BAYESDB_WIZARD_MODE"]
   else:
     old_wizmode = ""
   os.environ["BAYESDB_WIZARD_MODE"] = "1"
   self.query('''
       CREATE GENERATOR %g IF NOT EXISTS FOR %t USING crosscat( GUESS(*) )''')
   os.environ["BAYESDB_WIZARD_MODE"] = old_wizmode
def test_read_csv():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=False,
                ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=False,
                ifnotexists=True)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=True,
                ifnotexists=False)
        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=True,
                ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=False,
                ifnotexists=False)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=False,
                ifnotexists=True)
        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=True,
            ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1,2,3,'foo','bar',u'nan',u'',u'quagga'),
            (4,5,6,'baz','quux',42.0,u'',u'eland'),
            (7,8,6,'zot','mumble',87.0,u'zoot',u'caribou'),
        ]
        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=True,
            ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                    ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'
        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=False,
            ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data
        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=False,
            ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True,
                create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data
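
In short, the constraints this test exercises are: create=False requires the table to already exist, ifnotexists=True requires create=True, and header=False requires create=False (with no header row, column names must come from an existing table). A minimal happy-path sketch with an in-memory bdb and a hypothetical two-column CSV:

import StringIO
import bayeslite

csv_text = 'x,y\n1,a\n2,b\n'  # hypothetical CSV with a header row

with bayeslite.bayesdb_open() as bdb:  # in-memory database
    f = StringIO.StringIO(csv_text)
    # header=True + create=True: the table is created from the header row.
    bayeslite.bayesdb_read_csv(bdb, 'u', f, header=True, create=True,
        ifnotexists=False)
    print bdb.sql_execute('SELECT * FROM u').fetchall()
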
Example #13
PATH_EXAMPLES = os.path.dirname(PATH_KEPLER)
PATH_SATELLITES = os.path.join(PATH_EXAMPLES, 'satellites')
PATH_SATELLITES_CSV = os.path.join(PATH_SATELLITES, 'satellites.csv')

composer = Composer()
composer.register_foreign_predictor(keplers_law.KeplersLaw)
composer.register_foreign_predictor(random_forest.RandomForest)

if os.path.exists(os.path.join(outdir, 'kepler.bdb')):
    os.remove(os.path.join(outdir, 'kepler.bdb'))

bdb = bayeslite.bayesdb_open(os.path.join(outdir, 'kepler.bdb'))
bayeslite.bayesdb_register_metamodel(bdb, composer)
bayeslite.bayesdb_read_csv_file(bdb,
                                'satellites',
                                PATH_SATELLITES_CSV,
                                header=True,
                                create=True)

bdbcontrib.query(
    bdb, '''
    CREATE GENERATOR sat_kepler FOR satellites USING composer(
        default (
            Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
            Users CATEGORICAL, Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL, Perigee_km NUMERICAL,
            Apogee_km NUMERICAL, Eccentricity NUMERICAL,
            Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
Example #14
def test_read_csv():
    with bayeslite.bayesdb_open(builtin_metamodels=False) as bdb:

        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb,
                                       't',
                                       f,
                                       header=False,
                                       create=False,
                                       ifnotexists=False)

        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb,
                                       't',
                                       f,
                                       header=False,
                                       create=False,
                                       ifnotexists=True)

        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb,
                                       't',
                                       f,
                                       header=False,
                                       create=True,
                                       ifnotexists=False)

        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb,
                                       't',
                                       f,
                                       header=False,
                                       create=True,
                                       ifnotexists=True)

        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb,
                                       't',
                                       f,
                                       header=True,
                                       create=False,
                                       ifnotexists=False)

        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb,
                                       't',
                                       f,
                                       header=True,
                                       create=False,
                                       ifnotexists=True)

        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb,
                                           't',
                                           f,
                                           header=True,
                                           create=True,
                                           ifnotexists=False)
        with pytest.raises(IOError):
            # Table must have no empty values in header.
            csv_hdrdata_prime = csv_hdrdata[1:]
            f = StringIO.StringIO(csv_hdrdata_prime)
            with bdb.savepoint():
                bayeslite.bayesdb_read_csv(bdb,
                                           't',
                                           f,
                                           header=True,
                                           create=True,
                                           ifnotexists=False)

        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb,
                                   't',
                                   f,
                                   header=True,
                                   create=True,
                                   ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1, 2, 3, 'foo', 'bar', u'nan', u'', u'quagga'),
            (4, 5, 6, 'baz', 'quux', 42.0, u'', u'eland'),
            (7, 8, 6, 'zot', 'mumble', 87.0, u'zoot', u'caribou'),
        ]

        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb,
                                   't',
                                   f,
                                   header=True,
                                   create=True,
                                   ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                    ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'

        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb,
                                   't',
                                   f,
                                   header=False,
                                   create=False,
                                   ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data

        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb,
                                   't',
                                   f,
                                   header=True,
                                   create=False,
                                   ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=False,
                                            ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data

        # Test the BQL CREATE TABLE FROM <csv-file> syntax.
        f = StringIO.StringIO(csv_hdrdata)
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bdb.execute('CREATE TABLE t2 FROM \'%s\'' % (temp.name, ))
            assert bdb.sql_execute('SELECT * FROM t2').fetchall() == data

        # Trying to read a csv with an empty column name should fail.
        csv_header_corrupt = csv_hdr.replace('a,b', ',')
        csv_hdrdata_corrupt = csv_header_corrupt + csv_data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata_corrupt)
            with pytest.raises(IOError):
                bayeslite.bayesdb_read_csv_file(bdb,
                                                't3',
                                                temp.name,
                                                header=True,
                                                create=True)
Example #15
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    then = time.time()

    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user
    def out_file_name(base, ext):
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')

    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        print 'Error: File', bdb_file, 'already exists. Please remove it.'
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
            header=True, create=True, ifnotexists=True)

    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
          AND type_of_orbit = 'NaN'
    ''')

    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')

    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)

    # create the crosscat generator
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models,))

    cur_iter_ct = 0

    def snapshot():
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        os.system("cp %s %s" % (bdb_file, save_file_name))
        report(save_file_name, meta_file_name)

    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n"
                % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
                % (bayeslite.__version__, crosscat.__version__, bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name, metadata_file, echo=False, plot_file_name=None):
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            for chunk in iter(lambda: fd.read(65536), ''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name,
                            sha_sum, total_time, plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            fd.flush()
            os.system("cat %s >> %s" % (__file__, metadata_file))

        if echo:
            record_metadata(sys.stdout, saved_file_name,
                            sha_sum, total_time, plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.plot_crosscat_chain_diagnostics(bdb, 'logscore',
                                                          'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file,
               echo=True, plot_file_name=plot_file_name)

    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT' % checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()

    final_report()

    log('closing bdb %s' % bdb_file)
    bdb.close()
    os.system("cd %s && ln -s satellites%s.bdb satellites.bdb" % (out_dir, filestamp))
def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.
    """
    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(test_bql_utils.csv_data)
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(bdb,
                                            't',
                                            temp.name,
                                            header=True,
                                            create=True)

        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # How to properly use the estimate_pairwise_similarity function.
        parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')

        # Should complain about a bad cores value
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(bdb_file.name,
                                                  't',
                                                  't_cc',
                                                  cores=0)

        # Should complain if overwrite flag is not set, but t_similarity
        # exists
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(bdb_file.name, 't', 't_cc')
        # Should complain if model and table don't exist
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(bdb_file.name, 'foo',
                                                  'foo_cc')
        # Should complain if bdb_file doesn't exist
        with tempfile.NamedTemporaryFile() as does_not_exist:
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(does_not_exist.name, 't',
                                                      't_cc')

        # Should run fine if overwrite flag is set
        parallel.estimate_pairwise_similarity(bdb_file.name,
                                              't',
                                              't_cc',
                                              overwrite=True)

        # Should be able to specify another table name
        parallel.estimate_pairwise_similarity(bdb_file.name,
                                              't',
                                              't_cc',
                                              sim_table='t_similarity_2')

        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
        parallel_sim_2 = cursor_to_df(
            bdb.execute(
                'SELECT * FROM t_similarity_2 ORDER BY rowid0, rowid1'))

        # Results may be returned out of order. So we sort the values,
        # as above, and we reorder the numeric index
        parallel_sim.index = range(parallel_sim.shape[0])
        parallel_sim_2.index = range(parallel_sim_2.shape[0])

        # The data from two successive parallel pairwise estimates should be
        # identical to each other...
        assert_frame_equal(parallel_sim,
                           parallel_sim_2,
                           check_column_type=True)
        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc'))
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
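
The parallel helper used above (assumed here to be bdbcontrib's parallel module, as in these tests, alongside bdbcontrib.bql_utils.cursor_to_df) writes its results into a <table>_similarity table inside the bdb, so a later session can read them back. A brief sketch under those assumptions, given a demo.bdb that already holds table t and an analyzed generator t_cc:

import bayeslite
from bdbcontrib import parallel                 # assumed import path
from bdbcontrib.bql_utils import cursor_to_df   # assumed helper location

# Compute pairwise similarities and store them in table 't_similarity'.
parallel.estimate_pairwise_similarity('demo.bdb', 't', 't_cc', overwrite=True)

bdb = bayeslite.bayesdb_open('demo.bdb')
sim = cursor_to_df(
    bdb.execute('SELECT * FROM t_similarity ORDER BY rowid0, rowid1'))
print sim.head()
bdb.close()
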
Example #17
def test_composer_integration__ci_slow():
    # It is currently difficult to separate these tests into smaller tests because
    # of their sequential nature. We will still test all internal functions
    # with different regimes of operation.

    # SETUP
    # -----
    # Dataset.
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')
    # Composer.
    composer = Composer(n_samples=5)
    composer.register_foreign_predictor(
        multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    composer.register_foreign_predictor(random_forest.RandomForest)
    # Use complex generator for interesting test cases.
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                    Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')


    # ----------------------
    # TEST INITIALIZE MODELS
    # ----------------------

    bdb.execute('INITIALIZE 2 MODELS FOR t1')
    # Check number of models.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    assert len(df) == 2
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    assert len(df) == 2

    # -------------------
    # TEST ANALYZE MODELS
    # -------------------

    bdb.execute('ANALYZE t1 FOR 2 ITERATIONS WAIT;')
    # Check number of iterations of composer.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # Check number of iterations of composer_cc.
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2

    # ----------------------------------
    # TEST COLUMN DEPENDENCE PROBABILITY
    # ----------------------------------

    # Special 0/1 regimes.
    # Local with a INDEPENDENT local should be 0.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Date_of_Launch
            WITH Country_of_Operator FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 0
    # Local with a DEPENDENT local should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Perigee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Apogee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    # Foreign with a local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Apogee_km
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH Power_watts
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Type_of_Orbit WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a local non-parent DEPENDENT with local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Eccentricity FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with foreign sharing common direct ancestor should be 1.
    # Launch_Mass_kg is the common parent.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common DEPENDENT ancestor should be 1.
    # Eccentricity is a parent of Type_of_orbit, and is dependent
    # with Period_minutes through DEPENDENT(Apogee_km, Perigee_km, Eccentricity)
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Column with itself should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.

    # Unknown [0,1] regimes.
    # Foreign with a local of unknown relation with parents.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    # Foreign with a foreign of unknown ancestry relation.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Period_minutes FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.

    # ----------------------------------
    # TEST SIMULATE
    # ----------------------------------

    # Crash tests for various code paths. Quality of simulations ignored.
    # Joint local.
    curs = bdb.execute('''
        SIMULATE Power_watts, Launch_Mass_kg FROM t1 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes FROM t1 GIVEN Apogee_km = 1000, Perigee_km = 980
            LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign with missing parents.
    curs = bdb.execute('''
        SIMULATE Anticipated_Lifetime FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with parents, and missing parents.
    curs = bdb.execute('''
        SIMULATE Type_of_Orbit, Eccentricity FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with non-parents.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Eccentricity FROM t1 GIVEN Apogee_km = 38000
            LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint local conditioned on two foreigns.
    curs = bdb.execute('''
        SIMULATE Country_of_Operator, Inclination_radians FROM t1
            GIVEN Period_minutes = 1432, Anticipated_Lifetime = 5 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint foreign conditioned on third foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Anticipated_Lifetime FROM t1
            GIVEN Type_of_Orbit = 'Deep Highly Eccentric' LIMIT 2
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate foreign conditioned on itself.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Apogee_km FROM t1
            GIVEN Period_minutes = 102 LIMIT 2
    ''')
    assert [s[0] for s in curs] == [102] * 2

    # -----------------------------
    # TEST COLUMN VALUE PROBABILITY
    # -----------------------------

    # Crash tests for various code path. Quality of logpdf ignored.
    # Conditional local.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Power_watts = 800 GIVEN (Perigee_km = 980,
            Launch_Mass_kg = 890) FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Unconditional foreign
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Conditional foreign on parent and non-parents.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 GIVEN
            (Apogee_km = 38000, Eccentricity = 0.03) FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[0]
    # Conditional foreign on foreign.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Anticipated_Lifetime = 4.09 GIVEN
            (Class_of_Orbit = 'LEO', Purpose='Astrophysics',
                Period_minutes = 1436) FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Categorical foreign should be less than 1.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Type_of_Orbit = 'Polar' FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] <= 1.
    # Query inconsistent with evidence should be 0.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF "Type_of_Orbit" = 'Polar'
            GIVEN ("Type_of_Orbit" = 'Deep Highly Eccentric') FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] == 0.
    # In theory, query consistent with evidence should be 1, but this is very
    # hard to ensure due to stochastic sampling giving different estimates of
    # P(Y), once in joint and once in marginal Monte Carlo estimation.

    # -----------------------
    # TEST MUTUAL INFORMATION
    # -----------------------

    # Two local columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Country_of_Contractor WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX Small sample sizes non-deterministically produce negative MI
    assert -1 <= curs.next()[0]
    # One local and one foreign column.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]
    # Two foreign columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            Anticipated_Lifetime USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]

    # -----------------------
    # TEST PREDICT CONFIDENCE
    # -----------------------

    # Continuous local column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Dry_Mass_kg CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete local column with no children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Purpose CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Discrete local column with children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Contractor CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Continuous foreign columns.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Period_minutes CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete foreign column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Type_of_Orbit CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1

    bdb.close()
def test_create_generator_schema():
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb,
                                    'satellites',
                                    PATH_SATELLITES_CSV,
                                    header=True,
                                    create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Using crosscat and default to specify models should work.
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL
            ),
            crosscat (
                Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL
            )
        );''')
    assert bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    # IGNORE and GUESS(*) are forbidden and should crash.
    with pytest.raises(AttributeError):
        bdb.execute('''
            CREATE GENERATOR t2 FOR satellites USING composer(
                default (
                    GUESS(*), Country_of_Operator IGNORE,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                )
            );''')
    # Test unregistered foreign predictor.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t3 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Unregistered foreign predictor should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t4 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Registered foreign predictor should work.
    composer.register_foreign_predictor(random_forest.RandomForest)
    bdb.execute('''
        CREATE GENERATOR t5 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Eccentricity NUMERICAL
            ),
            random_forest (
                Apogee_km NUMERICAL GIVEN Operator_Owner
            )
        );''')
    # Wrong stattype in predictor should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t6 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km RADIAL GIVEN Operator_Owner
                )
            );''')
    # Missing GIVEN keyword should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t6 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL, Operator_Owner
                )
            );''')
    # A random forest GIVEN condition missing from the schema should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t7 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Test duplicate declarations.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t7 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Class_of_orbit CATEGORICAL GIVEN Operator_Owner
                )
            );''')
    # Arbitrary DAG with foreign predictors.
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    bdb.execute('''
        CREATE GENERATOR t8 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
            ),
            random_forest (
                Apogee_km NUMERICAL GIVEN Operator_Owner, Users
            ),
            multiple_regression (
                Eccentricity NUMERICAL GIVEN Apogee_km, Users, Perigee_km
            )
        );''')
    # Duplicate declarations in foreign predictors should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t9 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Perigee_km NUMERICAL GIVEN Purpose
                ),
                multiple_regression (
                    Perigee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # MML for default models should work.
    bdb.execute('''
        CREATE GENERATOR t10 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Apogee_km NUMERICAL
            )
            random_forest (
                Perigee_km NUMERICAL GIVEN Purpose
            )
            multiple_regression (
                Eccentricity NUMERICAL GIVEN Operator_Owner, Class_of_orbit
            )
            DEPENDENT(Apogee_km, Perigee_km, Purpose),
            INDEPENDENT(Country_of_Operator, Purpose)
        );''')
    # MML for foreign predictors should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t11 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Apogee_km NUMERICAL
                ),
                random_forest (
                    Perigee_km NUMERICAL GIVEN Purpose
                ),
                multiple_regression (
                    Eccentricity NUMERICAL GIVEN Operator_Owner, Class_of_orbit
                )
                DEPENDENT(Apogee_km, Eccentricity, Country_of_Operator),
                INDEPENDENT(Perigee_km, Purpose)
            );''')
    # Test full generator.
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    bdb.execute('''
        CREATE GENERATOR t12 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg, Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            INDEPENDENT(Country_of_Operator, longitude_radians_of_geo)
        );''')
    bdb.close()
def test_composer_integration__ci_slow():
    # It is currently difficult to separate these tests into smaller ones
    # because of their sequential nature, so we still exercise all internal
    # functions under different regimes of operation.

    # SETUP
    # -----
    # Dataset.
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb,
                                    'satellites',
                                    PATH_SATELLITES_CSV,
                                    header=True,
                                    create=True)
    bdbcontrib.bql_utils.nullify(bdb, 'satellites', 'NaN')
    # Composer.
    composer = Composer(n_samples=5)
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    composer.register_foreign_predictor(random_forest.RandomForest)
    # Use complex generator for interesting test cases.
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                    Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')

    # ----------------------
    # TEST INITIALIZE MODELS
    # ----------------------

    bdb.execute('INITIALIZE 2 MODELS FOR t1')
    # Check number of models.
    df = describe_generator_models(bdb, 't1')
    assert len(df) == 2
    df = describe_generator_models(bdb, 't1_cc')
    assert len(df) == 2

    # -------------------
    # TEST ANALYZE MODELS
    # -------------------

    bdb.execute('ANALYZE t1 FOR 2 ITERATIONS WAIT;')
    # Check number of iterations of composer.
    df = describe_generator_models(bdb, 't1')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # Check number of iterations of composer_cc.
    df = describe_generator_models(bdb, 't1_cc')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2

    # ----------------------------------
    # TEST COLUMN DEPENDENCE PROBABILITY
    # ----------------------------------

    # Special 0/1 regimes.
    # Local with an INDEPENDENT local should be 0.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Date_of_Launch
            WITH Country_of_Operator FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 0
    # Local with a DEPENDENT local should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Perigee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Apogee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    # Foreign with a local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Apogee_km
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH Power_watts
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Type_of_Orbit WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a local non-parent that is DEPENDENT with a local parent
    # should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Eccentricity FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common direct ancestor should be 1.
    # Launch_Mass_kg is the common parent.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common DEPENDENT ancestor should be 1.
    # Eccentricity is a parent of Type_of_Orbit, and is made dependent with
    # Period_minutes through DEPENDENT(Apogee_km, Perigee_km, Eccentricity).
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Column with itself should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.

    # Unknown [0,1] regimes.
    # Foreign with a local whose relation to the foreign's parents is unknown.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    # Foreign with a foreign of unknown ancestry relation.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Period_minutes FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.

    # ----------------------------------
    # TEST SIMULATE
    # ----------------------------------

    # Crash tests for various code paths. Quality of simulations ignored.
    # Joint local.
    curs = bdb.execute('''
        SIMULATE Power_watts, Launch_Mass_kg FROM t1 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes FROM t1 GIVEN Apogee_km = 1000, Perigee_km = 980
            LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign with missing parents.
    curs = bdb.execute('''
        SIMULATE Anticipated_Lifetime FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with parents, and missing parents.
    curs = bdb.execute('''
        SIMULATE Type_of_Orbit, Eccentricity FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with non-parents.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Eccentricity FROM t1 GIVEN Apogee_km = 38000
            LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint local conditioned on two foreigns.
    curs = bdb.execute('''
        SIMULATE Country_of_Operator, Inclination_radians FROM t1
            GIVEN Period_minutes = 1432, Anticipated_Lifetime = 5 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint foreign conditioned on third foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Anticipated_Lifetime FROM t1
            GIVEN Type_of_Orbit = 'Deep Highly Eccentric' LIMIT 2
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate foreign conditioned on itself.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Apogee_km FROM t1
            GIVEN Period_minutes = 102 LIMIT 2
    ''')
    assert [s[0] for s in curs] == [102] * 2

    # -----------------------------
    # TEST COLUMN VALUE PROBABILITY
    # -----------------------------

    # Crash tests for various code paths. Quality of logpdf ignored.
    # Conditional local.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Power_watts = 800 GIVEN (Perigee_km = 980,
            Launch_Mass_kg = 890) FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Unconditional foreign
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Conditional foreign on parent and non-parents.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 GIVEN
            (Apogee_km = 38000, Eccentricity = 0.03) FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[0]
    # Conditional foreign on foreign.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Anticipated_Lifetime = 4.09 GIVEN
            (Class_of_Orbit = 'LEO', Purpose='Astrophysics',
                Period_minutes = 1436) FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Categorical foreign should be less than 1.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Type_of_Orbit = 'Polar' FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] <= 1.
    # Query inconsistent with evidence should be 0.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF "Type_of_Orbit" = 'Polar'
            GIVEN ("Type_of_Orbit" = 'Deep Highly Eccentric') FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] == 0.
    # In theory, query consistent with evidence should be 1, but this is very
    # hard to ensure due to stochastic sampling giving different estimates of
    # P(Y), once in joint and once in marginal Monte Carlo estimation.
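    # Illustration only (a deliberately loose check): the analogous
    # self-consistent query should come out close to 1, but we assert just
    # non-negativity because the estimate is stochastic.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF "Type_of_Orbit" = 'Polar'
            GIVEN ("Type_of_Orbit" = 'Polar') FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]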

    # -----------------------
    # TEST MUTUAL INFORMATION
    # -----------------------

    # Two local columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Country_of_Contractor WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX Small sample sizes non-deterministically produce negative MI
    assert -1 <= curs.next()[0]
    # One local and one foreign column.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]
    # Two foreign columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            Anticipated_Lifetime USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]

    # -----------------------
    # TEST PREDICT CONFIDENCE
    # -----------------------

    # Continuous local column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Dry_Mass_kg CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete local column with no children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Purpose CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Discrete local column with children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Contractor CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Continuous foreign columns.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Period_minutes CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete foreign column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Type_of_Orbit CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1

    bdb.close()
def test_drop_generator():
    bdb = bayeslite.bayesdb_open()
    # Initialize the database
    bayeslite.bayesdb_read_csv_file(bdb,
                                    'satellites',
                                    PATH_SATELLITES_CSV,
                                    header=True,
                                    create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    composer.register_foreign_predictor(random_forest.RandomForest)
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                    Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')
    generator_id = bayeslite.core.bayesdb_get_generator(bdb, 't1')
    schema = [
        ('table', 'bayesdb_composer_cc_id'),
        ('table', 'bayesdb_composer_column_owner'),
        ('table', 'bayesdb_composer_column_toposort'),
        ('table', 'bayesdb_composer_column_parents'),
        ('table', 'bayesdb_composer_column_foreign_predictor'),
    ]
    # Iterate through tables before dropping.
    for _, name in schema:
        bdb.sql_execute(
            '''
            SELECT * FROM {} WHERE generator_id=?
        '''.format(quote(name)), (generator_id, )).next()
    # Drop generator and ensure table lookups with generator_id throw error.
    bdb.execute('DROP GENERATOR t1')
    for _, name in schema:
        with pytest.raises(StopIteration):
            bdb.sql_execute(
                '''
                SELECT * FROM {} WHERE generator_id=?
            '''.format(quote(name)), (generator_id, )).next()
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1')
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    bdb.close()
Example #21
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    then = time.time()

    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user

    def out_file_name(base, ext):
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')
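    # e.g. <out_dir>/satellites-2016-01-15-alice.bdb (illustrative date and
    # user, taken from the timestamp and the whoami call above)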

    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        print 'Error: File', bdb_file, 'already exists. Please remove it.'
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        log("executing %s" % bql)
        bdb.execute(bql)

    # read csv into table
    log('reading data from %s' % csv_file)
    bayeslite.bayesdb_read_csv_file(bdb,
                                    'satellites',
                                    csv_file,
                                    header=True,
                                    create=True,
                                    ifnotexists=True)

    # Add a "not applicable" orbit sub-type
    log('adding "not applicable" orbit sub-type')
    bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
          AND type_of_orbit = 'NaN'
    ''')

    # nullify "NaN"
    log('nullifying NaN')
    bdbcontrib.bql_utils.nullify(bdb, 'satellites', 'NaN')

    # register crosscat metamodel
    cc = ccme.MultiprocessingEngine(seed=seed)
    ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
    bayeslite.bayesdb_register_metamodel(bdb, ccmm)

    # create the crosscat generator using an explicit schema
    execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

    execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models, ))

    cur_iter_ct = 0

    def snapshot():
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        os.system("cp %s %s" % (bdb_file, save_file_name))
        report(save_file_name, meta_file_name)

    def record_metadata(f,
                        saved_file_name,
                        sha_sum,
                        total_time,
                        plot_file_name=None):
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n" %
                (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n" %
                (bayeslite.__version__, crosscat.__version__,
                 bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name,
               metadata_file,
               echo=False,
               plot_file_name=None):
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            for chunk in iter(lambda: fd.read(65536), ''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name, sha_sum, total_time,
                            plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            fd.flush()
            os.system("cat %s >> %s" % (__file__, metadata_file))

        if echo:
            record_metadata(sys.stdout, saved_file_name, sha_sum, total_time,
                            plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.crosscat_utils.plot_crosscat_chain_diagnostics(
            bdb, 'logscore', 'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file,
               final_metadata_file,
               echo=True,
               plot_file_name=plot_file_name)

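    # Take an initial snapshot, then analyze in batches of checkpoint_freq
    # iterations, snapshotting after each batch; snapshot() saves each copy as
    # satellites<filestamp>-<num_models>m-<iterations so far>i.bdb.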
    snapshot()
    while cur_iter_ct < num_iters:
        execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT' %
                checkpoint_freq)
        cur_iter_ct += checkpoint_freq
        snapshot()

    final_report()

    log('closing bdb %s' % bdb_file)
    bdb.close()
    os.system("cd %s && ln -s satellites%s.bdb satellites.bdb" %
              (out_dir, filestamp))
def test_mml_csv():
    with bayeslite.bayesdb_open() as bdb:
        bayeslite.bayesdb_read_csv_file(
            bdb, 't', 'tests/mml.csv', header=True, create=True)
        guesses = mml_utils.guess_types(bdb, 't')
        # Testing these strings is going to be brittle, but I don't have a
        # great answer.
        assert guesses == ({
            'col1': ('IGNORE',
                     'Column is constant'),
            'col2': ('CATEGORICAL',
                     'Only 5 distinct values'),
            'col3': ('IGNORE',
                     'Column is constant'),
            'col4': ('NUMERICAL',
                     'Contains exclusively numbers (24 of them).'),
            'col5': ('CATEGORICAL',
                     'Only 2 distinct values'),
            'col6': ('NUMERICAL',
                     'Contains exclusively numbers (25 of them).')})

        mml_json = mml_utils.to_json(guesses)
        assert mml_json == {
            'metamodel': 'crosscat',
            'columns': {
                'col1': {'stattype': 'IGNORE',
                         'reason': 'Column is constant'},
                'col2': {'stattype': 'CATEGORICAL',
                         'reason': 'Only 5 distinct values'},
                'col3': {'stattype': 'IGNORE',
                         'reason': 'Column is constant'},
                'col4': {'stattype': 'NUMERICAL',
                         'reason': 'Contains exclusively numbers (24 of them).'},
                'col5': {'stattype': 'CATEGORICAL',
                         'reason': 'Only 2 distinct values'},
                'col6': {'stattype': 'NUMERICAL',
                         'reason': 'Contains exclusively numbers (25 of them).'}
            }}

        mml_statement = mml_utils.to_mml(mml_json, 'table', 'generator')
        assert mml_statement == (
            'CREATE GENERATOR "generator" FOR "table" '
            'USING crosscat( '
            '"col6" NUMERICAL,"col4" NUMERICAL,'
            '"col5" CATEGORICAL,"col2" CATEGORICAL);')
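        # Note that the columns guessed as IGNORE (col1 and col3) are omitted
        # from the generated MML statement.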

        # col6's values are constructed in such a way as to break crosscat.
        # See https://github.com/probcomp/bayeslite/issues/284
        # On validation, the column should be ignored.
        mod_schema = mml_utils.validate_schema(bdb, 't', mml_json)
        assert mod_schema == {
            'metamodel': 'crosscat',
            'columns': {
                'col1': {'stattype': 'IGNORE',
                         'reason': 'Column is constant'},
                'col2': {'stattype': 'CATEGORICAL',
                         'reason': 'Only 5 distinct values'},
                'col3': {'stattype': 'IGNORE',
                         'reason': 'Column is constant'},
                'col4': {'stattype': 'NUMERICAL',
                         'reason': 'Contains exclusively numbers (24 of them).'},
                'col5': {'stattype': 'CATEGORICAL',
                         'reason': 'Only 2 distinct values'},
                'col6': {'stattype': 'IGNORE', 'guessed': 'NUMERICAL',
                         'reason': 'Caused ANALYZE to error'}}}
Example #23
def test_drop_generator():
    bdb = bayeslite.bayesdb_open()
    # Initialize the database
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    composer.register_foreign_predictor(random_forest.RandomForest)
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                    Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')
    generator_id = bayeslite.core.bayesdb_get_generator(bdb, 't1')
    schema = [
        ('table', 'bayesdb_composer_cc_id'),
        ('table', 'bayesdb_composer_column_owner'),
        ('table', 'bayesdb_composer_column_toposort'),
        ('table', 'bayesdb_composer_column_parents'),
        ('table', 'bayesdb_composer_column_foreign_predictor'),
    ]
    # Iterate through tables before dropping.
    for _, name in schema:
        bdb.sql_execute('''
            SELECT * FROM {} WHERE generator_id=?
        '''.format(quote(name)), (generator_id,)).next()
    # Drop generator and ensure table lookups with generator_id throw error.
    bdb.execute('DROP GENERATOR t1')
    for _, name in schema:
        with pytest.raises(StopIteration):
            bdb.sql_execute('''
                SELECT * FROM {} WHERE generator_id=?
            '''.format(quote(name)), (generator_id,)).next()
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1')
    assert not bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    bdb.close()
Example #24
def test_estimate_pairwise_similarity():
    """
    Tests basic estimate pairwise similarity functionality against
    existing BQL estimate queries.
    """
    os.environ['BAYESDB_WIZARD_MODE'] = '1'

    with tempfile.NamedTemporaryFile(suffix='.bdb') as bdb_file:
        bdb = bayeslite.bayesdb_open(bdb_file.name)
        with tempfile.NamedTemporaryFile() as temp:
            temp.write(test_utils.csv_data)
            temp.seek(0)
            bayeslite.bayesdb_read_csv_file(
                bdb, 't', temp.name, header=True, create=True)

        bdb.execute('''
            CREATE GENERATOR t_cc FOR t USING crosscat (
                GUESS(*),
                id IGNORE
            )
        ''')

        bdb.execute('INITIALIZE 3 MODELS FOR t_cc')
        bdb.execute('ANALYZE t_cc MODELS 0-2 FOR 10 ITERATIONS WAIT')

        # How to properly use the estimate_pairwise_similarity function.
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc'
        )
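        # By default the results are written to a table named t_similarity;
        # the sim_table argument (exercised below) selects a different name.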

        # Should complain with bad core value
        with pytest.raises(BLE):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc', cores=0
            )

        # Should complain if overwrite flag is not set, but t_similarity
        # exists
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 't', 't_cc'
            )
        # Should complain if model and table don't exist
        with pytest.raises(SQLError):
            parallel.estimate_pairwise_similarity(
                bdb_file.name, 'foo', 'foo_cc'
            )
        # Should complain if bdb_file doesn't exist
        with tempfile.NamedTemporaryFile() as does_not_exist:
            with pytest.raises(SQLError):
                parallel.estimate_pairwise_similarity(
                    does_not_exist.name, 't', 't_cc'
                )

        # Should run fine if overwrite flag is set
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', overwrite=True
        )

        # Should be able to specify another table name
        parallel.estimate_pairwise_similarity(
            bdb_file.name, 't', 't_cc', sim_table='t_similarity_2'
        )

        parallel_sim = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity')
        ).sort_values(by=['rowid0', 'rowid1'])
        parallel_sim_2 = cursor_to_df(
            bdb.execute('SELECT * FROM t_similarity_2')
        ).sort_values(by=['rowid0', 'rowid1'])

        # Results may be returned out of order, so we sort the values as
        # above and renumber the index.
        parallel_sim.index = range(parallel_sim.shape[0])
        parallel_sim_2.index = range(parallel_sim_2.shape[0])

        # The data from two successive parallel pairwise estimates should be
        # identical to each other...
        assert_frame_equal(
            parallel_sim, parallel_sim_2, check_column_type=True)
        # ...and to a standard estimate pairwise similarity.
        std_sim = cursor_to_df(
            bdb.execute('ESTIMATE SIMILARITY FROM PAIRWISE t_cc')
        )
        assert_frame_equal(std_sim, parallel_sim, check_column_type=True)
Example #25
def test_create_generator_schema():
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    composer = Composer(n_samples=5)
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    # Using crosscat and default to specify models should work.
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL
            ),
            crosscat (
                Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL
            )
        );''')
    assert bayeslite.core.bayesdb_has_generator(bdb, 't1_cc')
    # IGNORE and GUESS(*) are forbidden and should crash.
    with pytest.raises(AttributeError):
        bdb.execute('''
            CREATE GENERATOR t2 FOR satellites USING composer(
                default (
                    GUESS(*), Country_of_Operator IGNORE,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                )
            );''')
    # Test unregistered foreign predictor.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t3 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Unregistered foreign predictor should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t4 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Registered foreign predictor should work.
    composer.register_foreign_predictor(random_forest.RandomForest)
    bdb.execute('''
        CREATE GENERATOR t5 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Eccentricity NUMERICAL
            ),
            random_forest (
                Apogee_km NUMERICAL GIVEN Operator_Owner
            )
        );''')
    # Wrong stattype in predictor should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t6 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km RADIAL GIVEN Operator_Owner
                )
            );''')
    # Missing GIVEN keyword should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t6 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL, Operator_Owner
                )
            );''')
    # Random forest condition columns missing from the schema should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t7 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Apogee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # Test duplicate declarations.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t7 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Class_of_orbit CATEGORICAL GIVEN Operator_Owner
                )
            );''')
    # Arbitrary DAG with foreign predictors.
    composer.register_foreign_predictor(multiple_regression.MultipleRegression)
    bdb.execute('''
        CREATE GENERATOR t8 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
            ),
            random_forest (
                Apogee_km NUMERICAL GIVEN Operator_Owner, Users
            ),
            multiple_regression (
                Eccentricity NUMERICAL GIVEN Apogee_km, Users, Perigee_km
            )
        );''')
    # Duplicate declarations in foreign predictors should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t9 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                    Apogee_km NUMERICAL, Eccentricity NUMERICAL
                ),
                random_forest (
                    Perigee_km NUMERICAL GIVEN Purpose
                ),
                multiple_regression (
                    Perigee_km NUMERICAL GIVEN Operator_Owner
                )
            );''')
    # MML for default models should work.
    bdb.execute('''
        CREATE GENERATOR t10 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Apogee_km NUMERICAL
            )
            random_forest (
                Perigee_km NUMERICAL GIVEN Purpose
            )
            multiple_regression (
                Eccentricity NUMERICAL GIVEN Operator_Owner, Class_of_orbit
            )
            DEPENDENT(Apogee_km, Perigee_km, Purpose),
            INDEPENDENT(Country_of_Operator, Purpose)
        );''')
    # MML for foreign predictors should crash.
    with pytest.raises(BLE):
        bdb.execute('''
            CREATE GENERATOR t11 FOR satellites USING composer(
                default (
                    Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                    Users CATEGORICAL, Purpose CATEGORICAL,
                    Class_of_orbit CATEGORICAL,
                    Apogee_km NUMERICAL
                ),
                random_forest (
                    Perigee_km NUMERICAL GIVEN Purpose
                ),
                multiple_regression (
                    Eccentricity NUMERICAL GIVEN Operator_Owner, Class_of_orbit
                )
                DEPENDENT(Apogee_km, Eccentricity, Country_of_Operator),
                INDEPENDENT(Perigee_km, Purpose)
            );''')
    # Test full generator.
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    bdb.execute('''
        CREATE GENERATOR t12 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg, Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            INDEPENDENT(Country_of_Operator, longitude_radians_of_geo)
        );''')
    bdb.close()
Example #26
def test_mml_csv():
    with bayeslite.bayesdb_open() as bdb:
        bayeslite.bayesdb_read_csv_file(bdb,
                                        't',
                                        'tests/mml.csv',
                                        header=True,
                                        create=True)
        guesses = mml_utils.guess_types(bdb, 't')
        # Testing these strings is going to be brittle, but I don't have a
        # great answer.
        assert guesses == ({
            'col1': ('IGNORE', 'Column is constant'),
            'col2': ('CATEGORICAL', 'Only 5 distinct values'),
            'col3': ('IGNORE', 'Column is constant'),
            'col4':
            ('NUMERICAL', 'Contains exclusively numbers (24 of them).'),
            'col5': ('CATEGORICAL', 'Only 2 distinct values'),
            'col6': ('NUMERICAL', 'Contains exclusively numbers (25 of them).')
        })

        mml_json = mml_utils.to_json(guesses)
        assert mml_json == {
            'metamodel': 'crosscat',
            'columns': {
                'col1': {
                    'stattype': 'IGNORE',
                    'reason': 'Column is constant'
                },
                'col2': {
                    'stattype': 'CATEGORICAL',
                    'reason': 'Only 5 distinct values'
                },
                'col3': {
                    'stattype': 'IGNORE',
                    'reason': 'Column is constant'
                },
                'col4': {
                    'stattype': 'NUMERICAL',
                    'reason': 'Contains exclusively numbers (24 of them).'
                },
                'col5': {
                    'stattype': 'CATEGORICAL',
                    'reason': 'Only 2 distinct values'
                },
                'col6': {
                    'stattype': 'NUMERICAL',
                    'reason': 'Contains exclusively numbers (25 of them).'
                }
            }
        }

        mml_statement = mml_utils.to_mml(mml_json, 'table', 'generator')
        assert mml_statement == ('CREATE GENERATOR "generator" FOR "table" '
                                 'USING crosscat( '
                                 '"col6" NUMERICAL,"col4" NUMERICAL,'
                                 '"col5" CATEGORICAL,"col2" CATEGORICAL);')

        # col6's values are constructed in such a way as to break crosscat.
        # See https://github.com/probcomp/bayeslite/issues/284
        # On validation, the column should be ignored.
        mod_schema = mml_utils.validate_schema(bdb, 't', mml_json)
        assert mod_schema == {
            'metamodel': 'crosscat',
            'columns': {
                'col1': {
                    'stattype': 'IGNORE',
                    'reason': 'Column is constant'
                },
                'col2': {
                    'stattype': 'CATEGORICAL',
                    'reason': 'Only 5 distinct values'
                },
                'col3': {
                    'stattype': 'IGNORE',
                    'reason': 'Column is constant'
                },
                'col4': {
                    'stattype': 'NUMERICAL',
                    'reason': 'Contains exclusively numbers (24 of them).'
                },
                'col5': {
                    'stattype': 'CATEGORICAL',
                    'reason': 'Only 2 distinct values'
                },
                'col6': {
                    'stattype': 'IGNORE',
                    'guessed': 'NUMERICAL',
                    'reason': 'Caused ANALYZE to error'
                }
            }
        }
Example #27
# Find the satellites file.
PATH_KEPLER = os.path.dirname(os.path.abspath(__file__))
PATH_EXAMPLES = os.path.dirname(PATH_KEPLER)
PATH_SATELLITES = os.path.join(PATH_EXAMPLES, 'satellites')
PATH_SATELLITES_CSV = os.path.join(PATH_SATELLITES, 'satellites.csv')

composer = Composer()
composer.register_foreign_predictor(keplers_law.KeplersLaw)
composer.register_foreign_predictor(random_forest.RandomForest)

if os.path.exists(os.path.join(outdir, 'kepler.bdb')):
    os.remove(os.path.join(outdir, 'kepler.bdb'))

bdb = bayeslite.bayesdb_open(os.path.join(outdir, 'kepler.bdb'))
bayeslite.bayesdb_register_metamodel(bdb, composer)
bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
    header=True, create=True)

bdbcontrib.query(bdb, '''
    CREATE GENERATOR sat_kepler FOR satellites USING composer(
        default (
            Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
            Users CATEGORICAL, Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL, Perigee_km NUMERICAL,
            Apogee_km NUMERICAL, Eccentricity NUMERICAL,
            Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL, Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL, Inclination_radians NUMERICAL
Example #28
def test_read_csv():
    with bayeslite.bayesdb_open(builtin_backends=False) as bdb:

        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=False,
                ifnotexists=False)

        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=False,
                ifnotexists=True)

        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=True,
                ifnotexists=False)

        f = StringIO.StringIO(csv_data)
        with pytest.raises(ValueError):
            # Must pass create=False for header=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=True,
                ifnotexists=True)

        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Table must already exist for create=False.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=False,
                ifnotexists=False)

        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            # Must pass create=True for ifnotexists=True.
            bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=False,
                ifnotexists=True)

        f = StringIO.StringIO(csv_hdrdata)
        with pytest.raises(ValueError):
            with bdb.savepoint():
                # Table must not exist if ifnotexists=False.
                bdb.sql_execute('CREATE TABLE t(x)')
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)
        with pytest.raises(IOError):
            # Table must have no empty values in header.
            csv_hdrdata_prime = csv_hdrdata[1:]
            f = StringIO.StringIO(csv_hdrdata_prime)
            with bdb.savepoint():
                bayeslite.bayesdb_read_csv(bdb, 't', f, header=True,
                    create=True, ifnotexists=False)

        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=True,
            ifnotexists=False)
        data = bdb.sql_execute('SELECT * FROM t').fetchall()
        assert data == [
            # XXX Would be nice if the NaN could actually be that, or
            # at least None/NULL.
            (1,2,3,'foo','bar',u'nan',u'',u'quagga'),
            (4,5,6,'baz','quux',42.0,u'',u'eland'),
            (7,8,6,'zot','mumble',87.0,u'zoot',u'caribou'),
        ]
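        # Elsewhere in these examples such string markers are converted to SQL
        # NULL after loading with bdbcontrib.bql_utils.nullify(bdb, 't', 'nan');
        # that is not done here so the literal values above can be asserted.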

        f = StringIO.StringIO(csv_hdr)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=True,
            ifnotexists=True)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data
        assert cursor_value(bdb.sql_execute('SELECT sql FROM sqlite_master'
                    ' WHERE name = ?', ('t',))) == \
            'CREATE TABLE "t"' \
            '("a" NUMERIC,"b" NUMERIC,"c" NUMERIC,"name" NUMERIC,' \
            '"nick" NUMERIC,"age" NUMERIC,"muppet" NUMERIC,"animal" NUMERIC)'

        f = StringIO.StringIO(csv_data)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=False, create=False,
            ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == data + data

        f = StringIO.StringIO(csv_hdrdata)
        bayeslite.bayesdb_read_csv(bdb, 't', f, header=True, create=False,
            ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bayeslite.bayesdb_read_csv_file(bdb, 't', temp.name, header=True,
                create=False, ifnotexists=False)
        assert bdb.sql_execute('SELECT * FROM t').fetchall() == \
            data + data + data + data

        # Test the BQL CREATE TABLE FROM <csv-file> syntax.
        f = StringIO.StringIO(csv_hdrdata)
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata)
            bdb.execute('CREATE TABLE t2 FROM \'%s\'' % (temp.name,))
            assert bdb.sql_execute('SELECT * FROM t2').fetchall() == data

        # Trying to read a csv with an empty column name should fail.
        csv_header_corrupt = csv_hdr.replace('a,b',',')
        csv_hdrdata_corrupt = csv_header_corrupt + csv_data
        with tempfile.NamedTemporaryFile(prefix='bayeslite') as temp:
            with open(temp.name, 'w') as f:
                f.write(csv_hdrdata_corrupt)
            with pytest.raises(IOError):
                bayeslite.bayesdb_read_csv_file(
                    bdb, 't3', temp.name, header=True, create=True)