def nullify(self, argin):
    """replace user-specified missing value with NULL
    <table> <value>

    Example:
    bayeslite> .nullify mytable NaN
    bayeslite> .nullify mytable ''
    """
    # NOTE(review): the docstring above looks like user-facing shell help
    # text (summary line, then usage line), so it is left untouched.
    #
    # argin is the raw argument string typed after the command name.  It is
    # split shell-style into <table> and <value>, and every occurrence of
    # <value> in <table> is replaced with NULL via bdbcontrib.nullify.
    # Errors in the arguments are written to self.stdout; nothing is
    # returned.

    # Tokenize before anything else: shlex raises ValueError on malformed
    # input such as an unterminated quote, and that should be reported to
    # the user instead of propagating out of the command handler.
    try:
        argv = shlex.split(argin)
    except ValueError as e:
        self.stdout.write('.nullify: %s\n' % (e,))
        return

    parser = utils.ArgumentParser(prog='.nullify')
    parser.add_argument('table', type=str, help='Name of the table.')
    parser.add_argument('value', type=str, help='Target string to nullify.')

    try:
        args = parser.parse_args(argv)
    except utils.ArgparseError as e:
        # The parser's message is written as-is, with no newline appended.
        self.stdout.write('%s' % (e.message, ))
        return

    bdbcontrib.nullify(self._bdb, args.table, args.value)
def nullify(self, argin):
    """replace user-specified missing value with NULL
    <table> <value>

    Example:
    bayeslite> .nullify mytable NaN
    bayeslite> .nullify mytable ''
    """
    # NOTE(review): the docstring above looks like user-facing shell help
    # text (summary line, then usage line), so it is left untouched.
    #
    # argin is the raw argument string following the command name; it is
    # expected to hold a table name and the string value to turn into NULL.
    # Problems with the arguments are reported on self.stdout and the
    # command returns without touching the database.

    # shlex.split raises ValueError for unbalanced quotes; catch it here so
    # a typo in the command line produces a message rather than a traceback.
    try:
        tokens = shlex.split(argin)
    except ValueError as e:
        self.stdout.write('.nullify: %s\n' % (e,))
        return

    parser = utils.ArgumentParser(prog='.nullify')
    parser.add_argument('table', type=str,
        help='Name of the table.')
    parser.add_argument('value', type=str,
        help='Target string to nullify.')

    try:
        args = parser.parse_args(tokens)
    except utils.ArgparseError as e:
        # Written verbatim: no trailing newline is added to the message.
        self.stdout.write('%s' % (e.message,))
        return

    bdbcontrib.nullify(self._bdb, args.table, args.value)
Exemple #3
0
  def nullify(self, value):
    """Wraps bdbcontrib.nullify by passing bdb and name.

    bdbcontrib_nullify_doc"""
    # NOTE(review): "bdbcontrib_nullify_doc" in the docstring looks like a
    # placeholder that is substituted elsewhere -- confirm before editing it.
    #
    # Forward to the module-level helper, supplying this object's database
    # handle and table name so the caller only provides the value to
    # replace.
    database = self.bdb
    table_name = self.name
    bdbcontrib.nullify(database, table_name, value)
Exemple #4
0
def test_composer_integration__ci_slow():
    """End-to-end test of the Composer metamodel on the satellites table.

    Loads the satellites CSV into a fresh bdb, nullifies 'NaN', registers a
    Composer with the multiple-regression, Kepler's-law and random-forest
    foreign predictors, and creates generator t1.  It then runs, in order:
    INITIALIZE, ANALYZE, DEPENDENCE PROBABILITY, SIMULATE, PROBABILITY,
    MUTUAL INFORMATION and PREDICT ... CONFIDENCE queries against t1.

    Most checks beyond the deterministic 0/1 dependence cases are crash
    tests or loose bound checks; the quality of the estimates is not
    evaluated.
    """
    # It is currently difficult to separate these tests into smaller tests
    # because of their sequential nature. We will still test all internal
    # functions with different regimes of operation.

    # SETUP
    # -----
    # Dataset.
    bdb = bayeslite.bayesdb_open()
    bayeslite.bayesdb_read_csv_file(bdb, 'satellites', PATH_SATELLITES_CSV,
        header=True, create=True)
    bdbcontrib.nullify(bdb, 'satellites', 'NaN')
    # Composer.
    composer = Composer(n_samples=5)
    composer.register_foreign_predictor(
        multiple_regression.MultipleRegression)
    composer.register_foreign_predictor(keplers_law.KeplersLaw)
    composer.register_foreign_predictor(random_forest.RandomForest)
    # Use complex generator for interesting test cases.
    # "Local" columns below are those in the default (...) clause; "foreign"
    # columns are those modeled by random_forest, keplers_law, and
    # multiple_regression, whose GIVEN columns are their parents.
    # NOTE(review): the default clause ends with a trailing comma after
    # "Inclination_radians NUMERICAL" -- confirm the grammar accepts it.
    bayeslite.bayesdb_register_metamodel(bdb, composer)
    bdb.execute('''
        CREATE GENERATOR t1 FOR satellites USING composer(
            default (
                Country_of_Operator CATEGORICAL, Operator_Owner CATEGORICAL,
                Users CATEGORICAL, Purpose CATEGORICAL,
                Class_of_orbit CATEGORICAL, Perigee_km NUMERICAL,
                Apogee_km NUMERICAL, Eccentricity NUMERICAL,
                Launch_Mass_kg NUMERICAL, Dry_Mass_kg NUMERICAL,
                Power_watts NUMERICAL, Date_of_Launch NUMERICAL,
                Contractor CATEGORICAL,
                Country_of_Contractor CATEGORICAL, Launch_Site CATEGORICAL,
                Launch_Vehicle CATEGORICAL,
                Source_Used_for_Orbital_Data CATEGORICAL,
                longitude_radians_of_geo NUMERICAL,
                Inclination_radians NUMERICAL,
            ),
            random_forest (
                Type_of_Orbit CATEGORICAL
                    GIVEN Apogee_km, Perigee_km,
                        Eccentricity, Period_minutes, Launch_Mass_kg,
                        Power_watts, Anticipated_Lifetime, Class_of_orbit
            ),
            keplers_law (
                Period_minutes NUMERICAL
                    GIVEN Perigee_km, Apogee_km
            ),
            multiple_regression (
                Anticipated_Lifetime NUMERICAL
                    GIVEN Dry_Mass_kg, Power_watts, Launch_Mass_kg,
                    Contractor
            ),
            DEPENDENT(Apogee_km, Perigee_km, Eccentricity),
            DEPENDENT(Contractor, Country_of_Contractor),
            INDEPENDENT(Country_of_Operator, Date_of_Launch)
        );''')


    # ----------------------
    # TEST INITIALIZE MODELS
    # ----------------------

    bdb.execute('INITIALIZE 2 MODELS FOR t1')
    # Check number of models.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    assert len(df) == 2
    # 't1_cc' is never created explicitly in this test; presumably it is the
    # companion generator the composer creates for t1 -- TODO confirm.
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    assert len(df) == 2

    # -------------------
    # TEST ANALYZE MODELS
    # -------------------

    bdb.execute('ANALYZE t1 FOR 2 ITERATIONS WAIT;')
    # Check number of iterations of composer.
    df = bdbcontrib.describe_generator_models(bdb, 't1')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2
    # Check number of iterations of composer_cc.
    df = bdbcontrib.describe_generator_models(bdb, 't1_cc')
    for index, modelno, iterations in df.itertuples():
        assert iterations == 2

    # ----------------------------------
    # TEST COLUMN DEPENDENCE PROBABILITY
    # ----------------------------------

    # Special 0/1 regimes.
    # Local with an INDEPENDENT local should be 0.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Date_of_Launch
            WITH Country_of_Operator FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 0
    # Local with a DEPENDENT local should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Perigee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Apogee_km WITH Eccentricity
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1
    # Foreign with a local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH Apogee_km
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH Power_watts
            FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Type_of_Orbit WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a local non-parent DEPENDENT with local parent should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Eccentricity FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with foreign sharing common direct ancestor should be 1.
    # Launch_Mass_kg is the common parent.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Foreign with a foreign sharing a common DEPENDENT ancestor should be 1.
    # Eccentricity is a parent of Type_of_orbit, and is dependent
    # with Period_minutes through DEPENDENT(Apogee_km, Perigee_km, Eccentricity)
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            Type_of_Orbit FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.
    # Column with itself should be 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Anticipated_Lifetime FROM t1 LIMIT 1
    ''')
    assert curs.next()[0] == 1.

    # Unknown [0,1] regimes: only the range of the result is checked.
    # Foreign with a local of unknown relation with parents.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Period_minutes WITH
            longitude_radians_of_geo FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.
    # Foreign with a foreign of unknown ancestry relation.
    curs = bdb.execute('''
        ESTIMATE DEPENDENCE PROBABILITY OF Anticipated_Lifetime WITH
            Period_minutes FROM t1 LIMIT 1
    ''')
    assert 0 <= curs.next()[0] <= 1.

    # ----------------------------------
    # TEST SIMULATE
    # ----------------------------------

    # Crash tests for various code paths. Quality of simulations ignored.
    # Each query asks for LIMIT 2 rows and only the row count is checked.
    # Joint local.
    curs = bdb.execute('''
        SIMULATE Power_watts, Launch_Mass_kg FROM t1 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes FROM t1 GIVEN Apogee_km = 1000, Perigee_km = 980
            LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Forward simulate foreign with missing parents.
    curs = bdb.execute('''
        SIMULATE Anticipated_Lifetime FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with parents, and missing parents.
    curs = bdb.execute('''
        SIMULATE Type_of_Orbit, Eccentricity FROM t1 GIVEN Dry_Mass_kg = 2894,
            Launch_Mass_kg = 1730 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Joint simulate foreign with non-parents.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Eccentricity FROM t1 GIVEN Apogee_km = 38000
            LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint local conditioned on two foreigns.
    curs = bdb.execute('''
        SIMULATE Country_of_Operator, Inclination_radians FROM t1
            GIVEN Period_minutes = 1432, Anticipated_Lifetime = 5 LIMIT 2;
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate joint foreign conditioned on third foreign.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Anticipated_Lifetime FROM t1
            GIVEN Type_of_Orbit = 'Deep Highly Eccentric' LIMIT 2
    ''')
    assert len(curs.fetchall()) == 2
    # Simulate foreign conditioned on itself: the simulated value must equal
    # the conditioning value in every returned row.
    curs = bdb.execute('''
        SIMULATE Period_minutes, Apogee_km FROM t1
            GIVEN Period_minutes = 102 LIMIT 2
    ''')
    assert [s[0] for s in curs] == [102] * 2

    # -----------------------------
    # TEST COLUMN VALUE PROBABILITY
    # -----------------------------

    # Crash tests for various code paths. Quality of logpdf ignored.
    # Conditional local.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Power_watts = 800 GIVEN (Perigee_km = 980,
            Launch_Mass_kg = 890) FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Unconditional foreign
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Conditional foreign on parent and non-parents.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Period_minutes = 1020 GIVEN
            (Apogee_km = 38000, Eccentricity = 0.03) FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[0]
    # Conditional foreign on foreign.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Anticipated_Lifetime = 4.09 GIVEN
            (Class_of_Orbit = 'LEO', Purpose='Astrophysics',
                Period_minutes = 1436) FROM t1 LIMIT 1;
    ''')
    assert 0. <= curs.next()[0]
    # Categorical foreign should be at most 1.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF Type_of_Orbit = 'Polar' FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] <= 1.
    # Query inconsistent with evidence should be 0.
    curs = bdb.execute('''
        ESTIMATE PROBABILITY OF "Type_of_Orbit" = 'Polar'
            GIVEN ("Type_of_Orbit" = 'Deep Highly Eccentric') FROM t1 LIMIT 1;
    ''')
    assert curs.next()[0] == 0.
    # In theory, query consistent with evidence should be 1, but this is very
    # hard to ensure due to stochastic sampling giving different estimates of
    # P(Y), once in joint and once in marginal Monte Carlo estimation.

    # -----------------------
    # TEST MUTUAL INFORMATION
    # -----------------------

    # Two local columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Country_of_Contractor WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX Small sample sizes non-deterministically produce negative MI
    assert -1 <= curs.next()[0]
    # One local and one foreign column.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            longitude_radians_of_geo USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    # The -inf bound below holds for any non-NaN number; this is
    # effectively a crash test only.
    assert float("-inf") <= curs.next()[0]
    # Two foreign columns.
    curs = bdb.execute('''
        ESTIMATE MUTUAL INFORMATION OF Period_minutes WITH
            Anticipated_Lifetime USING 5 SAMPLES FROM t1 LIMIT 1;
    ''')
    # XXX This non-deterministically fails when sample sizes are small
    # assert 0. <= curs.next()[0]
    assert float("-inf") <= curs.next()[0]

    # -----------------------
    # TEST PREDICT CONFIDENCE
    # -----------------------

    # In each result row, index 1 is the confidence column "c".
    # Continuous local column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Dry_Mass_kg CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete local column with no children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Purpose CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Discrete local column with children.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Contractor CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1
    # Continuous foreign columns.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Period_minutes CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert curs.next()[1] >= 0.
    # Discrete foreign column.
    curs = bdb.execute('''
        INFER EXPLICIT PREDICT Type_of_Orbit CONFIDENCE c FROM t1 LIMIT 1;
    ''')
    assert 0 <= curs.next()[1] <= 1

    bdb.close()
Exemple #5
0
def doit(out_dir, num_models, num_iters, checkpoint_freq, seed):
    """Build a crosscat-analyzed satellites .bdb file, with checkpoints.

    Reads satellites.csv (located next to this script) into a new on-disk
    bdb inside out_dir, replaces 'NaN' with NULL, creates and initializes
    the satellites_cc generator, and analyzes it in rounds of
    checkpoint_freq iterations.  A snapshot copy of the database and a
    metadata text file are written before the first round and after every
    round.  At the end a diagnostics plot and a final metadata file are
    written, the database is closed, and a satellites.bdb symlink pointing
    at the dated database file is created in out_dir.

    Arguments:
    out_dir -- directory receiving all outputs; created if missing.
    num_models -- number of models to initialize.
    num_iters -- analysis rounds continue until at least this many
        iterations have been run.
    checkpoint_freq -- iterations per ANALYZE round, i.e. between snapshots.
    seed -- seed passed to the crosscat MultiprocessingEngine.

    Exits the process with status 1 if the dated .bdb file already exists.
    Raises ValueError if num_iters is positive but checkpoint_freq is not,
    since the analysis loop could then never reach num_iters.
    """
    # Function-scope import keeps this block self-contained: it does not
    # rely on shutil being imported at the top of the file.
    import shutil

    if num_iters > 0 and checkpoint_freq <= 0:
        raise ValueError(
            'checkpoint_freq must be positive when num_iters > 0, got %r'
            % (checkpoint_freq,))

    then = time.time()

    timestamp = datetime.datetime.fromtimestamp(then).strftime('%Y-%m-%d')
    user = subprocess.check_output(["whoami"]).strip()
    host = subprocess.check_output(["hostname"]).strip()
    filestamp = '-' + timestamp + '-' + user
    def out_file_name(base, ext):
        return out_dir + '/' + base + filestamp + ext

    csv_file = os.path.join(os.path.dirname(__file__), 'satellites.csv')
    bdb_file = out_file_name('satellites', '.bdb')

    # so we can build bdb models
    os.environ['BAYESDB_WIZARD_MODE']='1'

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(bdb_file):
        # Single formatted string: same output under Python 2's print
        # statement and Python 3's print function.
        print('Error: File %s already exists. Please remove it.'
              % (bdb_file,))
        sys.exit(1)

    # create database mapped to filesystem
    log('opening bdb on disk: %s' % bdb_file)
    bdb = bayeslite.bayesdb_open(pathname=bdb_file, builtin_metamodels=False)

    def execute(bql):
        log("executing %s" % bql)
        bdb.execute(bql)

    def snapshot():
        # Copy the current database to a file named after the model count
        # and the iteration count so far, and write its metadata alongside.
        log('vacuuming')
        bdb.sql_execute('vacuum')
        cur_infix = '-%dm-%di' % (num_models, cur_iter_ct)
        save_file_name = out_file_name('satellites', cur_infix + '.bdb')
        meta_file_name = out_file_name('satellites', cur_infix + '-meta.txt')
        log('recording snapshot ' + save_file_name)
        # shutil.copy instead of a shell "cp": works for paths containing
        # spaces or shell metacharacters and raises if the copy fails.
        shutil.copy(bdb_file, save_file_name)
        report(save_file_name, meta_file_name)

    def record_metadata(f, saved_file_name, sha_sum, total_time,
                        plot_file_name=None):
        # Write the provenance lines for one database file to the open
        # file-like object f.  sha_sum already ends with a newline.
        f.write("DB file " + saved_file_name + "\n")
        f.write(sha_sum)
        f.write("built from " + csv_file + "\n")
        f.write("by %s@%s\n" % (user, host))
        f.write("at seed %s\n" % seed)
        f.write("in %3.2f seconds\n" % total_time)
        f.write("with %s models analyzed for %s iterations\n"
                % (num_models, num_iters))
        f.write("by bayeslite %s, with crosscat %s and bdbcontrib %s\n"
                % (bayeslite.__version__, crosscat.__version__, bdbcontrib.__version__))
        if plot_file_name is not None:
            f.write("diagnostics recorded to %s\n" % plot_file_name)
        f.flush()

    def report(saved_file_name, metadata_file, echo=False, plot_file_name=None):
        # Hash saved_file_name, then write the metadata followed by a copy
        # of this script to metadata_file; optionally echo the metadata to
        # stdout as well.
        sha256 = hashlib.sha256()
        with open(saved_file_name, 'rb') as fd:
            # b'' is the end-of-file value for a binary read.  On Python 2
            # it is the same object type as '', on Python 3 the old ''
            # sentinel would never match and the loop would not terminate.
            for chunk in iter(lambda: fd.read(65536), b''):
                sha256.update(chunk)
        sha_sum = sha256.hexdigest() + '\n'
        total_time = time.time() - then
        with open(metadata_file, 'w') as fd:
            record_metadata(fd, saved_file_name,
                            sha_sum, total_time, plot_file_name)
            fd.write('using script ')
            fd.write('-' * 57)
            fd.write('\n')
            # Append this script through the already-open handle rather
            # than shelling out to "cat ... >>" on the same file.
            with open(__file__, 'r') as script_fd:
                shutil.copyfileobj(script_fd, fd)

        if echo:
            record_metadata(sys.stdout, saved_file_name,
                            sha_sum, total_time, plot_file_name)

    def final_report():
        # create a diagnostics plot
        plot_file_name = out_file_name('satellites', '-logscores.pdf')
        log('writing diagnostic plot to %s' % plot_file_name)
        _fig = bdbcontrib.plot_crosscat_chain_diagnostics(bdb, 'logscore',
                                                          'satellites_cc')
        plt.savefig(plot_file_name)
        final_metadata_file = out_file_name('satellites', '-meta.txt')
        report(bdb_file, final_metadata_file,
               echo=True, plot_file_name=plot_file_name)

    # Everything that uses the open database runs under try/finally so the
    # handle is closed even when a step fails part-way through.
    try:
        # read csv into table
        log('reading data from %s' % csv_file)
        bayeslite.bayesdb_read_csv_file(bdb, 'satellites', csv_file,
                header=True, create=True, ifnotexists=True)

        # Add a "not applicable" orbit sub-type
        log('adding "not applicable" orbit sub-type')
        bdb.sql_execute('''UPDATE satellites
        SET type_of_orbit = 'N/A'
        WHERE (class_of_orbit = 'GEO' OR class_of_orbit = 'MEO')
          AND type_of_orbit = 'NaN'
    ''')

        # nullify "NaN"
        log('nullifying NaN')
        bdbcontrib.nullify(bdb, 'satellites', 'NaN')

        # register crosscat metamodel
        cc = ccme.MultiprocessingEngine(seed=seed)
        ccmm = bayeslite.metamodels.crosscat.CrosscatMetamodel(cc)
        bayeslite.bayesdb_register_metamodel(bdb, ccmm)

        # create the crosscat generator using
        execute('''
        CREATE GENERATOR satellites_cc FOR satellites USING crosscat (
            GUESS(*),
            name IGNORE,
            Country_of_Operator CATEGORICAL,
            Operator_Owner CATEGORICAL,
            Users CATEGORICAL,
            Purpose CATEGORICAL,
            Class_of_Orbit CATEGORICAL,
            Type_of_Orbit CATEGORICAL,
            Perigee_km NUMERICAL,
            Apogee_km NUMERICAL,
            Eccentricity NUMERICAL,
            Period_minutes NUMERICAL,
            Launch_Mass_kg NUMERICAL,
            Dry_Mass_kg NUMERICAL,
            Power_watts NUMERICAL,
            Date_of_Launch NUMERICAL,
            Anticipated_Lifetime NUMERICAL,
            Contractor CATEGORICAL,
            Country_of_Contractor CATEGORICAL,
            Launch_Site CATEGORICAL,
            Launch_Vehicle CATEGORICAL,
            Source_Used_for_Orbital_Data CATEGORICAL,
            longitude_radians_of_geo NUMERICAL,
            Inclination_radians NUMERICAL
        )
    ''')

        execute('INITIALIZE %d MODELS FOR satellites_cc' % (num_models,))

        # Read by snapshot() through the closure to name each checkpoint.
        cur_iter_ct = 0

        snapshot()
        while cur_iter_ct < num_iters:
            execute('ANALYZE satellites_cc FOR %d ITERATIONS WAIT' % checkpoint_freq)
            cur_iter_ct += checkpoint_freq
            snapshot()

        final_report()
    finally:
        log('closing bdb %s' % bdb_file)
        bdb.close()

    # Relative symlink inside out_dir, equivalent to
    # "cd out_dir && ln -s satellites<stamp>.bdb satellites.bdb".  Failure
    # (for example because the link already exists) stays non-fatal, as it
    # was with the shell command, but is now logged.
    link_path = os.path.join(out_dir, 'satellites.bdb')
    try:
        os.symlink('satellites%s.bdb' % filestamp, link_path)
    except OSError as e:
        log('could not create symlink %s: %s' % (link_path, e))