Esempio n. 1
0
def main(run_name):
    """Print a processing summary for a single Joker run.

    Prints the number of stars attached to the run, the number that are
    still left to process, and a per-status breakdown.

    Parameters
    ----------
    run_name : str
        Value matched against ``JokerRun.name``. The ``.one()`` lookup
        raises if it does not match exactly one row.
    """
    Session, _ = db_connect(path.join(TWOFACE_CACHE_PATH, 'apogee.sqlite'))
    session = Session()

    # Release the session even if one of the queries below raises.
    try:
        # APOGEE IDs that have at least one result with status id > 0.
        # Note that this subquery is not restricted to ``run_name``.
        done_subq = session.query(AllStar.apogee_id)\
                           .join(StarResult, JokerRun, Status)\
                           .filter(Status.id > 0).distinct()

        run = session.query(JokerRun).filter(JokerRun.name == run_name).one()

        # Base query shared by all of the counts below. Query objects are
        # generative, so each .filter() returns a new query and leaves this
        # one untouched.
        run_stars = session.query(AllStar)\
                           .join(StarResult, JokerRun, Status)\
                           .filter(JokerRun.name == run.name)

        n_total = run_stars.count()
        print("{0} total".format(n_total))

        n_left = run_stars.filter(~AllStar.apogee_id.in_(done_subq)).count()
        print("{0} left to process".format(n_left))

        print("\nDone:")
        for status in session.query(Status).order_by(Status.id).all():
            n_status = run_stars.filter(Status.id == status.id).count()
            print("{0} ({1}): {2}".format(status.message, status.id,
                                          n_status))

    finally:
        session.close()
Esempio n. 2
0
def main(database_file, apogee_id, joker_run):
    """Plot the RV data of one star with its posterior orbit samples.

    Two figures are made with ``plot_data_orbits`` (``xlim_choice='wide'``
    and ``xlim_choice='tight'``) and then displayed with ``plt.show()``.

    Parameters
    ----------
    database_file : str
        File name of the sqlite database, relative to
        ``TWOFACE_CACHE_PATH``.
    apogee_id : str
        APOGEE ID of the star; also used as the group name inside the
        run's HDF5 samples file.
    joker_run : str
        Value matched against ``JokerRun.name``.

    Raises
    ------
    IOError
        If the sqlite database file does not exist.
    NoResultFound
        If the star has no results in the requested run.
    """
    db_path = path.join(TWOFACE_CACHE_PATH, database_file)
    if not path.exists(db_path):
        raise IOError("sqlite database not found at '{0}'\n Did you run "
                      "scripts/initdb.py yet for that database?"
                      .format(db_path))

    logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
    Session, _ = db_connect(database_path=db_path,
                            ensure_db_exists=False)
    session = Session()

    # Close the session on every path, including failed lookups.
    try:
        # Get The Joker run information
        run = session.query(JokerRun).filter(JokerRun.name == joker_run).one()

        try:
            star = session.query(AllStar).join(StarResult, JokerRun)\
                          .filter(AllStar.apogee_id == apogee_id)\
                          .filter(JokerRun.name == joker_run)\
                          .limit(1)\
                          .one()

        except NoResultFound:
            raise NoResultFound("Star {0} has no results in Joker run {1}."
                                .format(apogee_id, joker_run))

        # get the RV data for this star
        data = star.apogeervdata()

        # load posterior samples from The Joker. The file is only read here,
        # so open it read-only explicitly: the default mode of h5py.File
        # depends on the installed h5py version.
        h5_file = path.join(TWOFACE_CACHE_PATH, '{0}.hdf5'.format(run.name))
        with h5py.File(h5_file, 'r') as f:
            samples = JokerSamples.from_hdf5(f[apogee_id])

        # Plot the data with orbits on top, once per x-axis range choice
        for xlim_choice in ('wide', 'tight'):
            fig = plot_data_orbits(data, samples, jitter=run.jitter,
                                   xlim_choice=xlim_choice,
                                   title=star.apogee_id)
            fig.set_tight_layout(True)

        # TODO: make a pseudo-corner plot (no histograms) of projections of
        # the samples, including derived quantities such as m2 and rperi.

    finally:
        session.close()

    plt.show()
Esempio n. 3
0
def main(pool):
    """Time rejection sampling for one star over a range of prior sizes.

    For each value of ``n_prior_samples`` the sampler is called a fixed
    number of times, and the wall-clock time divided by that number of calls
    is printed next to the prior size. Calls that raise are logged and still
    count towards the average.

    Parameters
    ----------
    pool
        Worker pool handed to ``TheJoker``; its ``size`` is printed and it
        is closed at the end of the run.
    """
    random_seed = 42
    n_repeats = 4

    sqlite_path = path.join(TWOFACE_CACHE_PATH, 'apogee.sqlite')
    logger.debug("Connecting to sqlite database at '{0}'".format(sqlite_path))
    Session, engine = db_connect(database_path=sqlite_path,
                                 ensure_db_exists=False)
    session = Session()

    # Benchmark target: this star needs more prior samples. Other stars
    # that were used here: '2M01320817+0120301' (needs mcmc) and
    # '2M02123870+4942289' (completed).
    target_id = '2M04171719+4724006'
    id_matches = AllStar.apogee_id == target_id
    star = session.query(AllStar).filter(id_matches).limit(1).one()
    data = star.apogeervdata()

    joker = TheJoker(JokerParams(P_min=8*u.day, P_max=32768*u.day),
                     random_state=np.random.RandomState(seed=random_seed),
                     pool=pool)

    prior_cache_file = path.join(TWOFACE_CACHE_PATH,
                                 'P8-32768_prior_samples.hdf5')

    print("Pool size: {0}".format(pool.size))

    prior_sizes = 2 ** np.arange(7, 25+1, 2)
    for max_prior_samples in prior_sizes:
        started = time.time()

        for _ in range(n_repeats):
            try:
                joker.rejection_sample(data=data,
                                       prior_cache_file=prior_cache_file,
                                       n_prior_samples=max_prior_samples)

            except Exception as e:
                # Log the failure and move on to the next repeat
                logger.warning("\t Failed sampling for star {0} \n Error: {1}"
                               .format(star.apogee_id, str(e)))

        seconds_per_call = (time.time() - started) / n_repeats
        print("{0}, {1:.3f}".format(max_prior_samples, seconds_per_call))

    pool.close()
    session.close()
Esempio n. 4
0
def main(db_file, pool, seed, overwrite=False):
    """Run rejection sampling with The Joker for one hard-coded star.

    The sampler parameters, run name and APOGEE ID are fixed inside the
    function. A prior-sample cache is generated when it is missing (or when
    ``overwrite`` is set), and the resulting samples are written to
    ``<apogee_id>.hdf5`` under ``TWOFACE_CACHE_PATH``.

    Parameters
    ----------
    db_file : str
        File name of the sqlite database, relative to
        ``TWOFACE_CACHE_PATH``.
    pool
        Worker pool handed to ``TheJoker``; closed before returning.
    seed
        Seed passed to ``numpy.random.RandomState``.
    overwrite : bool, optional
        Regenerate the prior-sample cache even if the file already exists.

    Raises
    ------
    IOError
        If the sqlite database file does not exist.
    SystemExit
        With exit code 1 if the sampling step fails.
    """
    db_path = join(TWOFACE_CACHE_PATH, db_file)
    if not os.path.exists(db_path):
        raise IOError("sqlite database not found at '{0}'\n Did you run "
                      "scripts/initdb.py yet for that database?"
                      .format(db_path))

    logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
    Session, _ = db_connect(database_path=db_path,
                            ensure_db_exists=False)
    session = Session()

    # Release the pool and the session on every path, including the
    # sys.exit() below and any unexpected exception.
    try:
        # TODO: all hard-set, these should be options
        params = JokerParams(P_min=10 * u.day,
                             P_max=1000 * u.day,
                             jitter=(9.5, 1.64),
                             jitter_unit=u.m/u.s)
        n_prior_samples = 2**28
        run_name = 'apogee-jitter'
        apogee_id = '2M01231070+1801407'

        results_filename = join(TWOFACE_CACHE_PATH,
                                '{0}.hdf5'.format(apogee_id))
        prior_samples_file = join(TWOFACE_CACHE_PATH,
                                  '{0}-prior.hdf5'.format(apogee_id))

        # Create TheJoker sampler instance with the specified random seed
        # and pool
        rnd = np.random.RandomState(seed=seed)
        logger.debug("Creating TheJoker instance with {0}, {1}"
                     .format(rnd, pool))
        joker = TheJoker(params, random_state=rnd, pool=pool)

        # Create a cache of prior samples if it doesn't exist, or if the
        # caller asked for it to be regenerated.
        if not os.path.exists(prior_samples_file) or overwrite:
            logger.debug("Prior samples file not found - generating {0} "
                         "samples...".format(n_prior_samples))
            make_prior_cache(prior_samples_file, joker,
                             nsamples=n_prior_samples)
            logger.debug("...done")

        # Look up the one hard-coded star within the hard-coded run. There
        # is no filter on status here: the star is processed whatever its
        # current status is.
        star_query = session.query(AllStar)\
                            .join(StarResult, JokerRun, Status)\
                            .filter(JokerRun.name == run_name)\
                            .filter(AllStar.apogee_id == apogee_id)
        star = star_query.limit(1).one()

        logger.log(1, "Starting star {0}".format(star.apogee_id))
        t0 = time.time()

        data = star.apogeervdata()
        logger.log(1, "\t visits loaded ({:.2f} seconds)"
                   .format(time.time()-t0))
        try:
            samples = joker.rejection_sample(
                data=data, prior_cache_file=prior_samples_file,
                return_logprobs=False)

        except Exception as e:
            logger.warning("\t Failed sampling for star {0} \n Error: {1}"
                           .format(star.apogee_id, str(e)))
            sys.exit(1)

        logger.debug("\t done sampling ({:.2f} seconds)"
                     .format(time.time()-t0))

        # Write the samples that pass to the results file
        with h5py.File(results_filename, 'w') as f:
            samples.to_hdf5(f)

        logger.debug("\t saved samples ({:.2f} seconds)"
                     .format(time.time()-t0))
        logger.debug("...done with star {} ({:.2f} seconds)"
                     .format(star.apogee_id, time.time()-t0))

    finally:
        pool.close()
        session.close()
def main(config_file, pool, seed, overwrite=False):
    """Continue sampling for stars of an existing run that need MCMC.

    NOTE: this is a work-in-progress debugging script. After the first star
    that gets past the sampling step, the pool and session are closed and
    the process exits with status 0. In the MCMC branch the sampler is
    pickled to ``test-mcmc.pickle`` first. The code below that exit, which
    writes samples and updates the star's status, is currently unreachable.

    Parameters
    ----------
    config_file : str
        Path to the YAML run configuration; must contain ``database_file``.
    pool
        Worker pool handed to ``TheJoker``; closed before exiting.
    seed
        Seed passed to ``numpy.random.RandomState``.
    overwrite : bool, optional
        Currently unused: ``get_run`` is always called with
        ``overwrite=False``.

    Raises
    ------
    IOError
        If the sqlite database or the posterior-samples file is missing.
    """
    import pickle
    import sys

    config_file = path.abspath(path.expanduser(config_file))

    # parse config file. safe_load is used because yaml.load() without an
    # explicit Loader is rejected by recent PyYAML releases, and the config
    # is plain data that does not need arbitrary object construction.
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    # filename of sqlite database
    database_file = config['database_file']

    db_path = path.join(TWOFACE_CACHE_PATH, database_file)
    if not os.path.exists(db_path):
        raise IOError(
            "sqlite database not found at '{0}'\n Did you run "
            "scripts/initdb.py yet for that database?".format(db_path))

    logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
    Session, engine = db_connect(database_path=db_path, ensure_db_exists=False)
    session = Session()

    run = get_run(config, session, overwrite=False)

    # The file with cached posterior samples:
    results_filename = path.join(TWOFACE_CACHE_PATH,
                                 "{0}.hdf5".format(run.name))
    if not path.exists(results_filename):
        raise IOError(
            "Posterior samples result file {0} doesn't exist! Are "
            "you sure you ran `run_apogee.py`?".format(results_filename))

    # Create TheJoker sampler instance with the specified random seed and pool
    rnd = np.random.RandomState(seed=seed)
    logger.debug("Creating TheJoker instance with {0}, {1}".format(rnd, pool))
    params = run.get_joker_params()
    joker = TheJoker(params, random_state=rnd, pool=pool)

    # Path of the second, larger prior cache used by the "needs more prior
    # samples" branch below. It must be defined here because that branch
    # references it.
    # TODO: make sure this cache exists (make_prior_cache with
    # nsamples=8 * config['prior']['num_cache'], ~100 GB, batch_size=2**24).
    # This is skipped for now because only "needs mcmc" stars are selected.
    _path, ext = path.splitext(run.prior_samples_file)
    new_path = '{0}_moar{1}'.format(_path, ext)

    # Get all stars in this JokerRun that need more prior samples
    # TODO HACK: this query only selects "needs mcmc" stars (status id 2);
    # switch the filter to Status.id == 1 for "needs more prior samples".
    star_query = session.query(AllStar).join(StarResult, JokerRun, Status)\
                                       .filter(JokerRun.name == run.name)\
                                       .filter(Status.id == 2)

    # Base query to get a StarResult for a given Star so we can update the
    # status, etc.
    result_query = session.query(StarResult).join(AllStar, JokerRun)\
                                            .filter(JokerRun.name == run.name)

    n_stars = star_query.count()
    logger.info("{0} stars left to process for run more samples '{1}'".format(
        n_stars, run.name))

    # --------------------------------------------------------------------------
    # Here is where we do the actual processing of the data for each star. We
    # loop through all stars that still need processing and continue with
    # rejection sampling.

    count = 0  # how many stars we've processed in this star batch
    batch_size = 16  # MAGIC NUMBER: how many stars to process before committing
    for star in star_query.all():

        if result_query.filter(
                AllStar.apogee_id == star.apogee_id).count() < 1:
            logger.debug('Star {0} has no result object!'.format(
                star.apogee_id))
            continue

        # Retrieve existing StarResult from database. We limit(1) because the
        # APOGEE_ID isn't unique, but we attach all visits for a given star to
        # all rows, so grabbing one of them is fine.
        result = result_query.filter(AllStar.apogee_id == star.apogee_id)\
                             .limit(1).one()

        logger.log(1, "Starting star {0}".format(star.apogee_id))
        logger.log(1, "Current status: {0}".format(str(result.status)))
        t0 = time.time()

        data = star.apogeervdata()
        logger.log(
            1, "\t visits loaded ({:.2f} seconds)".format(time.time() - t0))

        if result.status.id == 1:  # needs more prior samples

            try:
                samples, ln_prior = joker.iterative_rejection_sample(
                    data=data,
                    n_requested_samples=run.requested_samples_per_star,
                    # HACK: uses the larger cache, not run.prior_samples_file
                    prior_cache_file=new_path,
                    return_logprobs=True)

            except Exception as e:
                logger.warning(
                    "\t Failed sampling for star {0} \n Error: {1}".format(
                        star.apogee_id, str(e)))
                continue

            logger.debug(
                "\t done sampling ({:.2f} seconds)".format(time.time() - t0))

        elif result.status.id == 2:  # needs mcmc
            logger.debug('Firing up MCMC:')

            with h5py.File(results_filename, 'r') as f:
                samples0 = JokerSamples.from_hdf5(f[star.apogee_id])

            n_walkers = 2 * run.requested_samples_per_star
            model, samples, sampler = joker.mcmc_sample(data,
                                                        samples0,
                                                        n_burn=1024,
                                                        n_steps=65536,
                                                        n_walkers=n_walkers,
                                                        return_sampler=True)

            # Detach the pool from the sampler before pickling it
            sampler.pool = None
            with open('test-mcmc.pickle', 'wb') as f:
                pickle.dump(sampler, f)

        # HACK (debugging): stop after the first processed star. Release the
        # pool and the database session before exiting.
        pool.close()
        session.close()
        sys.exit(0)

        # ----------------------------------------------------------------------
        # NOTE: everything below in this loop body is unreachable while the
        # debugging exit above is in place.
        # NOTE(review): ``ln_prior`` is only assigned in the status-1 branch;
        # the MCMC branch needs its own value before this is re-enabled.

        # For now, it's sufficient to write the run results to an HDF5 file
        n = run.requested_samples_per_star
        all_ln_probs = ln_prior[:n]
        samples = samples[:n]

        # Write the samples that pass to the results file
        with h5py.File(results_filename, 'r+') as f:
            if star.apogee_id in f:
                del f[star.apogee_id]

            g = f.create_group(star.apogee_id)
            samples.to_hdf5(g)
            g.create_dataset('ln_prior_probs', data=all_ln_probs)

        logger.debug("\t saved samples ({:.2f} seconds)".format(time.time() -
                                                                t0))

        # Use the same attribute as the rest of this function
        # (requested_samples_per_star) for the requested sample count.
        result.status_id = get_status_id(samples, data, n)

        logger.debug("...done with star {} ({:.2f} seconds)".format(
            star.apogee_id,
            time.time() - t0))

        if count % batch_size == 0 and count > 0:
            session.commit()

        count += 1

    pool.close()

    session.commit()
    session.close()
Esempio n. 6
0
from os import path
import astropy.units as u

import numpy as np
import h5py

from thejoker import JokerSamples

from twoface.config import TWOFACE_CACHE_PATH
from twoface.db import (db_connect, AllStar, AllVisit, AllVisitToAllStar, RedClump,
                        StarResult, Status, JokerRun, initialize_db)
from twoface import unimodal_P

# One-off migration of the cached samples file: for every group that still
# has an 'ecc' dataset, copy 'ecc' to 'e' and 'phi0' (data plus attributes)
# to 'M0'. The original 'ecc' and 'phi0' datasets are left in place.
samples_file = path.join(TWOFACE_CACHE_PATH, 'apogee-jitter.hdf5')
Session, _ = db_connect(path.join(TWOFACE_CACHE_PATH, 'apogee.sqlite'))
session = Session()

# Open read/write explicitly: without a mode argument, h5py 3 and later open
# the file read-only and the assignments below fail. 'r+' also means a
# missing samples file raises instead of silently creating an empty one.
with h5py.File(samples_file, 'r+') as f:
    for key in f:
        g = f[key]

        if 'ecc' in g:
            try:
                g['e'] = g['ecc'][:]
            except RuntimeError:
                # Report which group failed before re-raising
                print(key)
                raise

            g['M0'] = g['phi0'][:]
            for k, v in dict(g['phi0'].attrs).items():
                g['M0'].attrs[k] = v

# The session is not used by the migration itself; release it when done.
session.close()
Esempio n. 7
0
def main(config_file, pool, seed, overwrite=False):
    """Run The Joker on a control sample of stars with simulated RVs.

    A random subset of ``NCONTROL`` stars is drawn from the stars that have
    a result with status id > 0. For each one not yet present in the control
    results file, the measured RVs are replaced by draws with no orbital
    variation, and the resulting samples are written to
    ``<run name>-control.hdf5`` under ``TWOFACE_CACHE_PATH``.

    Parameters
    ----------
    config_file : str
        Path to the YAML run configuration; must contain ``database_file``.
    pool
        Worker pool handed to ``TheJoker``; closed before returning.
    seed : int or None
        Seed passed to ``numpy.random.RandomState``; ``None`` means 42.
    overwrite : bool, optional
        Currently unused: ``get_run`` is always called with
        ``overwrite=False``.

    Raises
    ------
    ValueError
        If the configuration has no ``database_file`` entry.
    IOError
        If the sqlite database or the prior-sample cache is missing.
    """
    # Default seed:
    if seed is None:
        seed = 42

    config_file = abspath(expanduser(config_file))

    # parse config file. safe_load is used because yaml.load() without an
    # explicit Loader is rejected by recent PyYAML releases, and the config
    # is plain data that does not need arbitrary object construction.
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)
        config['config_file'] = config_file

    # filename of sqlite database. Fail with a clear message instead of
    # letting join() choke on None further down.
    if 'database_file' not in config:
        raise ValueError("Config file '{0}' does not specify a "
                         "'database_file'.".format(config_file))
    database_file = config['database_file']

    db_path = join(TWOFACE_CACHE_PATH, database_file)
    if not os.path.exists(db_path):
        raise IOError(
            "sqlite database not found at '{0}'\n Did you run "
            "scripts/initdb.py yet for that database?".format(db_path))

    logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
    Session, engine = db_connect(database_path=db_path, ensure_db_exists=False)
    session = Session()

    # Release the pool and the session on every path, including errors.
    try:
        # Retrieve or create a JokerRun instance
        run = get_run(config, session, overwrite=False)  # never overwrite
        params = run.get_joker_params()

        # Create TheJoker sampler instance with the specified random seed
        # and pool
        rnd = np.random.RandomState(seed=seed)
        logger.debug("Creating TheJoker instance with {0}, {1}".format(
            rnd, pool))
        joker = TheJoker(params, random_state=rnd, pool=pool)

        # Create a file to cache the resulting posterior samples
        results_filename = join(TWOFACE_CACHE_PATH,
                                "{0}-control.hdf5".format(run.name))

        # Ensure that the results file exists - this is where we cache
        # samples that pass the rejection sampling step
        if not os.path.exists(results_filename):
            with h5py.File(results_filename, 'w') as f:
                pass

        with h5py.File(results_filename, 'r') as f:
            done_apogee_ids = list(f.keys())

        # The prior cache is never generated here; it has to exist already.
        if not os.path.exists(run.prior_samples_file):
            raise IOError("Prior cache must already exist.")

        # Get random IDs: draw the control sample with the seeded generator,
        # then drop the stars that already have a group in the results file.
        star_ids = session.query(AllStar.apogee_id)\
                          .join(StarResult, JokerRun, Status)\
                          .filter(Status.id > 0).distinct().all()
        star_ids = np.array([x[0] for x in star_ids])
        star_ids = rnd.choice(star_ids, size=NCONTROL, replace=False)
        star_ids = star_ids[~np.isin(star_ids, done_apogee_ids)]

        n_stars = len(star_ids)
        logger.info(
            "{0} stars left to process for run '{1}'; {2} already done."
            .format(n_stars, run.name, len(done_apogee_ids)))

        # ----------------------------------------------------------------------
        # Here is where we do the actual processing of the data for each
        # star. We loop through all stars that still need processing and
        # iteratively rejection sample with larger and larger prior sample
        # batch sizes. We do this for efficiency, but the argument for this
        # is somewhat made up...

        for apid in star_ids:
            star = AllStar.get_apogee_id(session, apid)

            logger.log(1, "Starting star {0}".format(star.apogee_id))
            t0 = time.time()

            orig_data = star.apogeervdata()

            # HACK: this assumes we're sampling over the excess variance
            # parameter
            # Generate new data with no RV orbital variations: draw a jitter
            # value, add it in quadrature to the visit uncertainties, and
            # scatter the RVs around the mean of the original RVs.
            y = rnd.normal(params.jitter[0], params.jitter[1])
            s = np.exp(0.5 * y) * params._jitter_unit
            std = np.sqrt(s**2 + orig_data.stddev**2)\
                    .to(orig_data.rv.unit).value
            new_rv = rnd.normal(np.mean(orig_data.rv).value, std)
            data = APOGEERVData(t=orig_data.t,
                                rv=new_rv * orig_data.rv.unit,
                                stddev=orig_data.stddev)

            logger.log(
                1,
                "\t visits loaded ({:.2f} seconds)".format(time.time() - t0))
            try:
                # The log-prior values are returned but not stored for the
                # control sample.
                samples, _ = joker.iterative_rejection_sample(
                    data=data,
                    n_requested_samples=run.requested_samples_per_star,
                    prior_cache_file=run.prior_samples_file,
                    n_prior_samples=run.max_prior_samples,
                    return_logprobs=True)

            except Exception as e:
                logger.warning(
                    "\t Failed sampling for star {0} \n Error: {1}".format(
                        star.apogee_id, str(e)))
                continue

            logger.debug("\t done sampling ({:.2f} seconds)".format(
                time.time() - t0))

            # For now, it's sufficient to write the run results to an HDF5
            # file
            n = run.requested_samples_per_star
            samples = samples[:n]

            # Write the samples that pass to the results file
            with h5py.File(results_filename, 'r+') as f:
                if star.apogee_id in f:
                    del f[star.apogee_id]

                # HACK: this will overwrite the past samples!
                g = f.create_group(star.apogee_id)
                samples.to_hdf5(g)

            logger.debug("\t saved samples ({:.2f} seconds)".format(
                time.time() - t0))

    finally:
        pool.close()
        session.close()
Esempio n. 8
0
def main(config_file, pool, seed, overwrite=False, _continue=False):
    """Run The Joker on every star of a run that still needs processing.

    For each star with status id 0, the visits are loaded and iterative
    rejection sampling is run. The samples and their log-prior values are
    written to ``<run name>.hdf5`` under ``TWOFACE_CACHE_PATH``, and the
    star's result status is updated from the number of samples returned.

    Parameters
    ----------
    config_file : str
        Path to the YAML run configuration; must contain ``database_file``
        and ``prior.num_cache``.
    pool
        Worker pool handed to ``TheJoker``; closed before returning.
    seed
        Seed passed to ``numpy.random.RandomState``.
    overwrite : bool, optional
        Forwarded to ``get_run``; also forces the prior-sample cache to be
        regenerated.
    _continue : bool, optional
        Currently unused.

    Raises
    ------
    ValueError
        If the configuration has no ``database_file`` entry.
    IOError
        If the sqlite database file does not exist.
    """
    config_file = abspath(expanduser(config_file))

    # parse config file. safe_load is used because yaml.load() without an
    # explicit Loader is rejected by recent PyYAML releases, and the config
    # is plain data that does not need arbitrary object construction.
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)
        config['config_file'] = config_file

    # filename of sqlite database. Fail with a clear message instead of
    # letting join() choke on None further down.
    if 'database_file' not in config:
        raise ValueError("Config file '{0}' does not specify a "
                         "'database_file'.".format(config_file))
    database_file = config['database_file']

    db_path = join(TWOFACE_CACHE_PATH, database_file)
    if not os.path.exists(db_path):
        raise IOError(
            "sqlite database not found at '{0}'\n Did you run "
            "scripts/initdb.py yet for that database?".format(db_path))

    logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
    Session, engine = db_connect(database_path=db_path, ensure_db_exists=False)
    session = Session()

    # Release the pool and the session on every path, including errors.
    try:
        # Retrieve or create a JokerRun instance
        run = get_run(config, session, overwrite=overwrite)
        params = run.get_joker_params()

        # Create TheJoker sampler instance with the specified random seed
        # and pool
        rnd = np.random.RandomState(seed=seed)
        logger.debug("Creating TheJoker instance with {0}, {1}".format(
            rnd, pool))
        joker = TheJoker(params, random_state=rnd, pool=pool)

        # Create a cache of prior samples if it doesn't exist, or if the
        # caller asked for everything to be overwritten.
        if not os.path.exists(run.prior_samples_file) or overwrite:
            logger.debug(
                "Prior samples file not found - generating {0} samples..."
                .format(config['prior']['num_cache']))
            make_prior_cache(run.prior_samples_file,
                             joker,
                             nsamples=config['prior']['num_cache'])
            logger.debug("...done")

        # Get done APOGEE ID's: those with at least one result whose status
        # id is > 0. Note that this subquery is not restricted to this run.
        done_subq = session.query(AllStar.apogee_id)\
                           .join(StarResult, JokerRun, Status)\
                           .filter(Status.id > 0).distinct()

        # Query to get all stars associated with this run that need
        # processing: they should have a status id = 0 (needs processing)
        star_query = session.query(AllStar)\
                            .join(StarResult, JokerRun, Status)\
                            .filter(JokerRun.name == run.name)\
                            .filter(Status.id == 0)\
                            .filter(~AllStar.apogee_id.in_(done_subq))

        # Base query to get a StarResult for a given Star so we can update
        # the status, etc. The status is filtered on StarResult's own
        # status_id column: Status is not joined in this query, so filtering
        # on Status.id would add an unconstrained extra table to the FROM
        # clause instead of restricting the results.
        result_query = session.query(StarResult)\
                              .join(AllStar, JokerRun)\
                              .filter(JokerRun.name == run.name)\
                              .filter(StarResult.status_id == 0)\
                              .filter(~AllStar.apogee_id.in_(done_subq))

        # Create a file to cache the resulting posterior samples
        results_filename = join(TWOFACE_CACHE_PATH,
                                "{0}.hdf5".format(run.name))
        n_stars = star_query.count()
        logger.info("{0} stars left to process for run '{1}'".format(
            n_stars, run.name))

        # Ensure that the results file exists - this is where we cache
        # samples that pass the rejection sampling step
        if not os.path.exists(results_filename):
            with h5py.File(results_filename, 'w') as f:
                pass

        # ----------------------------------------------------------------------
        # Here is where we do the actual processing of the data for each
        # star. We loop through all stars that still need processing and
        # iteratively rejection sample with larger and larger prior sample
        # batch sizes. We do this for efficiency, but the argument for this
        # is somewhat made up...

        count = 0  # how many stars we've processed so far
        batch_size = 16  # MAGIC NUMBER: stars to process between commits
        for star in star_query.all():

            if result_query.filter(
                    AllStar.apogee_id == star.apogee_id).count() < 1:
                logger.debug('Star {0} has no result object!'.format(
                    star.apogee_id))
                continue

            # Retrieve existing StarResult from database. We limit(1)
            # because the APOGEE_ID isn't unique, but we attach all visits
            # for a given star to all rows, so grabbing one of them is fine.
            result = result_query\
                .filter(AllStar.apogee_id == star.apogee_id)\
                .limit(1).one()

            logger.log(1, "Starting star {0}".format(star.apogee_id))
            logger.log(1, "Current status: {0}".format(str(result.status)))
            t0 = time.time()

            data = star.apogeervdata()
            logger.log(
                1,
                "\t visits loaded ({:.2f} seconds)".format(time.time() - t0))
            try:
                samples, ln_prior = joker.iterative_rejection_sample(
                    data=data,
                    n_requested_samples=run.requested_samples_per_star,
                    prior_cache_file=run.prior_samples_file,
                    n_prior_samples=run.max_prior_samples,
                    return_logprobs=True)

            except Exception as e:
                logger.warning(
                    "\t Failed sampling for star {0} \n Error: {1}".format(
                        star.apogee_id, str(e)))
                continue

            logger.debug("\t done sampling ({:.2f} seconds)".format(
                time.time() - t0))

            # For now, it's sufficient to write the run results to an HDF5
            # file
            n = run.requested_samples_per_star
            all_ln_probs = ln_prior[:n]
            samples = samples[:n]
            n_actual_samples = len(all_ln_probs)

            # Write the samples that pass to the results file
            with h5py.File(results_filename, 'r+') as f:
                if star.apogee_id in f:
                    del f[star.apogee_id]

                # HACK: this will overwrite the past samples!
                g = f.create_group(star.apogee_id)
                samples.to_hdf5(g)

                if 'ln_prior_probs' in g:
                    del g['ln_prior_probs']
                g.create_dataset('ln_prior_probs', data=all_ln_probs)

            logger.debug("\t saved samples ({:.2f} seconds)".format(
                time.time() - t0))

            if n_actual_samples >= run.requested_samples_per_star:
                result.status_id = 4  # completed

            elif n_actual_samples == 1:
                # Only one sample was returned - this is probably unimodal,
                # so this star needs MCMC
                result.status_id = 2  # needs mcmc

            elif unimodal_P(samples, data):
                # Multiple samples were returned, but they look unimodal
                result.status_id = 2  # needs mcmc

            else:
                # Multiple samples were returned, but not enough to satisfy
                # the number requested in the config file
                result.status_id = 1  # needs more samples

            logger.debug("...done with star {} ({:.2f} seconds)".format(
                star.apogee_id,
                time.time() - t0))

            # Commit after every ``batch_size`` processed stars. Counting
            # before the check makes the first commit land after exactly
            # ``batch_size`` stars rather than ``batch_size + 1``.
            count += 1
            if count % batch_size == 0:
                session.commit()

        # Commit whatever is left from the last, partial batch
        session.commit()

    finally:
        pool.close()
        session.close()