def main(run_name):
    """Print a progress summary for the Joker run named ``run_name``.

    Reports the total number of stars attached to the run, how many are
    still left to process, and a per-status breakdown.

    Parameters
    ----------
    run_name : str
        Value matched against ``JokerRun.name``. The lookup uses ``.one()``,
        so exactly one run with this name must exist.
    """
    Session, _ = db_connect(path.join(TWOFACE_CACHE_PATH, 'apogee.sqlite'))
    session = Session()

    # Make sure the session is released even if one of the queries fails.
    try:
        run = session.query(JokerRun).filter(JokerRun.name == run_name).one()

        def _run_stars():
            # All stars that have a result row attached to this run.
            return session.query(AllStar)\
                          .join(StarResult, JokerRun, Status)\
                          .filter(JokerRun.name == run.name)

        # APOGEE IDs that already have a result with a status id > 0.
        # NOTE(review): this subquery is not restricted to `run`, so a star
        # with status > 0 in any run counts as done here - confirm intent.
        done_subq = session.query(AllStar.apogee_id)\
                           .join(StarResult, JokerRun, Status)\
                           .filter(Status.id > 0).distinct()

        n_total = _run_stars().count()
        print("{0} total".format(n_total))

        n_left = _run_stars()\
            .filter(~AllStar.apogee_id.in_(done_subq))\
            .count()
        print("{0} left to process".format(n_left))

        print("\nDone:")
        for status in session.query(Status).order_by(Status.id).all():
            star_query = _run_stars().filter(Status.id == status.id)
            print("{0} ({1}): {2}".format(status.message, status.id,
                                          star_query.count()))
    finally:
        session.close()
def main(database_file, apogee_id, joker_run):
    """Plot the RV data of one star with its cached Joker orbit samples.

    Two figures are made from the same data and samples, one with
    ``xlim_choice='wide'`` and one with ``xlim_choice='tight'``, and then
    shown with ``plt.show()``.

    Parameters
    ----------
    database_file : str
        File name of the sqlite database, relative to ``TWOFACE_CACHE_PATH``.
    apogee_id : str
        APOGEE ID of the star; also used as the group key in the HDF5 file.
    joker_run : str
        Name of the ``JokerRun`` whose samples should be plotted.

    Raises
    ------
    IOError
        If the sqlite database file does not exist.
    NoResultFound
        If the star has no results in the requested run.
    """
    db_path = path.join(TWOFACE_CACHE_PATH, database_file)
    if not path.exists(db_path):
        raise IOError("sqlite database not found at '{0}'\n Did you run "
                      "scripts/initdb.py yet for that database?"
                      .format(db_path))

    logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
    Session, engine = db_connect(database_path=db_path,
                                 ensure_db_exists=False)
    session = Session()

    # Close the session on every exit path, including failed lookups.
    try:
        # Get The Joker run information
        run = session.query(JokerRun).filter(JokerRun.name == joker_run).one()

        try:
            star = session.query(AllStar).join(StarResult, JokerRun)\
                          .filter(AllStar.apogee_id == apogee_id)\
                          .filter(JokerRun.name == joker_run)\
                          .limit(1)\
                          .one()
        except NoResultFound:
            raise NoResultFound("Star {0} has no results in Joker run {1}."
                                .format(apogee_id, joker_run))

        # get the RV data for this star
        data = star.apogeervdata()

        # load posterior samples from The Joker. The file is opened
        # explicitly read-only: the default mode differs between h5py
        # versions and nothing is written here.
        h5_file = path.join(TWOFACE_CACHE_PATH, '{0}.hdf5'.format(run.name))
        with h5py.File(h5_file, 'r') as f:
            samples = JokerSamples.from_hdf5(f[apogee_id])

        # Plot the data with orbits on top, once per x-axis range choice
        for xlim_choice in ('wide', 'tight'):
            fig = plot_data_orbits(data, samples, jitter=run.jitter,
                                   xlim_choice=xlim_choice,
                                   title=star.apogee_id)
            fig.set_tight_layout(True)

        # TODO: make a pseudo-corner plot (no histograms) of projections of
        # the samples, including derived quantities such as m2 and rperi.
    finally:
        session.close()

    plt.show()
def main(pool):
    """Time rejection sampling for one hard-coded star.

    For each prior-sample count in ``2 ** np.arange(7, 25+1, 2)`` the sampler
    is run four times and the wall-clock time divided by four is printed.
    Failed sampling attempts are logged as warnings and still count toward
    the elapsed time.

    Parameters
    ----------
    pool
        Pool object handed to ``TheJoker``; its ``size`` is printed and it is
        closed when the benchmark finishes.
    """
    db_path = path.join(TWOFACE_CACHE_PATH, 'apogee.sqlite')
    logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
    Session, engine = db_connect(database_path=db_path,
                                 ensure_db_exists=False)
    session = Session()

    # Other targets used while testing:
    #   '2M01320817+0120301' (needs mcmc)
    #   '2M02123870+4942289' (completed)
    target_id = '2M04171719+4724006'  # needs more prior samples
    star = session.query(AllStar)\
                  .filter(AllStar.apogee_id == target_id)\
                  .limit(1)\
                  .one()
    data = star.apogeervdata()

    # Fixed seed so repeated benchmark runs use the same random stream
    random_state = np.random.RandomState(seed=42)
    joker = TheJoker(JokerParams(P_min=8*u.day, P_max=32768*u.day),
                     random_state=random_state, pool=pool)

    cache_file = path.join(TWOFACE_CACHE_PATH,
                           'P8-32768_prior_samples.hdf5')

    print("Pool size: {0}".format(pool.size))

    n_repeats = 4
    prior_sizes = 2 ** np.arange(7, 25+1, 2)
    for n_prior in prior_sizes:
        started = time.time()

        for _ in range(n_repeats):
            try:
                joker.rejection_sample(data=data,
                                       prior_cache_file=cache_file,
                                       n_prior_samples=n_prior)
            except Exception as err:
                logger.warning("\t Failed sampling for star {0} \n Error: {1}"
                               .format(star.apogee_id, str(err)))

        # Mean time per attempt at this prior-sample count
        elapsed = (time.time() - started) / n_repeats
        print("{0}, {1:.3f}".format(n_prior, elapsed))

    pool.close()
    session.close()
def main(db_file, pool, seed, overwrite=False):
    """Run rejection sampling for a single hard-coded star and save it.

    The samples are written to ``<apogee_id>.hdf5`` in
    ``TWOFACE_CACHE_PATH``. A per-star prior cache
    (``<apogee_id>-prior.hdf5``) is generated first if it is missing or if
    ``overwrite`` is set.

    Parameters
    ----------
    db_file : str
        File name of the sqlite database, relative to ``TWOFACE_CACHE_PATH``.
    pool
        Pool object handed to ``TheJoker``; always closed before returning.
    seed : int or None
        Seed for ``np.random.RandomState``.
    overwrite : bool, optional
        Regenerate the prior cache even if the file already exists.

    Raises
    ------
    IOError
        If the sqlite database file does not exist.
    SystemExit
        With exit code 1 if sampling fails.
    """
    session = None

    # One cleanup point for every exit path (early errors, the sys.exit
    # below, and normal completion), so pool.close() runs exactly once.
    try:
        db_path = join(TWOFACE_CACHE_PATH, db_file)
        if not os.path.exists(db_path):
            raise IOError("sqlite database not found at '{0}'\n Did you run "
                          "scripts/initdb.py yet for that database?"
                          .format(db_path))

        logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
        Session, engine = db_connect(database_path=db_path,
                                     ensure_db_exists=False)
        session = Session()

        # TODO: all hard-set, these should be options
        params = JokerParams(P_min=10 * u.day, P_max=1000 * u.day,
                             jitter=(9.5, 1.64), jitter_unit=u.m/u.s)
        n_prior_samples = 2**28
        run_name = 'apogee-jitter'
        apogee_id = '2M01231070+1801407'

        results_filename = join(TWOFACE_CACHE_PATH,
                                '{0}.hdf5'.format(apogee_id))
        prior_samples_file = join(TWOFACE_CACHE_PATH,
                                  '{0}-prior.hdf5'.format(apogee_id))

        # Create TheJoker sampler instance with the specified random seed
        # and pool
        rnd = np.random.RandomState(seed=seed)
        logger.debug("Creating TheJoker instance with {0}, {1}"
                     .format(rnd, pool))
        joker = TheJoker(params, random_state=rnd, pool=pool)

        # Create a cache of prior samples if it doesn't exist, or if the
        # caller asked to overwrite it.
        if not os.path.exists(prior_samples_file) or overwrite:
            logger.debug("Prior samples file not found - generating {0} "
                         "samples...".format(n_prior_samples))
            make_prior_cache(prior_samples_file, joker,
                             nsamples=n_prior_samples)
            logger.debug("...done")

        # Look up the one hard-coded star within the hard-coded run
        star_query = session.query(AllStar)\
                            .join(StarResult, JokerRun, Status)\
                            .filter(JokerRun.name == run_name)\
                            .filter(AllStar.apogee_id == apogee_id)
        star = star_query.limit(1).one()

        logger.log(1, "Starting star {0}".format(star.apogee_id))
        t0 = time.time()

        data = star.apogeervdata()
        logger.log(1, "\t visits loaded ({:.2f} seconds)"
                   .format(time.time()-t0))

        try:
            samples = joker.rejection_sample(
                data=data, prior_cache_file=prior_samples_file,
                return_logprobs=False)
        except Exception as e:
            logger.warning("\t Failed sampling for star {0} \n Error: {1}"
                           .format(star.apogee_id, str(e)))
            sys.exit(1)

        logger.debug("\t done sampling ({:.2f} seconds)"
                     .format(time.time()-t0))

        # Write the samples that pass to the results file
        with h5py.File(results_filename, 'w') as f:
            samples.to_hdf5(f)

        logger.debug("\t saved samples ({:.2f} seconds)"
                     .format(time.time()-t0))
        logger.debug("...done with star {} ({:.2f} seconds)"
                     .format(star.apogee_id, time.time()-t0))
    finally:
        pool.close()
        if session is not None:
            session.close()
def main(config_file, pool, seed, overwrite=False):
    """Continue sampling for stars of a run that are flagged "needs mcmc".

    HACK / work in progress: for the first star that reaches the MCMC
    branch, the sampler is pickled to ``test-mcmc.pickle`` and the process
    exits with code 0. The code that writes samples back to the results
    file is therefore only reachable from the "needs more prior samples"
    branch, which the current star query never selects.

    Parameters
    ----------
    config_file : str
        Path to the YAML run configuration; must define ``database_file``.
    pool
        Pool object handed to ``TheJoker``; always closed before returning.
    seed : int or None
        Seed for ``np.random.RandomState``.
    overwrite : bool, optional
        Currently unused; ``get_run`` is always called with
        ``overwrite=False``.

    Raises
    ------
    IOError
        If the sqlite database or the posterior samples file is missing.
    SystemExit
        With exit code 0 after the first MCMC star (see above).
    """
    import pickle
    import sys

    session = None

    # One cleanup point for all exit paths, including the sys.exit() in the
    # MCMC branch, so pool.close() runs exactly once.
    try:
        config_file = path.abspath(path.expanduser(config_file))

        # parse config file; safe_load avoids constructing arbitrary Python
        # objects and works on PyYAML versions where load() needs a Loader
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)

        # filename of sqlite database
        database_file = config['database_file']

        db_path = path.join(TWOFACE_CACHE_PATH, database_file)
        if not os.path.exists(db_path):
            raise IOError(
                "sqlite database not found at '{0}'\n Did you run "
                "scripts/initdb.py yet for that database?".format(db_path))

        logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
        Session, engine = db_connect(database_path=db_path,
                                     ensure_db_exists=False)
        session = Session()

        run = get_run(config, session, overwrite=False)

        # The file with cached posterior samples:
        results_filename = path.join(TWOFACE_CACHE_PATH,
                                     "{0}.hdf5".format(run.name))
        if not path.exists(results_filename):
            raise IOError(
                "Posterior samples result file {0} doesn't exist! Are "
                "you sure you ran `run_apogee.py`?".format(results_filename))

        # Create TheJoker sampler instance with the specified random seed
        # and pool
        rnd = np.random.RandomState(seed=seed)
        logger.debug("Creating TheJoker instance with {0}, {1}"
                     .format(rnd, pool))
        params = run.get_joker_params()
        joker = TheJoker(params, random_state=rnd, pool=pool)

        # Path of the second, larger prior cache used by the "needs more
        # prior samples" branch.
        # TODO: we should make sure this cache exists (and create it with
        # make_prior_cache if not), but only "needs mcmc" stars are handled
        # right now, so it is not generated here.
        _path, ext = path.splitext(run.prior_samples_file)
        new_path = '{0}_moar{1}'.format(_path, ext)

        # TODO HACK: this query only selects "needs mcmc" stars (status 2),
        # not "needs more prior samples" stars (status 1)!
        star_query = session.query(AllStar)\
                            .join(StarResult, JokerRun, Status)\
                            .filter(JokerRun.name == run.name)\
                            .filter(Status.id == 2)

        # Base query to get a StarResult for a given Star so we can update
        # the status, etc.
        result_query = session.query(StarResult).join(AllStar, JokerRun)\
                              .filter(JokerRun.name == run.name)

        n_stars = star_query.count()
        logger.info(
            "{0} stars left to process for run more samples '{1}'".format(
                n_stars, run.name))

        # ----------------------------------------------------------------
        # Here is where we do the actual processing of the data for each
        # star. We loop through all stars that still need processing and
        # continue with rejection sampling.
        count = 0  # how many stars we've processed in this star batch
        batch_size = 16  # MAGIC NUMBER: stars to process before committing
        for star in star_query.all():
            star_results = result_query.filter(
                AllStar.apogee_id == star.apogee_id)

            if star_results.count() < 1:
                logger.debug('Star {0} has no result object!'.format(
                    star.apogee_id))
                continue

            # Retrieve existing StarResult from database. We limit(1)
            # because the APOGEE_ID isn't unique, but we attach all visits
            # for a given star to all rows, so grabbing one of them is fine.
            result = star_results.limit(1).one()

            logger.log(1, "Starting star {0}".format(star.apogee_id))
            logger.log(1, "Current status: {0}".format(str(result.status)))
            t0 = time.time()

            data = star.apogeervdata()
            logger.log(
                1,
                "\t visits loaded ({:.2f} seconds)".format(time.time() - t0))

            if result.status.id == 1:  # needs more prior samples
                try:
                    samples, ln_prior = joker.iterative_rejection_sample(
                        data=data,
                        n_requested_samples=run.requested_samples_per_star,
                        # HACK: uses the larger cache instead of
                        # run.prior_samples_file
                        prior_cache_file=new_path,
                        return_logprobs=True)

                except Exception as e:
                    logger.warning(
                        "\t Failed sampling for star {0} \n Error: {1}"
                        .format(star.apogee_id, str(e)))
                    continue

                logger.debug(
                    "\t done sampling ({:.2f} seconds)".format(
                        time.time() - t0))

            elif result.status.id == 2:  # needs mcmc
                logger.debug('Firing up MCMC:')

                with h5py.File(results_filename, 'r') as f:
                    samples0 = JokerSamples.from_hdf5(f[star.apogee_id])

                n_walkers = 2 * run.requested_samples_per_star
                model, samples, sampler = joker.mcmc_sample(
                    data, samples0, n_burn=1024, n_steps=65536,
                    n_walkers=n_walkers, return_sampler=True)

                # HACK: dump the sampler for offline inspection and stop
                # after the first MCMC star. The pool reference is dropped
                # from the sampler before it is pickled.
                sampler.pool = None
                with open('test-mcmc.pickle', 'wb') as f:
                    pickle.dump(sampler, f)

                sys.exit(0)

            else:
                # Any other status has no fresh samples; without this guard
                # the code below would reuse values from a previous star.
                continue

            # For now, it's sufficient to write the run results to an HDF5
            # file
            n = run.requested_samples_per_star
            all_ln_probs = ln_prior[:n]
            samples = samples[:n]

            # Write the samples that pass to the results file
            with h5py.File(results_filename, 'r+') as f:
                if star.apogee_id in f:
                    del f[star.apogee_id]

                g = f.create_group(star.apogee_id)
                samples.to_hdf5(g)
                g.create_dataset('ln_prior_probs', data=all_ln_probs)

            logger.debug(
                "\t saved samples ({:.2f} seconds)".format(time.time() - t0))

            # NOTE(review): the original passed `run.n_requested_samples`;
            # every other use in this file is `requested_samples_per_star`,
            # so the same value (`n`) is passed here - confirm against the
            # JokerRun model.
            result.status_id = get_status_id(samples, data, n)

            logger.debug("...done with star {} ({:.2f} seconds)".format(
                star.apogee_id, time.time() - t0))

            if count % batch_size == 0 and count > 0:
                session.commit()

            count += 1

        session.commit()
    finally:
        pool.close()
        if session is not None:
            session.close()
from os import path

import astropy.units as u
import h5py
import numpy as np
from thejoker import JokerSamples

from twoface import unimodal_P
from twoface.config import TWOFACE_CACHE_PATH
from twoface.db import (db_connect, AllStar, AllVisit, AllVisitToAllStar,
                        RedClump, StarResult, Status, JokerRun, initialize_db)

# One-off migration of the 'apogee-jitter' samples file: for every group,
# copy the 'ecc' dataset to 'e' and the 'phi0' dataset (with its attributes)
# to 'M0'. The original datasets are left in place.
samples_file = path.join(TWOFACE_CACHE_PATH, 'apogee-jitter.hdf5')

Session, _ = db_connect(path.join(TWOFACE_CACHE_PATH, 'apogee.sqlite'))
session = Session()

# The file is modified in place, so open it read/write explicitly: the
# default mode of h5py.File differs between h5py versions. 'r+' also fails
# loudly if the file is missing instead of creating an empty one.
with h5py.File(samples_file, 'r+') as f:
    for key in f:
        g = f[key]

        if 'ecc' in g:
            try:
                g['e'] = g['ecc'][:]
            except RuntimeError:
                # Report which group failed before re-raising
                print(key)
                raise

        # NOTE(review): the source layout does not show whether this copy
        # belongs inside the `if 'ecc' in g` guard above; it is kept at loop
        # level here - confirm against the original script.
        g['M0'] = g['phi0'][:]
        for k, v in dict(g['phi0'].attrs).items():
            g['M0'].attrs[k] = v
def main(config_file, pool, seed, overwrite=False):
    """Run The Joker on a control sample with no orbital RV variations.

    ``NCONTROL`` APOGEE IDs are drawn at random from stars that have a
    result with status id > 0. For each one, the measured RVs are replaced
    by Gaussian draws around the mean RV, and the sampler is run on this
    simulated data. Samples are cached in ``<run name>-control.hdf5``; IDs
    already present in that file are skipped.

    Parameters
    ----------
    config_file : str
        Path to the YAML run configuration; must define ``database_file``.
    pool
        Pool object handed to ``TheJoker``; always closed before returning.
    seed : int or None
        Seed for ``np.random.RandomState``; ``None`` is replaced by 42.
    overwrite : bool, optional
        Currently unused; ``get_run`` is always called with
        ``overwrite=False``.

    Raises
    ------
    ValueError
        If the config file does not define ``database_file``.
    IOError
        If the sqlite database or the prior samples cache does not exist.
    """
    # Default seed:
    if seed is None:
        seed = 42

    session = None

    # One cleanup point so the pool and session are released on every path.
    try:
        config_file = abspath(expanduser(config_file))

        # parse config file; safe_load avoids constructing arbitrary Python
        # objects and works on PyYAML versions where load() needs a Loader
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        config['config_file'] = config_file

        # filename of sqlite database. Without it the path join below would
        # fail with an unhelpful TypeError, so fail early and clearly.
        database_file = config.get('database_file')
        if database_file is None:
            raise ValueError("Config file '{0}' does not specify a "
                             "'database_file'.".format(config_file))

        db_path = join(TWOFACE_CACHE_PATH, database_file)
        if not os.path.exists(db_path):
            raise IOError(
                "sqlite database not found at '{0}'\n Did you run "
                "scripts/initdb.py yet for that database?".format(db_path))

        logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
        Session, engine = db_connect(database_path=db_path,
                                     ensure_db_exists=False)
        session = Session()

        # Retrieve or create a JokerRun instance
        run = get_run(config, session, overwrite=False)  # never overwrite
        params = run.get_joker_params()

        # Create TheJoker sampler instance with the specified random seed
        # and pool
        rnd = np.random.RandomState(seed=seed)
        logger.debug("Creating TheJoker instance with {0}, {1}"
                     .format(rnd, pool))
        joker = TheJoker(params, random_state=rnd, pool=pool)

        # Create a file to cache the resulting posterior samples
        results_filename = join(TWOFACE_CACHE_PATH,
                                "{0}-control.hdf5".format(run.name))

        # Ensure that the results file exists - this is where we cache
        # samples that pass the rejection sampling step
        if not os.path.exists(results_filename):
            with h5py.File(results_filename, 'w') as f:
                pass

        with h5py.File(results_filename, 'r') as f:
            done_apogee_ids = list(f.keys())

        # The prior cache is not generated here; it must already exist.
        if not os.path.exists(run.prior_samples_file):
            raise IOError("Prior cache must already exist.")

        # Get random IDs.
        # NOTE(review): the draw happens before the already-done IDs are
        # removed, so a restart only picks the same control sample if the
        # seed and the order of the query results are unchanged - confirm
        # the query order is stable.
        star_ids = session.query(AllStar.apogee_id)\
                          .join(StarResult, JokerRun, Status)\
                          .filter(Status.id > 0).distinct().all()
        star_ids = np.array([x[0] for x in star_ids])
        star_ids = rnd.choice(star_ids, size=NCONTROL, replace=False)
        star_ids = star_ids[~np.isin(star_ids, done_apogee_ids)]

        n_stars = len(star_ids)
        logger.info(
            "{0} stars left to process for run '{1}'; {2} already done."
            .format(n_stars, run.name, len(done_apogee_ids)))

        # ----------------------------------------------------------------
        # Here is where we do the actual processing of the data for each
        # star. We loop through all stars that still need processing and
        # iteratively rejection sample with larger and larger prior sample
        # batch sizes. We do this for efficiency, but the argument for this
        # is somewhat made up...
        for apid in star_ids:
            star = AllStar.get_apogee_id(session, apid)

            logger.log(1, "Starting star {0}".format(star.apogee_id))
            t0 = time.time()

            orig_data = star.apogeervdata()

            # HACK: this assumes we're sampling over the excess variance
            # parameter.
            # Generate new data with no RV orbital variations: draw
            # y = ln(s^2) from a normal defined by the jitter parameters,
            # add s in quadrature to the visit uncertainties, and draw new
            # RVs around the mean measured RV.
            y = rnd.normal(params.jitter[0], params.jitter[1])
            s = np.exp(0.5 * y) * params._jitter_unit
            std = np.sqrt(s**2 + orig_data.stddev**2)\
                .to(orig_data.rv.unit).value
            new_rv = rnd.normal(np.mean(orig_data.rv).value, std)
            data = APOGEERVData(t=orig_data.t,
                                rv=new_rv * orig_data.rv.unit,
                                stddev=orig_data.stddev)

            logger.log(
                1,
                "\t visits loaded ({:.2f} seconds)".format(time.time() - t0))

            try:
                samples, ln_prior = joker.iterative_rejection_sample(
                    data=data,
                    n_requested_samples=run.requested_samples_per_star,
                    prior_cache_file=run.prior_samples_file,
                    n_prior_samples=run.max_prior_samples,
                    return_logprobs=True)

            except Exception as e:
                logger.warning(
                    "\t Failed sampling for star {0} \n Error: {1}".format(
                        star.apogee_id, str(e)))
                continue

            logger.debug(
                "\t done sampling ({:.2f} seconds)".format(time.time() - t0))

            # For now, it's sufficient to write the run results to an HDF5
            # file
            n = run.requested_samples_per_star
            samples = samples[:n]

            # Write the samples that pass to the results file
            with h5py.File(results_filename, 'r+') as f:
                if star.apogee_id in f:
                    del f[star.apogee_id]

                # HACK: this will overwrite the past samples!
                g = f.create_group(star.apogee_id)
                samples.to_hdf5(g)

            logger.debug(
                "\t saved samples ({:.2f} seconds)".format(time.time() - t0))
    finally:
        pool.close()
        if session is not None:
            session.close()
def main(config_file, pool, seed, overwrite=False, _continue=False):
    """Run The Joker on every star of a run that still needs processing.

    For each star with status id 0, posterior samples are generated with
    iterative rejection sampling, written to ``<run name>.hdf5`` in
    ``TWOFACE_CACHE_PATH``, and the star's result status is updated:

    * 4 (completed) - at least the requested number of samples returned
    * 2 (needs mcmc) - one sample, or several that look unimodal
    * 1 (needs more samples) - several samples, but fewer than requested

    Parameters
    ----------
    config_file : str
        Path to the YAML run configuration; must define ``database_file``
        and ``prior.num_cache``.
    pool
        Pool object handed to ``TheJoker``; always closed before returning.
    seed : int or None
        Seed for ``np.random.RandomState``.
    overwrite : bool, optional
        Passed to ``get_run``; also forces the prior cache to be
        regenerated.
    _continue : bool, optional
        Currently unused.

    Raises
    ------
    ValueError
        If the config file does not define ``database_file``.
    IOError
        If the sqlite database file does not exist.
    """
    session = None

    # One cleanup point so the pool and session are released on every path.
    try:
        config_file = abspath(expanduser(config_file))

        # parse config file; safe_load avoids constructing arbitrary Python
        # objects and works on PyYAML versions where load() needs a Loader
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        config['config_file'] = config_file

        # filename of sqlite database. Without it the path join below would
        # fail with an unhelpful TypeError, so fail early and clearly.
        database_file = config.get('database_file')
        if database_file is None:
            raise ValueError("Config file '{0}' does not specify a "
                             "'database_file'.".format(config_file))

        db_path = join(TWOFACE_CACHE_PATH, database_file)
        if not os.path.exists(db_path):
            raise IOError(
                "sqlite database not found at '{0}'\n Did you run "
                "scripts/initdb.py yet for that database?".format(db_path))

        logger.debug("Connecting to sqlite database at '{0}'".format(db_path))
        Session, engine = db_connect(database_path=db_path,
                                     ensure_db_exists=False)
        session = Session()

        # Retrieve or create a JokerRun instance
        run = get_run(config, session, overwrite=overwrite)
        params = run.get_joker_params()

        # Create TheJoker sampler instance with the specified random seed
        # and pool
        rnd = np.random.RandomState(seed=seed)
        logger.debug("Creating TheJoker instance with {0}, {1}"
                     .format(rnd, pool))
        joker = TheJoker(params, random_state=rnd, pool=pool)

        # Create a cache of prior samples if it doesn't exist, or if the
        # caller asked to overwrite it.
        if not os.path.exists(run.prior_samples_file) or overwrite:
            logger.debug(
                "Prior samples file not found - generating {0} samples..."
                .format(config['prior']['num_cache']))
            make_prior_cache(run.prior_samples_file, joker,
                             nsamples=config['prior']['num_cache'])
            logger.debug("...done")

        # Get done APOGEE ID's.
        # NOTE(review): not restricted to this run, so a star with
        # status > 0 in any run is treated as done - confirm intent.
        done_subq = session.query(AllStar.apogee_id)\
                           .join(StarResult, JokerRun, Status)\
                           .filter(Status.id > 0).distinct()

        # Query to get all stars associated with this run that need
        # processing: they should have a status id = 0 (needs processing)
        star_query = session.query(AllStar)\
                            .join(StarResult, JokerRun, Status)\
                            .filter(JokerRun.name == run.name)\
                            .filter(Status.id == 0)\
                            .filter(~AllStar.apogee_id.in_(done_subq))

        # Base query to get a StarResult for a given Star so we can update
        # the status, etc.
        # NOTE(review): Status is filtered on here but, unlike in
        # star_query, is not part of the join - confirm that the status
        # filter actually constrains the StarResult rows.
        result_query = session.query(StarResult).join(AllStar, JokerRun)\
                              .filter(JokerRun.name == run.name)\
                              .filter(Status.id == 0)\
                              .filter(~AllStar.apogee_id.in_(done_subq))

        # Create a file to cache the resulting posterior samples
        results_filename = join(TWOFACE_CACHE_PATH,
                                "{0}.hdf5".format(run.name))
        n_stars = star_query.count()
        logger.info("{0} stars left to process for run '{1}'".format(
            n_stars, run.name))

        # Ensure that the results file exists - this is where we cache
        # samples that pass the rejection sampling step
        if not os.path.exists(results_filename):
            with h5py.File(results_filename, 'w') as f:
                pass

        # ----------------------------------------------------------------
        # Here is where we do the actual processing of the data for each
        # star. We loop through all stars that still need processing and
        # iteratively rejection sample with larger and larger prior sample
        # batch sizes. We do this for efficiency, but the argument for this
        # is somewhat made up...
        count = 0  # how many stars we've processed in this star batch
        batch_size = 16  # MAGIC NUMBER: stars to process before committing
        for star in star_query.all():
            star_results = result_query.filter(
                AllStar.apogee_id == star.apogee_id)

            if star_results.count() < 1:
                logger.debug('Star {0} has no result object!'.format(
                    star.apogee_id))
                continue

            # Retrieve existing StarResult from database. We limit(1)
            # because the APOGEE_ID isn't unique, but we attach all visits
            # for a given star to all rows, so grabbing one of them is fine.
            result = star_results.limit(1).one()

            logger.log(1, "Starting star {0}".format(star.apogee_id))
            logger.log(1, "Current status: {0}".format(str(result.status)))
            t0 = time.time()

            data = star.apogeervdata()
            logger.log(
                1,
                "\t visits loaded ({:.2f} seconds)".format(time.time() - t0))

            try:
                samples, ln_prior = joker.iterative_rejection_sample(
                    data=data,
                    n_requested_samples=run.requested_samples_per_star,
                    prior_cache_file=run.prior_samples_file,
                    n_prior_samples=run.max_prior_samples,
                    return_logprobs=True)

            except Exception as e:
                logger.warning(
                    "\t Failed sampling for star {0} \n Error: {1}".format(
                        star.apogee_id, str(e)))
                continue

            logger.debug(
                "\t done sampling ({:.2f} seconds)".format(time.time() - t0))

            # For now, it's sufficient to write the run results to an HDF5
            # file
            n = run.requested_samples_per_star
            all_ln_probs = ln_prior[:n]
            samples = samples[:n]
            n_actual_samples = len(all_ln_probs)

            # Write the samples that pass to the results file
            with h5py.File(results_filename, 'r+') as f:
                if star.apogee_id in f:
                    del f[star.apogee_id]

                # HACK: this will overwrite the past samples!
                g = f.create_group(star.apogee_id)
                samples.to_hdf5(g)

                if 'ln_prior_probs' in g:
                    del g['ln_prior_probs']
                g.create_dataset('ln_prior_probs', data=all_ln_probs)

            logger.debug(
                "\t saved samples ({:.2f} seconds)".format(time.time() - t0))

            if n_actual_samples >= run.requested_samples_per_star:
                result.status_id = 4  # completed

            elif n_actual_samples == 1:
                # Only one sample was returned - this is probably unimodal,
                # so this star needs MCMC
                result.status_id = 2  # needs mcmc

            elif unimodal_P(samples, data):
                # Multiple samples were returned, but they look unimodal
                result.status_id = 2  # needs mcmc

            else:
                # Multiple samples were returned, but not enough to satisfy
                # the number requested in the config file
                result.status_id = 1  # needs more samples

            logger.debug("...done with star {} ({:.2f} seconds)".format(
                star.apogee_id, time.time() - t0))

            if count % batch_size == 0 and count > 0:
                session.commit()

            count += 1

        session.commit()
    finally:
        pool.close()
        if session is not None:
            session.close()