def Regrid_PS(PS1, Corners):
    dim1, dim2 = Corners.shape[1:]
    dim1 -= 1
    dim2 -= 1

    global px, py
    px, py = np.where(PS1)

    global squares
    squares = np.array(Make_squares(Corners))
    square_num = np.arange(0, len(squares))

    points = np.zeros((len(px), 2))
    points[:, 0] = px
    points[:, 1] = py

    global pspixels
    pspixels = Footprint_square(Corners, points)

    global psimage
    psimage = PS1.copy()

    pool = MultiPool()
    values = list(pool.map(Pix_sum, square_num))
    pool.close()

    PS_scene = np.array(values)
    PS_scene = np.nansum(PS_scene, axis=0)
    PS_scene = PS_scene.astype('float')
    PS_scene = PS_scene.reshape(dim1, dim2)
    return PS_scene
def test_marginal_ln_likelihood(tmpdir, case):
    prior, _ = get_prior(case)
    data, _ = make_data()
    prior_samples = prior.sample(size=100)
    joker = TheJoker(prior)

    # pass JokerSamples instance
    ll = joker.marginal_ln_likelihood(data, prior_samples)
    assert len(ll) == len(prior_samples)

    # save prior samples to a file and pass that instead
    filename = str(tmpdir / 'samples.hdf5')
    prior_samples.write(filename, overwrite=True)
    ll = joker.marginal_ln_likelihood(data, filename)
    assert len(ll) == len(prior_samples)

    # make sure batches work:
    ll = joker.marginal_ln_likelihood(data, filename, n_batches=10)
    assert len(ll) == len(prior_samples)

    # NOTE: this makes it so I can't parallelize tests, I think
    with MultiPool(processes=2) as pool:
        joker = TheJoker(prior, pool=pool)
        ll = joker.marginal_ln_likelihood(data, filename)
    assert len(ll) == len(prior_samples)
def sample(self, n_samples):
    if self.pool is None or _GPU_ENABLED:
        pool = SerialPool()
    else:
        if isinstance(self.pool, int):
            pool = MultiPool(self.pool)
        elif isinstance(self.pool, (SerialPool, MultiPool)):
            pool = self.pool
        else:
            raise TypeError(
                "Does not understand the given multiprocessing pool.")

    drawn_samples = list(
        pool.map(self.draw_one_joint_posterior_sample_map, range(n_samples)))
    pool.close()

    drawn_zs = [drawn_samples[i][0] for i in range(n_samples)]
    drawn_inference_posteriors = [
        drawn_samples[i][1] for i in range(n_samples)
    ]

    drawn_joint_posterior_samples = pd.DataFrame(drawn_inference_posteriors)
    drawn_joint_posterior_samples["redshift"] = drawn_zs

    return drawn_joint_posterior_samples
def pool(request):
    multimode = 'None'
    # multimode = 'Serial'
    # multimode = 'Multi'
    # multimode = 'MPI'

    # setup code
    pool = None
    if multimode == 'Serial':
        from schwimmbad import SerialPool
        pool = SerialPool()
    elif multimode == 'Multi':
        from schwimmbad import MultiPool
        pool = MultiPool()
    elif multimode == 'MPI':
        from schwimmbad import MPIPool
        pool = MPIPool()
        if not pool.is_master():
            pool.wait()
            import sys
            sys.exit(0)

    # inject class variables
    request.cls.pool = pool

    yield

    # tear down
    if multimode == 'Multi' or multimode == 'MPI':
        pool.close()
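# Usage sketch for the fixture above (assumptions: it carries a @pytest.fixture
# decorator and lives in conftest.py, so `request.cls` injection works when a
# test class requests it). SerialPool, MultiPool and MPIPool expose the same
# map() interface, so the test body does not care which mode is configured.
import pytest


def _square(x):  # hypothetical worker; must be module-level so MultiPool can pickle it
    return x * x


@pytest.mark.usefixtures("pool")
class TestPoolFixture:
    def test_map_squares(self):
        if self.pool is None:  # multimode == 'None' leaves the fixture empty
            pytest.skip("no pool configured")
        assert list(self.pool.map(_square, range(5))) == [0, 1, 4, 9, 16]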
def generate_joint_posterior_samples_from_marginalized_likelihood(
        joint_result,
        single_trigger_likelihoods,
        sep_char="^",
        ncores=1,
):
    common_parameters = [
        p for p in list(joint_result.posterior.columns) if sep_char not in p
    ]
    independent_parameters = [
        p for p in list(joint_result.posterior.columns) if sep_char in p
    ]

    logger = logging.getLogger(__prog__)
    logger.info(
        "Using {} CPU core(s) for generating joint posterior samples from marginalized likelihood"
        .format(ncores))

    import tqdm
    with MultiPool(ncores) as pool:
        output_samples = pool.starmap(
            generate_joint_posterior_sample_from_marginalized_likelihood,
            tqdm.tqdm([[
                row, single_trigger_likelihoods, common_parameters,
                independent_parameters, sep_char
            ] for _, row in joint_result.posterior.iterrows()]))

    # Edit data frame
    joint_result.posterior = pd.DataFrame(output_samples)
def generate_atlas_in_parallel_chunking(zval,
                                        chunksize,
                                        nchunks,
                                        fname='temp_parallel_atlas',
                                        filter_list='filter_list_goodss.dat',
                                        filt_dir='internal',
                                        priors=[],
                                        z_bw=0.05,
                                        pg_folder='parallel_atlases/'):
    N_pregrid = chunksize
    atlas_vals = [fname, zval, priors, pg_folder, filter_list, filt_dir, N_pregrid]

    # one work item per chunk (the original snippet referenced an undefined
    # `data`; built here from nchunks, mirroring the sibling chunking routine)
    data = np.arange(nchunks)

    time_start = time.time()
    try:
        with MultiPool() as pool:
            values = list(pool.map(partial(gen_pg_parallel, atlas_vals), data))
    finally:
        print('Generated pregrid (%.0f chunks, %.0f sedsperchunk) at zval %.2f' %
              (nchunks, chunksize, zval))
        print('time taken: %.2f mins.' % ((time.time() - time_start) / 60))

    # need to add code here to then concatenate chunks into a single file and
    # delete the individual ones
    return
def main(data_path: Path, model_path: Path, mask_path: Path, log_level: int):
    logging.basicConfig(
        stream=sys.stdout,
        level=log_level,
        datefmt='%Y-%m-%d %H:%M',
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    with open(data_path) as f:
        data_cfg = yaml.load(f, Loader=yaml.FullLoader)

    nside = data_cfg['nside']
    npix = hp.nside2npix(nside)
    nmc = data_cfg['monte_carlo']
    amplitude_output_shape = (nmc, 2, npix)
    parameter_output_shape = (nmc, npix)
    frequencies = np.array(data_cfg['frequencies'])

    with h5py.File(data_cfg['hdf5_path'], 'r') as f:
        data = f['maps/monte_carlo/data'][...]
        cov = f['maps/monte_carlo/cov'][...]

    masking = lynx.Masking(mask_path)
    fitting_masks = list(masking.get_fitting_indices())

    tasks = get_tasks(data, cov, nmc, frequencies, fitting_masks, model_path)

    with MultiPool() as pool:
        results = pool.map(do_fitting, tasks)

    with h5py.File(data_cfg['hdf5_path'], 'a') as f:
        for result in results:
            save_data(f, amplitude_output_shape, parameter_output_shape, result)
def generate_snrs(result, likelihood, ncores=1):
    logger = logging.getLogger(__prog__)
    logger.info("Using {} CPU core(s) for computing SNRs".format(ncores))

    import tqdm
    with MultiPool(ncores) as pool:
        output_samples = pool.starmap(
            generate_snrs_per_sample,
            tqdm.tqdm([[row.to_dict(), likelihood]
                       for _, row in result.posterior.iterrows()]))

    result.posterior = pd.DataFrame(output_samples)
    return result
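# Minimal sketch of the starmap-plus-tqdm pattern used above, with a toy worker
# standing in for generate_snrs_per_sample (defined elsewhere). schwimmbad's
# MultiPool wraps multiprocessing.Pool, so starmap() is available; workers must
# be picklable, module-level functions.
import tqdm
from schwimmbad import MultiPool


def _toy_snr(sample, scale):  # hypothetical stand-in worker
    return {key: value * scale for key, value in sample.items()}


def demo_starmap(samples, scale=2.0, ncores=2):
    # samples: a list of dicts, e.g. [{'a': 1.0}, {'a': 2.0}]
    with MultiPool(ncores) as pool:
        return pool.starmap(_toy_snr,
                            tqdm.tqdm([[s, scale] for s in samples]))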
def compute_mean_selection_function(selection_function, N_avg, pool=None):
    if pool is None:
        pool = SerialPool()
    elif isinstance(pool, int):
        pool = MultiPool(pool)
    elif isinstance(pool, (SerialPool, MultiPool)):
        pool = pool
    else:
        raise TypeError("Does not understand the given multiprocessing pool.")

    out = list(
        pool.starmap(selection_function.evaluate, [() for _ in range(N_avg)]))
    avg = np.average(out)
    pool.close()

    return avg
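# Usage sketch for the pool handling above: the only requirement on
# `selection_function` is a no-argument evaluate() method, so a toy class is
# enough to exercise the integer and pool-instance branches (both end up on
# schwimmbad's MultiPool; the serial branch additionally assumes SerialPool
# exposes starmap, which is not demonstrated here).
class _ToySelectionFunction:  # hypothetical stand-in object
    def evaluate(self):
        return 0.5


def demo_mean_selection_function():
    toy = _ToySelectionFunction()
    avg_from_int = compute_mean_selection_function(toy, N_avg=10, pool=2)
    with MultiPool(2) as pool:
        avg_from_pool = compute_mean_selection_function(toy, N_avg=10, pool=pool)
    return avg_from_int, avg_from_pool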
def test_de_dt_integrate(self):
    n_values = 10
    m_1 = np.random.uniform(0, 10, n_values) * u.Msun
    m_2 = np.random.uniform(0, 10, n_values) * u.Msun
    f_orb = 10**(np.random.uniform(-5, -1, n_values)) * u.Hz
    ecc = np.random.uniform(0.0, 0.9, n_values)

    beta, a_i = evol.check_mass_freq_input(m_1=m_1, m_2=m_2, f_orb_i=f_orb)

    n_step = 100
    c_0 = utils.c_0(a_i=a_i, ecc_i=ecc)
    timesteps = evol.create_timesteps_array(a_i=a_i,
                                            beta=beta,
                                            ecc_i=ecc,
                                            t_evol=1 * u.yr,
                                            n_step=n_step)

    t_merge = evol.get_t_merge_ecc(ecc_i=ecc, f_orb_i=f_orb, m_1=m_1, m_2=m_2)

    # remove any bad timesteps that would evolve past the merger
    bad_timesteps = timesteps >= t_merge[:, np.newaxis]
    timesteps[bad_timesteps] = -1 * u.Gyr
    previous = timesteps.max(axis=1).repeat(timesteps.shape[1])
    timesteps[bad_timesteps] = previous.reshape(timesteps.shape)[bad_timesteps]

    # get rid of the units for faster integration
    c_0 = c_0.to(u.m).value
    beta = beta.to(u.m**4 / u.s).value
    timesteps = timesteps.to(u.s).value

    # integrate by hand:
    ecc_evol = np.array([
        odeint(evol.de_dt, ecc[i], timesteps[i],
               args=(beta[i], c_0[i])).flatten() for i in range(len(ecc))
    ])

    # integrate with function:
    with MultiPool(processes=1) as pool:
        ecc_pool = np.array(
            list(pool.map(evol.integrate_de_dt,
                          zip(ecc, timesteps, beta, c_0))))

    self.assertTrue(np.allclose(ecc_evol, ecc_pool, equal_nan=True))
def calm2l_mcmc(infile, alfvar, ncpu, outname):
    """
    - for test purpose
    """
    ALFPY_HOME = os.environ['ALFPY_HOME']
    samples = np.array(
        pd.read_csv(infile, delim_whitespace=True, header=None,
                    comment='#'))[:, 1:47]

    tstart = time.time()
    with MultiPool() as pool:
        pwork = partial(worker_m2l, alfvar, key_list)
        ml_res = pool.map(pwork, samples)
        ndur = time.time() - tstart
        pool.close()

    print('\ncalculating m2l in .mcmc file: {:.2f}minutes'.format(ndur / 60.))
    np.savez('{0}results/{1}_mcmcm2l_b.npz'.format(ALFPY_HOME, outname),
             m2l=ml_res)
    return np.array(ml_res)
def run_generation(env, parameters):
    num_agents = parameters.shape[0]
    rewards = np.zeros(num_agents)
    '''
    for a in range(num_agents):
        weights = parameters[a,:]
        rewards[a] = run_episode(env, weights)
    '''
    # replace loop with map operation for parallel processing
    with MultiPool() as pool:
        rewards = np.array(pool.map(run_episode, parameters))

    best_reward = np.max(rewards)
    mean_reward = np.mean(rewards)
    ranking = rewards.argsort()

    return parameters[ranking[-5:], :], best_reward, mean_reward, parameters[
        ranking[-1], :]
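# Generic sketch of the loop-to-map swap above (not tied to run_episode): the
# worker passed to pool.map must be a picklable, module-level callable taking a
# single argument, and iterating a 2D parameter array hands one row to each
# worker call.
from schwimmbad import MultiPool
import numpy as np


def _toy_episode(weights):  # hypothetical single-argument worker
    return float(np.sum(weights ** 2))


def demo_generation(num_agents=8, num_weights=4):
    parameters = np.random.randn(num_agents, num_weights)
    with MultiPool() as pool:
        rewards = np.array(pool.map(_toy_episode, parameters))
    return rewards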
def generate_atlas_in_parallel_zgrid(zgrid, atlas_params, dynamic_decouple=True):
    """
    Make a set of atlases given a redshift grid and a list of parameters
    (including a priors object).

    Atlas Params: [N_pregrid, priors, fname, store, path, filter_list, filt_dir, z_bw]
    """
    time_start = time.time()
    try:
        with MultiPool() as pool:
            values = list(
                pool.map(partial(make_atlas_parallel, atlas_params=atlas_params),
                         zgrid))
    finally:
        time_end = time.time()
        print('time taken [parallel]: %.2f min.' % ((time_end - time_start) / 60))
def generate_atlas_in_parallel_chunking(chunksize,
                                        nchunks,
                                        fname='temp_parallel_atlas',
                                        filter_list='filter_list_goodss.dat',
                                        filt_dir='internal',
                                        priors=[],
                                        pg_folder='pregrids/'):
    """
    Generate chunks of an atlas in parallel and combine them into one big atlas
    """
    chunk_path = pg_folder + 'atlaschunks/'
    store_path = pg_folder

    atlas_vals = [fname, priors, chunk_path, filter_list, filt_dir, chunksize]

    time_start = time.time()
    data = np.arange(nchunks)
    try:
        with MultiPool() as pool:
            values = list(
                pool.map(partial(gen_pg_parallel, atlas_vals=atlas_vals), data))
    finally:
        print('Generated pregrid (%.0f chunks, %.0f sedsperchunk)' %
              (nchunks, chunksize))
        print('time taken: %.2f mins.' % ((time.time() - time_start) / 60))

    combine_pregrid_chunks(fname_base=fname,
                           N_chunks=nchunks,
                           N_pregrid=chunksize,
                           N_param=priors.Nparam,
                           path=chunk_path,
                           store_path=store_path)
    return
def simulate(self, bandpasses, norm=None, seed=13, Ncpus=None):
    self.obs = np.array([])
    np.random.seed(seed)

    tstep = 1
    tmin = self.model.mintime()
    tmax = self.model.maxtime()
    time = np.arange(tmin, tmax + tstep, tstep)

    wstep = 10
    wmin = self.model.minwave()
    wmax = self.model.maxwave()
    wavelen = np.arange(wmin, wmax + wstep, wstep)

    fluxes = self.model.flux(time, wavelen)
    norm = np.max(fluxes) if norm is None else norm
    fluxes = fluxes.T * norm / np.max(fluxes)

    lc = LightCurve(time, wavelen, fluxes)

    redshifts = list(
        sncosmo.zdist(self.zmin, self.zmax, time=self.duration, area=self.area))

    tasks = list(
        zip(redshifts, [self] * len(redshifts), [bandpasses] * len(redshifts),
            [self.bias] * len(redshifts), [lc] * len(redshifts),
            np.random.randint(2**32 - 1, size=len(redshifts))))

    with MultiPool(processes=Ncpus) as pool:
        observations = np.array(list(pool.map(survey_worker, tasks)))

    self.obs = observations
def main(): # how many files should we split the test output into nfiles = 10 wfd_thresh = 1000000 # get the options for the code and set the data_release globally (ugly) to # allow MultiPool to work kwargs = plasticc.get_data.parse_getdata_options() global data_release data_release = kwargs.pop('data_release') getter = plasticc.get_data.GetData(data_release) # setup paths for output base_dir = os.path.join(WORK_DIR, 'csv_dump') dump_dir = os.path.join(base_dir, data_release) if not os.path.exists(dump_dir): os.makedirs(dump_dir) # we can use model as a dummy string to indicate if we are generating # training or test data dummy = kwargs.pop('model') offset = kwargs.pop('offset') limit = kwargs.pop('limit') # setup root filenames for output - these get changed by fixpath if dummy == 'training': outfile = os.path.join(dump_dir, 'plasticc_training_set.csv') offset = None else: if limit is None: outfile = os.path.join(dump_dir, 'plasticc_test_set.csv') else: if offset is None: offset = 0 outfile = os.path.join(dump_dir, 'plasticc_test_n{}_set.csv'.format(offset)) # if we're limiting the output, then just dump one file nfiles = 1 # header file is named something sensible and is public header_file = outfile.replace('.csv', '_metadata.csv') header_file = fixpath(header_file) # make sure we remove any lingering files if os.path.exists(outfile): os.remove(outfile) _ = kwargs.get('field') # set the header keywords for training and testing # same except for sntype will be removed from test and hostgal_photoz isn't # provided kwargs['columns']=['objid','ptrobs_min','ptrobs_max','ra','decl', 'mwebv',\ 'hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err','sntype'] # set an extrasql query to get just the DDF and WFD objects # sntype for testing = true sntype + 100 if dummy == 'training': extrasql = "AND sntype < 100 AND ((objid LIKE 'WFD%') OR (objid LIKE 'DDF%'))" else: extrasql = "AND sntype > 100 AND ((objid LIKE 'WFD%') OR (objid LIKE 'DDF%'))" # set up options for data retrieval ignoring many of the command-line # options - impose cuts later kwargs['extrasql'] = extrasql kwargs['model'] = '%' kwargs['field'] = '%' kwargs['sort'] = True kwargs['shuffle'] = False kwargs['limit'] = None kwargs['get_num_lightcurves'] = True total = getter.get_lcs_headers(**kwargs) total = list(total)[0] kwargs['limit'] = total kwargs['get_num_lightcurves'] = False kwargs['offset'] = offset out = getter.get_lcs_headers(**kwargs) # current as of 20180827 aggregate_types = { 11: 11, 2: 2, 3: 3, 12: 2, 13: 3, 14: 2, 41: 41, 43: 43, 51: 51, 60: 60, 61: 99, 62: 99, 63: 99, 64: 64, 70: 70, 80: 80, 81: 81, 83: 83, 84: 84, 90: 99, 91: 91, 92: 99, 93: 91 } aggregate_names = { 11: 'SNIa-normal', 2: 'SNCC-II', 3: 'SNCC-Ibc', 12: 'SNCC-II', 13: 'SNCC-Ibc', 14: 'SNCC-II', 41: 'SNIa-91bg', 43: 'SNIa-x', 51: 'KN', 60: 'SLSN-I', 61: 'PISN', 62: 'ILOT', 63: 'CART', 64: 'TDE', 70: 'AGN', 80: 'RRlyrae', 81: 'Mdwarf', 83: 'EBE', 84: 'MIRA', 90: 'uLens-Binary', 91: 'uLens-Point', 92: 'uLens-STRING', 93: 'uLens-Point' } if dummy == 'training': pass else: aggregate_types = {x + 100: y for x, y in aggregate_types.items()} print('Aggregating as ', aggregate_types) # make a big list of the header - NOTE THAT WE ALWAYS RETRIEVE ALL OBJECTS out = list(out) if dummy == 'training': # we don't need to shuffle the training set pass else: # if we're generating test data, if we set a limit, just draw a random # sample else shuffle the full list if limit is not None: out = random.sample(out, limit) else: random.shuffle(out) # convert the selected 
header entries to a table out = at.Table(rows=out, names=kwargs['columns']) # we're not necessariy keeping all the models we simulated - remove any models that are not in keep_types keep_types = aggregate_types.keys() mask = np.array( [True if x in keep_types else False for x in out['sntype']]) out = out[mask] # aggregate types - map to new class numbers (i.e. MODELNUM_PLASTICC) if dummy == 'training': new_type = np.array( [aggregate_types.get(x, None) for x in out['sntype']]) else: new_type = np.array( [aggregate_types.get(x, None) for x in out['sntype']]) out['sntype'] = new_type # type 99 is not included in training train_types = set(aggregate_types.values()) - set([ 99, ]) train_types = list(train_types) # make sure that there are no "other" classes included in the training data if dummy == 'training': mask = np.array( [True if x in train_types else False for x in out['sntype']]) out = out[mask] # randomize the output type ID - keep rare as 99 target_map_file = outfile.replace('.csv', '_targetmap.txt').replace( '_test_set', '').replace('_training_set', '').replace(dump_dir, base_dir) try: target_map_data = at.Table.read(target_map_file, format='ascii') train_types = target_map_data['train_types'] target_types = target_map_data['target_types'] print(f'Restoring Target Map from {target_map_file}') target_map_file = target_map_file.replace(base_dir, dump_dir) target_map_file = fixpath(target_map_file, public=False) target_map_data.write(target_map_file, format='ascii.fixed_width', delimiter=' ', overwrite=True) print(f'Wrote distribution target mapping to file {target_map_file}') except Exception as e: target_types = np.random.choice(99, len(train_types), replace=False).tolist() target_map_data = at.Table([train_types, target_types], names=['train_types', 'target_types']) target_map_data.write(target_map_file, format='ascii.fixed_width', delimiter=' ', overwrite=True) print(f'Wrote target mapping to file {target_map_file}') target_map_file = target_map_file.replace(base_dir, dump_dir) target_map_file = fixpath(target_map_file, public=False) target_map_data.write(target_map_file, format='ascii.fixed_width', delimiter=' ', overwrite=True) print(f'Wrote distribution target mapping to file {target_map_file}') # map the aggregated IDs to random target IDs target_map = dict(zip(train_types, target_types)) target = np.array([target_map.get(x, 99) for x in out['sntype']]) out['target'] = target print('Mapping as {}'.format(target_map)) # orig map file is like target_map (and also private) but includes the rares orig_map_file = outfile.replace('.csv', '_origmap.txt').replace( '_test_set', '').replace('_training_set', '') orig_map_file = fixpath(orig_map_file, public=False) if not os.path.exists(orig_map_file): orig = [] aggregated = [] mapped = [] names = [] for key, val in aggregate_types.items(): name = aggregate_names.get(key, 'Rare') names.append(name) orig.append(key) aggregated.append(val) mapping = target_map.get(val, 99) mapped.append(mapping) orig_map_data = at.Table([orig, aggregated, mapped, names],\ names=['ORIG_NUM', 'MODEL_NUM', 'TARGET', 'MODEL_NAME']) orig_map_data.write(orig_map_file, format='ascii.fixed_width', delimiter=' ', overwrite=True) print(f'Wrote original mapping to file {orig_map_file}') # galactic objects have -9 as redshift - change to NaN # the numpy.isclose should have worked last time.... check this by hand. ind = out['hostgal_photoz'] == -9. out['hostgal_photoz'][ind] = np.nan out['hostgal_photoz_err'][ind] = np.nan ind = out['hostgal_specz'] == -9. 
out['hostgal_specz'][ind] = np.nan # add galactic coordinates c = SkyCoord(out['ra'], out['decl'], "icrs", unit='deg') gal = c.galactic out['gall'] = gal.l.value out['galb'] = gal.b.value # add distance modulus cosmo = FlatLambdaCDM(70, 0.3) out['distmod'] = cosmo.distmod(out['hostgal_photoz']).value ind = np.isfinite(out['distmod']) out['distmod'][~ind] = np.nan # figure out what fits files the data are in fits_files = [ "LSST_{0}_MODEL{1}/LSST_{0}_{2}_PHOT.FITS".format(*x.split('_')) for x in out['objid'] ] fits_files = np.array(fits_files) # the object names have the model name in them, so we need to edit them # new name = <SNID> orig_name = out['objid'] new_name = np.array([x.split('_')[-1] for x in orig_name], dtype=np.int32) ddf_field = np.zeros(len(new_name), dtype=np.uint8) ind = new_name < wfd_thresh ddf_field[ind] = 1 # preseve the mapping between old name, new name and file name out['object_id'] = new_name out['filename'] = fits_files out['ddf_bool'] = ddf_field # sort things by object id - Rick has already randomized these, so we preserve his order. out.sort('object_id') del new_name del fits_files del ddf_field del target del new_type # if we are generating test data, save a truth table if dummy == 'training': pass else: out_name = out['object_id'] target = out['target'] # remove the model type from the output header that goes with the test data out.remove_column('target') # make sure the truth table actually matches the job presently executing truth_file = outfile.replace('_set.csv', '_truthtable.csv') truth_file = fixpath(truth_file, public=False) if os.path.exists(truth_file): os.remove(truth_file) # write the truth table truth_table = at.Table([out_name, target], names=['object_id', 'target']) truth_table.write(truth_file) print(f'Wrote {truth_file}') nmc = len(out) out_ind = np.arange(nmc) nthreads = max(multiprocessing.cpu_count() // 2 - 1, 1) print(f'Using {nthreads} threads.') if dummy == 'training': batch_inds = np.array_split(out_ind, nthreads) else: # if this is test data, we want to break files up so that DDF and WFD # are in separate files and the number of files is split so we don't # have a giant CSV file batch_inds = [] ind = np.where(out['object_id'] < wfd_thresh)[0] print('DDF objects {}'.format(len(ind))) batch_inds.append(ind) ind = np.where(out['object_id'] >= wfd_thresh)[0] print('WFD objects {}'.format(len(ind))) batch_inds += np.array_split(ind, nfiles) # make batches to load the data batches = [] for ind in batch_inds: # we need the fits file for each object + the object pointers this_batch_lcs = at.Table( [ out['object_id'][ind], out['ptrobs_min'][ind], out['ptrobs_max'][ind], out['filename'][ind] ], names=['object_id', 'ptrobs_min', 'ptrobs_max', 'filename']) batches.append(this_batch_lcs) gc.collect() # create a map from batch number to first objid in each batch # batch number is helpful to name files by batch # this is sequential, but you might imagine more complicated schemes batch_ids = np.arange(len(batches)) + 1 batch_keys = [x['object_id'][0] for x in batches] batch_map = dict(zip(batch_keys, batch_ids)) # do the output if dummy == 'training': # training is simple - dump each batch into one file in sequence outfile = fixpath(outfile, gzip=True) with MultiPool(processes=nthreads) as pool: with tqdm(total=nmc) as pbar: outlines = 'object_id,mjd,passband,flux,flux_err,detected_bool\n' # change to pool.imap so order is preserved in output file # combine all the batches for result in pool.imap(task, batches): _, nbatch, batchlines = result 
pbar.update(nbatch) outlines += '\n'.join(batchlines) outlines += '\n' outbytes = outlines.encode() gc.collect() # do the output with gzip.open(outfile, 'wb', compresslevel=9) as f: f.write(outbytes) else: # these variables will set the accessed time and modified time to the same numbers for all batches st_atime = None st_mtime = None with tqdm(total=nmc) as pbar: for batch in batches: # for test, the batches each get a separate file batch_key = batch[0]['object_id'] batch_id = batch_map[batch_key] batchfile = outfile.replace('.csv', f'_batch{batch_id}.csv') batchfile = fixpath(batchfile, gzip=True) # for actual mutliprocessing, split up each file's indices into mini batches ind = np.arange(len(batch)) mini_inds = np.array_split(ind, nthreads - 1) mini_batches = [batch[x] for x in mini_inds] nbatch = 0 # combine all the batches outlines = 'object_id,mjd,passband,flux,flux_err,detected_bool\n' with MultiPool(processes=nthreads - 1) as pool: for result in pool.imap(task, mini_batches): _, mini_nbatch, mini_batchlines = result nbatch += mini_nbatch outlines += '\n'.join(mini_batchlines) outlines += '\n' pbar.update(mini_nbatch) outbytes = outlines.encode() gc.collect() # do the output with gzip.open(batchfile, 'wb', compresslevel=9) as f: f.write(outbytes) # get the timestamps of the first batch file if st_atime is None: st = os.stat(batchfile) st_atime = st.st_atime st_mtime = st.st_mtime # change the timestamp of the output os.utime(batchfile, (st_atime, st_mtime)) # remove and rename some columns from the metadata output # this isn't strictly necessary, since we choose exactly what columns to output # but this makes sure astropy also strips any metadata about the columns itself out.remove_columns( ['objid', 'ptrobs_min', 'ptrobs_max', 'filename', 'sntype']) # setup what columns get output into the headers cols = [ 'object_id', 'ra', 'decl', 'gall', 'galb', 'ddf_bool', 'hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err', 'distmod', 'mwebv' ] if dummy == 'training': cols.append('target') out = out[cols] # fix column precision precision = {'ra':6, 'decl':6, 'gall':6, 'galb':6,\ 'hostgal_specz':4, 'hostgal_photoz':4, 'hostgal_photoz_err':4, 'distmod':4, 'mwebv':3} for col, val in precision.items(): formatstr = f'%.{val}f' out[col].format = formatstr # write out the header out.write(header_file, format='ascii.csv', overwrite=True)
def main(): # setup paths for output dump_dir = os.path.join(WORK_DIR, 'hdf5_dump') if not os.path.exists(dump_dir): os.makedirs(dump_dir) # get the options for the code and set the data_release globally (ugly) to # allow MultiPool to work kwargs = plasticc.get_data.parse_getdata_options() global data_release data_release = kwargs.pop('data_release') getter = plasticc.get_data.GetData(data_release) # we can use model as a dummy string to indicate if we are generating # training or test data dummy = kwargs.pop('model') offset = kwargs.pop('offset') limit = kwargs.pop('limit') if dummy == 'training': outfile = os.path.join(dump_dir, 'training_set.hdf5') offset = None else: if limit is None: outfile = os.path.join(dump_dir, 'test_set.hdf5') else: if offset is None: offset = 0 outfile = os.path.join(dump_dir, 'test_n{}_set.hdf5'.format(offset)) # make sure we remove any lingering files if os.path.exists(outfile): os.remove(outfile) _ = kwargs.get('field') # set the header keywords for training and testing # same except for sntype will be removed from test and hostgal_photoz isn't # provided if dummy == 'training': kwargs['columns']=['objid','ptrobs_min','ptrobs_max','ra','decl', 'mwebv', 'mwebv_err',\ 'hostgal_photoz', 'hostgal_photoz', 'hostgal_photoz_err', 'sntype'] else: kwargs['columns']=['objid','ptrobs_min','ptrobs_max','ra','decl', 'mwebv', 'mwebv_err',\ 'hostgal_photoz', 'hostgal_photoz_err', 'sntype'] # set an extrasql query to get just the DDF and WFD objects # sntype for testing = true sntype + 100 if dummy == 'training': extrasql = "AND sntype < 100 AND ((objid LIKE 'WFD%') OR (objid LIKE 'DDF%'))" else: extrasql = "AND sntype > 100 AND ((objid LIKE 'WFD%') OR (objid LIKE 'DDF%'))" # set up options for data retrieval ignoring many of the command-line # options kwargs['extrasql'] = extrasql kwargs['model'] = '%' kwargs['field'] = '%' kwargs['sort'] = True kwargs['shuffle'] = False kwargs['limit'] = None kwargs['get_num_lightcurves'] = True total = getter.get_lcs_headers(**kwargs) total = list(total)[0] kwargs['limit'] = total kwargs['get_num_lightcurves'] = False kwargs['offset'] = offset head = getter.get_lcs_headers(**kwargs) # make a big list of the header - NOTE THAT WE ALWAYS RETRIEVE ALL OBJECTS head = list(head) if dummy == 'training': pass else: # if we're generating test data, if we set a limit, just draw a random # sample else shuffle the full list if limit is not None: head = random.sample(head, limit) else: random.shuffle(head) # convert the selected header entries to a table and remove uncessary columns out = at.Table(rows=head, names=kwargs['columns']) out.remove_columns(['ptrobs_min', 'ptrobs_max']) # galactic objects have -9 as redshift - change to 0 dummy_val = np.repeat(-9, len(out)) ind = np.isclose(out['hostgal_photoz'], dummy_val) out['hostgal_photoz'][ind] = 0. out['hostgal_photoz_err'][ind] = 0. # the object names have the model name in them, so we need to edit them # new name = <FIELD><SNID> orig_name = out['objid'] new_name = [ '{}{}'.format(x.split('_')[0], x.split('_')[-1]) for x in orig_name ] new_name = np.array(new_name) new_name = new_name.astype('bytes') out['objid'] = new_name # if we are generating test data, save a truth table if dummy == 'training': pass else: sntype = out['sntype'] # remove the model type from the output header that goes with the test data out.remove_column('sntype') truth_file = outfile.replace('_set.hdf5', '_truthtable.hdf5') if os.path.exists(truth_file): os.remove(truth_file) # ... 
saving it in the truth table only orig_name = orig_name.astype(bytes) new_name = new_name.astype(bytes) sntype = sntype.astype(bytes) truth_table = at.Table([orig_name, new_name, sntype], names=['objid', 'shortid', 'sntype']) truth_table.write(truth_file, compression=True, path='truth_table', serialize_meta=False, append=True) # write out the header out.write(outfile, compression=True, path='header', serialize_meta=False, append=True) nmc = len(out) # use a multiprocessing pool to load each light curve and dump to HDF5 with MultiPool() as pool: with tqdm(total=nmc) as pbar: for result in pool.imap(task, head): short_obj, thislc = result thislc.write(outfile, path=short_obj, compression=True, serialize_meta=False, append=True) pbar.update()
def fit_catalog(fit_cat, atlas_path, atlas_fname, output_fname, N_pregrid=10000, N_param=3, z_bw=0.05, f160_cut=100, fit_mask=[], zgrid=[], sfr_uncert_cutoff=2.0): cat_id, cat_zbest, cat_seds, cat_errs, cat_f160, cat_class_star = fit_cat #if not zgrid: if isinstance(zgrid, (np.ndarray)) == False: zgrid = np.arange(np.amin(cat_zbest), np.amax(cat_zbest), z_bw) fit_id = cat_id.copy() fit_logM_50 = np.zeros_like(cat_zbest) fit_logM_MAP = np.zeros_like(cat_zbest) fit_logM_16 = np.zeros_like(cat_zbest) fit_logM_84 = np.zeros_like(cat_zbest) fit_logSFRinst_50 = np.zeros_like(cat_zbest) fit_logSFRinst_MAP = np.zeros_like(cat_zbest) fit_logSFRinst_16 = np.zeros_like(cat_zbest) fit_logSFRinst_84 = np.zeros_like(cat_zbest) fit_logZsol_50 = np.zeros_like(cat_zbest) fit_logZsol_16 = np.zeros_like(cat_zbest) fit_logZsol_84 = np.zeros_like(cat_zbest) fit_Av_50 = np.zeros_like(cat_zbest) fit_Av_16 = np.zeros_like(cat_zbest) fit_Av_84 = np.zeros_like(cat_zbest) fit_zfit_50 = np.zeros_like(cat_zbest) fit_zfit_16 = np.zeros_like(cat_zbest) fit_zfit_84 = np.zeros_like(cat_zbest) fit_logMt_50 = np.zeros_like(cat_zbest) fit_logMt_16 = np.zeros_like(cat_zbest) fit_logMt_84 = np.zeros_like(cat_zbest) fit_logSFR100_50 = np.zeros_like(cat_zbest) fit_logSFR100_16 = np.zeros_like(cat_zbest) fit_logSFR100_84 = np.zeros_like(cat_zbest) fit_nparam = np.zeros_like(cat_zbest) fit_t25_50 = np.zeros_like(cat_zbest) fit_t25_16 = np.zeros_like(cat_zbest) fit_t25_84 = np.zeros_like(cat_zbest) fit_t50_50 = np.zeros_like(cat_zbest) fit_t50_16 = np.zeros_like(cat_zbest) fit_t50_84 = np.zeros_like(cat_zbest) fit_t75_50 = np.zeros_like(cat_zbest) fit_t75_16 = np.zeros_like(cat_zbest) fit_t75_84 = np.zeros_like(cat_zbest) fit_nbands = np.zeros_like(cat_zbest) fit_f160w = np.zeros_like(cat_zbest) fit_stellarity = np.zeros_like(cat_zbest) fit_chi2 = np.zeros_like(cat_zbest) fit_flags = np.zeros_like(cat_zbest) for i in (range(len(zgrid))): print('loading atlas at', zgrid[i]) # for a given redshift slice, zval = zgrid[i] # select the galaxies to be fit z_mask = (cat_zbest < (zval + z_bw / 2)) & (cat_zbest > (zval - z_bw / 2)) & (cat_f160 < f160_cut) fit_ids = np.arange(len(cat_zbest))[z_mask] # for gal_id in fit_ids: # gal_sed = cat_seds[gal_id, 0:] # gal_err = cat_errs[gal_id, 0:] # fit_likelihood, fit_norm_fac = evaluate_sed_likelihood(gal_sed,gal_err,atlas,fit_mask=[], # zbest=None,deltaz=None) # quants = get_quants(fit_likelihood, atlas, fit_norm_fac) print('starting parallel fitting for Ngals = ', len(fit_ids), ' at redshift ', str(zval)) try: # load the atlas fname = atlas_fname + '_zval_%.0f_' % (zgrid[i] * 10000) atlas = load_atlas(fname, N_pregrid, N_param=N_param, path=atlas_path) print('loaded atlas') with MultiPool() as pool: # note: Parallel doesn't work in Python2.6 # if not fit_mask: if isinstance(fit_mask, np.ndarray) == False: all_quants = list( pool.map( partial(fit_gals, catvals=(cat_seds, cat_errs, atlas)), fit_ids)) else: all_quants = list( pool.map( partial(fit_gals, catvals=(cat_seds, cat_errs, fit_mask, atlas)), fit_ids)) print('finished fitting parallel zbest chunk at z=%.3f' % zval) print('starting to put values in arrays') for ii, gal_id in enumerate(fit_ids): gal_sed = cat_seds[gal_id, 0:] gal_err = cat_errs[gal_id, 0:] quants = all_quants[ii][0] fit_likelihood = all_quants[ii][1] # fit_logM_MAP[gal_id] = all_quants[ii][2] # fit_logSFRinst_MAP[gal_id] = all_quants[ii][3] fit_logM_50[gal_id] = quants[0][0] fit_logM_16[gal_id] = quants[0][1] fit_logM_84[gal_id] = quants[0][2] fit_logSFRinst_50[gal_id] = 
quants[1][0] fit_logSFRinst_16[gal_id] = quants[1][1] fit_logSFRinst_84[gal_id] = quants[1][2] fit_Av_50[gal_id] = quants[2][0] fit_Av_16[gal_id] = quants[2][1] fit_Av_84[gal_id] = quants[2][2] fit_logZsol_50[gal_id] = quants[3][0] fit_logZsol_16[gal_id] = quants[3][1] fit_logZsol_84[gal_id] = quants[3][2] fit_zfit_50[gal_id] = quants[4][0] fit_zfit_16[gal_id] = quants[4][1] fit_zfit_84[gal_id] = quants[4][2] fit_logMt_50[gal_id] = quants[5][0][0] fit_logMt_16[gal_id] = quants[5][1][0] fit_logMt_84[gal_id] = quants[5][2][0] fit_logSFR100_50[gal_id] = quants[5][0][1] fit_logSFR100_16[gal_id] = quants[5][1][1] fit_logSFR100_84[gal_id] = quants[5][2][1] fit_nparam[gal_id] = quants[5][0][2] fit_t25_50[gal_id] = quants[5][0][3] fit_t25_16[gal_id] = quants[5][1][3] fit_t25_84[gal_id] = quants[5][2][3] fit_t50_50[gal_id] = quants[5][0][4] fit_t50_16[gal_id] = quants[5][1][4] fit_t50_84[gal_id] = quants[5][2][4] fit_t75_50[gal_id] = quants[5][0][5] fit_t75_16[gal_id] = quants[5][1][5] fit_t75_84[gal_id] = quants[5][2][5] fit_nbands[gal_id] = np.sum(gal_sed > 0) fit_f160w[gal_id] = cat_f160[gal_id] fit_stellarity[gal_id] = cat_class_star[gal_id] fit_chi2[gal_id] = np.amin(fit_likelihood) # flagging galaxies that either # 1. have nan values for mass # 2. have SFR uncertainties > sfr_uncert_cutoff # 3. are flagged as a star # 4. have extremely large chi2 if np.isnan(quants[0][0]): fit_flags[gal_id] = 1.0 elif (np.abs(fit_logSFRinst_84[gal_id] - fit_logSFRinst_16[gal_id]) > sfr_uncert_cutoff): fit_flags[gal_id] = 2.0 elif (cat_class_star[gal_id] > 0.5): fit_flags[gal_id] = 3.0 elif (fit_chi2[gal_id] > 1000): fit_flags[gal_id] = 4.0 else: fit_flags[gal_id] = 0.0 except: print('couldn\'t fit with pool at z=', zval) print('finishing that') pl.clf() pl.figure(figsize=(12, 6)) pl.hist(cat_zbest[cat_zbest > 0], np.arange(0, 6, z_bw), color='black', alpha=0.3) #pl.hist(fit_zfit_50[fit_zfit_50>0],np.arange(0,6,z_bw),color='royalblue') pl.hist(cat_zbest[fit_zfit_50 > 0], np.arange(0, 6, z_bw), color='royalblue') pl.title('fit %.0f/%.0f galaxies' % (np.sum(fit_zfit_50 > 0), len(cat_zbest))) pl.xlabel('redshift') pl.ylabel('# galaxies') display.clear_output(wait=True) display.display(pl.gcf()) pl.show() #'logSFRinst_MAP':fit_logSFRinst_MAP, #'logM_MAP':fit_logM_MAP, fit_mdict = { 'ID': fit_id, 'logM_50': fit_logM_50, 'logM_16': fit_logM_16, 'logM_84': fit_logM_84, 'logSFRinst_50': fit_logSFRinst_50, 'logSFRinst_16': fit_logSFRinst_16, 'logSFRinst_84': fit_logSFRinst_84, 'logZsol_50': fit_logZsol_50, 'logZsol_16': fit_logZsol_16, 'logZsol_84': fit_logZsol_84, 'Av_50': fit_Av_50, 'Av_16': fit_Av_16, 'Av_84': fit_Av_84, 'zfit_50': fit_zfit_50, 'zfit_16': fit_zfit_16, 'zfit_84': fit_zfit_84, 'logMt_50': fit_logMt_50, 'logMt_16': fit_logMt_16, 'logMt_84': fit_logMt_84, 'logSFR100_50': fit_logSFR100_50, 'logSFR100_16': fit_logSFR100_16, 'logSFR100_84': fit_logSFR100_84, 't25_50': fit_t25_50, 't25_16': fit_t25_16, 't25_84': fit_t25_84, 't50_50': fit_t50_50, 't50_16': fit_t50_16, 't50_84': fit_t50_84, 't75_50': fit_t75_50, 't75_16': fit_t75_16, 't75_84': fit_t75_84, 'nparam': fit_nparam, 'nbands': fit_nbands, 'F160w': fit_f160w, 'stellarity': fit_stellarity, 'chi2': fit_chi2, 'fit_flags': fit_flags } fit_cat = Table(fit_mdict) fit_cat.write(output_fname, format='ascii.commented_header') return
def evol_ecc(ecc_i, t_evol=None, n_step=100, timesteps=None, beta=None,
             m_1=None, m_2=None, a_i=None, f_orb_i=None,
             output_vars=['ecc', 'f_orb'], n_proc=1, avoid_merger=True,
             exact_t_merge=False, t_before=1 * u.Myr, t_merge=None):
    """Evolve an array of eccentric binaries for ``t_evol`` time

    This function uses Peters & Mathews (1964) Eq. 5.11 and 5.13.

    Note that all of {``beta``, ``m_1``, ``m_2``, ``ecc_i``, ``a_i``,
    ``f_orb_i``} must have the same dimensions.

    Parameters
    ----------
    ecc_i : `float/array`
        Initial eccentricity
    t_evol : `float/array`
        Amount of time for which to evolve each binary. Required if
        ``timesteps`` is None. Defaults to merger times.
    n_step : `int`
        Number of timesteps to take between t=0 and t=``t_evol``. Required if
        ``timesteps`` is None. Defaults to 100.
    timesteps : `float/array`
        Array of exact timesteps to take when evolving each binary. Must be
        monotonically increasing and start with t=0. Either supply a 1D array
        to use for every binary or a 2D array that has a different array of
        timesteps for each binary. ``timesteps`` is used in place of
        ``t_evol`` and ``n_step`` and takes precedence over them.
    beta : `float/array`
        Constant defined in Peters and Mathews (1964) Eq. 5.9. See
        :meth:`legwork.utils.beta` (if supplied ``m_1`` and ``m_2`` are
        ignored)
    m_1 : `float/array`
        Primary mass (required if ``beta`` is None or if ``output_vars``
        contains a frequency)
    m_2 : `float/array`
        Secondary mass (required if ``beta`` is None or if ``output_vars``
        contains a frequency)
    a_i : `float/array`
        Initial semi-major axis (if supplied ``f_orb_i`` is ignored)
    f_orb_i : `float/array`
        Initial orbital frequency (required if ``a_i`` is None)
    output_vars : `array`
        List of **ordered** output vars, choose from any of ``timesteps``,
        ``ecc``, ``a``, ``f_orb`` and ``f_GW`` for which of timesteps,
        eccentricity, semi-major axis and orbital/GW frequency that you want.
        Default is [``ecc``, ``f_orb``]
    n_proc : `int`
        Number of processors to split eccentricity evolution over, where the
        default is n_proc=1
    avoid_merger : `boolean`
        Whether to avoid integration around the merger of the binary.
        Warning: setting this to False will result in many LSODA errors being
        output since the derivatives get so large.
    exact_t_merge : `boolean`
        Whether to calculate the merger time exactly or use a fit (only
        relevant when ``avoid_merger`` is set to True)
    t_before : `float`
        How much time before the merger to cut off the integration (default is
        1 Myr - this will prevent all LSODA warnings for e < 0.95; you may
        need to increase this time if your sample is more eccentric than this)
    t_merge : `float/array`
        Merger times for each source to be evolved. Only used when
        ``avoid_merger=True``. If `None` then these will be automatically
        calculated either approximately or exactly based on the value of
        ``exact_t_merge``.

    Returns
    -------
    evolution : `array`
        Array possibly containing eccentricity, semi-major axis, timesteps and
        frequency evolution. Content determined by ``output_vars``
    """
    # transform input if only a single source
    arrayed_args, single_source = utils.ensure_array(m_1, m_2, beta, a_i,
                                                     f_orb_i, ecc_i)
    m_1, m_2, beta, a_i, f_orb_i, ecc_i = arrayed_args

    output_vars = np.array([output_vars]) if isinstance(output_vars, str) \
        else output_vars

    beta, a_i = check_mass_freq_input(beta=beta, m_1=m_1, m_2=m_2,
                                      a_i=a_i, f_orb_i=f_orb_i)

    if np.isin(output_vars, ["f_orb", "f_GW"]).any() and (m_1 is None
                                                          or m_2 is None):
        raise ValueError(
            "`m_1` and `m_2` required if `output_vars` contains a frequency")

    c_0 = utils.c_0(a_i=a_i, ecc_i=ecc_i)

    timesteps = create_timesteps_array(a_i=a_i, beta=beta, ecc_i=ecc_i,
                                       t_evol=t_evol, n_step=n_step,
                                       timesteps=timesteps)

    # if avoiding the merger during integration
    if avoid_merger:
        if t_merge is None:
            # calculate the merger time
            t_merge = get_t_merge_ecc(ecc_i=ecc_i, a_i=a_i, beta=beta,
                                      exact=exact_t_merge).to(u.Gyr)

        # warn the user if they are evolving past the merger
        if np.any(timesteps > t_merge[:, np.newaxis]):
            print("WARNING: Some timesteps are past the merger of the source and this may produce erroneous",
                  "results in combination with `avoid_merger=True`. Only evolve sources until their merger",
                  "or set `avoid_merger=False`.")

        # make a mask for any timesteps that are too close to the merger
        too_close = timesteps >= t_merge[:, np.newaxis] - t_before

        check = too_close
        check[:, 0] = True
        if np.all(check):  # pragma: no cover
            print("WARNING: All timesteps are too close to merger so",
                  "evolution is not possible. Either set `t_before` to a",
                  "smaller time or turn off `avoid_merger`")

        # ensure that the first timestep is always valid
        too_close[:, 0] = False

        if np.any(too_close):
            # set them all equal to the previous timestep before passing limit
            timesteps[too_close] = -1 * u.Gyr
            previous = timesteps.max(axis=1).repeat(timesteps.shape[1])
            timesteps[too_close] = previous.reshape(timesteps.shape)[too_close]

    # get rid of the units for faster integration
    c_0 = c_0.to(u.m).value
    beta = beta.to(u.m**4 / u.s).value
    timesteps = timesteps.to(u.s).value

    # perform the evolution
    if n_proc > 1:
        with MultiPool(processes=n_proc) as pool:
            ecc_evol = np.array(
                list(pool.map(integrate_de_dt,
                              zip(ecc_i, timesteps.tolist(), beta, c_0))))
    else:
        ecc_evol = np.array([
            odeint(de_dt, ecc_i[i], timesteps[i],
                   args=(beta[i], c_0[i])).flatten()
            for i in range(len(ecc_i))
        ])

    c_0 = c_0[:, np.newaxis] * u.m
    ecc_evol = np.nan_to_num(ecc_evol, nan=0.0)

    # calculate a_evol if any frequency or separation requested
    if np.isin(output_vars, ["a", "f_orb", "f_GW"]).any():
        a_evol = utils.get_a_from_ecc(ecc_evol, c_0)

    # calculate f_orb_evol if any frequency requested
    if np.isin(output_vars, ["f_orb", "f_GW"]).any():
        # change merged binaries to extremely small separations
        a_not0 = np.where(a_evol.value == 0.0, 1e-30 * a_evol.unit, a_evol)
        f_orb_evol = utils.get_f_orb_from_a(a=a_not0,
                                            m_1=m_1[:, np.newaxis],
                                            m_2=m_2[:, np.newaxis])

        # change frequencies back to 1Hz since LISA can't measure above
        f_orb_evol = np.where(a_not0.value == 1e-30, 1e2 * u.Hz, f_orb_evol)

    # construct evolution output
    evolution = []
    for var in output_vars:
        if var == "timesteps":
            timesteps = timesteps.flatten() if single_source else timesteps
            evolution.append((timesteps * u.s).to(u.yr))
        elif var == "ecc":
            ecc_evol = ecc_evol.flatten() if single_source else ecc_evol
            evolution.append(ecc_evol)
        elif var == "a":
            a_evol = a_evol.flatten() if single_source else a_evol
            evolution.append(a_evol.to(u.AU))
        elif var == "f_orb":
            f_orb_evol = f_orb_evol.flatten() if single_source else f_orb_evol
            evolution.append(f_orb_evol.to(u.Hz))
        elif var == "f_GW":
            f_orb_evol = f_orb_evol.flatten() if single_source else f_orb_evol
            evolution.append(2 * f_orb_evol.to(u.Hz))

    return evolution if len(evolution) > 1 else evolution[0]
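# Usage sketch for evol_ecc (illustrative values only, chosen by analogy with
# the test above): evolve a small population up to its merger timescale while
# splitting the ODE integrations over two processes. Assumes astropy.units as
# `u` and numpy as `np`, as used elsewhere in this module.
def demo_evol_ecc():
    ecc, f_orb = evol_ecc(ecc_i=np.array([0.3, 0.6]),
                          f_orb_i=np.array([1e-3, 1e-4]) * u.Hz,
                          m_1=np.array([0.6, 0.8]) * u.Msun,
                          m_2=np.array([0.5, 0.7]) * u.Msun,
                          n_proc=2)
    return ecc, f_orb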
def evolve(cls, initialbinarytable, pool=None, **kwargs): """After setting a number of initial conditions we evolve the system. Parameters ---------- initialbinarytable : DataFrame Initial conditions of the binary **kwargs: There are three ways to tell evolve and thus the fortran what you want all the flags and other BSE specific parameters to be. If you pass both a dictionary of flags and/or a inifile and a table with the BSE parameters in the columns, the column values will be overwritten by what is in the dictionary or ini file. NUMBER 1: PASS A DICTIONARY OF FLAGS BSEDict NUMBER 2: PASS A PANDAS DATA FRAME WITH PARAMS DEFINED AS COLUMNS All you need is the initialbinarytable if the all the BSE parameters are defined as columns NUMBER 3: PASS PATH TO A INI FILE WITH THE FLAGS DEFINED params randomseed : `int`, optional, default let numpy choose for you If you would like the random seed that the underlying fortran code uses to be the same for all of the initial conditions you passed then you can send this keyword argument in. It is recommended to just let numpy choose a random number as the Fortran random seed and then this number will be returned as a column in the initial binary table so that you can reproduce the results. nproc : `int`, optional, default: 1 number of CPUs to use to evolve systems in parallel idx : `int`, optional, default: 0 initial index of the bcm/bpp arrays dtp : `float`, optional: default: tphysf timestep size in Myr for bcm output where tphysf is total evolution time in Myr n_per_block : `int`, optional, default: -1 number of systems to evolve in a block with _evolve_multi_system, to allow larger multiprocessing queues and reduced overhead. If less than 1 use _evolve_single_system Returns ------- output_bpp : DataFrame Evolutionary history of each binary output_bcm : DataFrame Final state of each binary initialbinarytable : DataFrame Initial conditions for each binary """ idx = kwargs.pop('idx', 0) nproc = min(kwargs.pop('nproc', 1), len(initialbinarytable)) n_per_block = kwargs.pop('n_per_block', -1) # There are three ways to tell evolve and thus the fortran # what you want all the flags and other BSE specific # parameters to be # NUMBER 1: PASS A DICTIONARY OF FLAGS BSEDict = kwargs.pop('BSEDict', {}) # NUMBER 2: PASS A PANDAS DATA FRAME WITH PARAMS DEFINED AS COLUMNS # All you need is the initialbinarytable with columns, # If you pass both a dictionary of flags and/or a inifile # and a table with the columns, the column values will be # overwritten by what is in the dictionary or ini file # NUMBER 3: PASS PATH TO A INI FILE WITH THE FLAGS DEFINED params = kwargs.pop('params', None) if BSEDict and params is not None: raise ValueError('Please pass either a dictionary ' 'of BSE flags or a path to an inifle not both.') if params is not None: if not os.path.isfile(params): raise ValueError( "File does not exist, probably supplied incorrect " "path to the inifile.") BSEDict, _, _, _, _ = utils.parse_inifile(params) # error check the parameters you are trying to pass to BSE # if we sent in a table with the parameter names # then we will temporarily create a dictionary # in order to verify that the values in the table # are valid utils.error_check(BSEDict) # check the initial conditions of the system and warn user if # anything is weird about them, such as the star starts # in Roche Lobe overflow utils.check_initial_conditions(initialbinarytable) # assign some columns based on keyword arguments but that # can be overwritten by the params or BSEDict if 'dtp' not in 
initialbinarytable.keys(): initialbinarytable = initialbinarytable.assign( dtp=kwargs.pop('dtp', initialbinarytable['tphysf'])) if 'randomseed' not in initialbinarytable.keys(): seed = np.random.randint(np.iinfo(np.int32).min, np.iinfo(np.int32).max, size=len(initialbinarytable)) initialbinarytable = initialbinarytable.assign( randomseed=kwargs.pop('randomseed', seed)) if 'bin_num' not in initialbinarytable.keys(): initialbinarytable = initialbinarytable.assign( bin_num=np.arange(idx, idx + len(initialbinarytable))) for k, v in BSEDict.items(): if k in initialbinarytable.keys(): warnings.warn( "The value for {0} in initial binary table is being " "overwritten by the value of {0} from either the params " "file or the BSEDict.".format(k)) # special columns that need to be handled differently if k == 'natal_kick_array': assign_natal_kick_array = [BSEDict['natal_kick_array'] ] * len(initialbinarytable) initialbinarytable = initialbinarytable.assign( natal_kick_array=assign_natal_kick_array) for idx, column_name in enumerate(NATAL_KICK_COLUMNS): for sn_idx in range(2): column_name_sn = column_name + '_{0}'.format(sn_idx + 1) column_values = pd.Series( [BSEDict['natal_kick_array'][sn_idx][idx]] * len(initialbinarytable), index=initialbinarytable.index, name=column_name_sn) kwargs1 = {column_name_sn: column_values} initialbinarytable = initialbinarytable.assign( **kwargs1) elif k == 'qcrit_array': initialbinarytable = initialbinarytable.assign( qcrit_array=[BSEDict['qcrit_array']] * len(initialbinarytable)) for kstar in range(0, 16): columns_values = pd.Series( [BSEDict['qcrit_array'][kstar]] * len(initialbinarytable), index=initialbinarytable.index, name='qcrit_{0}'.format(kstar)) initialbinarytable.loc[:, 'qcrit_{0}'. format(kstar)] = columns_values elif k == 'fprimc_array': columns_values = [BSEDict['fprimc_array'] ] * len(initialbinarytable) initialbinarytable = initialbinarytable.assign( fprimc_array=columns_values) for kstar in range(0, 16): columns_values = pd.Series( [BSEDict['fprimc_array'][kstar]] * len(initialbinarytable), index=initialbinarytable.index, name='fprimc_{0}'.format(kstar)) initialbinarytable.loc[:, 'fprimc_{0}'. format(kstar)] = columns_values else: # assigning values this way work for most of the parameters. kwargs1 = {k: v} initialbinarytable = initialbinarytable.assign(**kwargs1) # Here we perform two checks # First, if the BSE parameters are not in the initial binary table # and either a dictionary or an inifile was not provided # then we need to raise an ValueError and tell the user to provide # either a dictionary or an inifile or add more columns if not BSEDict: if ((not set(INITIAL_BINARY_TABLE_SAVE_COLUMNS).issubset( initialbinarytable.columns)) and (not set(INITIAL_CONDITIONS_PASS_COLUMNS).issubset( initialbinarytable.columns))): raise ValueError( "You are passing BSE parameters as columns in the " "initial binary table but not all BSE parameters are defined. " "Please pass a BSEDict or a params file or make sure " "you have all BSE parameters as columns {0} or {1}.". 
format(INITIAL_BINARY_TABLE_SAVE_COLUMNS, INITIAL_CONDITIONS_PASS_COLUMNS)) # If you did not supply the natal kick or qcrit_array or fprimc_array in the BSEdict then we construct # it from the initial conditions table if ((pd.Series(FLATTENED_NATAL_KICK_COLUMNS).isin( initialbinarytable.keys()).all()) and ('natal_kick_array' not in BSEDict)): column_values = initialbinarytable[ FLATTENED_NATAL_KICK_COLUMNS].values.reshape( -1, 2, len(NATAL_KICK_COLUMNS)).tolist() initialbinarytable = initialbinarytable.assign( natal_kick_array=column_values) if (pd.Series(QCRIT_COLUMNS).isin( initialbinarytable.keys()).all()) and ('qcrit_array' not in BSEDict): initialbinarytable = initialbinarytable.assign( qcrit_array=initialbinarytable[QCRIT_COLUMNS].values.tolist()) if (pd.Series(FPRIMC_COLUMNS).isin( initialbinarytable.keys()).all()) and ('fprimc_array' not in BSEDict): initialbinarytable = initialbinarytable.assign( fprimc_array=initialbinarytable[FPRIMC_COLUMNS].values.tolist( )) # need to ensure that the order of parameters that we pass to BSE # is correct initial_conditions = initialbinarytable[ INITIAL_CONDITIONS_PASS_COLUMNS].to_dict('records') # we use different columns to save the BSE parameters because some # of the parameters are list/arrays which we instead save as # individual values because it makes saving to HDF5 easier/more efficient. initialbinarytable = initialbinarytable[ INITIAL_BINARY_TABLE_SAVE_COLUMNS] # Allow a user to specify a custom time step sampling for certain parts of the evolution timestep_conditions = kwargs.pop('timestep_conditions', []) set_checkstates(timestep_conditions=timestep_conditions) # check if a pool was passed if pool is None: with MultiPool(processes=nproc) as pool: # evolve systems if n_per_block > 0: initial_conditions = np.asarray(initial_conditions) n_tot = initial_conditions.shape[0] initial_conditions_blocked = [] itr_block = 0 while itr_block < n_tot: itr_next = np.min([n_tot, itr_block + n_per_block]) initial_conditions_blocked.append( initial_conditions[itr_block:itr_next]) itr_block = itr_next output = list( pool.map(_evolve_multi_system, initial_conditions_blocked)) else: output = list( pool.map(_evolve_single_system, initial_conditions)) else: # evolve systems if n_per_block > 0: initial_conditions = np.asarray(initial_conditions) n_tot = initial_conditions.shape[0] initial_conditions_blocked = [] itr_block = 0 while itr_block < n_tot: itr_next = np.min([n_tot, itr_block + n_per_block]) initial_conditions_blocked.append( initial_conditions[itr_block:itr_next]) itr_block = itr_next output = list( pool.map(_evolve_multi_system, initial_conditions_blocked)) else: output = list( pool.map(_evolve_single_system, initial_conditions)) output = np.array(output, dtype=object) bpp_arrays = np.vstack(output[:, 1]) bcm_arrays = np.vstack(output[:, 2]) kick_info_arrays = np.vstack(output[:, 3]) natal_kick_arrays = np.vstack(output[:, 4]) natal_kick_arrays = natal_kick_arrays.reshape( -1, 1, len(FLATTENED_NATAL_KICK_COLUMNS)) for idx, column in enumerate(FLATTENED_NATAL_KICK_COLUMNS): # assigning values this way work for most of the parameters. 
kwargs1 = {column: natal_kick_arrays[:, :, idx]} initialbinarytable = initialbinarytable.assign(**kwargs1) kick_info = pd.DataFrame(kick_info_arrays, columns=KICK_COLUMNS, index=kick_info_arrays[:, -1].astype(int)) bpp = pd.DataFrame(bpp_arrays, columns=BPP_COLUMNS, index=bpp_arrays[:, -1].astype(int)) bcm = pd.DataFrame(bcm_arrays, columns=BCM_COLUMNS, index=bcm_arrays[:, -1].astype(int)) bcm.merger_type = bcm.merger_type.astype(int).astype(str).apply( lambda x: x.zfill(4)) bcm.bin_state = bcm.bin_state.astype(int) bpp.bin_num = bpp.bin_num.astype(int) bcm.bin_num = bcm.bin_num.astype(int) return bpp, bcm, initialbinarytable, kick_info
def main(): # setup paths for output dump_dir = os.path.join(WORK_DIR, 'hdf5_dump') if not os.path.exists(dump_dir): os.makedirs(dump_dir) # get the options for the code and set the data_release globally (ugly) to # allow MultiPool to work kwargs = plasticc.get_data.parse_getdata_options() global data_release data_release = kwargs.pop('data_release') getter = plasticc.get_data.GetData(data_release) # we can use model as a dummy string to indicate if we are generating # training or test data dummy = kwargs.pop('model') offset = kwargs.pop('offset') limit = kwargs.pop('limit') if dummy == 'training': outfile = os.path.join(dump_dir, 'training_set.hdf5') offset = None else: if limit is None: outfile = os.path.join(dump_dir, 'test_set.hdf5') else: if offset is None: offset = 0 outfile = os.path.join(dump_dir, 'test_n{}_set.hdf5'.format(offset)) # make sure we remove any lingering files if os.path.exists(outfile): os.remove(outfile) _ = kwargs.get('field') # set the header keywords for training and testing # same except for sntype will be removed from test and hostgal_photoz isn't # provided if dummy == 'training': kwargs['columns']=['objid','ptrobs_min','ptrobs_max','ra','decl', 'mwebv', 'mwebv_err',\ 'hostgal_photoz', 'hostgal_photoz', 'hostgal_photoz_err', 'sntype'] else: kwargs['columns']=['objid','ptrobs_min','ptrobs_max','ra','decl', 'mwebv', 'mwebv_err',\ 'hostgal_photoz', 'hostgal_photoz_err', 'sntype'] # set an extrasql query to get just the DDF and WFD objects # sntype for testing = true sntype + 100 if dummy == 'training': extrasql = "AND sntype < 100 AND ((objid LIKE 'WFD%') OR (objid LIKE 'DDF%'))" else: extrasql = "AND sntype > 100 AND ((objid LIKE 'WFD%') OR (objid LIKE 'DDF%'))" # set up options for data retrieval ignoring many of the command-line # options kwargs['extrasql'] = extrasql kwargs['model'] = '%' kwargs['field'] = '%' kwargs['sort'] = True kwargs['shuffle'] = False kwargs['limit'] = None kwargs['get_num_lightcurves'] = True total = getter.get_lcs_headers(**kwargs) total = list(total)[0] kwargs['limit'] = total kwargs['get_num_lightcurves'] = False kwargs['offset'] = offset out = getter.get_lcs_headers(**kwargs) aggregate_types = {1:1, 2:2, 3:3, 12:2, 13:3, 14:2, 41:41, 43:43, 45:45, 51:51, 60:60, 61:61, 62:62, 63:63, 64:64, 80:80, 81:81, 82:82, 83:83, 84:84, 90:90, 91:91} if dummy == 'training': pass else: aggregate_types = {x+100:y for x,y in aggregate_types.items()} print('Aggregating as ', aggregate_types) # make a big list of the header - NOTE THAT WE ALWAYS RETRIEVE ALL OBJECTS out = list(out) if dummy == 'training': # we don't need to shuffle the training set pass else: # if we're generating test data, if we set a limit, just draw a random # sample else shuffle the full list if limit is not None: out = random.sample(out, limit) else: random.shuffle(out) # convert the selected header entries to a table out = at.Table(rows=out, names=kwargs['columns']) # we're not necessariy keeping all the models we simulated (42 and 50 are going bye bye) keep_types = aggregate_types.keys() mask = np.array([True if x in keep_types else False for x in out['sntype']]) out = out[mask] # aggregate types if dummy=='training': new_type = np.array([aggregate_types.get(x, None) for x in out['sntype']]) else: new_type = np.array([aggregate_types.get(x, None) for x in out['sntype']]) out['sntype'] = new_type # make sure that there are no "other" classes included in the training data if dummy == 'training': # not train types - 45, 60, 61, 62, 63, 64, 90, 91 train_types = (1, 2, 3, 41, 43, 
51, 80, 81, 82, 83, 84) mask = np.array([True if x in train_types else False for x in out['sntype']]) out = out[mask] # galactic objects have -9 as redshift - change to 0 dummy_val = np.repeat(-9, len(out)) ind = np.isclose(out['hostgal_photoz'], dummy_val) out['hostgal_photoz'][ind] = 0. out['hostgal_photoz_err'][ind] = 0. # figure out what fits files the data are in fits_files =[ "LSST_{0}_MODEL{1}/LSST_{0}_{2}_PHOT.FITS".format(*x.split('_')) for x in out['objid']] fits_files = np.array(fits_files) uniq_files = np.unique(fits_files) # the object names have the model name in them, so we need to edit them # new name = <FIELD><SNID> orig_name = out['objid'] new_name = [ '{}{}'.format(x.split('_')[0], x.split('_')[-1]) for x in orig_name] new_name = np.array(new_name) out_name = new_name nmc = len(out) # if we are generating test data, save a truth table if dummy == 'training': pass else: sntype = out['sntype'] # remove the model type from the output header that goes with the test data out.remove_column('sntype') truth_file = outfile.replace('_set.hdf5', '_truthtable.hdf5') if os.path.exists(truth_file): os.remove(truth_file) # ... saving it in the truth table only orig_name = orig_name.astype(bytes) new_name = new_name.astype(bytes) truth_table = at.Table([orig_name, new_name, sntype], names=['objid','shortid','sntype']) truth_table.write(truth_file, compression=True, path='truth_table', serialize_meta=False, append=True) # make batches to load the data batches = {} for filename in uniq_files: ind = (fits_files == filename) this_fits_lcs = at.Table([out['objid'][ind], out['ptrobs_min'][ind], out['ptrobs_max'][ind]], names=['objid','ptrobs_min', 'ptrobs_max']) batches[filename] = this_fits_lcs name_lookup = dict(zip(orig_name, out_name)) gc.collect() # do the output failed = [] with MultiPool() as pool: with tqdm(total=nmc) as pbar: for result in pool.imap_unordered(task, batches.items()): this_file_n = len(result.items()) with h5py.File(outfile, 'a') as outf: for true_obj, thislc in result.items(): short_obj = name_lookup[true_obj] retries = 10 notwritten = True overwrite = False while notwritten and retries > 0: try: outf.create_dataset(short_obj, data=thislc, compression='lzf') #thislc.write(outfile, path=short_obj, compression=True, serialize_meta=False, append=True,\ # overwrite=overwrite) notwritten = False except Exception as e: timer.sleep(0.010) overwrite = True retries -= 1 print('{} {}'.format(true_obj, e)) if notwritten: failed.append((true_obj, short_obj)) print("Failed", true_obj) outf.flush() pbar.update(this_file_n) gc.collect() print(failed) # write out the header out['objid'] = out_name.astype(bytes) out.remove_columns(['ptrobs_min', 'ptrobs_max']) out.write(outfile, compression=True, path='header', serialize_meta=False, append=True)
print("loading data table took {0} s".format(time.time() - start)) print("loading pair indices...") pairs_start = time.time() pairs_file = '../data/matched-pairs-dustin.fits' pairs = read_from_fits(pairs_file) # pairs is a global variable print("loading pairs array took {0} s".format(time.time() - pairs_start)) print("calculating chisquared...") with h5py.File('chisqs.hdf5', 'w') as f: dset = f.create_dataset('chisqs', data=np.zeros_like(pairs) - 1) #tasks = list(zip(range(len(table)), table.iterrows())) tasks = list(zip(range(10000), pairs[:10000, :])) pool = MultiPool() map_start = time.time() results = pool.map(worker, tasks, callback=callback) map_end = time.time() print("mapping took {0} s".format(map_end - map_start)) pool.close() with h5py.File('chisqs.hdf5', 'r+') as f: chisqs = np.copy(f['chisqs']) chisqs2 = calc_chisqs_for_table(table, pairs[:10000, :]) if False: # basic diagnostics print("chisqareds calculated, checking on matches...") plt.hist(chisqs[(chisqs > 0.) & (chisqs < 50.)], bins=500) plt.xlabel('$\chi^2$', fontsize=16)
def initial_sample(
    self,
    M1min=0.08,
    M2min=0.08,
    M1max=150.0,
    M2max=150.0,
    porb_lo=0.15,
    porb_hi=8.0,
    rand_seed=0,
    size=None,
    nproc=1,
    pool=None,
    mp_seeds=None,
):
    """Sample initial binary distribution according to
    `Moe & Di Stefano (2017) <http://adsabs.harvard.edu/abs/2017ApJS..230...15M>`_

    Parameters
    ----------
    M1min : `float`
        minimum primary mass to sample [Msun]
        DEFAULT: 0.08
    M2min : `float`
        minimum secondary mass to sample [Msun]
        DEFAULT: 0.08
    M1max : `float`
        maximum primary mass to sample [Msun]
        DEFAULT: 150.0
    M2max : `float`
        maximum secondary mass to sample [Msun]
        DEFAULT: 150.0
    porb_lo : `float`
        minimum orbital period to sample [log10(days)]
    porb_hi : `float`
        maximum orbital period to sample [log10(days)]
    rand_seed : int
        random seed generator
        DEFAULT: 0
    size : int, optional
        number of evolution times to sample
        NOTE: this is set in cosmic-pop call as Nstep

    Returns
    -------
    primary_mass_list : array
        array of primary masses with size=size
    secondary_mass_list : array
        array of secondary masses with size=size
    porb_list : array
        array of orbital periods in days with size=size
    ecc_list : array
        array of eccentricities with size=size
    mass_singles : `float`
        Total mass in single stars needed to generate population
    mass_binaries : `float`
        Total mass in binaries needed to generate population
    n_singles : `int`
        Number of single stars needed to generate a population
    n_binaries : `int`
        Number of binaries needed to generate a population
    binfrac_list : array
        array of binary probabilities based on primary mass and period with size=size
    """
    if pool is None:
        with MultiPool(processes=nproc) as pool:
            if mp_seeds is not None:
                if len(list(mp_seeds)) != nproc:
                    raise ValueError(
                        "Must supply a list of random seeds with length equal to number of processors"
                    )
            else:
                mp_seeds = [
                    nproc * (task._identity[0] - 1) for task in pool._pool
                ]
            inputs = [(M1min, M2min, M1max, M2max, porb_hi, porb_lo,
                       size / nproc, rand_seed + mp_seed)
                      for mp_seed in mp_seeds]
            worker = Worker()
            results = list(pool.map(worker, inputs))
    else:
        if mp_seeds is not None:
            if len(list(mp_seeds)) != nproc:
                raise ValueError(
                    "Must supply a list of random seeds with length equal to number of processors"
                )
        else:
            if isinstance(pool, MPIPool):
                mp_seeds = [nproc * (task - 1) for task in pool.workers]
            elif isinstance(pool, MultiPool):
                mp_seeds = [
                    nproc * (task._identity[0] - 1) for task in pool._pool
                ]
            else:
                mp_seeds = [0 for i in range(nproc)]
        inputs = [(M1min, M2min, M1max, M2max, porb_hi, porb_lo,
                   size / nproc, rand_seed + mp_seed)
                  for mp_seed in mp_seeds]
        worker = Worker()
        results = list(pool.map(worker, inputs))

    dat_lists = [[], [], [], [], [], [], [], [], []]
    for output_list in results:
        ii = 0
        for dat_list in output_list:
            dat_lists[ii].append(dat_list)
            ii += 1

    primary_mass_list = np.hstack(dat_lists[0])
    secondary_mass_list = np.hstack(dat_lists[1])
    porb_list = np.hstack(dat_lists[2])
    ecc_list = np.hstack(dat_lists[3])
    mass_singles = np.sum(dat_lists[4])
    mass_binaries = np.sum(dat_lists[5])
    n_singles = np.sum(dat_lists[6])
    n_binaries = np.sum(dat_lists[7])
    binfrac_list = np.hstack(dat_lists[8])

    return (primary_mass_list, secondary_mass_list, porb_list, ecc_list,
            mass_singles, mass_binaries, n_singles, n_binaries, binfrac_list)
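
# Hypothetical usage of initial_sample, assuming `sampler` is an instance of the class this
# method belongs to (the class itself is not shown here). Either let the method create its
# own MultiPool via nproc, or hand it a pool you manage; the sizes and seeds are arbitrary.
from schwimmbad import MultiPool

# let initial_sample spin up (and close) its own pool
samples = sampler.initial_sample(size=100000, nproc=4)

# or reuse an existing pool and supply one seed per process
with MultiPool(processes=4) as pool:
    samples = sampler.initial_sample(size=100000, nproc=4, pool=pool, mp_seeds=[1, 2, 3, 4])

(primary_mass, secondary_mass, porb, ecc,
 mass_singles, mass_binaries, n_singles, n_binaries, binfrac) = samples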
bounds = scipy.optimize.Bounds(
    [np.arctan(-10), -2, np.arctan(-10), -2, np.arctan(-10),
     -np.inf, 0, -np.inf, -10.],
    [np.arctan(10), np.inf, np.arctan(10), np.inf, np.arctan(10),
     np.inf, np.inf, np.inf, leopy.stats.logit(0.30)])

try:
    from schwimmbad import MultiPool
    pool = MultiPool()
    print('Parallel execution on ' + str(pool.size) + ' processes')
except ImportError as error:
    print('Serial execution as module `schwimmbad` was not found')
    pool = None

my_print_fun = auxiliary.MyPrintFun()
my_take_step = auxiliary.MyTakeStep(stepsize=1)

minimizer_options = {'disp': True, 'ftol': 1e-8}
minimizer_kwargs = {
    'method': method,
    'bounds': bounds,
    'options': minimizer_options,
    'args': (pool, )
}
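
# A generic sketch of why the pool is passed through minimizer_kwargs['args'] above: the
# objective function receives it as an extra argument and can map per-datum terms over the
# workers, falling back to the built-in map when pool is None. per_point_lnlike and
# data_points are hypothetical names, not part of leopy or scipy.
from functools import partial
import numpy as np

def negative_log_likelihood(params, pool):
    mapper = map if pool is None else pool.map
    terms = list(mapper(partial(per_point_lnlike, params), data_points))
    return -np.sum(terms)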
values = []
for x in data:
    values.append(do_the_processing(x))

# using the map() function, which applies the function (first argument) to each element
# in the iterable (second argument)
# Note: python3 map() returns an iterator, so we call list() on it to get out the values
values = list(map(do_the_processing, data))

# class-based wrapper for the built-in (serial) map() function
from schwimmbad import SerialPool
pool = SerialPool()
values = list(pool.map(do_the_processing, data))

# utilize multiple cores on the same processor
from schwimmbad import MultiPool
with MultiPool() as pool:
    values = list(pool.map(do_the_processing, data))

# when using an MPI pool, tell all worker processes to wait for tasks from the master process
import sys

def main(pool, data):
    values = list(pool.map(do_the_processing, data))
    return values

from schwimmbad import MPIPool
with MPIPool() as pool:
    if not pool.is_master():
        pool.wait()
        sys.exit(0)
    values = main(pool, data)

# selecting a pool with command-line arguments
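
# One way to do that: schwimmbad provides choose_pool(), which returns a SerialPool,
# MultiPool, or MPIPool depending on the flags, so the processing code never has to care
# which pool it got. The flag names below are just an example; per the schwimmbad docs,
# choose_pool is also expected to put MPI worker processes into their wait loop itself.
import argparse
import schwimmbad

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process data with a user-selected pool.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--ncores", dest="n_cores", default=1, type=int,
                       help="Number of processes (uses MultiPool).")
    group.add_argument("--mpi", dest="mpi", default=False, action="store_true",
                       help="Run with an MPIPool.")
    args = parser.parse_args()

    pool = schwimmbad.choose_pool(mpi=args.mpi, processes=args.n_cores)
    values = main(pool, data)  # reuses main() and data from the examples above
    pool.close()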