def test_read_p03_g09():
    """ Peroux+03 and Guimaraes+09 """
    p03 = DLASurvey.load_P03()
    assert p03.nsys == 105
    g09 = DLASurvey.load_G09()
    assert g09.nsys == 38
def test_read_hst16():
    # Statistical
    hst16 = DLASurvey.load_HST16()
    assert hst16.nsys == 4
    # All
    hst16_all = DLASurvey.load_HST16(sample='all')
    assert hst16_all.nsys == 48
def neeleman13():
    """ Build a summary file for the Neeleman+13 sample """
    prefix = "H100"
    outpath = os.getenv("DROPBOX_DIR") + "/Public/DLA/" + prefix + "/"
    dlasurvey = DLASurvey.from_flist("Lists/Neeleman13.lst", tree=os.environ.get("DLA"))
    dlasurvey.ref = "Neeleman+13"
    # Reset vlim
    for dla in dlasurvey._abs_sys:
        dla.vlim = [-1000.0, 1000.0] * u.km / u.s
    # Mask out systems with NHI = NaN (x == x is False for NaN)
    dlasurvey.mask = dlasurvey.NHI == dlasurvey.NHI
    # JSON file for ions
    dlasurvey.fill_ions(use_Nfile=True)
    mk_json_ions(dlasurvey, prefix, outpath + prefix + "_DLA_ions.json")
    # JSON files for .clm files
    mk_json_clms(dlasurvey, outpath + "CLMS/", prefix)
    print("It is likely you wish to tarball the CLMS folder for distribution")
    # JSON SYS files (preferred)
    mk_json_sys(dlasurvey, outpath, prefix)
    # Summary file and spectra
    mk_summary(dlasurvey, prefix, outpath + prefix + "_DLA.fits",
               specpath=outpath + "/Spectra/",
               htmlfil=outpath + prefix + "_DLA.html")
def main(args=None):
    import pdb
    import numpy as np
    from linetools import utils as ltu
    from pyigm.surveys.analysis import fit_atan_dla_lz, fit_fN_dblpow
    from pyigm.surveys.dlasurvey import load_dla_surveys, update_dla_fits
    from pyigm.surveys.dlasurvey import DLASurvey
    from pyigm.surveys import dlasurvey

    pargs = parser()
    # DLA l(z) analysis
    if pargs.dla_lz or pargs.all:
        # arctan fit from Prochaska & Neeleman 2017
        surveys = load_dla_surveys()
        dfits, _ = fit_atan_dla_lz(surveys, nstep=100,
                                   bootstrap=pargs.dla_lz_boot,
                                   nboot=50000, nproc=pargs.nproc,
                                   boot_out=dlasurvey.lz_boot_file)
        # Calculate error
        lz_boot = dlasurvey.load_boot_lz()
        for key in ['A', 'B', 'C']:
            boot = lz_boot[key].data
            # 68% interval
            perc = np.percentile(boot, [16., 84.])
            dfits['lz']['atan']['sig_{:s}'.format(key)] = perc - dfits['lz']['atan'][key]
        # Write
        dfits['lz']['atan']['Ref'] = 'Prochaska & Neeleman 2017'
        update_dla_fits(dfits)
    # Fit double power law to f(N) of DLAs [PW09 only]
    if pargs.dla_dpow or pargs.all:
        sdss_dr5 = DLASurvey.load_SDSS_DR5()
        dfits, best, Ndgrid, a3grid, a4grid, lik = fit_fN_dblpow(
            sdss_dr5.NHI, (-3., -1.1), (-6, -2), (21., 22.), nstep=100)
        # Write
        dfits['fN']['dpow']['Ref'] = 'PHW05'
        update_dla_fits(dfits)
    # DLA ne/nH
    if pargs.dla_nenH or pargs.all:
        dfits = {}
        dfits['nenH'] = {}
        dfits['nenH']['loglog'] = dict(bp=-2.881, m=-0.352,
                                       bp_sig=(+0.253, -0.256),
                                       m_sig=(+0.321, -0.317))  # Values with all 50 measurements
        dfits['nenH']['loglog']['Ref'] = 'Neeleman+15; PN17'
        # Update
        update_dla_fits(dfits)
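# For reference: the 'A', 'B', 'C' parameters handled above are the coefficients
# of the arctan form of l(z) fit by Prochaska & Neeleman 2017. A minimal sketch,
# assuming the parameterization used by fit_atan_dla_lz is l(z) = A + B*arctan(z - C):
import numpy as np

def atan_lz(z, A, B, C):
    """Sketch of the assumed PN17 arctan form for the DLA incidence l(z)."""
    return A + B * np.arctan(z - C)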
def test_read_h100():
    h100 = DLASurvey.load_H100()
    assert h100.nsys == 100
    SiII_clms = h100.ions((14, 2))
    gdSiII = np.where(SiII_clms['flag_N'] > 0)[0]
    assert len(gdSiII) == 98
def write_sdss_sightlines():
    """ Writes the SDSS DR5 sightlines that have no (or very few) DLAs

    Returns
    -------
    None : Writes to Dropbox
    """
    import os
    import h5py
    outfile = os.getenv('DROPBOX_DIR') + '/MachineLearning/DR5/SDSS_DR5_noDLAs.hdf5'
    # Load
    sdss = DLASurvey.load_SDSS_DR5(sample='all')
    slines, sdict = grab_sightlines(sdss, flg_bal=0)
    coords = SkyCoord(ra=slines['RA'], dec=slines['DEC'], unit='deg')
    # Load spectra -- RA/DEC in igmsp is not identical to RA_GROUP, DEC_GROUP in SDSS_DR7
    igmsp = IgmSpec()
    sdss_meta = igmsp['SDSS_DR7'].meta
    qso_coord = SkyCoord(ra=sdss_meta['RA_GROUP'], dec=sdss_meta['DEC_GROUP'], unit='deg')
    idxq, d2dq, d3dq = match_coordinates_sky(coords, qso_coord, nthneighbor=1)
    in_igmsp = d2dq < 1 * u.arcsec  # Check
    # Cut meta
    cut_meta = sdss_meta[idxq[in_igmsp]]
    assert len(slines) == len(cut_meta)
    # Grab the spectra
    spectra = igmsp['SDSS_DR7'].spec_from_meta(cut_meta)
    # Write
    hdf = h5py.File(outfile, 'w')
    spectra.write_to_hdf5(outfile, hdf5=hdf, clobber=True, fill_val=0.)
    # Add table (meta is already used)
    hdf['cut_meta'] = cut_meta
    hdf.close()
def test_sdss():
    # All
    sdss = DLASurvey.load_SDSS_DR5(sample='all')
    assert sdss.nsys == 1182
    # Stat
    sdss_stat = DLASurvey.load_SDSS_DR5()
    assert len(sdss_stat.NHI) == 737
    # Binned
    lX, lX_lo, lX_hi = sdss_stat.calculate_lox([2., 2.5, 3])
    assert np.isclose(lX[0], 0.04625038, atol=1e-5)
    fN, fN_lo, fN_hi = sdss_stat.calculate_fn([20.3, 20.5, 21., 21.5, 22.], [2, 2.5], log=True)
    assert fN.size == 4
    assert np.isclose(fN_lo[0], 0.0682087, atol=1e-5)
def test_dat_list():
    """ JXP format :: Likely to be Deprecated """
    if os.getenv('DLA') is None:
        assert True
        return
    # Load
    dlas = DLASurvey.neeleman13_tree()
    # Tests
    assert dlas.nsys == 100
def main(flg_tst, sdss=None, ml_survey=None):
    # Load JSON for DR5
    if (flg_tst % 2**1) >= 2**0:
        if sdss is None:
            sdss = DLASurvey.load_SDSS_DR5()
        #ml_survey = json_to_sdss_dlasurvey('../results/dr5_v1_predictions.json', sdss)
        ml_survey = json_to_sdss_dlasurvey('../results/dr5_v2_results.json', sdss)
    # Vette
    if (flg_tst % 2**2) >= 2**1:
        if ml_survey is None:
            sdss = DLASurvey.load_SDSS_DR5()
            ml_survey = json_to_sdss_dlasurvey('../results/dr5_v2_results.json', sdss)
        vette_dlasurvey(ml_survey, sdss)
    # Vette v5 and generate CSV
    if (flg_tst % 2**3) >= 2**2:
        if ml_survey is None:
            sdss = DLASurvey.load_SDSS_DR5()
            ml_survey = json_to_sdss_dlasurvey('../results/dr5_v5_predictions.json', sdss)
        false_neg, midx, _ = vette_dlasurvey(ml_survey, sdss)
        # CSV of false negatives
        mk_false_neg_table(false_neg, '../results/false_negative_DR5_v5.csv')
    # Vette v6 and generate CSV
    if (flg_tst % 2**4) >= 2**3:
        if ml_survey is None:
            sdss = DLASurvey.load_SDSS_DR5()
            ml_survey = json_to_sdss_dlasurvey('../results/dr5_v6.1_results.json', sdss)
        false_neg, midx, _ = vette_dlasurvey(ml_survey, sdss)
        # CSV of false negatives
        mk_false_neg_table(false_neg, '../results/false_negative_DR5_v6.1.csv')
    # Vette gensample v2
    if (flg_tst % 2**5) >= 2**4:
        if ml_survey is None:
            sdss = DLASurvey.load_SDSS_DR5()
            ml_survey = json_to_sdss_dlasurvey('../results/results_catalog_dr7_model_gensample_v2.json', sdss)
        false_neg, midx, false_pos = vette_dlasurvey(ml_survey, sdss)
        # CSVs of false negatives and false positives
        mk_false_neg_table(false_neg, '../results/false_negative_DR5_v2_gen.csv')
        mk_false_neg_table(false_pos, '../results/false_positives_DR5_v2_gen.csv')
    # Vette gensample v4.3.1
    if flg_tst & (2**5):
        if ml_survey is None:
            sdss = DLASurvey.load_SDSS_DR5()
            ml_survey = json_to_sdss_dlasurvey('../results/results_model_4.3.1_data_dr5.json', sdss)
        false_neg, midx, false_pos = vette_dlasurvey(ml_survey, sdss)
        # CSVs of false negatives and false positives
        mk_false_neg_table(false_neg, '../results/false_negative_DR5_v4.3.1_gen.csv')
        mk_false_neg_table(false_pos, '../results/false_positives_DR5_v4.3.1_gen.csv')
    if flg_tst & (2**6):
        dr5_for_david()
def fig_rhoHI(lw=1.5, csz=15., lsz=14.):
    """ Plot the HI mass density rho_HI in redshift bins from SDSS DR5

    Parameters
    ----------
    lw : float, optional
      Line width (currently unused)
    csz : float, optional
      Font size for the axes
    lsz : float, optional
      Font size for the annotations
    """
    sdss = DLASurvey.load_SDSS_DR5()
    zbins = [2.2, 2.4, 2.75, 3., 3.5, 4.5]
    rho_HI, rho_HI_low, rho_HI_hi = sdss.binned_rhoHI(zbins)
    outfile = 'fig_rhoHI.png'
    # Figure
    plt.figure(figsize=(5, 5))
    plt.clf()
    gs = gridspec.GridSpec(1, 1)
    ax = plt.subplot(gs[0])
    # Plot
    for kk in range(len(zbins)-1):
        zcen = np.sum(zbins[kk:kk+2])/2.
        yerr = np.array([rho_HI_low[kk].value/1e8, rho_HI_hi[kk].value/1e8])
        ax.errorbar([zcen], [rho_HI[kk].value/1e8], xerr=zcen-zbins[kk],
                    fmt='o', color='blue', capthick=2)
        ax.errorbar([zcen], [rho_HI[kk].value/1e8], yerr=[yerr],
                    color='blue', capthick=2)
    # z=0 value
    xmnx = [2., 4.5]
    ax.fill_between(xmnx, 0.45, 0.6, color='green', alpha=0.5)
    # Axes
    ax.set_xlim(xmnx)
    #ax.set_ylim(1e-2, 5e7)
    ax.set_ylabel(r'$\rho_{\rm HI} \; (10^8 \, \rm M_\odot \, Mpc^{-3} \, h_{72})$')
    ax.set_xlabel(r'$z$')
    ax.text(0.1, 0.9, 'SDSS-DR5 (PW09)', color='blue', size=lsz,
            transform=ax.transAxes, ha='left')
    ax.text(0.9, 0.1, 'z~0 [21cm] \n (Zwaan+05)', color='green', size=lsz,
            transform=ax.transAxes, ha='right')
    #ax.xaxis.set_major_locator(plt.MultipleLocator(10.))
    # set_spines(ax, 2.)
    set_fontsize(ax, csz)
    # Write
    plt.tight_layout(pad=0.2, h_pad=0., w_pad=0.1)
    plt.savefig(outfile, dpi=750)
    plt.close()
    print("Wrote {:s}".format(outfile))
def grab_meta():
    """ Generates the meta data needed for the IGMSpec build

    Returns
    -------
    meta : Table
      Includes a SPEC_FILE column with the spec_file names
    """
    # Load DLA survey
    from pyigm.surveys.dlasurvey import DLASurvey
    hdla100 = DLASurvey.neeleman13_tree()
    # Cut down to unique QSOs
    spec_files = []
    names = []
    ra = []
    dec = []
    coords = hdla100.coord
    cnt = 0
    for coord in coords:
        # Name
        names.append('J{:s}{:s}'.format(
            coord.ra.to_string(unit=u.hour, sep='', pad=True, precision=2),
            coord.dec.to_string(sep='', pad=True, precision=1)))
        # RA/DEC
        ra.append(coord.ra.value)
        dec.append(coord.dec.value)
        # SPEC_FILE
        fname = hdla100._abs_sys[cnt]._datdict['hi res file'].split('/')[-1]
        spec_files.append(fname)
        cnt += 1
    uni, uni_idx = np.unique(names, return_index=True)
    nqso = len(uni_idx)
    # Build the table
    meta = Table()
    meta['RA_GROUP'] = np.array(ra)[uni_idx]
    meta['DEC_GROUP'] = np.array(dec)[uni_idx]
    meta['zem_GROUP'] = hdla100.zem[uni_idx]
    meta['sig_zem'] = [0.] * nqso
    meta['flag_zem'] = [str('UNKN')] * nqso
    meta['STYPE'] = [str('QSO')] * nqso
    meta['SPEC_FILE'] = np.array(spec_files)[uni_idx]
    # Check
    assert chk_meta(meta, chk_cat_only=True)
    return meta
def dr5_for_david():
    """ Generate a Table for David """
    # imports
    from pyigm.abssys.dla import DLASystem
    from pyigm.abssys.lls import LLSSystem
    sdss_survey = DLASurvey.load_SDSS_DR5()
    # Fiber key
    for fkey in ['FIBER', 'FIBER_ID', 'FIB']:
        if fkey in sdss_survey.sightlines.keys():
            break
    # Init
    #idict = dict(plate=[], fiber=[], classification_confidence=[],  # FOR v2
    #             classification=[], ra=[], dec=[])
    # Connect to sightlines
    s_coord = SkyCoord(ra=sdss_survey.sightlines['RA'],
                       dec=sdss_survey.sightlines['DEC'], unit='deg')
    # Add plate/fiber to statistical DLAs
    dla_coord = sdss_survey.coord
    idx2, d2d, d3d = match_coordinates_sky(dla_coord, s_coord, nthneighbor=1)
    if np.min(d2d.to('arcsec').value) > 1.:
        raise ValueError("Bad match to sightlines")
    plates, fibers = [], []
    for jj, igd in enumerate(np.where(sdss_survey.mask)[0]):
        dla = sdss_survey._abs_sys[igd]
        try:
            dla.plate = sdss_survey.sightlines['PLATE'][idx2[jj]]
        except IndexError:
            pdb.set_trace()
        dla.fiber = sdss_survey.sightlines[fkey][idx2[jj]]
        plates.append(sdss_survey.sightlines['PLATE'][idx2[jj]])
        fibers.append(sdss_survey.sightlines[fkey][idx2[jj]])
    # Write DLA table
    dtbl = Table()
    dtbl['plate'] = plates
    dtbl['fiber'] = fibers
    dtbl['zabs'] = sdss_survey.zabs
    dtbl['NHI'] = sdss_survey.NHI
    dtbl.write('results/dr5_for_david.ascii', format='ascii')
    # Write sightline info
    stbl = sdss_survey.sightlines[['PLATE', 'FIB', 'Z_START', 'Z_END', 'RA', 'DEC']]
    gdsl = stbl['Z_END'] > stbl['Z_START']
    stbl[gdsl].write('results/dr5_sightlines_for_david.ascii', format='ascii')
def test_init():
    dlas = DLASurvey(ref='null')
    assert dlas.abs_type == 'DLA'
    coord = SkyCoord(ra=123.1143, dec=-12.4321, unit='deg')
    dlasys = DLASystem(coord, 1.244, [-300, 300.] * u.km / u.s, 20.4)
    dlasys.name = 'Sys1'
    #
    coord2 = SkyCoord(ra=223.1143, dec=42.4321, unit='deg')
    dlasys2 = DLASystem(coord2, 1.744, [-300, 300.] * u.km / u.s, 21.7)
    dlasys2.name = 'Sys2'
    # Add systems
    dlas.add_abs_sys(dlasys)
    dlas.add_abs_sys(dlasys2)
    assert dlas.nsys == 2
def test_dla_fitted():
    dlas = DLASurvey(ref='null')
    # f(N) double power law
    fN = dlas.fitted_fN(21.)
    assert isinstance(fN, float)
    assert np.isclose(fN, 12.661299335610309)
    fN = dlas.fitted_fN(np.arange(20.3, 21.3, 0.1))
    assert isinstance(fN, np.ndarray)
    # l(z)
    lz = dlas.fitted_lz(1.)
    assert isinstance(lz, float)
    assert np.isclose(lz, 0.054821907396422453)
    # Error
    lz, sig_lz = dlas.fitted_lz(1., boot_error=True)
    assert sig_lz.shape == (1, 2)
    # ne/nH
    nenH = dlas.fitted_nenH(21.)
    assert isinstance(nenH, float)
    assert np.isclose(nenH, -3.12739999999999999)
def main(args=None):
    pargs = parser(options=args)
    # Setup
    import sys
    pfind = __file__.rfind('/scripts')
    spth = __file__[:pfind] + '/src'
    sys.path.append(spth)
    import training_set as tset
    from pyigm.surveys.dlasurvey import DLASurvey
    outroot = pargs.outpath + '/training_{:d}_{:d}'.format(pargs.seed, pargs.ntrain)
    # Sightlines
    sdss = DLASurvey.load_SDSS_DR5(sample='all')
    slines, sdict = tset.grab_sightlines(sdss, flg_bal=0)
    # Run
    _, _ = tset.make_set(pargs.ntrain, slines, outroot=outroot,
                         seed=pargs.seed, slls=pargs.slls)
def grab_sightlines(dlasurvey=None, flg_bal=None, zmin=2.3, s2n=5., DX=0.,
                    igmsp_survey='SDSS_DR7', update_zem=True):
    """ Grab a set of sightlines without DLAs from a DLA survey

    Insist that all have spectra in igmspec
    Update sightline zem with igmspec zem

    Parameters
    ----------
    dlasurvey : DLASurvey, optional
      Usually SDSS or BOSS; defaults to the SDSS DR5 'all' sample
    flg_bal : int, optional
      Maximum BAL flag (0=No signature, 1=Weak BAL, 2=BAL)
    zmin : float, optional
      Minimum redshift for zem
    s2n : float, optional
      Minimum S/N as defined in some manner
    DX : float, optional
      Restrict on DX
    igmsp_survey : str, optional
      Group in igmspec to match against
    update_zem : bool, optional
      Update zem in sightlines?

    Returns
    -------
    final : Table
      astropy Table of good sightlines
    sdict : dict
      dict describing the sightlines
    """
    # 1) REMOVE 910, 526 z=2.88; NHI=21.19
    import warnings
    warnings.warn("Someday remove 910, 526 which has a *strong* DLA")
    igmsp = IgmSpec()
    # Init
    if dlasurvey is None:
        print("Using the DR5 sample for the sightlines")
        dlasurvey = DLASurvey.load_SDSS_DR5(sample='all')
        igmsp_survey = 'SDSS_DR7'
    nsight = len(dlasurvey.sightlines)
    keep = np.array([True] * nsight)
    meta = igmsp[igmsp_survey].meta

    # Avoid DLAs
    dla_coord = dlasurvey.coord
    sl_coord = SkyCoord(ra=dlasurvey.sightlines['RA'], dec=dlasurvey.sightlines['DEC'])
    idx, d2d, d3d = match_coordinates_sky(sl_coord, dla_coord, nthneighbor=1)
    clear = d2d > 1 * u.arcsec
    keep = keep & clear

    # BAL
    if flg_bal is not None:
        gd_bal = dlasurvey.sightlines['FLG_BAL'] <= flg_bal
        keep = keep & gd_bal

    # S/N
    if s2n > 0.:
        gd_s2n = dlasurvey.sightlines['S2N'] > s2n
        keep = keep & gd_s2n

    # Cut on DX
    if DX > 0.:
        gd_DX = dlasurvey.sightlines['DX'] > DX
        keep = keep & gd_DX

    # Require a match in igmsp
    qso_coord = SkyCoord(ra=meta['RA_GROUP'], dec=meta['DEC_GROUP'], unit='deg')
    idxq, d2dq, d3dq = match_coordinates_sky(sl_coord, qso_coord, nthneighbor=1)
    in_igmsp = d2dq < 1 * u.arcsec
    keep = keep & in_igmsp

    # Check zem and dz
    zem = meta['zem_GROUP'][idxq]
    dz = np.abs(zem - dlasurvey.sightlines['ZEM'])
    gd_dz = dz < 0.1
    keep = keep & gd_dz
    if zmin is not None:
        gd_zmin = zem > zmin
        keep = keep & gd_zmin

    # Assess
    final = dlasurvey.sightlines[keep]
    sdict = {}
    sdict['n'] = len(final)
    print("We have {:d} sightlines for analysis".format(sdict['n']))

    def qck_stats(idict, tbl, istr, key):
        idict[istr + 'min'] = np.min(tbl[key])
        idict[istr + 'max'] = np.max(tbl[key])
        idict[istr + 'median'] = np.median(tbl[key])
    qck_stats(sdict, final, 'z', 'ZEM')
    qck_stats(sdict, final, 'i', 'MAG')
    print("Min z = {:g}, Median z = {:g}, Max z = {:g}".format(
        sdict['zmin'], sdict['zmedian'], sdict['zmax']))
    # Return
    return final, sdict
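# Minimal usage sketch for grab_sightlines(), mirroring how the other scripts in
# this section call it; it assumes pyigm and a local igmspec database are available.
sdss = DLASurvey.load_SDSS_DR5(sample='all')      # full (non-statistical) sample
slines, sdict = grab_sightlines(sdss, flg_bal=0)  # reject any BAL signature
print(sdict['n'], sdict['zmedian'])               # sightlines kept and median zem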
def test_read_h100_nosys():
    h100 = DLASurvey.load_H100(load_sys=False)
    assert h100.nsys == 100
def main(flg_tst, sdss=None, ml_survey=None):
    import os
    # Sightlines
    flg_tst = int(flg_tst)
    if (flg_tst % 2**1) >= 2**0:
        if sdss is None:
            sdss = DLASurvey.load_SDSS_DR5(sample='all')
        slines, sdict = grab_sightlines(sdss, flg_bal=0)
    # Test case of 100 sightlines
    if (flg_tst % 2**2) >= 2**1:
        # Make training set
        _, _ = make_set(100, slines, outroot='results/training_100')
    # Production runs
    if (flg_tst % 2**3) >= 2**2:
        #training_prod(123456, 5, 10, outpath=os.getenv('DROPBOX_DIR')+'/MachineLearning/DLAs/')  # TEST
        #training_prod(123456, 10, 500, outpath=os.getenv('DROPBOX_DIR')+'/MachineLearning/DLAs/')  # TEST
        training_prod(12345, 10, 5000,
                      outpath=os.getenv('DROPBOX_DIR') + '/MachineLearning/DLAs/')
    # Production runs -- 100k more
    if (flg_tst % 2**4) >= 2**3:
        # python src/training_set.py
        training_prod(22345, 10, 10000,
                      outpath=os.getenv('DROPBOX_DIR') + '/MachineLearning/DLAs/')
    # SLLS training sets
    if flg_tst & (2**4):
        # python src/training_set.py
        if False:
            if sdss is None:
                sdss = DLASurvey.load_SDSS_DR5(sample='all')
            slines, sdict = grab_sightlines(sdss, flg_bal=0)
            _, _ = make_set(100, slines, outroot='results/slls_training_100', slls=True)
        #training_prod(22343, 10, 100, slls=True, outpath=os.getenv('DROPBOX_DIR')+'/MachineLearning/SLLSs/')
        training_prod(22343, 10, 5000, slls=True,
                      outpath=os.getenv('DROPBOX_DIR') + '/MachineLearning/SLLSs/')
    # Mixed systems for testing
    if flg_tst & (2**5):
        # python src/training_set.py
        if sdss is None:
            sdss = DLASurvey.load_SDSS_DR5(sample='all')
        slines, sdict = grab_sightlines(sdss, flg_bal=0)
        ntrials = 10000
        seed = 23559
        _, _ = make_set(ntrials, slines, seed=seed, mix=True,
                        outroot=os.getenv('DROPBOX_DIR') +
                        '/MachineLearning/Mix/mix_test_{:d}_{:d}'.format(seed, ntrials))
    # DR5 DLA-free sightlines
    if flg_tst & (2**6):
        write_sdss_sightlines()
    # High NHI systems for testing
    if flg_tst & (2**7):
        # python src/training_set.py
        if sdss is None:
            sdss = DLASurvey.load_SDSS_DR5(sample='all')
        slines, sdict = grab_sightlines(sdss, flg_bal=0)
        ntrials = 20000
        seed = 83559
        _, _ = make_set(ntrials, slines, seed=seed, high=True,
                        outroot=os.getenv('DROPBOX_DIR') +
                        '/MachineLearning/HighNHI/high_train_{:d}_{:d}'.format(seed, ntrials))
    # Low S/N
    if flg_tst & (2**8):
        # python src/training_set.py
        if sdss is None:
            sdss = DLASurvey.load_SDSS_DR5(sample='all')
        slines, sdict = grab_sightlines(sdss, flg_bal=0)
        ntrials = 10000
        seed = 83557
        _, _ = make_set(ntrials, slines, seed=seed, low_s2n=True,
                        outroot=os.getenv('DROPBOX_DIR') +
                        '/MachineLearning/LowS2N/lows2n_train_{:d}_{:d}'.format(seed, ntrials))
def test_read_HST():
    """ Neeleman+16 """
    hst16 = DLASurvey.load_HST16()
    assert hst16.nsys == 4
def dr5_false_positives(ml_dlasurvey=None, ml_llssurvey=None):
    vette_file = 'vette_dr5.json'
    from pyigm.surveys.dlasurvey import DLASurvey
    from matplotlib import pyplot as plt
    # Load ML
    if ml_dlasurvey is None:
        _, ml_dlasurvey = load_ml_dr7()
    # Load DR5
    dr5 = DLASurvey.load_SDSS_DR5()  # This is the statistical sample
    # Vette
    vette = ltu.loadjson(vette_file)
    dr5_ml_idx = np.array(vette['dr5_idx'])

    # Use coords to efficiently deal with sightlines
    ml_dla_coord = ml_dlasurvey.coords
    dr5_coord = SkyCoord(ra=dr5.sightlines['RA'], dec=dr5.sightlines['DEC'], unit='deg')
    idx, d2d, d3d = match_coordinates_sky(ml_dla_coord, dr5_coord, nthneighbor=1)
    in_dr5 = d2d < 2*u.arcsec
    print("{:d} of the ML DLAs were in the DR5 sightlines".format(np.sum(in_dr5)))

    # False positives
    fpos = np.array([True]*ml_dlasurvey.nsys)
    fpos[~in_dr5] = False
    # Remove the matched systems
    imatched = np.where(dr5_ml_idx >= 0)[0]
    match_val = dr5_ml_idx[imatched]
    fpos[match_val] = False
    print("There are {:d} total false positives".format(np.sum(fpos)))
    # This nearly matches David's.  Will run with his analysis.

    fpos_in_stat = fpos.copy()
    # Restrict to the DR5 analysis region
    plates = ml_dlasurvey.plate
    fibers = ml_dlasurvey.fiber
    zabs = ml_dlasurvey.zabs
    zem = ml_dlasurvey.zem
    for idx in np.where(fpos_in_stat)[0]:
        # Match to the DR5 sightline
        dr5_sl = np.where((dr5.sightlines['PLATE'] == plates[idx]) &
                          (dr5.sightlines['FIB'] == fibers[idx]))[0][0]
        if (zabs[idx] >= dr5.sightlines['Z_START'][dr5_sl]) & \
                (zabs[idx] <= dr5.sightlines['Z_END'][dr5_sl]):
            pass
        else:
            fpos_in_stat[idx] = False
    print("Number of FP in DR5 analysis region = {:d}".format(np.sum(fpos_in_stat)))
    print("Number with NHI<20.45 = {:d}".format(np.sum(ml_dlasurvey.NHI[fpos_in_stat] < 20.45)))

    # High NHI
    highNHI = ml_dlasurvey.NHI[fpos_in_stat] > 21.
    htbl = Table()
    htbl['PLATE'] = plates[fpos_in_stat][highNHI]
    htbl['FIBER'] = fibers[fpos_in_stat][highNHI]
    htbl['zabs'] = zabs[fpos_in_stat][highNHI]
    htbl['NHI'] = ml_dlasurvey.NHI[fpos_in_stat][highNHI]
    htbl.write("FP_DR5_highNHI.ascii", format='ascii.fixed_width', overwrite=True)

    # Medium NHI
    medNHI = (ml_dlasurvey.NHI[fpos_in_stat] > 20.6) & (ml_dlasurvey.NHI[fpos_in_stat] < 21)
    mtbl = Table()
    mtbl['PLATE'] = plates[fpos_in_stat][medNHI]
    mtbl['FIBER'] = fibers[fpos_in_stat][medNHI]
    mtbl['zabs'] = zabs[fpos_in_stat][medNHI]
    mtbl['zem'] = zem[fpos_in_stat][medNHI]
    mtbl['NHI'] = ml_dlasurvey.NHI[fpos_in_stat][medNHI]
    mtbl.write("FP_DR5_medNHI.ascii", format='ascii.fixed_width', overwrite=True)
def chk_dr5_dla_to_ml(ml_dlasurvey=None, ml_llssurvey=None, dz_toler=0.015,
                      outfile='vette_dr5.json', write_again=True):
    # Load ML
    if (ml_dlasurvey is None) or (ml_llssurvey is None):
        ml_llssurvey, ml_dlasurvey = load_ml_dr7()
    # Load DR5
    dr5 = DLASurvey.load_SDSS_DR5()  # This is the statistical sample

    # Use coords to efficiently deal with sightlines
    ml_coord = SkyCoord(ra=ml_dlasurvey.sightlines['RA'],
                        dec=ml_dlasurvey.sightlines['DEC'], unit='deg')
    dr5_coord = SkyCoord(ra=dr5.sightlines['RA'], dec=dr5.sightlines['DEC'], unit='deg')
    idx, d2d, d3d = match_coordinates_sky(dr5_coord, ml_coord, nthneighbor=1)
    in_ml = d2d < 2*u.arcsec
    print("{:d} of the DR5 sightlines were covered by ML out of {:d}".format(
        np.sum(in_ml), len(dr5.sightlines)))  # 7477 sightlines out of 7482

    # Cut down
    dr5.sightlines = dr5.sightlines[in_ml]
    new_mask = dla_stat(dr5, dr5.sightlines)  # 737 good DLAs
    dr5.mask = new_mask
    dr5_dla_coord = dr5.coord
    dr5_dla_zabs = dr5.zabs
    ndr5 = len(dr5_dla_coord)

    ml_dla_coord = ml_dlasurvey.coords
    ml_lls_coord = ml_llssurvey.coords

    # Loop on DR5 DLAs and save indices of the matches
    dr5_ml_idx = np.zeros(ndr5).astype(int) - 1
    for ii in range(ndr5):
        # Match to ML DLAs
        dla_mts = np.where(dr5_dla_coord[ii].separation(ml_dla_coord) < 2*u.arcsec)[0]
        nmt = len(dla_mts)
        if nmt == 0:  # No DLA match; check for an SLLS match
            lls_mts = np.where(dr5_dla_coord[ii].separation(ml_lls_coord) < 2*u.arcsec)[0]
            nmt2 = len(lls_mts)
            if nmt2 == 0:  # No match
                pass
            else:
                zML = ml_llssurvey.zabs[lls_mts]  # Redshifts of all SLLS on the sightline in ML
                zdiff = np.abs(dr5_dla_zabs[ii] - zML)
                if np.min(zdiff) < dz_toler:
                    dr5_ml_idx[ii] = -9  # SLLS match
        else:
            zML = ml_dlasurvey.zabs[dla_mts]  # Redshifts of all DLAs on the sightline in ML
            zdiff = np.abs(dr5_dla_zabs[ii] - zML)
            if np.min(zdiff) < dz_toler:
                # Match
                imin = np.argmin(zdiff)
                dr5_ml_idx[ii] = dla_mts[imin]
            else:  # Check for an SLLS match instead
                lls_mts = np.where(dr5_dla_coord[ii].separation(ml_lls_coord) < 2*u.arcsec)[0]
                nmt2 = len(lls_mts)
                if nmt2 == 0:  # No match
                    pass
                else:
                    zML = ml_llssurvey.zabs[lls_mts]
                    zdiff = np.abs(dr5_dla_zabs[ii] - zML)
                    if np.min(zdiff) < dz_toler:
                        dr5_ml_idx[ii] = -9  # SLLS match

    # Re-generate the sightline coords after the cut above
    dr5_coord = SkyCoord(ra=dr5.sightlines['RA'], dec=dr5.sightlines['DEC'], unit='deg')

    # Write out misses
    misses = np.where(dr5_ml_idx == -1)[0]
    plates, fibers = [], []
    for miss in misses:
        imin = np.argmin(dr5_dla_coord[miss].separation(dr5_coord))
        plates.append(dr5.sightlines['PLATE'][imin])
        fibers.append(dr5.sightlines['FIB'][imin])
    mtbl = Table()
    mtbl['PLATE'] = plates
    mtbl['FIBER'] = fibers
    mtbl['NHI'] = dr5.NHI[misses]
    mtbl['zabs'] = dr5.zabs[misses]
    if write_again:
        mtbl.write('DR5_misses.ascii', format='ascii.fixed_width', overwrite=True)

    # Write out SLLS matches
    sllss = np.where(dr5_ml_idx == -9)[0]
    plates, fibers = [], []
    for slls in sllss:
        imin = np.argmin(dr5_dla_coord[slls].separation(dr5_coord))
        plates.append(dr5.sightlines['PLATE'][imin])
        fibers.append(dr5.sightlines['FIB'][imin])
    mtbl = Table()
    mtbl['PLATE'] = plates
    mtbl['FIBER'] = fibers
    mtbl['NHI'] = dr5.NHI[sllss]
    mtbl['zabs'] = dr5.zabs[sllss]
    if write_again:
        mtbl.write('DR5_SLLS.ascii', format='ascii.fixed_width', overwrite=True)

    # ML systems not matched by PW09?
    ml_dla_coords = ml_dlasurvey.coords
    idx2, d2d2, d3d = match_coordinates_sky(ml_dla_coords, dr5_dla_coord, nthneighbor=1)
    not_in_dr5 = d2d2 > 2*u.arcsec  # This doesn't match redshifts!
    might_be_in_dr5 = np.where(~not_in_dr5)[0]
    others_not_in = []  # This is some painful book-keeping
    for idx in might_be_in_dr5:  # Check the redshifts
        imt = ml_dla_coord[idx].separation(dr5_dla_coord) < 2*u.arcsec
        # Match on dz_toler
        if np.min(np.abs(ml_dlasurvey.zabs[idx] - dr5.zabs[imt])) > dz_toler:
            others_not_in.append(idx)

    # Save
    out_dict = {}
    out_dict['in_ml'] = in_ml
    out_dict['dr5_idx'] = dr5_ml_idx  # -1 are misses, -9 are SLLS
    out_dict['not_in_dr5'] = np.concatenate([np.where(not_in_dr5)[0],
                                             np.array(others_not_in)])
    ltu.savejson(outfile, ltu.jsonify(out_dict), overwrite=True)
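# Likely end-to-end order for the two vetting routines above, inferred from the
# defaults: chk_dr5_dla_to_ml() writes 'vette_dr5.json', which dr5_false_positives()
# then reads.
ml_llssurvey, ml_dlasurvey = load_ml_dr7()      # the CNN catalogs
chk_dr5_dla_to_ml(ml_dlasurvey=ml_dlasurvey, ml_llssurvey=ml_llssurvey)
dr5_false_positives(ml_dlasurvey=ml_dlasurvey)  # writes the FP_DR5_*.ascii tables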
def test_read_xq100():
    """ XQ-100 """
    xq100 = DLASurvey.load_XQ100(sample='stat')
    assert xq100.nsys == 36
def load_ml_file(pred_file):
    """ Load the search results from the CNN into a DLASurvey object

    Parameters
    ----------
    pred_file : str
      JSON file of CNN predictions

    Returns
    -------
    ml_llssurvey : LLSSurvey
    ml_dlasurvey : DLASurvey
    """
    print("Loading {:s}.  Please be patient..".format(pred_file))
    # Read
    ml_results = ltu.loadjson(pred_file)
    use_platef = False
    use_id = False  # guard so the checks below cannot hit an undefined name
    if 'plate' in ml_results[0].keys():
        use_platef = True
    elif 'id' in ml_results[0].keys():
        use_id = True
    # Init
    idict = dict(ra=[], dec=[], plate=[], fiber=[])
    if use_platef:
        for key in ['plate', 'fiber', 'mjd']:
            idict[key] = []
    dlasystems = []
    llssystems = []

    # Generate coords to speed things up
    for obj in ml_results:
        for key in ['ra', 'dec']:
            idict[key].append(obj[key])
    ml_coords = SkyCoord(ra=idict['ra'], dec=idict['dec'], unit='deg')
    ra_names = ml_coords.icrs.ra.to_string(unit=u.hour, sep='', pad=True)
    dec_names = ml_coords.icrs.dec.to_string(sep='', pad=True, alwayssign=True)
    vlim = [-500., 500.] * u.km / u.s
    dcoord = SkyCoord(ra=0., dec=0., unit='deg')

    # Loop on the list
    didx, lidx = [], []
    print("Looping on sightlines..")
    for tt, obj in enumerate(ml_results):
        # Sightline
        if use_id:
            plate, fiber = [int(spl) for spl in obj['id'].split('-')]
            idict['plate'].append(plate)
            idict['fiber'].append(fiber)
        # Systems
        for ss, syskey in enumerate(['dlas', 'subdlas']):
            for idla in obj[syskey]:
                name = 'J{:s}{:s}_z{:.3f}'.format(ra_names[tt], dec_names[tt], idla['z_dla'])
                if ss == 0:
                    isys = DLASystem(dcoord, idla['z_dla'], vlim,
                                     NHI=idla['column_density'], zem=obj['z_qso'], name=name)
                else:
                    isys = LLSSystem(dcoord, idla['z_dla'], vlim,
                                     NHI=idla['column_density'], zem=obj['z_qso'], name=name)
                isys.confidence = idla['dla_confidence']
                isys.s2n = idla['s2n']
                if use_platef:
                    isys.plate = obj['plate']
                    isys.fiber = obj['fiber']
                elif use_id:
                    isys.plate = plate
                    isys.fiber = fiber
                # Save
                if ss == 0:
                    didx.append(tt)
                    dlasystems.append(isys)
                else:
                    lidx.append(tt)
                    llssystems.append(isys)

    # Generate the sightline table
    sightlines = Table()
    sightlines['RA'] = idict['ra']
    sightlines['DEC'] = idict['dec']
    sightlines['PLATE'] = idict['plate']
    sightlines['FIBERID'] = idict['fiber']

    # Surveys
    ml_llssurvey = LLSSurvey()
    ml_llssurvey.sightlines = sightlines.copy()
    ml_llssurvey._abs_sys = llssystems
    ml_llssurvey.coords = ml_coords[np.array(lidx)]

    ml_dlasurvey = DLASurvey()
    ml_dlasurvey.sightlines = sightlines.copy()
    ml_dlasurvey._abs_sys = dlasystems
    ml_dlasurvey.coords = ml_coords[np.array(didx)]
    # Return
    return ml_llssurvey, ml_dlasurvey
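# Minimal usage sketch for load_ml_file(); the JSON path below is illustrative only
# (any CNN prediction file with the fields read above would work).
ml_llssurvey, ml_dlasurvey = load_ml_file('../results/dr5_v2_results.json')
print(ml_dlasurvey.nsys, ml_llssurvey.nsys)  # counts of CNN DLAs and sub-DLAs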
def examine_false_pos(test_file='data/test_dlas_96629_10000.json.gz',
                      pred_file='data/test_dlas_96629_predictions.json.gz',
                      vette_file='vette_10k.json'):
    """ Examine false positives in the Test set (held out) """
    from pyigm.surveys.dlasurvey import DLASurvey
    import h5py
    import json
    from matplotlib import pyplot as plt
    # Load Test
    test_dlas = test_to_tbl(test_file)
    ntest = len(test_dlas)
    # Load hdf5
    CNN_result_path = '/home/xavier/Projects/ML_DLA_results/CNN/'
    hdf5_datafile = CNN_result_path + 'gensample_hdf5_files/test_dlas_96629_10000.hdf5'
    hdf = h5py.File(hdf5_datafile, 'r')
    headers = json.loads(hdf['meta'].value)['headers']
    # Load ML
    ml_abs = pred_to_tbl(pred_file)
    # Vette
    vette = ltu.loadjson(vette_file)
    test_ml_idx = np.array(vette['test_idx'])
    # Load DR5
    dr5 = DLASurvey.load_SDSS_DR5()
    all_dr5 = DLASurvey.load_SDSS_DR5(sample='all_sys')

    # False positives
    fpos = ml_abs['NHI'] >= 20.3  # Must be a DLA
    imatched = np.where(test_ml_idx >= 0)[0]
    match_val = test_ml_idx[imatched]
    fpos[match_val] = False
    print("There are {:d} total false positives".format(np.sum(fpos)))
    # This nearly matches David's.  Will run with his analysis.

    fpos_in_dr5 = fpos.copy()
    # Restrict to the DR5 analysis region
    for idx in np.where(fpos_in_dr5)[0]:
        # Convoluted indexing..
        mlid = ml_abs['ids'][idx]
        # Plate/Fiber
        plate = headers[mlid]['PLATE']
        fib = headers[mlid]['FIBER']
        # Finally, match to DR5
        dr5_sl = np.where((dr5.sightlines['PLATE'] == plate) &
                          (dr5.sightlines['FIB'] == fib))[0][0]
        if (ml_abs['zabs'][idx] >= dr5.sightlines['Z_START'][dr5_sl]) & \
                (ml_abs['zabs'][idx] <= dr5.sightlines['Z_END'][dr5_sl]):
            pass
        else:
            fpos_in_dr5[idx] = False
    print("Number of FP in DR5 analysis region = {:d}".format(np.sum(fpos_in_dr5)))

    # How many match to DR5 SLLS?
    slls = all_dr5.NHI < 20.3
    slls_coord = all_dr5.coord[slls]
    slls_zabs = all_dr5.zabs[slls]
    nslls = 0
    for idx in np.where(fpos_in_dr5)[0]:
        # Convoluted indexing..
        mlid = ml_abs['ids'][idx]
        # RA/DEC
        ra = headers[mlid]['RA_GROUP']
        dec = headers[mlid]['DEC_GROUP']
        coord = SkyCoord(ra=ra, dec=dec, unit='deg')
        # Match on coordinates
        mt = coord.separation(slls_coord) < 3*u.arcsec
        if np.any(mt):
            # Match on redshift
            if np.min(np.abs(slls_zabs[mt] - ml_abs['zabs'][idx])) < 0.015:
                nslls += 1
    print("Number of FP that are SLLS in DR5 = {:d}".format(nslls))

    low_NHI = ml_abs['NHI'][fpos_in_dr5] < 20.5
    print("Number of FP that are NHI <= 20.5 = {:d}".format(np.sum(low_NHI)))

    # Write out
    fp_tbl = Table()
    for key in ['ids', 'NHI', 'zabs', 'conf']:
        fp_tbl[key] = ml_abs[key][fpos_in_dr5]
    fp_tbl.write('test10k_false_pos.ascii', format='ascii.fixed_width', overwrite=True)

    # Histogram of confidences
    dr5_idx = np.where(fpos_in_dr5)
    plt.clf()
    ax = plt.gca()
    ax.hist(ml_abs['conf'][dr5_idx])
    plt.show()