def list_bricks(ns):
    """Enumerate Tractor brick files and attach a region to each one.

    Parameters
    ----------
    ns : namespace
        Parsed command-line options.  The attributes read here are
        ``filelist``, ``src``, ``verbose``, ``bricksdesc``, ``bricklist``
        and ``numproc``.

    Returns
    -------
    list
        One ``(brickname, filename, region)`` tuple per brick, where
        ``region`` is whatever ``read_region`` returns for that brick.

    Raises
    ------
    KeyError
        If ``ns.bricklist`` names a brick that was not enumerated.
    """
    t0 = time()

    if ns.filelist is not None:
        # Use a context manager so the listing file is closed promptly.
        with open(ns.filelist, 'r') as listing:
            stripped = [line.strip() for line in listing]
        # Blank lines (e.g. a trailing newline) carry no filename; skip them.
        d = dict((parse_filename(fn), fn) for fn in stripped if fn)
    else:
        d = dict(iter_tractor(ns.src))

    if ns.verbose:
        print('enumerated %d bricks in %g seconds' % (len(d), time() - t0))

    if ns.bricksdesc is not None:
        bricksdesc = fitsio.read(ns.bricksdesc, 1, upper=True)
        # ADM convert from bytes_ to str_ type if fitsio version < 1.
        if bricksdesc["BRICKNAME"].dtype.type == np.bytes_:
            bricksdesc = {item['BRICKNAME'].decode(): item
                          for item in bricksdesc}
        else:
            bricksdesc = {item['BRICKNAME']: item for item in bricksdesc}
    else:
        bricksdesc = None

    #- Load list of bricknames to use
    if ns.bricklist is not None:
        # atleast_1d: loadtxt yields a 0-d array for a one-line file, which
        # cannot be iterated.
        bricklist = np.atleast_1d(np.loadtxt(ns.bricklist, dtype='S8'))
        # TODO: skip unknown bricks?
        selected = {}
        for raw in bricklist:
            name = raw.decode()
            # loadtxt hands back bytes; prefer the decoded name for the
            # lookup and fall back to the raw value for bytes-keyed dicts.
            selected[name] = d[name] if name in d else d[raw]
        d = selected

    t0 = time()

    with sharedmem.MapReduce(np=ns.numproc) as pool:
        chunksize = 1024
        keys = list(d.keys())

        def work(i):
            # Each task resolves the regions for one slice of brick names.
            return [
                (brickname, d[brickname],
                 read_region(brickname, d[brickname], bricksdesc))
                for brickname in keys[i:i + chunksize]
            ]

        chunks = pool.map(work, range(0, len(keys), chunksize))
        # Flatten in one pass; sum(chunks, []) re-copies the list each step.
        bricks = [brick for chunk in chunks for brick in chunk]

    if ns.verbose:
        print('read regions of %d bricks in %g seconds'
              % (len(bricks), time() - t0))

    return bricks
def make_sweep(sweep, bricks, ns):
    """Collect the primary objects of all bricks that fall inside one sweep.

    Parameters
    ----------
    sweep : sequence
        ``(ra1, dec1, ra2, dec2)``; rows are kept when
        ``ra1 <= RA < ra2`` and ``dec1 <= DEC < dec2``.
    bricks : iterable
        Entries are unpacked into ``(brickname, filename, region)``.
    ns : namespace
        Accepted for interface compatibility; not read here.

    Returns
    -------
    tuple
        ``(data, neff)``: the concatenated ``SWEEP_DTYPE`` rows and the
        number of bricks whose region intersected the sweep.
    """
    ra_lo, dec_lo, ra_hi, dec_hi = sweep
    # Seed with an empty array so the concatenate below never sees [].
    collected = [np.empty(0, dtype=SWEEP_DTYPE)]

    with sharedmem.MapReduce(np=0) as pool:

        def extract(brickname, filename, region):
            # Bricks outside the sweep are never read from disk.
            if not intersect(sweep, region):
                return None

            catalog = fitsio.read(filename, 1, upper=True)
            catalog = catalog[catalog['BRICK_PRIMARY'] != 0]

            inside = catalog['RA'] >= ra_lo
            inside &= catalog['RA'] < ra_hi
            inside &= catalog['DEC'] >= dec_lo
            inside &= catalog['DEC'] < dec_hi
            catalog = catalog[inside]

            out = np.empty(len(catalog), dtype=SWEEP_DTYPE)
            present = catalog.dtype.names
            for colname in out.dtype.names:
                # Columns absent from the file are left unfilled.
                if colname in present:
                    try:
                        out[colname][...] = catalog[colname][...]
                    except ValueError:
                        print('failed on column `%s`' % colname)
                        raise
            return out

        def gather(chunk):
            if chunk is None:
                return
            collected.append(chunk)

        pool.map(extract, bricks, star=True, reduce=gather)

    # The seed array is not a brick, hence the -1.
    neff = len(collected) - 1
    return np.concatenate(collected, axis=0), neff
def make_sweep(sweep, bricks, ns):
    """Collect primary objects and a merged header for one sweep.

    Parameters
    ----------
    sweep : sequence
        ``(ra1, dec1, ra2, dec2)``; rows are kept when
        ``ra1 <= RA < ra2`` and ``dec1 <= DEC < dec2``.
    bricks : iterable
        Entries are unpacked into ``(brickname, filename, region)``.
    ns : namespace
        Only ``ns.ignore_errors`` is read: when true, unreadable files are
        reported and skipped and the dtype check is bypassed.

    Returns
    -------
    tuple
        ``(data, header, neff)``: the concatenated ``SWEEP_DTYPE`` rows, the
        header keys whose value is identical in every contributing file,
        and the number of bricks that contributed a chunk.

    Raises
    ------
    ValueError
        If a column's dtype differs from ``SWEEP_DTYPE`` (only when
        ``ns.ignore_errors`` is false), or if a column cannot be copied.
    """
    data = [np.empty(0, dtype=SWEEP_DTYPE)]
    header = {}
    ra1, dec1, ra2, dec2 = sweep

    def merge_header(header, header2):
        # Keys whose values disagree between files are marked NA and are
        # dropped from the final header.
        for key, value in header2.items():
            if key not in header:
                header[key] = value
            elif header[key] is not NA and header[key] != value:
                header[key] = NA

    with sharedmem.MapReduce(np=0) as pool:

        def filter(brickname, filename, region):
            if not intersect(sweep, region):
                return None, None

            try:
                objects = fitsio.read(filename, 1, upper=True)
                chunkheader = fitsio.read_header(filename, 0, upper=True)
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit are never silently skipped.
                if ns.ignore_errors:
                    print('IO error on %s' % filename)
                    return None, None
                raise

            # ADM check all the column dtypes match.
            if not ns.ignore_errors:
                sflds = SWEEP_DTYPE.fields
                tflds = objects.dtype.fields
                for fld in sflds:
                    # A SWEEP_DTYPE field missing from the file raises
                    # KeyError on this lookup.
                    sdt, tdt = sflds[fld][0], tflds[fld][0]
                    if sdt != tdt:
                        msg = 'sweeps/Tractor dtypes differ for field '
                        msg += '{}. Sweeps: {}, Tractor: {}'.format(
                            fld, sdt, tdt)
                        raise ValueError(msg)

            mask = objects['BRICK_PRIMARY'] != 0
            objects = objects[mask]
            mask = objects['RA'] >= ra1
            mask &= objects['RA'] < ra2
            mask &= objects['DEC'] >= dec1
            mask &= objects['DEC'] < dec2
            objects = objects[mask]

            chunk = np.empty(len(objects), dtype=SWEEP_DTYPE)
            for colname in chunk.dtype.names:
                if colname not in objects.dtype.names:
                    # skip missing columns
                    continue
                try:
                    chunk[colname][...] = objects[colname][...]
                except ValueError:
                    print('failed on column `%s`' % colname)
                    raise

            # Convert to a plain dict before handing it to the reducer.
            chunkheader = {key: chunkheader[key]
                           for key in chunkheader.keys()}
            return chunk, chunkheader

        def reduce(chunk, chunkheader):
            # chunk and chunkheader are both None for skipped bricks.
            if chunk is not None:
                data.append(chunk)
                merge_header(header, chunkheader)

        pool.map(filter, bricks, star=True, reduce=reduce)

    # The seed array is not a brick, hence the -1.
    neff = len(data) - 1
    data = np.concatenate(data, axis=0)
    header = {key: value for key, value in header.items()
              if value is not NA}
    return data, header, neff
def main():
    """Write one sweep file per sweep region and per requested format.

    Options come from ``parse_args()``.  The bricks are enumerated once;
    each region of the chosen schema is then built with ``make_sweep`` and
    written under ``ns.dest``.  Regions with no objects produce no file.

    Raises
    ------
    OSError
        If ``ns.dest`` cannot be created and does not already exist.
    """
    ns = parse_args()
    if ns.ignore_errors:
        print("Warning: *** Will ignore broken tractor catalogue files ***")
        print(" *** Disable -I for final data product. ***")

    # avoid each subprocess importing h5py again and again.
    if 'hdf5' in ns.format:
        import h5py  # noqa: F401 -- imported only for its side effect

    # this may take a while on a file system with slow meta-data
    # access
    # bricks = [(name, filepath, region), ...]
    bricks = list_bricks(ns)

    try:
        os.makedirs(ns.dest)
    except OSError:
        # An existing directory is fine; anything else (permissions, a
        # file in the way) must not be hidden.
        if not os.path.isdir(ns.dest):
            raise

    # blocks or ra stripes?
    schemas = {
        'ra': sweep_schema_ra(360),
        'blocks': sweep_schema_blocks(36, 36),
        'dec': sweep_schema_dec(180),
    }
    sweeps = schemas[ns.schema]

    t0 = time()

    nbricks_tot = np.zeros((), 'i8')
    nobj_tot = np.zeros((), 'i8')

    template = "sweep-%(ramin)s%(decmin)s-%(ramax)s%(decmax)s.%(format)s"

    def formatdec(dec):
        # Sign becomes a letter so the file name has no '+' or '-'.
        return ("%+04g" % dec).replace('-', 'm').replace('+', 'p')

    def formatra(ra):
        return ("%03g" % ra)

    def work(sweep):
        data, header, nbricks = make_sweep(sweep, bricks, ns)

        header.update({
            'RAMIN': sweep[0],
            'DECMIN': sweep[1],
            'RAMAX': sweep[2],
            'DECMAX': sweep[3],
        })

        for format in ns.format:
            filename = template % dict(
                ramin=formatra(sweep[0]),
                decmin=formatdec(sweep[1]),
                ramax=formatra(sweep[2]),
                decmax=formatdec(sweep[3]),
                format=format)

            if len(data) > 0:
                save_sweep_file(os.path.join(ns.dest, filename),
                                data, header, format)

        # The name reported is the one built for the last format.
        return filename, nbricks, len(data)

    def reduce(filename, nbricks, nobj):
        nbricks_tot[...] += nbricks
        nobj_tot[...] += nobj

        if ns.verbose and nobj > 0:
            elapsed = time() - t0
            print(
                '%s : %d bricks %d primary objects, %g bricks / sec %g objs / sec' % (
                    filename, nbricks, nobj,
                    nbricks_tot / elapsed,
                    nobj_tot / elapsed,
                )
            )

    with sharedmem.MapReduce(np=ns.numproc) as pool:
        pool.map(work, sweeps, reduce=reduce)
def main():
    """Match an external catalog against Tractor bricks and save the result.

    For every object in each brick the nearest external-catalog entry is
    found with ``tree.query``.  An external entry takes the columns of a
    BRICK_PRIMARY object when that object lies closer than the best
    distance recorded so far, which starts at the tolerance.  Unmatched
    entries keep ``OBJID == -1``.  The output is written once per format
    in ``ns.format``.

    Raises
    ------
    OSError
        If the output directory cannot be created and does not exist.
    """
    ns = parse_args()
    if ns.ignore_errors:
        print("Warning: *** Will ignore broken tractor catalog files ***")
        print(" *** Disable -I for final data product. ***")

    bricks = list_bricks(ns)

    # ADM grab a {FIELD: unit} dict from the first Tractor file.
    unitdict = get_units(bricks[0][1])

    tree, nobj, morecols = read_external(ns.external, ns)

    # get the data type of the match
    brickname, path = bricks[0]
    peek = fitsio.read(path, 1, upper=True)
    matched_catalog = sharedmem.empty(nobj, dtype=peek.dtype)
    matched_catalog['OBJID'] = -1

    matched_distance = sharedmem.empty(nobj, dtype='f4')

    # convert to radian
    tol = ns.tolerance / (60. * 60.) * (np.pi / 180)

    # Starting at the tolerance means only closer matches are accepted.
    matched_distance[:] = tol
    nprocessed = np.zeros((), dtype='i8')
    nmatched = np.zeros((), dtype='i8')
    ntotal = np.zeros((), dtype='i8')
    t0 = time()

    with sharedmem.MapReduce(np=ns.numproc) as pool:

        def work(brickname, path):
            try:
                objects = fitsio.read(path, 1, upper=True)
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit are never silently skipped.
                if ns.ignore_errors:
                    print("IO Error on %s" % path)
                    return None, None, None
                raise

            pos = radec2pos(objects['RA'], objects['DEC'])
            d, i = tree.query(pos, 1)
            # -1 is the "unmatched" sentinel, so no real object may use it.
            assert (objects['OBJID'] != -1).all()
            # The shared arrays are only touched inside the critical section.
            with pool.critical:
                mask = d < matched_distance[i]
                mask &= objects['BRICK_PRIMARY']
                i = i[mask]
                matched_catalog[i] = objects[mask][
                    list(matched_catalog.dtype.names)]
                matched_distance[i] = d[mask]
            matched = mask.sum()

            return brickname, matched, len(objects)

        def reduce(brickname, matched, total):
            # Skipped (unreadable) files come back as all-None.
            if brickname is None:
                return
            nprocessed[...] += 1
            nmatched[...] += matched
            ntotal[...] += total
            if ns.verbose:
                if nprocessed % 1000 == 0:
                    print("Processed %d files, %g / second, matched %d / %d objects."
                          % (nprocessed, nprocessed / (time() - t0),
                             nmatched, ntotal))

        pool.map(work, bricks, star=True, reduce=reduce)

        nrealmatched = (matched_catalog['OBJID'] != -1).sum()
        if ns.verbose:
            print("Processed %d files, %g / second, matched %d / %d objects into %d slots."
                  % (nprocessed, nprocessed / (time() - t0),
                     nmatched, ntotal, nrealmatched))

        destdir = os.path.dirname(ns.dest)
        if destdir:
            try:
                os.makedirs(destdir)
            except OSError:
                # Only "already exists" is acceptable here.
                if not os.path.isdir(destdir):
                    raise

        hdr = fitsio.FITSHDR()
        hdr.add_record(dict(name='NMATCHED', value=nrealmatched,
                            comment='Number of unique matches.'))
        # A second NCOLL record used to follow and overwrote this value
        # with nrealmatched; the collision count is what NCOLL reports.
        hdr.add_record(dict(name='NCOLL', value=nmatched - nrealmatched,
                            comment='Number of collisions.'))
        hdr.add_record(dict(name='RADIUS', value=ns.tolerance,
                            comment='Search radius (arcsec).'))

        value = ns.external
        if len(value) > 68:
            # Long paths are split into 67-character pieces, each ending in
            # '&' when more follows, written as CONTINUE cards.
            hdr.add_record(dict(name='EXTERNAL', value=value[:67] + '&'))
            rest = value[67:]
            while rest:
                hdr.add_record(dict(name='CONTINUE', value=" '%s%s'" % (
                    rest[:67], '&' if len(rest) > 67 else '')))
                rest = rest[67:]
            hdr.add_record(dict(name='LONGSTRN', value='OGIP 1.0',
                                comment='CONTINUE cards are used'))
        else:
            # Short paths fit in one card; previously they were not
            # recorded at all.
            hdr.add_record(dict(name='EXTERNAL', value=value))

        # Optionally add the new columns
        if len(morecols) > 0:
            newdtype = matched_catalog.dtype.descr
            for coldata, col in zip(morecols, ns.copycols):
                newdtype = newdtype + [(col, coldata.dtype)]
            newdtype = np.dtype(newdtype)

            widened = np.empty(matched_catalog.shape, dtype=newdtype)
            for field in matched_catalog.dtype.fields:
                widened[field] = matched_catalog[field]
            for coldata, col in zip(morecols, ns.copycols):
                widened[col] = coldata

            # widened is already a fresh array, so no extra copy is needed.
            matched_catalog = widened

        for format in ns.format:
            save_file(ns.dest, matched_catalog, hdr, format,
                      unitdict=unitdict)
def main():
    """Match an external catalog against Tractor bricks and save the result.

    For every object in each brick the nearest external-catalog entry is
    found with ``tree.query``.  An external entry takes the columns of a
    BRICK_PRIMARY object when that object lies closer than the best
    distance recorded so far, which starts at the tolerance.  Unmatched
    entries keep ``OBJID == -1``.  The output is written once per format
    in ``ns.format``.

    Raises
    ------
    OSError
        If the output directory cannot be created and does not exist.
    """
    ns = parse_args()
    if ns.ignore_errors:
        print("Warning: *** Will ignore broken tractor catalog files ***")
        print(" *** Disable -I for final data product. ***")

    bricks = list_bricks(ns)

    tree, nobj, morecols = read_external(ns.external, ns)

    # get the data type of the match
    brickname, path = bricks[0]
    peek = fitsio.read(path, 1, upper=True)
    matched_catalog = sharedmem.empty(nobj, dtype=peek.dtype)
    matched_catalog['OBJID'] = -1

    matched_distance = sharedmem.empty(nobj, dtype='f4')

    # convert to radian
    tol = ns.tolerance / (60. * 60.) * (np.pi / 180)

    # Starting at the tolerance means only closer matches are accepted.
    matched_distance[:] = tol
    nprocessed = np.zeros((), dtype='i8')
    nmatched = np.zeros((), dtype='i8')
    ntotal = np.zeros((), dtype='i8')
    t0 = time()

    with sharedmem.MapReduce(np=ns.numproc) as pool:

        def work(brickname, path):
            try:
                objects = fitsio.read(path, 1, upper=True)
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit are never silently skipped.
                if ns.ignore_errors:
                    print("IO Error on %s" % path)
                    return None, None, None
                raise

            pos = radec2pos(objects['RA'], objects['DEC'])
            d, i = tree.query(pos, 1)
            # -1 is the "unmatched" sentinel, so no real object may use it.
            assert (objects['OBJID'] != -1).all()
            # The shared arrays are only touched inside the critical section.
            with pool.critical:
                mask = d < matched_distance[i]
                mask &= objects['BRICK_PRIMARY']
                i = i[mask]
                matched_catalog[i] = objects[mask]
                matched_distance[i] = d[mask]
            matched = mask.sum()

            return brickname, matched, len(objects)

        def reduce(brickname, matched, total):
            # Skipped (unreadable) files come back as all-None.
            if brickname is None:
                return
            nprocessed[...] += 1
            nmatched[...] += matched
            ntotal[...] += total
            if ns.verbose:
                if nprocessed % 1000 == 0:
                    print("Processed %d files, %g / second, matched %d / %d objects."
                          % (nprocessed, nprocessed / (time() - t0),
                             nmatched, ntotal))

        pool.map(work, bricks, star=True, reduce=reduce)

        nrealmatched = (matched_catalog['OBJID'] != -1).sum()
        if ns.verbose:
            print("Processed %d files, %g / second, matched %d / %d objects into %d slots."
                  % (nprocessed, nprocessed / (time() - t0),
                     nmatched, ntotal, nrealmatched))

        destdir = os.path.dirname(ns.dest)
        if destdir:
            try:
                os.makedirs(destdir)
            except OSError:
                # Only "already exists" is acceptable here.
                if not os.path.isdir(destdir):
                    raise

        header = {}
        header['NMATCHED'] = nrealmatched
        header['NCOLLISION'] = nmatched - nrealmatched
        header['TOL_ARCSEC'] = ns.tolerance

        # Optionally add the new columns
        if len(morecols) > 0:
            newdtype = matched_catalog.dtype.descr
            for coldata, col in zip(morecols, ns.copycols):
                newdtype = newdtype + [(col, coldata.dtype)]
            newdtype = np.dtype(newdtype)

            widened = np.empty(matched_catalog.shape, dtype=newdtype)
            for field in matched_catalog.dtype.fields:
                widened[field] = matched_catalog[field]
            for coldata, col in zip(morecols, ns.copycols):
                widened[col] = coldata

            # widened is already a fresh array, so no extra copy is needed.
            matched_catalog = widened

        for format in ns.format:
            save_file(ns.dest, matched_catalog, header, format)
def main():
    """Match an external catalog against Tractor bricks, allowing duplicates.

    Each brick is cut to its BRICK_PRIMARY objects, and up to ``maxdups``
    external-catalog neighbours within the tolerance are looked up per
    object.  For every external entry only the closest object in the brick
    is kept.  That object replaces the stored match when it is closer than
    the best distance recorded so far, which starts at the tolerance.
    Unmatched entries keep ``OBJID == -1``.

    Raises
    ------
    OSError
        If the output directory cannot be created and does not exist.
    """
    ns = parse_args()
    if ns.ignore_errors:
        print("Warning: *** Will ignore broken tractor catalog files ***")
        print(" *** Disable -I for final data product. ***")

    bricks = list_bricks(ns)

    # ADM grab a {FIELD: unit} dict from the first Tractor file.
    unitdict = get_units(bricks[0][1])

    # convert to radian
    tol = ns.tolerance / (60. * 60.) * (np.pi / 180)

    tree, nobj, morecols, maxdups = read_external(ns.external, tol, ns)

    # get the data type of the match
    brickname, path = bricks[0]
    peek = fitsio.read(path, 1, upper=True)
    matched_catalog = sharedmem.empty(nobj, dtype=peek.dtype)
    matched_catalog['OBJID'] = -1

    matched_distance = sharedmem.empty(nobj, dtype='f4')

    # Starting at the tolerance means only closer matches are accepted.
    matched_distance[:] = tol
    nprocessed = np.zeros((), dtype='i8')
    nmatched = np.zeros((), dtype='i8')
    ntotal = np.zeros((), dtype='i8')
    t0 = time()

    with sharedmem.MapReduce(np=ns.numproc) as pool:

        def work(brickname, path):
            try:
                objects = fitsio.read(path, 1, upper=True)
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit are never silently skipped.
                if ns.ignore_errors:
                    print("IO Error on %s" % path)
                    return None, None, None
                raise

            # ADM limit to just PRIMARY objects from imaging.
            bp = objects["BRICK_PRIMARY"]
            objects = objects[bp]

            pos = radec2pos(objects['RA'], objects['DEC'])
            # ADM query tree allowing duplicates.
            dd, ii = tree.query(pos, maxdups, distance_upper_bound=tol)

            # ADM collect relevant information (retaining duplicates).
            within = dd < tol
            _s = ii[within]               # ADM the spec object indices.
            _p = np.where(within)[0]      # ADM the imaging object indices.
            _d = dd[within]               # ADM the matching distances.

            # ADM bail if there are no matches.
            if len(_s) == 0:
                return brickname, 0, len(objects)

            # ADM for each spec object index keep only the closest imaging
            # ADM object.  The strict '<' keeps the first candidate on a
            # ADM tie, which matches what np.argmin would select.
            best = {}
            for s, dist, p in zip(_s, _d, _p):
                if s not in best or dist < best[s][0]:
                    best[s] = (dist, p)

            # ADM we're left with the spectroscopic and photometric indexes
            # ADM distances and indexes contingent on the minimum distances.
            i = np.array(list(best.keys()), dtype='i4')
            d = np.array([pair[0] for pair in best.values()], dtype='f4')
            iphot = np.array([pair[1] for pair in best.values()], dtype='i4')

            # -1 is the "unmatched" sentinel, so no real object may use it.
            assert (objects['OBJID'] != -1).all()
            # The shared arrays are only touched inside the critical section.
            with pool.critical:
                mask = d < matched_distance[i]
                i = i[mask]
                iphot = iphot[mask]
                matched_catalog[i] = objects[iphot][
                    list(matched_catalog.dtype.names)]
                matched_distance[i] = d[mask]
            matched = mask.sum()

            return brickname, matched, len(objects)

        def reduce(brickname, matched, total):
            # Skipped (unreadable) files come back as all-None.
            if brickname is None:
                return
            nprocessed[...] += 1
            nmatched[...] += matched
            ntotal[...] += total
            if ns.verbose:
                if nprocessed % 1000 == 0:
                    print("Processed %d files, %g / second, matched %d / %d brick primary objects."
                          % (nprocessed, nprocessed / (time() - t0),
                             nmatched, ntotal))

        pool.map(work, bricks, star=True, reduce=reduce)

        nrealmatched = (matched_catalog['OBJID'] != -1).sum()
        if ns.verbose:
            print("Processed %d files, %g / second, matched %d / %d objects into %d slots."
                  % (nprocessed, nprocessed / (time() - t0),
                     nmatched, ntotal, nrealmatched))

        destdir = os.path.dirname(ns.dest)
        if destdir:
            try:
                os.makedirs(destdir)
            except OSError:
                # Only "already exists" is acceptable here.
                if not os.path.isdir(destdir):
                    raise

        hdr = fitsio.FITSHDR()
        hdr.add_record(dict(name='NMATCHED', value=nrealmatched,
                            comment='Number of unique matches.'))
        # A second NCOLL record used to follow and overwrote this value
        # with nrealmatched; the collision count is what NCOLL reports.
        hdr.add_record(dict(name='NCOLL', value=nmatched - nrealmatched,
                            comment='Number of collisions.'))
        hdr.add_record(dict(name='RADIUS', value=ns.tolerance,
                            comment='Search radius (arcsec).'))

        value = ns.external
        if len(value) > 68:
            # Long paths are split into 67-character pieces, each ending in
            # '&' when more follows, written as CONTINUE cards.
            hdr.add_record(dict(name='EXTERNAL', value=value[:67] + '&'))
            rest = value[67:]
            while rest:
                hdr.add_record(dict(name='CONTINUE', value=" '%s%s'" % (
                    rest[:67], '&' if len(rest) > 67 else '')))
                rest = rest[67:]
            hdr.add_record(dict(name='LONGSTRN', value='OGIP 1.0',
                                comment='CONTINUE cards are used'))
        else:
            # Short paths fit in one card; previously they were not
            # recorded at all.
            hdr.add_record(dict(name='EXTERNAL', value=value))

        # Optionally add the new columns
        if len(morecols) > 0:
            newdtype = matched_catalog.dtype.descr
            for coldata, col in zip(morecols, ns.copycols):
                newdtype = newdtype + [(col, coldata.dtype)]
            newdtype = np.dtype(newdtype)

            widened = np.empty(matched_catalog.shape, dtype=newdtype)
            for field in matched_catalog.dtype.fields:
                widened[field] = matched_catalog[field]
            for coldata, col in zip(morecols, ns.copycols):
                widened[col] = coldata

            # widened is already a fresh array, so no extra copy is needed.
            matched_catalog = widened

        for format in ns.format:
            save_file(ns.dest, matched_catalog, hdr, format,
                      unitdict=unitdict)
def main():
    """Write sweep files split into main, light-curve and extra columns.

    Options come from ``parse_args()``.  For every sweep region with at
    least one object, and for every requested format, three outputs are
    written, each into its own directory from ``outdirnames``:

    * the ``SWEEP_DTYPE`` columns whose name does not contain ``'LC'``,
    * the ID columns plus the ``SWEEP_DTYPE`` columns containing ``'LC'``,
    * the ID columns plus the columns found in the first Tractor file that
      are not part of ``SWEEP_DTYPE``.

    Raises
    ------
    OSError
        If an output directory cannot be created and does not exist.
    """
    ns = parse_args()
    if ns.ignore_errors:
        print("Warning: *** Will ignore broken tractor catalogue files ***")
        print(" *** Disable -I for final data product. ***")

    # avoid each subprocess importing h5py again and again.
    if 'hdf5' in ns.format:
        import h5py  # noqa: F401 -- imported only for its side effect

    # this may take a while on a file system with slow meta-data
    # access
    # bricks = [(name, filepath, region), ...]
    bricks = list_bricks(ns)

    # ADM get a {FIELD: unit} dictionary from one of the Tractor files.
    fn = bricks[0][1]
    unitdict = get_units(fn)

    # ADM read in a small amount of information from one of the Tractor
    # ADM files to establish the full dtype.
    testdata = fitsio.read(fn, rows=[0], upper=True)
    ALL_DTYPE = testdata.dtype

    for odn in outdirnames:
        outdir = os.path.join(ns.dest, odn)
        try:
            os.makedirs(outdir)
        except OSError:
            # An existing directory is fine; anything else (permissions, a
            # file in the way) must not be hidden.
            if not os.path.isdir(outdir):
                raise

    # blocks or ra stripes?
    schemas = {
        'ra': sweep_schema_ra(360),
        'blocks': sweep_schema_blocks(36, 36),
        'dec': sweep_schema_dec(180),
    }
    sweeps = schemas[ns.schema]

    # The column splits depend only on the dtypes, so build them once
    # instead of per sweep and per format.
    # ADM the columns to always include to form a unique ID.
    uniqid = [dt for dt in SWEEP_DTYPE.descr
              if dt[0] in ("RELEASE", "BRICKID", "OBJID")]
    # ADM write out separate sweeps for:
    # ADM the SWEEP_DTYPE columns (without light-curves).
    sweepdt = [dt for dt in SWEEP_DTYPE.descr if 'LC' not in dt[0]]
    # ADM the SWEEP_DTYPE columns (just light-curves).
    lcdt = uniqid + [dt for dt in SWEEP_DTYPE.descr if 'LC' in dt[0]]
    # ADM the remaining "extra" columns.
    alldt = uniqid + [dt for dt in ALL_DTYPE.descr
                      if dt[0] not in SWEEP_DTYPE.names]
    ender = [".fits", "-lc.fits", "-ex.fits"]
    outputs = list(zip([sweepdt, lcdt, alldt], outdirnames, ender))

    template = "sweep-%(ramin)s%(decmin)s-%(ramax)s%(decmax)s.%(format)s"

    def formatdec(dec):
        # Sign becomes a letter so the file name has no '+' or '-'.
        return ("%+04g" % dec).replace('-', 'm').replace('+', 'p')

    def formatra(ra):
        return ("%03g" % ra)

    t0 = time()

    nbricks_tot = np.zeros((), 'i8')
    nobj_tot = np.zeros((), 'i8')

    def work(sweep):
        data, header, nbricks = make_sweep(sweep, bricks, ns,
                                           ALL_DTYPE=ALL_DTYPE)

        header.update({
            'RAMIN': sweep[0],
            'DECMIN': sweep[1],
            'RAMAX': sweep[2],
            'DECMAX': sweep[3],
        })

        for format in ns.format:
            filename = template % dict(
                ramin=formatra(sweep[0]),
                decmin=formatdec(sweep[1]),
                ramax=formatra(sweep[2]),
                decmax=formatdec(sweep[3]),
                format=format)

            if len(data) > 0:
                for dt, odn, end in outputs:
                    outname = filename.replace(".fits", end)
                    dest = os.path.join(ns.dest, odn, outname)
                    if len(dt) > 0:
                        newdata = np.empty(len(data), dtype=dt)
                        for col in newdata.dtype.names:
                            newdata[col] = data[col]
                        save_sweep_file(dest, newdata, header, format,
                                        unitdict=unitdict)

        # The name reported is the one built for the last format.
        return filename, nbricks, len(data)

    def reduce(filename, nbricks, nobj):
        nbricks_tot[...] += nbricks
        nobj_tot[...] += nobj

        if ns.verbose and nobj > 0:
            elapsed = time() - t0
            print(
                '%s : %d bricks %d primary objects, %g bricks / sec %g objs / sec' % (
                    filename, nbricks, nobj,
                    nbricks_tot / elapsed,
                    nobj_tot / elapsed,
                ))

    with sharedmem.MapReduce(np=ns.numproc) as pool:
        pool.map(work, sweeps, reduce=reduce)
def main():
    """Match the BOSS catalog against Tractor bricks and save the result.

    For every object in each brick the nearest BOSS entry is found with
    ``tree.query``.  A BOSS entry takes the columns of a BRICK_PRIMARY
    object when that object lies closer than the best distance recorded so
    far, which starts at the tolerance.  Unmatched entries keep
    ``OBJID == -1``.  The output is written once per format in
    ``ns.format``.

    Raises
    ------
    OSError
        If the output directory cannot be created and does not exist.
    """
    ns = parse_args()

    bricks = list_bricks(ns)

    tree, boss = read_boss(ns.boss, ns)

    # get the data type of the match
    brickname, path = bricks[0]
    peek = fitsio.read(path, 1, upper=True)
    matched_catalogue = sharedmem.empty(len(boss), dtype=peek.dtype)
    matched_catalogue['OBJID'] = -1

    matched_distance = sharedmem.empty(len(boss), dtype='f4')

    # convert to radian
    tol = ns.tolerance / (60. * 60.) * (np.pi / 180)

    # Starting at the tolerance means only closer matches are accepted.
    matched_distance[:] = tol
    nprocessed = np.zeros((), dtype='i8')
    nmatched = np.zeros((), dtype='i8')
    ntotal = np.zeros((), dtype='i8')
    t0 = time()

    with sharedmem.MapReduce(np=ns.numproc) as pool:

        def work(brickname, path):
            objects = fitsio.read(path, 1, upper=True)
            pos = radec2pos(objects['RA'], objects['DEC'])
            d, i = tree.query(pos, 1)
            # -1 is the "unmatched" sentinel, so no real object may use it.
            assert (objects['OBJID'] != -1).all()
            # The shared arrays are only touched inside the critical section.
            with pool.critical:
                mask = d < matched_distance[i]
                mask &= objects['BRICK_PRIMARY']
                i = i[mask]
                matched_catalogue[i] = objects[mask]
                matched_distance[i] = d[mask]
            matched = mask.sum()

            return brickname, matched, len(objects)

        def reduce(brickname, matched, total):
            nprocessed[...] += 1
            nmatched[...] += matched
            ntotal[...] += total
            if ns.verbose:
                if nprocessed % 50 == 0:
                    print("Processed %d files, %g / second, matched %d / %d objects."
                          % (nprocessed, nprocessed / (time() - t0),
                             nmatched, ntotal))

        pool.map(work, bricks, star=True, reduce=reduce)

        nrealmatched = (matched_catalogue['OBJID'] != -1).sum()
        if ns.verbose:
            print("Processed %d files, %g / second, matched %d / %d objects into %d slots."
                  % (nprocessed, nprocessed / (time() - t0),
                     nmatched, ntotal, nrealmatched))

        destdir = os.path.dirname(ns.dest)
        if destdir:
            try:
                os.makedirs(destdir)
            except OSError:
                # Only "already exists" is acceptable here.
                if not os.path.isdir(destdir):
                    raise

        header = {}
        header['NMATCHED'] = nrealmatched
        header['NCOLLISION'] = nmatched - nrealmatched
        header['TOL_ARCSEC'] = ns.tolerance

        for format in ns.format:
            save_file(ns.dest, matched_catalogue, header, format)