def update_truth(filepath, hdu=2, chunksize=50000, skip=('SLOPES', 'EMLINES')):
    """Add data from columns in other HDUs of the Truth table.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 2).
    chunksize : :class:`int`, optional
        If set, update database `chunksize` rows at a time (default 50000).
    skip : :class:`tuple`, optional
        Do not load columns with these names
        (default ``('SLOPES', 'EMLINES')``).
    """
    tcls = Truth
    tn = tcls.__tablename__
    t = tcls.__table__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    log.info("Read data from %s HDU %s.", filepath, hdu)
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
    log.info("Integrity check complete on %s.", tn)
    # if rowfilter is None:
    #     good_rows = np.ones((maxrows,), dtype=bool)
    # else:
    #     good_rows = rowfilter(data[0:maxrows])
    # data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    data_list = [data[col].tolist() for col in colnames if col not in skip]
    data_names = [col.lower() for col in colnames if col not in skip]
    data_names[0] = 'b_targetid'
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    del data
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows//chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k*chunksize:(k+1)*chunksize]]
        q = t.update().where(t.c.targetid == bindparam('b_targetid'))
        if len(data_chunk) > 0:
            engine.execute(q, data_chunk)
            log.info("Updated %d rows in %s.",
                     min((k+1)*chunksize, finalrows), tn)
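
#
# Example usage for update_truth() (a sketch; the file path and HDU name
# below are hypothetical, and the module-level `engine`, `log`, and `Truth`
# objects are assumed to be configured elsewhere in this module):
#
# >>> update_truth('/path/to/truth.fits', hdu='TRUTH_ELG')
#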
def load_fiberassign(datapath, maxpass=4, hdu='FIBERASSIGN', q3c=False,
                     latest_epoch=False, last_column='NUMOBS_MORE'):
    """Load fiber assignment files into the fiberassign table.

    Tile files can appear in multiple epochs, so for a given tileid, load
    the tile file with the largest value of epoch.  In the "real world",
    a tile file appears in each epoch until it is observed; therefore the
    tile file corresponding to the actual observation is the one with the
    largest epoch.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing tile files.
    maxpass : :class:`int`, optional
        Search for pass numbers up to this value (default 4).
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'FIBERASSIGN').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    latest_epoch : :class:`bool`, optional
        If set, search for the latest tile file among several epochs.
    last_column : :class:`str`, optional
        Do not load columns past this name (default 'NUMOBS_MORE').
    """
    fiberpath = os.path.join(datapath, 'fiberassign*.fits')
    log.info("Using tile file search path: %s.", fiberpath)
    tile_files = glob.glob(fiberpath)
    if len(tile_files) == 0:
        log.error("No tile files found!")
        return
    log.info("Found %d tile files.", len(tile_files))
    #
    # Find the latest epoch for every tile file.
    #
    latest_tiles = dict()
    if latest_epoch:
        tileidre = re.compile(r'/(\d+)/fiberassign/fiberassign\-(\d+)\.fits$')
        for f in tile_files:
            m = tileidre.search(f)
            if m is None:
                log.error("Could not match %s!", f)
                continue
            epoch, tileid = map(int, m.groups())
            if tileid in latest_tiles:
                if latest_tiles[tileid][0] < epoch:
                    latest_tiles[tileid] = (epoch, f)
            else:
                latest_tiles[tileid] = (epoch, f)
    else:
        for f in tile_files:
            # fiberassign-TILEID.fits
            tileid = int(re.match(r'fiberassign-(\d+)\.fits',
                                  os.path.basename(f))[1])
            latest_tiles[tileid] = (0, f)
    log.info("Identified %d tile files for loading.", len(latest_tiles))
    #
    # Read the identified tile files.
    #
    data_index = None
    for tileid in latest_tiles:
        epoch, f = latest_tiles[tileid]
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s.", f, hdu)
        for col in data.names[:data_index]:
            if data[col].dtype.kind == 'f':
                bad = np.isnan(data[col])
                if np.any(bad):
                    nbad = bad.sum()
                    log.warning("%d rows of bad data detected in column " +
                                "%s of %s.", nbad, col, f)
                    #
                    # This replacement may be deprecated in the future.
                    #
                    if col in ('TARGET_RA', 'TARGET_DEC',
                               'FIBERASSIGN_X', 'FIBERASSIGN_Y'):
                        data[col][bad] = -9999.0
                    assert not np.any(np.isnan(data[col]))
                    assert np.all(np.isfinite(data[col]))
        n_rows = len(data)
        if data_index is None:
            data_index = data.names.index(last_column) + 1
        data_list = ([[tileid]*n_rows] +
                     [data[col].tolist() for col in data.names[:data_index]])
        data_names = (['tileid'] +
                      [col.lower() for col in data.names[:data_index]])
        log.info("Initial column conversion complete on tileid = %d.", tileid)
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on tileid = %d.", tileid)
        dbSession.bulk_insert_mappings(FiberAssign,
                                       [dict(zip(data_names, row))
                                        for row in data_rows])
        log.info("Inserted %d rows in %s for tileid = %d.",
                 n_rows, FiberAssign.__tablename__, tileid)
        dbSession.commit()
    if q3c:
        q3c_index('fiberassign', ra='target_ra')
    return
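
#
# Example usage for load_fiberassign() (a sketch; the directory below is
# hypothetical, and the module-level `dbSession`, `FiberAssign`, and
# `q3c_index` objects are assumed to be configured elsewhere):
#
# >>> load_fiberassign('/path/to/fiberassign/tiles', q3c=True)
#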
def load_zbest(datapath=None, hdu='ZBEST', q3c=False):
    """Load zbest files into the zcat table.

    This function is deprecated since there should now be a single
    redshift catalog file.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing zbest files.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'ZBEST').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    """
    if datapath is None:
        datapath = specprod_root()
    zbestpath = os.path.join(datapath, 'spectra-64', '*', '*',
                             'zbest-64-*.fits')
    log.info("Using zbest file search path: %s.", zbestpath)
    zbest_files = glob.glob(zbestpath)
    if len(zbest_files) == 0:
        log.error("No zbest files found!")
        return
    log.info("Found %d zbest files.", len(zbest_files))
    #
    # Read the identified zbest files.
    #
    for f in zbest_files:
        brickname = os.path.basename(os.path.dirname(f))
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s.", f, hdu)
        good_targetids = ((data['TARGETID'] != 0) & (data['TARGETID'] != -1))
        #
        # If there are too many targetids, the in_ clause will blow up.
        # Disabling this test, and crossing fingers.
        #
        # q = dbSession.query(ZCat).filter(ZCat.targetid.in_(data['TARGETID'].tolist())).all()
        # if len(q) != 0:
        #     log.warning("Duplicate TARGETID found in %s.", f)
        #     for z in q:
        #         log.warning("Duplicate TARGETID = %d.", z.targetid)
        #         good_targetids = good_targetids & (data['TARGETID'] != z.targetid)
        data_list = [data[col][good_targetids].tolist() for col in data.names]
        data_names = [col.lower() for col in data.names]
        log.info("Initial column conversion complete on brick = %s.",
                 brickname)
        #
        # Expand COEFF
        #
        col = 'COEFF'
        expand = ('coeff_0', 'coeff_1', 'coeff_2', 'coeff_3', 'coeff_4',
                  'coeff_5', 'coeff_6', 'coeff_7', 'coeff_8', 'coeff_9',)
        i = data_names.index(col.lower())
        del data_names[i]
        del data_list[i]
        for j, n in enumerate(expand):
            log.debug("Expanding column %d of %s (at index %d) to %s.",
                      j, col, i, n)
            data_names.insert(i + j, n)
            # Keep the expanded columns aligned with the filtered rows.
            data_list.insert(i + j, data[col][good_targetids][:, j].tolist())
        log.debug(data_names)
        #
        # zbest files don't contain the same columns as zcatalog.
        #
        for col in ZCat.__table__.columns:
            if col.name not in data_names:
                data_names.append(col.name)
                data_list.append([0]*len(data_list[0]))
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on brick = %s.", brickname)
        try:
            dbSession.bulk_insert_mappings(ZCat, [dict(zip(data_names, row))
                                                  for row in data_rows])
        except IntegrityError as e:
            log.error("Integrity Error detected!")
            log.error(e)
            dbSession.rollback()
        else:
            log.info("Inserted %d rows in %s for brick = %s.",
                     len(data_rows), ZCat.__tablename__, brickname)
            dbSession.commit()
    if q3c:
        q3c_index('zcat')
    return
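
#
# Example usage for load_zbest() (a sketch; the production path below is
# hypothetical; with datapath=None the path returned by specprod_root()
# is used instead):
#
# >>> load_zbest(datapath='/path/to/spectro/redux/prod', q3c=True)
#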
def load_file(filepath, tcls, hdu=1, expand=None, convert=None, index=None,
              rowfilter=None, q3c=False, chunksize=50000, maxrows=0):
    """Load a data file into the database, assuming that column names map
    to database column names with no surprises.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    tcls : :class:`sqlalchemy.ext.declarative.api.DeclarativeMeta`
        The table to load, represented by its class.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 1).
    expand : :class:`dict`, optional
        If set, map FITS column names to one or more alternative column
        names.
    convert : :class:`dict`, optional
        If set, convert the data for a named (database) column using the
        supplied function.
    index : :class:`str`, optional
        If set, add a column that just counts the number of rows.
    rowfilter : callable, optional
        If set, apply this filter to the rows to be loaded.  The function
        should return :class:`bool`, with ``True`` meaning a good row.
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    chunksize : :class:`int`, optional
        If set, load database `chunksize` rows at a time (default 50000).
    maxrows : :class:`int`, optional
        If set, stop loading after `maxrows` are loaded.  Alternatively,
        set `maxrows` to zero (0) to load all rows.
    """
    tn = tcls.__tablename__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    if maxrows == 0:
        maxrows = len(data)
    log.info("Read data from %s HDU %s.", filepath, hdu)
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col][0:maxrows])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
                #
                # Temporary workaround for bad flux values, see
                # https://github.com/desihub/desitarget/issues/397
                #
                if col in ('FLUX_R', 'FIBERFLUX_R', 'FIBERTOTFLUX_R'):
                    data[col][0:maxrows][bad] = -9999.0
    log.info("Integrity check complete on %s.", tn)
    if rowfilter is None:
        good_rows = np.ones((maxrows,), dtype=bool)
    else:
        good_rows = rowfilter(data[0:maxrows])
    data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    data_names = [col.lower() for col in colnames]
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    if expand is not None:
        for col in expand:
            i = data_names.index(col.lower())
            if isinstance(expand[col], str):
                #
                # Just rename a column.
                #
                log.debug("Renaming column %s (at index %d) to %s.",
                          data_names[i], i, expand[col])
                data_names[i] = expand[col]
            else:
                #
                # Assume this is an expansion of an array-valued column
                # into individual columns.
                #
                del data_names[i]
                del data_list[i]
                for j, n in enumerate(expand[col]):
                    log.debug("Expanding column %d of %s (at index %d) to %s.",
                              j, col, i, n)
                    data_names.insert(i + j, n)
                    # Keep the expanded columns aligned with the selected rows.
                    data_list.insert(i + j,
                                     data[col][0:maxrows][good_rows][:, j].tolist())
                log.debug(data_names)
    log.info("Column expansion complete on %s.", tn)
    del data
    if convert is not None:
        for col in convert:
            i = data_names.index(col)
            data_list[i] = [convert[col](x) for x in data_list[i]]
    log.info("Column conversion complete on %s.", tn)
    if index is not None:
        data_list.insert(0, list(range(1, finalrows+1)))
        data_names.insert(0, index)
        log.info("Added index column '%s'.", index)
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows//chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k*chunksize:(k+1)*chunksize]]
        if len(data_chunk) > 0:
            engine.execute(tcls.__table__.insert(), data_chunk)
            log.info("Inserted %d rows in %s.",
                     min((k+1)*chunksize, finalrows), tn)
    # for k in range(finalrows//chunksize + 1):
    #     data_insert = [dict([(col, data_list[i].pop(0))
    #                          for i, col in enumerate(data_names)])
    #                    for j in range(chunksize)]
    #     session.bulk_insert_mappings(tcls, data_insert)
    #     log.info("Inserted %d rows in %s..",
    #              min((k+1)*chunksize, finalrows), tn)
    #     session.commit()
    # dbSession.commit()
    if q3c:
        q3c_index(tn)
    return
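
#
# Example usage for load_file() (a sketch; the file path, the `Target` table
# class, and the column names in `expand`, `convert`, and `rowfilter` below
# are hypothetical and only illustrate the keyword arguments):
#
# >>> load_file('/path/to/targets.fits', Target, hdu='TARGETS',
# ...           expand={'DCHISQ': ('dchisq_psf', 'dchisq_rex', 'dchisq_dev',
# ...                              'dchisq_exp', 'dchisq_comp')},
# ...           convert={'release': int},
# ...           index='id',
# ...           rowfilter=lambda x: x['TARGETID'] != -1,
# ...           q3c=True, chunksize=100000)
#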