def update_truth(filepath, hdu=2, chunksize=50000, skip=('SLOPES', 'EMLINES')):
    """Add data from columns in other HDUs of the Truth table.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 2).
    chunksize : :class:`int`, optional
        If set, update database `chunksize` rows at a time (default 50000).
    skip : :class:`tuple`, optional
        Do not load columns with these names (default, ``('SLOPES', 'EMLINES')``)
    """
    tcls = Truth
    tn = tcls.__tablename__
    t = tcls.__table__
    # Accept either a FITS binary table or an ECSV file; anything else is
    # logged and skipped (the function returns None in every case).
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    log.info("Read data from %s HDU %s", filepath, hdu)
    # FITS tables expose .names; astropy Tables expose .colnames.
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    # Integrity check: warn about NaN values in float columns.  Note that
    # the bad values are only reported here, not repaired.
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
    log.info("Integrity check complete on %s.", tn)
    # if rowfilter is None:
    #     good_rows = np.ones((maxrows,), dtype=np.bool)
    # else:
    #     good_rows = rowfilter(data[0:maxrows])
    # data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    # Convert column-oriented data to Python lists, dropping skipped columns.
    data_list = [data[col].tolist() for col in colnames if col not in skip]
    data_names = [col.lower() for col in colnames if col not in skip]
    # Rename the first column for use with bindparam() below.
    # NOTE(review): this assumes the first non-skipped column is TARGETID --
    # confirm against the input files.
    data_names[0] = 'b_targetid'
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    # Release the (potentially large) raw table before building row dicts.
    del data
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    # Issue UPDATEs in chunks; the final loop iteration may be empty when
    # finalrows is an exact multiple of chunksize, hence the length guard.
    for k in range(finalrows//chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k*chunksize:(k+1)*chunksize]]
        # Match rows on targetid via the renamed bind parameter.
        q = t.update().where(t.c.targetid == bindparam('b_targetid'))
        if len(data_chunk) > 0:
            engine.execute(q, data_chunk)
            log.info("Updated %d rows in %s.",
                     min((k+1)*chunksize, finalrows), tn)
def q3c_index(table, ra='ra'):
    """Create a q3c index on a table.

    Parameters
    ----------
    table : :class:`str`
        Name of the table to index.
    ra : :class:`str`, optional
        If the RA, Dec columns are called something besides "ra" and "dec",
        set its name.  For example, ``ra='target_ra'``.
    """
    # The Dec column name is derived from the RA column name by replacing
    # 'ra' with 'dec', e.g. 'target_ra' -> 'target_dec'.  The table is also
    # clustered on the new index and re-analyzed so the query planner can
    # use it immediately.
    q3c_sql = """CREATE INDEX ix_{table}_q3c_ang2ipix ON {schema}.{table} (q3c_ang2ipix({ra}, {dec}));
    CLUSTER {schema}.{table} USING ix_{table}_q3c_ang2ipix;
    ANALYZE {schema}.{table};
    """.format(ra=ra, dec=ra.lower().replace('ra', 'dec'),
               schema=schemaname, table=table)
    log.info("Creating q3c index on %s.%s.", schemaname, table)
    dbSession.execute(q3c_sql)
    log.info("Finished q3c index on %s.%s.", schemaname, table)
    dbSession.commit()
    return
def main():
    """Entry point for the check_model script.

    Returns
    -------
    :class:`int`
        An integer suitable for passing to :func:`sys.exit`.
    """
    from sys import argv
    from argparse import ArgumentParser
    desc = """Check actual files against the data model for validity.
"""
    parser = ArgumentParser(description=desc,
                            prog=os.path.basename(argv[0]))
    parser.add_argument('-d', '--datamodel-dir', dest='desidatamodel',
                        metavar='DIR',
                        help='Override the value of DESIDATAMODEL.')
    parser.add_argument('-F', '--compare-files', dest='files',
                        action='store_true',
                        help='Compare an individual data model to an individual file.')
    parser.add_argument('-W', '--warning-is-error', dest='error',
                        action='store_true',
                        help='Data model warnings raise exceptions.')
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true',
                        help='Set log level to DEBUG.')
    parser.add_argument('section', metavar='DIR or FILE',
                        help='Section of the data model or individual model file.')
    parser.add_argument('directory', metavar='DIR or FILE',
                        help='Check files in this top-level directory, or one individual file.')
    options = parser.parse_args()
    if options.verbose:
        log.setLevel(DEBUG)
    #
    # BUG FIX: -d/--datamodel-dir is documented to *override* DESIDATAMODEL,
    # so it must be checked before the environment variable; previously the
    # environment variable always won when both were present.
    #
    if options.desidatamodel is not None:
        data_model_root = options.desidatamodel
    elif 'DESIDATAMODEL' in os.environ:
        data_model_root = os.environ['DESIDATAMODEL']
    else:
        log.critical(("DESIDATAMODEL is not defined. " +
                      "Cannot find data model files!"))
        return 1
    log.debug("DESIDATAMODEL=%s", data_model_root)
    if options.files:
        # Compare one model file against one data file; no filename-pattern
        # scanning is needed, so the prototype is assigned directly.
        filename = os.path.join(data_model_root, 'doc', options.section)
        section = os.path.join(data_model_root, 'doc',
                               options.section.split('/')[0])
        log.info("Loading individual data model: %s.", filename)
        files = [DataModel(filename, section)]
        log.info("Skipping regular expression processing.")
        # files[0].get_regexp(options.directory, error=options.error)
        log.info("Setting prototype file for %s to %s.",
                 filename, options.directory)
        files[0].prototype = options.directory
    else:
        # Scan a whole section of the data model and match files on disk.
        section = os.path.join(data_model_root, 'doc', options.section)
        log.info("Loading data model file in %s.", section)
        files = scan_model(section)
        log.info("Searching for data files in %s.", options.directory)
        files_to_regexp(options.directory, files, error=options.error)
        log.info("Identifying prototype files in %s.", options.directory)
        collect_files(options.directory, files)
    validate_prototypes(files, error=options.error)
    return 0
def validate_prototype(self, error=False):
    """Compares a model's prototype data file to the data models.

    Parameters
    ----------
    error : :class:`bool`, optional
        If ``True``, failure to extract certain required metadata raises an
        exception.

    Notes
    -----
    * Use set theory to compare the data headers to model headers.  This should
      automatically find missing headers, extraneous headers, etc.
    """
    if self.prototype is None:
        #
        # A warning should have been issued already, so just skip silently.
        #
        return
    log.info("Comparing %s to %s.", self.prototype, self.filename)
    # Lazily construct the Stub describing the prototype file, and cache
    # its per-HDU metadata alongside the model's own metadata.
    if self._stub is None:
        self._stub = Stub(self.prototype, error=error)
    stub_meta = self._stub_meta = self._stub.hdumeta
    modelmeta = self.extract_metadata(error=error)
    #
    # Check number of headers.
    #
    if self._stub.nhdr != len(modelmeta):
        log.warning(
            "Prototype file %s has the wrong number of " +
            "sections (HDUs) according to %s.", self.prototype,
            self.filename)
        return
    for i in range(self._stub.nhdr):
        dkw = stub_meta[i]['keywords']
        mkw = modelmeta[i]['keywords']
        #
        # Check number of keywords.
        #
        # A count mismatch skips all further checks for this HDU.
        if len(dkw) != len(mkw):
            log.warning(
                "Prototype file %s has the wrong number of " +
                "HDU%d keywords according to %s.", self.prototype, i,
                self.filename)
            continue
        #
        # If number of keywords is correct, check them individually.
        #
        # Keywords are compared positionally, name only (index 0).
        for j in range(len(dkw)):
            if dkw[j][0] != mkw[j][0]:
                log.warning(
                    "Prototype file %s has a keyword " +
                    "mismatch (%s != %s) in HDU%d according to " +
                    "%s.", self.prototype, dkw[j][0], mkw[j][0], i,
                    self.filename)
        #
        # Check the extension type.
        #
        dex = stub_meta[i]['extension']
        try:
            mex = modelmeta[i]['extension']
        except KeyError:
            mex = "Extension type not found"
        if dex != mex:
            log.warning(
                "Prototype file %s has an extension type " +
                "mismatch in HDU%d (%s != %s) " +
                "according to %s.", self.prototype, i, dex, mex,
                self.filename)
            continue
        #
        # Check for EXTNAME
        #
        dexex = stub_meta[i]['extname']
        mexex = modelmeta[i]['extname']
        # HDU0 (the primary HDU) is allowed to have no EXTNAME.
        if dexex == '' and i > 0:
            log.warning("Prototype file %s has no EXTNAME in HDU%d.",
                        self.prototype, i)
        if (dexex != '' and mexex != '' and dexex != mexex):
            log.warning(
                "Prototype file %s has an EXTNAME mismatch " +
                "in HDU%d (%s != %s) " +
                "according to %s.", self.prototype, i, dexex, mexex,
                self.filename)
        #
        # If the extension type is correct, check the contents of the
        # extension.
        #
        dexf = stub_meta[i]['format']
        try:
            mexf = modelmeta[i]['format']
        except KeyError:
            mexf = "Extension format not found"
        if dex == 'IMAGE':
            # For images the format is a string; compare only up to the
            # first comma of the *data* format string.
            try:
                icomma = dexf.index(',')
            except ValueError:
                icomma = len(dexf)
            if dexf[:icomma] != mexf[:icomma]:
                log.warning(
                    "Prototype file %s has an extension " +
                    "format mismatch in HDU%d " +
                    "according to %s.", self.prototype, i,
                    self.filename)
        else:
            # For tables the format is a sequence of rows; the first row
            # is a header line, so drop it before comparing columns.
            dexf = dexf[1:]  # Get rid of header line.
            if len(dexf) != len(mexf):
                log.warning(
                    "Prototype file %s has the wrong " +
                    "number of HDU%d columns according to %s.",
                    self.prototype, i, self.filename)
            else:
                # Compare column names positionally (index 0 of each row).
                for j in range(len(dexf)):
                    if dexf[j][0] != mexf[j][0]:
                        log.warning(
                            "Prototype file %s has a " +
                            "column name mismatch (%s != %s) " +
                            "in HDU%d according to %s.",
                            self.prototype, dexf[j][0], mexf[j][0],
                            i, self.filename)
    return
def main():
    """Entry point for the check_model script.

    Returns
    -------
    :class:`int`
        An integer suitable for passing to :func:`sys.exit`.
    """
    from sys import argv
    from argparse import ArgumentParser
    desc = """Check actual files against the data model for validity.
"""
    parser = ArgumentParser(description=desc,
                            prog=os.path.basename(argv[0]))
    parser.add_argument('-d', '--datamodel-dir', dest='desidatamodel',
                        metavar='DIR',
                        help='Override the value of DESIDATAMODEL.')
    parser.add_argument('-F', '--compare-files', dest='files',
                        action='store_true',
                        help='Compare an individual data model to an individual file.')
    parser.add_argument('-W', '--warning-is-error', dest='error',
                        action='store_true',
                        help='Data model warnings raise exceptions.')
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true',
                        help='Set log level to DEBUG.')
    parser.add_argument('section', metavar='DIR or FILE',
                        help='Section of the data model or individual model file.')
    parser.add_argument('directory', metavar='DIR or FILE',
                        help='Check files in this top-level directory, or one individual file.')
    options = parser.parse_args()
    if options.verbose:
        log.setLevel(DEBUG)
    #
    # BUG FIX: -d/--datamodel-dir is documented to *override* DESIDATAMODEL,
    # so it must be checked before the environment variable; previously the
    # environment variable always won when both were present.
    #
    if options.desidatamodel is not None:
        data_model_root = options.desidatamodel
    elif 'DESIDATAMODEL' in os.environ:
        data_model_root = os.environ['DESIDATAMODEL']
    else:
        log.critical(("DESIDATAMODEL is not defined. " +
                      "Cannot find data model files!"))
        return 1
    log.debug("DESIDATAMODEL=%s", data_model_root)
    if options.files:
        # Compare one model file against one data file; no filename-pattern
        # scanning is needed, so the prototype is assigned directly.
        filename = os.path.join(data_model_root, 'doc', options.section)
        section = os.path.join(data_model_root, 'doc',
                               options.section.split('/')[0])
        log.info("Loading individual data model: %s.", filename)
        files = [DataModel(filename, section)]
        log.info("Skipping regular expression processing.")
        # files[0].get_regexp(options.directory, error=options.error)
        log.info("Setting prototype file for %s to %s.",
                 filename, options.directory)
        files[0].prototype = options.directory
    else:
        # Scan a whole section of the data model and match files on disk.
        section = os.path.join(data_model_root, 'doc', options.section)
        log.info("Loading data model file in %s.", section)
        files = scan_model(section)
        log.info("Searching for data files in %s.", options.directory)
        files_to_regexp(options.directory, files, error=options.error)
        log.info("Identifying prototype files in %s.", options.directory)
        collect_files(options.directory, files)
    validate_prototypes(files, error=options.error)
    return 0
def validate_prototype(self, error=False):
    """Compares a model's prototype data file to the data models.

    Parameters
    ----------
    error : :class:`bool`, optional
        If ``True``, failure to extract certain required metadata raises an
        exception.

    Notes
    -----
    * Use set theory to compare the data headers to model headers.  This should
      automatically find missing headers, extraneous headers, etc.
    """
    if self.prototype is None:
        #
        # A warning should have been issued already, so just skip silently.
        #
        return
    log.info("Comparing %s to %s.", self.prototype, self.filename)
    # Lazily construct the Stub describing the prototype file, and cache
    # its per-HDU metadata alongside the model's own metadata.
    if self._stub is None:
        self._stub = Stub(self.prototype, error=error)
    stub_meta = self._stub_meta = self._stub.hdumeta
    modelmeta = self.extract_metadata(error=error)
    #
    # Check number of headers.
    #
    if self._stub.nhdr != len(modelmeta):
        log.warning("Prototype file %s has the wrong number of " +
                    "sections (HDUs) according to %s.",
                    self.prototype, self.filename)
        return
    for i in range(self._stub.nhdr):
        dkw = stub_meta[i]['keywords']
        mkw = modelmeta[i]['keywords']
        #
        # Check number of keywords.
        #
        # A count mismatch skips all further checks for this HDU.
        if len(dkw) != len(mkw):
            log.warning("Prototype file %s has the wrong number of " +
                        "HDU%d keywords according to %s.",
                        self.prototype, i, self.filename)
            continue
        #
        # If number of keywords is correct, check them individually.
        #
        # Keywords are compared positionally, name only (index 0).
        for j in range(len(dkw)):
            if dkw[j][0] != mkw[j][0]:
                log.warning("Prototype file %s has a keyword " +
                            "mismatch (%s != %s) in HDU%d according to " +
                            "%s.", self.prototype, dkw[j][0], mkw[j][0],
                            i, self.filename)
        #
        # Check the extension type.
        #
        dex = stub_meta[i]['extension']
        try:
            mex = modelmeta[i]['extension']
        except KeyError:
            mex = "Extension type not found"
        if dex != mex:
            log.warning("Prototype file %s has an extension type " +
                        "mismatch in HDU%d (%s != %s) " +
                        "according to %s.",
                        self.prototype, i, dex, mex, self.filename)
            continue
        #
        # Check for EXTNAME
        #
        dexex = stub_meta[i]['extname']
        mexex = modelmeta[i]['extname']
        # HDU0 (the primary HDU) is allowed to have no EXTNAME.
        if dexex == '' and i > 0:
            log.warning("Prototype file %s has no EXTNAME in HDU%d.",
                        self.prototype, i)
        if (dexex != '' and mexex != '' and dexex != mexex):
            log.warning("Prototype file %s has an EXTNAME mismatch " +
                        "in HDU%d (%s != %s) " +
                        "according to %s.",
                        self.prototype, i, dexex, mexex, self.filename)
        #
        # If the extension type is correct, check the contents of the
        # extension.
        #
        dexf = stub_meta[i]['format']
        try:
            mexf = modelmeta[i]['format']
        except KeyError:
            mexf = "Extension format not found"
        if dex == 'IMAGE':
            # For images the format is a string; compare only up to the
            # first comma of the *data* format string.
            try:
                icomma = dexf.index(',')
            except ValueError:
                icomma = len(dexf)
            if dexf[:icomma] != mexf[:icomma]:
                log.warning("Prototype file %s has an extension " +
                            "format mismatch in HDU%d " +
                            "according to %s.",
                            self.prototype, i, self.filename)
        else:
            # For tables the format is a sequence of rows; the first row
            # is a header line, so drop it before comparing columns.
            dexf = dexf[1:]  # Get rid of header line.
            if len(dexf) != len(mexf):
                log.warning("Prototype file %s has the wrong " +
                            "number of HDU%d columns according to %s.",
                            self.prototype, i, self.filename)
            else:
                # Compare column names positionally (index 0 of each row).
                for j in range(len(dexf)):
                    if dexf[j][0] != mexf[j][0]:
                        log.warning("Prototype file %s has a " +
                                    "column name mismatch (%s != %s) " +
                                    "in HDU%d according to %s.",
                                    self.prototype, dexf[j][0],
                                    mexf[j][0], i, self.filename)
    return
def main(): """Entry point for command-line script. Returns ------- :class:`int` An integer suitable for passing to :func:`sys.exit`. """ # from pkg_resources import resource_filename # # command-line arguments # options = get_options() # # Logging # if options.verbose: log = get_logger(DEBUG, timestamp=True) else: log = get_logger(INFO, timestamp=True) # # Initialize DB # postgresql = setup_db(options) # # Load configuration # loader = [ { 'filepath': os.path.join(options.datapath, 'targets', 'truth-dark.fits'), 'tcls': Truth, 'hdu': 'TRUTH', 'expand': None, 'convert': None, 'index': None, 'q3c': False, 'chunksize': options.chunksize, 'maxrows': options.maxrows }, { 'filepath': os.path.join(options.datapath, 'targets', 'targets-dark.fits'), 'tcls': Target, 'hdu': 'TARGETS', 'expand': { 'DCHISQ': ( 'dchisq_psf', 'dchisq_rex', 'dchisq_dev', 'dchisq_exp', 'dchisq_comp', ) }, 'convert': None, 'index': None, 'q3c': postgresql, 'chunksize': options.chunksize, 'maxrows': options.maxrows }, { 'filepath': os.path.join(options.datapath, 'survey', 'exposures.fits'), 'tcls': ObsList, 'hdu': 'EXPOSURES', 'expand': { 'PASS': '******' }, # 'convert': {'dateobs': lambda x: convert_dateobs(x, tzinfo=utc)}, 'convert': None, 'index': None, 'q3c': postgresql, 'chunksize': options.chunksize, 'maxrows': options.maxrows }, { 'filepath': os.path.join(options.datapath, 'spectro', 'redux', 'mini', 'zcatalog-mini.fits'), 'tcls': ZCat, 'hdu': 'ZCATALOG', 'expand': { 'COEFF': ( 'coeff_0', 'coeff_1', 'coeff_2', 'coeff_3', 'coeff_4', 'coeff_5', 'coeff_6', 'coeff_7', 'coeff_8', 'coeff_9', ) }, 'convert': None, 'rowfilter': lambda x: ((x['TARGETID'] != 0) & (x['TARGETID'] != -1)), 'q3c': postgresql, 'chunksize': options.chunksize, 'maxrows': options.maxrows } ] # # Load the tables that correspond to a single file. # for l in loader: tn = l['tcls'].__tablename__ # # Don't use .one(). It actually fetches *all* rows. 
# q = dbSession.query(l['tcls']).first() if q is None: if options.zbest and tn == 'zcat': log.info("Loading %s from zbest files in %s.", tn, options.datapath) load_zbest(datapath=options.datapath, q3c=postgresql) else: log.info("Loading %s from %s.", tn, l['filepath']) load_file(**l) log.info("Finished loading %s.", tn) else: log.info("%s table already loaded.", tn.title()) # # Update truth table. # for h in ('BGS', 'ELG', 'LRG', 'QSO', 'STAR', 'WD'): update_truth( os.path.join(options.datapath, 'targets', 'truth-dark.fits'), 'TRUTH_' + h) # # Load fiber assignment files. # q = dbSession.query(FiberAssign).first() if q is None: log.info("Loading FiberAssign from %s.", options.datapath) load_fiberassign(options.datapath, q3c=postgresql) log.info("Finished loading FiberAssign.") else: log.info("FiberAssign table already loaded.") return 0
def setup_db(options=None, **kwargs):
    """Initialize the database connection.

    Parameters
    ----------
    options : :class:`argparse.Namespace`
        Parsed command-line options.
    kwargs : keywords
        If present, use these instead of `options`.  This is more
        user-friendly than setting up a :class:`~argparse.Namespace` object in,
        *e.g.* a Jupyter Notebook.

    Returns
    -------
    :class:`bool`
        ``True`` if the configured database is a PostgreSQL database.

    Raises
    ------
    :exc:`ValueError`
        If neither `options` nor any keywords are supplied.
    """
    global engine, schemaname
    #
    # Schema creation
    #
    if options is None:
        if len(kwargs) > 0:
            # dict.get() replaces the previous chain of try/except KeyError
            # blocks; defaults are identical.
            schema = kwargs.get('schema', None)
            overwrite = kwargs.get('overwrite', False)
            hostname = kwargs.get('hostname', None)
            username = kwargs.get('username', '******')
            dbfile = kwargs.get('dbfile', 'redshift.db')
            datapath = kwargs.get('datapath', None)
            verbose = kwargs.get('verbose', False)
        else:
            raise ValueError("No options specified!")
    else:
        schema = options.schema
        overwrite = options.overwrite
        hostname = options.hostname
        username = options.username
        dbfile = options.dbfile
        datapath = options.datapath
        verbose = options.verbose
    if schema:
        schemaname = schema
        # event.listen(Base.metadata, 'before_create', CreateSchema(schemaname))
        # Register DDL to (re)create the schema just before table creation.
        if overwrite:
            event.listen(Base.metadata, 'before_create',
                         DDL('DROP SCHEMA IF EXISTS {0} CASCADE'.format(schemaname)))
        event.listen(Base.metadata, 'before_create',
                     DDL('CREATE SCHEMA IF NOT EXISTS {0}'.format(schemaname)))
    #
    # Create the file.
    #
    postgresql = False
    if hostname:
        postgresql = True
        # Look up connection credentials in ~/.pgpass.
        db_connection = parse_pgpass(hostname=hostname, username=username)
        if db_connection is None:
            log.critical("Could not load database information!")
            # NOTE(review): returns 1 (truthy) although the documented return
            # type is bool, so a caller using this as the "is PostgreSQL"
            # flag would be misled -- confirm intended behavior.
            return 1
    else:
        # SQLite: a bare filename is placed inside datapath.
        if os.path.basename(dbfile) == dbfile:
            db_file = os.path.join(datapath, dbfile)
        else:
            db_file = dbfile
        if overwrite and os.path.exists(db_file):
            log.info("Removing file: %s.", db_file)
            os.remove(db_file)
        db_connection = 'sqlite:///' + db_file
    #
    # SQLAlchemy stuff.
    #
    engine = create_engine(db_connection, echo=verbose)
    dbSession.remove()
    dbSession.configure(bind=engine, autoflush=False, expire_on_commit=False)
    log.info("Begin creating tables.")
    for tab in Base.metadata.tables.values():
        tab.schema = schemaname
    Base.metadata.create_all(engine)
    log.info("Finished creating tables.")
    return postgresql
def load_fiberassign(datapath, maxpass=4, hdu='FIBERASSIGN', q3c=False,
                     latest_epoch=False, last_column='NUMOBS_MORE'):
    """Load fiber assignment files into the fiberassign table.

    Tile files can appear in multiple epochs, so for a given tileid, load
    the tile file with the largest value of epoch.  In the "real world",
    a tile file appears in each epoch until it is observed, therefore
    the tile file corresponding to the actual observation is the one
    with the largest epoch.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing tile files.
    maxpass : :class:`int`, optional
        Search for pass numbers up to this value (default 4).
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'FIBERASSIGN').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    latest_epoch : :class:`bool`, optional
        If set, search for the latest tile file among several epochs.
    last_column : :class:`str`, optional
        Do not load columns past this name (default 'NUMOBS_MORE').
    """
    # NOTE(review): `maxpass` is currently unused in this body -- confirm
    # whether it should constrain the file search.
    fiberpath = os.path.join(datapath, 'fiberassign*.fits')
    log.info("Using tile file search path: %s.", fiberpath)
    tile_files = glob.glob(fiberpath)
    if len(tile_files) == 0:
        log.error("No tile files found!")
        return
    log.info("Found %d tile files.", len(tile_files))
    #
    # Find the latest epoch for every tile file.
    #
    latest_tiles = dict()
    if latest_epoch:
        tileidre = re.compile(r'/(\d+)/fiberassign/fiberassign\-(\d+)\.fits$')
        for f in tile_files:
            m = tileidre.search(f)
            if m is None:
                log.error("Could not match %s!", f)
                continue
            epoch, tileid = map(int, m.groups())
            if tileid in latest_tiles:
                if latest_tiles[tileid][0] < epoch:
                    latest_tiles[tileid] = (epoch, f)
            else:
                latest_tiles[tileid] = (epoch, f)
    else:
        # BUG FIX: the pattern was a non-raw string ('fiberassign\-...'),
        # which relies on invalid escape sequences, and an unmatched
        # filename made re.match(...)[1] raise TypeError.  Use a raw,
        # precompiled pattern and skip non-matching files with an error,
        # mirroring the latest_epoch branch above.
        tileidre = re.compile(r'fiberassign\-(\d+)\.fits')
        for f in tile_files:
            # fiberassign-TILEID.fits
            m = tileidre.match(os.path.basename(f))
            if m is None:
                log.error("Could not match %s!", f)
                continue
            latest_tiles[int(m.group(1))] = (0, f)
    log.info("Identified %d tile files for loading.", len(latest_tiles))
    #
    # Read the identified tile files.
    #
    # data_index marks one past the position of last_column; it is None on
    # the first pass so that the first file's integrity check covers all
    # columns, then it is fixed from that file's column order.
    data_index = None
    for tileid in latest_tiles:
        epoch, f = latest_tiles[tileid]
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s", f, hdu)
        # Integrity check: warn about NaNs in float columns, and repair
        # the positional columns with a sentinel value.
        for col in data.names[:data_index]:
            if data[col].dtype.kind == 'f':
                bad = np.isnan(data[col])
                if np.any(bad):
                    nbad = bad.sum()
                    log.warning("%d rows of bad data detected in column " +
                                "%s of %s.", nbad, col, f)
                    #
                    # This replacement may be deprecated in the future.
                    #
                    if col in ('TARGET_RA', 'TARGET_DEC',
                               'FIBERASSIGN_X', 'FIBERASSIGN_Y'):
                        data[col][bad] = -9999.0
                        assert not np.any(np.isnan(data[col]))
                        assert np.all(np.isfinite(data[col]))
        n_rows = len(data)
        if data_index is None:
            data_index = data.names.index(last_column) + 1
        # Prepend the tileid as an explicit column, then convert to rows.
        data_list = ([[tileid] * n_rows] +
                     [data[col].tolist() for col in data.names[:data_index]])
        data_names = (['tileid'] +
                      [col.lower() for col in data.names[:data_index]])
        log.info("Initial column conversion complete on tileid = %d.", tileid)
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on tileid = %d.", tileid)
        dbSession.bulk_insert_mappings(FiberAssign,
                                       [dict(zip(data_names, row))
                                        for row in data_rows])
        log.info("Inserted %d rows in %s for tileid = %d.",
                 n_rows, FiberAssign.__tablename__, tileid)
        dbSession.commit()
    if q3c:
        q3c_index('fiberassign', ra='target_ra')
    return
def load_zbest(datapath=None, hdu='ZBEST', q3c=False):
    """Load zbest files into the zcat table.

    This function is deprecated since there should now be a single
    redshift catalog file.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing zbest files.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'ZBEST').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    """
    if datapath is None:
        datapath = specprod_root()
    zbestpath = os.path.join(datapath, 'spectra-64', '*', '*',
                             'zbest-64-*.fits')
    log.info("Using zbest file search path: %s.", zbestpath)
    zbest_files = glob.glob(zbestpath)
    if len(zbest_files) == 0:
        log.error("No zbest files found!")
        return
    log.info("Found %d zbest files.", len(zbest_files))
    #
    # Read the identified zbest files.
    #
    for f in zbest_files:
        # The brick name is the name of the containing directory.
        brickname = os.path.basename(os.path.dirname(f))
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s.", f, hdu)
        # Drop rows with sentinel TARGETID values (0 and -1).
        good_targetids = ((data['TARGETID'] != 0) & (data['TARGETID'] != -1))
        #
        # If there are too many targetids, the in_ clause will blow up.
        # Disabling this test, and crossing fingers.
        #
        # q = dbSession.query(ZCat).filter(ZCat.targetid.in_(data['TARGETID'].tolist())).all()
        # if len(q) != 0:
        #     log.warning("Duplicate TARGETID found in %s.", f)
        #     for z in q:
        #         log.warning("Duplicate TARGETID = %d.", z.targetid)
        #         good_targetids = good_targetids & (data['TARGETID'] != z.targetid)
        data_list = [data[col][good_targetids].tolist() for col in data.names]
        data_names = [col.lower() for col in data.names]
        log.info("Initial column conversion complete on brick = %s.", brickname)
        #
        # Expand COEFF
        #
        col = 'COEFF'
        expand = ('coeff_0', 'coeff_1', 'coeff_2', 'coeff_3', 'coeff_4',
                  'coeff_5', 'coeff_6', 'coeff_7', 'coeff_8', 'coeff_9',)
        i = data_names.index(col.lower())
        del data_names[i]
        del data_list[i]
        for j, n in enumerate(expand):
            log.debug("Expanding column %d of %s (at index %d) to %s.",
                      j, col, i, n)
            data_names.insert(i + j, n)
            # BUG FIX: the expanded columns were taken from the *unfiltered*
            # array (data[col][:, j]) while every other column was filtered
            # by good_targetids, misaligning rows whenever any TARGETID was
            # rejected.  Apply the same row mask here.
            data_list.insert(i + j, data[col][good_targetids][:, j].tolist())
        log.debug(data_names)
        #
        # zbest files don't contain the same columns as zcatalog.
        #
        # Pad any missing zcat columns with zeros so every row dict is complete.
        for col in ZCat.__table__.columns:
            if col.name not in data_names:
                data_names.append(col.name)
                data_list.append([0] * len(data_list[0]))
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on brick = %s.", brickname)
        try:
            dbSession.bulk_insert_mappings(ZCat, [dict(zip(data_names, row))
                                                  for row in data_rows])
        except IntegrityError as e:
            # Typically a duplicate TARGETID; skip this brick's insert.
            log.error("Integrity Error detected!")
            log.error(e)
            dbSession.rollback()
        else:
            log.info("Inserted %d rows in %s for brick = %s.",
                     len(data_rows), ZCat.__tablename__, brickname)
            dbSession.commit()
    if q3c:
        q3c_index('zcat')
    return
def load_file(filepath, tcls, hdu=1, expand=None, convert=None, index=None,
              rowfilter=None, q3c=False, chunksize=50000, maxrows=0):
    """Load a data file into the database, assuming that column names map
    to database column names with no surprises.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    tcls : :class:`sqlalchemy.ext.declarative.api.DeclarativeMeta`
        The table to load, represented by its class.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 1).
    expand : :class:`dict`, optional
        If set, map FITS column names to one or more alternative column names.
    convert : :class:`dict`, optional
        If set, convert the data for a named (database) column using the
        supplied function.
    index : :class:`str`, optional
        If set, add a column that just counts the number of rows.
    rowfilter : callable, optional
        If set, apply this filter to the rows to be loaded.  The function
        should return :class:`bool`, with ``True`` meaning a good row.
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    chunksize : :class:`int`, optional
        If set, load database `chunksize` rows at a time (default 50000).
    maxrows : :class:`int`, optional
        If set, stop loading after `maxrows` are loaded.  Alternatively,
        set `maxrows` to zero (0) to load all rows.
    """
    tn = tcls.__tablename__
    # Accept either a FITS binary table or an ECSV file; anything else is
    # logged and skipped.
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    if maxrows == 0:
        maxrows = len(data)
    log.info("Read data from %s HDU %s", filepath, hdu)
    # FITS tables expose .names; astropy Tables expose .colnames.
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    # Integrity check: warn about NaNs in float columns, repairing a few
    # known-bad flux columns with a sentinel value.
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col][0:maxrows])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
                #
                # Temporary workaround for bad flux values, see
                # https://github.com/desihub/desitarget/issues/397
                #
                if col in ('FLUX_R', 'FIBERFLUX_R', 'FIBERTOTFLUX_R'):
                    data[col][0:maxrows][bad] = -9999.0
    log.info("Integrity check complete on %s.", tn)
    # BUG FIX: np.bool was a deprecated alias of the builtin bool and has
    # been removed in NumPy >= 1.24; use the builtin directly.
    if rowfilter is None:
        good_rows = np.ones((maxrows,), dtype=bool)
    else:
        good_rows = rowfilter(data[0:maxrows])
    data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    data_names = [col.lower() for col in colnames]
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    if expand is not None:
        for col in expand:
            i = data_names.index(col.lower())
            if isinstance(expand[col], str):
                #
                # Just rename a column.
                #
                log.debug("Renaming column %s (at index %d) to %s.",
                          data_names[i], i, expand[col])
                data_names[i] = expand[col]
            else:
                #
                # Assume this is an expansion of an array-valued column
                # into individual columns.
                #
                del data_names[i]
                del data_list[i]
                for j, n in enumerate(expand[col]):
                    log.debug("Expanding column %d of %s (at index %d) to %s.",
                              j, col, i, n)
                    data_names.insert(i + j, n)
                    # BUG FIX: previously read data[col][:, j], i.e. the
                    # full unfiltered array, which misaligned the expanded
                    # columns with the maxrows/rowfilter-trimmed ones.
                    data_list.insert(i + j,
                                     data[col][0:maxrows][good_rows][:, j].tolist())
                log.debug(data_names)
        log.info("Column expansion complete on %s.", tn)
    # Release the (potentially large) raw table before row conversion.
    del data
    if convert is not None:
        for col in convert:
            i = data_names.index(col)
            data_list[i] = [convert[col](x) for x in data_list[i]]
        log.info("Column conversion complete on %s.", tn)
    if index is not None:
        # Prepend a simple 1-based row counter column.
        data_list.insert(0, list(range(1, finalrows + 1)))
        data_names.insert(0, index)
        log.info("Added index column '%s'.", index)
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    # Insert in chunks; the final iteration may be empty when finalrows is
    # an exact multiple of chunksize, hence the length guard.
    for k in range(finalrows // chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k * chunksize:(k + 1) * chunksize]]
        if len(data_chunk) > 0:
            engine.execute(tcls.__table__.insert(), data_chunk)
            log.info("Inserted %d rows in %s.",
                     min((k + 1) * chunksize, finalrows), tn)
    if q3c:
        q3c_index(tn)
    return
def main(): """Entry point for command-line script. Returns ------- :class:`int` An integer suitable for passing to :func:`sys.exit`. """ # from pkg_resources import resource_filename # # command-line arguments # options = get_options() # # Logging # if options.verbose: log = get_logger(DEBUG, timestamp=True) else: log = get_logger(INFO, timestamp=True) # # Initialize DB # postgresql = setup_db(options) # # Load configuration # loader = [{'filepath': os.path.join(options.datapath, 'targets', 'truth.fits'), 'tcls': Truth, 'hdu': 'TRUTH', 'expand': None, 'convert': None, 'index': None, 'q3c': False, 'chunksize': options.chunksize, 'maxrows': options.maxrows}, {'filepath': os.path.join(options.datapath, 'targets', 'targets.fits'), 'tcls': Target, 'hdu': 'TARGETS', 'expand': {'DCHISQ': ('dchisq_psf', 'dchisq_rex', 'dchisq_dev', 'dchisq_exp', 'dchisq_comp',)}, 'convert': None, 'index': None, 'q3c': postgresql, 'chunksize': options.chunksize, 'maxrows': options.maxrows}, {'filepath': os.path.join(options.datapath, 'survey', 'exposures.fits'), 'tcls': ObsList, 'hdu': 'EXPOSURES', 'expand': {'PASS': '******'}, # 'convert': {'dateobs': lambda x: convert_dateobs(x, tzinfo=utc)}, 'convert': None, 'index': None, 'q3c': postgresql, 'chunksize': options.chunksize, 'maxrows': options.maxrows}, {'filepath': os.path.join(options.datapath, 'spectro', 'redux', 'mini', 'zcatalog-mini.fits'), 'tcls': ZCat, 'hdu': 'ZCATALOG', 'expand': {'COEFF': ('coeff_0', 'coeff_1', 'coeff_2', 'coeff_3', 'coeff_4', 'coeff_5', 'coeff_6', 'coeff_7', 'coeff_8', 'coeff_9',)}, 'convert': None, 'rowfilter': lambda x: ((x['TARGETID'] != 0) & (x['TARGETID'] != -1)), 'q3c': postgresql, 'chunksize': options.chunksize, 'maxrows': options.maxrows}] # # Load the tables that correspond to a single file. # for l in loader: tn = l['tcls'].__tablename__ # # Don't use .one(). It actually fetches *all* rows. 
# q = dbSession.query(l['tcls']).first() if q is None: if options.zbest and tn == 'zcat': log.info("Loading %s from zbest files in %s.", tn, options.datapath) load_zbest(datapath=options.datapath, q3c=postgresql) else: log.info("Loading %s from %s.", tn, l['filepath']) load_file(**l) log.info("Finished loading %s.", tn) else: log.info("%s table already loaded.", tn.title()) # # Update truth table. # for h in ('BGS', 'ELG', 'LRG', 'QSO', 'STAR', 'WD'): update_truth(os.path.join(options.datapath, 'targets', 'truth.fits'), 'TRUTH_' + h) # # Load fiber assignment files. # q = dbSession.query(FiberAssign).first() if q is None: log.info("Loading FiberAssign from %s.", options.datapath) load_fiberassign(options.datapath, q3c=postgresql) log.info("Finished loading FiberAssign.") else: log.info("FiberAssign table already loaded.") return 0
def setup_db(options=None, **kwargs):
    """Initialize the database connection.

    Sets the module-level ``engine`` and ``schemaname`` globals as a
    side effect.

    Parameters
    ----------
    options : :class:`argparse.Namespace`
        Parsed command-line options.
    kwargs : keywords
        If present, use these instead of `options`.  This is more
        user-friendly than setting up a :class:`~argparse.Namespace` object in,
        *e.g.* a Jupyter Notebook.

    Returns
    -------
    :class:`bool`
        ``True`` if the configured database is a PostgreSQL database.

    Raises
    ------
    ValueError
        If neither `options` nor any keyword arguments are supplied.
    """
    global engine, schemaname
    #
    # Schema creation
    #
    if options is None:
        if len(kwargs) > 0:
            # dict.get() replaces the original chain of try/except
            # KeyError blocks; the defaults are unchanged.
            schema = kwargs.get('schema')
            overwrite = kwargs.get('overwrite', False)
            hostname = kwargs.get('hostname')
            username = kwargs.get('username', '******')
            dbfile = kwargs.get('dbfile', 'redshift.db')
            datapath = kwargs.get('datapath')
            verbose = kwargs.get('verbose', False)
        else:
            raise ValueError("No options specified!")
    else:
        schema = options.schema
        overwrite = options.overwrite
        hostname = options.hostname
        username = options.username
        dbfile = options.dbfile
        datapath = options.datapath
        verbose = options.verbose
    if schema:
        schemaname = schema
        # event.listen(Base.metadata, 'before_create', CreateSchema(schemaname))
        if overwrite:
            event.listen(Base.metadata, 'before_create',
                         DDL('DROP SCHEMA IF EXISTS {0} CASCADE'.format(schemaname)))
        event.listen(Base.metadata, 'before_create',
                     DDL('CREATE SCHEMA IF NOT EXISTS {0}'.format(schemaname)))
    #
    # Create the file.
    #
    postgresql = False
    if hostname:
        postgresql = True
        db_connection = parse_pgpass(hostname=hostname, username=username)
        if db_connection is None:
            log.critical("Could not load database information!")
            # NOTE(review): returns 1 (a truthy int) rather than the
            # documented bool, and leaves ``engine`` unconfigured, so a
            # caller testing the return value as "is PostgreSQL" proceeds
            # incorrectly.  Preserved for backward compatibility.
            return 1
    else:
        # A bare filename is placed inside datapath; an explicit path is
        # used as-is.
        if os.path.basename(dbfile) == dbfile:
            db_file = os.path.join(datapath, dbfile)
        else:
            db_file = dbfile
        if overwrite and os.path.exists(db_file):
            log.info("Removing file: %s.", db_file)
            os.remove(db_file)
        db_connection = 'sqlite:///'+db_file
    #
    # SQLAlchemy stuff.
    #
    engine = create_engine(db_connection, echo=verbose)
    dbSession.remove()
    dbSession.configure(bind=engine, autoflush=False, expire_on_commit=False)
    log.info("Begin creating tables.")
    for tab in Base.metadata.tables.values():
        tab.schema = schemaname
    Base.metadata.create_all(engine)
    log.info("Finished creating tables.")
    return postgresql
def load_fiberassign(datapath, maxpass=4, hdu='FIBERASSIGN', q3c=False,
                     latest_epoch=False, last_column='SUBPRIORITY'):
    """Load fiber assignment files into the fiberassign table.

    Tile files can appear in multiple epochs, so for a given tileid, load
    the tile file with the largest value of epoch.  In the "real world",
    a tile file appears in each epoch until it is observed, therefore
    the tile file corresponding to the actual observation is the one
    with the largest epoch.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing tile files.
    maxpass : :class:`int`, optional
        Search for pass numbers up to this value (default 4).
        NOTE(review): currently unused in the function body — confirm.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'FIBERASSIGN').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    latest_epoch : :class:`bool`, optional
        If set, search for the latest tile file among several epochs.
    last_column : :class:`str`, optional
        Do not load columns past this name (default 'SUBPRIORITY').
    """
    fiberpath = os.path.join(datapath, 'tile*.fits')
    log.info("Using tile file search path: %s.", fiberpath)
    tile_files = glob.glob(fiberpath)
    if len(tile_files) == 0:
        log.error("No tile files found!")
        return
    log.info("Found %d tile files.", len(tile_files))
    #
    # Find the latest epoch for every tile file.
    #
    latest_tiles = dict()
    if latest_epoch:
        tileidre = re.compile(r'/(\d+)/fiberassign/tile-(\d+)\.fits$')
        for f in tile_files:
            m = tileidre.search(f)
            if m is None:
                log.error("Could not match %s!", f)
                continue
            epoch, tileid = map(int, m.groups())
            if tileid in latest_tiles:
                if latest_tiles[tileid][0] < epoch:
                    latest_tiles[tileid] = (epoch, f)
            else:
                latest_tiles[tileid] = (epoch, f)
    else:
        # Compile once outside the loop; the raw string fixes the invalid
        # escape sequence '\-' that a plain string literal produced.
        tileidre = re.compile(r'tile[-_](\d+)\.fits')
        for f in tile_files:
            # tile_TILEID.fits or tile-TILEID.fits
            tileid = int(tileidre.match(os.path.basename(f))[1])
            latest_tiles[tileid] = (0, f)
    log.info("Identified %d tile files for loading.", len(latest_tiles))
    #
    # Read the identified tile files.
    #
    # data_index is None for the first file (slice over all columns) and
    # is then fixed at the position just past last_column.
    data_index = None
    for tileid in latest_tiles:
        epoch, f = latest_tiles[tileid]
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s", f, hdu)
        for col in data.names[:data_index]:
            if data[col].dtype.kind == 'f':
                bad = np.isnan(data[col])
                if np.any(bad):
                    nbad = bad.sum()
                    log.warning("%d rows of bad data detected in column " +
                                "%s of %s.", nbad, col, f)
                    #
                    # This replacement may be deprecated in the future.
                    #
                    if col in ('TARGET_RA', 'TARGET_DEC',
                               'DESIGN_X', 'DESIGN_Y'):
                        data[col][bad] = -9999.0
                        assert not np.any(np.isnan(data[col]))
                        assert np.all(np.isfinite(data[col]))
        n_rows = len(data)
        if data_index is None:
            data_index = data.names.index(last_column) + 1
        # Prepend a constant tileid column so every row records its tile.
        data_list = ([[tileid]*n_rows] +
                     [data[col].tolist() for col in data.names[:data_index]])
        data_names = (['tileid'] +
                      [col.lower() for col in data.names[:data_index]])
        log.info("Initial column conversion complete on tileid = %d.", tileid)
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on tileid = %d.", tileid)
        dbSession.bulk_insert_mappings(FiberAssign,
                                       [dict(zip(data_names, row))
                                        for row in data_rows])
        log.info("Inserted %d rows in %s for tileid = %d.",
                 n_rows, FiberAssign.__tablename__, tileid)
        dbSession.commit()
    if q3c:
        q3c_index('fiberassign', ra='target_ra')
    return
def load_zbest(datapath=None, hdu='ZBEST', q3c=False):
    """Load zbest files into the zcat table.

    This function is deprecated since there should now be a single
    redshift catalog file.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing zbest files.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'ZBEST').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    """
    if datapath is None:
        datapath = specprod_root()
    zbestpath = os.path.join(datapath, 'spectra-64', '*', '*',
                             'zbest-64-*.fits')
    log.info("Using zbest file search path: %s.", zbestpath)
    zbest_files = glob.glob(zbestpath)
    if len(zbest_files) == 0:
        log.error("No zbest files found!")
        return
    log.info("Found %d zbest files.", len(zbest_files))
    #
    # Read the identified zbest files.
    #
    for f in zbest_files:
        brickname = os.path.basename(os.path.dirname(f))
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s.", f, hdu)
        # Reject sentinel TARGETID values (0 and -1).
        good_targetids = ((data['TARGETID'] != 0) & (data['TARGETID'] != -1))
        #
        # If there are too many targetids, the in_ clause will blow up.
        # Disabling this test, and crossing fingers.
        #
        # q = dbSession.query(ZCat).filter(ZCat.targetid.in_(data['TARGETID'].tolist())).all()
        # if len(q) != 0:
        #     log.warning("Duplicate TARGETID found in %s.", f)
        #     for z in q:
        #         log.warning("Duplicate TARGETID = %d.", z.targetid)
        #         good_targetids = good_targetids & (data['TARGETID'] != z.targetid)
        data_list = [data[col][good_targetids].tolist()
                     for col in data.names]
        data_names = [col.lower() for col in data.names]
        log.info("Initial column conversion complete on brick = %s.",
                 brickname)
        #
        # Expand COEFF
        #
        col = 'COEFF'
        expand = ('coeff_0', 'coeff_1', 'coeff_2', 'coeff_3', 'coeff_4',
                  'coeff_5', 'coeff_6', 'coeff_7', 'coeff_8', 'coeff_9',)
        i = data_names.index(col.lower())
        del data_names[i]
        del data_list[i]
        for j, n in enumerate(expand):
            log.debug("Expanding column %d of %s (at index %d) to %s.",
                      j, col, i, n)
            data_names.insert(i + j, n)
            # BUG FIX: apply the same good_targetids filter used for every
            # other column.  The original data[col][:, j] kept the rejected
            # rows, so the expanded coefficient columns were longer than the
            # filtered columns and zip() silently mis-aligned the rows.
            data_list.insert(i + j, data[col][:, j][good_targetids].tolist())
        log.debug(data_names)
        #
        # zbest files don't contain the same columns as zcatalog.
        #
        for col in ZCat.__table__.columns:
            if col.name not in data_names:
                data_names.append(col.name)
                data_list.append([0]*len(data_list[0]))
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on brick = %s.", brickname)
        try:
            dbSession.bulk_insert_mappings(ZCat, [dict(zip(data_names, row))
                                                  for row in data_rows])
        except IntegrityError as e:
            log.error("Integrity Error detected!")
            log.error(e)
            dbSession.rollback()
        else:
            log.info("Inserted %d rows in %s for brick = %s.",
                     len(data_rows), ZCat.__tablename__, brickname)
            dbSession.commit()
    if q3c:
        q3c_index('zcat')
    return
def load_file(filepath, tcls, hdu=1, expand=None, convert=None, index=None,
              rowfilter=None, q3c=False, chunksize=50000, maxrows=0):
    """Load a data file into the database, assuming that column names map
    to database column names with no surprises.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    tcls : :class:`sqlalchemy.ext.declarative.api.DeclarativeMeta`
        The table to load, represented by its class.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 1).
    expand : :class:`dict`, optional
        If set, map FITS column names to one or more alternative column
        names.
    convert : :class:`dict`, optional
        If set, convert the data for a named (database) column using the
        supplied function.
    index : :class:`str`, optional
        If set, add a column that just counts the number of rows.
    rowfilter : callable, optional
        If set, apply this filter to the rows to be loaded.  The function
        should return :class:`bool`, with ``True`` meaning a good row.
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    chunksize : :class:`int`, optional
        If set, load database `chunksize` rows at a time (default 50000).
    maxrows : :class:`int`, optional
        If set, stop loading after `maxrows` are loaded.  Alternatively,
        set `maxrows` to zero (0) to load all rows.
    """
    tn = tcls.__tablename__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    if maxrows == 0:
        maxrows = len(data)
    log.info("Read data from %s HDU %s", filepath, hdu)
    # FITS tables expose .names; astropy Tables expose .colnames.
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col][0:maxrows])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
                #
                # Temporary workaround for bad flux values, see
                # https://github.com/desihub/desitarget/issues/397
                #
                if col in ('FLUX_R', 'FIBERFLUX_R', 'FIBERTOTFLUX_R'):
                    data[col][0:maxrows][bad] = -9999.0
    log.info("Integrity check complete on %s.", tn)
    if rowfilter is None:
        # np.bool was removed in NumPy 1.24; the builtin bool is the
        # documented equivalent.
        good_rows = np.ones((maxrows,), dtype=bool)
    else:
        good_rows = rowfilter(data[0:maxrows])
    data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    data_names = [col.lower() for col in colnames]
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    if expand is not None:
        for col in expand:
            i = data_names.index(col.lower())
            if isinstance(expand[col], str):
                #
                # Just rename a column.
                #
                log.debug("Renaming column %s (at index %d) to %s.",
                          data_names[i], i, expand[col])
                data_names[i] = expand[col]
            else:
                #
                # Assume this is an expansion of an array-valued column
                # into individual columns.
                #
                del data_names[i]
                del data_list[i]
                for j, n in enumerate(expand[col]):
                    log.debug("Expanding column %d of %s (at index %d) to %s.",
                              j, col, i, n)
                    data_names.insert(i + j, n)
                    # BUG FIX: apply the same maxrows/good_rows selection
                    # as the scalar columns.  The original data[col][:, j]
                    # ignored both filters, mis-aligning rows via zip()
                    # truncation whenever either filter removed anything.
                    data_list.insert(
                        i + j,
                        data[col][0:maxrows][good_rows][:, j].tolist())
                log.debug(data_names)
        log.info("Column expansion complete on %s.", tn)
    del data
    if convert is not None:
        for col in convert:
            i = data_names.index(col)
            data_list[i] = [convert[col](x) for x in data_list[i]]
    log.info("Column conversion complete on %s.", tn)
    if index is not None:
        # Simple 1-based row counter column.
        data_list.insert(0, list(range(1, finalrows+1)))
        data_names.insert(0, index)
        log.info("Added index column '%s'.", index)
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows//chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k*chunksize:(k+1)*chunksize]]
        if len(data_chunk) > 0:
            engine.execute(tcls.__table__.insert(), data_chunk)
            log.info("Inserted %d rows in %s.",
                     min((k+1)*chunksize, finalrows), tn)
    if q3c:
        q3c_index(tn)
    return