def read_archfile(i, f, filetype, row, colnames, archfiles, db):
    """Read filename ``f`` with index ``i`` (position within list of filenames).
    The file has type ``filetype`` and will be added to MSID file at row index
    ``row``.  ``colnames`` is the list of column names for the content type
    (not used here).
    """
    # Check if filename is already in archfiles.  If so then abort further processing.
    filename = os.path.basename(f)
    if db.fetchall('SELECT filename FROM archfiles WHERE filename=?', (filename,)):
        logger.verbose('File %s already in archfiles - unlinking and skipping' % f)
        os.unlink(f)
        return None, None

    # Read FITS archive file and convert the HDU data for this content type
    logger.info('Reading (%d / %d) %s' % (i, len(archfiles), filename))
    hdus = pyfits.open(f, character_as_bytes=True)
    hdu = hdus[1]

    try:
        dat = converters.convert(hdu.data, filetype['content'])

    except converters.NoValidDataError:
        # When creating files allow NoValidDataError
        hdus.close()
        logger.warning('WARNING: no valid data in data file {}'.format(filename))
        return None, None

    except converters.DataShapeError as err:
        hdus.close()
        logger.warning('WARNING: skipping file {} with bad data shape: ASCDSVER={} {}'
                       .format(filename, hdu.header['ASCDSVER'], err))
        return None, None

    # Accumulate relevant info about archfile that will be ingested into
    # MSID h5 files.  Commit info before h5 ingest so if there is a failure
    # the needed info will be available to do the repair.
    archfiles_row = dict((x, hdu.header.get(x.upper())) for x in archfiles_hdr_cols)
    archfiles_row['checksum'] = hdu.header.get('checksum') or hdu._checksum
    archfiles_row['rowstart'] = row
    archfiles_row['rowstop'] = row + len(dat)
    archfiles_row['filename'] = filename
    # File time is taken from the first run of digits in the filename
    archfiles_row['filetime'] = int(re.search(r'(\d+)', archfiles_row['filename']).group(1))
    filedate = DateTime(archfiles_row['filetime']).date
    year, doy = (int(x) for x in re.search(r'(\d\d\d\d):(\d\d\d)', filedate).groups())
    archfiles_row['year'] = year
    archfiles_row['doy'] = doy

    hdus.close()

    return dat, archfiles_row
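# A minimal sketch (an assumption, not code from this module) of how
# ``read_archfile`` might be driven from an ingest loop.  The bookkeeping
# around ``row`` is hypothetical; only the call signature comes from the
# function above.
#
#   for i, f in enumerate(archfiles):
#       dat, archfiles_row = read_archfile(i, f, filetype, row, colnames,
#                                          archfiles, db)
#       if dat is None:
#           continue  # duplicate file or no valid data
#       row += len(dat)  # next file starts where this one stopped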
fitsdir = os.path.abspath(os.path.join(outroot, content))
if os.path.exists(os.path.join('data', content)):
    print('Skipping', filetype)
    continue
print(filetype)

# If files are already in the final cxc archive location:
# fitsfiles = sorted(glob.glob('/data/cosmos2/eng_archive/data/acisdeahk/arch/????/???/*.fits.gz'))
fitsfiles = sorted(glob.glob(os.path.join(fitsdir, filetype['fileglob'])))
if not fitsfiles:
    print('No files')
    continue

# Use the most recent file to get the column names and median sample rate
dat = Ska.Table.read_fits_table(fitsfiles[-1])
dat = converters.convert(dat, filetype['content'])
dt = np.median(dat['TIME'][1:] - dat['TIME'][:-1])
print('dt =', dt)

# Pre-size tables to hold about 12 years of data at the median sample rate
n_rows = int(86400 * 365 * 12 / dt)

colnames = set(dat.dtype.names)
colnames_all = set(dat.dtype.names)
for colname in colnames_all:
    if len(dat[colname].shape) > 1:
        # Drop multi-dimensional columns from the set that gets h5 files
        print('Removing column', colname)
        colnames.remove(colname)

for colname in colnames:
    make_h5_col_file(dat, content, colname, n_rows)

headers = dict()
max_size = 1e8
dats_size = 0
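# Illustrative sketch only: ``make_h5_col_file`` is defined elsewhere, so this
# is a guess at the kind of thing it does (pre-sizing an extendable PyTables
# column using ``n_rows`` from above).  The path layout and node name here are
# assumptions, not the actual implementation.
#
#   import tables
#
#   def make_h5_col_file_sketch(dat, content, colname, n_rows):
#       filename = os.path.join('data', content, colname + '.h5')
#       h5 = tables.open_file(filename, mode='w')
#       # Create an extendable array seeded with this column's data; PyTables
#       # uses expectedrows to choose a sensible chunk size up front.
#       h5.create_earray(h5.root, 'data', obj=dat[colname], expectedrows=n_rows)
#       h5.close()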