def update_sync_repo(opt, logger, content):
    """
    :param opt: argparse options
    :param logger: logger instance
    :param content: content type
    :return:
    """
    # File types context dict
    ft = fetch.ft
    ft['content'] = content

    index_file = Path(sync_files['index'].abs)
    index_tbl = update_index_file(index_file, opt, logger)

    if index_tbl is None:
        # Index table was not created, nothing more to do here
        logger.warning(f'No index table for {content}')
        return

    for row in index_tbl:
        ft = fetch.ft
        ft['date_id'] = row['date_id']

        update_sync_data_full(content, logger, row)
        update_sync_data_stat(content, logger, row, '5min')
        update_sync_data_stat(content, logger, row, 'daily')

    remove_outdated_sync_files(opt, logger, index_tbl, index_file)
def main():
    """
    Review kadi dwells for new high background events, update a text file table of
    those events, make reports, and notify via email as needed.
    """
    global logger

    opt = get_opt()
    logger = pyyaks.logger.get_logger(level=opt.log_level)

    EVENT_ARCHIVE = os.path.join(opt.data_root, "bgd_events.dat")
    start = None
    bgd_events = []
    if os.path.exists(EVENT_ARCHIVE):
        bgd_events = Table.read(EVENT_ARCHIVE, format='ascii')
    if len(bgd_events) > 0:
        start = DateTime(bgd_events['dwell_datestart'][-1])
        # Remove any bogus events from the real list
        bgd_events = bgd_events[bgd_events['obsid'] != -1]
        bgd_events['slots'] = bgd_events['slots'].astype(str)
        bgd_events['slots_for_sum'] = bgd_events['slots_for_sum'].astype(str)

    # If the user has asked for a start time earlier than the end of the
    # table, delete any rows after the supplied start time
    if opt.start is not None:
        if start is not None:
            if DateTime(opt.start).secs < start.secs:
                bgd_events = bgd_events[
                    bgd_events['dwell_datestart'] < DateTime(opt.start).date]
        start = DateTime(opt.start)

    if start is None:
        start = DateTime(-7)

    new_events, stop = get_events(start)
    if len(new_events) > 0:
        new_events = Table(new_events)
        for obsid in np.unique(new_events['obsid']):
            if obsid in [0, -1]:
                continue
            url = f"{opt.web_url}/events/obs_{obsid:05d}/index.html"
            logger.warning(f"HI BGD event in obsid {obsid} {url}")
            if len(opt.emails) > 0:
                send_mail(logger, opt,
                          f'ACA HI BGD event in obsid {obsid}',
                          f'HI BGD in obsid {obsid} report at {url}',
                          __file__)

    if len(bgd_events) > 0:
        bgd_events = vstack([bgd_events, new_events])
    else:
        bgd_events = new_events

    # Add a null event at the end
    bgd_events.add_row()
    bgd_events[-1]['obsid'] = -1
    bgd_events[-1]['dwell_datestart'] = DateTime(stop).date

    bgd_events.write(EVENT_ARCHIVE, format='ascii', overwrite=True)

    make_event_reports(bgd_events, opt.web_out)
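# Illustrative sketch (not part of the original module): the "null event" row
# appended above acts as a sentinel that records how far processing has gone.
# On the next run the last 'dwell_datestart' gives the restart time and rows
# with obsid == -1 are dropped before any real processing. A minimal demo of
# that round trip with an astropy Table; the column values are made up.
from astropy.table import Table

events = Table({'obsid': [26000], 'dwell_datestart': ['2024:001:00:00:00.000']})
events.add_row()
events[-1]['obsid'] = -1
events[-1]['dwell_datestart'] = '2024:008:00:00:00.000'   # "processed through" marker

# Next run: restart time comes from the last row, then sentinel rows are removed
restart_date = events['dwell_datestart'][-1]
events = events[events['obsid'] != -1]
print(restart_date, len(events))   # 2024:008:00:00:00.000 1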
def update_sync_repo(opt, logger, content):
    """
    :param opt: argparse options
    :param logger: logger instance
    :param content: content type
    :return:
    """
    # File types context dict
    ft = fetch.ft
    ft['content'] = content

    index_file = Path(sync_files['index'].abs)
    index_tbl = update_index_file(index_file, opt, logger)

    if index_tbl is None:
        # Index table was not created, nothing more to do here
        logger.warning(f'No index table for {content}')
        return

    for row in index_tbl:
        ft = fetch.ft
        ft['date_id'] = row['date_id']

        update_sync_data_full(content, logger, row)
        update_sync_data_stat(content, logger, row, '5min')
        update_sync_data_stat(content, logger, row, 'daily')

    remove_mask = remove_outdated_sync_files(opt, logger, index_tbl)
    if np.any(remove_mask):
        index_tbl = index_tbl[~remove_mask]
        logger.info(f'Writing {len(index_tbl)} row(s) to index file {index_file}')
        index_tbl.write(index_file, format='ascii.ecsv')
def read_archfile(i, f, filetype, row, colnames, archfiles, db):
    """Read filename ``f`` with index ``i`` (position within list of filenames).

    The file has type ``filetype`` and will be added to MSID file at row index
    ``row``.  ``colnames`` is the list of column names for the content type
    (not used here).
    """
    # Check if filename is already in archfiles.  If so then abort further processing.
    filename = os.path.basename(f)
    if db.fetchall('SELECT filename FROM archfiles WHERE filename=?', (filename,)):
        logger.verbose('File %s already in archfiles - unlinking and skipping' % f)
        os.unlink(f)
        return None, None

    # Read FITS archive file and accumulate data into dats list and header into headers dict
    logger.info('Reading (%d / %d) %s' % (i, len(archfiles), filename))
    hdus = pyfits.open(f)
    hdu = hdus[1]

    try:
        dat = converters.convert(hdu.data, filetype['content'])

    except converters.NoValidDataError:
        # When creating files allow NoValidDataError
        hdus.close()
        logger.warning('WARNING: no valid data in data file {}'.format(filename))
        return None, None

    except converters.DataShapeError as err:
        hdus.close()
        logger.warning('WARNING: skipping file {} with bad data shape: ASCDSVER={} {}'
                       .format(filename, hdu.header['ASCDSVER'], err))
        return None, None

    # Accumulate relevant info about archfile that will be ingested into
    # MSID h5 files.  Commit info before h5 ingest so if there is a failure
    # the needed info will be available to do the repair.
    archfiles_row = dict((x, hdu.header.get(x.upper())) for x in archfiles_hdr_cols)
    archfiles_row['checksum'] = hdu.header.get('checksum') or hdu._checksum
    archfiles_row['rowstart'] = row
    archfiles_row['rowstop'] = row + len(dat)
    archfiles_row['filename'] = filename
    archfiles_row['filetime'] = int(re.search(r'(\d+)', archfiles_row['filename']).group(1))
    filedate = DateTime(archfiles_row['filetime']).date
    year, doy = (int(x) for x in re.search(r'(\d\d\d\d):(\d\d\d)', filedate).groups())
    archfiles_row['year'] = year
    archfiles_row['doy'] = doy
    hdus.close()

    return dat, archfiles_row
def read_archfile(i, f, filetype, row, colnames, archfiles, db):
    """Read filename ``f`` with index ``i`` (position within list of filenames).

    The file has type ``filetype`` and will be added to MSID file at row index
    ``row``.  ``colnames`` is the list of column names for the content type
    (not used here).
    """
    # Check if filename is already in archfiles.  If so then abort further processing.
    filename = os.path.basename(f)
    if db.fetchall('SELECT filename FROM archfiles WHERE filename=?', (filename,)):
        logger.verbose('File %s already in archfiles - unlinking and skipping' % f)
        os.unlink(f)
        return None, None

    # Read FITS archive file and accumulate data into dats list and header into headers dict
    logger.info('Reading (%d / %d) %s' % (i, len(archfiles), filename))
    hdus = pyfits.open(f, character_as_bytes=True)
    hdu = hdus[1]

    try:
        dat = converters.convert(hdu.data, filetype['content'])

    except converters.NoValidDataError:
        # When creating files allow NoValidDataError
        hdus.close()
        logger.warning('WARNING: no valid data in data file {}'.format(filename))
        return None, None

    except converters.DataShapeError as err:
        hdus.close()
        logger.warning('WARNING: skipping file {} with bad data shape: ASCDSVER={} {}'
                       .format(filename, hdu.header['ASCDSVER'], err))
        return None, None

    # Accumulate relevant info about archfile that will be ingested into
    # MSID h5 files.  Commit info before h5 ingest so if there is a failure
    # the needed info will be available to do the repair.
    archfiles_row = dict((x, hdu.header.get(x.upper())) for x in archfiles_hdr_cols)
    archfiles_row['checksum'] = hdu.header.get('checksum') or hdu._checksum
    archfiles_row['rowstart'] = row
    archfiles_row['rowstop'] = row + len(dat)
    archfiles_row['filename'] = filename
    archfiles_row['filetime'] = int(re.search(r'(\d+)', archfiles_row['filename']).group(1))
    filedate = DateTime(archfiles_row['filetime']).date
    year, doy = (int(x) for x in re.search(r'(\d\d\d\d):(\d\d\d)', filedate).groups())
    archfiles_row['year'] = year
    archfiles_row['doy'] = doy
    hdus.close()

    return dat, archfiles_row
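# Illustrative sketch (not part of the original module): how the archfile
# metadata fields are derived from the file name in read_archfile. The first
# run of digits in a CXC archive file name is the file start time in seconds
# since 1998.0 ("filetime"); year and doy are then parsed back out of the
# 'YYYY:DOY:...' date string. The file name below is hypothetical, and the
# date literal merely stands in for the DateTime(filetime).date conversion
# (chosen for illustration, not an exact conversion).
import re

filename = 'acisf702329293N001_eng0.fits.gz'
filetime = int(re.search(r'(\d+)', filename).group(1))

filedate = '2020:100:12:34:56.789'
year, doy = (int(x) for x in re.search(r'(\d\d\d\d):(\d\d\d)', filedate).groups())
print(filetime, year, doy)   # 702329293 2020 100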
def update_msid_files(filetype, archfiles):
    colnames = pickle.load(open(msid_files['colnames'].abs, 'rb'))
    colnames_all = pickle.load(open(msid_files['colnames_all'].abs, 'rb'))
    old_colnames = colnames.copy()
    old_colnames_all = colnames_all.copy()

    # Setup db handle with autocommit=False so that error along the way aborts insert transactions
    db = Ska.DBI.DBI(dbi='sqlite', server=msid_files['archfiles'].abs, autocommit=False)

    # Get the last row number from the archfiles table
    out = db.fetchone('SELECT max(rowstop) FROM archfiles')
    row = out['max(rowstop)'] or 0
    last_archfile = db.fetchone('SELECT * FROM archfiles where rowstop=?', (row,))

    archfiles_overlaps = []
    dats = []
    archfiles_processed = []

    content_is_derived = (filetype['instrum'] == 'DERIVED')

    for i, f in enumerate(archfiles):
        get_data = (read_derived if content_is_derived else read_archfile)
        dat, archfiles_row = get_data(i, f, filetype, row, colnames, archfiles, db)
        if dat is None:
            continue

        # If creating new content type and there are no existing colnames, then
        # define the column names now.  Filter out any multidimensional
        # columns, including (typically) QUALITY.
        if opt.create and not colnames:
            colnames = set(dat.dtype.names)
            for colname in dat.dtype.names:
                if len(dat[colname].shape) > 1:
                    logger.info('Removing column {} from colnames because shape = {}'
                                .format(colname, dat[colname].shape))
                    colnames.remove(colname)

        # Ensure that the time gap between the end of the last ingested archive
        # file and the start of this one is less than opt.max_gap (or
        # filetype-based defaults).  If this fails then break out of the
        # archfiles processing but continue on to ingest any previously
        # successful archfiles
        if last_archfile is None:
            time_gap = 0
        else:
            time_gap = archfiles_row['tstart'] - last_archfile['tstop']
        max_gap = opt.max_gap
        if max_gap is None:
            if filetype['instrum'] in ['EPHEM', 'DERIVED']:
                max_gap = 601
            elif filetype['content'] == 'ACISDEAHK':
                max_gap = 10000
                # From P.Plucinsky 2011-09-23
                # If ACIS is executing an Event Histogram run while in FMT1,
                # the telemetry stream will saturate.  The amount of time for
                # an opening in the telemetry to appear such that DEA HKP
                # packets can get out is a bit indeterminate.  The histograms
                # integrate for 5400s and then they are telemetered.  I would
                # suggest 6000s, but perhaps you would want to double that to
                # 12000s.
            elif filetype['content'] in ['CPE1ENG', 'CCDM15ENG']:
                # 100 years => no max gap for safe mode telemetry or dwell mode telemetry
                max_gap = 100 * 3.1e7
            else:
                max_gap = 32.9
        if time_gap > max_gap:
            logger.warning('WARNING: found gap of %.2f secs between archfiles %s and %s',
                           time_gap, last_archfile['filename'], archfiles_row['filename'])
            if opt.create:
                logger.warning(' Allowing gap because of opt.create=True')
            elif DateTime() - DateTime(archfiles_row['tstart']) > opt.allow_gap_after_days:
                # After 4 days (by default) just let it go through because this is
                # likely a real gap and will not be fixed by subsequent processing.
                # This can happen after normal sun mode to SIM products.
                logger.warning(' Allowing gap because arch file '
                               'start is more than {} days old'
                               .format(opt.allow_gap_after_days))
            else:
                break
        elif time_gap < 0:
            # Overlapping archfiles - deal with this in append_h5_col
            archfiles_overlaps.append((last_archfile, archfiles_row))

        # Update the last_archfile values.
        last_archfile = archfiles_row

        # A very small number of archive files (a few) have a problem where the
        # quality column tform is specified as 3B instead of 17X (for example).
        # This breaks things, so in this case just skip the file.  However
        # since last_archfile is set above the gap check considers this file to
        # have been ingested.
        if not content_is_derived and dat['QUALITY'].shape[1] != len(dat.dtype.names):
            logger.warning('WARNING: skipping because of quality size mismatch: %d %d' %
                           (dat['QUALITY'].shape[1], len(dat.dtype.names)))
            continue

        # Mark the archfile as ingested in the database and add to list for
        # subsequent relocation into arch_files archive.  In the case of a gap
        # where ingest is stopped before all archfiles are processed, this will
        # leave files either in a tmp dir (HEAD) or in the stage dir (OCC).
        # In the latter case this allows for successful processing later when the
        # gap gets filled.
        archfiles_processed.append(f)
        if not opt.dry_run:
            db.insert(archfiles_row, 'archfiles')

        # Capture the data for subsequent storage in the hdf5 files
        dats.append(dat)

        # Update the running list of column names.  Colnames_all is the maximal (union)
        # set giving all column names seen in any file for this content type.  Colnames
        # was historically the minimal (intersection) set giving the list of column names
        # seen in every file, but as of 0.39 it is allowed to grow as well to accommodate
        # adding MSIDs in the TDB.  Include only 1-d columns, not things like AEPERR
        # in PCAD8ENG which is a 40-element binary vector.
        colnames_all.update(dat.dtype.names)
        colnames.update(name for name in dat.dtype.names if dat[name].ndim == 1)

        row += len(dat)

    if dats:
        logger.verbose('Writing accumulated column data to h5 file at ' + time.ctime())
        data_lens = set()
        processed_cols = set()
        for colname in colnames:
            ft['msid'] = colname
            if not os.path.exists(msid_files['msid'].abs):
                make_h5_col_file(dats, colname)
                if not opt.create:
                    # New MSID was found for this content type.  This must be associated with
                    # an update to the TDB.  Skip for the moment to ensure that other MSIDs
                    # are fully processed.
                    continue
            data_len = append_h5_col(dats, colname, archfiles_overlaps)
            data_lens.add(data_len)
            processed_cols.add(colname)

        if len(data_lens) != 1:
            raise ValueError('h5 data length inconsistency {}, investigate NOW!'
                             .format(data_lens))

        # Process any new MSIDs (this is extremely rare)
        data_len = data_lens.pop()
        for colname in colnames - processed_cols:
            ft['msid'] = colname
            append_filled_h5_col(dats, colname, data_len)

    # Assuming everything worked now commit the db inserts that signify the
    # new archive files have been processed
    if not opt.dry_run:
        db.commit()

    # If colnames or colnames_all changed then give warning and update files.
    if colnames != old_colnames:
        logger.warning('WARNING: updating %s because colnames changed: %s'
                       % (msid_files['colnames'].abs, old_colnames ^ colnames))
        if not opt.dry_run:
            pickle.dump(colnames, open(msid_files['colnames'].abs, 'wb'), protocol=0)
    if colnames_all != old_colnames_all:
        logger.warning('WARNING: updating %s because colnames_all changed: %s'
                       % (msid_files['colnames_all'].abs, colnames_all ^ old_colnames_all))
        if not opt.dry_run:
            pickle.dump(colnames_all, open(msid_files['colnames_all'].abs, 'wb'), protocol=0)

    return archfiles_processed
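# Illustrative sketch (not part of the original module): the gap-handling
# defaults above, pulled out as a standalone helper. This mirrors the inline
# branches in update_msid_files (instrument/content names and values are taken
# from that code) and returns the default maximum allowed gap in seconds.
def default_max_gap(instrum, content):
    if instrum in ['EPHEM', 'DERIVED']:
        return 601
    if content == 'ACISDEAHK':
        return 10000          # slow DEA HKP telemetry during event histogram runs
    if content in ['CPE1ENG', 'CCDM15ENG']:
        return 100 * 3.1e7    # effectively no limit for safe/dwell mode telemetry
    return 32.9


print(default_max_gap('EPHEM', 'ORBITEPHEM0'))   # 601
print(default_max_gap('PCAD', 'PCAD3ENG'))       # 32.9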
def calc_stats_vals(msid, rows, indexes, interval):
    """
    Compute statistics values for ``msid`` over specified intervals.

    :param msid: Msid object (filter_bad=True)
    :param rows: Msid row indices corresponding to stat boundaries
    :param indexes: Universal index values for stat (row times // dt)
    :param interval: interval name (5min or daily)
    """
    quantiles = (1, 5, 16, 50, 84, 95, 99)
    n_out = len(rows) - 1

    # Check if data type is "numeric".  Boolean values count as numeric,
    # partly for historical reasons, in that they support funcs like
    # mean (with implicit conversion to float).
    msid_dtype = msid.vals.dtype
    msid_is_numeric = issubclass(msid_dtype.type, (np.number, np.bool_))

    # If MSID data is unicode, then for stats purposes cast back to bytes
    # by creating the output array as a like-sized S-type array.
    if msid_dtype.kind == 'U':
        msid_dtype = re.sub(r'U', 'S', msid.vals.dtype.str)

    # Predeclare numpy arrays of correct type and sufficient size for accumulating results.
    out = OrderedDict()
    out['index'] = np.ndarray((n_out,), dtype=np.int32)
    out['n'] = np.ndarray((n_out,), dtype=np.int32)
    out['val'] = np.ndarray((n_out,), dtype=msid_dtype)

    if msid_is_numeric:
        out['min'] = np.ndarray((n_out,), dtype=msid_dtype)
        out['max'] = np.ndarray((n_out,), dtype=msid_dtype)
        out['mean'] = np.ndarray((n_out,), dtype=np.float32)

        if interval == 'daily':
            out['std'] = np.ndarray((n_out,), dtype=msid_dtype)
            for quantile in quantiles:
                out['p{:02d}'.format(quantile)] = np.ndarray((n_out,), dtype=msid_dtype)

    # MSID may have state codes
    if msid.state_codes:
        for raw_count, state_code in msid.state_codes:
            out['n_' + fix_state_code(state_code)] = np.zeros(n_out, dtype=np.int32)

    i = 0
    for row0, row1, index in zip(rows[:-1], rows[1:], indexes[:-1]):
        vals = msid.vals[row0:row1]
        times = msid.times[row0:row1]

        n_vals = len(vals)
        if n_vals > 0:
            out['index'][i] = index
            out['n'][i] = n_vals
            out['val'][i] = vals[n_vals // 2]
            if msid_is_numeric:
                if n_vals <= 2:
                    dts = np.ones(n_vals, dtype=np.float64)
                else:
                    dts = np.empty(n_vals, dtype=np.float64)
                    dts[0] = times[1] - times[0]
                    dts[-1] = times[-1] - times[-2]
                    dts[1:-1] = ((times[1:-1] - times[:-2])
                                 + (times[2:] - times[1:-1])) / 2.0
                    negs = dts < 0.0
                    if np.any(negs):
                        times_dts = [(DateTime(t).date, dt)
                                     for t, dt in zip(times[negs], dts[negs])]
                        logger.warning('WARNING - negative dts in {} at {}'
                                       .format(msid.MSID, times_dts))

                    # Clip to range 0.001 to 300.0.  The low bound is just there
                    # for data with identical time stamps.  This shouldn't happen
                    # but in practice might.  The 300.0 represents 5 minutes and
                    # is the largest normal time interval.  Data near large gaps
                    # will get a weight of 5 mins.
                    dts.clip(0.001, 300.0, out=dts)
                sum_dts = np.sum(dts)

                out['min'][i] = np.min(vals)
                out['max'][i] = np.max(vals)
                out['mean'][i] = np.sum(dts * vals) / sum_dts
                if interval == 'daily':
                    # biased weighted estimator of variance (N should be big enough)
                    # http://en.wikipedia.org/wiki/Mean_square_weighted_deviation
                    sigma_sq = np.sum(dts * (vals - out['mean'][i]) ** 2) / sum_dts
                    out['std'][i] = np.sqrt(sigma_sq)
                    quant_vals = scipy.stats.mstats.mquantiles(vals, np.array(quantiles) / 100.0)
                    for quant_val, quantile in zip(quant_vals, quantiles):
                        out['p%02d' % quantile][i] = quant_val

            if msid.state_codes:
                # If MSID has state codes then count the number of values in each state
                # and store.  The MSID values can have trailing spaces to fill out to a
                # uniform length, so state_code is right padded accordingly.
                max_len = max(len(state_code) for raw_count, state_code in msid.state_codes)
                fmtstr = '{:' + str(max_len) + 's}'
                for raw_count, state_code in msid.state_codes:
                    state_count = np.count_nonzero(vals == fmtstr.format(state_code))
                    out['n_' + fix_state_code(state_code)][i] = state_count

            i += 1

    return np.rec.fromarrays([x[:i] for x in out.values()], names=list(out.keys()))
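# Illustrative sketch (not part of the original module): the time-weighted mean
# and biased weighted std used in calc_stats_vals, applied to a few made-up
# samples. Each sample is weighted by the half-sum of its neighboring time
# steps, clipped to [0.001, 300.0] seconds so samples next to large gaps get at
# most a 5-minute weight.
import numpy as np

times = np.array([0.0, 32.8, 65.6, 98.4, 500.0])
vals = np.array([1.0, 2.0, 2.0, 3.0, 10.0])

dts = np.empty(len(vals))
dts[0] = times[1] - times[0]
dts[-1] = times[-1] - times[-2]
dts[1:-1] = ((times[1:-1] - times[:-2]) + (times[2:] - times[1:-1])) / 2.0
dts.clip(0.001, 300.0, out=dts)    # last sample sits after a big gap -> weight 300

mean = np.sum(dts * vals) / np.sum(dts)
std = np.sqrt(np.sum(dts * (vals - mean) ** 2) / np.sum(dts))
print(mean, std)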
def calc_stats_vals(msid, rows, indexes, interval):
    quantiles = (1, 5, 16, 50, 84, 95, 99)
    cols_stats = ('index', 'n', 'val')
    n_out = len(rows) - 1
    msid_dtype = msid.vals.dtype
    msid_is_numeric = not msid_dtype.name.startswith('string')

    # Predeclare numpy arrays of correct type and sufficient size for accumulating results.
    out = dict(index=np.ndarray((n_out,), dtype=np.int32),
               n=np.ndarray((n_out,), dtype=np.int32),
               val=np.ndarray((n_out,), dtype=msid_dtype),
               )
    if msid_is_numeric:
        cols_stats += ('min', 'max', 'mean')
        out.update(dict(min=np.ndarray((n_out,), dtype=msid_dtype),
                        max=np.ndarray((n_out,), dtype=msid_dtype),
                        mean=np.ndarray((n_out,), dtype=np.float32),
                        ))
        if interval == 'daily':
            cols_stats += ('std',) + tuple('p%02d' % x for x in quantiles)
            out['std'] = np.ndarray((n_out,), dtype=msid_dtype)
            out.update(('p%02d' % x, np.ndarray((n_out,), dtype=msid_dtype)) for x in quantiles)

    i = 0
    for row0, row1, index in itertools.izip(rows[:-1], rows[1:], indexes[:-1]):
        vals = msid.vals[row0:row1]
        times = msid.times[row0:row1]

        n_vals = len(vals)
        if n_vals > 0:
            out['index'][i] = index
            out['n'][i] = n_vals
            out['val'][i] = vals[n_vals // 2]
            if msid_is_numeric:
                if n_vals <= 2:
                    dts = np.ones(n_vals, dtype=np.float64)
                else:
                    dts = np.empty(n_vals, dtype=np.float64)
                    dts[0] = times[1] - times[0]
                    dts[-1] = times[-1] - times[-2]
                    dts[1:-1] = ((times[1:-1] - times[:-2])
                                 + (times[2:] - times[1:-1])) / 2.0
                    negs = dts < 0.0
                    if np.any(negs):
                        times_dts = [(DateTime(t).date, dt)
                                     for t, dt in zip(times[negs], dts[negs])]
                        logger.warning('WARNING - negative dts in {} at {}'
                                       .format(msid.MSID, times_dts))

                    # Clip to range 0.001 to 300.0.  The low bound is just there
                    # for data with identical time stamps.  This shouldn't happen
                    # but in practice might.  The 300.0 represents 5 minutes and
                    # is the largest normal time interval.  Data near large gaps
                    # will get a weight of 5 mins.
                    dts.clip(0.001, 300.0, out=dts)
                sum_dts = np.sum(dts)

                out['min'][i] = np.min(vals)
                out['max'][i] = np.max(vals)
                out['mean'][i] = np.sum(dts * vals) / sum_dts
                if interval == 'daily':
                    # biased weighted estimator of variance (N should be big enough)
                    # http://en.wikipedia.org/wiki/Mean_square_weighted_deviation
                    sigma_sq = np.sum(dts * (vals - out['mean'][i]) ** 2) / sum_dts
                    out['std'][i] = np.sqrt(sigma_sq)
                    quant_vals = scipy.stats.mstats.mquantiles(vals, np.array(quantiles) / 100.0)
                    for quant_val, quantile in zip(quant_vals, quantiles):
                        out['p%02d' % quantile][i] = quant_val

            i += 1

    return np.rec.fromarrays([out[x][:i] for x in cols_stats], names=cols_stats)
def update_observed_metrics(obsid=None, start=None, stop=None, data_root=None,
                            force=False, factor=20, make_plots=False, save=False):
    """
    Update the ``GUIDE_METRICS_OBSID`` and ``GUIDE_METRICS_SLOT`` tables (ascii
    ECSV format) in place to reflect information about observed guide metrics:
    dr95, dr50, manvr angle, ending roll error, one shot updates, aberration
    corrections; and mean yag/zag, mean dyag/dzag centroid errors.

    :param factor: scaling for the centroid residual visualization in the
        yag/zag plane (default 20)
    :param make_plots: if True then generate plots for centroid dashboard
        (default False)
    :param save: if True then save the plots (default False)
    """
    if obsid is None:
        # Default is between NOW and (NOW - NDAYS) days
        start = DateTime(start) - (NDAYS if start is None else 0)
        stop = DateTime(stop)
        # Get obsids, both science and ERs
        obsids = [evt.obsid for evt in events.obsids.filter(start, stop)]
    else:
        obsids = [np.int(obsid)]

    if data_root is None:
        data_root = CD_ROOT

    obsid_metrics_file = os.path.join(data_root, GUIDE_METRICS_OBSID)
    slot_metrics_file = os.path.join(data_root, GUIDE_METRICS_SLOT)

    # Read in existing files if they exist and make a set of already-processed obsids
    dat_obsid_old, processed_obsids = read_metrics_from_file(obsid_metrics_file)
    dat_slot_old, tmp = read_metrics_from_file(slot_metrics_file)

    rows_obsid = []
    rows_slots = []
    for obsid in obsids:
        logger.info(f'Obsid={obsid}')

        obs_dir = get_cd_dir(obsid, data_root)

        if obsid in processed_obsids:
            if not force:
                logger.info(f'Skipping obsid {obsid}: already processed')
                continue
            else:
                if obsid in dat_obsid_old['obsid']:
                    idx = list(dat_obsid_old['obsid']).index(obsid)
                    dat_obsid_old.remove_row(idx)
                if obsid in dat_slot_old['obsid']:
                    ok = dat_slot_old['obsid'] == obsid
                    dat_slot_old.remove_rows(ok)

        try:
            metrics_obsid, metrics_slot = get_observed_metrics(
                obsid, metrics_file=obsid_metrics_file)

            if not metrics_obsid['dwell']:
                logger.info(f'Skipping obsid {obsid}: not a dwell?')
                info = 'Not a dwell'
                make_special_case_html(metrics_obsid, obs_dir, info=info)
                continue

            if metrics_obsid['att_flag'] > 1:
                logger.info(f'Skipping obsid {obsid}: problem matching obc/ground times')
                info = 'Problem matching obc/ground att times'
                make_special_case_html(metrics_obsid, obs_dir, info=info)
                continue

            if obsid < 40000 and metrics_obsid['att_flag'] == 1:
                logger.info(f'Skipping science obsid {obsid}: no ground aspect solution')
                info = 'No ground aspect solution for science obsid'
                make_special_case_html(metrics_obsid, obs_dir, info=info)
                continue

            if make_plots:
                kwargs = {'factor': factor, 'save': save}
                plot_observed_metrics(obsid,
                                      plot_dir=obs_dir,
                                      coord='dr',
                                      att_errors=metrics_obsid['att_errors'],
                                      **kwargs)
        except (NoObsidError, NoDwellError, NoManvrError) as err:
            logger.info(f'Skipping obsid {obsid} missing data: {err}')
            continue
        except Exception as err:
            logger.warning(f'Skipping obsid {obsid} ERROR: {err}')
            continue

        # Process entries for 'per obsid' metrics
        keys_obsid = ('obsid', 'mean_date', 'att_flag', 'dr50', 'dr95',
                      'aber_y', 'aber_z', 'aber_flag',
                      'one_shot_pitch', 'one_shot_yaw',
                      'one_shot', 'one_shot_aber_corrected',
                      'manvr_angle', 'preceding_roll_err', 'ending_roll_err',
                      'obsid_preceding', 'obsid_next')

        row_obsid = {k: metrics_obsid[k] for k in keys_obsid}
        rows_obsid.append(row_obsid)

        # Process entries for 'per slot' metrics
        row_slots = []
        for slot in range(8):
            out = {}
            slot_data = metrics_slot['slots'][slot]
            if bool(slot_data):
                out['obsid'] = obsid
                out['slot'] = slot
                out['mean_date'] = metrics_obsid['mean_date']
                keys_slot = ('id', 'type', 'mag', 'yang', 'zang',
                             'median_mag', 'median_dy', 'median_dz')
                out.update({k: slot_data[k] for k in keys_slot})
                # Needed to build html
                row_slots.append(out)
                # Needed to update 'per slot' data file
                rows_slots.append(out)

        # Build html page for this obsid
        make_html(row_obsid, row_slots, obs_dir)

    # Update the 'per_obsid' table
    if rows_obsid:
        sort_cols = ['mean_date']
        update_data_table(rows_obsid, dat_obsid_old, obsid_metrics_file, sort_cols)

    # Update the 'per_slot' table
    if rows_slots:
        sort_cols = ['mean_date', 'slot']
        update_data_table(rows_slots, dat_slot_old, slot_metrics_file, sort_cols)
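# Illustrative sketch (not part of the original module): how the force=True
# path in update_observed_metrics clears stale rows before reprocessing an
# obsid. Per-obsid data has at most one row per obsid (removed by index), while
# per-slot data has several rows per obsid (removed by boolean mask). The
# column values below are made up.
from astropy.table import Table

dat_slot_old = Table({'obsid': [26000, 26000, 26001], 'slot': [3, 4, 3]})
obsid = 26000
if obsid in dat_slot_old['obsid']:
    ok = dat_slot_old['obsid'] == obsid
    dat_slot_old.remove_rows(ok)
print(len(dat_slot_old))   # 1 (only the obsid 26001 row remains)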
def calc_stats_vals(msid, rows, indexes, interval):
    quantiles = (1, 5, 16, 50, 84, 95, 99)
    cols_stats = ('index', 'n', 'val')
    n_out = len(rows) - 1
    msid_dtype = msid.vals.dtype
    msid_is_numeric = not msid_dtype.name.startswith('string')

    # Predeclare numpy arrays of correct type and sufficient size for accumulating results.
    out = dict(index=np.ndarray((n_out,), dtype=np.int32),
               n=np.ndarray((n_out,), dtype=np.int32),
               val=np.ndarray((n_out,), dtype=msid_dtype),
               )
    if msid_is_numeric:
        cols_stats += ('min', 'max', 'mean')
        out.update(dict(min=np.ndarray((n_out,), dtype=msid_dtype),
                        max=np.ndarray((n_out,), dtype=msid_dtype),
                        mean=np.ndarray((n_out,), dtype=np.float32),
                        ))
        if interval == 'daily':
            cols_stats += ('std',) + tuple('p%02d' % x for x in quantiles)
            out['std'] = np.ndarray((n_out,), dtype=msid_dtype)
            out.update(('p%02d' % x, np.ndarray((n_out,), dtype=msid_dtype)) for x in quantiles)

    i = 0
    for row0, row1, index in itertools.izip(rows[:-1], rows[1:], indexes[:-1]):
        vals = msid.vals[row0:row1]
        times = msid.times[row0:row1]

        n_vals = len(vals)
        if n_vals > 0:
            out['index'][i] = index
            out['n'][i] = n_vals
            out['val'][i] = vals[n_vals // 2]
            if msid_is_numeric:
                if n_vals <= 2:
                    dts = np.ones(n_vals, dtype=np.float64)
                else:
                    dts = np.empty(n_vals, dtype=np.float64)
                    dts[0] = times[1] - times[0]
                    dts[-1] = times[-1] - times[-2]
                    dts[1:-1] = ((times[1:-1] - times[:-2])
                                 + (times[2:] - times[1:-1])) / 2.0
                    negs = dts < 0.0
                    if np.any(negs):
                        times_dts = [(DateTime(t).date, dt)
                                     for t, dt in zip(times[negs], dts[negs])]
                        logger.warning('WARNING - negative dts in {} at {}'
                                       .format(msid.MSID, times_dts))

                    # Clip to range 0.001 to 300.0.  The low bound is just there
                    # for data with identical time stamps.  This shouldn't happen
                    # but in practice might.  The 300.0 represents 5 minutes and
                    # is the largest normal time interval.  Data near large gaps
                    # will get a weight of 5 mins.
                    dts.clip(0.001, 300.0, out=dts)
                sum_dts = np.sum(dts)

                out['min'][i] = np.min(vals)
                out['max'][i] = np.max(vals)
                out['mean'][i] = np.sum(dts * vals) / sum_dts
                if interval == 'daily':
                    # biased weighted estimator of variance (N should be big enough)
                    # http://en.wikipedia.org/wiki/Mean_square_weighted_deviation
                    sigma_sq = np.sum(dts * (vals - out['mean'][i]) ** 2) / sum_dts
                    out['std'][i] = np.sqrt(sigma_sq)
                    quant_vals = scipy.stats.mstats.mquantiles(vals, np.array(quantiles) / 100.0)
                    for quant_val, quantile in zip(quant_vals, quantiles):
                        out['p%02d' % quantile][i] = quant_val

            i += 1

    return np.rec.fromarrays([out[x][:i] for x in cols_stats], names=cols_stats)
def calc_stats_vals(msid, rows, indexes, interval):
    """
    Compute statistics values for ``msid`` over specified intervals.

    :param msid: Msid object (filter_bad=True)
    :param rows: Msid row indices corresponding to stat boundaries
    :param indexes: Universal index values for stat (row times // dt)
    :param interval: interval name (5min or daily)
    """
    quantiles = (1, 5, 16, 50, 84, 95, 99)
    n_out = len(rows) - 1

    # Check if data type is "numeric".  Boolean values count as numeric,
    # partly for historical reasons, in that they support funcs like
    # mean (with implicit conversion to float).
    msid_dtype = msid.vals.dtype
    msid_is_numeric = issubclass(msid_dtype.type, (np.number, np.bool_))

    # Predeclare numpy arrays of correct type and sufficient size for accumulating results.
    out = OrderedDict()
    out['index'] = np.ndarray((n_out,), dtype=np.int32)
    out['n'] = np.ndarray((n_out,), dtype=np.int32)
    out['val'] = np.ndarray((n_out,), dtype=msid_dtype)

    if msid_is_numeric:
        out['min'] = np.ndarray((n_out,), dtype=msid_dtype)
        out['max'] = np.ndarray((n_out,), dtype=msid_dtype)
        out['mean'] = np.ndarray((n_out,), dtype=np.float32)

        if interval == 'daily':
            out['std'] = np.ndarray((n_out,), dtype=msid_dtype)
            for quantile in quantiles:
                out['p{:02d}'.format(quantile)] = np.ndarray((n_out,), dtype=msid_dtype)

    # MSID may have state codes
    # if msid.state_codes:
    #     for raw_count, state_code in msid.state_codes:
    #         out['n_' + fix_state_code(state_code)] = np.zeros(n_out, dtype=np.int32)

    i = 0
    for row0, row1, index in zip(rows[:-1], rows[1:], indexes[:-1]):
        vals = msid.vals[row0:row1]
        times = msid.times[row0:row1]

        n_vals = len(vals)
        if n_vals > 0:
            out['index'][i] = index
            out['n'][i] = n_vals
            out['val'][i] = vals[n_vals // 2]
            if msid_is_numeric:
                if n_vals <= 2:
                    dts = np.ones(n_vals, dtype=np.float64)
                else:
                    dts = np.empty(n_vals, dtype=np.float64)
                    dts[0] = times[1] - times[0]
                    dts[-1] = times[-1] - times[-2]
                    dts[1:-1] = ((times[1:-1] - times[:-2])
                                 + (times[2:] - times[1:-1])) / 2.0
                    negs = dts < 0.0
                    if np.any(negs):
                        times_dts = [(Time(t, format="unix").yday, dt)
                                     for t, dt in zip(times[negs], dts[negs])]
                        logger.warning('WARNING - negative dts in {} at {}'
                                       .format(msid.MSID, times_dts))

                    # Clip to range 0.001 to 300.0.  The low bound is just there
                    # for data with identical time stamps.  This shouldn't happen
                    # but in practice might.  The 300.0 represents 5 minutes and
                    # is the largest normal time interval.  Data near large gaps
                    # will get a weight of 5 mins.
                    dts.clip(0.001, 300.0, out=dts)
                sum_dts = np.sum(dts)

                out['min'][i] = np.min(vals)
                out['max'][i] = np.max(vals)
                out['mean'][i] = np.sum(dts * vals) / sum_dts
                if interval == 'daily':
                    # biased weighted estimator of variance (N should be big enough)
                    # http://en.wikipedia.org/wiki/Mean_square_weighted_deviation
                    sigma_sq = np.sum(dts * (vals - out['mean'][i]) ** 2) / sum_dts
                    out['std'][i] = np.sqrt(sigma_sq)
                    quant_vals = scipy.stats.mstats.mquantiles(vals, np.array(quantiles) / 100.0)
                    for quant_val, quantile in zip(quant_vals, quantiles):
                        out['p%02d' % quantile][i] = quant_val

            # if msid.state_codes:
            #     # If MSID has state codes then count the number of values in each state
            #     # and store.  The MSID values can have trailing spaces to fill out to a
            #     # uniform length, so state_code is right padded accordingly.
            #     max_len = max(len(state_code) for raw_count, state_code in msid.state_codes)
            #     fmtstr = '{:' + str(max_len) + 's}'
            #     for raw_count, state_code in msid.state_codes:
            #         state_count = np.count_nonzero(vals == fmtstr.format(state_code))
            #         out['n_' + fix_state_code(state_code)][i] = state_count

            i += 1

    return np.rec.fromarrays([x[:i] for x in out.values()], names=list(out.keys()))
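# Illustrative sketch (not part of the original module): what the commented-out
# state-code block does when enabled. MSID values may be right-padded with
# spaces to a common width, so each state code is padded the same way before
# counting matches. The values and state codes below are made up.
import numpy as np

vals = np.array(['ON ', 'OFF', 'ON '])          # 'ON' right-padded to width 3
state_codes = [(0, 'OFF'), (1, 'ON')]

max_len = max(len(state_code) for raw_count, state_code in state_codes)
fmtstr = '{:' + str(max_len) + 's}'
for raw_count, state_code in state_codes:
    state_count = np.count_nonzero(vals == fmtstr.format(state_code))
    print(state_code, state_count)   # OFF 1, ON 2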
def update_msid_files(filetype, archfiles):
    colnames = pickle.load(open(msid_files['colnames'].abs))
    colnames_all = pickle.load(open(msid_files['colnames_all'].abs))
    old_colnames = colnames.copy()
    old_colnames_all = colnames_all.copy()

    # Setup db handle with autocommit=False so that error along the way aborts insert transactions
    db = Ska.DBI.DBI(dbi='sqlite', server=msid_files['archfiles'].abs, autocommit=False)

    # Get the last row number from the archfiles table
    out = db.fetchone('SELECT max(rowstop) FROM archfiles')
    row = out['max(rowstop)'] or 0
    last_archfile = db.fetchone('SELECT * FROM archfiles where rowstop=?', (row,))

    archfiles_overlaps = []
    dats = []
    archfiles_processed = []

    content_is_derived = (filetype['instrum'] == 'DERIVED')

    for i, f in enumerate(archfiles):
        get_data = (read_derived if content_is_derived else read_archfile)
        dat, archfiles_row = get_data(i, f, filetype, row, colnames, archfiles, db)
        if dat is None:
            continue

        # If creating new content type and there are no existing colnames, then
        # define the column names now.  Filter out any multidimensional
        # columns, including (typically) QUALITY.
        if opt.create and not colnames:
            colnames = set(dat.dtype.names)
            for colname in dat.dtype.names:
                if len(dat[colname].shape) > 1:
                    logger.info('Removing column {} from colnames because shape = {}'
                                .format(colname, dat[colname].shape))
                    colnames.remove(colname)

        # Ensure that the time gap between the end of the last ingested archive
        # file and the start of this one is less than opt.max_gap (or
        # filetype-based defaults).  If this fails then break out of the
        # archfiles processing but continue on to ingest any previously
        # successful archfiles
        if last_archfile is None:
            time_gap = 0
        else:
            time_gap = archfiles_row['tstart'] - last_archfile['tstop']
        max_gap = opt.max_gap
        if max_gap is None:
            if filetype['instrum'] in ['EPHEM', 'DERIVED']:
                max_gap = 601
            elif filetype['content'] == 'ACISDEAHK':
                max_gap = 10000
                # From P.Plucinsky 2011-09-23
                # If ACIS is executing an Event Histogram run while in FMT1,
                # the telemetry stream will saturate.  The amount of time for
                # an opening in the telemetry to appear such that DEA HKP
                # packets can get out is a bit indeterminate.  The histograms
                # integrate for 5400s and then they are telemetered.  I would
                # suggest 6000s, but perhaps you would want to double that to
                # 12000s.
            elif filetype['content'] in ['CPE1ENG', 'CCDM15ENG']:
                # 100 years => no max gap for safe mode telemetry or dwell mode telemetry
                max_gap = 100 * 3.1e7
            else:
                max_gap = 32.9
        if time_gap > max_gap:
            logger.warning('WARNING: found gap of %.2f secs between archfiles %s and %s',
                           time_gap, last_archfile['filename'], archfiles_row['filename'])
            if opt.create:
                logger.warning(' Allowing gap because of opt.create=True')
            elif DateTime() - DateTime(archfiles_row['tstart']) > opt.allow_gap_after_days:
                # After 4 days (by default) just let it go through because this is
                # likely a real gap and will not be fixed by subsequent processing.
                # This can happen after normal sun mode to SIM products.
                logger.warning(' Allowing gap because arch file '
                               'start is more than {} days old'
                               .format(opt.allow_gap_after_days))
            else:
                break
        elif time_gap < 0:
            # Overlapping archfiles - deal with this in append_h5_col
            archfiles_overlaps.append((last_archfile, archfiles_row))

        # Update the last_archfile values.
        last_archfile = archfiles_row

        # A very small number of archive files (a few) have a problem where the
        # quality column tform is specified as 3B instead of 17X (for example).
        # This breaks things, so in this case just skip the file.  However
        # since last_archfile is set above the gap check considers this file to
        # have been ingested.
        if not content_is_derived and dat['QUALITY'].shape[1] != len(dat.dtype.names):
            logger.warning('WARNING: skipping because of quality size mismatch: %d %d' %
                           (dat['QUALITY'].shape[1], len(dat.dtype.names)))
            continue

        # Mark the archfile as ingested in the database and add to list for
        # subsequent relocation into arch_files archive.  In the case of a gap
        # where ingest is stopped before all archfiles are processed, this will
        # leave files either in a tmp dir (HEAD) or in the stage dir (OCC).
        # In the latter case this allows for successful processing later when the
        # gap gets filled.
        archfiles_processed.append(f)
        if not opt.dry_run:
            db.insert(archfiles_row, 'archfiles')

        # Capture the data for subsequent storage in the hdf5 files
        dats.append(dat)

        # Update the running list of column names.  Colnames_all is the maximal (union)
        # set giving all column names seen in any file for this content type.  Colnames
        # was historically the minimal (intersection) set giving the list of column names
        # seen in every file, but as of 0.39 it is allowed to grow as well to accommodate
        # adding MSIDs in the TDB.  Include only 1-d columns, not things like AEPERR
        # in PCAD8ENG which is a 40-element binary vector.
        colnames_all.update(dat.dtype.names)
        colnames.update(name for name in dat.dtype.names if dat[name].ndim == 1)

        row += len(dat)

    if dats:
        logger.verbose('Writing accumulated column data to h5 file at ' + time.ctime())
        data_lens = set()
        processed_cols = set()
        for colname in colnames:
            ft['msid'] = colname
            if not os.path.exists(msid_files['msid'].abs):
                make_h5_col_file(dats, colname)
                if not opt.create:
                    # New MSID was found for this content type.  This must be associated with
                    # an update to the TDB.  Skip for the moment to ensure that other MSIDs
                    # are fully processed.
                    continue
            data_len = append_h5_col(dats, colname, archfiles_overlaps)
            data_lens.add(data_len)
            processed_cols.add(colname)

        if len(data_lens) != 1:
            raise ValueError('h5 data length inconsistency {}, investigate NOW!'
                             .format(data_lens))

        # Process any new MSIDs (this is extremely rare)
        data_len = data_lens.pop()
        for colname in colnames - processed_cols:
            ft['msid'] = colname
            append_filled_h5_col(dats, colname, data_len)

    # Assuming everything worked now commit the db inserts that signify the
    # new archive files have been processed
    if not opt.dry_run:
        db.commit()

    # If colnames or colnames_all changed then give warning and update files.
    if colnames != old_colnames:
        logger.warning('WARNING: updating %s because colnames changed: %s'
                       % (msid_files['colnames'].abs, old_colnames ^ colnames))
        if not opt.dry_run:
            pickle.dump(colnames, open(msid_files['colnames'].abs, 'w'))
    if colnames_all != old_colnames_all:
        logger.warning('WARNING: updating %s because colnames_all changed: %s'
                       % (msid_files['colnames_all'].abs, colnames_all ^ old_colnames_all))
        if not opt.dry_run:
            pickle.dump(colnames_all, open(msid_files['colnames_all'].abs, 'w'))

    return archfiles_processed
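# Illustrative sketch (not part of the original module): the colnames
# bookkeeping described in update_msid_files. colnames_all is the union of
# every column ever seen for the content type; colnames also grows (union) but
# only with 1-d columns, and the symmetric difference drives the
# "colnames changed" warning. The column names below are made up.
old_colnames = {'TIME', 'AOPCADMD'}
colnames = set(old_colnames)

new_file_columns = {'TIME', 'AOPCADMD', 'AONEWMSID'}   # hypothetical new MSID from a TDB update
colnames.update(new_file_columns)

if colnames != old_colnames:
    print('colnames changed:', old_colnames ^ colnames)   # {'AONEWMSID'}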