Example #1
def update_sync_repo(opt, logger, content):
    """

    :param opt: argparse options
    :param logger: logger instance
    :param content: content type
    :return:
    """
    # File types context dict
    ft = fetch.ft
    ft['content'] = content

    index_file = Path(sync_files['index'].abs)
    index_tbl = update_index_file(index_file, opt, logger)

    if index_tbl is None:
        # Index table was not created, nothing more to do here
        logger.warning(f'No index table for {content}')
        return

    for row in index_tbl:
        ft = fetch.ft
        ft['date_id'] = row['date_id']

        update_sync_data_full(content, logger, row)
        update_sync_data_stat(content, logger, row, '5min')
        update_sync_data_stat(content, logger, row, 'daily')

    remove_outdated_sync_files(opt, logger, index_tbl, index_file)
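
The function above sets keys on a shared context dict (fetch.ft) and then pulls rendered paths such as sync_files['index'].abs. A minimal plain-Python sketch of that pattern, with a hypothetical helper and path layout standing in for the real machinery:

# Illustration only: a plain dict plus a hypothetical path template stand in for the
# shared context dict (fetch.ft) and rendered paths (sync_files['index'].abs) above.
ft = {}

def sync_index_path(ft, root='/data/sync'):
    # Hypothetical path layout, keyed off the current context values
    parts = [root, ft['content']]
    if 'date_id' in ft:
        parts.append(ft['date_id'])
    return '/'.join(parts + ['index.ecsv'])

ft['content'] = 'acis2eng'
print(sync_index_path(ft))        # /data/sync/acis2eng/index.ecsv
ft['date_id'] = '2021-07-01T0000'
print(sync_index_path(ft))        # /data/sync/acis2eng/2021-07-01T0000/index.ecsv
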
Example #2
def main():
    """
    Review kadi dwells for new high background events, update a text file table of
    those events, make reports, and notify via email as needed.
    """
    global logger

    opt = get_opt()
    logger = pyyaks.logger.get_logger(level=opt.log_level)

    EVENT_ARCHIVE = os.path.join(opt.data_root, "bgd_events.dat")
    start = None

    bgd_events = []
    if os.path.exists(EVENT_ARCHIVE):
        bgd_events = Table.read(EVENT_ARCHIVE, format='ascii')
    if len(bgd_events) > 0:
        start = DateTime(bgd_events['dwell_datestart'][-1])
        # Remove any bogus events from the real list
        bgd_events = bgd_events[bgd_events['obsid'] != -1]
        bgd_events['slots'] = bgd_events['slots'].astype(str)
        bgd_events['slots_for_sum'] = bgd_events['slots_for_sum'].astype(str)

    # If the user has asked for a start time earlier than the end of the
    # table, delete any rows after the supplied start time
    if opt.start is not None:
        if start is not None:
            if DateTime(opt.start).secs < start.secs:
                bgd_events = bgd_events[
                    bgd_events['dwell_datestart'] < DateTime(opt.start).date]
        start = DateTime(opt.start)
    if start is None:
        start = DateTime(-7)

    new_events, stop = get_events(start)
    if len(new_events) > 0:

        new_events = Table(new_events)
        for obsid in np.unique(new_events['obsid']):
            if obsid in [0, -1]:
                continue
            url = f"{opt.web_url}/events/obs_{obsid:05d}/index.html"
            logger.warning(f"HI BGD event at in obsid {obsid} {url}")
            if len(opt.emails) > 0:
                send_mail(logger, opt, f'ACA HI BGD event in obsid {obsid}',
                          f'HI BGD in obsid {obsid} report at {url}', __file__)

    if len(bgd_events) > 0:
        bgd_events = vstack([bgd_events, new_events])
    else:
        bgd_events = new_events

    # Add a null event at the end
    bgd_events.add_row()
    bgd_events[-1]['obsid'] = -1
    bgd_events[-1]['dwell_datestart'] = DateTime(stop).date

    bgd_events.write(EVENT_ARCHIVE, format='ascii', overwrite=True)

    make_event_reports(bgd_events, opt.web_out)
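
A minimal sketch of the sentinel-row bookkeeping used above, with a toy table whose values are made up (column names follow the code above); the last row with obsid == -1 only records how far processing got, and real events are recovered by filtering it out on the next run:

# Sketch of the sentinel-row pattern above, on toy data.
from astropy.table import Table

bgd_events = Table(rows=[(23456, '2023:100:00:00:00.000'),
                         (-1, '2023:107:00:00:00.000')],      # sentinel marking the stop time
                   names=('obsid', 'dwell_datestart'))

start = bgd_events['dwell_datestart'][-1]           # resume from the recorded stop time
bgd_events = bgd_events[bgd_events['obsid'] != -1]  # drop the bogus sentinel row
print(start, len(bgd_events))
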
Example #3
def update_sync_repo(opt, logger, content):
    """

    :param opt: argparse options
    :param logger: logger instance
    :param content: content type
    :return:
    """
    # File types context dict
    ft = fetch.ft
    ft['content'] = content

    index_file = Path(sync_files['index'].abs)
    index_tbl = update_index_file(index_file, opt, logger)

    if index_tbl is None:
        # Index table was not created, nothing more to do here
        logger.warning(f'No index table for {content}')
        return

    for row in index_tbl:
        ft = fetch.ft
        ft['date_id'] = row['date_id']

        update_sync_data_full(content, logger, row)
        update_sync_data_stat(content, logger, row, '5min')
        update_sync_data_stat(content, logger, row, 'daily')

    remove_mask = remove_outdated_sync_files(opt, logger, index_tbl)
    if np.any(remove_mask):
        index_tbl = index_tbl[~remove_mask]
        logger.info(
            f'Writing {len(index_tbl)} row(s) to index file {index_file}')
        index_tbl.write(index_file, format='ascii.ecsv')
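
A minimal sketch of the mask-and-rewrite step at the end of the function, using a toy index table (the column names and values here are hypothetical):

# Sketch: drop masked rows from an astropy Table and rewrite it as ECSV,
# mirroring the remove_mask handling above.  Column names are made up.
import numpy as np
from astropy.table import Table

index_tbl = Table({'date_id': ['2021-01-01T0000', '2021-02-01T0000', '2021-03-01T0000'],
                   'row0': [0, 1000, 2000]})
remove_mask = np.array([True, False, False])   # e.g. rows older than the retention window

if np.any(remove_mask):
    index_tbl = index_tbl[~remove_mask]
    index_tbl.write('index.ecsv', format='ascii.ecsv', overwrite=True)
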
Example #4
def read_archfile(i, f, filetype, row, colnames, archfiles, db):
    """Read filename ``f`` with index ``i`` (position within list of filenames).  The
    file has type ``filetype`` and will be added to MSID file at row index ``row``.
    ``colnames`` is the list of column names for the content type (not used here).
    """
    # Check if filename is already in archfiles.  If so then abort further processing.
    filename = os.path.basename(f)
    if db.fetchall('SELECT filename FROM archfiles WHERE filename=?',
                   (filename, )):
        logger.verbose(
            'File %s already in archfiles - unlinking and skipping' % f)
        os.unlink(f)
        return None, None

    # Read FITS archive file and accumulate data into dats list and header into headers dict
    logger.info('Reading (%d / %d) %s' % (i, len(archfiles), filename))
    hdus = pyfits.open(f)
    hdu = hdus[1]

    try:
        dat = converters.convert(hdu.data, filetype['content'])

    except converters.NoValidDataError:
        # When creating files allow NoValidDataError
        hdus.close()
        logger.warning(
            'WARNING: no valid data in data file {}'.format(filename))
        return None, None

    except converters.DataShapeError as err:
        hdus.close()
        logger.warning(
            'WARNING: skipping file {} with bad data shape: ASCDSVER={} {}'.
            format(filename, hdu.header['ASCDSVER'], err))
        return None, None

    # Accumulate relevant info about archfile that will be ingested into
    # MSID h5 files.  Commit info before h5 ingest so if there is a failure
    # the needed info will be available to do the repair.
    archfiles_row = dict(
        (x, hdu.header.get(x.upper())) for x in archfiles_hdr_cols)
    archfiles_row['checksum'] = hdu.header.get('checksum') or hdu._checksum
    archfiles_row['rowstart'] = row
    archfiles_row['rowstop'] = row + len(dat)
    archfiles_row['filename'] = filename
    archfiles_row['filetime'] = int(
        re.search(r'(\d+)', archfiles_row['filename']).group(1))
    filedate = DateTime(archfiles_row['filetime']).date
    year, doy = (int(x)
                 for x in re.search(r'(\d\d\d\d):(\d\d\d)', filedate).groups())
    archfiles_row['year'] = year
    archfiles_row['doy'] = doy
    hdus.close()

    return dat, archfiles_row
Example #5
def read_archfile(i, f, filetype, row, colnames, archfiles, db):
    """Read filename ``f`` with index ``i`` (position within list of filenames).  The
    file has type ``filetype`` and will be added to MSID file at row index ``row``.
    ``colnames`` is the list of column names for the content type (not used here).
    """
    # Check if filename is already in archfiles.  If so then abort further processing.
    filename = os.path.basename(f)
    if db.fetchall('SELECT filename FROM archfiles WHERE filename=?', (filename,)):
        logger.verbose('File %s already in archfiles - unlinking and skipping' % f)
        os.unlink(f)
        return None, None

    # Read FITS archive file and accumulate data into dats list and header into headers dict
    logger.info('Reading (%d / %d) %s' % (i, len(archfiles), filename))
    hdus = pyfits.open(f, character_as_bytes=True)
    hdu = hdus[1]

    try:
        dat = converters.convert(hdu.data, filetype['content'])

    except converters.NoValidDataError:
        # When creating files allow NoValidDataError
        hdus.close()
        logger.warning('WARNING: no valid data in data file {}'.format(filename))
        return None, None

    except converters.DataShapeError as err:
        hdus.close()
        logger.warning('WARNING: skipping file {} with bad data shape: ASCDSVER={} {}'
                       .format(filename, hdu.header['ASCDSVER'], err))
        return None, None

    # Accumulate relevant info about archfile that will be ingested into
    # MSID h5 files.  Commit info before h5 ingest so if there is a failure
    # the needed info will be available to do the repair.
    archfiles_row = dict((x, hdu.header.get(x.upper())) for x in archfiles_hdr_cols)
    archfiles_row['checksum'] = hdu.header.get('checksum') or hdu._checksum
    archfiles_row['rowstart'] = row
    archfiles_row['rowstop'] = row + len(dat)
    archfiles_row['filename'] = filename
    archfiles_row['filetime'] = int(re.search(r'(\d+)', archfiles_row['filename']).group(1))
    filedate = DateTime(archfiles_row['filetime']).date
    year, doy = (int(x) for x in re.search(r'(\d\d\d\d):(\d\d\d)', filedate).groups())
    archfiles_row['year'] = year
    archfiles_row['doy'] = doy
    hdus.close()

    return dat, archfiles_row
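
The filetime / year / doy bookkeeping above needs only the standard library; a standalone sketch with a hypothetical archive filename and date string:

# Sketch of the filename/date parsing above (filename and date string are examples only).
import re

filename = 'acisf736459028N001_2eng0.fits.gz'            # hypothetical archive file name
filetime = int(re.search(r'(\d+)', filename).group(1))   # first run of digits -> file time
print(filetime)                                           # 736459028

filedate = '2021:123:04:37:08.816'                        # date string in year:doy:... format
year, doy = (int(x) for x in re.search(r'(\d\d\d\d):(\d\d\d)', filedate).groups())
print(year, doy)                                          # 2021 123
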
Example #6
def update_msid_files(filetype, archfiles):
    colnames = pickle.load(open(msid_files['colnames'].abs, 'rb'))
    colnames_all = pickle.load(open(msid_files['colnames_all'].abs, 'rb'))
    old_colnames = colnames.copy()
    old_colnames_all = colnames_all.copy()

    # Setup db handle with autocommit=False so that error along the way aborts insert transactions
    db = Ska.DBI.DBI(dbi='sqlite', server=msid_files['archfiles'].abs, autocommit=False)

    # Get the last row number from the archfiles table
    out = db.fetchone('SELECT max(rowstop) FROM archfiles')
    row = out['max(rowstop)'] or 0
    last_archfile = db.fetchone('SELECT * FROM archfiles where rowstop=?', (row,))

    archfiles_overlaps = []
    dats = []
    archfiles_processed = []

    content_is_derived = (filetype['instrum'] == 'DERIVED')

    for i, f in enumerate(archfiles):
        get_data = (read_derived if content_is_derived else read_archfile)
        dat, archfiles_row = get_data(i, f, filetype, row, colnames, archfiles, db)
        if dat is None:
            continue

        # If creating new content type and there are no existing colnames, then
        # define the column names now.  Filter out any multidimensional
        # columns, including (typically) QUALITY.
        if opt.create and not colnames:
            colnames = set(dat.dtype.names)
            for colname in dat.dtype.names:
                if len(dat[colname].shape) > 1:
                    logger.info('Removing column {} from colnames because shape = {}'
                                .format(colname, dat[colname].shape))
                    colnames.remove(colname)

        # Ensure that the time gap between the end of the last ingested archive
        # file and the start of this one is less than opt.max_gap (or
        # filetype-based defaults).  If this fails then break out of the
        # archfiles processing but continue on to ingest any previously
        # successful archfiles
        if last_archfile is None:
            time_gap = 0
        else:
            time_gap = archfiles_row['tstart'] - last_archfile['tstop']
        max_gap = opt.max_gap
        if max_gap is None:
            if filetype['instrum'] in ['EPHEM', 'DERIVED']:
                max_gap = 601
            elif filetype['content'] == 'ACISDEAHK':
                max_gap = 10000
                # From P.Plucinsky 2011-09-23
                # If ACIS is executing an Event Histogram run while in FMT1,
                # the telemetry stream will saturate.  The amount of time for
                # an opening in the telemetry to appear such that DEA HKP
                # packets can get out is a bit indeterminate.  The histograms
                # integrate for 5400s and then they are telemetered.  I would
                # suggest 6000s, but perhaps you would want to double that to
                # 12000s.
            elif filetype['content'] in ['CPE1ENG', 'CCDM15ENG']:
                # 100 years => no max gap for safe mode telemetry or dwell mode telemetry
                max_gap = 100 * 3.1e7
            else:
                max_gap = 32.9
        if time_gap > max_gap:
            logger.warning('WARNING: found gap of %.2f secs between archfiles %s and %s',
                           time_gap, last_archfile['filename'], archfiles_row['filename'])
            if opt.create:
                logger.warning('       Allowing gap because of opt.create=True')
            elif DateTime() - DateTime(archfiles_row['tstart']) > opt.allow_gap_after_days:
                # After 4 days (by default) just let it go through because this is
                # likely a real gap and will not be fixed by subsequent processing.
                # This can happen after normal sun mode to SIM products.
                logger.warning('       Allowing gap because arch file '
                               'start is more than {} days old'
                               .format(opt.allow_gap_after_days))
            else:
                break
        elif time_gap < 0:
            # Overlapping archfiles - deal with this in append_h5_col
            archfiles_overlaps.append((last_archfile, archfiles_row))

        # Update the last_archfile values.
        last_archfile = archfiles_row

        # A very small number of archive files (a few) have a problem where the
        # quality column tform is specified as 3B instead of 17X (for example).
        # This breaks things, so in this case just skip the file.  However
        # since last_archfile is set above the gap check considers this file to
        # have been ingested.
        if not content_is_derived and dat['QUALITY'].shape[1] != len(dat.dtype.names):
            logger.warning('WARNING: skipping because of quality size mismatch: %d %d' %
                           (dat['QUALITY'].shape[1], len(dat.dtype.names)))
            continue

        # Mark the archfile as ingested in the database and add to list for
        # subsequent relocation into arch_files archive.  In the case of a gap
        # where ingest is stopped before all archfiles are processed, this will
        # leave files either in a tmp dir (HEAD) or in the stage dir (OCC).
        # In the latter case this allows for successful processing later when the
        # gap gets filled.
        archfiles_processed.append(f)
        if not opt.dry_run:
            db.insert(archfiles_row, 'archfiles')

        # Capture the data for subsequent storage in the hdf5 files
        dats.append(dat)

        # Update the running list of column names.  Colnames_all is the maximal (union)
        # set giving all column names seen in any file for this content type.  Colnames
        # was historically the minimal (intersection) set giving the list of column names
        # seen in every file, but as of 0.39 it is allowed to grow as well to accommodate
        # adding MSIDs in the TDB.  Include only 1-d columns, not things like AEPERR
        # in PCAD8ENG which is a 40-element binary vector.
        colnames_all.update(dat.dtype.names)
        colnames.update(name for name in dat.dtype.names if dat[name].ndim == 1)

        row += len(dat)

    if dats:
        logger.verbose('Writing accumulated column data to h5 file at ' + time.ctime())
        data_lens = set()
        processed_cols = set()
        for colname in colnames:
            ft['msid'] = colname
            if not os.path.exists(msid_files['msid'].abs):
                make_h5_col_file(dats, colname)
                if not opt.create:
                    # New MSID was found for this content type.  This must be associated with
                    # an update to the TDB.  Skip for the moment to ensure that other MSIDs
                    # are fully processed.
                    continue
            data_len = append_h5_col(dats, colname, archfiles_overlaps)
            data_lens.add(data_len)
            processed_cols.add(colname)

        if len(data_lens) != 1:
            raise ValueError('h5 data length inconsistency {}, investigate NOW!'
                             .format(data_lens))

        # Process any new MSIDs (this is extremely rare)
        data_len = data_lens.pop()
        for colname in colnames - processed_cols:
            ft['msid'] = colname
            append_filled_h5_col(dats, colname, data_len)

    # Assuming everything worked now commit the db inserts that signify the
    # new archive files have been processed
    if not opt.dry_run:
        db.commit()

    # If colnames or colnames_all changed then give warning and update files.
    if colnames != old_colnames:
        logger.warning('WARNING: updating %s because colnames changed: %s'
                       % (msid_files['colnames'].abs, old_colnames ^ colnames))
        if not opt.dry_run:
            pickle.dump(colnames, open(msid_files['colnames'].abs, 'wb'), protocol=0)
    if colnames_all != old_colnames_all:
        logger.warning('WARNING: updating %s because colnames_all changed: %s'
                       % (msid_files['colnames_all'].abs, colnames_all ^ old_colnames_all))
        if not opt.dry_run:
            pickle.dump(colnames_all, open(msid_files['colnames_all'].abs, 'wb'), protocol=0)

    return archfiles_processed
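
The gap handling above reduces to a small decision: pick a content-dependent default for max_gap when the option is not set, then compare it to the time gap between consecutive archive files. A distilled sketch of just that logic (the helper name is mine; the values are copied from the code above):

# Distilled from update_msid_files above: default max_gap per content type.
def default_max_gap(filetype):
    if filetype['instrum'] in ['EPHEM', 'DERIVED']:
        return 601
    if filetype['content'] == 'ACISDEAHK':
        return 10000        # event-histogram runs can block DEA HKP telemetry for a long time
    if filetype['content'] in ['CPE1ENG', 'CCDM15ENG']:
        return 100 * 3.1e7  # effectively no limit for safe mode / dwell mode telemetry
    return 32.9

filetype = {'instrum': 'ACIS', 'content': 'ACISDEAHK'}
time_gap = 7200.0
print(time_gap > default_max_gap(filetype))   # False: within the allowed gap
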
Example #7
def calc_stats_vals(msid, rows, indexes, interval):
    """
    Compute statistics values for ``msid`` over specified intervals.

    :param msid: Msid object (filter_bad=True)
    :param rows: Msid row indices corresponding to stat boundaries
    :param indexes: Universal index values for stat (row times // dt)
    :param interval: interval name (5min or daily)
    """
    quantiles = (1, 5, 16, 50, 84, 95, 99)
    n_out = len(rows) - 1

    # Check if data type is "numeric".  Boolean values count as numeric,
    # partly for historical reasons, in that they support funcs like
    # mean (with implicit conversion to float).
    msid_dtype = msid.vals.dtype
    msid_is_numeric = issubclass(msid_dtype.type, (np.number, np.bool_))

    # If MSID data is unicode, then for stats purposes cast back to bytes
    # by creating the output array as a like-sized S-type array.
    if msid_dtype.kind == 'U':
        msid_dtype = re.sub(r'U', 'S', msid.vals.dtype.str)

    # Predeclare numpy arrays of correct type and sufficient size for accumulating results.
    out = OrderedDict()
    out['index'] = np.ndarray((n_out,), dtype=np.int32)
    out['n'] = np.ndarray((n_out,), dtype=np.int32)
    out['val'] = np.ndarray((n_out,), dtype=msid_dtype)

    if msid_is_numeric:
        out['min'] = np.ndarray((n_out,), dtype=msid_dtype)
        out['max'] = np.ndarray((n_out,), dtype=msid_dtype)
        out['mean'] = np.ndarray((n_out,), dtype=np.float32)

        if interval == 'daily':
            out['std'] = np.ndarray((n_out,), dtype=msid_dtype)
            for quantile in quantiles:
                out['p{:02d}'.format(quantile)] = np.ndarray((n_out,), dtype=msid_dtype)

    # MSID may have state codes
    if msid.state_codes:
        for raw_count, state_code in msid.state_codes:
            out['n_' + fix_state_code(state_code)] = np.zeros(n_out, dtype=np.int32)

    i = 0
    for row0, row1, index in zip(rows[:-1], rows[1:], indexes[:-1]):
        vals = msid.vals[row0:row1]
        times = msid.times[row0:row1]

        n_vals = len(vals)
        if n_vals > 0:
            out['index'][i] = index
            out['n'][i] = n_vals
            out['val'][i] = vals[n_vals // 2]
            if msid_is_numeric:
                if n_vals <= 2:
                    dts = np.ones(n_vals, dtype=np.float64)
                else:
                    dts = np.empty(n_vals, dtype=np.float64)
                    dts[0] = times[1] - times[0]
                    dts[-1] = times[-1] - times[-2]
                    dts[1:-1] = ((times[1:-1] - times[:-2]) +
                                 (times[2:] - times[1:-1])) / 2.0
                    negs = dts < 0.0
                    if np.any(negs):
                        times_dts = [(DateTime(t).date, dt)
                                     for t, dt in zip(times[negs], dts[negs])]
                        logger.warning('WARNING - negative dts in {} at {}'
                                       .format(msid.MSID, times_dts))

                    # Clip to range 0.001 to 300.0.  The low bound is just there
                    # for data with identical time stamps.  This shouldn't happen
                    # but in practice might.  The 300.0 represents 5 minutes and
                    # is the largest normal time interval.  Data near large gaps
                    # will get a weight of 5 mins.
                    dts.clip(0.001, 300.0, out=dts)
                sum_dts = np.sum(dts)

                out['min'][i] = np.min(vals)
                out['max'][i] = np.max(vals)
                out['mean'][i] = np.sum(dts * vals) / sum_dts
                if interval == 'daily':
                    # biased weighted estimator of variance (N should be big enough)
                    # http://en.wikipedia.org/wiki/Mean_square_weighted_deviation
                    sigma_sq = np.sum(dts * (vals - out['mean'][i]) ** 2) / sum_dts
                    out['std'][i] = np.sqrt(sigma_sq)
                    quant_vals = scipy.stats.mstats.mquantiles(vals, np.array(quantiles) / 100.0)
                    for quant_val, quantile in zip(quant_vals, quantiles):
                        out['p%02d' % quantile][i] = quant_val

            if msid.state_codes:
                # If MSID has state codes then count the number of values in each state
                # and store.  The MSID values can have trailing spaces to fill out to a
                # uniform length, so state_code is right padded accordingly.
                max_len = max(len(state_code) for raw_count, state_code in msid.state_codes)
                fmtstr = '{:' + str(max_len) + 's}'
                for raw_count, state_code in msid.state_codes:
                    state_count = np.count_nonzero(vals == fmtstr.format(state_code))
                    out['n_' + fix_state_code(state_code)][i] = state_count

            i += 1

    return np.rec.fromarrays([x[:i] for x in out.values()], names=list(out.keys()))
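
The time-weighted statistics in the inner loop stand on their own; a small numpy sketch of the midpoint-dt weights, weighted mean, and biased weighted variance on toy values:

# Sketch of the dt-weighted mean/std computed per interval above, on toy data.
import numpy as np

times = np.array([0.0, 1.0, 2.0, 4.0, 8.0])
vals = np.array([10.0, 11.0, 12.0, 13.0, 14.0])

dts = np.empty(len(vals))
dts[0] = times[1] - times[0]
dts[-1] = times[-1] - times[-2]
dts[1:-1] = ((times[1:-1] - times[:-2]) + (times[2:] - times[1:-1])) / 2.0
dts.clip(0.001, 300.0, out=dts)        # guard against identical stamps and huge gaps

sum_dts = np.sum(dts)
mean = np.sum(dts * vals) / sum_dts
sigma_sq = np.sum(dts * (vals - mean) ** 2) / sum_dts   # biased weighted variance
print(mean, np.sqrt(sigma_sq))
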
Example #8
def calc_stats_vals(msid, rows, indexes, interval):
    quantiles = (1, 5, 16, 50, 84, 95, 99)
    cols_stats = ('index', 'n', 'val')
    n_out = len(rows) - 1
    msid_dtype = msid.vals.dtype
    msid_is_numeric = not msid_dtype.name.startswith('string')
    # Predeclare numpy arrays of correct type and sufficient size for accumulating results.
    out = dict(
        index=np.ndarray((n_out, ), dtype=np.int32),
        n=np.ndarray((n_out, ), dtype=np.int32),
        val=np.ndarray((n_out, ), dtype=msid_dtype),
    )
    if msid_is_numeric:
        cols_stats += ('min', 'max', 'mean')
        out.update(
            dict(
                min=np.ndarray((n_out, ), dtype=msid_dtype),
                max=np.ndarray((n_out, ), dtype=msid_dtype),
                mean=np.ndarray((n_out, ), dtype=np.float32),
            ))
        if interval == 'daily':
            cols_stats += ('std', ) + tuple('p%02d' % x for x in quantiles)
            out['std'] = np.ndarray((n_out, ), dtype=msid_dtype)
            out.update(('p%02d' % x, np.ndarray((n_out, ), dtype=msid_dtype))
                       for x in quantiles)
    i = 0
    for row0, row1, index in zip(rows[:-1], rows[1:], indexes[:-1]):
        vals = msid.vals[row0:row1]
        times = msid.times[row0:row1]
        n_vals = len(vals)
        if n_vals > 0:
            out['index'][i] = index
            out['n'][i] = n_vals
            out['val'][i] = vals[n_vals // 2]
            if msid_is_numeric:
                if n_vals <= 2:
                    dts = np.ones(n_vals, dtype=np.float64)
                else:
                    dts = np.empty(n_vals, dtype=np.float64)
                    dts[0] = times[1] - times[0]
                    dts[-1] = times[-1] - times[-2]
                    dts[1:-1] = ((times[1:-1] - times[:-2]) +
                                 (times[2:] - times[1:-1])) / 2.0
                    negs = dts < 0.0
                    if np.any(negs):
                        times_dts = [(DateTime(t).date, dt)
                                     for t, dt in zip(times[negs], dts[negs])]
                        logger.warning(
                            'WARNING - negative dts in {} at {}'.format(
                                msid.MSID, times_dts))

                    # Clip to range 0.001 to 300.0.  The low bound is just there
                    # for data with identical time stamps.  This shouldn't happen
                    # but in practice might.  The 300.0 represents 5 minutes and
                    # is the largest normal time interval.  Data near large gaps
                    # will get a weight of 5 mins.
                    dts.clip(0.001, 300.0, out=dts)
                sum_dts = np.sum(dts)

                out['min'][i] = np.min(vals)
                out['max'][i] = np.max(vals)
                out['mean'][i] = np.sum(dts * vals) / sum_dts
                if interval == 'daily':
                    # biased weighted estimator of variance (N should be big enough)
                    # http://en.wikipedia.org/wiki/Mean_square_weighted_deviation
                    sigma_sq = np.sum(dts *
                                      (vals - out['mean'][i])**2) / sum_dts
                    out['std'][i] = np.sqrt(sigma_sq)
                    quant_vals = scipy.stats.mstats.mquantiles(
                        vals,
                        np.array(quantiles) / 100.0)
                    for quant_val, quantile in zip(quant_vals, quantiles):
                        out['p%02d' % quantile][i] = quant_val
            i += 1

    return np.rec.fromarrays([out[x][:i] for x in cols_stats],
                             names=cols_stats)
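
For the daily quantiles, the code relies on scipy's masked-stats quantile routine; a standalone sketch on random toy values:

# Sketch of the daily quantile computation above, on toy data.
import numpy as np
import scipy.stats.mstats

quantiles = (1, 5, 16, 50, 84, 95, 99)
vals = np.random.default_rng(0).normal(size=1000)

quant_vals = scipy.stats.mstats.mquantiles(vals, np.array(quantiles) / 100.0)
for quant_val, quantile in zip(quant_vals, quantiles):
    print('p%02d = %.3f' % (quantile, quant_val))
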
Example #9
def update_observed_metrics(obsid=None, start=None, stop=None, data_root=None, force=False,
                            factor=20, make_plots=False, save=False):
    """
    Update the ``GUIDE_METRICS_OBSID`` and ``GUIDE_METRICS_SLOT`` tables
    (ascii ECSV format) in place to reflect information about observed
    guide metrics: dr95, dr50, manvr angle, ending roll error, one shot
    updates, aberration corrections; and mean yag/zag, mean dyag/dzag
    centroid errors.

    :param factor: scaling for the centroid residual visualization in
                   the yag/zag plane (default 20)
    :param make_plots: if True then generate plots for centroid dashboard
                       (default False)
    :param save: if True then save the plots (default False)
    """

    if obsid is None:
        # Default is between NOW and (NOW - NDAYS) days
        start = DateTime(start) - (NDAYS if start is None else 0)
        stop = DateTime(stop)
        # Get obsids, both science and ERs
        obsids = [evt.obsid for evt in events.obsids.filter(start, stop)]
    else:
        obsids = [int(obsid)]

    if data_root is None:
        data_root = CD_ROOT

    obsid_metrics_file = os.path.join(data_root, GUIDE_METRICS_OBSID)
    slot_metrics_file = os.path.join(data_root, GUIDE_METRICS_SLOT)
    # Read in existing files if they exist and make a set of already-processed obsids
    dat_obsid_old, processed_obsids = read_metrics_from_file(obsid_metrics_file)
    dat_slot_old, tmp = read_metrics_from_file(slot_metrics_file)

    rows_obsid = []
    rows_slots = []
    for obsid in obsids:

        logger.info(f'Obsid={obsid}')
        obs_dir = get_cd_dir(obsid, data_root)

        if obsid in processed_obsids:
            if not force:
                logger.info(f'Skipping obsid {obsid}: already processed')
                continue
            else:
                if obsid in dat_obsid_old['obsid']:
                    idx = list(dat_obsid_old['obsid']).index(obsid)
                    dat_obsid_old.remove_row(idx)

                if obsid in dat_slot_old['obsid']:
                    ok = dat_slot_old['obsid'] == obsid
                    dat_slot_old.remove_rows(ok)

        try:
            metrics_obsid, metrics_slot = get_observed_metrics(obsid,
                                                               metrics_file=obsid_metrics_file)

            if not metrics_obsid['dwell']:
                logger.info(f'Skipping obsid {obsid}: not a dwell?')
                info = 'Not a dwell'
                make_special_case_html(metrics_obsid, obs_dir, info=info)
                continue

            if metrics_obsid['att_flag'] > 1:
                logger.info(f'Skipping obsid {obsid}: problem matching obc/ground times')
                info = 'Problem matching obc/ground att times'
                make_special_case_html(metrics_obsid, obs_dir, info=info)
                continue

            if obsid < 40000 and metrics_obsid['att_flag'] == 1:
                logger.info(f'Skipping science obsid {obsid}: no ground aspect solution')
                info = 'No ground aspect solution for science obsid'
                make_special_case_html(metrics_obsid, obs_dir, info=info)
                continue

            if make_plots:
                kwargs = {'factor': factor, 'save': save}
                plot_observed_metrics(obsid,
                                      plot_dir=obs_dir,
                                      coord='dr',
                                      att_errors=metrics_obsid['att_errors'],
                                      **kwargs)
        except (NoObsidError, NoDwellError, NoManvrError) as err:
            logger.info(f'Skipping obsid {obsid} missing data: {err}')
            continue
        except Exception as err:
            logger.warning(f'Skipping obsid {obsid} ERROR: {err}')
            continue

        # Process entries for 'per obsid' metrics

        keys_obsid = ('obsid', 'mean_date',
                      'att_flag', 'dr50', 'dr95',
                      'aber_y', 'aber_z', 'aber_flag',
                      'one_shot_pitch', 'one_shot_yaw',
                      'one_shot', 'one_shot_aber_corrected',
                      'manvr_angle', 'preceding_roll_err', 'ending_roll_err',
                      'obsid_preceding', 'obsid_next')

        row_obsid = {k: metrics_obsid[k] for k in keys_obsid}
        rows_obsid.append(row_obsid)

        # Process entries for 'per slot' metrics
        row_slots = []
        for slot in range(8):
            out = {}
            slot_data = metrics_slot['slots'][slot]
            if bool(slot_data):
                out['obsid'] = obsid
                out['slot'] = slot
                out['mean_date'] = metrics_obsid['mean_date']
                keys_slot = ('id', 'type', 'mag', 'yang', 'zang',
                             'median_mag', 'median_dy', 'median_dz')
                out.update({k: slot_data[k] for k in keys_slot})
                # Needed to build html
                row_slots.append(out)
                # Needed to update 'per slot' data file
                rows_slots.append(out)

        # Build html page for this obsid
        make_html(row_obsid, row_slots, obs_dir)

        # Update the 'per_obsid' table
        if rows_obsid:
            sort_cols = ['mean_date']
            update_data_table(rows_obsid, dat_obsid_old, obsid_metrics_file, sort_cols)

        # Update the 'per_slot' table
        if rows_slots:
            sort_cols = ['mean_date', 'slot']
            update_data_table(rows_slots, dat_slot_old, slot_metrics_file, sort_cols)
Example #10
def calc_stats_vals(msid, rows, indexes, interval):
    quantiles = (1, 5, 16, 50, 84, 95, 99)
    cols_stats = ('index', 'n', 'val')
    n_out = len(rows) - 1
    msid_dtype = msid.vals.dtype
    msid_is_numeric = not msid_dtype.name.startswith('string')
    # Predeclare numpy arrays of correct type and sufficient size for accumulating results.
    out = dict(index=np.ndarray((n_out,), dtype=np.int32),
               n=np.ndarray((n_out,), dtype=np.int32),
               val=np.ndarray((n_out,), dtype=msid_dtype),
               )
    if msid_is_numeric:
        cols_stats += ('min', 'max', 'mean')
        out.update(dict(min=np.ndarray((n_out,), dtype=msid_dtype),
                        max=np.ndarray((n_out,), dtype=msid_dtype),
                        mean=np.ndarray((n_out,), dtype=np.float32),))
        if interval == 'daily':
            cols_stats += ('std',) + tuple('p%02d' % x for x in quantiles)
            out['std'] = np.ndarray((n_out,), dtype=msid_dtype)
            out.update(('p%02d' % x, np.ndarray((n_out,), dtype=msid_dtype)) for x in quantiles)
    i = 0
    for row0, row1, index in zip(rows[:-1], rows[1:], indexes[:-1]):
        vals = msid.vals[row0:row1]
        times = msid.times[row0:row1]
        n_vals = len(vals)
        if n_vals > 0:
            out['index'][i] = index
            out['n'][i] = n_vals
            out['val'][i] = vals[n_vals // 2]
            if msid_is_numeric:
                if n_vals <= 2:
                    dts = np.ones(n_vals, dtype=np.float64)
                else:
                    dts = np.empty(n_vals, dtype=np.float64)
                    dts[0] = times[1] - times[0]
                    dts[-1] = times[-1] - times[-2]
                    dts[1:-1] = ((times[1:-1] - times[:-2])
                                 + (times[2:] - times[1:-1])) / 2.0
                    negs = dts < 0.0
                    if np.any(negs):
                        times_dts = [(DateTime(t).date, dt)
                                     for t, dt in zip(times[negs], dts[negs])]
                        logger.warning('WARNING - negative dts in {} at {}'
                                       .format(msid.MSID, times_dts))

                    # Clip to range 0.001 to 300.0.  The low bound is just there
                    # for data with identical time stamps.  This shouldn't happen
                    # but in practice might.  The 300.0 represents 5 minutes and
                    # is the largest normal time interval.  Data near large gaps
                    # will get a weight of 5 mins.
                    dts.clip(0.001, 300.0, out=dts)
                sum_dts = np.sum(dts)

                out['min'][i] = np.min(vals)
                out['max'][i] = np.max(vals)
                out['mean'][i] = np.sum(dts * vals) / sum_dts
                if interval == 'daily':
                    # biased weighted estimator of variance (N should be big enough)
                    # http://en.wikipedia.org/wiki/Mean_square_weighted_deviation
                    sigma_sq = np.sum(dts * (vals - out['mean'][i]) ** 2) / sum_dts
                    out['std'][i] = np.sqrt(sigma_sq)
                    quant_vals = scipy.stats.mstats.mquantiles(vals, np.array(quantiles) / 100.0)
                    for quant_val, quantile in zip(quant_vals, quantiles):
                        out['p%02d' % quantile][i] = quant_val
            i += 1

    return np.rec.fromarrays([out[x][:i] for x in cols_stats], names=cols_stats)
Example #11
def calc_stats_vals(msid, rows, indexes, interval):
    """
    Compute statistics values for ``msid`` over specified intervals.
    :param msid: Msid object (filter_bad=True)
    :param rows: Msid row indices corresponding to stat boundaries
    :param indexes: Universal index values for stat (row times // dt)
    :param interval: interval name (5min or daily)
    """
    quantiles = (1, 5, 16, 50, 84, 95, 99)
    n_out = len(rows) - 1

    # Check if data type is "numeric".  Boolean values count as numeric,
    # partly for historical reasons, in that they support funcs like
    # mean (with implicit conversion to float).
    msid_dtype = msid.vals.dtype
    msid_is_numeric = issubclass(msid_dtype.type, (np.number, np.bool_))

    # Predeclare numpy arrays of correct type and sufficient size for accumulating results.
    out = OrderedDict()
    out['index'] = np.ndarray((n_out, ), dtype=np.int32)
    out['n'] = np.ndarray((n_out, ), dtype=np.int32)
    out['val'] = np.ndarray((n_out, ), dtype=msid_dtype)

    if msid_is_numeric:
        out['min'] = np.ndarray((n_out, ), dtype=msid_dtype)
        out['max'] = np.ndarray((n_out, ), dtype=msid_dtype)
        out['mean'] = np.ndarray((n_out, ), dtype=np.float32)

        if interval == 'daily':
            out['std'] = np.ndarray((n_out, ), dtype=msid_dtype)
            for quantile in quantiles:
                out['p{:02d}'.format(quantile)] = np.ndarray((n_out, ),
                                                             dtype=msid_dtype)

    # MSID may have state codes
    # if msid.state_codes:
    #     for raw_count, state_code in msid.state_codes:
    #         out['n_' + fix_state_code(state_code)] = np.zeros(n_out, dtype=np.int32)

    i = 0
    for row0, row1, index in zip(rows[:-1], rows[1:], indexes[:-1]):
        vals = msid.vals[row0:row1]
        times = msid.times[row0:row1]

        n_vals = len(vals)
        if n_vals > 0:
            out['index'][i] = index
            out['n'][i] = n_vals
            out['val'][i] = vals[n_vals // 2]
            if msid_is_numeric:
                if n_vals <= 2:
                    dts = np.ones(n_vals, dtype=np.float64)
                else:
                    dts = np.empty(n_vals, dtype=np.float64)
                    dts[0] = times[1] - times[0]
                    dts[-1] = times[-1] - times[-2]
                    dts[1:-1] = ((times[1:-1] - times[:-2]) +
                                 (times[2:] - times[1:-1])) / 2.0
                    negs = dts < 0.0
                    if np.any(negs):
                        times_dts = [(Time(t, format="unix").yday, dt)
                                     for t, dt in zip(times[negs], dts[negs])]
                        logger.warning(
                            'WARNING - negative dts in {} at {}'.format(
                                msid.MSID, times_dts))

                    # Clip to range 0.001 to 300.0.  The low bound is just there
                    # for data with identical time stamps.  This shouldn't happen
                    # but in practice might.  The 300.0 represents 5 minutes and
                    # is the largest normal time interval.  Data near large gaps
                    # will get a weight of 5 mins.
                    dts.clip(0.001, 300.0, out=dts)
                sum_dts = np.sum(dts)

                out['min'][i] = np.min(vals)
                out['max'][i] = np.max(vals)
                out['mean'][i] = np.sum(dts * vals) / sum_dts
                if interval == 'daily':
                    # biased weighted estimator of variance (N should be big enough)
                    # http://en.wikipedia.org/wiki/Mean_square_weighted_deviation
                    sigma_sq = np.sum(dts *
                                      (vals - out['mean'][i])**2) / sum_dts
                    out['std'][i] = np.sqrt(sigma_sq)
                    quant_vals = scipy.stats.mstats.mquantiles(
                        vals,
                        np.array(quantiles) / 100.0)
                    for quant_val, quantile in zip(quant_vals, quantiles):
                        out['p%02d' % quantile][i] = quant_val

            # if msid.state_codes:
            # If MSID has state codes then count the number of values in each state
            # and store.  The MSID values can have trailing spaces to fill out to a
            # uniform length, so state_code is right padded accordingly.
            # max_len = max(len(state_code) for raw_count, state_code in msid.state_codes)
            # fmtstr = '{:' + str(max_len) + 's}'
            # for raw_count, state_code in msid.state_codes:
            #     state_count = np.count_nonzero(vals == fmtstr.format(state_code))
            #     out['n_' + fix_state_code(state_code)][i] = state_count

            i += 1

    return np.rec.fromarrays([x[:i] for x in out.values()],
                             names=list(out.keys()))
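
The return value is assembled by trimming each accumulator array to the i intervals actually filled and packing them into a record array; a tiny sketch of that final step with toy accumulators:

# Sketch of the final np.rec.fromarrays step above, with toy accumulators.
from collections import OrderedDict
import numpy as np

out = OrderedDict()
out['index'] = np.array([100, 101, 0, 0], dtype=np.int32)    # pre-allocated, 2 slots filled
out['n'] = np.array([5, 7, 0, 0], dtype=np.int32)
out['val'] = np.array([1.5, 2.5, 0.0, 0.0], dtype=np.float32)

i = 2   # number of intervals actually filled
stats = np.rec.fromarrays([x[:i] for x in out.values()], names=list(out.keys()))
print(stats.dtype.names, len(stats))   # ('index', 'n', 'val') 2
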
Example #12
def update_msid_files(filetype, archfiles):
    colnames = pickle.load(open(msid_files['colnames'].abs, 'rb'))
    colnames_all = pickle.load(open(msid_files['colnames_all'].abs, 'rb'))
    old_colnames = colnames.copy()
    old_colnames_all = colnames_all.copy()

    # Setup db handle with autocommit=False so that error along the way aborts insert transactions
    db = Ska.DBI.DBI(dbi='sqlite',
                     server=msid_files['archfiles'].abs,
                     autocommit=False)

    # Get the last row number from the archfiles table
    out = db.fetchone('SELECT max(rowstop) FROM archfiles')
    row = out['max(rowstop)'] or 0
    last_archfile = db.fetchone('SELECT * FROM archfiles where rowstop=?',
                                (row, ))

    archfiles_overlaps = []
    dats = []
    archfiles_processed = []

    content_is_derived = (filetype['instrum'] == 'DERIVED')

    for i, f in enumerate(archfiles):
        get_data = (read_derived if content_is_derived else read_archfile)
        dat, archfiles_row = get_data(i, f, filetype, row, colnames, archfiles,
                                      db)
        if dat is None:
            continue

        # If creating new content type and there are no existing colnames, then
        # define the column names now.  Filter out any multidimensional
        # columns, including (typically) QUALITY.
        if opt.create and not colnames:
            colnames = set(dat.dtype.names)
            for colname in dat.dtype.names:
                if len(dat[colname].shape) > 1:
                    logger.info(
                        'Removing column {} from colnames because shape = {}'.
                        format(colname, dat[colname].shape))
                    colnames.remove(colname)

        # Ensure that the time gap between the end of the last ingested archive
        # file and the start of this one is less than opt.max_gap (or
        # filetype-based defaults).  If this fails then break out of the
        # archfiles processing but continue on to ingest any previously
        # successful archfiles
        if last_archfile is None:
            time_gap = 0
        else:
            time_gap = archfiles_row['tstart'] - last_archfile['tstop']
        max_gap = opt.max_gap
        if max_gap is None:
            if filetype['instrum'] in ['EPHEM', 'DERIVED']:
                max_gap = 601
            elif filetype['content'] == 'ACISDEAHK':
                max_gap = 10000
                # From P.Plucinsky 2011-09-23
                # If ACIS is executing an Event Histogram run while in FMT1,
                # the telemetry stream will saturate.  The amount of time for
                # an opening in the telemetry to appear such that DEA HKP
                # packets can get out is a bit indeterminate.  The histograms
                # integrate for 5400s and then they are telemetered.  I would
                # suggest 6000s, but perhaps you would want to double that to
                # 12000s.
            elif filetype['content'] in ['CPE1ENG', 'CCDM15ENG']:
                # 100 years => no max gap for safe mode telemetry or dwell mode telemetry
                max_gap = 100 * 3.1e7
            else:
                max_gap = 32.9
        if time_gap > max_gap:
            logger.warning(
                'WARNING: found gap of %.2f secs between archfiles %s and %s',
                time_gap, last_archfile['filename'], archfiles_row['filename'])
            if opt.create:
                logger.warning(
                    '       Allowing gap because of opt.create=True')
            elif DateTime() - DateTime(
                    archfiles_row['tstart']) > opt.allow_gap_after_days:
                # After 4 days (by default) just let it go through because this is
                # likely a real gap and will not be fixed by subsequent processing.
                # This can happen after normal sun mode to SIM products.
                logger.warning('       Allowing gap because arch file '
                               'start is more than {} days old'.format(
                                   opt.allow_gap_after_days))
            else:
                break
        elif time_gap < 0:
            # Overlapping archfiles - deal with this in append_h5_col
            archfiles_overlaps.append((last_archfile, archfiles_row))

        # Update the last_archfile values.
        last_archfile = archfiles_row

        # A very small number of archive files (a few) have a problem where the
        # quality column tform is specified as 3B instead of 17X (for example).
        # This breaks things, so in this case just skip the file.  However
        # since last_archfile is set above the gap check considers this file to
        # have been ingested.
        if not content_is_derived and dat['QUALITY'].shape[1] != len(
                dat.dtype.names):
            logger.warning(
                'WARNING: skipping because of quality size mismatch: %d %d' %
                (dat['QUALITY'].shape[1], len(dat.dtype.names)))
            continue

        # Mark the archfile as ingested in the database and add to list for
        # subsequent relocation into arch_files archive.  In the case of a gap
        # where ingest is stopped before all archfiles are processed, this will
        # leave files either in a tmp dir (HEAD) or in the stage dir (OCC).
        # In the latter case this allows for successful processing later when the
        # gap gets filled.
        archfiles_processed.append(f)
        if not opt.dry_run:
            db.insert(archfiles_row, 'archfiles')

        # Capture the data for subsequent storage in the hdf5 files
        dats.append(dat)

        # Update the running list of column names.  Colnames_all is the maximal (union)
        # set giving all column names seen in any file for this content type.  Colnames
        # was historically the minimal (intersection) set giving the list of column names
        # seen in every file, but as of 0.39 it is allowed to grow as well to accommodate
        # adding MSIDs in the TDB.  Include only 1-d columns, not things like AEPERR
        # in PCAD8ENG which is a 40-element binary vector.
        colnames_all.update(dat.dtype.names)
        colnames.update(name for name in dat.dtype.names
                        if dat[name].ndim == 1)

        row += len(dat)

    if dats:
        logger.verbose('Writing accumulated column data to h5 file at ' +
                       time.ctime())
        data_lens = set()
        processed_cols = set()
        for colname in colnames:
            ft['msid'] = colname
            if not os.path.exists(msid_files['msid'].abs):
                make_h5_col_file(dats, colname)
                if not opt.create:
                    # New MSID was found for this content type.  This must be associated with
                    # an update to the TDB.  Skip for the moment to ensure that other MSIDs
                    # are fully processed.
                    continue
            data_len = append_h5_col(dats, colname, archfiles_overlaps)
            data_lens.add(data_len)
            processed_cols.add(colname)

        if len(data_lens) != 1:
            raise ValueError(
                'h5 data length inconsistency {}, investigate NOW!'.format(
                    data_lens))

        # Process any new MSIDs (this is extremely rare)
        data_len = data_lens.pop()
        for colname in colnames - processed_cols:
            ft['msid'] = colname
            append_filled_h5_col(dats, colname, data_len)

    # Assuming everything worked now commit the db inserts that signify the
    # new archive files have been processed
    if not opt.dry_run:
        db.commit()

    # If colnames or colnames_all changed then give warning and update files.
    if colnames != old_colnames:
        logger.warning('WARNING: updating %s because colnames changed: %s' %
                       (msid_files['colnames'].abs, old_colnames ^ colnames))
        if not opt.dry_run:
            pickle.dump(colnames, open(msid_files['colnames'].abs, 'wb'))
    if colnames_all != old_colnames_all:
        logger.warning(
            'WARNING: updating %s because colnames_all changed: %s' %
            (msid_files['colnames_all'].abs, colnames_all ^ old_colnames_all))
        if not opt.dry_run:
            pickle.dump(colnames_all, open(msid_files['colnames_all'].abs,
                                           'wb'))

    return archfiles_processed
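
The colnames bookkeeping at the end reduces to a set comparison plus a pickle rewrite; a short sketch of that step (the path here is hypothetical, binary mode as in the newer version of this function shown above):

# Sketch of the colnames persistence above; the file path is hypothetical.
import pickle

colnames_file = '/tmp/colnames.pickle'
old_colnames = {'TIME', 'QUALITY', 'AOPCADMD'}
colnames = old_colnames | {'AOATTQT1'}          # a new 1-d MSID column appeared

if colnames != old_colnames:
    print('colnames changed:', old_colnames ^ colnames)   # symmetric difference
    with open(colnames_file, 'wb') as fh:
        pickle.dump(colnames, fh, protocol=0)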