Example no. 1
def time_corr(date: Union[pd.Series, pd.Index, np.ndarray], cfg_in: Mapping[str, Any],
              sort: Union[str, bool, None] = None, path_save_image='time_corr'):
    """
    :param date: datetime64 values or text in ISO 8601 format (np.ndarray, pd.Series or pd.Index)
    :param cfg_in: dict with fields:
    - dt_from_utc: correct time by adding this constant
    - fs: sampling frequency
    - sort: same as :param sort:, used only if :param sort: is None
    - keep_input_nans: if set, NaNs in date remain unchanged
    - path: where to save images of badly corrected time
    - min_date, max_date: optional limits - time beyond them is set to constants slightly beyond the limits
    :param sort:
    - 'increase', 'True' or True: increase duplicated time values (increases time resolution)
    - 'False', False: do not check time inversions
    - 'delete_inversions'
    :return: (tim, b_ok) where
    - tim: pandas time series, same size as date input
    - b_ok: mask of not decreasing elements
    Note: if ``date`` is in text format it must be properly formatted for conversion; the result is converted to UTC time.
    todo: use Kalman filter?
    """
    if not date.size:
        return pd.DatetimeIndex([], tz='UTC'), np.bool_([])
    if sort is None:
        sort = cfg_in.get('sort')
    if sort == 'False':
        sort = False
    elif sort == 'True' or sort == 'increase':
        sort = True
    if __debug__:
        lf.debug('time_corr (time correction) started')
    if cfg_in.get('dt_from_utc'):
        if isinstance(date[0], str):
            # add a time zone suffix that compensates for the shift
            hours_from_utc_f = cfg_in['dt_from_utc'].total_seconds() / 3600
            Hours_from_UTC = int(hours_from_utc_f)
            hours_from_utc_f -= Hours_from_UTC
            if abs(hours_from_utc_f) > 0.0001:
                print('For string data only a whole number of hours can be added! Adding', Hours_from_UTC, 'hours')
            tim = pd.to_datetime((date.astype(object) + '{:+03d}'.format(Hours_from_UTC)).astype('datetime64[ns]'),
                                 utc=True)
        elif isinstance(date, pd.Index):
            tim = date
            tim -= cfg_in['dt_from_utc']
            tim = tim.tz_localize('UTC')
            # if Hours_from_UTC != 0:
            # tim.tz= tzoffset(None, -Hours_from_UTC*3600)   #invert localize
            # tim= tim.tz_localize(None).tz_localize('UTC')  #correct

        else:
            try:
                if isinstance(date, pd.Series):
                    tim = pd.to_datetime(date - np.timedelta64(cfg_in['dt_from_utc']), utc=True)

                else:
                    tim = pd.to_datetime(date.astype('datetime64[ns]') - np.timedelta64(
                        pd.Timedelta(cfg_in['dt_from_utc'])), utc=True)  # hours=Hours_from_UTC
            except OverflowError:  # still needed?
                tim = pd.to_datetime(datetime_fun(
                    np.subtract, np.asarray(date), np.timedelta64(cfg_in['dt_from_utc']), type_of_operation='<M8[ms]'
                    ), utc=True)
            # tim += np.timedelta64(pd.Timedelta(hours=hours_from_utc_f)) #?
        lf.info('Time constant: {} {:s}', abs(cfg_in['dt_from_utc']),
               'subtracted' if cfg_in['dt_from_utc'] > timedelta(0) else 'added')
    else:
        if not isinstance(date, (pd.Series, np.datetime64)):
            date = date.astype('datetime64[ns]')
        tim = pd.to_datetime(date, utc=True)  # .tz_localize('UTC')tz_convert(None)
        #hours_from_utc_f = 0
    cfg_min_date = cfg_in.get('min_date')
    if cfg_min_date:
        cfg_min_date = pd.Timestamp(cfg_in['min_date'], tz='UTC')

        # Skip processing if data is out of filtering range
        global tim_min_save, tim_max_save
        tim_min = tim.min(skipna=True)
        tim_max = tim.max(skipna=True)
        # also collect statistics of min&max for messages:
        tim_min_save = min(tim_min_save, tim_min)
        tim_max_save = max(tim_max_save, tim_max)

        # set time beyond the limits to special values, keeping it sorted for dask, and mark out-of-range values as good
        if tim_max < cfg_min_date:
            tim[:] = cfg_min_date - np.timedelta64(1, 'ns')  # pd.NaT                      # ns-resolution maximum year
            return tim, np.ones_like(tim, dtype=bool)
        else:
            cfg_max_date = cfg_in.get('max_date')
            if cfg_max_date:
                cfg_max_date = pd.Timestamp(cfg_in['max_date'], tz='UTC')
                if tim_min > cfg_max_date:
                    tim[:] = pd.Timestamp(cfg_in['max_date'], tz='UTC') + np.timedelta64(1, 'ns')  # pd.Timestamp('2262-01-01')  # ns-resolution maximum year
                    return tim, np.ones_like(tim, dtype=bool)

            b_ok_in = tim >= cfg_min_date
            if cfg_max_date:
                b_ok_in &= (tim <= cfg_max_date)

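            # indices of the first and last in-range samples: keep only this slice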
            it_se = np.flatnonzero(b_ok_in)[[0,-1]]
            it_se[1] += 1
            tim = tim[slice(*it_se)]

    b_ok_in = tim.notna()
    n_bad_in = b_ok_in.size - b_ok_in.sum()
    if n_bad_in:
        if cfg_in.get('keep_input_nans'):
            tim = tim[b_ok_in]
    try:
        b_ok_in = b_ok_in.to_numpy()
    except AttributeError:
        pass  # we already have numpy array


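    # operate on time as int64 nanoseconds since the epoch for numeric processing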
    t = tim.to_numpy(np.int64)
    if sort and tim.size > 1:
        # Check time resolution and increase if needed to avoid duplicates
        if n_bad_in and not cfg_in.get('keep_input_nans'):
            t = np.int64(rep2mean(t, bOk=b_ok_in))
            b_ok_in[:] = True
        freq, n_same, n_decrease, i_different = find_sampling_frequency(t, precision=6, b_show=False)
        if freq:
            cfg_in['fs_last'] = freq  # fallback freq to get value for next files on fail
        elif cfg_in.get('fs_last'):
            lf.warning('Using fallback (last) sampling frequency fs = {}', cfg_in['fs_last'])
            freq = cfg_in['fs_last']
        elif cfg_in.get('fs'):
            lf.warning('Ready to use specified sampling frequency fs = {}', cfg_in['fs'])
            freq = cfg_in['fs']
        elif cfg_in.get('fs_old_method'):
            lf.warning('Ready to use specified sampling frequency fs_old_method = {}', cfg_in['fs_old_method'])
            freq = cfg_in['fs_old_method']
        else:
            lf.warning('Ready to set sampling frequency to default value: fs = 1Hz')
            freq = 1

        # # show linearity of time # plt.plot(date)
        # fig, axes = plt.subplots(1, 1, figsize=(18, 12))
        # t = date.values.view(np.int64)
        # t_lin = (t - np.linspace(t[0], t[-1], len(t)))
        # axes.plot(date, t_lin / dt64_1s)
        # fig.savefig(os_path.join(cfg_in['dir'], cfg_in['file_stem'] + 'time-time_linear,s' + '.png'))
        # plt.close(fig)
        b_ok = None
        idel = None
        msg = ''
        if n_decrease > 0:
            # Exclude elements

            # if True:
            #     # try fast method
            #     b_bad_new = True
            #     k = 10
            #     while np.any(b_bad_new):
            #         k -= 1
            #         if k > 0:
            #             b_bad_new = b1spike(t[b_ok], max_spike=2 * np.int64(dt64_1s / freq))
            #             b_ok[np.flatnonzero(b_ok)[b_bad_new]] = False
            #             print('step {}: {} spikes found, deleted {}'.format(k, np.sum(b_bad_new),
            #                                                                 np.sum(np.logical_not(b_ok))))
            #             pass
            #         else:
            #             break
            # if k > 0:  # success?
            #     t = rep2mean(t, bOk=b_ok)
            #     freq, n_same, n_decrease, b_same_prev = find_sampling_frequency(t, precision=6, b_show=False)
            #     # print(np.flatnonzero(b_bad))
            # else:
            #     t = tim.values.view(np.int64)
            # if n_decrease > 0:  # fast method did not succeed
            # take time: i
            # lf.warning('Fast method did not succeed')

            # Excluding inversions
            # find increased elements (i_different is i_inc only if single spikes):
            i_inc = i_different[longest_increasing_subsequence_i(t[i_different])]
            # try trusting repeated values, keeping them so as not to interpolate near holes (else use np.zeros):
            dt = np.ediff1d(t, to_end=True)
            b_ok = dt == 0
            b_ok[i_inc] = True
            # b_ok= nondecreasing_b(t, )
            # t = t[b_ok]

            t_ok = t[b_ok]
            i_dec = np.flatnonzero(np.ediff1d(t_ok, to_end=True) < 0)
            n_decrease_remains = len(i_dec)
            if n_decrease_remains:
                lf.warning('Decreased time among duplicates ({:d} times). Not trusting repeated values...',
                          n_decrease_remains)
                b_ok = np.zeros_like(t, dtype=np.bool_)
                b_ok[i_inc] = True

                if sort == 'delete_inversions':
                    # select one of the two bad time values that lead to the bad diff element and mask those elements
                    for s, e in i_dec + np.int32([0, 1]):
                        b_ok[t == (t_ok[e if b_ok[s] else s])] = False
                    if cfg_in.get('keep_input_nans'):
                        b_ok_in[np.flatnonzero(b_ok_in)[~b_ok]] = False  # assigning through a boolean-indexed copy would be a no-op
                    else:
                        b_ok_in[~b_ok] = False
            else:  # Decreased time not in duplicates
                i_dec = np.delete(i_different, np.searchsorted(i_different, i_inc))
                assert np.all(i_dec == i_different[~np.isin(i_different, i_inc)])  # same results
                # assert np.alltrue(i_dec == np.setdiff1d(i_different, i_inc[:-1]))  # same results
                if sort == 'delete_inversions':
                    b_ok_in[np.flatnonzero(b_ok_in)[i_dec] if cfg_in.get('keep_input_nans') else i_dec] = False

            b_ok[b_ok] = np.ediff1d(t[b_ok], to_end=True) > 0  # adaptation for the next step

            idel = np.flatnonzero(~b_ok)
            n_del = len(idel)
            msg = f"Filtered time: {n_del}/{t.size} values " \
                  f"{'masked' if sort == 'delete_inversions' else 'interpolated'} (1st and last: " \
                  f"{pd.to_datetime(t[idel[[0, -1]]], utc=True)})"
            if n_decrease:
                lf.warning('decreased time ({}) was detected! {}', n_decrease, msg)
            else:
                lf.warning(msg)


        if n_same > 0 and cfg_in.get('fs') and not cfg_in.get('fs_old_method'):
            # This is the simplest operation and is usually what should be done for CTD data
            t = repeated2increased(t, cfg_in['fs'], b_ok if n_decrease else None)  # if n_decrease then b_ok is calculated before
            tim = pd.to_datetime(t, utc=True)
        elif n_same > 0 or n_decrease > 0:
            # message with original t


            # Replace t by linear increasing values using constant frequency excluding big holes
            if cfg_in.get('fs_old_method'):
                lf.warning('Linearizing time interval using provided freq = {:f}Hz (determined: {:f})',
                          cfg_in.get('fs_old_method'), freq)
                freq = cfg_in.get('fs_old_method')
            else:  # constant freq = filtered mean
                lf.warning('Linearizing time interval using determined median* freq = {:f}Hz', freq)
            t = np.int64(rep2mean(t, bOk=b_ok))  # interpolate so t can be used as a pandas index even if some values are bad
            b_show = n_decrease > 0
            if freq <= 1:
                # Skip: data resolution is typically sufficient at such low frequencies
                lf.warning('Not linearizing for frequency <= 1')
            else:
                # Increase time resolution by recalculating all values
                tim_before = pd.to_datetime(t, utc=True)
                make_linear(t, freq)  # changes t (and tim?)
                # Check if we can use them
                bbad = check_time_diff(tim_before, t.view('M8[ns]'), dt_warn=pd.Timedelta(minutes=2),
                                       mesage='Big time diff after corr: difference [min]:')
                if np.any(bbad):
                    b_ok = ~bbad
                    b_show = True

            # Show what is done
            if b_show:
                if b_ok is None:
                    dt = np.ediff1d(t, to_begin=1)
                    b_ok = dt > 0
                plot_bad_time_in_thread(cfg_in, t, b_ok, idel, tim,
                                        (tim_min, tim_max) if cfg_in.get('min_date') else None, path_save_image, msg)

        # Check that everything is ok

        dt = np.ediff1d(t, to_begin=1)
        b_ok = dt > 0
        # tim.is_unique , len(np.flatnonzero(tim.duplicated()))
        b_decrease = dt < 0  # with set of first element as increasing
        n_decrease = b_decrease.sum()
        if n_decrease > 0:
            lf.warning(
                'Remaining decreased time values ({:d}) are masked!{:s}{:s}',
                n_decrease,
                '\n'.join(' < '.join('{:%y.%m.%d %H:%M:%S.%f%z}'.format(_) for _ in tim[se].to_numpy()) for se in
                         np.flatnonzero(b_decrease)[:3, None] + np.int32([-1, 0])),
                '...' if n_decrease > 3 else ''
                )

            b_ok &= ~b_decrease

        b_same_prev = np.ediff1d(t, to_begin=1) == 0  # with set of first element as changing
        n_same = b_same_prev.sum()

        if cfg_in.get('keep_input_nans'):
            if n_same > 0:
                lf.warning('nonincreasing time ({:d} times) detected! - interpolating', n_same)
        else:
            # prepare to interpolate all nonincreasing values (including NaNs)
            if n_bad_in:
                b_same_prev &= ~b_ok_in

            msg = ', '.join(
                f'{fault} time ({n} times)' for (n, fault) in ((n_same, 'nonincreasing'), (n_bad_in, 'NaN')) if n > 0
                )
            if msg:
                lf.warning('{:s} detected! - interpolating', msg)

        if n_same > 0 or n_decrease > 0:
            # rep2mean(t, bOk=np.logical_not(b_same_prev if n_decrease==0 else (b_same_prev | b_decrease)))
            b_bad = b_same_prev if n_decrease == 0 else (b_same_prev | b_decrease)
            t = rep2mean_with_const_freq_ends(t, ~b_bad, freq)

    else:
        lf.debug('time does not need to be sorted')
        b_ok = np.ones(tim.size, np.bool_)
    # make initial shape: paste back NaNs
    if n_bad_in and cfg_in.get('keep_input_nans'):
        # place initially bad elements back
        t, t_in = np.full(b_ok_in.shape, np.nan), t
        t[b_ok_in] = t_in
        b_ok_in[b_ok_in] = b_ok
        b_ok = b_ok_in
    elif sort == 'delete_inversions':
        b_ok &= b_ok_in
    # restore initial shape: pad with the configured limit constants where data was removed as beyond these limits
    if cfg_in.get('min_date') and np.any(it_se != np.int64([0, date.size])):
        pad_width = (it_se[0], date.size - it_se[1])
        t = np.pad(t, pad_width, constant_values=np.array((cfg_in['min_date'], cfg_in['max_date']), 'M8[ns]'))
        b_ok = np.pad(b_ok, pad_width, constant_values=True)
    assert t.size == b_ok.size

    return pd.to_datetime(t, utc=True), b_ok
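
# A minimal usage sketch for time_corr above (illustrative, not from the source:
# the input values and cfg_in fields are hypothetical examples of the documented
# parameters, and the module's own helpers - lf, rep2mean,
# find_sampling_frequency, etc. - must be importable for the call to work):
import numpy as np
import pandas as pd
from datetime import timedelta

date = np.array(['2021-05-01T12:00:00', '2021-05-01T12:00:01',
                 '2021-05-01T12:00:01'], dtype='datetime64[ns]')  # one duplicate
cfg_in = {
    'dt_from_utc': timedelta(hours=2),  # logger clock was 2 h ahead of UTC
    'fs': 1,                            # sampling frequency, Hz
    'keep_input_nans': True,            # keep NaT rows in the output
}
tim, b_ok = time_corr(date, cfg_in, sort=True)  # duplicated times get increased
# tim is a UTC DatetimeIndex the same size as date; b_ok masks non-decreasing rows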
Example no. 2
def CTDrunsExtract(P: np.ndarray, dnT: np.ndarray,
                   cfg_extract_runs: Dict[str, Any]) -> np.ndarray:
    '''
    find profiles ("Mainas"). Uses extractRuns()
    :param P: Pressure/Depth
    :param dnT: Time
    :param cfg_extract_runs: settings dict with fields:
      - dt_between_min
      - min_dp
      - min_samples
      - dt_hole_max - split runs where dt between adjacent samples is bigger. If not
      specified it is set equal to 'dt_between_min' automatically
      - b_do - if set to False, interpret all data as one run
      - b_keep_minmax_of_bad_files, optional - keep one minimum before the maximum, and the maximum, of separated data parts where movement is insufficient to be a run
    :return: iminmax: 2D numpy array np.int64([[minimums], [maximums]])
    '''

    if ('b_do' not in cfg_extract_runs
        ) or cfg_extract_runs['b_do']:  # skip only if b_do is set to False
        P = np.abs(rep2mean(P))
        if 'dt_hole_max' not in cfg_extract_runs:
            cfg_extract_runs['dt_hole_max'] = cfg_extract_runs['dt_between_min']
        dt64_hole_max = np.timedelta64(cfg_extract_runs['dt_hole_max'], 'ns')
        # time_holes= np.flatnonzero(np.ediff1d(dnT, dt64_hole_max, dt64_hole_max) >= dt64_hole_max) #bug in numpy
        time_holes = np.hstack(
            (0, np.flatnonzero(np.diff(dnT) >= dt64_hole_max), len(dnT)))
        imin = []
        imax = []
        i_keep_bad_runs = []
        for ist, ien in zip(time_holes[:-1], time_holes[1:]):
            islice = slice(ist, ien)
            if (ien - ist) < cfg_extract_runs['min_samples']:
                continue

            if (P[islice].max() -
                    P[islice].min()) < cfg_extract_runs['min_dp']:
                if cfg_extract_runs.get('b_keep_minmax_of_bad_files'):
                    i_keep_bad_runs.append(len(imax))
                    imax.append(ist + P[islice].argmax())  # offset by ist to get absolute indices
                    imin.append(ist + P[ist:imax[-1]].argmin())
            else:
                if 'path_images' in cfg_extract_runs:
                    cfg_extract_runs['path_image'] = os_path.join(
                        cfg_extract_runs['path_images'],
                        'extract_runs{:%y%m%d_%H%M%S}'.format(
                            np.datetime64(dnT[ist],
                                          's').astype(datetime))) + '.png'
                [it, il] = extractRuns(-P[islice], cfg_extract_runs)
                # Correct extractRuns output (mins and maxes must alternate):
                # make 1st min be less than 1st max
                if it and il:
                    if il[0] < it[0]:
                        del il[0]
                # make the lengths of mins and maxes equal
                if len(it) > len(il):
                    del it[-1]
                    il.append(ien - ist - 1)
                elif len(it) < len(il):
                    if it and it[0] > il[0]:
                        del il[0]
                    else:
                        it.append(ien - ist - 1)
                imin.extend([i + ist for i in it])
                imax.extend([i + ist for i in il])
        # Filter run down intervals:
        if len(imin):
            iminmax = np.vstack((imin, imax))
            bok = np.logical_and(
                np.diff(iminmax, 1, 0) >= cfg_extract_runs['min_samples'],
                np.diff(P[iminmax], 1, 0) >=
                cfg_extract_runs['min_dp']).flatten()
            bok[i_keep_bad_runs] = True
            if not np.all(bok):
                iminmax = iminmax[:, bok]
        else:
            l.warning('no runs!')
            return np.int64([[], []])
        # N= min(len(imax), len(imin))
        # iminMax = [imin, imax]
    else:
        # N= 0
        iminmax = np.int64([[0], [len(P)]])

    # # make mask with ends set to -1
    # b_maina = np.zeros(len(P), 'int8')
    # for k in range(N):
    #     b_maina[imin[k]:imax[k]] = 1
    # b_maina[imax] = -1

    # Runs.PMax= P(imax)
    return iminmax  # , b_maina
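
# A minimal usage sketch for CTDrunsExtract above (illustrative, not from the
# source: the synthetic data and settings are hypothetical, and the module's own
# helpers extractRuns and rep2mean must be importable for the call to work):
import numpy as np

# two synthetic down/up casts recorded once per second
P = np.concatenate([np.linspace(0, 50, 100), np.linspace(50, 0, 100),
                    np.linspace(0, 60, 120), np.linspace(60, 0, 120)])
dnT = np.arange(P.size).astype('datetime64[s]').astype('datetime64[ns]')
cfg_extract_runs = {
    'dt_between_min': np.timedelta64(10, 's'),  # also the default dt_hole_max
    'min_dp': 10,        # minimum pressure span of a run
    'min_samples': 20,   # minimum number of samples in a run
}
iminmax = CTDrunsExtract(P, dnT, cfg_extract_runs)
# iminmax[0] holds run-start indices (pressure minima), iminmax[1] run ends (maxima)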
Example no. 3
            # end = ' '
            return b_bad
        else:
            return None


    for col in cols_int16:
        b_bad = bad_and_message(data=df[col], fun_bad=np.isnan, msg_bad='nan values')
        b_bad2 = bad_and_message(data=df[col], fun_bad=np.isinf, msg_bad='inf values')
        if b_bad is None:
            if b_bad2 is None:
                continue
            b_bad = b_bad2
        elif b_bad2 is not None:
            b_bad |= b_bad2
        df[col] = rep2mean(df[col], np.logical_not(b_bad), df.index.astype('i8').astype('f8'))

df = df.astype(cfg['out']['dtype'], copy=False)


# @+node:korzh.20180521171338.1: ** save
def change_db_path(cfg, str_old='Pres.h5', str_new=',P(cal0605).h5'):
    if not cfg['db_path'].endswith(str_new):
        cfg['db_path'] = cfg['db_path'][:-len(str_old)] + str_new


change_db_path(cfg['out'])
log = {}
try:  # set chunksize to the mean data interval between holes
    cfg['out']['chunksize'] = int(mean_burst_size)  # np.median(np.diff(i_burst[:-1]))
except ValueError:  # some default value if no holes