Exemple #1
0
     9: ['2019-12-20T16:58:30', '2019-12-20T16:59:15'],
     10: ['2019-12-23T17:32:35', '2019-12-23T17:33:27'],
     11: ['2019-07-11T17:41:44', '2019-07-11T18:42:48'],
     12: ['2019-07-11T18:04:46', '2019-07-11T18:05:36'],
     14: ['2019-09-02T14:01:41', '2019-09-02T14:02:15'],  # todo
     16: ['2019-09-03T19:22:20', '2019-09-03T19:22:54'],
 }
 # Recompute the azimuth shift coefficient for ``probe`` if a reference ("nord") time range
 # is listed for it in ``time_ranges_nord``; otherwise only log a warning.
 if time_ranges_nord.get(probe):
     # Recalculate zeroing_azimuth with zeroed scaling coefficients
     # NOTE(review): ``tbl`` is used here before the loop below rebinds it, so it must already
     # be defined by the code that precedes this fragment - confirm.
     cfg_in = {
         'tables': [tbl],
         'db_path': db_path_tank,
         'time_range_nord': time_ranges_nord[probe]
     }
     with pd.HDFStore(db_path_calibr_scalling, mode='r') as store:
         for tbl, coefs in h5_names_gen(cfg_in):
             del coefs[
                 'azimuth_shift_deg']  # removed so that the shift is calculated for uncorrected data
             # Calculation (the result of the last iteration is the one that is kept):
             dict_matrices = {
                 '//coef//H//azimuth_shift_deg':
                 zeroing_azimuth(store, tbl, time_ranges_nord[probe],
                                 coefs, cfg_in)
             }
     # NOTE(review): ``dict_matrices`` is unbound here if h5_names_gen() yielded nothing - confirm
     # that this cannot happen.
     # Write the new shift to the "tank" DB first ...
     h5copy_coef(None, db_path_tank, tbl, dict_matrices=dict_matrices)
     # ... then copy the table's coefficients between the two DBs, allowing replacement of an
     # existing group. Presumably the first path argument is the source and the second is the
     # destination - verify against h5copy_coef().
     h5copy_coef(db_path_tank,
                 db_path_calibr_scalling,
                 tbl,
                 ok_to_replace_group=True)
 else:
     l.warning('Inlab time_ranges_nord not defined')
Exemple #2
0
def main(config: ConfigType) -> None:
    """
    Run the inclinometer / wave gauge processing pipeline for one cruise.

    The work is split into numbered steps, each executed only if ``st(<step>)`` is true:

    1. Correct raw ASCII files of each probe (``correct_txt``) and save them to the HDF5 store
       (``csv2h5``), then copy calibration coefficients to the saved tables (``h5copy_coef``).
    2. Calculate velocity, without averaging and averaged over each of
       ``cfg['out']['aggregate_period_s']`` (``incl_h5clc.main``).
    3. Calculate spectrograms (``incl_h5spectrum.main``).
    4. Draw the selected time intervals in Veusz (``veuszPropagate.main``).

    Between steps 1 and 2 the coefficients of the tables listed in ``cfg_in['tables']`` are always
    rotated by 180 degrees around the x axis and written to ``cfg['out']['db_path']``.

    :param config: configuration that is passed to ``to_vaex_hdf5.cfg_dataclasses.main_init()``.
      Fields of the resulting ``cfg`` that are read here:
    - input (renamed to ``in``):
      - path_cruise, probes_prefix, probes: where the data is and which probes to process
        (probes 1..40 if ``probes`` is empty)
      - raw_subdir, raw_pattern: where to search for raw files below the ``_raw`` directory
      - time_start_utc, dt_from_utc: time corrections, mappings keyed by probe number
        (``dt_from_utc[0]`` is used as the default for the other probes)
      - db_coefs: database to copy calibration coefficients from
      - azimuth_add: mapping that may contain 'Lat', 'Lon' and/or 'constant'
      - time_range_zeroing
    - out:
      - db_path
      - aggregate_period_s, aggregate_period_s_not_to_text
    - filter:
      - min_date, max_date: mappings keyed by probe number, key 0 is used for all probes
    :return: None

    NOTE(review): ``db_path``, ``dir_incl``, ``subs_made``, ``fs``, ``st`` and ``mod_incl_name`` are
    not defined in this function and so must be module-level names; the returned ``device_path``
    and ``cfg['out']['db_path']`` are not used instead of them - confirm that this is intended.
    """
    # Must be imported before dt_from_utc_2000() is called: ``datetime`` is a local name of this
    # function (it was imported inside step 2 only), so the nested function would otherwise fail
    # with NameError on an unbound free variable when step 1 runs.
    from datetime import datetime

    global cfg
    cfg = to_vaex_hdf5.cfg_dataclasses.main_init(config, cs_store_name)
    cfg_in = cfg.pop('input')
    cfg_in['cfgFile'] = cs_store_name
    cfg['in'] = cfg_in
    # try:
    #     cfg = to_vaex_hdf5.cfg_dataclasses.main_init_input_file(cfg, cs_store_name, )
    # except Ex_nothing_done:
    #     pass  # existed db is not mandatory

    device_path, cfg['out']['db_path'] = device_in_out_paths(
        db_path=cfg['out'].get('db_path'),
        path_cruise=cfg['in']['path_cruise'],
        device_short_name=cfg['in']['probes_prefix'],
        device_dir_pattern='*inclinometer*')

    # h5init(cfg['in'], cfg['out'])

    # default range of probes is used if no probes are specified
    probes = cfg['in']['probes'] or range(1, 41)
    # probe_is_incl is the number of substitutions made, i.e. nonzero for inclinometers
    raw_root, probe_is_incl = re.subn('INCL_?', 'INKL_',
                                      cfg['in']['probes_prefix'].upper())

    # Parameters that depend on the probe type (indicated by probes_prefix).
    # The regular expressions are spelled so that the resulting bytes are exactly the same as
    # before but without invalid escape sequences.
    p_type = defaultdict(
        # Baranov's format is the default for unknown prefixes (CR and TAB here are literal bytes)
        constant_factory({
            'correct_fun':
            partial(correct_txt,
                    mod_file_name=mod_incl_name,
                    sub_str_list=[
                        b'^\r?(?P<use>20\\d{2}(\t\\d{1,2}){5}(\t\\d{5}){8}).*',
                        b'^.+'
                    ]),
            'fs':
            10,
            'format':
            'Baranov',
        }),
        {
            'incl': {
                'correct_fun':
                partial(
                    correct_txt,
                    mod_file_name=mod_incl_name,
                    sub_str_list=[
                        rb'^(?P<use>20\d{2}(,\d{1,2}){5}(,\-?\d{1,6}){6}(,\d{1,2}\.\d{2})(,\-?\d{1,3}\.\d{2})).*',
                        b'^.+'
                    ]),
                'fs':
                5,
                'format':
                'Kondrashov',
            },
            'voln': {
                'correct_fun':
                partial(
                    correct_txt,
                    mod_file_name=mod_incl_name,
                    sub_str_list=[
                        rb'^(?P<use>20\d{2}(,\d{1,2}){5}(,\-?\d{1,8})(,\-?\d{1,2}\.\d{2}){2}).*',
                        b'^.+'
                    ]),
                'fs':
                5,
                #'tbl_prefix': 'w',
                'format':
                'Kondrashov',
            }
        })

    if st(1, 'Save inclinometer or wavegage data from ASCII to HDF5'):
        # Note: additional not corrected files of a probe can not be found if there are already
        # corrected files of this probe in the search path (move them out if needed)

        i_proc_probe = 0  # counter of processed probes
        i_proc_file = 0  # counter of processed files
        # pattern to identify only _probe_'s raw data files that need to be corrected:
        # '*INKL*{:0>2}*.[tT][xX][tT]'

        raw_parent = dir_incl / '_raw'
        if cfg['in']['raw_subdir'] is None:
            cfg['in']['raw_subdir'] = ''

        # sub() replaces multilevel subdirs by one level, which is all that correct_fun() can make
        dir_out = raw_parent / re.sub(r'[.\\/ *?]', '_',
                                      cfg['in']['raw_subdir'])

        def dt_from_utc_2000(probe):
            """ Correct time of probes started without time setting. Raw date must start from  2000-01-01T00:00"""
            return (
                datetime(year=2000, month=1, day=1) -
                cfg['in']['time_start_utc'][probe]
            ) if cfg['in']['time_start_utc'].get(probe) else timedelta(0)

        # convert cfg['in']['dt_from_utc'] keys to int
        cfg['in']['dt_from_utc'] = {
            int(p): v
            for p, v in cfg['in']['dt_from_utc'].items()
        }
        # convert cfg['in']['time_start_utc'] to cfg['in']['dt_from_utc'] (with int keys),
        # overwriting the values that already exist
        cfg['in']['dt_from_utc'].update(
            {int(p): dt_from_utc_2000(p) for p in cfg['in']['time_start_utc']}
            )
        # make cfg['in']['dt_from_utc'][0] be the default value
        cfg['in']['dt_from_utc'] = defaultdict(
            constant_factory(cfg['in']['dt_from_utc'].pop(0, timedelta(0))),
            cfg['in']['dt_from_utc'])

        for probe in probes:
            raw_found = []
            raw_pattern_file = str(
                Path(glob.escape(cfg['in']['raw_subdir'])) /
                cfg['in']['raw_pattern'].format(prefix=raw_root, number=probe))
            correct_fun = p_type[cfg['in']['probes_prefix']]['correct_fun']
            # if not archive:
            if (not re.match(r'.*(\.zip|\.rar)$', cfg['in']['raw_subdir'],
                             re.IGNORECASE)) and raw_parent.is_dir():
                raw_found = list(raw_parent.glob(raw_pattern_file))
            if not raw_found:
                # Check if we already have corrected files for the probe generated by correct_txt().
                # If so then just use them
                raw_found = list(
                    dir_out.glob(
                        f"{cfg['in']['probes_prefix']}{probe:0>2}.txt"))
                if raw_found:
                    print('corrected csv file', [r.name for r in raw_found],
                          'found')
                    correct_fun = lambda x, dir_out: x
                elif not cfg['in']['raw_subdir']:
                    continue

            # The loop below has no ``break``, so a ``for ... else`` clause would always run:
            # track explicitly whether any file of this probe was processed.
            b_file_processed = False
            for file_in in (raw_found or open_csv_or_archive_of_them(
                    raw_parent, binary_mode=False, pattern=raw_pattern_file)):
                file_in = correct_fun(file_in, dir_out=dir_out)
                if not file_in:
                    continue
                tbl = file_in.stem  # f"{cfg['in']['probes_prefix']}{probe:0>2}"
                # tbl = re.sub('^((?P<i>inkl)|w)_0', lambda m: 'incl' if m.group('i') else 'w',  # correct name
                #              re.sub('^[\d_]*|\*', '', file_in.stem).lower()),  # remove date-prefix if in name
                csv2h5(
                    [
                        str(
                            Path(__file__).parent / 'ini' /
                            f"csv_{'inclin' if probe_is_incl else 'wavegage'}_{p_type[cfg['in']['probes_prefix']]['format']}.ini"
                        ),
                        '--path',
                        str(file_in),
                        '--blocksize_int',
                        '50_000_000',  # 50Mbt
                        '--table',
                        tbl,
                        '--db_path',
                        str(db_path),
                        # '--log', str(scripts_path / 'log/csv2h5_inclin_Kondrashov.log'),
                        # '--b_raise_on_err', '0',  # ?
                        '--b_interact',
                        '0',
                        '--fs_float',
                        str(p_type[cfg['in']['probes_prefix']]
                            ['fs']),  #f'{fs(probe, file_in.stem)}',
                        '--dt_from_utc_seconds',
                        str(cfg['in']['dt_from_utc'][probe].total_seconds()),
                        '--b_del_temp_db',
                        '1',
                    ] +
                    (['--csv_specific_param_dict', 'invert_magnitometr: True']
                     if probe_is_incl else []),
                    **{
                        'filter': {
                            'min_date':
                            cfg['filter']['min_date'].get(
                                probe, np.datetime64(0, 'ns')),
                            'max_date':
                            cfg['filter']['max_date'].get(
                                probe, np.datetime64('now', 'ns')
                            ),  # simple 'now' works in synchronous mode
                        }
                    })

                # Get coefs:
                l.info(
                    f"Adding coefficients to {db_path}/{tbl} from {cfg['in']['db_coefs']}"
                )
                try:
                    h5copy_coef(cfg['in']['db_coefs'], db_path, tbl)
                except KeyError:  # Unable to open object (component not found)
                    l.warning('No coefs to copy?')
                    # write some dummy coefficients to be able to load Veusz patterns:
                    h5copy_coef(None,
                                db_path,
                                tbl,
                                dict_matrices=dict_matrices_for_h5(tbl=tbl))
                except OSError:
                    l.warning('Not found DB with coefs?')
                    # write some dummy coefficients to be able to load Veusz patterns:
                    h5copy_coef(None,
                                db_path,
                                tbl,
                                dict_matrices=dict_matrices_for_h5(tbl=tbl))
                i_proc_file += 1
                b_file_processed = True
            if not b_file_processed:
                print('no', raw_pattern_file, end=', ')
            i_proc_probe += 1
        print('Ok:', i_proc_probe, 'probes,', i_proc_file, 'files processed.')

    # NOTE(review): the table name is hard-coded here - confirm that this is not a debugging leftover
    cfg_in['tables'] = ['incl30']
    from inclinometer.incl_h5clc import h5_names_gen
    from inclinometer.h5inclinometer_coef import rot_matrix_x, rot_matrix_y  #rotate_x, rotate_y
    # R*[xyz]. As we next will need to apply coefs Ag = Rz*Ry*Rx we can incorporate this
    # operation by precalculating it, adding known angles on each axis to Rz, Ry, Rx.
    # If rotation is 180 deg, then we can add it only to Rx. Modified coef: Ag_new = Rz*Ry*R(x+180)
    # R(x+180) = Rx*Rx180 is equivalent to rotating Ag.T in the opposite direction:
    # Ag_new = rotate_x()

    # The inclinometer was changed so that applying coefs returns rotated data field vectors:
    # Out_rotated = Ag * In
    # We rotate it back:
    # Out = rotate(Out_rotated) =
    # after calibration at some angle P, so determine the angle relative to vertical
    # by rotating the data vector in the opposite direction: Out = Ag * R_back * In.
    # This is equivalent to having a new coef by applying rotation to Ag:
    # Ag_new = Ag * R_back = (R_back.T * Ag.T).T = rotate_forward(Ag.T).T =

    # Applying calibration coef will get data in an inverted basis so we need to rotate it after:
    #
    # coefs['Ag'] = rotate_x(coefs['Ag'], angle_degrees=180)
    # coefs['Ah'] = rotate_x(coefs['Ah'], angle_degrees=180)

    # dfLogOld, cfg_out['db'], cfg_out['b_skip_if_up_to_date'] = h5temp_open(**cfg_out)
    for tbl, coefs in h5_names_gen(cfg_in):
        # using property of rotation around same axis: R(x, θ1)@R(x, θ2) = R(x, θ1 + θ2)
        coefs['Ag'] = coefs['Ag'] @ rot_matrix_x(np.cos(np.pi), np.sin(np.pi))
        coefs['Ah'] = coefs['Ah'] @ rot_matrix_x(np.cos(np.pi), np.sin(np.pi))
        coefs['azimuth_shift_deg'] = 180
        h5copy_coef(None,
                    cfg['out']['db_path'],
                    tbl,
                    dict_matrices=dict_matrices_for_h5(coefs,
                                                       tbl,
                                                       to_nested_keys=True))

    # Calculate velocity and average
    if st(2):
        # if aggregate_period_s is None then do not average and write to *_proc_noAvg.h5,
        # else load from that h5 and write to _proc.h5
        if not cfg['out']['aggregate_period_s']:
            cfg['out']['aggregate_period_s'] = [
                None, 2, 600,
                3600 if 'w' in cfg['in']['probes_prefix'] else 7200
            ]

        # Default is needed because azimuth_add is always passed to incl_h5clc.main() below,
        # also when cfg['in']['azimuth_add'] is empty
        azimuth_add = 0
        if cfg['in']['azimuth_add']:
            if 'Lat' in cfg['in']['azimuth_add']:
                # add magnetic declination,° for used coordinates
                # todo: get time
                azimuth_add = mag_dec(cfg['in']['azimuth_add']['Lat'],
                                      cfg['in']['azimuth_add']['Lon'],
                                      datetime(2020, 9, 10),
                                      depth=-1)
            if 'constant' in cfg['in']['azimuth_add']:
                # and add a constant. For example, subtract declination at the calibration place
                # if it was applied (-6.65644183° to account for calibration in Kaliningrad)
                azimuth_add += cfg['in']['azimuth_add']['constant']
        for aggregate_period_s in cfg['out']['aggregate_period_s']:
            if aggregate_period_s is None:
                db_path_in = db_path
                db_path_out = db_path.with_name(
                    f'{db_path.stem}_proc_noAvg.h5')
            else:
                db_path_in = db_path.with_name(f'{db_path.stem}_proc_noAvg.h5')
                # NOTE(review): unlike the branch above this is a bare file name without
                # a directory - confirm that incl_h5clc resolves it as intended.
                db_path_out = f'{db_path.stem}_proc.h5'  # or separately: '_proc{aggregate_period_s}.h5'

            args = [
                Path(incl_h5clc.__file__).with_name(
                    f'incl_h5clc_{db_path.stem}.yaml'),
                # if no such file all settings are here
                '--db_path',
                str(db_path_in),
                # !   'incl.*|w\d*'  inclinometers or wavegauges w\d\d # 'incl09':
                '--tables_list',
                'incl.*' if not cfg['in']['probes'] else
                f"incl.*(?:{'|'.join('{:0>2}'.format(p) for p in cfg['in']['probes'])})",
                '--aggregate_period',
                f'{aggregate_period_s}S' if aggregate_period_s else '',
                '--out.db_path',
                str(db_path_out),
                '--table',
                f'V_incl_bin{aggregate_period_s}'
                if aggregate_period_s else 'V_incl',
                '--verbose',
                'INFO',  #'DEBUG' get many numba messages
                '--b_del_temp_db',
                '1',
                # '--calc_version', 'polynom(force)',  # deprecated
                # '--chunksize', '20000',
                # '--not_joined_h5_path', f'{db_path.stem}_proc.h5',
            ]
            # if aggregate_period_s <= 5:   # [s], do not need split csv for big average interval
            #     args += (['--split_period', '1D'])
            # processing parameters (if we have saved processed data then we are not processing
            # when aggregating)
            if aggregate_period_s is None:
                args += ([
                    '--max_dict',
                    'M[xyz]:4096',
                    # Note: for Baranov's prog 4096 is not suited
                    # '--time_range_zeroing_dict', "incl19: '2019-11-10T13:00:00', '2019-11-10T14:00:00'\n,"  # not works - use kwarg
                    # '--time_range_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00'
                    '--split_period',
                    '1D'
                ] if subs_made else [
                    '--bad_p_at_bursts_starts_peroiod',
                    '1H',
                ])
            # csv split by 1 day (default for no avg) and monolith csv if aggregate_period_s==600
            if aggregate_period_s not in cfg['out'][
                    'aggregate_period_s_not_to_text']:  # , 300, 600]:
                args += ['--text_path', str(db_path.parent / 'text_output')]
            kwarg = {
                'in': {
                    'min_date': cfg['filter']['min_date'][0],
                    'max_date': cfg['filter']['max_date'][0],
                    'time_range_zeroing': cfg['in']['time_range_zeroing'],
                    'azimuth_add': azimuth_add
                }
            }
            # If need all data to be combined one after one:
            # set_field_if_no(kwarg, 'in', {})
            # kwarg['in'].update({
            #
            #         'tables': [f'incl{i:0>2}' for i in min_date.keys() if i!=0],
            #         'dates_min': min_date.values(),  # in table list order
            #         'dates_max': max_date.values(),  #
            #         })
            # set_field_if_no(kwarg, 'out', {})
            # kwarg['out'].update({'b_all_to_one_col': 'True'})

            incl_h5clc.main(args, **kwarg)

    # Calculate spectrograms.
    if st(3):  # Can be done at any time after step 1

        def raise_ni():
            raise NotImplementedError(
                'Can not proc probes having different fs in one run: you need to do it separately'
            )

        args = [
            Path(incl_h5clc.__file__).with_name(
                f'incl_h5spectrum{db_path.stem}.yaml'),
            # if no such file all settings are here
            '--db_path',
            str(db_path.with_name(f'{db_path.stem}_proc_noAvg.h5')),
            '--tables_list',
            f"{cfg['in']['probes_prefix']}.*",  # inclinometers or wavegauges w\d\d  ## 'w02', 'incl.*',
            # '--aggregate_period', f'{aggregate_period_s}S' if aggregate_period_s else '',
            '--min_date',
            datetime64_str(cfg['filter']['min_date'][0]),
            '--max_date',
            datetime64_str(cfg['filter']['max_date']
                           [0]),  # '2019-09-09T16:31:00',  #17:00:00
            # '--max_dict', 'M[xyz]:4096',  # use if db_path is not ends with _proc_noAvg.h5 i.e. need calc velocity
            '--out.db_path',
            f"{db_path.stem.replace('incl', cfg['in']['probes_prefix'])}_proc_psd.h5",
            # '--table', f'psd{aggregate_period_s}' if aggregate_period_s else 'psd',
            '--fs_float',
            f"{fs(probes[0], cfg['in']['probes_prefix'])}",
            # (lambda x: x == x[0])(np.vectorize(fs)(probes, prefix))).all() else raise_ni()
            #
            # '--time_range_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00'
            # '--verbose', 'DEBUG',
            # '--chunksize', '20000',
            '--b_interact',
            '0',
        ]
        if 'w' in cfg['in']['probes_prefix']:
            args += [
                '--split_period',
                '1H',
                '--dt_interval_minutes',
                '10',  # burst mode
                '--fmin',
                '0.0001',
                '--fmax',
                '4'
            ]
        else:
            args += [
                '--split_period',
                '2H',
                '--fmin',
                '0.0004',  #0.0004
                '--fmax',
                '1.05'
            ]

        incl_h5spectrum.main(args)

    # Draw in Veusz
    if st(4):
        b_images_only = True  # False
        pattern_path = db_path.parent / r'vsz_5min\191119_0000_5m_incl19.vsz'  # r'vsz_5min\191126_0000_5m_w02.vsz'
        if not b_images_only:
            pattern_bytes_slice_old = re.escape(b'((5828756, 5830223, None),)')

        # Length of not adjacent intervals, s (set None to not allow)
        period = '1D'
        length = '5m'  # period  # '1D'

        dt_custom_s = pd_period_to_timedelta(
            length) if length != period else None  # None  #  60 * 5

        if True:
            # Load starts and assign ends
            t_intervals_start = pd.read_csv(
                cfg['in']['path_cruise'] /
                r'vsz+h5_proc\intervals_selected.txt',
                converters={
                    'time_start': lambda x: np.datetime64(x, 'ns')
                },
                index_col=0).index
            edges = (pd.DatetimeIndex(t_intervals_start),
                     pd.DatetimeIndex(t_intervals_start + dt_custom_s)
                     )  # np.zeros_like()
        else:
            # Generate periodic intervals (this branch is currently switched off).
            # NOTE(review): the '0' string keys differ from the int 0 keys used elsewhere - confirm.
            t_interval_start, t_intervals_end = intervals_from_period(
                datetime_range=np.array(
                    [
                        cfg['filter']['min_date']['0'],
                        cfg['filter']['max_date']['0']
                    ],
                    # ['2018-08-11T18:00:00', '2018-09-06T00:00:00'],
                    # ['2019-02-11T13:05:00', '2019-03-07T11:30:00'],
                    # ['2018-11-16T15:19', '2018-12-14T14:35'],
                    # ['2018-10-22T12:30', '2018-10-27T06:30:00'],
                    'datetime64[s]'),
                period=period)
            edges = (pd.DatetimeIndex([t_interval_start
                                       ]).append(t_intervals_end[:-1]),
                     pd.DatetimeIndex(t_intervals_end))

        for probe in probes:
            probe_name = f"{cfg['in']['probes_prefix']}{probe:02}"  # table name in db
            l.info('Draw %s in Veusz: %d intervals...', probe_name,
                   edges[0].size)
            # for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]), t_intervals_end), start=1):

            # the embedded Veusz object returned by veuszPropagate.main() is reused for next intervals
            cfg_vp = {'veusze': None}
            for i_interval, (t_interval_start,
                             t_interval_end) in enumerate(zip(*edges),
                                                          start=1):

                # if i_interval < 23: #<= 0:  # TEMPORARY Skip this number of intervals
                #     continue
                if period != length:
                    t_interval_start = t_interval_end - pd.Timedelta(
                        dt_custom_s, 's')

                try:  # skipping absent probes
                    start_end = h5q_interval2coord(
                        db_path=str(db_path),
                        table=f'/{probe_name}',
                        t_interval=(t_interval_start, t_interval_end))
                    if not len(start_end):
                        break  # no data
                except KeyError:
                    break  # device name not in specified range, go to next name

                pattern_path_new = pattern_path.with_name(
                    f"{t_interval_start:%y%m%d_%H%M}_{length}_{probe_name}.vsz"
                )

                # Modify pattern file
                if not b_images_only:
                    probe_name_old = re.match(r'.*((?:incl|w)\d*).*',
                                              pattern_path.name).groups()[0]
                    bytes_slice = bytes(
                        '(({:d}, {:d}, None),)'.format(*(start_end +
                                                         np.int32([-1, 1]))),
                        'ascii')

                    def f_replace(line):
                        """
                        Replace in file
                        1. probe name
                        2. slice
                        """
                        # if i_interval == 1:
                        line, ok = re.subn(bytes(probe_name_old, 'ascii'),
                                           bytes(probe_name, 'ascii'), line)
                        if ok:  # can be only in same line
                            line = re.sub(pattern_bytes_slice_old, bytes_slice,
                                          line)
                        return line

                    if not rep_in_file(pattern_path,
                                       pattern_path_new,
                                       f_replace=f_replace):
                        l.warning('Veusz pattern not changed!')
                        # break
                    elif cfg_vp['veusze']:
                        cfg_vp['veusze'].Load(str(pattern_path_new))
                elif cfg_vp['veusze']:
                    cfg_vp['veusze'].Load(str(pattern_path_new))

                txt_time_range = \
                    """
                    "[['{:%Y-%m-%dT%H:%M}', '{:%Y-%m-%dT%H:%M}']]" \
                    """.format(t_interval_start, t_interval_end)
                print(f'{i_interval}. {txt_time_range}', end=' ')

                cfg_vp = veuszPropagate.main(
                    [
                        Path(veuszPropagate.__file__).parent.with_name(
                            'veuszPropagate.ini'),
                        # '--data_yield_prefix', '-',
                        '--path',
                        str(
                            db_path
                        ),  # use for custom loading from db and some source is required
                        '--tables_list',
                        f'/{probe_name}',  # 181022inclinometers/ \d*
                        '--pattern_path',
                        str(pattern_path_new),
                        # fr'd:\workData\BalticSea\190801inclinometer_Schuka\{probe_name}_190807_1D.vsz',
                        # str(db_path.parent / dir_incl / f'{probe_name}_190211.vsz'), #warning: create file with small name
                        # '--before_next', 'restore_config',
                        # '--add_to_filename', f"_{t_interval_start:%y%m%d_%H%M}_{length}",
                        '--filename_fun',
                        f'lambda tbl: "{pattern_path_new.name}"',
                        '--add_custom_list',
                        'USEtime',  # nAveragePrefer',
                        '--add_custom_expressions_list',
                        txt_time_range,
                        # + """
                        # ", 5"
                        # """,
                        '--b_update_existed',
                        'True',
                        '--export_pages_int_list',
                        '1, 2',  # 0 for all '6, 7, 8',  #'1, 2, 3'
                        # '--export_dpi_int', '200',
                        '--export_format',
                        'emf',
                        '--b_interact',
                        '0',
                        '--b_images_only',
                        f'{b_images_only}',
                        '--return',
                        '<embedded_object>',  # reuse to not bloat memory
                    ],
                    veusze=cfg_vp['veusze'])
Exemple #3
0
def h5_velocity_by_intervals_gen(
        cfg: Mapping[str, Any],
        cfg_out: Mapping[str, Any]) -> Iterator[Tuple[Any, str, str]]:
    """
    Loads data and calculates velocity: many intervals from many of hdf5 tables sequentially.
    :param cfg: dict with fields:
        ['proc']['dt_interval'] - numpy.timedelta64 time interval of loading data
        ['in']['db_path'] - path of the source hdf5 store. If its stem ends with 'proc_noAvg' then
            loaded data is yielded as is, else it is filtered by filter_local() and velocity is
            calculated by incl_calc_velocity_nodask()
        ['filter'] - filtering parameters
        and, if cfg_out['split_period'] is not set:
        ['in']['time_intervals_start'] - manually specified starts of intervals
        else:
        ['proc']['overlap'] - if set then starts of intervals are shifted by multiples of
            (1 - overlap) * split_period and saved to ['in']['time_intervals_start']
        ['in'] fields: 'columns', 'max_date', 'dt_between_bursts', 'dt_hole_warning' and others
            required by intervals_from_period() and h5_load_range_by_coord()

    :param cfg_out: fields must be provided:
        - 'split_period': pandas interval str, as required by intervals_from_period() to cover all
          data by it, or empty to use cfg['in']['time_intervals_start']
        - see h5_names_gen(cfg_in, cfg_out) requirements
    :return: yields tuples (df, tbl, data_name), where df is the data of one interval, tbl is the
        name of the source table and data_name is f'{tbl}/PSD_{interval start}{dt_interval}{units}'
    """
    # Prepare cycle
    if cfg_out.get('split_period'):

        def gen_loaded(tbl):
            """
            Variant 1. Generate regular intervals (may be with overlap)
            :param tbl: name of table to load from
            :return: yields (dataframe, array of its first and last index values)
            """
            cfg['in']['table'] = tbl
            # To obtain ``t_intervals_start`` used in query inside gen_data_on_intervals(cfg_out, cfg)
            # we copy its content here:
            t_prev_interval_start, t_intervals_start = intervals_from_period(
                **cfg['in'], period=cfg_out['split_period'])
            if cfg['proc']['overlap']:
                dt_shifts = np.arange(
                    0, 1,
                    (1 - cfg['proc']['overlap'])) * pd_period_to_timedelta(
                        cfg_out['split_period'])
                t_intervals_start = (t_intervals_start.to_numpy(
                    dtype="datetime64[ns]")[np.newaxis].T +
                                     dt_shifts).flatten()
                if cfg['in']['max_date']:
                    # remove starts of intervals that would end after max_date
                    idel = t_intervals_start.searchsorted(
                        np.datetime64(
                            cfg['in']['max_date'] -
                            pd_period_to_timedelta(cfg_out['split_period'])))
                    t_intervals_start = t_intervals_start[:idel]
                cfg['in'][
                    'time_intervals_start'] = t_intervals_start  # to save queried time - see main()
            cfg_filter = None
            cfg_in_columns_saved = cfg['in']['columns']
            try:
                for start_end in h5q_starts2coord(
                        cfg['in']['db_path'],
                        cfg['in']['table'],
                        t_intervals_start,
                        dt_interval=cfg['proc']['dt_interval']):
                    a = h5_load_range_by_coord(**cfg['in'],
                                               range_coordinates=start_end)
                    if cfg_filter is None:  # only 1 time
                        # correct columns if they do not match exactly, to make
                        # h5_load_range_by_coord() faster next time
                        cfg['in']['columns'] = a.columns  # temporary
                        # and exclude absent fields to avoid warnings of no such column in filt_data_dd()
                        detect_filt = f"m(ax|in)_({'|'.join(cfg['in']['columns'])})"
                        cfg_filter = {
                            k: v
                            for k, v in cfg['filter'].items()
                            if re.match(detect_filt, k)
                        }
                    d, i_burst = filt_data_dd(a, cfg['in']['dt_between_bursts'],
                                              cfg['in']['dt_hole_warning'],
                                              cfg_filter)

                    n_bursts = len(i_burst)
                    if n_bursts > 1:  # 1st is always 0
                        l.info('gaps found: (%s)! at %s', n_bursts - 1,
                               i_burst[1:] - 1)
                    df0 = d.compute()
                    if not len(df0):
                        continue
                    start_end = df0.index[[0, -1]].values
                    yield df0, start_end
            finally:
                # Recover to not affect the next table. ``finally`` is used so that this is done
                # also if the consumer stops iterating early or an error is raised while loading
                cfg['in']['columns'] = cfg_in_columns_saved

    else:
        query_range_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"

        def gen_loaded(tbl):
            """
            Variant 2. Generate intervals at specified start values with same width cfg['proc']['dt_interval']
            :param tbl: name of table to load from
            :return: yields (dataframe, (start, end) of the queried interval)
            """
            # ``store`` is the HDFStore opened in the enclosing function before this generator is used
            for start_end in zip(
                    cfg['in']['time_intervals_start'],
                    cfg['in']['time_intervals_start'] +
                    cfg['proc']['dt_interval']):
                query_range_lims = pd.to_datetime(start_end)
                qstr = query_range_pattern.format(*query_range_lims)
                l.info('query:\n%s... ', qstr)
                df0 = store.select(tbl, where=qstr, columns=None)
                yield df0, start_end

    # Suffix of the output names: length of the interval with its units
    dt_interval_in_its_units = cfg['proc']['dt_interval'].astype(int)
    dt_interval_units = np.datetime_data(cfg['proc']['dt_interval'])[0]
    data_name_suffix = f'{dt_interval_in_its_units}{dt_interval_units}'

    # Cycle
    with pd.HDFStore(cfg['in']['db_path'], mode='r') as store:
        for (tbl, coefs) in h5_names_gen(cfg['in'], cfg_out):
            # Get data in ranges
            for df0, start_end in gen_loaded(tbl):
                if cfg['in']['db_path'].stem.endswith('proc_noAvg'):
                    df = df0
                else:  # loaded source data needs to be processed to calculate velocity
                    df0 = filter_local(df0, cfg['filter'])
                    df = incl_calc_velocity_nodask(df0,
                                                   **coefs,
                                                   cfg_filter=cfg['in'],
                                                   cfg_proc=cfg['proc'])

                data_name = f'{tbl}/PSD_{start_end[0]}{data_name_suffix}'
                yield (df, tbl, data_name)