def main(new_arg=None):
    cfg = cfg_from_args(my_argparser(), new_arg)
    if not cfg:
        return
    if new_arg == '<return_cfg>':  # to help testing
        return cfg
    l = init_logging(logging, None, cfg['program']['log'], cfg['program']['verbose'])

    if not cfg['out']['path'].is_absolute():
        # set path relative to cfg['in']['db_path']
        cfg['out']['path'] = cfg['in']['db_path'].with_name(str(cfg['out']['path']))

    l.warning('\n {}({}) is gonna save gpx to ..{} dir. '.format(
        this_prog_basename(__file__), cfg['in']['db_path'], cfg['out']['path'].parent))

    if cfg['out']['select_from_tablelog_ranges'] is None:
        gpx_symbols = None
    else:
        gpx_symbols = init_gpx_symbols_fun(cfg['out'])

    global gpx_names_funs  # shortcut for cfg['out']['gpx_names_funs']

    # Load data #################################################################
    qstr_trange_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"
    with pd.HDFStore(cfg['in']['db_path'], mode='r') as store:
        # Find tables by pattern
        if '*' in cfg['in']['tables'][0]:  # if 'table_prefix' in cfg['in']
            pattern_tables = cfg['in']['tables'][0]
            cfg['in']['tables'] = h5find_tables(store, pattern_tables)
            len_tables = len(cfg['in']['tables'])
            msg = 'Found {} tables with pattern {}'.format(len_tables, pattern_tables)
            if len_tables:
                l.info(msg)
            else:
                raise Ex_nothing_done(msg + '!')

            gpx_names_funs = []
            for itbl in range(len(cfg['in']['tables'])):  # same for each table
                gpx_names_funs.append(cfg['out']['gpx_names_funs'][0])
        else:  # fixed number of tables
            # initialise with defaults if needed:
            gpx_names_funs = cfg['out']['gpx_names_funs']
            for itbl in range(len(gpx_names_funs), len(cfg['in']['tables'])):
                gpx_names_funs.append('i+1')

        dfs_rnav = []
        tbl_names_all_shortened = []
        for itbl, tblD in enumerate(cfg['in']['tables']):
            print(itbl, '. ', tblD, end=': ', sep='')
            if cfg['in']['tables_log'][0]:
                tblL = tblD + '/' + cfg['in']['tables_log'][0]
                try:
                    dfL = store[tblL]
                except KeyError as e:
                    l.warning(' '.join([s for s in e.args if isinstance(s, str)]))
                    continue
            else:  # only for tables without log (usually no such tables)
                l.warning('configuration specifies to get data without use of "log..." tables')
                st_en = store[tblD].index[[0, -1]]
                if cfg['process']['period_files']:
                    t_intervals_start = pd.date_range(
                        start=st_en[0].normalize(),
                        end=max(st_en[-1],
                                st_en[-1].normalize() + pd_period_to_timedelta(cfg['process']['period_files'])),
                        freq=cfg['process']['period_files'])[1:]  # makes last t_interval_start >= all_data[-1]
                    dfL = pd.DataFrame.from_records(
                        {'DateEnd': t_intervals_start, 'fileName': tblD},
                        index=st_en[:1].append(t_intervals_start[:-1]))
                else:
                    dfL = pd.DataFrame.from_records(
                        {'DateEnd': st_en[-1], 'fileName': tblD}, index=st_en[:1])

            gpx_names_fun_str = "lambda i, row, t=0: '{}'.format({})".format(
                cfg['out']['gpx_names_fun_format'], gpx_names_funs[itbl])
            gpx_names_fun = eval(compile(gpx_names_fun_str, '', 'eval'))

            if cfg['out']['select_from_tablelog_ranges'] is None:
                # Use all data for ranges specified in log rows and save tracks (not points)
                for irow, r in enumerate(dfL.itertuples()):  # iterrows()
                    qstr = qstr_trange_pattern.format(r.Index, r.DateEnd)
                    print(qstr, end='... ')
                    try:
                        dfD = store.select(
                            cfg['in']['table_nav'] if cfg['in']['table_nav'] else tblD,
                            qstr, columns=['Lat', 'Lon', 'DepEcho'])
                    except Exception as e:
                        l.exception('Error when query: {}. '.format(qstr))
                        # '\n==> '.join([s for s in e.args if isinstance(s, str)])))
                        continue
                    # Keep data with period = 1s only
                    dfD = dfD[~dfD.index.round(pd.Timedelta(seconds=1)).duplicated()]
                    # dfD.drop_duplicates(['Lat', 'Lon', 'index'])
                    bGood = filterGlobal_minmax(dfD, dfD.index, cfg['filter'])
                    dfD = dfD[bGood]
                    # UTC time and table name go to the output file name,
                    # local time and table name go to the gpx object name
                    str_time_long = '{:%y%m%d_%H%M}'.format(r.Index)
                    r = r._replace(Index=timzone_view(r.Index, cfg['out']['dt_from_utc_in_comments']))
                    tblD_safe = file_from_tblname(tblD, cfg['in']['tables_log'][0])
                    try:
                        gpx_names_fun_result = gpx_names_fun(tblD_safe, r)  # '{:%y%m%d}'.format(timeLocal)
                    except TypeError as e:
                        raise TypeError(
                            'Can not evaluate gpx_names_fun "{}"'.format(gpx_names_fun_str)
                            ).with_traceback(e.__traceback__)
                    save_to_gpx(
                        dfD, cfg['out']['path'].with_name(f'{str_time_long}{tblD_safe}'),
                        gpx_obj_namef=gpx_names_fun_result, cfg_proc=cfg['process'])
                    if len(cfg['in']['tables']) > 1:
                        nav2add_cur = dfD if irow == 0 else nav2add_cur.append(dfD)
                if len(cfg['in']['tables']) > 1:
                    nav2add_cur = dfD.assign(itbl=itbl)
            else:
                # Use only 1 data point per log row
                if cfg['out']['select_from_tablelog_ranges'] != 0:
                    print('selecting from {} row index of log table'.format(
                        cfg['out']['select_from_tablelog_ranges']))
                try:
                    dfL.index = dfL.index.tz_convert('UTC')
                except TypeError as e:
                    print((e.msg if hasattr(e, 'msg') else str(e)) +
                          '!\n- continue presuming log index is UTC...')
                print(end='all log data ')
                time_points = (dfL.index if cfg['out']['select_from_tablelog_ranges'] == 0 else
                               dfL['DateEnd'] if cfg['out']['select_from_tablelog_ranges'] == -1 else
                               None)
                if time_points is None:
                    raise ValueError("cfg['out']['select_from_tablelog_ranges'] must be 0 or -1")
                cols_nav = ['Lat', 'Lon', 'DepEcho']
                nav2add = h5select(
                    store, cfg['in']['table_nav'], cols_nav, time_points=time_points,
                    dt_check_tolerance=cfg['process']['dt_search_nav_tolerance'],
                    query_range_lims=(time_points[0], dfL['DateEnd'][-1])
                    )[0]
                cols_nav = nav2add.columns  # not all columns may be loaded
                # Try to get non-NaN values from dfL if it has the needed columns
                # (edges' data used to be written there with _st/_en suffixes)
                isna = nav2add.isna()
                dfL_col_suffix = 'st' if cfg['out']['select_from_tablelog_ranges'] == 0 else 'en'
                for col in cols_nav:
                    col_dat = f'{col}_{dfL_col_suffix}'
                    if isna[col].any() and col_dat in dfL.columns:
                        b_use = isna[col].values & dfL[col_dat].notna().values
                        nav2add.loc[b_use, col] = dfL.loc[b_use, col_dat].values
                nav2add.index = timzone_view(nav2add.index, dt_from_utc=cfg['out']['dt_from_utc_in_comments'])
                # tz_local = tzoffset(None, cfg['out']['dt_from_utc_in_comments'].total_seconds())
                # if nav2add.index.tz is None:
                #     # if a Timestamp is tz-naive then assume it is UTC
                #     nav2add.index = nav2add.index.tz_localize('UTC')
                #     nav2add.tz_convert(tz_local, copy=False)

                # Save to gpx waypoints
                nav2add_cur = nav2add.assign(itbl=itbl)
                # if 'gpx_names_funs' in cfg['out'] and \
                #         len(cfg['out']['gpx_names_funs']) > itbl:
                #     gpx_names = eval(compile('lambda i: str({})'.format(
                #         cfg['out']['gpx_names_funs'][itbl]), [], 'eval'))
                save_to_gpx(
                    nav2add_cur,
                    cfg['out']['path'] / f"stations_{file_from_tblname(tblD, cfg['in']['tables_log'][0])}",
                    gpx_obj_namef=gpx_names_fun, waypoint_symbf=gpx_symbols,
                    cfg_proc=cfg['process']
                    )
                # save_to_csv(nav2add, dfL.index, cfg['out']['path'].with_name(f'nav{tblD}.txt'))

                if False:  # Show table info
                    store.get_storer(tblD).table
                    nodes = sorted(store.root.__members__)  # , key=number_key
                    print(nodes)
                    # store.get_node('CTD_Idronaut(Redas)').logFiles  # next level nodes

            # prepare saving of combined gpx
            if tbl_names_all_shortened:
                i_new = 0
                for c_prev, c_new in zip(tbl_names_all_shortened[-1], tblD):
                    if c_new == c_prev:
                        i_new += 1
                    else:
                        break
                tbl_names_all_shortened.append(tblD[i_new:])
            else:
                tbl_names_all_shortened.append(tblD)
            dfs_rnav.append(nav2add_cur)

        if len(cfg['in']['tables']) > 1 and cfg['out']['gpx_names_funs_cobined']:
            print('combined: ', end='')  # Save combined data to gpx
            df_rnav_combined = pd.concat(dfs_rnav)
            df_rnav_combined.sort_index(inplace=True)
            # Save to gpx waypoints
            if 'gpx_names_funs' in cfg['out']['gpx_names_funs_cobined']:
                gpx_names_funs = [
                    # row is not used, it is here only for compatibility with tracks
                    eval(compile("lambda i: " + f, '', 'eval')) for f in gpx_names_funs]
            gpx_names_fun = eval(compile(
                "lambda i,row,t: '{gpx_names_fun_format}'.format({gpx_names_funs_cobined})".format_map(
                    cfg['out']), '', 'eval'))
            # gpx_symbols = lambda row: cfg['out']['gpx_symbols'][sym_index_fun(row)]
            # gpx_names = eval(compile("lambda i,row: '{gpx_names_fun_format}'.format({gpx_names_funs_cobined})".format_map(cfg['out']), '', 'eval'))
            # gpx_names = lambda i: str(i + 1)
            save_to_gpx(
                df_rnav_combined,
                cfg['out']['path'].with_name(
                    'all_' + file_from_tblname(','.join(tbl_names_all_shortened), cfg['in']['tables_log'][0])),
                gpx_obj_namef=gpx_names_fun, waypoint_symbf=gpx_symbols, cfg_proc=cfg['process'])
    print('Ok')
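
# The gpx object names above come from compiling small, config-supplied expressions into
# lambdas via eval(compile(...)). Below is a minimal, self-contained sketch of that pattern;
# the names `name_format` and `name_expr` are hypothetical stand-ins for
# cfg['out']['gpx_names_fun_format'] and an entry of cfg['out']['gpx_names_funs'], and this
# helper is illustrative only, not part of the pipeline.
def _example_gpx_names_fun():
    """Illustrative only: build a naming function from configuration-like strings."""
    import pandas as pd
    name_format = '{:%y%m%d_%H%M}'  # hypothetical analogue of cfg['out']['gpx_names_fun_format']
    name_expr = 'row.Index'         # hypothetical analogue of a cfg['out']['gpx_names_funs'] item
    fun_str = "lambda i, row, t=0: '{}'.format({})".format(name_format, name_expr)
    fun = eval(compile(fun_str, '', 'eval'))
    # A one-row stand-in for a log-table row as yielded by DataFrame.itertuples()
    row = next(pd.DataFrame(
        {'DateEnd': [pd.Timestamp('2020-01-01 12:00')]},
        index=pd.DatetimeIndex([pd.Timestamp('2020-01-01 10:00')])).itertuples())
    return fun(0, row)  # -> '200101_1000'
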
for centers, qstr in gen_queries(ranges, patterns, margins, print_patern=print_patern):
    file_stem = file_patern.format(**centers)
    file_stem_no_time = file_patern_no_time.format(**centers)
    # p, p_st, p_en in np.arange(pst, pen, pstep)[:, np.newaxis] + np.array(
    #     [[0, -pwidth, pwidth]]) / 2:  # [:, np.]
    # print('\n{:g}m.'.format(p), end=' ')
    # qstr = qstr_pattern.format(p_st, p_en)
    FCTD = pd.read_hdf(db_path_temp, 'CTD', where=qstr)
    if FCTD.empty:
        print('- empty', end='')
        continue
    time_st_local, time_en_local = [
        timzone_view(x, t_our_zone) for x in FCTD.index[[0, -1]]
        ]
    fileN_time = \
        f'{time_st_local:%y%m%d_%H%M}-' \
        f'{{:{"%d_" if time_st_local.day != time_en_local.day else ""}%H%M}}'.format(time_en_local)

    # Get data for each run.
    # It is possible to get it by aggregation (df_points = FCTD.groupby(['Lat', 'Lon']))
    # but here we use the runs info which is encapsulated in _shift_. Runs were found in Veusz.
    iruns = np.flatnonzero(np.diff(FCTD['shift']) != 0)
    ctd = np.empty(
        (iruns.size + 1,),
        {'names': params + ['Lat', 'Lon'],
         'formats': ['f8'] * (len(params) + 2)})
    ctd.fill(np.NaN)
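
# Minimal sketch of the run-splitting step above, assuming (as in this fragment) that the
# 'shift' column is constant within a run and changes value between runs. The data is
# invented; only the np.diff / np.flatnonzero technique mirrors the code above.
def _example_split_runs_by_shift():
    import numpy as np
    import pandas as pd
    demo = pd.DataFrame({'shift': [0, 0, 0, 5, 5, 9, 9, 9],
                         'Pres':  [1., 2., 3., 1., 2., 1., 2., 3.]})
    iruns = np.flatnonzero(np.diff(demo['shift']) != 0)  # last index of every run except the final one
    return np.split(np.arange(len(demo)), iruns + 1)     # -> [[0, 1, 2], [3, 4], [5, 6, 7]]
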
def h5add_log(cfg_out: Dict[str, Any], df, log: Union[pd.DataFrame, Mapping, None], tim, log_dt_from_utc):
    """
    Updates (or creates if needed) the metadata (log) table
    :param cfg_out: dict with fields:
        - b_log_ready: if False or '' then updates log['Date0'], log['DateEnd'].
        - db: handle of opened hdf5 store
        - some of the following fields (the next is tried if the previous is not defined):
            - table_log: str, path of log table
            - tables_log: List[str], path of log table in first element
            - table: str, path of log table will be constructed by adding '/log'
            - tables: List[str], path of log table will be constructed by adding '/log' to first element
        - logfield_fileName_len: optional, fixed length of string format of 'fileName' hdf5 column
    :param df:
    :param log: Mapping of records or DataFrame. 'Date0' and 'DateEnd' are updated if 'Date0' is absent, {} or None
    :param tim:
    :param log_dt_from_utc:
    :return:
    """
    if cfg_out.get('b_log_ready') and (isinstance(log, Mapping) and not log):
        return

    # synchro "tables_log" and the more user friendly but not so universal to code "table_log"
    if cfg_out.get('table_log'):
        table_log = cfg_out['table_log']
    else:
        table_log = cfg_out.get('tables_log')
        if table_log:
            if '{}' in table_log[0]:
                table_log = table_log[0].format(cfg_out['table'])
            else:
                table_log = table_log[0]
        else:  # set default for (1st) data table
            try:
                table_log = f"{cfg_out['table']}/log"
            except KeyError:
                table_log = f"{cfg_out['tables'][0]}/log"

    set_field_if_no(cfg_out, 'logfield_fileName_len', 255)

    if (log.get('Date0') is None) or not cfg_out.get('b_log_ready'):
        # or (table_log.split('/')[-1].startswith('logFiles')):
        log['Date0'], log['DateEnd'] = timzone_view(
            (tim if tim is not None else
             df.index.compute() if isinstance(df, dd.DataFrame) else
             df.index)[[0, -1]],
            log_dt_from_utc)
    # dfLog = pd.DataFrame.from_dict(log, np.dtype(np.unicode_, cfg_out['logfield_fileName_len']))
    if not isinstance(log, pd.DataFrame):
        try:
            log = pd.DataFrame(log).set_index('Date0')
        except ValueError as e:  # , Exception
            log = pd.DataFrame.from_records(
                log, exclude=['Date0'],
                index=log['Date0'] if isinstance(log['Date0'], pd.DatetimeIndex) else [log['Date0']]
                )  # index='Date0' does not work for dict

    try:
        return df_log_append_fun(log, table_log, cfg_out)
    except ValueError as e:
        return h5append_on_inconsistent_index(cfg_out, table_log, log, df_log_append_fun, e, 'append log')
    except ClosedFileError as e:
        l.warning('Check code: on reopened store update the store variable')
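
# Hedged sketch of the log-record normalisation performed in h5add_log() above: a mapping of
# per-file metadata is turned into a one-row DataFrame indexed by 'Date0' before it is appended
# to the HDF5 log table. The field values here are invented; writing via df_log_append_fun() is
# intentionally left out.
def _example_log_record_to_frame():
    import pandas as pd
    log = {'Date0': [pd.Timestamp('2020-01-01 10:00')],
           'DateEnd': [pd.Timestamp('2020-01-01 12:00')],
           'fileName': ['table1'],
           'rows': [100]}
    return pd.DataFrame(log).set_index('Date0')  # one row, columns: DateEnd, fileName, rows
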
def get_runs_parameters(df_raw, times_min, times_max,
                        cols_good_data: Union[str, Sequence[str], None],
                        dt_from_utc: timedelta = timedelta(0),
                        db=None, db_path=None, table_nav=None,
                        table_nav_cols=('Lat', 'Lon', 'DepEcho', 'Speed', 'Course'),
                        dt_search_nav_tolerance=timedelta(minutes=2)):
    """
    :param df_raw:
    :param times_min:
    :param times_max:
    :param cols_good_data: columns of essential data that must be good (depth)
    :param dt_from_utc:
    :param db:
    :param db_path:
    :param table_nav: 'navigation' table to find data absent in df_raw. Note: tries to find only positive vals
    :param table_nav_cols:
    :param dt_search_nav_tolerance:
    :return:
    """
    log = {}
    log_update = {}  # {_st: DataFrame, _en: DataFrame} - dataframes of parameters for imin and imax
    for times_lim, suffix, log_time_col, i_search in ((times_min, '_st', 'Date0', 0),
                                                      (times_max, '_en', 'DateEnd', -1)):
        log_update[suffix] = df_raw.asof(times_lim, subset=cols_good_data)  # rows of last good data
        log[log_time_col] = timzone_view(log_update[suffix].index, dt_from_utc)
        # Search for nearest good values if parameter p has bad values
        for (p, *isnan) in log_update[suffix].isna().T.itertuples(name=None):
            if i_search == -1:
                log_update[suffix].loc[isnan, p] = df_raw[p].asof(times_max[isnan])
            else:
                # "asof()"-alternative for 1st notna: take 1st good element in each interval
                for time_nan, time_min, time_max in zip(
                        times_lim[isnan], times_min[isnan], times_max[isnan]):
                    s_search = df_raw.loc[time_min:time_max, p]
                    try:
                        log_update[suffix].at[time_nan, p] = s_search[
                            s_search.notna()].iat[0]  # same as .at[s_search.first_valid_index()]
                    except IndexError:
                        l.warning('no good values for parameter "%s" in run started %s', p, time_nan)
                        continue
        log_update[suffix] = log_update[suffix].add_suffix(suffix)
    log.update(
        # pd.DataFrame(, index=log_update['_st'].index).rename_axis('Date0')
        {**dict(
            [(k, v.values) for st_en in zip(log_update['_st'].items(), log_update['_en'].items())
             for k, v in st_en]),  # flatten pairs
         })
    if table_nav:
        time_points = log_update['_st'].index.append(log_update['_en'].index)
        with FakeContextIfOpen(lambda f: pd.HDFStore(f, mode='r'), db_path, db) as store:
            df_nav, dt = h5select(  # all starts then all ends in a row
                store, table_nav, columns=table_nav_cols, time_points=time_points,
                dt_check_tolerance=dt_search_nav_tolerance)
            # {:0.0f}s'.format(cfg['out']['dt_search_nav_tolerance'].total_seconds())

            # todo: allow filtering for individual columns. Solution: use multiple calls for columns
            # that need filtering with an appropriate query_range_pattern argument of h5select()
            isnan = df_nav.isna()
            for col in df_nav.columns[isnan.any(axis=0)]:
                # does not work:
                # df_nav_col, dt_col = h5select(  # for current parameter's name
                #     cfg['in']['db'], cfg['in']['table_nav'],
                #     columns=[col],
                #     query_range_lims=time_points[[0, -1]],
                #     time_points=time_points[isnan[col]],
                #     query_range_pattern=f"index>=Timestamp('{{}}') & index<=Timestamp('{{}}') & {col} > 0 ",
                #     dt_check_tolerance=cfg['out']['dt_search_nav_tolerance']
                #     )
                # Note: tries to find only positive vals:
                df_nav_col = store.select(
                    table_nav,
                    where="index>=Timestamp('{}') & index<=Timestamp('{}') & {} > 0".format(
                        *(time_points[[0, -1]] + np.array(
                            (-dt_search_nav_tolerance, dt_search_nav_tolerance))),
                        col),
                    columns=[col])
                try:
                    vals = df_nav_col[col].values
                    vals = vals[inearestsorted(df_nav_col.index, time_points[isnan[col]])]
                except IndexError:
                    continue  # not found
                if vals.any():
                    df_nav.loc[isnan[col], col] = vals
        # df_nav['nearestNav'] = dt.astype('m8[s]').view(np.int64)

        df_edges_items_list = [
            df_edge.add_suffix(suffix).items() for suffix, df_edge in (
                ('_st', df_nav.iloc[:len(log_update['_st'])]),
                ('_en', df_nav.iloc[len(log_update['_st']):len(df_nav)]))
            ]
        for st_en in zip(*df_edges_items_list):
            for name, series in st_en:
                # If the field is already there from the data table => update needed elements only
                if name in log:
                    b_need = np.isnan(log.get(name))
                    if b_need.any():
                        b_have = np.isfinite(series.values)  # from loaded nav in points
                        b_use = b_need & b_have
                        if b_use.any():
                            log[name][b_use] = series.values[b_use]
                        # # from all nav (not loaded)
                        # b_need &= ~b_have
                        # # if b_need.any():
                        # #     load range to search nearest good val. for specified fields and tolerance
                        # df = cfg['in']['db'].select(cfg['in']['table_nav'],
                        #     where=query_range_pattern.format(st_en.index), columns=name)
                        # df_nav = h5select(  # for current parameter's name
                        #     cfg['in']['db'], cfg['in']['table_nav'],
                        #     columns=name,
                        #     query_range_lims=st_en
                        #     time_points=log_update['_st'].index.append(log_update['_en'].index),
                        #     dt_check_tolerance=cfg['out']['dt_search_nav_tolerance']
                        #     )
                    continue
                # else:
                #     b_need = np.isnan(series.values)
                # Else update all elements at once
                log[name] = series.values
    return log
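
# Hedged sketch of the DataFrame.asof(..., subset=...) lookup that get_runs_parameters() relies
# on: for each requested time take the last preceding row whose `subset` column(s) are not NaN.
# The data values are invented; only the asof technique is taken from the function above.
def _example_asof_last_good_row():
    import numpy as np
    import pandas as pd
    idx = pd.date_range('2020-01-01 10:00', periods=5, freq='min')
    df_raw = pd.DataFrame({'Pres': [10., 20., np.nan, np.nan, 50.],
                           'Temp': [1., 2., 3., 4., 5.]}, index=idx)
    times_max = pd.DatetimeIndex(['2020-01-01 10:03:30'])
    # the row at 10:01 is returned: 10:02 and 10:03 have NaN in the essential 'Pres' column
    return df_raw.asof(times_max, subset='Pres')
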
    except Exception as e:
        print('{}\n Try set [in].b_raise_on_err= False'.format(e))
        raise e

    # Process `a` and get date in ISO standard format
    try:
        date = fun_proc_loaded(a, cfg['in'])
    except IndexError:
        print('no data!')
        continue

    # add time shift specified in configuration .ini
    date = np.atleast_1d(date)
    tim, b_ok = time_corr(date, cfg['in'], sort=True)
    # Save last time to be able to filter the next file
    cfg['in']['time_last'] = date[-1]

    log_item['rows'] = 1
    log_item['Date0'] = timzone_view(tim[0], cfg['in']['dt_from_utc'])
    log_item['DateEnd'] = datetime.now()  # can not paste np.NaN
    log_item['fileNameNew'] = '{Date0:%y%m%d_%H%M}'.format(**log_item)
    log.append(log_item.copy())
    strLog = '{fileName}:\t{Date0:%d.%m.%Y %H:%M:%S}->\t{fileNameNew}.txt'.format(
        **log_item)  # \t{Lat}\t{Lon}\t{strOldVal}->\t{mag}
    print(strLog)
    if 'log' in cfg['program'].keys():  # Log to logfile
        f.writelines('\n' + strLog)
else:  # executed when the loop over input files completes
    if len(log):
        s = input(
            '\n{} txt files. Rename _ASC.TXT, .TXT, r.000, r.000.nc? Y/n: '
            .format(nFiles))
        if 'n' in s or 'N' in s:
            print('nothing done')