def test_cache():
    """Exercise the ``Cache`` callback backed by a small ``cachey`` store.

    Runs one single-task graph, checks the result was stored and that the
    callback's timing bookkeeping is empty afterwards, then runs a larger
    graph that reuses the stored key.
    """
    store = cachey.Cache(10000)
    callback = Cache(store)

    # First pass: one task, executed under the cache callback.
    with callback:
        assert get({"x": (inc, 1)}, "x") == 2

    assert flag == [1]
    assert store.data["x"] == 2

    # Bookkeeping must be empty once the computation has finished.
    assert not callback.starttimes
    assert not callback.durations

    # Reset the shared call log before the second pass.
    while flag:
        flag.pop()

    graph = {
        "x": (inc, 1),
        "y": (inc, 2),
        "z": (add, "x", "y"),
    }
    with callback:
        assert get(graph, "z") == 5

    # no x present -- presumably `inc` logs its argument in `flag`, so only
    # the task for "y" ran again; TODO confirm against the `inc` helper.
    assert flag == [2]

    # Leaving the context manager must unregister the callback.
    assert not Callback.active
def test_cache():
    """Exercise the ``Cache`` callback backed by a small ``cachey`` store.

    Runs one single-task graph, checks the result was stored and that the
    callback's timing bookkeeping is empty afterwards, then runs a larger
    graph that reuses the stored key.
    """
    store = cachey.Cache(10000)
    callback = Cache(store)

    # First pass: one task, executed under the cache callback.
    with callback:
        assert get({'x': (inc, 1)}, 'x') == 2

    assert flag == [1]
    assert store.data['x'] == 2

    # Bookkeeping must be empty once the computation has finished.
    assert not callback.starttimes
    assert not callback.durations

    # Reset the shared call log before the second pass.
    while flag:
        flag.pop()

    graph = {
        'x': (inc, 1),
        'y': (inc, 2),
        'z': (add, 'x', 'y'),
    }
    with callback:
        assert get(graph, 'z') == 5

    # no x present -- presumably `inc` logs its argument in `flag`, so only
    # the task for 'y' ran again; TODO confirm against the `inc` helper.
    assert flag == [2]

    # Leaving the context manager must unregister the callback.
    assert not Callback.active
"""
Dynamically load irregularly shaped images and their label masks
(read from the ``data/kaggle-nuclei`` directory) into a napari viewer.
"""

import numpy as np
from dask_image.imread import imread
from dask.cache import Cache
from napari import Viewer, gui_qt

# Register an opportunistic dask cache for every computation in this process.
cache = Cache(2e9)  # Leverage two gigabytes of memory
cache.register()

# One sub-directory per sample; the glob is expanded by dask_image's imread.
base_name = 'data/kaggle-nuclei/fixes/stage1_train/*'
image_glob = base_name + '/images/image_gray.tif'
label_glob = base_name + '/labels/label.tif'

images = imread(image_glob)
labels = imread(label_glob)

print(images.shape)

with gui_qt():
    # create an empty viewer
    viewer = Viewer()

    # add the images, then overlay the label masks on top of them
    image_layer = viewer.add_image(images, name='nuceli', colormap='gray')
    labels_layer = viewer.add_labels(labels, name='labels', opacity=0.5)
# In[ ]:


# Persist the array and show the resulting handle as the cell output.
na = distributed_array.persist()
na


# In[ ]:


# Inspect the attributes available on the persisted object.
dir(na)


# In[ ]:


# Drop our reference to the persisted array.
na = None


# In[ ]:


#tag::cache[]
from dask.cache import Cache

c = Cache(1e9) # 1GB cache

# A local cache for the part of our code where we need a cache
with c:
    distributed_array.compute()

# Or global for any calls we make
c.register()
#end::cache[]


# In[ ]:


# In[ ]:
cat_width = 1 # Size of fixed-width string for representing categories columns = None cachesize = 9e9 @property def parq_opts(self): return dict(file_scheme='hive', has_nulls=(False if self.dftype == 'pandas' else 0), write_index=False) p = Parameters() from dask.cache import Cache Cache(p.cachesize).register() filetypes_storing_categories = {'parq', 'castra'} class Kwargs(dict): """Used to distinguish between dictionary argument values, and keyword-arguments. """ pass def benchmark(fn, args): """Benchmark when "fn" function gets called on "args" tuple. "args" may have a Kwargs instance at the end. """
import xarray as xr
import os
import glob
import imp  # NOTE(review): `imp` was removed in Python 3.12; kept because code outside this view may use it
import sys
import numpy as np
import pandas as pd
import datetime
import json
import time
import utm

###
# Experimental cache option to speed up dask calls
import cachey
from dask.cache import Cache
cache = Cache(10e9)
cache.register()
###

start_time = time.time()

# Hack to force datetimes to display in GMT/UTC (numpy 1.11.1 has fixed this
# but other dependent modules (pynio) can't handle numpy 1.11.1)
os.environ['TZ'] = 'GMT'
# time.tzset() only exists on Unix; skip it elsewhere instead of crashing
# with AttributeError before the script has done anything.
if hasattr(time, 'tzset'):
    time.tzset()

# Load in config file
#######  load user configurable parameters here    #######
# Check user defined configuration file
if len(sys.argv) == 1:
    raise ValueError(
        'requires one argument [configuration file] '
        '(i.e. python GRIB2_to_CHM_forcing.py forcing_config.py)'
    )
"""Dask cache utilities. """ import collections.abc import contextlib from typing import Callable, ContextManager, Optional, Tuple import dask import dask.array as da from dask.cache import Cache #: dask.cache.Cache, optional : A dask cache for opportunistic caching #: use :func:`~.resize_dask_cache` to actually register and resize. #: this is a global cache (all layers will use it), but individual layers #: can opt out using Layer(..., cache=False) _DASK_CACHE = Cache(1) _DEFAULT_MEM_FRACTION = 0.25 def resize_dask_cache(nbytes: Optional[int] = None, mem_fraction: Optional[float] = None) -> Cache: """Create or resize the dask cache used for opportunistic caching. The cache object is an instance of a :class:`Cache`, (which wraps a :class:`cachey.Cache`). See `Dask opportunistic caching <https://docs.dask.org/en/latest/caching.html>`_ Parameters ---------- nbytes : int, optional
from to_pandas_hdf5.csv2h5 import main as csv2h5 from to_pandas_hdf5.csv_specific_proc import correct_kondrashov_txt, rep_in_file, correct_baranov_txt from to_pandas_hdf5.h5_dask_pandas import h5q_interval2coord from inclinometer.h5inclinometer_coef import h5copy_coef import inclinometer.incl_h5clc as incl_h5clc import inclinometer.incl_h5spectrum as incl_h5spectrum import veuszPropagate from utils_time import pd_period_to_timedelta from utils2init import path_on_drive_d, init_logging, open_csv_or_archive_of_them, st # l = logging.getLogger(__name__) l = init_logging(logging, None, None, 'INFO') if True: # False. Experimental speedup but takes memory from dask.cache import Cache cache = Cache(2e9) # Leverage two gigabytes of memory cache.register() # Turn cache on globally if False: # True: # False: # l.warning('using "synchronous" scheduler for debugging') import dask dask.config.set(scheduler='synchronous') # Directory where inclinometer data will be stored path_cruise = path_on_drive_d( r'd:\WorkData\BalticSea\200628_Pregolya,Lagoon-inclinometer') r""" d:\WorkData\BalticSea\200630_AI55\inclinometer d:\WorkData\_experiment\inclinometer\200610_tank_ex[4,5,7,9,10,11][3,12,13,14,15,16,19] d:\WorkData\BalticSea\200514_Pregolya,Lagoon-inclinometer d:\WorkData\BalticSea\200317_Pregolya,Lagoon-inclinometer d:\WorkData\BalticSea\191210_Pregolya,Lagoon-inclinometer
import napari
import dask.array as da

# https://docs.dask.org/en/latest/caching.html
# Using fixed sized cache [Need the Cachey module, so 'pip install cachey' first, 12-03-2019]
from dask.cache import Cache

cache = Cache(8e9)  # Leverage eight gigabytes of memory
cache.register()  # Turn cache on globally

# Read the zarr images (lazily, as dask arrays)
memimage = da.from_zarr(
    'W:/SV3/RC_15-06-11/Dme_E2_His2AvRFP_spiderGFP_12-03_20150611_155054.corrected/Results/zarr/membrane.zarr'
)
nucimage = da.from_zarr(
    'W:/SV3/RC_15-06-11/Dme_E2_His2AvRFP_spiderGFP_12-03_20150611_155054.corrected/Results/zarr/nuclei.zarr'
)
print(type(memimage), memimage.shape, memimage.dtype)

# Display settings for the membrane channel, gathered in one place.
membrane_layer_options = dict(
    scale=[1, 1, 8, 1, 1],
    colormap='inferno',
    blending='additive',
    name='membrane',
    is_pyramid=False,
    rgb=False,
    contrast_limits=[10, 255],
)

# create Qt GUI context
with napari.gui_qt():
    # create a Viewer and add the images as layers
    viewer = napari.Viewer(axis_labels=['view', 't', 'z', 'y', 'x'])
    viewer.add_image(memimage, **membrane_layer_options)
def main(argv):
    """Run one read + aggregate benchmark for a single file and print timings.

    Parses the command line, optionally enables caching and the distributed
    scheduler, stores the run parameters on the module-level ``p`` object,
    reads the file once and aggregates it twice, then prints one summary line.

    :param argv: full argument vector; ``argv[0]`` (program name) is skipped.
    :return: ``0`` on success, ``1`` when ``timed_read`` returned no dataframe.
    :raises ValueError: if ``--distributed`` is requested for a non-dask
        dataframe type.
    """
    global DEBUG, DD_FORCE_LOAD, DASK_CLIENT

    parser = argparse.ArgumentParser(epilog=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('filepath')
    parser.add_argument('dftype')
    parser.add_argument('base')
    parser.add_argument('x')
    parser.add_argument('y')
    parser.add_argument('categories', nargs='+')
    parser.add_argument('--debug', action='store_true', help='Enable increased verbosity and DEBUG messages')
    parser.add_argument('--cache', choices=('persist', 'cachey'), default=None, help='Enable caching: "persist" causes Dask dataframes to force loading into memory; "cachey" uses dask.cache.Cache with a cachesize of {}. Caching is disabled by default'.format(int(p.cachesize)))
    parser.add_argument('--distributed', action='store_true', help='Enable the distributed scheduler instead of the threaded, which is the default.')
    parser.add_argument('--recalc-ranges', action='store_true', help='Tell datashader to recalculate the ranges on each aggregation, instead of caching them (by default).')
    args = parser.parse_args(argv[1:])

    # --- caching mode -------------------------------------------------------
    if args.cache is None:
        if args.debug:
            print("DEBUG: Cache disabled", flush=True)
    else:
        if args.cache == 'cachey':
            from dask.cache import Cache
            cache = Cache(p.cachesize)
            cache.register()
        elif args.cache == 'persist':
            DD_FORCE_LOAD = True

        if args.debug:
            print('DEBUG: Cache "{}" mode enabled'.format(args.cache), flush=True)

    # --- scheduler ----------------------------------------------------------
    if args.dftype == 'dask' and args.distributed:
        local_cluster = distributed.LocalCluster(n_workers=p.n_workers, threads_per_worker=1)
        DASK_CLIENT = distributed.Client(local_cluster)
        if args.debug:
            print('DEBUG: "distributed" scheduler is enabled')
    else:
        if args.dftype != 'dask' and args.distributed:
            raise ValueError('--distributed argument is only available with the dask dataframe type (not pandas)')
        if args.debug:
            print('DEBUG: "threaded" scheduler is enabled')

    # --- publish run parameters on the shared settings object ---------------
    filepath = args.filepath
    p.dftype = args.dftype
    p.base = args.base
    p.x = args.x
    p.y = args.y
    p.categories = args.categories
    DEBUG = args.debug

    # NOTE: in the debug prints below `flush=True` is an argument of print();
    # it used to be passed to str.format(), which silently ignored it.
    if DEBUG:
        print('DEBUG: Memory usage (before read):\t{} MB'.format(get_proc_mem()), flush=True)
    df, loadtime = timed_read(filepath, p.dftype)

    if df is None:
        if loadtime == -1:
            print("{:28} {:6} Operation not supported".format(filepath, p.dftype), flush=True)
        return 1

    if DEBUG:
        print('DEBUG: Memory usage (after read):\t{} MB'.format(get_proc_mem()), flush=True)

    # First aggregation (includes any lazy loading cost).
    img, aggtime1 = timed_agg(df, filepath, 5, 5, cache_ranges=(not args.recalc_ranges))
    if DEBUG:
        mem_usage = df.memory_usage(deep=True)
        if p.dftype == 'dask':
            mem_usage = mem_usage.compute()
        print('DEBUG:', mem_usage, flush=True)
        mem_usage_total = mem_usage.sum()
        print('DEBUG: DataFrame size:\t\t\t{} MB'.format(mem_usage_total / 1e6), flush=True)
        for colname in df.columns:
            print('DEBUG: column "{}" dtype: {}'.format(colname, df[colname].dtype))
        print('DEBUG: Memory usage (after agg1):\t{} MB'.format(get_proc_mem()), flush=True)

    # Second aggregation with default plot size.
    img, aggtime2 = timed_agg(df, filepath, cache_ranges=(not args.recalc_ranges))
    if DEBUG:
        print('DEBUG: Memory usage (after agg2):\t{} MB'.format(get_proc_mem()), flush=True)

    in_size = get_size(filepath)
    out_size = get_size(filepath+".png")

    # `global_start` is a module-level timestamp defined outside this function.
    global_end = time.time()
    print("{:28} {:6} Aggregate1:{:06.2f} ({:06.2f}+{:06.2f}) Aggregate2:{:06.2f} In:{:011d} Out:{:011d} Total:{:06.2f}"
          .format(filepath, p.dftype, loadtime+aggtime1, loadtime, aggtime1, aggtime2, in_size, out_size, global_end-global_start), flush=True)

    return 0
fig, ax = plt.subplots(1, 1, figsize=(12, 12)) ax.imshow(result, cmap="gray") plt.show(block=False) plt.waitforbuttonpress() if __name__ == "__main__": logging.getLogger("tifffile").setLevel(logging.ERROR) coloredlogs.install( level="DEBUG", fmt="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S" ) client = Client("10.109.20.6:8786") print(client) print() cache = Cache(2e9) cache.register() src_dir = "Y:/charm/20191015_Clarity_brain_HD91_RCCS_1" ds = MicroManagerV2Dataset(src_dir) print("== dataset inventory ==") print(ds.inventory) images, links = load_valid_tiles(ds, save=True) # calculate_link_shifts(ds, links) client.close()
#futures_total = 0
class _COUNTER:
    """Tiny callable counter: ``inc()`` bumps it, calling the object reads it."""

    # Class-level starting value; the first ``inc`` creates an instance
    # attribute of the same name that shadows it.
    COUNT = 0

    def inc(self, *args, **kwargs):
        # Any arguments are accepted and ignored.
        self.COUNT = self.COUNT + 1

    def __call__(self):
        return self.COUNT


futures_total = _COUNTER()

futures_cache = deque()#maxlen=MAX_FUTURE_NUM)

# allow for 100,000 sinks for these (so we don't lose them)
futures_cache_sinks = deque()#maxlen=100000)

# assume all functions are pure globally
# Both the import and the construction stay inside the ``try``: either step
# may raise ImportError.
try:
    from dask.cache import Cache
    cache = Cache(1e6)
    cache.register()
except ImportError:
    print("Error cachey not available. Will not be caching")

# make everything pure by default
set_options(delayed_pure=True)

# TAU STUFF Profiler dictionaries
profile_dict = {}
last_run_time = None
import io, os, os.path, sys, time, shutil import pandas as pd import dask.dataframe as dd import numpy as np import datashader as ds import bcolz import feather import fastparquet as fp from datashader.utils import export_image from datashader import transfer_functions as tf from castra import Castra from collections import OrderedDict from dask.cache import Cache Cache(9e9).register() base, x, y = 'data', 'x', 'y' dftype = 'pandas' categories = [] filetypes_storing_categories = {'parq', 'castra'} read = OrderedDict(csv={}, h5={}, castra={}, bcolz={}, parq={}, feather={}) read["csv"]["pandas"] = lambda filepath: pd.read_csv(filepath) read["csv"]["dask"] = lambda filepath: dd.read_csv(filepath) read["h5"]["dask"] = lambda filepath: dd.read_hdf(filepath, base) read["h5"]["pandas"] = lambda filepath: pd.read_hdf(filepath, base) read["castra"]["dask"] = lambda filepath: dd.from_castra(filepath) read["bcolz"]["dask"] = lambda filepath: dd.from_bcolz(filepath,
def test_cache_with_number():
    """A numeric first argument makes ``Cache`` build its own ``cachey.Cache``.

    The number is used as the byte budget and extra keyword arguments
    (here ``limit``) are forwarded to the underlying cache.
    """
    wrapper = Cache(10000, limit=1)

    assert isinstance(wrapper.cache, cachey.Cache)
    assert wrapper.cache.available_bytes == 10000
    assert wrapper.cache.limit == 1
# main(): driver for the inclinometer / wave-gauge processing pipeline.
#
# The definition spans many collapsed lines; the code on this line is kept
# exactly as it was, only this header comment is new.
#
# Outline of what the function does:
#   * Builds `cfg` with cfg_from_args(my_argparser(), new_arg, **kwargs) and
#     returns early when cfg['program'] is falsy.
#   * Makes cfg['in']['db_coefs'] / cfg['in']['path_cruise'] absolute, resolving
#     relative values against cfg['in']['cfgFile'].parent.
#   * Converts the keys of cfg['filter']['min_date'/'max_date'] to int and wraps
#     each mapping in a defaultdict whose default is the value stored under 0.
#   * Registers a dask opportunistic Cache(2e9) globally and, if
#     cfg['program']['dask_scheduler'] is set, configures that scheduler.
#   * Runs the steps enabled through st(step_number, description):
#       1  'Save inclinometer or wavegage data from ASCII to HDF5'
#       2  'Calculate physical parameters and average'
#       3  'Calculate spectrograms'
#       4  'Draw in Veusz'
#       40 'Draw in Veusz by loader-drawer.vsz method'
#       50 'Export from existed Veusz files in dir'
#
# NOTE(review): step 4 reads `dt_custom_s`, which is never assigned inside this
#   function -- confirm it is a module-level name, otherwise those branches
#   raise NameError.
# NOTE(review): `edges_dict` is assigned only in the final `else` branch of
#   step 4 but is read unconditionally in the per-probe loop; it works only
#   while `period`/`dt` stay falsy ('0s').
# NOTE(review): several regex patterns containing `\d` are written as non-raw
#   str/bytes literals (e.g. '(^[\d_]*).*'); newer Python versions warn about
#   such escapes.
def main(new_arg=None, **kwargs): """ :param new_arg: list of strings, command line arguments :kwargs: dicts of dictcts (for each ini section): specified values overwrites ini values """ # global l cfg = cfg_from_args(my_argparser(), new_arg, **kwargs) if not cfg['program']: return # usually error of unrecognized arguments displayed cfg['in']['db_coefs'] = Path(cfg['in']['db_coefs']) for path_field in ['db_coefs', 'path_cruise']: if not cfg['in'][path_field].is_absolute(): cfg['in'][path_field] = ( cfg['in']['cfgFile'].parent / cfg['in'][path_field] ).resolve().absolute() # cfg['in']['cfgFile'].parent / def constant_factory(val): def default_val(): return val return default_val for lim in ('min_date', 'max_date'): # convert keys to int because they must be comparable to probes_int_list (for command line arguments keys are allways strings, in yaml you can set string or int) _ = {int(k): v for k, v in cfg['filter'][lim].items()} cfg['filter'][lim] = defaultdict(constant_factory(_.get(0)), _) l = init_logging(logging, None, None, 'INFO') #l = init_logging(logging, None, cfg['program']['log'], cfg['program']['verbose']) if True: # False. 
Experimental speedup but takes memory from dask.cache import Cache cache = Cache(2e9) # Leverage two gigabytes of memory cache.register() # Turn cache on globally #if __debug__: # # because there was errors on debug when default scheduler used # cfg['program']['dask_scheduler'] = 'synchronous' if cfg['program']['dask_scheduler']: if cfg['program']['dask_scheduler'] == 'distributed': from dask.distributed import Client # cluster = dask.distributed.LocalCluster(n_workers=2, threads_per_worker=1, memory_limit="5.5Gb") client = Client(processes=False) # navigate to http://localhost:8787/status to see the diagnostic dashboard if you have Bokeh installed # processes=False: avoide inter-worker communication for computations releases the GIL (numpy, da.array) # without is error else: if cfg['program']['dask_scheduler'] == 'synchronous': l.warning('using "synchronous" scheduler for debugging') import dask dask.config.set(scheduler=cfg['program']['dask_scheduler']) # Run steps : st.start = cfg['program']['step_start'] st.end = cfg['program']['step_end'] st.go = True if not cfg['out'][ 'db_name']: # set name by 'path_cruise' name or parent if it has digits at start. 
priority for name is "*inclinometer*" for p in (lambda p: [p, p.parent])(cfg['in']['path_cruise']): m = re.match('(^[\d_]*).*', p.name) if m: break cfg['out']['db_name'] = f"{m.group(1).strip('_')}incl.h5" dir_incl = next((d for d in cfg['in']['path_cruise'].glob('*inclinometer*') if d.is_dir()), cfg['in']['path_cruise']) db_path = dir_incl / '_raw' / cfg['out']['db_name'] # --------------------------------------------------------------------------------------------- # def fs(probe, name): # if 'w' in name.lower(): # Baranov's wavegauge electronic # return 10 # 5 # return 5 # if probe < 20 or probe in [23, 29, 30, 32, 33]: # 30 [4, 11, 5, 12] + [1, 7, 13, 30] # return 5 # if probe in [21, 25, 26] + list(range(28, 35)): # return 8.2 # return 4.8 def datetime64_str(time_str: Optional[str] = None) -> np.ndarray: """ Reformat time_str to ISO 8601 or to 'NaT'. Used here for input in funcs that converts str to numpy.datetime64 :param time_str: May be 'NaT' :return: ndarray of strings (tested for 1 element only) formatted by numpy. 
""" return np.datetime_as_string(np.datetime64(time_str, 's')) probes = cfg['in']['probes'] or range( 1, 41) # sets default range, specify your values before line --- raw_root, probe_is_incl = re.subn('INCL_?', 'INKL_', cfg['in']['probes_prefix'].upper()) # some parameters that depends of probe type (indicated by probes_prefix) p_type = defaultdict( # baranov's format constant_factory({ 'correct_fun': partial(correct_txt, mod_file_name=mod_incl_name, sub_str_list=[ b'^\r?(?P<use>20\d{2}(\t\d{1,2}){5}(\t\d{5}){8}).*', b'^.+' ]), 'fs': 10, 'format': 'Baranov', }), { (lambda x: x if x.startswith('incl') else 'incl')(cfg['in']['probes_prefix']): { 'correct_fun': partial( correct_txt, mod_file_name=mod_incl_name, sub_str_list=[ b'^(?P<use>20\d{2}(,\d{1,2}){5}(,\-?\d{1,6}){6}(,\d{1,2}\.\d{2})(,\-?\d{1,3}\.\d{2})).*', b'^.+' ]), 'fs': 5, 'format': 'Kondrashov', }, 'voln': { 'correct_fun': partial( correct_txt, mod_file_name=mod_incl_name, sub_str_list=[ b'^(?P<use>20\d{2}(,\d{1,2}){5}(,\-?\d{1,8})(,\-?\d{1,2}\.\d{2}){2}).*', b'^.+' ]), 'fs': 5, #'tbl_prefix': 'w', 'format': 'Kondrashov', } }) if st(1, 'Save inclinometer or wavegage data from ASCII to HDF5'): # Note: Can not find additional not corrected files for same probe if already have any corrected in search path (move them out if need) i_proc_probe = 0 # counter of processed probes i_proc_file = 0 # counter of processed files # patten to identify only _probe_'s raw data files that need to correct '*INKL*{:0>2}*.[tT][xX][tT]': raw_parent = dir_incl / '_raw' # raw_parent /= if cfg['in']['raw_subdir'] is None: cfg['in']['raw_subdir'] = '' dir_out = raw_parent / re.sub(r'[.\\/ *?]', '_', cfg['in']['raw_subdir']) # sub replaces multilevel subdirs to 1 level that correct_fun() can only make def dt_from_utc_2000(probe): """ Correct time of probes started without time setting. 
Raw date must start from 2000-01-01T00:00""" return ( datetime(year=2000, month=1, day=1) - cfg['in']['time_start_utc'][probe] ) if cfg['in']['time_start_utc'].get(probe) else timedelta(0) # convert cfg['in']['dt_from_utc'] keys to int cfg['in']['dt_from_utc'] = { int(p): v for p, v in cfg['in']['dt_from_utc'].items() } # convert cfg['in']['t_start_utc'] to cfg['in']['dt_from_utc'] and keys to int cfg['in']['dt_from_utc'].update( # overwriting the 'time_start_utc' where already exist {int(p): dt_from_utc_2000(p) for p, v in cfg['in']['time_start_utc'].items()} ) # make cfg['in']['dt_from_utc'][0] be default value cfg['in']['dt_from_utc'] = defaultdict( constant_factory(cfg['in']['dt_from_utc'].pop(0, timedelta(0))), cfg['in']['dt_from_utc']) for probe in probes: raw_found = [] raw_pattern_file = str( Path(glob.escape(cfg['in']['raw_subdir'])) / cfg['in']['raw_pattern'].format(prefix=raw_root, number=probe)) correct_fun = p_type[cfg['in']['probes_prefix']]['correct_fun'] # if not archive: if (not re.match(r'.*(\.zip|\.rar)$', cfg['in']['raw_subdir'], re.IGNORECASE)) and raw_parent.is_dir(): raw_found = list(raw_parent.glob(raw_pattern_file)) if not raw_found: # Check if already have corrected files for probe generated by correct_txt(). 
If so then just use them raw_found = list( dir_out.glob( f"{cfg['in']['probes_prefix']}{probe:0>2}.txt")) if raw_found: print('corrected csv file', [r.name for r in raw_found], 'found') correct_fun = lambda x, dir_out: x elif not cfg['in']['raw_subdir']: continue for file_in in (raw_found or open_csv_or_archive_of_them( raw_parent, binary_mode=False, pattern=raw_pattern_file)): file_in = correct_fun(file_in, dir_out=dir_out) if not file_in: continue tbl = file_in.stem # f"{cfg['in']['probes_prefix']}{probe:0>2}" # tbl = re.sub('^((?P<i>inkl)|w)_0', lambda m: 'incl' if m.group('i') else 'w', # correct name # re.sub('^[\d_]*|\*', '', file_in.stem).lower()), # remove date-prefix if in name csv2h5( [ str( Path(__file__).parent / 'ini' / f"csv_{'inclin' if probe_is_incl else 'wavegage'}_{p_type[cfg['in']['probes_prefix']]['format']}.ini" ), '--path', str(file_in), '--blocksize_int', '50_000_000', # 50Mbt '--table', tbl, '--db_path', str(db_path), # '--log', str(scripts_path / 'log/csv2h5_inclin_Kondrashov.log'), # '--b_raise_on_err', '0', # ? '--b_interact', '0', '--fs_float', str(p_type[cfg['in']['probes_prefix']] ['fs']), #f'{fs(probe, file_in.stem)}', '--dt_from_utc_seconds', str(cfg['in']['dt_from_utc'][probe].total_seconds()), '--b_del_temp_db', '1', ] + (['--csv_specific_param_dict', 'invert_magnitometr: True'] if probe_is_incl else []), **{ 'filter': { 'min_date': cfg['filter']['min_date'].get( probe, np.datetime64(0, 'ns')), 'max_date': cfg['filter']['max_date'].get( probe, np.datetime64('now', 'ns') ), # simple 'now' works in sinchronious mode } }) # Get coefs: l.info( f"Adding coefficients to {db_path}/{tbl} from {cfg['in']['db_coefs']}" ) try: h5copy_coef(cfg['in']['db_coefs'], db_path, tbl) except KeyError as e: # Unable to open object (component not found) l.warning( 'No coefs to copy?' 
) # write some dummy coefficients to can load Veusz patterns: h5copy_coef(None, db_path, tbl, dict_matrices=dict_matrices_for_h5(tbl=tbl)) except OSError as e: l.warning( 'Not found DB with coefs?' ) # write some dummy coefficients to can load Veusz patterns: h5copy_coef(None, db_path, tbl, dict_matrices=dict_matrices_for_h5(tbl=tbl)) i_proc_file += 1 else: print('no', raw_pattern_file, end=', ') i_proc_probe += 1 print('Ok:', i_proc_probe, 'probes,', i_proc_file, 'files processed.') if st(2, 'Calculate physical parameters and average'): kwarg = { 'in': { 'min_date': cfg['filter']['min_date'][0], 'max_date': cfg['filter']['max_date'][0], 'time_range_zeroing': cfg['in']['time_range_zeroing'] }, 'proc': {} } # if aggregate_period_s is None then not average and write to *_proc_noAvg.h5 else loading from that h5 and writing to _proc.h5 if not cfg['out']['aggregate_period_s']: cfg['out']['aggregate_period_s'] = [ None, 2, 600, 7200 if probe_is_incl else 3600 ] if cfg['in']['azimuth_add']: if 'Lat' in cfg['in']['azimuth_add']: # add magnetic declination,° for used coordinates # todo: get time kwarg['proc']['azimuth_add'] = mag_dec( cfg['in']['azimuth_add']['Lat'], cfg['in']['azimuth_add']['Lon'], datetime(2020, 9, 10), depth=-1) else: kwarg['proc']['azimuth_add'] = 0 if 'constant' in cfg['in']['azimuth_add']: # and add constant. 
For example, subtruct declination at the calibration place if it was applied kwarg['proc']['azimuth_add'] += cfg['in']['azimuth_add'][ 'constant'] # add -6.656 to account for calibration in Kaliningrad (mag deg = 6.656°) for aggregate_period_s in cfg['out']['aggregate_period_s']: if aggregate_period_s is None: db_path_in = db_path db_path_out = dir_incl / f'{db_path.stem}_proc_noAvg.h5' else: db_path_in = dir_incl / f'{db_path.stem}_proc_noAvg.h5' db_path_out = dir_incl / f'{db_path.stem}_proc.h5' # or separately: '_proc{aggregate_period_s}.h5' # 'incl.*|w\d*' inclinometers or wavegauges w\d\d # 'incl09': tables_list_regex = f"{cfg['in']['probes_prefix'].replace('voln', 'w')}.*" if cfg['in']['probes']: tables_list_regex += "(?:{})".format('|'.join( '{:0>2}'.format(p) for p in cfg['in']['probes'])) args = [ '../../empty.yml', # all settings are here, so to not print 'using default configuration' we use some existed empty file '--db_path', str(db_path_in), '--tables_list', tables_list_regex, '--aggregate_period', f'{aggregate_period_s}S' if aggregate_period_s else '', '--out.db_path', str(db_path_out), '--table', f'V_incl_bin{aggregate_period_s}' if aggregate_period_s else 'V_incl', '--verbose', 'INFO', #'DEBUG' get many numba messages '--b_del_temp_db', '1', # '--calc_version', 'polynom(force)', # depreshiated # '--chunksize', '20000', # '--not_joined_h5_path', f'{db_path.stem}_proc.h5', ] if aggregate_period_s is None: # proc. parameters (if we have saved proc. 
data then when aggregating we are not processing) # Note: for Baranov's prog 4096 is not suited: args += ([ '--max_dict', 'M[xyz]:4096', # '--time_range_zeroing_dict', "incl19: '2019-11-10T13:00:00', '2019-11-10T14:00:00'\n," # not works - use kwarg # '--time_range_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00' '--split_period', '1D' ] if probe_is_incl else [ '--bad_p_at_bursts_starts_peroiod', '1H', ]) # csv splitted by 1day (default for no avg) else csv is monolith if aggregate_period_s not in cfg['out'][ 'aggregate_period_s_not_to_text']: # , 300, 600]: args += ['--text_path', str(dir_incl / 'text_output')] # If need all data to be combined one after one: # set_field_if_no(kwarg, 'in', {}) # kwarg['in'].update({ # # 'tables': [f'incl{i:0>2}' for i in min_date.keys() if i!=0], # 'dates_min': min_date.values(), # in table list order # 'dates_max': max_date.values(), # # }) # set_field_if_no(kwarg, 'out', {}) # kwarg['out'].update({'b_all_to_one_col': 'True'}) incl_h5clc.main(args, **kwarg) if st(3, 'Calculate spectrograms'): # Can be done at any time after step 1 min_Pressure = 7 # add dict dates_min like {probe: parameter} of incl_clc to can specify param to each probe def raise_ni(): raise NotImplementedError( 'Can not proc probes having different fs in one run: you need to do it separately' ) args = [ Path(incl_h5clc.__file__).with_name( f'incl_h5spectrum{db_path.stem}.yaml'), # if no such file all settings are here '--db_path', str(dir_incl / f'{db_path.stem}_proc_noAvg.h5'), '--tables_list', f"{cfg['in']['probes_prefix']}.*", # inclinometers or wavegauges w\d\d ## 'w02', 'incl.*', # '--aggregate_period', f'{aggregate_period_s}S' if aggregate_period_s else '', '--min_date', datetime64_str(cfg['filter']['min_date'][0]), '--max_date', datetime64_str(cfg['filter']['max_date'] [0]), # '2019-09-09T16:31:00', #17:00:00 '--min_Pressure', f'{min_Pressure}', # '--max_dict', 'M[xyz]:4096', # use if db_path is not ends with _proc_noAvg.h5 i.e. 
need calc velocity '--out.db_path', f"{db_path.stem.replace('incl', cfg['in']['probes_prefix'])}_proc_psd.h5", # '--table', f'psd{aggregate_period_s}' if aggregate_period_s else 'psd', '--fs_float', str(p_type[cfg['in']['probes_prefix']] ['fs']), # f"{fs(probes[0], cfg['in']['probes_prefix'])}", # (lambda x: x == x[0])(np.vectorize(fs)(probes, prefix))).all() else raise_ni() # # '--time_range_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00' # '--verbose', 'DEBUG', # '--chunksize', '20000', '--b_interact', '0', ] if probe_is_incl: args += [ '--split_period', '2H', '--fmin', '0.0004', #0.0004 '--fmax', '1.05' ] else: args += [ '--split_period', '1H', '--dt_interval_minutes', '15', # set this if burst mode to the burst interval '--fmin', '0.0001', '--fmax', '4', #'--min_Pressure', '-1e15', # to not load NaNs ] incl_h5spectrum.main(args) if st(4, 'Draw in Veusz'): pattern_path = dir_incl / r'processed_h5,vsz/201202-210326incl_proc#28.vsz' # r'\201202_1445incl_proc#03_pattern.vsz' #' # db_path.parent / r'vsz_5min\191119_0000_5m_incl19.vsz' # r'vsz_5min\191126_0000_5m_w02.vsz' b_images_only = False # importing in vsz index slices replacing: pattern_str_slice_old = None # Length of not adjacent intervals, s (set None to not allow) # pandas interval in string or tuple representation '1D' of period between intervals and interval to draw period_str = '0s' # '1D' # dt dt_str = '0s' # '5m' file_intervals = None period = to_offset(period_str).delta dt = to_offset(dt_str).delta # timedelta(0) # 60 * 5 if file_intervals and period and dt: # Load starts and assign ends t_intervals_start = pd.read_csv( cfg['in']['path_cruise'] / r'vsz+h5_proc\intervals_selected.txt', converters={ 'time_start': lambda x: np.datetime64(x, 'ns') }, index_col=0).index edges = (pd.DatetimeIndex(t_intervals_start), pd.DatetimeIndex(t_intervals_start + dt_custom_s) ) # np.zeros_like() elif period and dt: # Generate periodic intervals t_interval_start, t_intervals_end = intervals_from_period( 
datetime_range=np.array( [ cfg['filter']['min_date']['0'], cfg['filter']['max_date']['0'] ], # ['2018-08-11T18:00:00', '2018-09-06T00:00:00'], # ['2019-02-11T13:05:00', '2019-03-07T11:30:00'], # ['2018-11-16T15:19', '2018-12-14T14:35'], # ['2018-10-22T12:30', '2018-10-27T06:30:00'], 'datetime64[s]'), period=period) edges = (pd.DatetimeIndex([t_interval_start ]).append(t_intervals_end[:-1]), pd.DatetimeIndex(t_intervals_end)) else: # [min, max] edges for each probe edges_dict = { pr: [cfg['filter']['min_date'][pr], cfg['filter']['max_date'][pr]] for pr in probes } cfg_vp = {'veusze': None} for i, probe in enumerate(probes): # cfg_vp = {'veusze': None} if edges_dict: # custom edges for each probe edges = [pd.DatetimeIndex([t]) for t in edges_dict[probe]] # substr in file to rerplace probe_name_in_pattern (see below). probe_name = f"_{cfg['in']['probes_prefix'].replace('incl', 'i')}{probe:02}" tbl = None # f"/{cfg['in']['probes_prefix']}{probe:02}" # to check probe data exist in db else will not check l.info('Draw %s in Veusz: %d intervals...', probe_name, edges[0].size) # for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]), t_intervals_end), start=1): for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(*edges), start=1): # if i_interval < 23: #<= 0: # TEMPORARY Skip this number of intervals # continue if period and period != dt: t_interval_start = t_interval_end - pd.Timedelta( dt_custom_s, 's') if tbl: try: # skipping absent probes start_end = h5q_interval2coord( db_path=str(db_path), table=tbl, t_interval=(t_interval_start, t_interval_end)) if not len(start_end): break # no data except KeyError: break # device name not in specified range, go to next name pattern_path_new = pattern_path.with_name(''.join([ f'{t_interval_start:%y%m%d_%H%M}', f'_{dt_str}' if dt else '', f'{probe_name}.vsz' ])) # Modify pattern file if not b_images_only: pattern_type, pattern_number = 
re.match( r'.*(incl|w)_proc?#?(\d*).*', pattern_path.name).groups() probe_name_in_pattern = f"_{pattern_type.replace('incl', 'i')}{pattern_number}" def f_replace(line): """ Replace in file 1. probe name 2. slice """ # if i_interval == 1: line, ok = re.subn(probe_name_in_pattern, probe_name, line) if ok and pattern_str_slice_old: # can be only in same line str_slice = '(({:d}, {:d}, None),)'.format( *(start_end + np.int32([-1, 1]))) # bytes(, 'ascii') line = re.sub(pattern_str_slice_old, str_slice, line) return line if not rep_in_file(pattern_path, pattern_path_new, f_replace=f_replace, binary_mode=False): l.warning('Veusz pattern not changed!' ) # may be ok if we need draw pattern # break elif cfg_vp['veusze']: cfg_vp['veusze'].Load(str(pattern_path_new)) elif cfg_vp['veusze']: cfg_vp['veusze'].Load(str(pattern_path_new)) txt_time_range = \ """ "[['{:%Y-%m-%dT%H:%M}', '{:%Y-%m-%dT%H:%M}']]" \ """.format(t_interval_start, t_interval_end) print(f'{i_interval}. {txt_time_range}', end=' ') cfg_vp = veuszPropagate.main( [ Path(veuszPropagate.__file__).parent.with_name( 'veuszPropagate.ini'), # '--data_yield_prefix', '-', # '--path', str(db_path), # if custom loading from db and some source is required '--tables_list', '', # switches to search vsz-files only # f'/{probe_name}', # 181022inclinometers/ \d* '--pattern_path', str(pattern_path_new), # fr'd:\workData\BalticSea\190801inclinometer_Schuka\{probe_name}_190807_1D.vsz', # str(dir_incl / f'{probe_name}_190211.vsz'), #warning: create file with small name # '--before_next', 'restore_config', # '--add_to_filename', f"_{t_interval_start:%y%m%d_%H%M}_{dt}", '--filename_fun', f'lambda tbl: "{pattern_path_new.name}"', '--add_custom_list', f'USEtime__', # f'USEtime{probe_name}', nAveragePrefer', '--add_custom_expressions_list', txt_time_range, # + """ # ", 5" # """, '--b_update_existed', 'True', '--export_pages_int_list', '0', # 0 for all '6, 7, 8', #'1, 2, 3' # '--export_dpi_int', '200', '--export_format', 'jpg', #'emf', 
'--b_interact', '0', '--b_images_only', f'{b_images_only}', '--return', '<embedded_object>', # reuse to not bloat memory '--b_execute_vsz', 'True', '--before_next', 'Close()' # Close() need if b_execute_vsz many files ], veusze=cfg_vp['veusze']) if st(40, f'Draw in Veusz by loader-drawer.vsz method'): # save all vsz files that uses separate code from os import chdir as os_chdir dt_s = 300 cfg['in'][ 'pattern_path'] = db_path.parent / f'vsz_{dt_s:d}s' / '~pattern~.vsz' time_starts = pd.read_csv( db_path.parent / r'processed_h5,vsz' / 'intervals_selected.txt', index_col=0, parse_dates=True, date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%dT%H:%M:%S' )).index pattern_code = cfg['in']['pattern_path'].read_bytes( ) # encoding='utf-8' path_vsz_all = [] for i, probe in enumerate(probes): probe_name = f"{cfg['in']['probes_prefix']}{probe:02}" # table name in db l.info('Draw %s in Veusz: %d intervals...', probe_name, time_starts.size) for i_interval, time_start in enumerate(time_starts, start=1): path_vsz = cfg['in']['pattern_path'].with_name( f"{time_start:%y%m%d_%H%M}_{probe_name.replace('incl','i')}.vsz" ) # copy file to path_vsz path_vsz.write_bytes(pattern_code) # replaces 1st row path_vsz_all.append(path_vsz) os_chdir(cfg['in']['pattern_path'].parent) veuszPropagate.main( [ 'ini/veuszPropagate.ini', '--path', str(cfg['in']['pattern_path'].with_name( '??????_????_*.vsz')), # db_path), '--pattern_path', f"{cfg['in']['pattern_path']}_", # here used to auto get export dir only. 
may not be _not existed file path_ if ['out']['paths'] is provided # '--table_log', f'/{device}/logRuns', # '--add_custom_list', f'{device_veusz_prefix}USE_time_search_runs', # 'i3_USE_timeRange', # '--add_custom_expressions', # """'[["{log_row[Index]:%Y-%m-%dT%H:%M:%S}", "{log_row[DateEnd]:%Y-%m-%dT%H:%M:%S}"]]'""", # '--export_pages_int_list', '1', #'--b_images_only', 'True' '--b_interact', '0', '--b_update_existed', 'True', # todo: delete_overlapped '--b_images_only', 'True', '--load_timeout_s_float', str(cfg['program']['load_timeout_s']) # '--min_time', '2020-07-08T03:35:00', ], **{'out': { 'paths': path_vsz_all }}) if st(50, 'Export from existed Veusz files in dir'): pattern_parent = db_path.parent # r'vsz_5min\191126_0000_5m_w02.vsz'' pattern_path = str(pattern_parent / r'processed_h5,vsz' / '??????incl_proc#[1-9][0-9].vsz') # [0-2,6-9] veuszPropagate.main([ 'ini/veuszPropagate.ini', '--path', pattern_path, '--pattern_path', pattern_path, # '--export_pages_int_list', '1', #'--b_images_only', 'True' '--b_interact', '0', '--b_update_existed', 'True', # todo: delete_overlapped '--b_images_only', 'True', '--load_timeout_s_float', str(cfg['program']['load_timeout_s']), '--b_execute_vsz', 'True', '--before_next', 'Close()' # Close() need if b_execute_vsz many files ])
def main(new_arg=None, **kwargs):
    """
    Run the inclinometer processing steps selected by the configuration.

    The configuration is built by ``cfg_from_args(my_argparser(), new_arg, **kwargs)``.
    Each step below is gated by a ``st(n)`` call; the step range is taken from
    ``cfg['program']['step_start']`` and ``cfg['program']['step_end']``:

    1. find raw text files per probe, pass them through a correction function,
       load them into the HDF5 store with ``csv2h5`` and copy coefficients
       with ``h5copy_coef``;
    2. call ``incl_h5clc.main`` once per entry of ``cfg['out']['aggregate_period_s']``
       (velocity calculation and averaging);
    3. call ``incl_h5spectrum.main`` (spectrograms);
    4. draw selected time intervals in Veusz through ``veuszPropagate.main``.

    :param new_arg: list of strings, command line arguments
    :param kwargs: dicts of dicts (one per ini section): specified values overwrite ini values
    """
    # global l
    cfg = cfg_from_args(my_argparser(), new_arg, **kwargs)

    # Make relative input paths absolute, relative to the directory of the config file.
    # NOTE(review): only 'db_coefs' is converted to Path here; 'path_cruise' is presumably
    # already a Path when returned by cfg_from_args() - confirm.
    cfg['in']['db_coefs'] = Path(cfg['in']['db_coefs'])
    for path_field in ['db_coefs', 'path_cruise']:
        if not cfg['in'][path_field].is_absolute():
            cfg['in'][path_field] = (
                cfg['in']['cfgFile'].parent / cfg['in'][path_field]
            ).resolve().absolute()  # cfg['in']['cfgFile'].parent /

    def constant_factory(val):
        """Return a zero-argument callable that always returns ``val`` (for use as a defaultdict factory)."""
        def default_val():
            return val

        return default_val

    # Wrap the per-probe date limits in defaultdicts: a probe number without its own entry
    # falls back to the value stored under key '0' (or, failing that, under integer key 0).
    for lim in ('min_date', 'max_date'):
        cfg['filter'][lim] = defaultdict(
            constant_factory(cfg['filter'][lim].get(
                '0', cfg['filter'][lim].get(0))), cfg['filter'][lim])

    l = init_logging(logging, None, None, 'INFO')
    #l = init_logging(logging, None, cfg['program']['log'], cfg['program']['verbose'])

    if True:  # False. Experimental speedup but takes memory
        from dask.cache import Cache

        cache = Cache(2e9)  # Leverage two gigabytes of memory
        cache.register()  # Turn cache on globally
    if cfg['program']['dask_scheduler']:
        if cfg['program']['dask_scheduler'] == 'distributed':
            from dask.distributed import Client

            client = Client(
                processes=False
            )  # navigate to http://localhost:8787/status to see the diagnostic dashboard if you have Bokeh installed
            # processes=False: avoids inter-worker communication for computations that release the GIL (numpy, da.array)
            # without it there is an error
        else:
            if cfg['program']['dask_scheduler'] == 'synchronous':
                l.warning('using "synchronous" scheduler for debugging')
            import dask

            dask.config.set(scheduler=cfg['program']['dask_scheduler'])

    # Run steps: configure the range of steps that st() will allow
    st.start = cfg['program']['step_start']
    st.end = cfg['program']['step_end']
    st.go = True

    if not cfg['out'][
            'db_name']:  # set name by 'path_cruise' name or parent if it has digits at start. Priority for name is "*inclinometer*"
        # NOTE(review): the pattern below can match an empty prefix, so re.match() appears to
        # succeed for any name and the loop always stops at the first candidate (path_cruise
        # itself); the parent directory looks unreachable - confirm whether that is intended.
        # NOTE(review): '\d' in a non-raw string literal - consider a raw string.
        for p in (lambda p: [p, p.parent])(cfg['in']['path_cruise']):
            m = re.match('(^[\d_]*).*', p.name)
            if m:
                break
        cfg['out']['db_name'] = f"{m.group(1).strip('_')}incl.h5"
    # NOTE(review): the result of the next statement is discarded; it looks like a leftover.
    cfg['in']['path_cruise'].glob('*inclinometer*')
    # First directory matching '*inclinometer*' inside path_cruise, else path_cruise itself
    dir_incl = next((d for d in cfg['in']['path_cruise'].glob('*inclinometer*') if d.is_dir()), cfg['in']['path_cruise'])
    db_path = dir_incl / cfg['out']['db_name']

    # ---------------------------------------------------------------------------------------------
    def fs(probe, name):
        """Return the sampling frequency passed on as '--fs_float'; currently the constant 5 for every probe."""
        return 5
        # if 'w' in name.lower():  # Baranov's wavegauge electronic
        #     return 5  # 10
        # if probe < 20 or probe in [23, 29, 30, 32, 33]:  # 30 [4, 11, 5, 12] + [1, 7, 13, 30]
        #     return 5
        # if probe in [21, 25, 26] + list(range(28, 35)):
        #     return 8.2
        # return 4.8

    def datetime64_str(time_str: Optional[str] = None) -> np.ndarray:
        """
        Reformat time_str to ISO 8601 or to 'NaT'. Used here to prepare input for functions that
        convert str to numpy.datetime64
        :param time_str: May be 'NaT'
        :return: ndarray of strings (tested for 1 element only) formatted by numpy.
        """
        return np.datetime_as_string(np.datetime64(time_str, 's'))

    probes = cfg['in']['probes'] or range(
        1, 41)  # sets default range, specify your values before line ---
    # raw_root: upper-cased probes prefix with 'INCL'/'INCL_' replaced by 'INKL_';
    # subs_made: number of replacements made. It is used below as the switch between the
    # "Kondrashov" and "Baranov" variants of correction function, ini file and arguments.
    raw_root, subs_made = re.subn('INCL_?', 'INKL_', cfg['in']['probes_prefix'].upper())
    if st(
            1
    ):  # Can not find additional not corrected files for same probe if already have any corrected in search path (move them out if need)
        i_proc_probe = 0  # counter of processed probes
        i_proc_file = 0  # counter of processed files
        # pattern to identify only _probe_'s raw data files that need to correct '*INKL*{:0>2}*.[tT][xX][tT]':

        raw_parent = dir_incl / '_raw'
        dir_out = raw_parent / re.sub(
            r'[.\\/ ]', '_', cfg['in']['raw_subdir']
        )  # sub replaces multilevel subdirs to 1 level that correct_fun() can only make
        raw_parent /= cfg['in']['raw_subdir']
        for probe in probes:
            raw_found = []
            raw_pattern_file = cfg['in']['raw_pattern'].format(prefix=raw_root, number=probe)
            correct_fun = partial(
                correct_kondrashov_txt if subs_made else correct_baranov_txt,
                dir_out=dir_out)
            # if not archive:
            if (not '.zip' in cfg['in']['raw_subdir'].lower() and not '.rar' in cfg['in']['raw_subdir'].lower()) or raw_parent.is_dir():
                raw_found = list(raw_parent.glob(raw_pattern_file))
            if not raw_found:
                # Check if already have corrected files for probe generated by correct_kondrashov_txt().
                # If so then just use them
                raw_found = list(
                    raw_parent.glob(
                        f"{cfg['in']['probes_prefix']}{probe:0>2}.txt"))
                if raw_found:
                    print('corrected csv file', [r.name for r in raw_found], 'found')
                    correct_fun = lambda x: x  # files are already corrected: pass them through unchanged
                elif not cfg['in']['raw_subdir']:
                    continue

            # If nothing was found on disk, fall back to open_csv_or_archive_of_them()
            for file_in in (raw_found or open_csv_or_archive_of_them(
                    raw_parent, binary_mode=False, pattern=raw_pattern_file)):
                file_in = correct_fun(file_in)
                if not file_in:
                    continue
                tbl = f"{cfg['in']['probes_prefix']}{probe:0>2}"
                # tbl = re.sub('^((?P<i>inkl)|w)_0', lambda m: 'incl' if m.group('i') else 'w',  # correct name
                #              re.sub('^[\d_]*|\*', '', file_in.stem).lower()),  # remove date-prefix if in name
                csv2h5(
                    [
                        str(
                            Path(__file__).parent / 'ini' /
                            f"csv_inclin_{'Kondrashov' if subs_made else 'Baranov'}.ini"
                        ),
                        '--path', str(file_in),
                        '--blocksize_int', '50_000_000',  # 50Mbt
                        '--table', tbl,
                        '--db_path', str(db_path),
                        # '--log', str(scripts_path / 'log/csv2h5_inclin_Kondrashov.log'),
                        # '--b_raise_on_err', '0',  # ?
                        '--b_interact', '0',
                        '--fs_float', f'{fs(probe, file_in.stem)}',
                        '--dt_from_utc_seconds', str(cfg['in']['dt_from_utc'].total_seconds()),
                        '--b_del_temp_db', '1',
                    ] + (['--csv_specific_param_dict', 'invert_magnitometr: True']
                         if subs_made else ['--cols_load_list', "yyyy,mm,dd,HH,MM,SS,P,U"]),
                    **{
                        'filter': {
                            'min_date': cfg['filter']['min_date'][probe],
                            'max_date': cfg['filter']['max_date'][probe],
                        }
                    })

                # Get coefs:
                l.info(
                    f"Adding coefficients to {db_path}/{tbl} from {cfg['in']['db_coefs']}"
                )
                try:
                    h5copy_coef(cfg['in']['db_coefs'], db_path, tbl)
                except KeyError as e:  # Unable to open object (component not found)
                    l.warning(
                        'No coefs to copy?'
                    )  # write some dummy coefficients to be able to load Veusz patterns:
                    h5copy_coef(None, db_path, tbl, dict_matrices=dict_matrices_for_h5(tbl=tbl))
                except OSError as e:
                    l.warning(
                        'Not found DB with coefs?'
                    )  # write some dummy coefficients to be able to load Veusz patterns:
                    h5copy_coef(None, db_path, tbl, dict_matrices=dict_matrices_for_h5(tbl=tbl))
                i_proc_file += 1
            else:
                # NOTE(review): this reads as a for-else; the loop body has no `break`, so this
                # message would be printed after every probe, not only when no file was found - confirm.
                print('no', raw_pattern_file, end=', ')
            i_proc_probe += 1
        print('Ok:', i_proc_probe, 'probes,', i_proc_file, 'files processed.')

    # Calculate velocity and average
    if st(2):
        # if aggregate_period_s is None then not average and write to *_proc_noAvg.h5 else loading from that h5 and writing to _proc.h5
        if not cfg['out']['aggregate_period_s']:
            cfg['out']['aggregate_period_s'] = [
                None, 2, 600, 3600 if 'w' in cfg['in']['probes_prefix'] else 7200
            ]

        # NOTE(review): azimuth_add is assigned only inside this `if`; if cfg['in']['azimuth_add']
        # is empty, the kwarg dict built below would raise NameError - confirm a default exists upstream.
        if cfg['in']['azimuth_add']:
            if 'Lat' in cfg['in']['azimuth_add']:
                from datetime import datetime

                # add magnetic declination,° for used coordinates
                # todo: get time
                azimuth_add = mag_dec(cfg['in']['azimuth_add']['Lat'],
                                      cfg['in']['azimuth_add']['Lon'],
                                      datetime(2020, 9, 10),
                                      depth=-1)
            else:
                azimuth_add = 0
            if 'constant' in cfg['in']['azimuth_add']:
                # and add constant. For example, subtract declination at the calibration place if it was applied
                azimuth_add += cfg['in']['azimuth_add'][
                    'constant']  # add -6.65644183° to account for calibration in Kaliningrad

        for aggregate_period_s in cfg['out']['aggregate_period_s']:
            # Without averaging: read the loaded db and write *_proc_noAvg.h5;
            # with averaging: read *_proc_noAvg.h5 and write *_proc.h5
            if aggregate_period_s is None:
                db_path_in = db_path
                db_path_out = db_path.with_name(
                    f'{db_path.stem}_proc_noAvg.h5')
            else:
                db_path_in = db_path.with_name(f'{db_path.stem}_proc_noAvg.h5')
                db_path_out = f'{db_path.stem}_proc.h5'  # or separately: '_proc{aggregate_period_s}.h5'

            args = [
                Path(incl_h5clc.__file__).with_name(
                    f'incl_h5clc_{db_path.stem}.yaml'),  # if no such file all settings are here
                '--db_path', str(db_path_in),
                # ! 'incl.*|w\d*' inclinometers or wavegauges w\d\d  # 'incl09':
                '--tables_list', 'incl.*' if not cfg['in']['probes'] else f"incl.*(?:{'|'.join('{:0>2}'.format(p) for p in cfg['in']['probes'])})",
                '--aggregate_period', f'{aggregate_period_s}S' if aggregate_period_s else '',
                '--out.db_path', str(db_path_out),
                '--table', f'V_incl_bin{aggregate_period_s}' if aggregate_period_s else 'V_incl',
                '--verbose', 'INFO',  # 'DEBUG' gets many numba messages
                '--b_del_temp_db', '1',
                # '--calc_version', 'polynom(force)',  # deprecated
                # '--chunksize', '20000',
                # '--not_joined_h5_path', f'{db_path.stem}_proc.h5',
            ]
            # if aggregate_period_s <= 5:   # [s], do not need split csv for big average interval
            #     args += (['--split_period', '1D'])
            if aggregate_period_s is None:  # proc. parameters (if we have saved proc. data then when aggregating we are not processing)
                args += ([
                    '--max_dict', 'M[xyz]:4096',  # Note: for Baranov's prog 4096 is not suited
                    # '--timerange_zeroing_dict', "incl19: '2019-11-10T13:00:00', '2019-11-10T14:00:00'\n,"  # not works - use kwarg
                    # '--timerange_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00'
                    '--split_period', '1D'
                ] if subs_made else [
                    '--bad_p_at_bursts_starts_peroiod', '1H',
                ])
            # csv split by 1 day (default for no avg) and monolith csv if aggregate_period_s==600
            if aggregate_period_s not in cfg['out'][
                    'aggregate_period_s_not_to_text']:  # , 300, 600]:
                args += ['--text_path', str(db_path.parent / 'text_output')]
            kwarg = {
                'in': {
                    'min_date': cfg['filter']['min_date'][0],
                    'max_date': cfg['filter']['max_date'][0],
                    'timerange_zeroing': cfg['in']['timerange_zeroing'],
                    'azimuth_add': azimuth_add
                }
            }
            # If need all data to be combined one after one:
            # set_field_if_no(kwarg, 'in', {})
            # kwarg['in'].update({
            #
            #         'tables': [f'incl{i:0>2}' for i in min_date.keys() if i!=0],
            #         'dates_min': min_date.values(),  # in table list order
            #         'dates_max': max_date.values(),  #
            #         })
            # set_field_if_no(kwarg, 'out', {})
            # kwarg['out'].update({'b_all_to_one_col': 'True'})

            incl_h5clc.main(args, **kwarg)

    # Calculate spectrograms.
    if st(3):  # Can be done at any time after step 1

        def raise_ni():
            """Raise NotImplementedError for probes with differing fs (referenced only from commented-out code below)."""
            raise NotImplementedError(
                'Can not proc probes having different fs in one run: you need to do it separately'
            )

        args = [
            Path(incl_h5clc.__file__).with_name(
                f'incl_h5spectrum{db_path.stem}.yaml'),  # if no such file all settings are here
            '--db_path', str(db_path.with_name(f'{db_path.stem}_proc_noAvg.h5')),
            '--tables_list', f"{cfg['in']['probes_prefix']}.*",  # inclinometers or wavegauges w\d\d  ## 'w02', 'incl.*',
            # '--aggregate_period', f'{aggregate_period_s}S' if aggregate_period_s else '',
            '--min_date', datetime64_str(cfg['filter']['min_date'][0]),
            '--max_date', datetime64_str(cfg['filter']['max_date']
                                         [0]),  # '2019-09-09T16:31:00',  #17:00:00
            # '--max_dict', 'M[xyz]:4096',  # use if db_path is not ends with _proc_noAvg.h5 i.e. need calc velocity
            '--out.db_path', f"{db_path.stem.replace('incl', cfg['in']['probes_prefix'])}_proc_psd.h5",
            # '--table', f'psd{aggregate_period_s}' if aggregate_period_s else 'psd',
            '--fs_float', f"{fs(probes[0], cfg['in']['probes_prefix'])}",
            # (lambda x: x == x[0])(np.vectorize(fs)(probes, prefix))).all() else raise_ni()
            #
            # '--timerange_zeroing_list', '2019-08-26T04:00:00, 2019-08-26T05:00:00'
            # '--verbose', 'DEBUG',
            # '--chunksize', '20000',
            '--b_interact', '0',
        ]
        if 'w' in cfg['in']['probes_prefix']:
            args += [
                '--split_period', '1H',
                '--dt_interval_minutes', '10',  # burst mode
                '--fmin', '0.0001',
                '--fmax', '4'
            ]
        else:
            args += [
                '--split_period', '2H',
                '--fmin', '0.0004',  # 0.0004
                '--fmax', '1.05'
            ]

        incl_h5spectrum.main(args)

    # Draw in Veusz
    if st(4):
        b_images_only = True  # False
        pattern_path = db_path.parent / r'vsz_5min\191119_0000_5m_incl19.vsz'  # r'vsz_5min\191126_0000_5m_w02.vsz'
        if not b_images_only:
            # Escaped bytes of the slice expression to be replaced in the pattern file
            pattern_bytes_slice_old = re.escape(b'((5828756, 5830223, None),)')

        # Length of not adjacent intervals, s (set None to not allow)
        period = '1D'
        length = '5m'  # period  # '1D'
        dt_custom_s = pd_period_to_timedelta(
            length) if length != period else None  # None  # 60 * 5

        if True:
            # Load starts and assign ends
            t_intervals_start = pd.read_csv(
                cfg['in']['path_cruise'] / r'vsz+h5_proc\intervals_selected.txt',
                converters={
                    'time_start': lambda x: np.datetime64(x, 'ns')
                },
                index_col=0).index
            edges = (pd.DatetimeIndex(t_intervals_start),
                     pd.DatetimeIndex(t_intervals_start + dt_custom_s)
                     )  # np.zeros_like()
        else:
            # Generate periodic intervals (currently disabled by the `if True` above)
            t_interval_start, t_intervals_end = intervals_from_period(
                datetime_range=np.array(
                    [
                        cfg['filter']['min_date']['0'],
                        cfg['filter']['max_date']['0']
                    ],
                    # ['2018-08-11T18:00:00', '2018-09-06T00:00:00'],
                    # ['2019-02-11T13:05:00', '2019-03-07T11:30:00'],
                    # ['2018-11-16T15:19', '2018-12-14T14:35'],
                    # ['2018-10-22T12:30', '2018-10-27T06:30:00'],
                    'datetime64[s]'),
                period=period)
            edges = (pd.DatetimeIndex([t_interval_start
                                       ]).append(t_intervals_end[:-1]),
                     pd.DatetimeIndex(t_intervals_end))

        for i, probe in enumerate(probes):
            probe_name = f"{cfg['in']['probes_prefix']}{probe:02}"  # table name in db
            l.info('Draw %s in Veusz: %d intervals...', probe_name, edges[0].size)
            # for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(pd.DatetimeIndex([t_interval_start]).append(t_intervals_end[:-1]), t_intervals_end), start=1):

            # The value returned by veuszPropagate.main() is fed back in on the next interval
            # via its 'veusze' entry ("reuse to not bloat memory")
            cfg_vp = {'veusze': None}
            for i_interval, (t_interval_start, t_interval_end) in enumerate(zip(*edges), start=1):

                # if i_interval < 23: #<= 0:  # TEMPORARY Skip this number of intervals
                #     continue
                if period != length:
                    t_interval_start = t_interval_end - pd.Timedelta(
                        dt_custom_s, 's')

                try:  # skipping absent probes
                    start_end = h5q_interval2coord(
                        db_path=str(db_path),
                        table=f'/{probe_name}',
                        t_interval=(t_interval_start, t_interval_end))
                    if not len(start_end):
                        break  # no data
                except KeyError:
                    break  # device name not in specified range, go to next name

                pattern_path_new = pattern_path.with_name(
                    f"{t_interval_start:%y%m%d_%H%M}_{length}_{probe_name}.vsz"
                )

                # Modify pattern file
                if not b_images_only:
                    # NOTE(review): '\d' in a non-raw string literal - consider a raw string.
                    probe_name_old = re.match('.*((?:incl|w)\d*).*', pattern_path.name).groups()[0]
                    bytes_slice = bytes(
                        '(({:d}, {:d}, None),)'.format(*(start_end + np.int32([-1, 1]))),
                        'ascii')

                    def f_replace(line):
                        """
                        Replace in a line of the pattern file:
                        1. the old probe name with the current one
                        2. the old slice with the current one (only in lines where the name was replaced)
                        """
                        # if i_interval == 1:
                        line, ok = re.subn(bytes(probe_name_old, 'ascii'), bytes(probe_name, 'ascii'), line)
                        if ok:  # can be only in same line
                            line = re.sub(pattern_bytes_slice_old, bytes_slice, line)
                        return line

                    if not rep_in_file(pattern_path, pattern_path_new, f_replace=f_replace):
                        l.warning('Veusz pattern not changed!')
                        # break
                    elif cfg_vp['veusze']:
                        cfg_vp['veusze'].Load(str(pattern_path_new))
                elif cfg_vp['veusze']:
                    cfg_vp['veusze'].Load(str(pattern_path_new))

                txt_time_range = \
                    """
                    "[['{:%Y-%m-%dT%H:%M}', '{:%Y-%m-%dT%H:%M}']]" \
                    """.format(t_interval_start, t_interval_end)
                print(f'{i_interval}. {txt_time_range}', end=' ')

                cfg_vp = veuszPropagate.main(
                    [
                        Path(veuszPropagate.__file__).parent.with_name(
                            'veuszPropagate.ini'),
                        # '--data_yield_prefix', '-',
                        '--path', str(
                            db_path
                        ),  # use for custom loading from db and some source is required
                        '--tables_list', f'/{probe_name}',  # 181022inclinometers/ \d*
                        '--pattern_path', str(pattern_path_new),
                        # fr'd:\workData\BalticSea\190801inclinometer_Schuka\{probe_name}_190807_1D.vsz',
                        # str(db_path.parent / dir_incl / f'{probe_name}_190211.vsz'),  # warning: create file with small name
                        # '--before_next', 'restore_config',
                        # '--add_to_filename', f"_{t_interval_start:%y%m%d_%H%M}_{length}",
                        '--filename_fun', f'lambda tbl: "{pattern_path_new.name}"',
                        '--add_custom_list', 'USEtime',  # nAveragePrefer',
                        '--add_custom_expressions_list', txt_time_range,
                        # + """
                        # ", 5"
                        # """,
                        '--b_update_existed', 'True',
                        '--export_pages_int_list', '1, 2',  # 0 for all '6, 7, 8',  #'1, 2, 3'
                        # '--export_dpi_int', '200',
                        '--export_format', 'emf',
                        '--b_interact', '0',
                        '--b_images_only', f'{b_images_only}',
                        '--return', '<embedded_object>',  # reuse to not bloat memory
                    ],
                    veusze=cfg_vp['veusze'])
import xarray as xr
import os
import glob
import imp
import sys
import numpy as np
import pandas as pd
import datetime
import json
import time
import utm

###
# Experimental cache option to speed up dask calls:
# register a 4e9-byte dask cache globally before any computation runs.
import cachey
from dask.cache import Cache
cache = Cache(4e9)
cache.register()
###

# Timestamp taken at script start
start_time = time.time()

# Hack to force datetimes to display in GMT/UTC (numpy 1.11.1 has fixed this,
# but other dependent modules (pynio) can't handle numpy 1.11.1).
# time.tzset() makes the process pick up the changed TZ environment variable.
os.environ['TZ'] = 'GMT'
time.tzset()

# Load in config file
#######  load user configurable parameters here    #######
# Check that a user-defined configuration file was given on the command line
if len(sys.argv) == 1:
    # The usage example names the same script as the first half of the message
    # and the parenthesis is closed.
    raise ValueError(
        'Netcdf_to_CHM_forcing.py requires one argument [configuration file] '
        '(i.e. python Netcdf_to_CHM_forcing.py forcing_config.py)'
    )
def main(argv):
    """Read one file, aggregate it twice and print a one-line timing summary.

    Command-line options select the caching mode (``--cache``), the dask
    scheduler (``--distributed``) and whether aggregation ranges are
    recalculated (``--recalc-ranges``). The positional arguments are stored
    on the module-level parameter object ``p``; the module globals ``DEBUG``,
    ``DD_FORCE_LOAD`` and ``DASK_CLIENT`` are set as side effects.

    :param argv: full argument vector; ``argv[0]`` is skipped.
    :return: 0 on success, 1 when ``timed_read`` returns no dataframe.
    :raises ValueError: if ``--distributed`` is given with a dataframe type
        other than ``'dask'``.
    """
    global DEBUG, DD_FORCE_LOAD, DASK_CLIENT

    parser = argparse.ArgumentParser(
        epilog=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('filepath')
    parser.add_argument('dftype')
    parser.add_argument('base')
    parser.add_argument('x')
    parser.add_argument('y')
    parser.add_argument('categories', nargs='+')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Enable increased verbosity and DEBUG messages')
    parser.add_argument(
        '--cache',
        choices=('persist', 'cachey'),
        default=None,
        help=
        'Enable caching: "persist" causes Dask dataframes to force loading into memory; "cachey" uses dask.cache.Cache with a cachesize of {}. Caching is disabled by default'
        .format(int(p.cachesize)))
    parser.add_argument(
        '--distributed',
        action='store_true',
        help=
        'Enable the distributed scheduler instead of the threaded, which is the default.'
    )
    parser.add_argument(
        '--recalc-ranges',
        action='store_true',
        help=
        'Tell datashader to recalculate the ranges on each aggregation, instead of caching them (by default).'
    )
    args = parser.parse_args(argv[1:])

    # Caching mode
    if args.cache is None:
        if args.debug:
            print("DEBUG: Cache disabled", flush=True)
    else:
        if args.cache == 'cachey':
            from dask.cache import Cache
            cache = Cache(p.cachesize)
            cache.register()
        elif args.cache == 'persist':
            DD_FORCE_LOAD = True

        if args.debug:
            print('DEBUG: Cache "{}" mode enabled'.format(args.cache),
                  flush=True)

    # Scheduler selection
    if args.dftype == 'dask' and args.distributed:
        local_cluster = distributed.LocalCluster(n_workers=p.n_workers,
                                                 threads_per_worker=1)
        DASK_CLIENT = distributed.Client(local_cluster)
        if args.debug:
            print('DEBUG: "distributed" scheduler is enabled')
    else:
        if args.dftype != 'dask' and args.distributed:
            raise ValueError(
                '--distributed argument is only available with the dask dataframe type (not pandas)'
            )
        if args.debug:
            print('DEBUG: "threaded" scheduler is enabled')

    filepath = args.filepath
    p.dftype = args.dftype
    p.base = args.base
    p.x = args.x
    p.y = args.y
    p.categories = args.categories
    DEBUG = args.debug

    # In the DEBUG prints below, flush=True is an argument of print().
    # It used to sit inside str.format(), where it was silently ignored.
    if DEBUG:
        print('DEBUG: Memory usage (before read):\t{} MB'.format(
            get_proc_mem()), flush=True)
    df, loadtime = timed_read(filepath, p.dftype)

    if df is None:
        if loadtime == -1:
            print("{:28} {:6} Operation not supported".format(
                filepath, p.dftype),
                  flush=True)
        return 1

    if DEBUG:
        print('DEBUG: Memory usage (after read):\t{} MB'.format(
            get_proc_mem()), flush=True)

    # First aggregation, with two extra positional arguments (5, 5)
    img, aggtime1 = timed_agg(df,
                              filepath,
                              5,
                              5,
                              cache_ranges=(not args.recalc_ranges))
    if DEBUG:
        mem_usage = df.memory_usage(deep=True)
        if p.dftype == 'dask':
            mem_usage = mem_usage.compute()
        print('DEBUG:', mem_usage, flush=True)
        mem_usage_total = mem_usage.sum()
        print('DEBUG: DataFrame size:\t\t\t{} MB'.format(mem_usage_total / 1e6),
              flush=True)
        for colname in df.columns:
            print('DEBUG: column "{}" dtype: {}'.format(
                colname, df[colname].dtype))
        print('DEBUG: Memory usage (after agg1):\t{} MB'.format(
            get_proc_mem()), flush=True)

    # Second aggregation, without the extra positional arguments
    img, aggtime2 = timed_agg(df,
                              filepath,
                              cache_ranges=(not args.recalc_ranges))
    if DEBUG:
        print('DEBUG: Memory usage (after agg2):\t{} MB'.format(
            get_proc_mem()), flush=True)

    in_size = get_size(filepath)
    out_size = get_size(filepath + ".png")

    global_end = time.time()
    print("{:28} {:6} Aggregate1:{:06.2f} ({:06.2f}+{:06.2f}) Aggregate2:{:06.2f} In:{:011d} Out:{:011d} Total:{:06.2f}"\
          .format(filepath, p.dftype, loadtime+aggtime1, loadtime, aggtime1, aggtime2, in_size, out_size, global_end-global_start), flush=True)

    return 0
def main(argv):
    """Read one file, aggregate it twice and print a one-line timing summary.

    The positional arguments are stored on the module-level parameter object
    ``p``; the module globals ``DEBUG`` and ``DD_FORCE_LOAD`` are set as side
    effects.

    :param argv: full argument vector; ``argv[0]`` is skipped.
    :return: 0 on success, 1 when ``timed_read`` returns no dataframe.
    """
    global DEBUG, DD_FORCE_LOAD

    cli = argparse.ArgumentParser(
        epilog=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    # Plain positionals, registered in the order they appear on the command line
    for positional in ('filepath', 'dftype', 'base', 'x', 'y'):
        cli.add_argument(positional)
    cli.add_argument('categories', nargs='+')
    cli.add_argument('--debug',
                     action='store_true',
                     help='Enable increased verbosity and DEBUG messages')
    cache_help = (
        'Enable caching: "persist" causes Dask dataframes to force loading into memory; "cachey" uses dask.cache.Cache with a cachesize of {}. Caching is disabled by default'
        .format(int(p.cachesize)))
    cli.add_argument('--cache',
                     choices=('persist', 'cachey'),
                     default=None,
                     help=cache_help)
    opts = cli.parse_args(argv[1:])

    # Apply the requested caching mode first, then report it
    if opts.cache == 'cachey':
        from dask.cache import Cache
        Cache(p.cachesize).register()
    elif opts.cache == 'persist':
        DD_FORCE_LOAD = True

    if opts.debug:
        if opts.cache is None:
            print("DEBUG: Cache disabled")
        else:
            print('DEBUG: Cache "{}" mode enabled'.format(opts.cache))

    filepath = opts.filepath
    _stem, _ext = os.path.splitext(filepath)

    p.dftype = opts.dftype
    p.base = opts.base
    p.x = opts.x
    p.y = opts.y
    p.categories = opts.categories
    DEBUG = opts.debug

    df, loadtime = timed_read(filepath, p.dftype)

    if df is None:
        # Report the known failure codes; any failure ends the run with status 1
        failure_formats = (
            (-1, "{:28} {:6} Operation not supported"),
            (-2, "{:28} {:6} File does not exist"),
        )
        for code, template in failure_formats:
            if loadtime == code:
                print(template.format(filepath, p.dftype))
                break
        return 1

    _, aggtime1 = timed_agg(df, filepath, 5, 5)
    _, aggtime2 = timed_agg(df, filepath)

    in_size = get_size(filepath)
    out_size = get_size("{}.png".format(filepath))

    global_end = time.time()
    summary = "{:28} {:6} Aggregate1:{:06.2f} ({:06.2f}+{:06.2f}) Aggregate2:{:06.2f} In:{:011d} Out:{:011d} Total:{:06.2f}"
    print(summary.format(filepath, p.dftype, loadtime + aggtime1, loadtime,
                         aggtime1, aggtime2, in_size, out_size,
                         global_end - global_start))

    return 0