def test_progress_function(loop, capsys):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            f = e.submit(lambda: 1)
            g = e.submit(lambda: 2)
            progress([[f], [[g]]], notebook=False)
            check_bar_completed(capsys)

def test_progress_function(client, capsys):
    f = client.submit(lambda: 1)
    g = client.submit(lambda: 2)
    progress([[f], [[g]]], notebook=False)
    check_bar_completed(capsys)

    progress(f)
    check_bar_completed(capsys)

def test_progress_function(loop, capsys):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            f = c.submit(lambda: 1)
            g = c.submit(lambda: 2)
            progress([[f], [[g]]], notebook=False)
            check_bar_completed(capsys)

            progress(f)
            check_bar_completed(capsys)

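# Minimal usage sketch (an assumption, not part of the test suite above): the
# same text-mode progress bar can be driven from an ordinary script against a
# local dask.distributed cluster. `inc` is a stand-in helper defined here.
from dask.distributed import Client, progress


def inc(x):
    return x + 1


if __name__ == '__main__':
    client = Client()                      # local cluster with default settings
    futures = client.map(inc, range(100))  # submit many small tasks
    progress(futures, notebook=False)      # text bar; blocks until tasks finish
    results = client.gather(futures)
    client.close()
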
def test_fast(client):
    L = client.map(inc, range(100))
    L2 = client.map(dec, L)
    L3 = client.map(add, L, L2)
    p = progress(L3, multi=True, complete=True, notebook=True)
    client.sync(p.listen)
    assert set(p._last_response["all"]) == {"inc", "dec", "add"}

def test_fast(client):
    L = client.map(inc, range(100))
    L2 = client.map(dec, L)
    L3 = client.map(add, L, L2)
    p = progress(L3, multi=True, complete=True, notebook=True)
    client.sync(p.listen)
    assert set(p._last_response['all']) == {'inc', 'dec', 'add'}

def test_fast(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(100))
            L2 = c.map(dec, L)
            L3 = c.map(add, L, L2)
            p = progress(L3, multi=True, complete=True, notebook=True)
            sync(loop, p.listen)
            assert set(p._last_response['all']) == {'inc', 'dec', 'add'}

def test_fast(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = e.map(inc, range(100))
            L2 = e.map(dec, L)
            L3 = e.map(add, L, L2)
            p = progress(L3, multi=True, complete=True, notebook=True)
            sync(loop, p.listen)
            assert set(p._last_response['all']) == {'inc', 'dec', 'add'}

def main(update=False, outdir=os.getcwd(), ARGO_DIR="/data/datos/ARGO/data/"):
    argo_files = [
        os.path.join(ARGO_DIR, x)
        for x in os.listdir(ARGO_DIR)
        if x.endswith("_prof.nc")
    ]
    argo_files.sort()
    client = setup_cluster()
    print(client)
    if update:
        updated_data = update_data(argo_files, outdir=outdir)
        updated_data = updated_data.persist()
        progress(updated_data)
        updated_data = check_bio(updated_data.compute().reset_index(drop=True))
        updated_data.to_csv(os.path.join(outdir, "argo_latlon.txt"), index=False)
    else:
        data = merge_data([get_data(argof) for argof in argo_files])
        data = data.persist()
        progress(data)
        data = check_bio(data.compute().reset_index(drop=True))
        data.to_csv(os.path.join(outdir, "argo_latlon.txt"), index=False)

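# Sketch of the persist -> progress -> compute pattern used above, with a
# generic dask array standing in for the project-specific helpers
# (setup_cluster, update_data, merge_data are assumed to exist elsewhere).
import dask.array as da
from dask.distributed import Client, progress

client = Client()
x = da.random.random((8000, 8000), chunks=(1000, 1000))
x = x.persist()               # start computing chunks in the background
progress(x)                   # watch the persisted chunks complete
result = x.mean().compute()   # final reduction on the already-persisted data
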
def test_progress_function_w_kwargs(client, capsys):
    f = client.submit(lambda: 1)
    g = client.submit(lambda: 2)
    progress(f, interval="20ms")
    check_bar_completed(capsys)

def build_index(use_bag=False):
    """
    An experiment is a collection of outputNNN directories. Each directory
    represents the output of a single job submission script. These directories
    are created by the *payu* tool.

    This function creates and/or updates an index cache of variable names
    found in all NetCDF4 files.

    We can also examine the .nc files directly to infer their contents.
    For each .nc file, get variables -> dimensions

    .ncfile, varname, dimensions, chunksize
    """

    # Build index of all NetCDF files found in directories to search.
    ncfiles = []
    runs_available = []
    print('Finding runs on disk...', end='')
    for directoryToSearch in directoriesToSearch:
        # print('Searching {}'.format(directoryToSearch))
        # find all subdirectories
        results = subprocess.check_output(['find', directoryToSearch,
                                           '-maxdepth', '3',
                                           '-type', 'd',
                                           '-name', 'output???'])
        results = [s for s in results.decode('utf-8').split()]
        runs_available.extend(results)

    print('found {} run directories'.format(len(runs_available)))

    # ncfiles.extend(results)
    #
    # results = subprocess.check_output(['find', directoryToSearch, '-name', '*.nc'])
    #
    # print('Found {} .nc files'.format(len(ncfiles)))

    # We can persist this index by storing it in a sqlite database placed in a
    # centrally available location.
    # The use of the `dataset` module hides the details of working with SQL directly.
    # In this database is a single table listing all variables in NetCDF4 seen previously.
    print('Using database {}'.format(database_url))
    print('Querying database...', end='')
    db = dataset.connect(database_url)

    # find list of all run directories
    r = db.query('SELECT DISTINCT rootdir, configuration, experiment, run FROM ncfiles')
    runs_already_seen = [os.path.join(*row.values()) for row in r]
    print('runs already indexed: {}'.format(len(runs_already_seen)))

    runs_to_index = list(set(runs_available) - set(runs_already_seen))
    if len(runs_to_index) == 0:
        print("No new runs found.")
        return

    print('{} new run directories found including...'.format(len(runs_to_index)))
    for i in range(min(3, len(runs_to_index))):
        print(runs_to_index[i])
    if len(runs_to_index) > 3:
        print('...')

    print('Finding files on disk...')
    ncfiles = []
    for run in tqdm.tqdm_notebook(runs_to_index, leave=True):
        results = subprocess.check_output(['find', run, '-name', '*.nc'])
        results = [s for s in results.decode('utf-8').split()]
        ncfiles.extend(results)
        IPython.display.clear_output(wait=True)

    # NetCDF files found on disk not seen before:
    # files_to_add = set(ncfiles) - set(files_already_seen)
    files_to_add = ncfiles

    print('Files found but not yet indexed: {}'.format(len(files_to_add)))

    # For these new files, we can determine their configuration, experiment, and run.
    # Using NetCDF4 to get list of all variables in each file.

    # output* directories
    # match the parent and grandparent directory to configuration/experiment
    find_output = re.compile('(.*)/([^/]*)/([^/]*)/(output\d+)/.*\.nc')
    # determine general pattern for ncfile names
    find_basename_pattern = re.compile(
        '(?P<root>[^\d]+)(?P<index>__\d+_\d+)?(?P<indexice>\.\d+\-\d+)?(?P<ext>\.nc)')

    def index_variables(ncfile):
        matched = find_output.match(ncfile)
        if matched is None:
            return []
        if not os.path.exists(ncfile):
            return []

        basename = os.path.basename(ncfile)
        m = find_basename_pattern.match(basename)
        if m is None:
            basename_pattern = basename
        else:
            basename_pattern = (m.group('root')
                                + ('__\d+_\d+' if m.group('index') else '')
                                + ('.\d+-\d+' if m.group('indexice') else '')
                                + m.group('ext'))

        try:
            with netCDF4.Dataset(ncfile) as ds:
                ncvars = [{'ncfile': ncfile,
                           'rootdir': matched.group(1),
                           'configuration': matched.group(2),
                           'experiment': matched.group(3),
                           'run': matched.group(4),
                           'basename': basename,
                           'basename_pattern': basename_pattern,
                           'variable': v.name,
                           'dimensions': str(v.dimensions),
                           'chunking': str(v.chunking()),
                           } for v in ds.variables.values()]
        except:
            print('Exception occurred while trying to read {}'.format(ncfile))
            ncvars = []

        return ncvars

    if len(files_to_add) == 0:
        print("No new .nc files found.")
        return True

    print('Indexing new .nc files...')

    if use_bag:
        with distributed.Client() as client:
            bag = dask.bag.from_sequence(files_to_add)
            bag = bag.map(index_variables).flatten()

            futures = client.compute(bag)
            progress(futures, notebook=False)

            ncvars = futures.result()
    else:
        ncvars = []
        for file_to_add in tqdm.tqdm_notebook(files_to_add, leave=False):
            ncvars.extend(index_variables(file_to_add))
        IPython.display.clear_output()

    print('')
    print('Found {} new variables'.format(len(ncvars)))

    print('Saving results in database...')
    db['ncfiles'].insert_many(ncvars)

    print('Indexing complete.')
    return True

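# Self-contained sketch of the use_bag branch above: map an indexing function
# over filenames with dask.bag, flatten the per-file record lists, and track
# the computation with a text progress bar. `fake_index` and the file names
# are hypothetical stand-ins for index_variables and real .nc paths.
import dask.bag
import distributed
from distributed import progress


def fake_index(path):
    # pretend each file contributes a single variable record
    return [{'ncfile': path, 'variable': 'temp'}]


files = ['a.nc', 'b.nc', 'c.nc']
with distributed.Client() as client:
    bag = dask.bag.from_sequence(files).map(fake_index).flatten()
    future = client.compute(bag)        # a single Future for the whole bag
    progress(future, notebook=False)
    records = future.result()
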
def main(lat, lon, time):
    client = setup_cluster()
    print(client)
    GODAS_GLOB = "/data/users/grivera/GODAS/clim/daily/*.godas_dayclim.nc"
    argodb = pd.read_csv(paths["ARGO_DB"], parse_dates=[0])
    grid = np.arange(0, 2001, 2.0)
    newdf = filter_data(argodb, lat, lon, time)
    newdf.loc[:, "fname"] = newdf["date"].apply(
        get_fn, args=(paths["ARGO_DATA"], "{:%Y%m%d}_prof.nc")
    )
    newdf = newdf.sort_values("date")
    print(
        "\nLoading GODAS climatology from {:%Y-%m-%d} to {:%Y-%m-%d}".format(
            newdf.date.iloc[0], newdf.date.iloc[-1]
        )
    )
    godas_clim = xr.open_mfdataset(GODAS_GLOB, parallel=True).pottmp - 273
    godas_clim = (
        godas_clim.sel(
            time=slice(newdf.date.iloc[0], newdf.date.iloc[-1]),
            lat=slice(lat[0], lat[1]),
            lon=slice(lon[0], lon[1]),
        )
        .mean(dim=["lat", "lon"])
        .persist()
    )
    progress(godas_clim)
    godas_clim = godas_clim.compute()
    print("\nDone \n")
    print("Computing region average")
    new_data = dastack(
        [
            get_temp_anom(r.fname, r.nprof, godas_clim.sel(time=r.date))
            for r in newdf.itertuples()
        ]
    )
    new_data = new_data.persist()
    progress(new_data)
    new_data = new_data.compute()
    print("\nDone\n")
    save_nc(
        newdf,
        new_data[:, 0, :],
        "argo_tanom",
        "tanom",
        lon,
        lat,
        grid,
        paths["ARGO_PATCH_OUT"],
    )
    save_nc(
        newdf,
        new_data[:, 1, :],
        "argo_temp",
        "temp",
        lon,
        lat,
        grid,
        paths["ARGO_PATCH_OUT"],
    )

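# Isolated sketch of the climatology-loading step above: with a distributed
# Client active, parallel=True lets xarray open the files as dask-backed
# arrays, so the reduction can be persisted and watched with progress()
# before the final compute(). The glob path, variable name, and dimensions
# are placeholders, not the project's real configuration.
import xarray as xr
from dask.distributed import Client, progress

client = Client()
clim = xr.open_mfdataset('/path/to/clim/*.nc', parallel=True)['pottmp'] - 273
clim = clim.mean(dim=['lat', 'lon']).persist()
progress(clim)
clim = clim.compute()
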
def build_index(use_bag=False, careful=False, expt_dir_list=None):
    """
    An experiment is a collection of outputNNN directories. Each directory
    represents the output of a single job submission script. These directories
    are created by the *payu* tool.

    This function creates and/or updates an index cache of variable names
    found in all NetCDF4 files.

    Optional arguments:

        careful: if True, use a slower but more thorough method. Use this if
            some files are missing from the index. Default is False.

        use_bag: Default is False.

        expt_dir_list: list of experiment directories in which
            cosima-cookbook.db will be created/updated. At present this is
            used only in get_nc_variable. You must have write access to all
            these directories. If expt_dir_list=None (the default), a central
            database is used.

    We can also examine the .nc files directly to infer their contents.
    For each .nc file, get variables -> dimensions

    .ncfile, varname, dimensions, chunksize
    """

    # Build index of all NetCDF files found in directories to search.
    # dir_dbs dict keys are directories to search, values are corresponding db url
    if expt_dir_list is None:  # one db for all expts
        dir_dbs = {d: database_url for d in directoriesToSearch}
        maxdepth = 3
    else:  # a separate db at root of each expt dir
        if not isinstance(expt_dir_list, list):
            expt_dir_list = [expt_dir_list]
        dir_dbs = {d: database_url_from_path(d) for d in expt_dir_list}
        maxdepth = 1

    prev_db_url = ''
    for directoryToSearch, db_url in dir_dbs.items():
        print('Finding runs in {}... '.format(directoryToSearch), end='')
        ncfiles = []
        runs_available = []
        # find all output subdirectories
        try:
            results = subprocess.check_output([
                'find', directoryToSearch,
                '-maxdepth', str(maxdepth),
                '-type', 'd',
                '-name', 'output???'
            ])
            results = [s for s in results.decode('utf-8').split()]
            runs_available.extend(results)
        except:
            print('{0} exception occurred while finding output directories in {1}'
                  .format(sys.exc_info()[0], directoryToSearch))
        runs_available = set(runs_available)
        print('found {} run directories'.format(len(runs_available)))

        # ncfiles.extend(results)
        #
        # results = subprocess.check_output(['find', directoryToSearch, '-name', '*.nc'])
        #
        # print('Found {} .nc files'.format(len(ncfiles)))

        # We can persist this index by storing it in a sqlite database placed in a
        # centrally available location.
        # The use of the `dataset` module hides the details of working with SQL directly.
        # In this database is a single table listing all variables in NetCDF4 seen previously.
        print('Using database {}'.format(db_url))
        db = dataset.connect(db_url)
        db['ncfiles']  # db.create_table('ncfiles') # has no effect if 'ncfiles' table already exists

        if not (db_url == prev_db_url):  # avoid repeating db query when expt_dir_list is None
            runs_already_seen = set([])
            files_already_seen = set([])
            if db['ncfiles'].count() > 0:  # this also creates 'ncfiles' table if db is new
                if careful:  # filter by filename rather than dir
                    print('Querying database for files... ', end='')
                    rf = db.query('SELECT DISTINCT ncfile FROM ncfiles')
                    files_already_seen = set([row['ncfile'] for row in rf])
                    # files_already_seen = dict.fromkeys(rf)  # use a dict for fast lookup
                    # files_already_seen = {n['ncfile']: None for n in rf}  # use a dict for fast lookup
                    print('files already indexed: {}'.format(len(files_already_seen)))
                else:  # filter by dir rather than filename
                    # BUG: this can skip dirs even if they contain un-indexed .nc files - see https://github.com/OceansAus/cosima-cookbook/issues/95
                    print('Querying database for directories... ', end='')
                    # find list of all run directories
                    r = db.query('SELECT DISTINCT rootdir, configuration, experiment, run FROM ncfiles')
                    runs_already_seen = set([os.path.join(*row.values()) for row in r])
                    print('run directories already indexed: {}'.format(len(runs_already_seen)))
            prev_db_url = db_url

        runs_to_index = list(runs_available - runs_already_seen)
        if len(runs_to_index) == 0:
            print('No new runs found in {}'.format(directoryToSearch))
            continue

        # print('{} new run directories found including... '.format(len(runs_to_index)))
        # for i in range(min(3, len(runs_to_index))):
        #     print(runs_to_index[i])
        # if len(runs_to_index) > 3:
        #     print('...')

        print('Finding files in {} run directories... '.format(len(runs_to_index)))
        ncfiles = []
        for run in tqdm.tqdm_notebook(runs_to_index, leave=False):
            try:
                results = subprocess.check_output(['find', run, '-name', '*.nc'])
                results = [s for s in results.decode('utf-8').split()]
                ncfiles.extend(results)
            except:
                print('{0} exception occurred while finding *.nc in {1}'.format(
                    sys.exc_info()[0], run))
        # IPython.display.clear_output(wait=True)

        # NetCDF files found on disk not seen before:
        files_to_add = set(ncfiles) - files_already_seen

        print('Files found but not yet indexed: {}'.format(len(files_to_add)))

        # For these new files, we can determine their configuration, experiment, and run.
        # Using NetCDF4 to get list of all variables in each file.

        # output* directories
        # match the parent and grandparent directory to configuration/experiment
        find_output = re.compile('(.*)/([^/]*)/([^/]*)/(output\d+)/.*\.nc')
        # determine general pattern for ncfile names
        find_basename_pattern = re.compile(
            '(?P<root>[^\d]+)(?P<index>__\d+_\d+)?(?P<indexice>\.\d+\-\d+)?(?P<ext>\.nc)')

        def index_variables(ncfile):
            matched = find_output.match(ncfile)
            if matched is None:
                return []
            if not os.path.exists(ncfile):
                return []
            # TODO: also exit here if ncfile is already in database - use [NOT] EXISTS ??
            # but this is super slow
            # module load sqlite
            # sqlite3 /g/data3/hh5/tmp/cosima/cosima-cookbook/cosima-cookbook.db
            # select * from ncfiles where ncfile == '/g/data3/hh5/tmp/cosima/access-om2-025/025deg_jra55v13_ryf8485_KDS50/output024/ocean/ocean_scalar.nc';

            basename = os.path.basename(ncfile)
            m = find_basename_pattern.match(basename)
            if m is None:
                basename_pattern = basename
            else:
                basename_pattern = (m.group('root')
                                    + ('__\d+_\d+' if m.group('index') else '')
                                    + ('.\d+-\d+' if m.group('indexice') else '')
                                    + m.group('ext'))

            try:
                with netCDF4.Dataset(ncfile) as ds:
                    ncvars = [{
                        'ncfile': ncfile,
                        'rootdir': matched.group(1),
                        'configuration': matched.group(2),
                        'experiment': matched.group(3),
                        'run': matched.group(4),
                        'basename': basename,
                        'basename_pattern': basename_pattern,
                        'variable': v.name,
                        'dimensions': str(v.dimensions),
                        'chunking': str(v.chunking()),
                    } for v in ds.variables.values()]
            except:
                print('{0} exception occurred while trying to read {1}'.format(
                    sys.exc_info()[0], ncfile))
                ncvars = []

            return ncvars

        if len(files_to_add) == 0:
            print('No new .nc files found in {}'.format(directoryToSearch))
            continue

        print('Indexing {} new .nc files... '.format(len(files_to_add)))

        if use_bag:
            with distributed.Client() as client:
                bag = dask.bag.from_sequence(files_to_add)
                bag = bag.map(index_variables).flatten()

                futures = client.compute(bag)
                progress(futures, notebook=False)

                ncvars = futures.result()
        else:
            ncvars = []
            for file_to_add in tqdm.tqdm_notebook(files_to_add, leave=False):
                ncvars.extend(index_variables(file_to_add))
            IPython.display.clear_output()

        print('')
        print('Indexed {} variables found in new files'.format(len(ncvars)))

        print('Saving results in database {}... '.format(db_url))
        db['ncfiles'].insert_many(ncvars)

    print('Indexing complete.')
    return True

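# Hypothetical invocation sketch for the function above (the path is a
# placeholder; module-level globals such as directoriesToSearch and
# database_url are assumed to be configured elsewhere):
build_index(use_bag=True)                                   # central database, dask.bag code path
build_index(careful=True, expt_dir_list=['/path/to/expt'])  # per-experiment database, thorough file check
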