def test_progress_function(loop, capsys):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            f = e.submit(lambda: 1)
            g = e.submit(lambda: 2)

            progress([[f], [[g]]], notebook=False)
            check_bar_completed(capsys)
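
The tests here all exercise the same basic pattern: submit work through a distributed client, then call progress() on the resulting futures. A minimal standalone sketch of that pattern (the scheduler address is illustrative; any running dask.distributed scheduler will do):

from dask.distributed import Client, progress

client = Client('tcp://127.0.0.1:8786')       # hypothetical scheduler address
future = client.submit(lambda x: x + 1, 41)   # submit a task, get a Future back
progress(future, notebook=False)              # print a text progress bar until it completes
print(future.result())                        # 42
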
def test_progress_function(client, capsys):
    f = client.submit(lambda: 1)
    g = client.submit(lambda: 2)

    progress([[f], [[g]]], notebook=False)
    check_bar_completed(capsys)

    progress(f)
    check_bar_completed(capsys)
def test_progress_function(loop, capsys):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            f = c.submit(lambda: 1)
            g = c.submit(lambda: 2)

            progress([[f], [[g]]], notebook=False)
            check_bar_completed(capsys)

            progress(f)
            check_bar_completed(capsys)
Example 6
def test_fast(client):
    L = client.map(inc, range(100))
    L2 = client.map(dec, L)
    L3 = client.map(add, L, L2)
    p = progress(L3, multi=True, complete=True, notebook=True)
    client.sync(p.listen)
    assert set(p._last_response["all"]) == {"inc", "dec", "add"}
Example 8
def test_fast(loop):
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            L = c.map(inc, range(100))
            L2 = c.map(dec, L)
            L3 = c.map(add, L, L2)
            p = progress(L3, multi=True, complete=True, notebook=True)
            sync(loop, p.listen)
            assert set(p._last_response['all']) == {'inc', 'dec', 'add'}
Example 9
def test_fast(loop):
    with cluster() as (s, [a, b]):
        with Executor(('127.0.0.1', s['port']), loop=loop) as e:
            L = e.map(inc, range(100))
            L2 = e.map(dec, L)
            L3 = e.map(add, L, L2)
            p = progress(L3, multi=True, complete=True, notebook=True)
            sync(loop, p.listen)
            assert set(p._last_response['all']) == {'inc', 'dec', 'add'}
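
The inc, dec and add helpers used in the test_fast examples are plain functions; the test suites these snippets come from provide equivalents (the exact module varies by version). Minimal definitions for running them standalone:

def inc(x):
    return x + 1

def dec(x):
    return x - 1

def add(x, y):
    return x + y
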
Example 11
def main(update=False, outdir=os.getcwd(), ARGO_DIR="/data/datos/ARGO/data/"):
    argo_files = [
        os.path.join(ARGO_DIR, x) for x in os.listdir(ARGO_DIR)
        if x.endswith("_prof.nc")
    ]
    argo_files.sort()
    client = setup_cluster()
    print(client)
    if update:
        updated_data = update_data(argo_files, outdir=outdir)
        updated_data = updated_data.persist()
        progress(updated_data)
        updated_data = check_bio(updated_data.compute().reset_index(drop=True))
        updated_data.to_csv(os.path.join(outdir, "argo_latlon.txt"),
                            index=False)
    else:
        data = merge_data([get_data(argof) for argof in argo_files])
        data = data.persist()
        progress(data)
        data = check_bio(data.compute().reset_index(drop=True))
        data.to_csv(os.path.join(outdir, "argo_latlon.txt"), index=False)
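
main() above follows the usual dask workflow: persist the collection, show a progress bar while it computes in the background, then call compute() to collect the result. A minimal sketch of that idiom with a hypothetical dask DataFrame:

import dask.dataframe as dd
from dask.distributed import Client, progress

client = Client()                  # local cluster, for illustration
ddf = dd.read_csv('data-*.csv')    # hypothetical input files
ddf = ddf.persist()                # kick off computation in the background
progress(ddf)                      # display a progress bar while tasks run
result = ddf.compute()             # block until finished and collect the result
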
def test_progress_function_w_kwargs(client, capsys):
    f = client.submit(lambda: 1)
    g = client.submit(lambda: 2)

    progress(f, interval="20ms")
    check_bar_completed(capsys)
Example 13
def build_index(use_bag=False):
    """
    An experiment is a collection of outputNNN directories.  Each directory
    represents the output of a single job submission script. These directories
    are created by the *payu* tool.

    This function creates and/or updates an index cache of the variable names
    found in all NetCDF4 files.

    The .nc files are examined directly to infer their contents: for each
    .nc file the index records

        ncfile, variable, dimensions, chunking

    """

    # Build index of all NetCDF files found in directories to search.

    ncfiles = []
    runs_available = []

    print('Finding runs on disk...', end='')
    for directoryToSearch in directoriesToSearch:
        #print('Searching {}'.format(directoryToSearch))

        # find all subdirectories
        results = subprocess.check_output(['find', directoryToSearch, '-maxdepth', '3', '-type', 'd',
            '-name', 'output???'])

        results = [s for s in results.decode('utf-8').split()]
        runs_available.extend(results)
    print('found {} run directories'.format(len(runs_available)))

    #ncfiles.extend(results)
#
#    results = subprocess.check_output(['find', directoryToSearch, '-name', '*.nc'])
#
#    print('Found {} .nc files'.format(len(ncfiles)))

    # We can persist this index by storing it in a sqlite database placed in a centrally available location.

    # The use of the `dataset` module hides the details of working with SQL directly.

    # In this database is a single table listing all variables in NetCDF4 seen previously.
    print('Using database {}'.format(database_url))
    print('Querying database...', end='')

    db = dataset.connect(database_url)

    # find list of all run directories
    r = db.query('SELECT DISTINCT rootdir, configuration, experiment, run FROM ncfiles')

    runs_already_seen = [os.path.join(*row.values())
                         for row in r]

    print('runs already indexed: {}'.format(len(runs_already_seen)))

    runs_to_index = list(set(runs_available) - set(runs_already_seen))

    if len(runs_to_index) == 0:
        print("No new runs found.")
        return

    print('{} new run directories found including...'.format(len(runs_to_index)))

    for i in range(min(3, len(runs_to_index))):
        print(runs_to_index[i])
    if len(runs_to_index) > 3:
        print('...')

    print('Finding files on disk...')
    ncfiles = []
    for run in tqdm.tqdm_notebook(runs_to_index, leave=True):
        results = subprocess.check_output(['find', run, '-name', '*.nc'])
        results = [s for s in results.decode('utf-8').split()]

        ncfiles.extend(results)

    IPython.display.clear_output(wait=True)
    
    # NetCDF files found on disk not seen before:
    #files_to_add = set(ncfiles) - set(files_already_seen)

    files_to_add = ncfiles

    print('Files found but not yet indexed: {}'.format(len(files_to_add)))

    # For these new files, we can determine their configuration, experiment, and run.
    # Using NetCDF4 to get list of all variables in each file.

    # output* directories
    # match the parent and grandparent directory to configuration/experiment
    find_output = re.compile(r'(.*)/([^/]*)/([^/]*)/(output\d+)/.*\.nc')

    # determine general pattern for ncfile names
    find_basename_pattern = re.compile(r'(?P<root>[^\d]+)(?P<index>__\d+_\d+)?(?P<indexice>\.\d+\-\d+)?(?P<ext>\.nc)')

    def index_variables(ncfile):

        matched = find_output.match(ncfile)
        if matched is None:
            return []

        if not os.path.exists(ncfile):
            return []

        basename = os.path.basename(ncfile)
        m = find_basename_pattern.match(basename)
        if m is None:
            basename_pattern = basename
        else:
            basename_pattern = (m.group('root')
                                + (r'__\d+_\d+' if m.group('index') else '')
                                + (r'.\d+-\d+' if m.group('indexice') else '')
                                + m.group('ext'))

        try:
            with netCDF4.Dataset(ncfile) as ds:
                ncvars = [ {'ncfile': ncfile,
                   'rootdir': matched.group(1),
                   'configuration': matched.group(2),
                   'experiment' : matched.group(3),
                   'run' : matched.group(4),
                   'basename' : basename,
                   'basename_pattern' : basename_pattern,
                   'variable' : v.name,
                   'dimensions' : str(v.dimensions),
                   'chunking' : str(v.chunking()),
                   } for v in ds.variables.values()]
        except:
            print('Exception occurred while trying to read {}'.format(ncfile))
            ncvars = []

        return ncvars

    if len(files_to_add) == 0:
        print("No new .nc files found.")
        return True

    print('Indexing new .nc files...')

    if use_bag:
        with distributed.Client() as client:
            bag = dask.bag.from_sequence(files_to_add)
            bag = bag.map(index_variables).flatten()

            futures = client.compute(bag)
            progress(futures, notebook=False)

            ncvars = futures.result()
    else:
        ncvars = []
        for file_to_add in tqdm.tqdm_notebook(files_to_add, leave=False):
            ncvars.extend(index_variables(file_to_add))
        IPython.display.clear_output()
        
    print('')
    print('Found {} new variables'.format(len(ncvars)))

    print('Saving results in database...')
    db['ncfiles'].insert_many(ncvars)

    print('Indexing complete.')

    return True
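
Once build_index() has run, the resulting table can be queried with the same dataset module used above. A minimal sketch, assuming the same database_url and table layout:

import dataset

db = dataset.connect(database_url)
# list the distinct variable names that have been indexed so far
for row in db.query('SELECT DISTINCT variable FROM ncfiles'):
    print(row['variable'])
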
Example 14
def main(lat, lon, time):
    client = setup_cluster()
    print(client)
    GODAS_GLOB = "/data/users/grivera/GODAS/clim/daily/*.godas_dayclim.nc"

    argodb = pd.read_csv(paths["ARGO_DB"], parse_dates=[0])
    grid = np.arange(0, 2001, 2.0)
    newdf = filter_data(argodb, lat, lon, time)
    newdf.loc[:, "fname"] = newdf["date"].apply(
        get_fn, args=(paths["ARGO_DATA"], "{:%Y%m%d}_prof.nc")
    )
    newdf = newdf.sort_values("date")
    print(
        "\nLoading GODAS climatology from {:%Y-%m-%d} to {:%Y-%m-%d}".format(
            newdf.date.iloc[0], newdf.date.iloc[-1]
        )
    )
    godas_clim = xr.open_mfdataset(GODAS_GLOB, parallel=True).pottmp - 273
    godas_clim = (
        godas_clim.sel(
            time=slice(newdf.date.iloc[0], newdf.date.iloc[-1]),
            lat=slice(lat[0], lat[1]),
            lon=slice(lon[0], lon[1]),
        )
        .mean(dim=["lat", "lon"])
        .persist()
    )
    progress(godas_clim)
    godas_clim = godas_clim.compute()
    print("\nDone \n")
    print("Computing region average")
    new_data = dastack(
        [
            get_temp_anom(r.fname, r.nprof, godas_clim.sel(time=r.date))
            for r in newdf.itertuples()
        ]
    )
    new_data = new_data.persist()
    progress(new_data)
    new_data = new_data.compute()
    print("\nDone\n")
    save_nc(
        newdf,
        new_data[:, 0, :],
        "argo_tanom",
        "tanom",
        lon,
        lat,
        grid,
        paths["ARGO_PATCH_OUT"],
    )
    save_nc(
        newdf,
        new_data[:, 1, :],
        "argo_temp",
        "temp",
        lon,
        lat,
        grid,
        paths["ARGO_PATCH_OUT"],
    )
def build_index(use_bag=False, careful=False, expt_dir_list=None):
    """
    An experiment is a collection of outputNNN directories.  Each directory
    represents the output of a single job submission script. These directories
    are created by the *payu* tool.

    This function creates and/or updates an index cache of the variable names
    found in all NetCDF4 files.

    Optional arguments:

        careful: if True, use a slower but more thorough method. Use this if
             some files are missing from the index. Default is False.

        use_bag: if True, index the files in parallel using a dask bag and a
             distributed client. Default is False.

        expt_dir_list: list of experiment directories in which cosima-cookbook.db 
             will be created/updated. At present this is used only in get_nc_variable.
             You must have write access to all these directories.
             If expt_dir_list=None (the default), a central database is used.

    The .nc files are examined directly to infer their contents: for each
    .nc file the index records

        ncfile, variable, dimensions, chunking

    """

    # Build index of all NetCDF files found in directories to search.

    # dir_dbs dict keys are directories to search, values are corresponding db url
    if expt_dir_list is None:  # one db for all expts
        dir_dbs = {d: database_url for d in directoriesToSearch}
        maxdepth = 3
    else:  # a separate db at root of each expt dir
        if not isinstance(expt_dir_list, list):
            expt_dir_list = [expt_dir_list]
        dir_dbs = {d: database_url_from_path(d) for d in expt_dir_list}
        maxdepth = 1

    prev_db_url = ''
    for directoryToSearch, db_url in dir_dbs.items():
        print('Finding runs in {}... '.format(directoryToSearch), end='')
        ncfiles = []
        runs_available = []
        # find all output subdirectories
        try:
            results = subprocess.check_output([
                'find', directoryToSearch, '-maxdepth',
                str(maxdepth), '-type', 'd', '-name', 'output???'
            ])
            results = [s for s in results.decode('utf-8').split()]
            runs_available.extend(results)
        except:
            print(
                '{0} exception occurred while finding output directories in {1}'
                .format(sys.exc_info()[0], directoryToSearch))

        runs_available = set(runs_available)
        print('found {} run directories'.format(len(runs_available)))

        #ncfiles.extend(results)
        #
        #    results = subprocess.check_output(['find', directoryToSearch, '-name', '*.nc'])
        #
        #    print('Found {} .nc files'.format(len(ncfiles)))

        # We can persist this index by storing it in a sqlite database placed in a centrally available location.

        # The use of the `dataset` module hides the details of working with SQL directly.

        # In this database is a single table listing all variables in NetCDF4 seen previously.
        print('Using database {}'.format(db_url))
        db = dataset.connect(db_url)
        db['ncfiles']
        # db.create_table('ncfiles')  # has no effect if 'ncfiles' table already exists

        if db_url != prev_db_url:  # avoid repeating db query when expt_dir_list is None
            runs_already_seen = set()
            files_already_seen = set()
            if db['ncfiles'].count() > 0:  # this also creates 'ncfiles' table if db is new
                if careful:  # filter by filename rather than dir
                    print('Querying database for files... ', end='')
                    rf = db.query('SELECT DISTINCT ncfile FROM ncfiles')
                    files_already_seen = set([row['ncfile'] for row in rf])
                    # files_already_seen = dict.fromkeys(rf) # use a dict for fast lookup
                    # files_already_seen = {n['ncfile']: None for n in rf}  # use a dict for fast lookup
                    print('files already indexed: {}'.format(
                        len(files_already_seen)))
                else:  # filter by dir rather than filename
                    # BUG: this can skip dirs even if they contain un-indexed .nc files - see https://github.com/OceansAus/cosima-cookbook/issues/95
                    print('Querying database for directories... ', end='')
                    # find list of all run directories
                    r = db.query(
                        'SELECT DISTINCT rootdir, configuration, experiment, run FROM ncfiles'
                    )
                    runs_already_seen = set(
                        [os.path.join(*row.values()) for row in r])
                    print('run directories already indexed: {}'.format(
                        len(runs_already_seen)))
        prev_db_url = db_url

        runs_to_index = list(runs_available - runs_already_seen)
        if len(runs_to_index) == 0:
            print('No new runs found in {}'.format(directoryToSearch))
            continue
        #
        # print('{} new run directories found including... '.format(len(runs_to_index)))
        #
        # for i in range(min(3, len(runs_to_index))):
        #     print(runs_to_index[i])
        # if len(runs_to_index) > 3:
        #     print('...')

        print('Finding files in {} run directories... '.format(
            len(runs_to_index)))
        ncfiles = []
        for run in tqdm.tqdm_notebook(runs_to_index, leave=False):
            try:
                results = subprocess.check_output(
                    ['find', run, '-name', '*.nc'])
                results = [s for s in results.decode('utf-8').split()]
                ncfiles.extend(results)
            except:
                print(
                    '{0} exception occurred while finding *.nc in {1}'.format(
                        sys.exc_info()[0], run))

        # IPython.display.clear_output(wait=True)
        # NetCDF files found on disk not seen before:
        files_to_add = set(ncfiles) - files_already_seen

        print('Files found but not yet indexed: {}'.format(len(files_to_add)))

        # For these new files, we can determine their configuration, experiment, and run.
        # Using NetCDF4 to get list of all variables in each file.

        # output* directories
        # match the parent and grandparent directory to configuration/experiment
        find_output = re.compile(r'(.*)/([^/]*)/([^/]*)/(output\d+)/.*\.nc')

        # determine general pattern for ncfile names
        find_basename_pattern = re.compile(
            r'(?P<root>[^\d]+)(?P<index>__\d+_\d+)?(?P<indexice>\.\d+\-\d+)?(?P<ext>\.nc)'
        )

        def index_variables(ncfile):

            matched = find_output.match(ncfile)
            if matched is None:
                return []

            if not os.path.exists(ncfile):
                return []

            # TODO: also exit here if ncfile is already in database - use [NOT] EXISTS ??
            # but this is super slow
            # module load sqlite
            # sqlite3 /g/data3/hh5/tmp/cosima/cosima-cookbook/cosima-cookbook.db
            # select * from ncfiles where ncfile == '/g/data3/hh5/tmp/cosima/access-om2-025/025deg_jra55v13_ryf8485_KDS50/output024/ocean/ocean_scalar.nc';

            basename = os.path.basename(ncfile)
            m = find_basename_pattern.match(basename)
            if m is None:
                basename_pattern = basename
            else:
                basename_pattern = (m.group('root')
                                    + (r'__\d+_\d+' if m.group('index') else '')
                                    + (r'.\d+-\d+' if m.group('indexice') else '')
                                    + m.group('ext'))

            try:
                with netCDF4.Dataset(ncfile) as ds:
                    ncvars = [{
                        'ncfile': ncfile,
                        'rootdir': matched.group(1),
                        'configuration': matched.group(2),
                        'experiment': matched.group(3),
                        'run': matched.group(4),
                        'basename': basename,
                        'basename_pattern': basename_pattern,
                        'variable': v.name,
                        'dimensions': str(v.dimensions),
                        'chunking': str(v.chunking()),
                    } for v in ds.variables.values()]
            except:
                print('{0} exception occurred while trying to read {1}'.format(
                    sys.exc_info()[0], ncfile))
                ncvars = []

            return ncvars

        if len(files_to_add) == 0:
            print('No new .nc files found in {}'.format(directoryToSearch))
            continue

        print('Indexing {} new .nc files... '.format(len(files_to_add)))

        if use_bag:
            with distributed.Client() as client:
                bag = dask.bag.from_sequence(files_to_add)
                bag = bag.map(index_variables).flatten()

                futures = client.compute(bag)
                progress(futures, notebook=False)

                ncvars = futures.result()
        else:
            ncvars = []
            for file_to_add in tqdm.tqdm_notebook(files_to_add, leave=False):
                ncvars.extend(index_variables(file_to_add))
            IPython.display.clear_output()

        print('')
        print('Indexed {} variables found in new files'.format(len(ncvars)))

        print('Saving results in database {}... '.format(db_url))
        db['ncfiles'].insert_many(ncvars)

    print('Indexing complete.')

    return True
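
A sketch of how the options described in the docstring might be combined (directoriesToSearch, database_url and database_url_from_path are assumed to be configured elsewhere in the module):

# update the central index in parallel, filtering by individual filenames
build_index(use_bag=True, careful=True)

# index one experiment directory into its own database at the directory root
build_index(expt_dir_list=['/path/to/my/experiment'])  # hypothetical path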