Example No. 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--from_dir_prefix')
    parser.add_argument('-t', '--to_dir_prefix')
    parser.add_argument('-u',
                        '--path2udp_model',
                        default='russian-syntagrus-ud-2.0-170801.udpipe')
    parser.add_argument('-n', '--cpu_n', default=5, type=int)
    args = parser.parse_args()
    # dfunc.MODELFILE4UDPIPE = args.path2udp_model
    # dfunc.set_model(args.path2udp_model)
    with dask.config.set(pool=ThreadPool(args.cpu_n)):
        bag = db.read_text(args.from_dir_prefix)
        pbar = ProgressBar()
        pbar.register()
        ddf = bag.to_dataframe(columns=['text'])
        ddf['text'] = ddf['text'].apply(dfunc.skip_empty, meta=('x', 'f8'))
        ddf = ddf.dropna()
        ddf['rec'] = ddf['text'].apply(dfunc.get_rec_info, meta=('x', 'f8'))
        ddf['text'] = ddf['text'].apply(dfunc.spec_tok_add, meta=('x', 'f8'))
        ddf['norm_text'] = ddf['text'].apply(dfunc.normalization1,
                                             meta=('x', 'f8'))
        # udpipe_sent_and_tok = dfunc.get_udpipe_sent_and_tok(args.path2udp_model)
        # udpipe_sent_and_tok = dfunc.get_udpipe_sent_and_tok('/home/den/Documents/elmo/data_preparing/rutwitter/russian-syntagrus-ud-2.0-170801.udpipe')
        ddf['norm_text'] = ddf['text'].apply(dfunc.udpipe_sent_and_tok,
                                             meta=('x', 'f8'))
        # ddf['norm_text'] = ddf['text'].apply(dfunc.nltk_sent_and_tok, meta=('x', 'f8'))
        ddf['norm_text'] = ddf['norm_text'].apply(dfunc.normalization2,
                                                  meta=('x', 'f8'))
        ddf['rec_text'] = ddf.apply(dfunc.recovery, meta=('x', 'f8'), axis=1)
        ddf['cleaned_text'] = ddf['norm_text'].apply(dfunc.lower_case,
                                                     meta=('x', 'f8'))
        ddf[['rec_text', 'cleaned_text']].to_csv(args.to_dir_prefix)
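Note on the last line of this example: given a plain path prefix, Dask writes one CSV file per partition. A hedged variant (assuming a reasonably recent Dask; the file name is illustrative) that collapses the output into a single file:

# Drop-in replacement for the final line above; single_file=True gathers all
# partitions into one CSV instead of writing one file per partition.
ddf[['rec_text', 'cleaned_text']].to_csv('cleaned_corpus.csv', single_file=True)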
Example No. 2
    def createPyramidLevel(self, resolution=0, subdiv=(1, 2, 2), quiet=False):
        """ Add a level in a multi-level pyramid.
            This function is provided because TeraStitcher does not offer
            enough control over the sampling strategy for Imaris files.
        """
        # find all of the imaris datasets under the specified resolution group
        self._subdiv = subdiv
        datasetnames = list()
        resin = 'ResolutionLevel ' + str(resolution)
        resout = 'ResolutionLevel ' + str(resolution + 1)
        prf = '/DataSet/' + resin
        self._file_object[prf].visit(datasetnames.append)
        tt = [
            type(self._file_object[prf + '/' + x]) == h5py._hl.dataset.Dataset
            for x in datasetnames
        ]
        res = list(compress(datasetnames, tt))
        # Now we need to find the ones ending in '/Data'
        tt = [x.endswith('/Data') for x in res]
        res = list(compress(res, tt))
        outpaths = ['/DataSet/' + resout + '/' + x for x in res]
        inpaths = [prf + '/' + x for x in res]
        pbar = ProgressBar()
        for idx in range(len(inpaths)):
            if not quiet:
                print(inpaths[idx])
                pbar.register()
            self._subdivide(self._file_object, inpaths[idx], outpaths[idx])
            if not quiet:
                pbar.unregister()
Example No. 3
    def init_dask_client():
        pbar = ProgressBar()
        pbar.register()

        cluster = LocalCluster()
        client = Client(cluster)
        return cluster, client
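A caveat for this example: dask.diagnostics.ProgressBar only instruments the local threaded, multiprocessing, and synchronous schedulers, so it stays silent for work running on the distributed Client created here. A minimal sketch (not from the original source) of the distributed-friendly alternative, dask.distributed.progress:

from dask.distributed import Client, LocalCluster, progress
import dask.array as da

cluster = LocalCluster()
client = Client(cluster)
# progress() reports on futures/collections executing on the cluster,
# unlike the local-scheduler ProgressBar registered above.
future = client.compute(da.ones((1000, 1000), chunks=(100, 100)).sum())
progress(future)
print(future.result())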
Example No. 4
def main(argv):
    jobNum = int(argv.jobNum)
    outputDir = argv.outputDir
    inputDir = argv.inputDir
    try:
        os.mkdir(outputDir)
        for shape in ['64_32', '128_64', '192_96']:
            os.mkdir(os.path.join(outputDir, shape))
    except OSError:
        pass  # path already exists
    client = LocalCluster(n_workers=jobNum,
                          threads_per_worker=5)  # IO intensive, more threads
    print('* number of workers:{}, \n* input dir:{}, \n* output dir:{}\n\n'.
          format(jobNum, inputDir, outputDir))
    #print('* Link to local cluster dashboard: ', client.dashboard_link)
    for subFolder in [
            'ccpd_base', 'ccpd_db', 'ccpd_fn', 'ccpd_rotate', 'ccpd_tilt',
            'ccpd_weather'
    ]:
        fileList = os.listdir(os.path.join(inputDir, subFolder))
        print('* {} images found in {}. Start processing ...'.format(
            len(fileList), subFolder))
        toDo = dbag.from_sequence(fileList, npartitions=jobNum *
                                  30).persist()  # persist the bag in memory
        toDo = toDo.map(processImage, inputDir, outputDir, subFolder)
        pbar = ProgressBar(minimum=2.0)
        pbar.register()  # register all computations for better tracking
        result = toDo.compute()
        print('* image cropped: {}. Done ...'.format(sum(result)))
    client.close()  # shut down the cluster
Example No. 5
def show_progress(arr, msg: str = None, nthreads: int = 1):
    from dask.diagnostics import ProgressBar

    if msg is not None:
        logger.info(msg)
    pbar = ProgressBar()
    pbar.register()
    res = controlled_compute(arr, nthreads)
    pbar.unregister()
    return res
Example No. 6
def main():
    paths = list(Path(args.dir).rglob("*.txt"))
    pbar = ProgressBar()
    pbar.register()
    a_bag = db.from_sequence(paths, npartitions=mp.cpu_count())
    a_bag = a_bag.map(lambda a_path: parse_path(a_path))
    frame_data = a_bag.compute()
    pbar.unregister()

    frame = pd.DataFrame(frame_data)
    frame.to_pickle(args.out)
Example No. 7
def merge_with_small(dbSnp153, small_df):
    # TODO:after merge, make sure to drop duplicates, because dbSnp153 might contain duplicate keys
    # or process the dbSnp153: dedup/sort beforehand
    small_df["start"] = small_df["BP"] - 1
    dbSnp153['chr'] = dbSnp153.chrom.str[3:]
    pbar = ProgressBar()
    pbar.register()
    result = dbSnp153.merge(small_df,
                            how="inner",
                            left_on=["chr", "chromStart", "chromEnd"],
                            right_on=["Chr", "start", "BP"]).compute()
    return result
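The TODO above can be addressed just before the merge; a hedged sketch that reuses the merge's left_on keys and assumes a row is a duplicate when all three keys repeat:

# De-duplicate dbSnp153 on the join keys so the inner merge cannot fan out.
dbSnp153 = dbSnp153.drop_duplicates(subset=["chr", "chromStart", "chromEnd"])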
Example No. 8
def test_register(capsys):
    try:
        p = ProgressBar()
        p.register()

        assert _globals['callbacks']

        get_threaded(dsk, 'e')
        check_bar_completed(capsys)

        p.unregister()

        assert not _globals['callbacks']
    finally:
        _globals['callbacks'].clear()
Example No. 9
def test_register(capsys):
    try:
        p = ProgressBar()
        p.register()

        assert Callback.active

        get_threaded(dsk, 'e')
        check_bar_completed(capsys)

        p.unregister()

        assert not Callback.active
    finally:
        Callback.active.clear()
Example No. 10
def test_register(capsys):
    try:
        p = ProgressBar()
        p.register()

        assert _globals['callbacks']

        get(dsk, 'e')
        check_bar_completed(capsys)

        p.unregister()

        assert not _globals['callbacks']
    finally:
        _globals['callbacks'].clear()
Example No. 11
def test_register(capsys):
    try:
        p = ProgressBar()
        p.register()

        assert Callback.active

        get_threaded(dsk, "e")
        check_bar_completed(capsys)

        p.unregister()

        assert not Callback.active
    finally:
        Callback.active.clear()
Example No. 12
def test_register(capsys):
    try:
        p = ProgressBar()
        p.register()

        assert _globals['callbacks']

        get(dsk, 'e')
        out, err = capsys.readouterr()
        bar, percent, time = [i.strip() for i in out.split('\r')[-1].split('|')]
        assert bar == "[########################################]"
        assert percent == "100% Completed"

        p.unregister()

        assert not _globals['callbacks']
    finally:
        _globals['callbacks'].clear()
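The register()/unregister() pairing exercised in these tests can also be written with ProgressBar as a context manager, which unregisters automatically even if the computation raises:

from dask.diagnostics import ProgressBar
import dask.array as da

with ProgressBar():  # registers on enter, unregisters on exit
    da.ones((1000, 1000), chunks=(100, 100)).sum().compute()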
Example No. 13
def cli(ctx, store, cube, skv, n_threads, color):
    """
    Execute certain operations on the given Kartothek cube.

    If possible, the operations will be performed in parallel on the current machine.
    """
    ctx.ensure_object(dict)

    store_obj = get_store(skv, store)
    cube, datasets = get_cube(store_obj, cube)

    dask.config.set(scheduler="threads")
    if n_threads > 0:
        dask.config.set(pool=ThreadPool(n_threads))

    if color == "always":
        ctx.color = True
    elif color == "off":
        ctx.color = False

    pbar = ProgressBar()
    pbar.register()
    ctx.call_on_close(pbar.unregister)

    # silence extremely verbose azure logging
    azure_logger = logging.getLogger("azure.storage.common.storageclient")
    azure_logger.setLevel(logging.FATAL)

    # pandas perf tuning
    chained_assignment_old = pd.options.mode.chained_assignment

    def reset_pd():
        pd.options.mode.chained_assignment = chained_assignment_old

    ctx.call_on_close(reset_pd)
    pd.options.mode.chained_assignment = None

    ctx.obj["skv"] = skv
    ctx.obj["store"] = store_obj
    ctx.obj["store_name"] = store
    ctx.obj["cube"] = cube
    ctx.obj["datasets"] = datasets
    ctx.obj["pbar"] = pbar
Example No. 14
def show_progress(arr, msg: str = None, nthreads: int = 1):
    """
    Performs computation with Dask and shows progress bar.

    Args:
        arr:
        msg: message to log, default None
        nthreads: number of threads to use for computation, default 1

    Returns:
        Result of computation.
    """
    from dask.diagnostics import ProgressBar

    if msg is not None:
        logger.info(msg)
    pbar = ProgressBar()
    pbar.register()
    res = controlled_compute(arr, nthreads)
    pbar.unregister()
    return res
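An illustrative call to show_progress (the array size and thread count are made up, and controlled_compute is assumed to forward to Dask's compute):

import dask.array as da

lazy_mean = da.random.random((10_000, 10_000), chunks=(1_000, 1_000)).mean()
value = show_progress(lazy_mean, msg="Computing mean", nthreads=4)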
Example No. 15
def main():
    logger = logging.getLogger(__name__)
    logger.info('creating a bunch of features')

    pbar = ProgressBar()
    pbar.register()

    target_entities = ['ip', 'app', 'device', 'os', 'channel']
    filenames_train = sorted(glob('../data/interim/train_2017-11-*00.csv'))
    training_windows = ['1 hours', '3 hours', '1 day']

    for target_entity in target_entities:
        filenames = glob(
            f"../data/interim/partitioned/{target_entity}/train_*.csv")
        b = bag.from_sequence(filenames)
        entity_sets = b.map(create_entityset, target_entity).compute()
        gc.collect()

        for filename in filenames_train:
            logger.info(f"Processing: {filename}")
            df = pd.read_csv(filename,
                             usecols=['click_time'],
                             parse_dates=to_parse)
            cutoff_time = df['click_time'].min()
            del df
            for training_window in training_windows:
                create_features(filename,
                                entity_sets,
                                target_entity=target_entity,
                                cutoff_time=cutoff_time,
                                training_window=ft.Timedelta(training_window))

        del entity_sets, b
        gc.collect()

    logger.info('finished')
Example No. 16
def Movie(
        da,
        odir,
        varname=None,
        framedim="time",
        moviename="movie",
        clim=None,
        cmap=None,
        bgcolor=np.array([1, 1, 1]) * 0.3,
        framewidth=1280,
        frameheight=720,
        dpi=100,
        lon=None,
        lat=None,
        dask=True,
        delete=True,
        ffmpeg=True,
        plot_style="simple",
        norm=mpl.colors.Normalize(),
        progbar=False,
):
    # Set defaults:
    if not ffmpeg and delete:
        raise RuntimeError("raw picture deletion makes only \
            sense if ffmpeg conversion is enabled")

    if not isinstance(da, xr.DataArray):
        raise RuntimeError("input has to be an xarray DataStructure, instead\
        is " + str(type(da)))

    if not os.path.exists(odir):
        os.makedirs(odir)

    # Infer defaults from data
    if clim is None:
        print("clim will be inferred from data, this can take very long...")
        clim = [da.min(), da.max()]
    if cmap is None:
        cmap = plt.cm.viridis

    if plot_style in ["map"]:
        if None in [lon, lat]:
            raise RuntimeError("map plotting requires lon and lat")
        else:
            lons = np.array(da[lon].data)
            lats = np.array(da[lat].data)

            if len(lons.shape) != 2:
                lons, lats = np.meshgrid(lons, lats)

            time = np.array(da["time"].data)

    else:
        lons = None
        lats = None
        time = None

    # Annnd here we go
    print("+++ Execute plot function +++")
    if dask:
        data = da.data
        frame_axis = da.get_axis_num(framedim)
        drop_axis = [da.get_axis_num(a) for a in da.dims if not a == framedim]
        chunks = list(data.shape)
        chunks[frame_axis] = 1
        data = data.rechunk(chunks)
        if progbar:
            pbar = ProgressBar()
            pbar.register()
        data.map_blocks(
            FramePrint,
            chunks=[1],
            drop_axis=drop_axis,
            dtype=np.float64,
            dask=dask,
            frame_axis=frame_axis,
            odir=odir,
            cmap=cmap,
            clim=clim,
            framewidth=framewidth,
            frameheight=frameheight,
            bgcolor=bgcolor,
            plot_style=plot_style,
            lons=lons,
            lats=lats,
            time=time,
            norm=norm,
            dpi=dpi,
        ).compute(get=get)
        if progbar:
            pbar.unregister()
    # The .compute(get=get) line is some dask 'magic': it parallelizes the
    # print function with processes and not threads,which is a lot faster
    # for custom functions apparently!
    else:
        # do it with a simple for loop...can this really be quicker?
        print("This is slow! Do it in dask!")
        for ii in range(0, len(da.time)):
            start_time = time.time()
            da_slice = da[{framedim: ii}]
            # fig,ax,h = FramePrint(da_slice,
            FramePrint(
                da_slice,
                frame=ii,
                odir=odir,
                cmap=cmap,
                clim=clim,
                framewidth=framewidth,
                frameheight=frameheight,
                bgcolor=bgcolor,
                plot_style=plot_style,
                lons=lons,
                lats=lats,
                norm=norm,
                dpi=dpi,
            )
            if ii % 100 == 0:
                remaining_time = (len(da.time) - ii) * (time.time() -
                                                        start_time) / 60
                print("FRAME---%04d---" % ii)
                print("Estimated time left : %d minutes" % remaining_time)

    query = ('ffmpeg -y -i "frame_%05d.png" -c:v libx264 -preset veryslow \
        -crf 6 -pix_fmt yuv420p \
        -framerate 10 \
        "' + moviename + '.mp4"')

    with cd(odir):
        if ffmpeg:
            print("+++ Convert frames to video +++")
            excode = os.system(query)
            if excode == 0 and delete:
                os.system("rm *.png")
Example No. 17
def main():
    parser = argparse.ArgumentParser(
        description='Add multiscale levels to an existing n5')

    parser.add_argument('-i', '--input', dest='input_path', type=str, required=True, \
        help='Path to the directory containing the n5 volume')

    parser.add_argument('-d', '--data_set', dest='data_set', type=str, default="", \
        help='Path to data set (default empty, so /s0 is assumed to exist at the root)')

    parser.add_argument('-f', '--downsampling_factors', dest='downsampling_factors', type=str, default="2,2,2", \
        help='Downsampling factors for each dimension (default "2,2,2")')

    parser.add_argument('-p', '--pixel_res', dest='pixel_res', type=str, \
        help='Pixel resolution for each dimension "2.0,2.0,2.0" (default None) - required for Neuroglancer')

    parser.add_argument('-u', '--pixel_res_units', dest='pixel_res_units', type=str, default="nm", \
        help='Measurement unit for --pixel_res (default "nm") - required for Neuroglancer')

    parser.add_argument('--distributed', dest='distributed', action='store_true', \
        help='Run with distributed scheduler (default)')
    parser.set_defaults(distributed=False)

    parser.add_argument('--workers', dest='workers', type=int, default=20, \
        help='If --distributed is set, this specifies the number of workers (default 20)')

    parser.add_argument('--dashboard', dest='dashboard', action='store_true', \
        help='If --distributed is set, this runs a web-based dashboard on port 8787')
    parser.set_defaults(dashboard=False)

    parser.add_argument('--metadata-only', dest='metadata_only', action='store_true', \
        help='Only fix metadata on an existing multiscale pyramid')
    parser.set_defaults(metadata_only=False)

    args = parser.parse_args()

    if args.distributed:
        dashboard_address = None
        if args.dashboard:
            dashboard_address = ":8787"
            print(f"Starting dashboard on {dashboard_address}")

        from dask.distributed import Client
        client = Client(processes=True, n_workers=args.workers, \
            threads_per_worker=1, dashboard_address=dashboard_address)

    else:
        from dask.diagnostics import ProgressBar
        pbar = ProgressBar()
        pbar.register()

    downsampling_factors = [
        int(c) for c in args.downsampling_factors.split(',')
    ]

    pixel_res = None
    if args.pixel_res:
        pixel_res = [float(c) for c in args.pixel_res.split(',')]

    if not args.metadata_only:
        add_multiscale(args.input_path,
                       args.data_set,
                       downsampling_factors=downsampling_factors)

    add_metadata(args.input_path,
                 downsampling_factors=downsampling_factors,
                 pixel_res=pixel_res,
                 pixel_res_units=args.pixel_res_units)
Example No. 18
import os

from astropy.io import fits
from astropy import units as u
from astropy.stats import mad_std
import pylab as pl
import radio_beam
import glob
from spectral_cube import SpectralCube, DaskSpectralCube
from spectral_cube.lower_dimensional_structures import Projection

from casatools import image
ia = image()

if os.getenv('NO_PROGRESSBAR') is None:
    from dask.diagnostics import ProgressBar
    pbar = ProgressBar()
    pbar.register()

nthreads = 1
scheduler = 'synchronous'

os.environ['TEMPDIR'] = '/blue/adamginsburg/adamginsburg/tmp/'

if os.getenv('DASK_THREADS') is not None:
    try:
        nthreads = int(os.getenv('DASK_THREADS'))
        if nthreads > 1:
            scheduler = 'threads'
        else:
            scheduler = 'synchronous'
    except (TypeError, ValueError):
        nthreads = 1
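The scheduler and nthreads values chosen above are presumably applied to Dask-backed cubes later in the script; a hedged sketch of how that usually looks with spectral-cube (the file name is hypothetical):

cube = SpectralCube.read('example_cube.fits', use_dask=True)
cube.use_dask_scheduler(scheduler, num_workers=nthreads)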
Example No. 19
import re
import json
import random
import pandas as pd
from nltk.stem.lancaster import LancasterStemmer
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from multiprocessing import Pool
import dask.bag as db
import dask.dataframe as dd
import time
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster, progress

p = ProgressBar()
p.register()

#cluster = LocalCluster(n_workers=2 , threads_per_worker=2 , ip= '145.107.189.23')
#c = Client()

# word stemmer
stemmer = LancasterStemmer()
random.seed(1)


def ImportData():
    """Function for importing the Json file dataset 
    as a pandas object and numpy array.
    Printing Datashape"""

    data = pd.read_json("sample_data.json", lines=True)
Example No. 20
def main(argv=sys.argv[1:]):
    global LOG

    from satpy import Scene
    from satpy.writers import compute_writer_results
    from dask.diagnostics import ProgressBar
    from polar2grid.core.script_utils import (
        setup_logging,
        rename_log_file,
        create_exc_handler,
    )
    import argparse

    add_polar2grid_config_paths()
    USE_POLAR2GRID_DEFAULTS = bool(
        int(os.environ.setdefault("USE_POLAR2GRID_DEFAULTS", "1")))
    BINARY_NAME = "polar2grid" if USE_POLAR2GRID_DEFAULTS else "geo2grid"

    prog = os.getenv("PROG_NAME", sys.argv[0])
    # "usage: " will be printed at the top of this:
    usage = """
    %(prog)s -h
see available products:
    %(prog)s -r <reader> -w <writer> --list-products -f file1 [file2 ...]
basic processing:
    %(prog)s -r <reader> -w <writer> [options] -f file1 [file2 ...]
basic processing with limited products:
    %(prog)s -r <reader> -w <writer> [options] -p prod1 prod2 -f file1 [file2 ...]
"""
    parser = argparse.ArgumentParser(
        prog=prog,
        usage=usage,
        fromfile_prefix_chars="@",
        description="Load, composite, resample, and save datasets.",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbosity",
        action="count",
        default=0,
        help="each occurrence increases verbosity 1 level through "
        "ERROR-WARNING-INFO-DEBUG (default INFO)",
    )
    parser.add_argument("-l",
                        "--log",
                        dest="log_fn",
                        default=None,
                        help="specify the log filename")
    parser.add_argument(
        "--progress",
        action="store_true",
        help="show processing progress bar (not recommended for logged output)",
    )
    parser.add_argument(
        "--num-workers",
        type=int,
        default=os.getenv("DASK_NUM_WORKERS", 4),
        help="specify number of worker threads to use (default: 4)",
    )
    parser.add_argument(
        "--match-resolution",
        dest="preserve_resolution",
        action="store_false",
        help="When using the 'native' resampler for composites, don't save data "
        "at its native resolution, use the resolution used to create the "
        "composite.",
    )
    parser.add_argument(
        "--list-products",
        dest="list_products",
        action="store_true",
        help="List available {} products and exit".format(BINARY_NAME),
    )
    parser.add_argument(
        "--list-products-all",
        dest="list_products_all",
        action="store_true",
        help="List available {} products and custom/Satpy products and exit".
        format(BINARY_NAME),
    )
    reader_group = add_scene_argument_groups(
        parser, is_polar2grid=USE_POLAR2GRID_DEFAULTS)[0]
    resampling_group = add_resample_argument_groups(
        parser, is_polar2grid=USE_POLAR2GRID_DEFAULTS)[0]
    writer_group = add_writer_argument_groups(parser)[0]
    argv_without_help = [x for x in argv if x not in ["-h", "--help"]]

    _retitle_optional_arguments(parser)
    args, remaining_args = parser.parse_known_args(argv_without_help)
    os.environ["DASK_NUM_WORKERS"] = str(args.num_workers)

    # get the logger if we know the readers and writers that will be used
    if args.readers is not None and args.writers is not None:
        glue_name = args.readers[0] + "_" + "-".join(args.writers or [])
        LOG = logging.getLogger(glue_name)
    reader_subgroups = _add_component_parser_args(parser, "readers",
                                                  args.readers or [])
    writer_subgroups = _add_component_parser_args(parser, "writers",
                                                  args.writers or [])
    args = parser.parse_args(argv)

    if args.readers is None:
        parser.print_usage()
        parser.exit(
            1,
            "\nERROR: Reader must be provided (-r flag).\n"
            "Supported readers:\n\t{}\n".format("\n\t".join(
                ["abi_l1b", "ahi_hsd", "hrit_ahi"])),
        )
    elif len(args.readers) > 1:
        parser.print_usage()
        parser.exit(
            1,
            "\nMultiple readers is not currently supported. Got:\n\t"
            "{}\n".format("\n\t".join(args.readers)),
        )
        return -1
    if args.writers is None:
        parser.print_usage()
        parser.exit(
            1,
            "\nERROR: Writer must be provided (-w flag) with one or more writer.\n"
            "Supported writers:\n\t{}\n".format("\n\t".join(["geotiff"])),
        )

    reader_args = _args_to_dict(args, reader_group._group_actions)
    reader_names = reader_args.pop("readers")
    scene_creation, load_args = _get_scene_init_load_args(
        args, reader_args, reader_names, reader_subgroups)
    resample_args = _args_to_dict(args, resampling_group._group_actions)
    writer_args = _args_to_dict(args, writer_group._group_actions)
    writer_specific_args = _parse_writer_args(writer_args["writers"],
                                              writer_subgroups, reader_names,
                                              args)
    writer_args.update(writer_specific_args)

    if not args.filenames:
        parser.print_usage()
        parser.exit(1, "\nERROR: No data files provided (-f flag)\n")

    # Prepare logging
    rename_log = False
    if args.log_fn is None:
        rename_log = True
        args.log_fn = glue_name + "_fail.log"
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    setup_logging(console_level=levels[min(3, args.verbosity)],
                  log_filename=args.log_fn)
    logging.getLogger("rasterio").setLevel(levels[min(2, args.verbosity)])
    sys.excepthook = create_exc_handler(LOG.name)
    if levels[min(3, args.verbosity)] > logging.DEBUG:
        import warnings

        warnings.filterwarnings("ignore")
    LOG.debug("Starting script with arguments: %s", " ".join(sys.argv))

    # Set up dask and the number of workers
    if args.num_workers:
        dask.config.set(num_workers=args.num_workers)

    # Create a Scene, analyze the provided files
    LOG.info("Sorting and reading input files...")
    try:
        scn = Scene(**scene_creation)
    except ValueError as e:
        LOG.error(
            "{} | Enable debug message (-vvv) or see log file for details.".
            format(str(e)))
        LOG.debug("Further error information: ", exc_info=True)
        return -1
    except OSError:
        LOG.error(
            "Could not open files. Enable debug message (-vvv) or see log file for details."
        )
        LOG.debug("Further error information: ", exc_info=True)
        return -1

    # Rename the log file
    if rename_log:
        rename_log_file(glue_name +
                        scn.attrs["start_time"].strftime("_%Y%m%d_%H%M%S.log"))

    # Load the actual data arrays and metadata (lazy loaded as dask arrays)
    LOG.info("Loading product metadata from files...")
    reader_info = ReaderProxyBase.from_reader_name(scene_creation["reader"],
                                                   scn, load_args["products"])
    if args.list_products or args.list_products_all:
        _print_list_products(reader_info, p2g_only=not args.list_products_all)
        return 0

    load_args["products"] = reader_info.get_satpy_products_to_load()
    if not load_args["products"]:
        return -1
    scn.load(load_args["products"])

    ll_bbox = resample_args.pop("ll_bbox")
    if ll_bbox:
        scn = scn.crop(ll_bbox=ll_bbox)

    scn = filter_scene(
        scn,
        reader_names,
        sza_threshold=reader_args["sza_threshold"],
        day_fraction=reader_args["filter_day_products"],
        night_fraction=reader_args["filter_night_products"],
    )
    if scn is None:
        LOG.info("No remaining products after filtering.")
        return 0

    to_save = []
    areas_to_resample = resample_args.pop("grids")
    if "ewa_persist" in resample_args:
        resample_args["persist"] = resample_args.pop("ewa_persist")
    scenes_to_save = resample_scene(
        scn,
        areas_to_resample,
        preserve_resolution=args.preserve_resolution,
        is_polar2grid=USE_POLAR2GRID_DEFAULTS,
        **resample_args)
    for scene_to_save, products_to_save in scenes_to_save:
        overwrite_platform_name_with_aliases(scene_to_save)
        reader_info.apply_p2g_name_to_scene(scene_to_save)
        to_save = write_scene(
            scene_to_save,
            writer_args["writers"],
            writer_args,
            products_to_save,
            to_save=to_save,
        )

    if args.progress:
        pbar = ProgressBar()
        pbar.register()

    LOG.info("Computing products and saving data to writers...")
    compute_writer_results(to_save)
    LOG.info("SUCCESS")
    return 0
Example No. 21
def run(stat, bands, nodata, output, output_type, num_process, chunksize, start_date, end_date, inputs):
    # ignore warnings
    warnings.filterwarnings("ignore")
    print(header)

    # check statistical option
    if stat not in ('median', 'mean', 'gmean', 'max', 'min', 'std', 'valid_pixels', 'last_pixel',
                    'jday_last_pixel', 'jday_median', 'linear_trend') and not stat.startswith(('percentile_', 'trim_mean_')):
        print("\nError: argument '-stat' invalid choice: {}".format(stat))
        print("choose from: median, mean, gmean, max, min, std, valid_pixels, last_pixel, "
              "jday_last_pixel, jday_median, linear_trend, percentile_NN, trim_mean_LL_UL")
        return
    if stat.startswith('percentile_'):
        try:
            int(stat.split('_')[1])
        except:
            print("\nError: argument '-stat' invalid choice: {}".format(stat))
            print("the percentile must ends with a valid number, e.g. percentile_25")
            return
    if stat.startswith('trim_mean_'):
        try:
            int(stat.split('_')[2])
            int(stat.split('_')[3])
        except:
            print("\nError: argument '-stat' invalid choice: {}".format(stat))
            print("the trim_mean_LL_UL must ends with a valid limits, e.g. trim_mean_10_80")
            return

    print("\nLoading and prepare images in path(s):", flush=True)
    # search all Image files in inputs recursively if the files are in directories
    images_files = []
    for _input in inputs:
        if os.path.isfile(_input):
            if _input.endswith(IMAGES_TYPES):
                images_files.append(os.path.abspath(_input))
        elif os.path.isdir(_input):
            for root, dirs, files in os.walk(_input):
                if len(files) != 0:
                    files = [os.path.join(root, x) for x in files if x.endswith(IMAGES_TYPES)]
                    [images_files.append(os.path.abspath(file)) for file in files]

    # load bands
    if isinstance(bands, int):
        bands = [bands]
    if not isinstance(bands, list):
        bands = [int(b) for b in bands.split(',')]

    # load images
    images = [Image(landsat_file) for landsat_file in images_files]

    # filter images based on the start date and/or end date, required filename as metadata
    if start_date is not None or end_date is not None:
        [image.set_metadata_from_filename() for image in images]
        if start_date is not None:
            images = [image for image in images if image.date >= start_date]
        if end_date is not None:
            images = [image for image in images if image.date <= end_date]

    if len(images) <= 1:
        print("\n\nAfter load (and filter images in range date if applicable) there are {} images to process.\n"
              "StackComposed required at least 2 or more images to process.\n".format(len(images)))
        exit(1)

    # save nodata set from arguments
    Image.nodata_from_arg = nodata

    # get wrapper extent
    min_x = min([image.extent[0] for image in images])
    max_y = max([image.extent[1] for image in images])
    max_x = max([image.extent[2] for image in images])
    min_y = min([image.extent[3] for image in images])
    Image.wrapper_extent = [min_x, max_y, max_x, min_y]

    # define the properties for the raster wrapper
    Image.wrapper_x_res = images[0].x_res
    Image.wrapper_y_res = images[0].y_res
    Image.wrapper_shape = (int((max_y-min_y)/Image.wrapper_y_res), int((max_x-min_x)/Image.wrapper_x_res))  # (y,x)

    # reset the chunksize with the min of width/high if apply
    if chunksize > min(Image.wrapper_shape):
        chunksize = min(Image.wrapper_shape)

    # some information about process
    if len(images_files) != len(images):
        print("  images loaded: {0}".format(len(images_files)))
        print("  images to process: {0} (filtered in the range dates)".format(len(images)))
    else:
        print("  images to process: {0}".format(len(images)))
    print("  band(s) to process: {0}".format(','.join([str(b) for b in bands])))
    print("  pixels size: {0} x {1}".format(round(Image.wrapper_x_res, 1), round(Image.wrapper_y_res, 1)))
    print("  wrapper size: {0} x {1} pixels".format(Image.wrapper_shape[1], Image.wrapper_shape[0]))
    print("  running in {0} cores with chunks size {1}".format(num_process, chunksize))

    # check
    print("  checking bands and pixel size: ", flush=True, end="")
    for image in images:
        for band in bands:
            if band > image.n_bands:
                print("\n\nError: the image '{0}' don't have the band {1} needed to process\n"
                      .format(image.file_path, band))
                exit(1)
        if round(image.x_res, 1) != round(Image.wrapper_x_res, 1) or \
           round(image.y_res, 1) != round(Image.wrapper_y_res, 1):
            print("\n\nError: the image '{}' don't have the same pixel size to the base image: {}x{} vs {}x{}."
                  " The stack-composed is not enabled for process yet images with different pixel size.\n"
                  .format(image.file_path, round(image.x_res, 1), round(image.y_res, 1),
                          round(Image.wrapper_x_res, 1), round(Image.wrapper_x_res, 1)))
            exit(1)
    print("ok")

    # set bounds for all images
    [image.set_bounds() for image in images]

    # for some statistics that required filename as metadata
    if stat in ["last_pixel", "jday_last_pixel", "jday_median", "linear_trend"]:
        [image.set_metadata_from_filename() for image in images]

    # registered Dask progress bar
    pbar = ProgressBar()
    pbar.register()

    for band in bands:
        # check and set the output file before process
        if os.path.isdir(output):
            output_filename = os.path.join(output, "stack_composed_{}_band{}.tif".format(stat, band))
        elif output.endswith((".tif", ".TIF")) and os.path.isdir(os.path.dirname(output)):
            output_filename = output
        elif output.endswith((".tif", ".TIF")) and os.path.dirname(output) == '':
            output_filename = os.path.join(os.getcwd(), output)
        else:
            print("\nError: Setting the output filename, wrong directory and/or\n"
                  "       filename: {}\n".format(output))
            exit(1)

        # choose the default data type based on the statistic
        if output_type is None:
            if stat in ['median', 'mean', 'gmean', 'max', 'min', 'last_pixel', 'jday_last_pixel',
                        'jday_median'] or stat.startswith(('percentile_', 'trim_mean_')):
                gdal_output_type = gdal.GDT_UInt16
            if stat in ['std', 'snr']:
                gdal_output_type = gdal.GDT_Float32
            if stat in ['valid_pixels']:
                if len(images) < 256:
                    gdal_output_type = gdal.GDT_Byte
                else:
                    gdal_output_type = gdal.GDT_UInt16
            if stat in ['linear_trend']:
                gdal_output_type = gdal.GDT_Int32
        else:
            if output_type == 'byte': gdal_output_type = gdal.GDT_Byte
            if output_type == 'uint16': gdal_output_type = gdal.GDT_UInt16
            if output_type == 'uint32': gdal_output_type = gdal.GDT_UInt32
            if output_type == 'int16': gdal_output_type = gdal.GDT_Int16
            if output_type == 'int32': gdal_output_type = gdal.GDT_Int32
            if output_type == 'float32': gdal_output_type = gdal.GDT_Float32
            if output_type == 'float64': gdal_output_type = gdal.GDT_Float64
        for image in images:
            image.output_type = gdal_output_type

        ### process ###
        # Calculate the statistics
        print("\nProcessing the {} for band {}:".format(stat, band))
        output_array = statistic(stat, images, band, num_process, chunksize)

        ### save result ###
        # create output raster
        driver = gdal.GetDriverByName('GTiff')
        nbands = 1
        outRaster = driver.Create(output_filename, Image.wrapper_shape[1], Image.wrapper_shape[0],
                                  nbands, gdal_output_type)
        outband = outRaster.GetRasterBand(nbands)

        # convert nan value and set nodata value special by statistic
        if stat in ['linear_trend']:
            output_array[np.isnan(output_array)] = -2147483648
            outband.SetNoDataValue(-2147483648)
            output_filename = output_filename.replace("stack_composed_linear_trend_band",
                                                      "stack_composed_linear_trend_x1e6_band")
        else:  # set nodata value depend of the output type
            if gdal_output_type in [gdal.GDT_Byte, gdal.GDT_UInt16, gdal.GDT_UInt32, gdal.GDT_Int16, gdal.GDT_Int32]:
                outband.SetNoDataValue(0)
            if gdal_output_type in [gdal.GDT_Float32, gdal.GDT_Float64]:
                outband.SetNoDataValue(np.nan)

        # write band
        outband.WriteArray(output_array)

        # set projection and geotransform
        outRasterSRS = osr.SpatialReference()
        outRasterSRS.ImportFromWkt(Image.projection)
        outRaster.SetProjection(outRasterSRS.ExportToWkt())
        outRaster.SetGeoTransform((Image.wrapper_extent[0], Image.wrapper_x_res, 0,
                                   Image.wrapper_extent[1], 0, -Image.wrapper_y_res))

        # clean
        del driver, outRaster, outband, outRasterSRS, output_array
        # force run garbage collector to release unreferenced memory
        gc.collect()

    print("\nProcess completed!")
Example No. 22
class MetSim(object):
    """
    MetSim handles the distribution of jobs that write to a common file
    by launching multiple processes and queueing up their writeback so that
    work can be done while IO is happening.
    """

    # Class variables
    methods = {'mtclim': mtclim}
    params = {
        "period_ending": False,
        "is_worker": False,
        "method": 'mtclim',
        "domain": '',
        "state": '',
        "out_dir": '',
        "out_prefix": 'forcing',
        "start": 'forcing',
        "stop": 'forcing',
        "forcing_fmt": 'netcdf',
        "time_step": -1,
        "calendar": 'standard',
        "prec_type": 'uniform',
        "out_precision": 'f4',
        "verbose": 0,
        "sw_prec_thresh": 0.0,
        "utc_offset": False,
        "lw_cloud": 'cloud_deardorff',
        "lw_type": 'prata',
        "prec_type": 'uniform',
        "tdew_tol": 1e-6,
        "tmax_daylength_fraction": 0.67,
        "rain_scalar": 0.75,
        "tday_coef": 0.45,
        "lapse_rate": 0.0065,
        "out_vars": {n: available_outputs[n]
                     for n in default_outputs},
        "out_freq": None,
        "chunks": NO_SLICE,
        "scheduler": 'distributed',
        "num_workers": 1,
    }

    def __init__(self, params: dict, domain_slice=NO_SLICE):
        """
        Constructor
        """
        self._domain = None
        self._met_data = None
        self._state = None
        self._client = None
        self._domain_slice = domain_slice
        self.progress_bar = ProgressBar()
        self.params.update(params)
        logging.captureWarnings(True)
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(self.params['verbose'])

        formatter = logging.Formatter(' - '.join(
            ['%(asctime)s', '%(name)s', '%(levelname)s', '%(message)s']))
        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(formatter)
        ch.setLevel(self.params['verbose'])
        # set global dask scheduler
        if domain_slice is NO_SLICE:
            if self.params['scheduler'] in DASK_CORE_SCHEDULERS:
                dask.config.set(scheduler=self.params['scheduler'])
            else:
                from distributed import Client, progress
                if 'distributed' == self.params['scheduler']:
                    self._client = Client(n_workers=self.params['num_workers'],
                                          threads_per_worker=1)
                    if self.params['verbose'] == logging.DEBUG:
                        self.progress_bar = progress
                elif os.path.isfile(self.params['scheduler']):
                    self._client = Client(
                        scheduler_file=self.params['scheduler'])
                else:
                    self._client = Client(self.params['scheduler'])
        else:
            dask.config.set(scheduler=self.params['scheduler'])

        # Set up logging
        # If in verbose mode set up the progress bar
        if self.params['verbose'] == logging.DEBUG:
            if 'distributed' != self.params['scheduler']:
                self.progress_bar.register()
                self.progress_bar = lambda x: x
        else:
            # If not in verbose mode, create a dummy function
            self.progress_bar = lambda x: x
        # Create time vector(s)
        self._times = self._get_output_times(
            freq=self.params['out_freq'],
            period_ending=self.params['period_ending'])

        self._update_unit_attrs(self.params['out_vars'])

    def _update_unit_attrs(self, out_vars):
        for k, v in out_vars.items():
            if 'units' in v.keys():
                if v['units'] in converters[k].keys():
                    attrs[k]['units'] = v['units']
                else:
                    self.logger.warn(
                        f'Could not find unit conversion for {k} to {v["units"]}!'
                        f' We will use the default units of'
                        f' {available_outputs[k]["units"]} instead.')
                    v['units'] = available_outputs[k]['units']
            else:
                v['units'] = available_outputs[k]['units']

    def _validate_force_times(self, force_times):
        for p, i in [('start', 0), ('stop', -1)]:
            # infer times from force_times
            if isinstance(self.params[p], str):
                if self.params[p] == 'forcing':
                    self.params[p] = pd.Timestamp(
                        force_times.values[i]).to_pydatetime()
                elif '/' in self.params[p]:
                    year, month, day = map(int, self.params[p].split('/'))
                    self.params[p] = pd.datetime(year, month, day)
                else:
                    self.params[p] = pd.to_datetime(self.params[p])

        # update calendar from input data (fall back to params version)
        self.params['calendar'] = self.met_data['time'].encoding.get(
            'calendar', self.params['calendar'])

        assert self.params['start'] >= pd.Timestamp(
            force_times.values[0]).to_pydatetime()
        assert self.params['stop'] <= pd.Timestamp(
            force_times.values[-1]).to_pydatetime()

        self.params['state_start'] = (self.params['start'] -
                                      pd.Timedelta("90 days"))
        self.params['state_stop'] = (self.params['start'] -
                                     pd.Timedelta("1 days"))
        if self.params['utc_offset']:
            attrs['time'] = {
                'units': DEFAULT_TIME_UNITS,
                'long_name': 'UTC time',
                'standard_name': 'utc_time'
            }
        else:
            attrs['time'] = {
                'units': DEFAULT_TIME_UNITS,
                'long_name': 'local time at grid location',
                'standard_name': 'local_time'
            }

    def convert_monthly_param(self, name):
        self.met_data[name] = self.met_data['prec'].copy()
        months = self.met_data['time'].dt.month
        for m in range(12):
            param = self.domain[name].sel(month=m)
            locations = {'time': self.met_data['time'].isel(time=months == m)}
            self.met_data[name].loc[locations] = param

    @property
    def domain(self):
        if self._domain is None:
            self._domain = io.read_domain(
                self.params).isel(**self._domain_slice)
        return self._domain

    @property
    def met_data(self):
        if self._met_data is None:
            self._met_data = io.read_met_data(self.params, self.domain)
            self._met_data['elev'] = self.domain['elev']
            self._met_data['lat'] = self.domain['lat']
            self._met_data['lon'] = self.domain['lon']

            # process constant_vars
            constant_vars = self.params.get('constant_vars', None)
            if constant_vars:
                da_template = self._met_data[list(self._met_data)[0]]
                for var in constant_vars.keys():
                    self._met_data[var] = xr.full_like(
                        da_template, float(constant_vars[var]))

            self._validate_force_times(force_times=self._met_data['time'])
        return self._met_data

    @property
    def state(self):
        if self._state is None:
            self._state = io.read_state(self.params, self.domain)
            self._aggregate_state()
        return self._state

    @property
    def slices(self):
        if not self.params['chunks']:
            return [{d: slice(None) for d in self.domain[['mask']].dims}]

        return chunk_domain(self.params['chunks'], self.domain[['mask']].dims)

    def open_output(self):
        filenames = [self._get_output_filename(times) for times in self._times]
        return xr.open_mfdataset(filenames)

    def run(self):
        self._validate_setup()
        write_locks = {}
        for times in self._times:
            filename = self._get_output_filename(times)
            self.setup_netcdf_output(filename, times)
            write_locks[filename] = combine_locks(
                [NETCDFC_LOCK, get_write_lock(filename)])
        self.logger.info('Starting {} chunks...'.format(len(self.slices)))

        delayed_objs = [
            wrap_run_slice(self.params, write_locks, dslice)
            for dslice in self.slices
        ]
        persisted = dask.persist(delayed_objs,
                                 num_workers=self.params['num_workers'])
        self.progress_bar(persisted)
        dask.compute(persisted)
        self.logger.info('Cleaning up...')
        try:
            self._client.cluster.close()
            self._client.close()
            if self.params['verbose'] == logging.DEBUG:
                print()
                print('closed dask cluster/client')
        except Exception:
            pass

    def load_inputs(self, close=True):
        self._domain = self.domain.load()
        self._met_data = self.met_data.load()
        self._state = self.state.load()
        if close:
            self._domain.close()
            self._met_data.close()
            self._state.close()

    def setup_netcdf_output(self, filename, times):
        '''setup a single netcdf file'''
        with Dataset(filename, mode="w") as ncout:
            # dims
            dim_sizes = (None, ) + self.domain['mask'].shape
            var_dims = ('time', ) + self.domain['mask'].dims
            chunksizes = [len(times)]
            for d, s in zip(var_dims[1:], dim_sizes[1:]):
                c = int(self.params['chunks'].get(d, s))
                if c <= s:
                    chunksizes.append(c)
                else:
                    chunksizes.append(s)
            create_kwargs = {'chunksizes': chunksizes}
            for d, size in zip(var_dims, dim_sizes):
                ncout.createDimension(d, size)
            # vars
            for varname, varconf in self.params['out_vars'].items():
                ncout.createVariable(varconf['out_name'],
                                     self.params['out_precision'], var_dims,
                                     **create_kwargs)

            # add metadata and coordinate variables (time/lat/lon)
            time_var = ncout.createVariable('time', 'i4', ('time', ))
            time_var.calendar = self.params['calendar']
            time_var[:] = date2num(times.to_pydatetime(),
                                   units=attrs['time'].get(
                                       'units', DEFAULT_TIME_UNITS),
                                   calendar=time_var.calendar)

            dtype_map = {
                'float64': 'f8',
                'float32': 'f4',
                'int64': 'i8',
                'int32': 'i4'
            }
            for dim in self.domain['mask'].dims:
                dim_vals = self.domain[dim].values
                dim_dtype = dtype_map.get(str(dim_vals.dtype),
                                          self.params['out_precision'])
                dim_var = ncout.createVariable(dim, dim_dtype, (dim, ))
                dim_var[:] = dim_vals

            # parameters to not record in the metadata
            skip_params = [
                'elev',
                'lat',
                'lon',
                'is_worker',
                'out_vars',
                'forcing_vars',
                'domain_vars',
                'state_vars',
                'constant_vars',
                'references',
                'verbose',
                'num_workers',
            ]
            for k, v in self.params.items():
                if k in skip_params:
                    continue
                # Need to convert some parameters to strings
                if k in ['start', 'stop', 'utc_offset', 'period_ending']:
                    v = str(v)
                elif k in ['state_start', 'state_stop', 'out_freq']:
                    # skip
                    continue
                # Don't include complex types
                if isinstance(v, dict):
                    v = json.dumps(v)
                elif not isinstance(v, str) and isinstance(v, Iterable):
                    v = ', '.join(v)

                if isinstance(v, str):
                    v = v.replace("'", "").replace('"', "")
                attrs['_global'][k] = v

            # set global attrs
            for key, val in attrs['_global'].items():
                setattr(ncout, key, val)

            # set variable attrs
            for key, value in attrs.get('time', {}).items():
                setattr(ncout.variables['time'], key, value)
            for varname, varconf in self.params['out_vars'].items():
                outname = varconf['out_name']
                for key, val in attrs.get(varname, {}).items():
                    setattr(ncout.variables[outname], key, val)

    def write_chunk(self, locks=None):
        '''write data from a single chunk'''
        if not len(self.params['out_vars']):
            return
        for times in self._times:
            filename = self._get_output_filename(times)
            lock = locks.get(filename, DummyLock())
            time_slice = slice(times[0], times[-1])
            with lock:
                with Dataset(filename, mode="r+") as ncout:
                    for varname, varconf in self.params['out_vars'].items():
                        outname = varconf['out_name']
                        dims = ncout.variables[outname].dimensions[1:]
                        write_slice = ((slice(None), ) +
                                       tuple(self._domain_slice[d]
                                             for d in dims))
                        ncout.variables[outname][write_slice] = (
                            self.output[varname].sel(time=time_slice).values)

    def run_slice(self):
        """
        Run a single slice of the domain
        """
        self._validate_setup()
        self.disagg = int(self.params['time_step']) < cnst.MIN_PER_DAY
        self.method = MetSim.methods[self.params['method']]
        self.setup_output()
        times = self.met_data['time']
        params = self.params.copy()
        # transform input parameters to floating point values
        params['sw_prec_thresh'] = float(params['sw_prec_thresh'])
        params['rain_scalar'] = float(params['rain_scalar'])
        params['tdew_tol'] = float(params['tdew_tol'])
        params['tmax_daylength_fraction'] = float(
            params['tmax_daylength_fraction'])
        params['tday_coef'] = float(params['tday_coef'])
        params['tmax_daylength_fraction'] = float(
            params['tmax_daylength_fraction'])
        params['lapse_rate'] = float(params['lapse_rate'])
        if self.params['prec_type'].upper() in ['TRIANGLE', 'MIX']:
            self.convert_monthly_param('dur')
            self.convert_monthly_param('t_pk')
        for index, mask_val in np.ndenumerate(self.domain['mask'].values):
            if mask_val > 0:
                locs = {d: i for d, i in zip(self.domain['mask'].dims, index)}
            else:
                continue

            df, state = wrap_run_cell(self.method.run, params,
                                      self.met_data.isel(**locs),
                                      self.state.isel(**locs), self.disagg,
                                      times)

            # Cut the returned data down to the correct time index
            # and do any required unit conversions
            for varname in self.params['out_vars']:
                desired_units = self.params['out_vars'][varname]['units']
                out_vals = converters[varname][desired_units](
                    df[varname].values, int(self.params['time_step']))
                self.output[varname][locs] = out_vals

    def _unpack_state(self, result: pd.DataFrame, locs: dict):
        """Put restart values in the state dataset"""
        # We concatenate with the old state values in case we don't
        # have 90 new days to use
        tmin = np.concatenate((self.state['t_min'].isel(**locs).values[:],
                               result['t_min'].values))
        tmax = np.concatenate((self.state['t_max'].isel(**locs).values[:],
                               result['t_max'].values))
        prec = np.concatenate(
            (self.state['prec'].isel(**locs).values[:], result['prec'].values))
        self.state['t_min'].isel(**locs).values[:] = tmin[-90:]
        self.state['t_max'].isel(**locs).values[:] = tmax[-90:]
        self.state['prec'].isel(**locs).values[:] = prec[-90:]
        state_start = result.index[-1] - pd.Timedelta('89 days')
        self.state['time'].values = date_range(
            state_start, result.index[-1], calendar=self.params['calendar'])

    def _get_output_times(self, freq=None, period_ending=False):
        """
        Generate chunked time vectors

        Parameters
        ----------
        freq:
            Output frequency. Given as a Pandas timegrouper string.
            If not given, the entire timeseries will be used.
        period_ending:
            Flag to specify if output timesteps should be period-
            ending. Default is period-beginning

        Returns
        -------
        times:
            A list of timeseries which represent each of times that
            output files will be created for.
        """
        prototype = self.met_data
        self.disagg = int(self.params['time_step']) < cnst.MIN_PER_DAY

        if self.disagg:
            delta = pd.Timedelta('1 days') - pd.Timedelta('{} minutes'.format(
                self.params['time_step']))
        else:
            delta = pd.Timedelta('0 days')
        if period_ending:
            offset = pd.Timedelta('{} minutes'.format(
                self.params['time_step']))
        else:
            offset = pd.Timedelta('0 minutes')

        start = pd.Timestamp(prototype['time'].values[0]).to_pydatetime()
        stop = pd.Timestamp(prototype['time'].values[-1]).to_pydatetime()
        times = date_range(start + offset,
                           stop + offset + delta,
                           freq="{}T".format(self.params['time_step']),
                           calendar=self.params['calendar'])

        if freq is None or freq == '':
            times = [times]
        else:
            dummy = pd.Series(np.arange(len(times)), index=times)
            grouper = pd.Grouper(freq=freq)
            times = [t.index for k, t in dummy.groupby(grouper)]
        return times

    def _get_output_filename(self, times):
        suffix = self.get_nc_output_suffix(times)
        fname = '{}_{}.nc'.format(self.params['out_prefix'], suffix)
        output_filename = os.path.join(os.path.abspath(self.params['out_dir']),
                                       fname)
        return output_filename

    def setup_output(self):

        # output times
        times = self._get_output_times(
            freq=None, period_ending=self.params['period_ending'])[0]

        # Number of timesteps
        n_ts = len(times)

        shape = (n_ts, ) + self.domain['mask'].shape
        dims = ('time', ) + self.domain['mask'].dims
        coords = {'time': times, **self.domain['mask'].coords}
        self.output = xr.Dataset(coords=coords)
        self.output['time'].encoding['calendar'] = self.params['calendar']

        dtype = self.params['out_precision']
        for varname in self.params['out_vars']:
            self.output[varname] = xr.DataArray(data=np.full(shape,
                                                             np.nan,
                                                             dtype=dtype),
                                                coords=coords,
                                                dims=dims,
                                                name=varname,
                                                attrs=attrs.get(varname, {}))
        self.output['time'].attrs.update(attrs['time'])

    def _aggregate_state(self):
        """Aggregate data out of the state file and load it into `met_data`"""
        # Precipitation record

        assert self.state.dims['time'] == 90, self.state['time']
        record_dates = date_range(self.params['state_start'],
                                  self.params['state_stop'],
                                  calendar=self.params['calendar'])
        trailing = self.state['prec']
        trailing['time'] = record_dates
        total_precip = xr.concat([trailing, self.met_data['prec']],
                                 dim='time').load()
        total_precip = (
            cnst.DAYS_PER_YEAR * total_precip.rolling(time=90).mean().sel(
                time=slice(self.params['start'], self.params['stop'])))

        self.met_data['seasonal_prec'] = total_precip

        # Smoothed daily temperature range
        trailing = self.state['t_max'] - self.state['t_min']

        trailing['time'] = record_dates
        dtr = self.met_data['t_max'] - self.met_data['t_min']
        if (dtr < 0).any():
            raise ValueError("Daily maximum temperature lower"
                             " than daily minimum temperature!")
        sm_dtr = xr.concat([trailing, dtr], dim='time').load()
        sm_dtr = sm_dtr.rolling(time=30).mean().drop(record_dates, dim='time')
        self.met_data['dtr'] = dtr
        self.met_data['smoothed_dtr'] = sm_dtr

    def _validate_setup(self):
        """Updates the global parameters dictionary"""
        errs = [""]

        # Make sure there's some input
        if not len(self.params.get('forcing', [])):
            errs.append("Requires input forcings to be specified")

        # Make sure there is at least one forcing_var
        # They cannot all be constant since we use one as a template
        # for the others
        if not len(self.params.get('forcing_vars', [])):
            errs.append("Requires at least one non-constant forcing")

        # Parameters that can't be empty strings or None
        non_empty = ['out_dir', 'time_step', 'forcing_fmt']
        for each in non_empty:
            if self.params.get(each, None) is None or self.params[each] == '':
                errs.append("Cannot have empty value for {}".format(each))

        # Make sure time step divides evenly into a day
        if (cnst.MIN_PER_DAY % int(self.params.get('time_step', -1))
                or (int(self.params['time_step']) > (6 * cnst.MIN_PER_HOUR)
                    and int(self.params['time_step']) != cnst.MIN_PER_DAY)):
            errs.append("Time step must be evenly divisible into 1440 "
                        "minutes (24 hours) and less than 360 minutes "
                        "(6 hours). Got {}.".format(self.params['time_step']))

        # Check for required input variable specification
        if self.met_data is not None:
            required_in = ['t_min', 't_max', 'prec']
            for each in required_in:
                if each not in self.met_data.variables:
                    errs.append("Input requires {}".format(each))

        # Make sure that we are going to write out some data
        if not len(self.params.get('out_vars', [])):
            errs.append("Output variable list must not be empty")

        # Check output variables are valid
        daily_out_vars = [
            't_min', 't_max', 't_day', 'prec', 'vapor_pressure', 'shortwave',
            'tskc', 'pet', 'wind'
        ]
        out_var_check = [
            'temp', 'prec', 'shortwave', 'vapor_pressure', 'air_pressure',
            'rel_humid', 'spec_humid', 'longwave', 'tskc', 'wind'
        ]
        if int(self.params.get('time_step', -1)) == 1440:
            out_var_check = daily_out_vars
        for var in self.params.get('out_vars', []):
            if var not in out_var_check:
                errs.append('Cannot output variable {} at timestep {}'.format(
                    var, self.params['time_step']))

        # Check that the parameters specified are available
        opts = {
            'out_precision': ['f4', 'f8'],
            'lw_cloud': ['default', 'cloud_deardorff'],
            'lw_type': [
                'default', 'tva', 'anderson', 'brutsaert', 'satterlund',
                'idso', 'prata'
            ]
        }
        for k, v in opts.items():
            if not self.params.get(k, None) in v:
                errs.append("Invalid option given for {}".format(k))

        # If any errors, raise and give a summary
        if len(errs) > 1:
            raise Exception("\n  ".join(errs))

    def get_nc_output_suffix(self, times):
        s, e = times[[0, -1]]
        template = '{:04d}{:02d}{:02d}-{:04d}{:02d}{:02d}'
        return template.format(
            s.year,
            s.month,
            s.day,
            e.year,
            e.month,
            e.day,
        )
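
# A minimal, standalone sketch (not part of the class above) of the pd.Grouper
# trick used in _get_output_times: a dummy Series indexed by the full time
# vector is grouped by an output frequency, and each group's index becomes the
# time slice for one output file. All names and values here are illustrative.
import numpy as np
import pandas as pd

times = pd.date_range('2000-01-01', '2000-03-31 23:00', freq='H')
dummy = pd.Series(np.arange(len(times)), index=times)
# one chunk of timestamps per calendar month
chunks = [group.index for _, group in dummy.groupby(pd.Grouper(freq='M'))]
for chunk in chunks:
    print(chunk[0], '->', chunk[-1], '({} steps)'.format(len(chunk)))
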
import output, logging
import load_subreddit_castra, make_subreddit_castra
from dask.diagnostics import ProgressBar
from pprint import pprint
import pandas as pd
import dask.dataframe as dd

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.getLogger('requests').setLevel(logging.CRITICAL)
logger = logging.getLogger(__name__)

# Start a progress bar for all computations
pbar = ProgressBar()
pbar.register()

def test(file_name):
    """
    # Subsetting the dataframe
    a = df[df.link_id == 't3_36k7u4'].compute()

    # Get multiple columns from the dataframe
    b = df[['author', 'subreddit']].compute()

    # Groupby operations
    c = df.groupby(['link_id', 'author'])['ups'].count().compute()
    c = df.groupby(df.link_id).ups.mean().compute()
    c = df.groupby(df.link_id).score.count().compute()

    # Drop duplicates
    d = df.author.drop_duplicates().compute()
    """
Ejemplo n.º 24
0
def pca(a, b, n_pc, estimator_matrix, out_dir, n_threads, block_size, nodata):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    :param A: first input raster data (fists period)
    :param B: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: pca with correlation of covariance
    :param out_dir: directory to save the outputs
    :return: pca files list and statistics
    """
    A = a
    B = b
    # get/set the nodata
    if nodata is None:
        ds = gdal.Open(A, gdal.GA_ReadOnly)
        nodata = ds.GetRasterBand(1).GetNoDataValue()
        del ds

    print("\nPRINCIPAL COMPONENTS ANALYSIS")
    print("    Compute {} components for:".format(n_pc))
    print("    A: {}".format(A))
    if B is not None:
        print("    B: {}".format(B))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))
    # registered Dask progress bar
    pbar = ProgressBar()
    pbar.register()

    print("\nRead and prepare data:")

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            nodata_mask = ds == nodata if nodata_mask is None else np.logical_or(
                nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B is not None:
        src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band +
                                        1).ReadAsArray().flatten().astype(
                                            np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking: keep only the pixels that are valid across all dimensions/bands
    if nodata is not None:
        raw_image = [b[~nodata_mask] for b in raw_image]
    # stack the flattened bands into one dask array, one chunk row per band
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # number of bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = []
    for i in range(n_bands):
        band_mean.append(dask.delayed(np.mean)(flat_dims[i]))
    band_mean = dask.compute(*band_mean)

    ########
    # compute the matrix correlation/covariance
    print("\nComputing the estimator matrix:")
    estimation_matrix = np.empty((n_bands, n_bands))
    if estimator_matrix == "correlation":
        for i in range(n_bands):
            deviation_scores_band_i = flat_dims[i] - band_mean[i]
            for j in range(i, n_bands):
                deviation_scores_band_j = flat_dims[j] - band_mean[j]
                estimation_matrix[j][i] = estimation_matrix[i][j] = \
                    da.corrcoef(deviation_scores_band_i, deviation_scores_band_j)[0][1]
    if estimator_matrix == "covariance":
        for i in range(n_bands):
            deviation_scores_band_i = flat_dims[i] - band_mean[i]
            for j in range(i, n_bands):
                deviation_scores_band_j = flat_dims[j] - band_mean[j]
                estimation_matrix[j][i] = estimation_matrix[i][j] = \
                    da.cov(deviation_scores_band_i, deviation_scores_band_j)[0][1]
    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # indices that sort the eigenvalues in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    # sort eigenvectors and eigenvalues with the same index
    eigenvectors = eigenvectors[:, idx_eigenvals]
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    def get_raw_band_from_stack(band):
        src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)
        if band >= src_ds_A.RasterCount:
            src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(band - src_ds_A.RasterCount +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (get_raw_band_from_stack(j) - band_mean[j])

    print("\nComputing and saving the components in pca-stack.tif:")

    # save component as file
    tmp_pca_file = Path(out_dir) / 'pca-stack.tif'
    driver = gdal.GetDriverByName("GTiff")
    out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                           src_ds_A.RasterYSize, n_pc, gdal.GDT_Float32)

    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        pc = np.array(pc.compute())
        if nodata is not None:
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))

        pcband = out_pc.GetRasterBand(i + 1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        del pc, pcband
    # set projection and geotransform
    if src_ds_A.GetGeoTransform() is not None:
        out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
    if src_ds_A.GetProjection() is not None:
        out_pc.SetProjection(src_ds_A.GetProjection())
    out_pc.FlushCache()

    # free mem
    del src_ds_A, nodata_mask, out_pc

    print("\nDONE")
Ejemplo n.º 25
0
def main(argv=sys.argv[1:]):
    global LOG
    from satpy import Scene
    from satpy.resample import get_area_def
    from satpy.writers import compute_writer_results
    from dask.diagnostics import ProgressBar
    from polar2grid.core.script_utils import (
        setup_logging, rename_log_file, create_exc_handler)
    import argparse
    prog = os.getenv('PROG_NAME', sys.argv[0])
    # "usage: " will be printed at the top of this:
    usage = """
    %(prog)s -h
see available products:
    %(prog)s -r <reader> -w <writer> --list-products -f file1 [file2 ...]
basic processing:
    %(prog)s -r <reader> -w <writer> [options] -f file1 [file2 ...]
basic processing with limited products:
    %(prog)s -r <reader> -w <writer> [options] -p prod1 prod2 -f file1 [file2 ...]
"""
    parser = argparse.ArgumentParser(prog=prog, usage=usage,
                                     description="Load, composite, resample, and save datasets.")
    parser.add_argument('-v', '--verbose', dest='verbosity', action="count", default=0,
                        help='each occurrence increases verbosity 1 level through ERROR-WARNING-INFO-DEBUG (default INFO)')
    parser.add_argument('-l', '--log', dest="log_fn", default=None,
                        help="specify the log filename")
    parser.add_argument('--progress', action='store_true',
                        help="show processing progress bar (not recommended for logged output)")
    parser.add_argument('--num-workers', type=int, default=4,
                        help="specify number of worker threads to use (default: 4)")
    parser.add_argument('--match-resolution', dest='preserve_resolution', action='store_false',
                        help="When using the 'native' resampler for composites, don't save data "
                             "at its native resolution, use the resolution used to create the "
                             "composite.")
    parser.add_argument('-w', '--writers', nargs='+',
                        help='writers to save datasets with')
    parser.add_argument("--list-products", dest="list_products", action="store_true",
                        help="List available reader products and exit")
    subgroups = add_scene_argument_groups(parser)
    subgroups += add_resample_argument_groups(parser)

    argv_without_help = [x for x in argv if x not in ["-h", "--help"]]
    args, remaining_args = parser.parse_known_args(argv_without_help)

    # get the logger if we know the readers and writers that will be used
    if args.reader is not None and args.writers is not None:
        glue_name = args.reader + "_" + "-".join(args.writers or [])
        LOG = logging.getLogger(glue_name)
    # add writer arguments
    if args.writers is not None:
        for writer in (args.writers or []):
            parser_func = WRITER_PARSER_FUNCTIONS.get(writer)
            if parser_func is None:
                continue
            subgroups += parser_func(parser)
    args = parser.parse_args(argv)

    if args.reader is None:
        parser.print_usage()
        parser.exit(1, "\nERROR: Reader must be provided (-r flag).\n"
                       "Supported readers:\n\t{}\n".format('\n\t'.join(['abi_l1b', 'ahi_hsd', 'hrit_ahi'])))
    if args.writers is None:
        parser.print_usage()
        parser.exit(1, "\nERROR: Writer must be provided (-w flag) with one or more writer.\n"
                       "Supported writers:\n\t{}\n".format('\n\t'.join(['geotiff'])))

    def _args_to_dict(group_actions):
        return {ga.dest: getattr(args, ga.dest) for ga in group_actions if hasattr(args, ga.dest)}
    scene_args = _args_to_dict(subgroups[0]._group_actions)
    load_args = _args_to_dict(subgroups[1]._group_actions)
    resample_args = _args_to_dict(subgroups[2]._group_actions)
    writer_args = {}
    for idx, writer in enumerate(args.writers):
        sgrp1, sgrp2 = subgroups[3 + idx * 2: 5 + idx * 2]
        wargs = _args_to_dict(sgrp1._group_actions)
        if sgrp2 is not None:
            wargs.update(_args_to_dict(sgrp2._group_actions))
        writer_args[writer] = wargs
        # get default output filename
        if 'filename' in wargs and wargs['filename'] is None:
            wargs['filename'] = get_default_output_filename(args.reader, writer)

    if not args.filenames:
        parser.print_usage()
        parser.exit(1, "\nERROR: No data files provided (-f flag)\n")

    # Prepare logging
    rename_log = False
    if args.log_fn is None:
        rename_log = True
        args.log_fn = glue_name + "_fail.log"
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    setup_logging(console_level=levels[min(3, args.verbosity)], log_filename=args.log_fn)
    logging.getLogger('rasterio').setLevel(levels[min(2, args.verbosity)])
    sys.excepthook = create_exc_handler(LOG.name)
    if levels[min(3, args.verbosity)] > logging.DEBUG:
        import warnings
        warnings.filterwarnings("ignore")
    LOG.debug("Starting script with arguments: %s", " ".join(sys.argv))

    # Set up dask and the number of workers
    if args.num_workers:
        from multiprocessing.pool import ThreadPool
        dask.config.set(pool=ThreadPool(args.num_workers))

    # Parse provided files and search for files if provided directories
    scene_args['filenames'] = get_input_files(scene_args['filenames'])
    # Create a Scene, analyze the provided files
    LOG.info("Sorting and reading input files...")
    try:
        scn = Scene(**scene_args)
    except ValueError as e:
        LOG.error("{} | Enable debug message (-vvv) or see log file for details.".format(str(e)))
        LOG.debug("Further error information: ", exc_info=True)
        return -1
    except OSError:
        LOG.error("Could not open files. Enable debug message (-vvv) or see log file for details.")
        LOG.debug("Further error information: ", exc_info=True)
        return -1

    if args.list_products:
        print("\n".join(sorted(scn.available_dataset_names(composites=True))))
        return 0

    # Rename the log file
    if rename_log:
        rename_log_file(glue_name + scn.attrs['start_time'].strftime("_%Y%m%d_%H%M%S.log"))

    # Load the actual data arrays and metadata (lazy loaded as dask arrays)
    if load_args['products'] is None:
        try:
            reader_mod = importlib.import_module('polar2grid.readers.' + scene_args['reader'])
            load_args['products'] = reader_mod.DEFAULT_PRODUCTS
            LOG.info("Using default product list: {}".format(load_args['products']))
        except (ImportError, AttributeError):
            LOG.error("No default products list set, please specify with `--products`.")
            return -1

    LOG.info("Loading product metadata from files...")
    scn.load(load_args['products'])

    resample_kwargs = resample_args.copy()
    areas_to_resample = resample_kwargs.pop('grids')
    grid_configs = resample_kwargs.pop('grid_configs')
    resampler = resample_kwargs.pop('resampler')

    if areas_to_resample is None and resampler in [None, 'native']:
        # no areas specified
        areas_to_resample = ['MAX']
    elif areas_to_resample is None:
        raise ValueError("Resampling method specified (--method) without any destination grid/area (-g flag).")
    elif not areas_to_resample:
        # they don't want any resampling (they used '-g' with no args)
        areas_to_resample = [None]

    has_custom_grid = any(g not in ['MIN', 'MAX', None] for g in areas_to_resample)
    if has_custom_grid and resampler == 'native':
        LOG.error("Resampling method 'native' can only be used with 'MIN' or 'MAX' grids "
                  "(use 'nearest' method instead).")
        return -1

    p2g_grid_configs = [x for x in grid_configs if x.endswith('.conf')]
    pyresample_area_configs = [x for x in grid_configs if not x.endswith('.conf')]
    if not grid_configs or p2g_grid_configs:
        # if we were given p2g grid configs or we weren't given any to choose from
        from polar2grid.grids import GridManager
        grid_manager = GridManager(*p2g_grid_configs)
    else:
        grid_manager = {}

    if pyresample_area_configs:
        from pyresample.utils import parse_area_file
        custom_areas = parse_area_file(pyresample_area_configs)
        custom_areas = {x.area_id: x for x in custom_areas}
    else:
        custom_areas = {}

    ll_bbox = resample_kwargs.pop('ll_bbox')
    if ll_bbox:
        scn = scn.crop(ll_bbox=ll_bbox)

    wishlist = scn.wishlist.copy()
    preserve_resolution = get_preserve_resolution(args, resampler, areas_to_resample)
    if preserve_resolution:
        preserved_products = set(wishlist) & set(scn.datasets.keys())
        resampled_products = set(wishlist) - preserved_products

        # original native scene
        to_save = write_scene(scn, args.writers, writer_args, preserved_products)
    else:
        preserved_products = set()
        resampled_products = set(wishlist)
        to_save = []

    LOG.debug("Products to preserve resolution for: {}".format(preserved_products))
    LOG.debug("Products to use new resolution for: {}".format(resampled_products))
    for area_name in areas_to_resample:
        if area_name is None:
            # no resampling
            area_def = None
        elif area_name == 'MAX':
            area_def = scn.max_area()
        elif area_name == 'MIN':
            area_def = scn.min_area()
        elif area_name in custom_areas:
            area_def = custom_areas[area_name]
        elif area_name in grid_manager:
            from pyresample.geometry import DynamicAreaDefinition
            p2g_def = grid_manager[area_name]
            area_def = p2g_def.to_satpy_area()
            if isinstance(area_def, DynamicAreaDefinition) and p2g_def['cell_width'] is not None:
                area_def = area_def.freeze(scn.max_area(),
                                           resolution=(abs(p2g_def['cell_width']), abs(p2g_def['cell_height'])))
        else:
            area_def = get_area_def(area_name)

        if resampler is None and area_def is not None:
            rs = 'native' if area_name in ['MIN', 'MAX'] else 'nearest'
            LOG.debug("Setting default resampling to '{}' for grid '{}'".format(rs, area_name))
        else:
            rs = resampler

        if area_def is not None:
            LOG.info("Resampling data to '%s'", area_name)
            new_scn = scn.resample(area_def, resampler=rs, **resample_kwargs)
        elif not preserve_resolution:
            # the user didn't want to resample to any areas
            # the user also requested that we don't preserve resolution
            # which means we have to save this Scene's datasets
            # because they won't be saved
            new_scn = scn

        to_save = write_scene(new_scn, args.writers, writer_args, resampled_products, to_save=to_save)

    if args.progress:
        pbar = ProgressBar()
        pbar.register()

    LOG.info("Computing products and saving data to writers...")
    compute_writer_results(to_save)
    LOG.info("SUCCESS")
    return 0
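
# Minimal sketch of the optional-progress pattern used above: dask is told how
# many worker threads to use, and the diagnostic ProgressBar is only enabled
# when the user asks for it. 'compute_with_optional_progress' and the toy
# computation are illustrative assumptions, not part of the script.
from multiprocessing.pool import ThreadPool

import dask
import dask.array as da
from dask.diagnostics import ProgressBar

def compute_with_optional_progress(show_progress=True, num_workers=4):
    dask.config.set(pool=ThreadPool(num_workers))
    x = da.random.random((2000, 2000), chunks=(500, 500))
    result = (x + x.T).mean()
    if show_progress:
        with ProgressBar():
            return result.compute()
    return result.compute()

print(compute_with_optional_progress())
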
Ejemplo n.º 26
0
def _load_basic_dataframe(df_file=None,
                          datatype='sim',
                          config='IC86.2012',
                          energy_reco=True,
                          energy_cut_key='reco_log_energy',
                          log_energy_min=None,
                          log_energy_max=None,
                          columns=None,
                          n_jobs=1,
                          verbose=False,
                          compute=True):

    validate_datatype(datatype)

    if df_file is not None:
        files = df_file
    else:
        paths = get_config_paths()
        file_pattern = os.path.join(paths.comp_data_dir, config, datatype,
                                    'processed_hdf',
                                    'nominal' if datatype == 'sim' else '',
                                    '*.hdf')
        files = sorted(glob.glob(file_pattern))

    ddf = dd.read_hdf(files,
                      key='dataframe',
                      mode='r',
                      columns=columns,
                      chunksize=10000)

    # Energy reconstruction
    if energy_reco:
        model_dict = load_trained_model(
            'linearregression_energy_{}'.format(config), return_metadata=True)
        pipeline = model_dict['pipeline']
        feature_list = list(model_dict['training_features'])

        def add_reco_energy(partition):
            partition['reco_log_energy'] = pipeline.predict(
                partition[feature_list])
            partition['reco_energy'] = 10**partition['reco_log_energy']
            return partition

        ddf = ddf.map_partitions(add_reco_energy)

    # Energy range cut
    if log_energy_min is not None and log_energy_max is not None:

        def apply_energy_cut(partition):
            energy_mask = (partition[energy_cut_key] > log_energy_min) & (
                partition[energy_cut_key] < log_energy_max)
            return partition.loc[energy_mask, :]

        ddf = ddf.map_partitions(apply_energy_cut)

    if compute:
        if verbose:
            pbar = ProgressBar()
            pbar.register()
        scheduler = 'processes' if n_jobs > 1 else 'synchronous'
        df = ddf.compute(scheduler=scheduler, num_workers=n_jobs)
        df = df.reset_index(drop=True)
    else:
        df = ddf

    return df
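
# Minimal sketch (with made-up column names and values) of the map_partitions
# pattern used above: a plain pandas function is applied to every partition of
# a dask dataframe, and the compute scheduler is chosen from the requested job
# count before computing under a ProgressBar.
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

pdf = pd.DataFrame({'log_energy': [5.8, 6.1, 7.3, 6.9],
                    'charge': [1.0, 2.0, 3.0, 4.0]})
ddf = dd.from_pandas(pdf, npartitions=2)

def apply_energy_cut(partition, lo=6.0, hi=7.0):
    mask = (partition['log_energy'] > lo) & (partition['log_energy'] < hi)
    return partition.loc[mask, :]

ddf = ddf.map_partitions(apply_energy_cut)

n_jobs = 1
scheduler = 'processes' if n_jobs > 1 else 'synchronous'
with ProgressBar():
    df = ddf.compute(scheduler=scheduler, num_workers=n_jobs)
print(df)
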
Ejemplo n.º 27
0
def main(argv=sys.argv[1:]):
    global LOG

    import satpy
    from satpy import Scene
    from satpy.writers import compute_writer_results
    from dask.diagnostics import ProgressBar
    from polar2grid.core.script_utils import (setup_logging, rename_log_file,
                                              create_exc_handler)
    import argparse

    dist = pkg_resources.get_distribution('polar2grid')
    if dist_is_editable(dist):
        p2g_etc = os.path.join(dist.module_path, 'etc')
    else:
        p2g_etc = os.path.join(sys.prefix, 'etc', 'polar2grid')
    config_path = satpy.config.get('config_path')
    if p2g_etc not in config_path:
        satpy.config.set(config_path=config_path + [p2g_etc])

    USE_POLAR2GRID_DEFAULTS = bool(
        int(os.environ.setdefault("USE_POLAR2GRID_DEFAULTS", "1")))

    prog = os.getenv('PROG_NAME', sys.argv[0])
    # "usage: " will be printed at the top of this:
    usage = """
    %(prog)s -h
see available products:
    %(prog)s -r <reader> -w <writer> --list-products -f file1 [file2 ...]
basic processing:
    %(prog)s -r <reader> -w <writer> [options] -f file1 [file2 ...]
basic processing with limited products:
    %(prog)s -r <reader> -w <writer> [options] -p prod1 prod2 -f file1 [file2 ...]
"""
    parser = argparse.ArgumentParser(
        prog=prog,
        usage=usage,
        fromfile_prefix_chars="@",
        description="Load, composite, resample, and save datasets.")
    parser.add_argument(
        '-v',
        '--verbose',
        dest='verbosity',
        action="count",
        default=0,
        help='each occurrence increases verbosity 1 level through '
        'ERROR-WARNING-INFO-DEBUG (default INFO)')
    parser.add_argument('-l',
                        '--log',
                        dest="log_fn",
                        default=None,
                        help="specify the log filename")
    parser.add_argument(
        '--progress',
        action='store_true',
        help="show processing progress bar (not recommended for logged output)"
    )
    parser.add_argument(
        '--num-workers',
        type=int,
        default=os.getenv('DASK_NUM_WORKERS', 4),
        help="specify number of worker threads to use (default: 4)")
    parser.add_argument(
        '--match-resolution',
        dest='preserve_resolution',
        action='store_false',
        help="When using the 'native' resampler for composites, don't save data "
        "at its native resolution, use the resolution used to create the "
        "composite.")
    parser.add_argument("--list-products",
                        dest="list_products",
                        action="store_true",
                        help="List available reader products and exit")
    reader_group = add_scene_argument_groups(
        parser, is_polar2grid=USE_POLAR2GRID_DEFAULTS)[0]
    resampling_group = add_resample_argument_groups(
        parser, is_polar2grid=USE_POLAR2GRID_DEFAULTS)[0]
    writer_group = add_writer_argument_groups(parser)[0]
    subgroups = [reader_group, resampling_group, writer_group]

    argv_without_help = [x for x in argv if x not in ["-h", "--help"]]

    _retitle_optional_arguments(parser)
    args, remaining_args = parser.parse_known_args(argv_without_help)
    os.environ['DASK_NUM_WORKERS'] = str(args.num_workers)

    # get the logger if we know the readers and writers that will be used
    if args.readers is not None and args.writers is not None:
        glue_name = args.readers[0] + "_" + "-".join(args.writers or [])
        LOG = logging.getLogger(glue_name)
    # add writer arguments
    for writer in (args.writers or []):
        parser_func = WRITER_PARSER_FUNCTIONS.get(writer)
        if parser_func is None:
            continue
        subgroups += parser_func(parser)
    args = parser.parse_args(argv)

    if args.readers is None:
        parser.print_usage()
        parser.exit(
            1, "\nERROR: Reader must be provided (-r flag).\n"
            "Supported readers:\n\t{}\n".format('\n\t'.join(
                ['abi_l1b', 'ahi_hsd', 'hrit_ahi'])))
    elif len(args.readers) > 1:
        parser.print_usage()
        parser.exit(
            1, "\nMultiple readers is not currently supported. Got:\n\t"
            "{}\n".format('\n\t'.join(args.readers)))
        return -1
    if args.writers is None:
        parser.print_usage()
        parser.exit(
            1,
            "\nERROR: Writer must be provided (-w flag) with one or more writer.\n"
            "Supported writers:\n\t{}\n".format('\n\t'.join(['geotiff'])))

    def _args_to_dict(group_actions, exclude=None):
        if exclude is None:
            exclude = []
        return {
            ga.dest: getattr(args, ga.dest)
            for ga in group_actions
            if hasattr(args, ga.dest) and ga.dest not in exclude
        }

    reader_args = _args_to_dict(reader_group._group_actions)
    reader_names = reader_args.pop('readers')
    scene_creation = {
        'filenames': reader_args.pop('filenames'),
        'reader': reader_names[0],
    }
    load_args = {
        'products': reader_args.pop('products'),
    }
    # anything left in 'reader_args' is a reader-specific kwarg
    resample_args = _args_to_dict(resampling_group._group_actions)
    writer_args = _args_to_dict(writer_group._group_actions)
    # writer_args = {}
    subgroup_idx = 3
    for idx, writer in enumerate(writer_args['writers']):
        sgrp1, sgrp2 = subgroups[subgroup_idx + idx * 2:subgroup_idx + 2 +
                                 idx * 2]
        wargs = _args_to_dict(sgrp1._group_actions)
        if sgrp2 is not None:
            wargs.update(_args_to_dict(sgrp2._group_actions))
        writer_args[writer] = wargs
        # get default output filename
        if 'filename' in wargs and wargs['filename'] is None:
            wargs['filename'] = get_default_output_filename(
                args.readers[0], writer)

    if not args.filenames:
        parser.print_usage()
        parser.exit(1, "\nERROR: No data files provided (-f flag)\n")

    # Prepare logging
    rename_log = False
    if args.log_fn is None:
        rename_log = True
        args.log_fn = glue_name + "_fail.log"
    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    setup_logging(console_level=levels[min(3, args.verbosity)],
                  log_filename=args.log_fn)
    logging.getLogger('rasterio').setLevel(levels[min(2, args.verbosity)])
    sys.excepthook = create_exc_handler(LOG.name)
    if levels[min(3, args.verbosity)] > logging.DEBUG:
        import warnings
        warnings.filterwarnings("ignore")
    LOG.debug("Starting script with arguments: %s", " ".join(sys.argv))

    # Set up dask and the number of workers
    if args.num_workers:
        dask.config.set(num_workers=args.num_workers)

    # Parse provided files and search for files if provided directories
    scene_creation['filenames'] = get_input_files(scene_creation['filenames'])
    # Create a Scene, analyze the provided files
    LOG.info("Sorting and reading input files...")
    try:
        scn = Scene(**scene_creation)
    except ValueError as e:
        LOG.error(
            "{} | Enable debug message (-vvv) or see log file for details.".
            format(str(e)))
        LOG.debug("Further error information: ", exc_info=True)
        return -1
    except OSError:
        LOG.error(
            "Could not open files. Enable debug message (-vvv) or see log file for details."
        )
        LOG.debug("Further error information: ", exc_info=True)
        return -1

    if args.list_products:
        print("\n".join(sorted(scn.available_dataset_names(composites=True))))
        return 0

    # Rename the log file
    if rename_log:
        rename_log_file(glue_name +
                        scn.attrs['start_time'].strftime("_%Y%m%d_%H%M%S.log"))

    # Load the actual data arrays and metadata (lazy loaded as dask arrays)
    LOG.info("Loading product metadata from files...")
    load_args['products'] = _apply_default_products_and_aliases(
        scn, scene_creation['reader'], load_args['products'])
    if not load_args['products']:
        return -1
    scn.load(load_args['products'])

    ll_bbox = resample_args.pop('ll_bbox')
    if ll_bbox:
        scn = scn.crop(ll_bbox=ll_bbox)

    scn = filter_scene(
        scn,
        reader_names,
        sza_threshold=reader_args['sza_threshold'],
        day_fraction=reader_args['filter_day_products'],
        night_fraction=reader_args['filter_night_products'],
    )
    if scn is None:
        LOG.info("No remaining products after filtering.")
        return 0

    to_save = []
    areas_to_resample = resample_args.pop("grids")
    if 'ewa_persist' in resample_args:
        resample_args['persist'] = resample_args.pop('ewa_persist')
    scenes_to_save = resample_scene(
        scn,
        areas_to_resample,
        preserve_resolution=args.preserve_resolution,
        is_polar2grid=USE_POLAR2GRID_DEFAULTS,
        **resample_args)
    for scene_to_save, products_to_save in scenes_to_save:
        overwrite_platform_name_with_aliases(scene_to_save)
        to_save = write_scene(scene_to_save,
                              writer_args['writers'],
                              writer_args,
                              products_to_save,
                              to_save=to_save)

    if args.progress:
        pbar = ProgressBar()
        pbar.register()

    LOG.info("Computing products and saving data to writers...")
    compute_writer_results(to_save)
    LOG.info("SUCCESS")
    return 0
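
# Sketch of the worker configuration used in this variant: instead of building
# a ThreadPool explicitly, the number of threaded-scheduler workers is taken
# from an environment variable and passed to dask.config.set. The array and
# values are illustrative only.
import os

import dask
import dask.array as da

num_workers = int(os.getenv('DASK_NUM_WORKERS', 4))
dask.config.set(num_workers=num_workers)

x = da.ones((1000, 1000), chunks=(250, 250))
print(x.sum().compute())  # runs on the threaded scheduler with num_workers threads
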
Ejemplo n.º 28
0
def get(*args, **kwargs):
    pbar = ProgressBar()
    pbar.register()
    out = client.get(*args, **kwargs)
    pbar.unregister()
    return out
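
# Equivalent sketch using ProgressBar as a context manager instead of the
# explicit register()/unregister() pair above; 'client' is assumed to be any
# object exposing a dask-style get(dsk, keys), e.g. the threaded scheduler.
import dask.threaded
from dask.diagnostics import ProgressBar

def get_with_progress(client, *args, **kwargs):
    with ProgressBar():
        return client.get(*args, **kwargs)

dsk = {'a': 1, 'b': (sum, ['a', 'a'])}
print(get_with_progress(dask.threaded, dsk, 'b'))  # -> 2
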