Example #1
0
 def test_to_resolved_name_dict_pairs_and_keep(self):
     """
     Resolve wildcard patterns against a list of names with ``keep=True``.

     The name ``'b'`` does not occur in the list of available names,
     yet it is expected in the result together with its properties.
     """
     available_names = [
         'a', 'a1', 'a2', 'b1', 'b2', 'x_c', 'y_c', 'd', 'e', 'e1', 'e_1',
         'e_12', 'e_AB'
     ]
     patterns = [
         ('a*', None),
         ('b', {'name': 'B'}),
         ('*_c', {'marker': True, 'name': 'C'}),
         ('d', {'name': 'D'}),
         ('e_??', {'marker': True}),
     ]
     expected = [
         # 'a*' expands to every name starting with 'a'
         ('a', None),
         ('a1', None),
         ('a2', None),
         # 'b' is kept although it is not an available name
         ('b', {'name': 'B'}),
         # '*_c' expands to both names ending in '_c'
         ('x_c', {'marker': True, 'name': 'C'}),
         ('y_c', {'marker': True, 'name': 'C'}),
         ('d', {'name': 'D'}),
         # 'e_??' requires exactly two characters after 'e_'
         ('e_12', {'marker': True}),
         ('e_AB', {'marker': True}),
     ]

     actual = to_resolved_name_dict_pairs(patterns,
                                          available_names,
                                          keep=True)

     self.assertEqual(expected, actual)
Example #2
0
def _process_input(input_processor: InputProcessor,
                   input_reader: DatasetIO,
                   input_reader_params: Dict[str, Any],
                   output_writer: DatasetIO,
                   output_writer_params: Dict[str, Any],
                   input_file: str,
                   output_size: Tuple[int, int],
                   output_region: Tuple[float, float, float, float],
                   output_resampling: str,
                   output_path: str,
                   output_metadata: NameAnyDict = None,
                   output_variables: NameDictPairList = None,
                   processed_variables: NameDictPairList = None,
                   profile_mode: bool = False,
                   dry_run: bool = False,
                   monitor: Callable[..., None] = None) -> bool:
    """
    Read a single input file, run it through a fixed sequence of
    processing steps and write the result to *output_path*.

    The steps are, in order: pre-processing, spatial subsetting,
    computing variables, selecting variables, transforming,
    adding time coordinates, updating variable attributes,
    post-processing and finally creating / appending / inserting /
    replacing the slice in the output, depending on the update mode
    returned by ``find_time_slice()``.

    :param input_processor: Provides the time range and the
        pre-process, process and post-process operations.
    :param input_reader: Reader used to open *input_file*.
    :param input_reader_params: Keyword arguments passed to
        ``input_reader.read()``.
    :param output_writer: Writer used to write, append, insert or replace
        the processed slice.
    :param output_writer_params: Keyword arguments passed to
        ``output_writer.write()`` and ``output_writer.append()``.
    :param input_file: The input to be read.
    :param output_size: Output size as ``(width, height)``.
    :param output_region: Output region as
        ``(x_min, y_min, x_max, y_max)``.
    :param output_resampling: Resampling passed to
        ``input_processor.process()``.
    :param output_path: Target path of the output.
    :param output_metadata: Global attributes passed on when the
        output is created.
    :param output_variables: Optional name / properties pairs selecting
        the output variables. If not given, all data variables of the
        input are used.
    :param processed_variables: Passed on to ``evaluate_dataset()``.
    :param profile_mode: If true, the steps are profiled with
        ``cProfile`` and the statistics are passed to *monitor*.
    :param dry_run: If true, the final write step leaves the output
        untouched.
    :param monitor: Callable receiving progress messages.
        If omitted, messages are discarded.
    :return: ``True`` if all steps completed; ``False`` if the input
        could not be read, its time range is inverted, a step returned
        ``None``, or a ``RuntimeError`` was raised during processing.
    """
    if monitor is None:
        # The signature allows monitor to be omitted, but it is called
        # unconditionally below; fall back to a no-op instead of failing
        # with "TypeError: 'NoneType' object is not callable".
        def _silent_monitor(*_args, **_kwargs) -> None:
            """Discard progress messages."""

        monitor = _silent_monitor

    monitor('reading input slice...')
    # noinspection PyBroadException
    try:
        input_dataset = input_reader.read(input_file, **input_reader_params)
        monitor(f'Dataset read:\n{input_dataset}')
    except Exception as e:
        monitor(f'Error: cannot read input: {e}: skipping...')
        traceback.print_exc()
        return False

    time_range = input_processor.get_time_range(input_dataset)
    if time_range[0] > time_range[1]:
        monitor('Error: start time is greater than end time: skipping...')
        # Release the dataset on this early exit too; the "finally"
        # clause further below only covers the processing steps.
        input_dataset.close()
        return False

    if output_variables:
        output_variables = to_resolved_name_dict_pairs(output_variables,
                                                       input_dataset,
                                                       keep=True)
    else:
        output_variables = [(var_name, None)
                            for var_name in input_dataset.data_vars]

    # The slice is located in the output by the centre of its time range.
    time_index, update_mode = find_time_slice(
        output_path,
        from_time_in_days_since_1970((time_range[0] + time_range[1]) / 2))

    width, height = output_size
    x_min, y_min, x_max, y_max = output_region
    # Use the larger of the two per-axis resolutions for both axes.
    xy_res = max((x_max - x_min) / width, (y_max - y_min) / height)
    output_geom = ImageGeom(size=output_size,
                            x_min=x_min,
                            y_min=y_min,
                            xy_res=xy_res,
                            is_geo_crs=True)

    # List of (transform, label) pairs. Each transform receives the
    # dataset returned by its predecessor; returning None aborts the run.
    steps = []

    # noinspection PyShadowingNames
    def step1(input_slice):
        return input_processor.pre_process(input_slice)

    steps.append((step1, 'pre-processing input slice'))

    # Set by step1a and consumed by step4.
    geo_coding = None

    # noinspection PyShadowingNames
    def step1a(input_slice):
        nonlocal geo_coding
        geo_coding = GeoCoding.from_dataset(input_slice)
        subset = select_spatial_subset(input_slice,
                                       xy_bbox=output_geom.xy_bbox,
                                       xy_border=output_geom.xy_res,
                                       ij_border=1,
                                       geo_coding=geo_coding)
        if subset is None:
            monitor('no spatial overlap with input')
        elif subset is not input_slice:
            # A new dataset was returned, so derive its geo-coding anew.
            geo_coding = GeoCoding.from_dataset(subset)
        return subset

    steps.append((step1a, 'spatial subsetting'))

    # noinspection PyShadowingNames
    def step2(input_slice):
        return evaluate_dataset(input_slice,
                                processed_variables=processed_variables)

    steps.append((step2, 'computing input slice variables'))

    # noinspection PyShadowingNames
    def step3(input_slice):
        extra_vars = input_processor.get_extra_vars(input_slice)
        selected_variables = {var_name for var_name, _ in output_variables}
        selected_variables.update(extra_vars or set())
        return select_variables_subset(input_slice, selected_variables)

    steps.append((step3, 'selecting input slice variables'))

    # noinspection PyShadowingNames
    def step4(input_slice):
        # noinspection PyTypeChecker
        return input_processor.process(input_slice,
                                       geo_coding=geo_coding,
                                       output_geom=output_geom,
                                       output_resampling=output_resampling,
                                       include_non_spatial_vars=False)

    steps.append((step4, 'transforming input slice'))

    # NOTE: time_range has already been indexed above, so it cannot be
    # None when this line is reached; the check is kept as a safeguard.
    if time_range is not None:

        def step5(input_slice):
            return add_time_coords(input_slice, time_range)

        steps.append((step5, 'adding time coordinates to input slice'))

    def step6(input_slice):
        return update_dataset_var_attrs(input_slice, output_variables)

    steps.append((step6, 'updating variable attributes of input slice'))

    def step7(input_slice):
        return input_processor.post_process(input_slice)

    steps.append((step7, 'post-processing input slice'))

    # The final step depends on the update mode. For any other mode
    # no write step is added at all.
    if update_mode == 'create':

        def step8(input_slice):
            if not dry_run:
                rimraf(output_path)
                output_writer.write(input_slice, output_path,
                                    **output_writer_params)
                _update_cube_attrs(output_writer,
                                   output_path,
                                   global_attrs=output_metadata,
                                   temporal_only=False)
            return input_slice

        steps.append((step8, f'creating input slice in {output_path}'))

    elif update_mode == 'append':

        def step8(input_slice):
            if not dry_run:
                output_writer.append(input_slice, output_path,
                                     **output_writer_params)
                _update_cube_attrs(output_writer,
                                   output_path,
                                   temporal_only=True)
            return input_slice

        steps.append((step8, f'appending input slice to {output_path}'))

    elif update_mode == 'insert':

        def step8(input_slice):
            if not dry_run:
                output_writer.insert(input_slice, time_index, output_path)
                _update_cube_attrs(output_writer,
                                   output_path,
                                   temporal_only=True)
            return input_slice

        steps.append((
            step8,
            f'inserting input slice before index {time_index} in {output_path}'
        ))

    elif update_mode == 'replace':

        def step8(input_slice):
            if not dry_run:
                output_writer.replace(input_slice, time_index, output_path)
                _update_cube_attrs(output_writer,
                                   output_path,
                                   temporal_only=True)
            return input_slice

        steps.append(
            (step8,
             f'replacing input slice at index {time_index} in {output_path}'))

    if profile_mode:
        pr = cProfile.Profile()
        pr.enable()

    status = True
    try:
        num_steps = len(steps)
        dataset = input_dataset
        total_t1 = time.perf_counter()
        for step_number, (transform, label) in enumerate(steps, start=1):
            step_t1 = time.perf_counter()
            monitor(f'step {step_number} of {num_steps}: {label}...')
            dataset = transform(dataset)
            step_t2 = time.perf_counter()
            if dataset is None:
                # A step signals "nothing to do for this input" by
                # returning None; the remaining steps are skipped.
                monitor(
                    f'  {label} terminated after {step_t2 - step_t1} seconds, skipping input slice'
                )
                status = False
                break
            monitor(f'  {label} completed in {step_t2 - step_t1} seconds')
        total_t2 = time.perf_counter()
        monitor(
            f'{num_steps} steps took {total_t2 - total_t1} seconds to complete'
        )
    except RuntimeError as e:
        monitor(
            f'Error: something went wrong during processing, skipping input slice: {e}'
        )
        traceback.print_exc()
        status = False
    finally:
        input_dataset.close()

    if profile_mode:
        # noinspection PyUnboundLocalVariable
        pr.disable()
        s = io.StringIO()
        ps = pstats.Stats(pr, stream=s).sort_stats('cumtime')
        ps.print_stats()
        monitor(s.getvalue())

    return status
Example #3
0
def compute_dataset(dataset: xr.Dataset,
                    processed_variables: NameDictPairList = None,
                    errors: str = 'raise') -> xr.Dataset:
    """
    Compute a dataset from another dataset and return it.

    Variables are processed one after the other. For each variable,
    two optional attributes are recognized:

    1. ``expression``: a Python expression whose result becomes the
       new value of the variable.
    2. ``valid_pixel_expression``: a Python expression whose result is
       used as mask for the variable by calling ``where()`` on it.

    Expressions are evaluated in a namespace that contains the names
    ``NaN``, ``PI``, ``np`` and ``xr`` and all data variables of
    *dataset*. Variables recognized as flag variables are made available
    as ``MaskSet`` objects. Results of preceding expressions replace the
    entries of the same name in that namespace, so later expressions
    can refer to them.

    The attributes of a variable, including the expression attributes,
    are stored in the attributes of the resulting variable.

    :param dataset: A dataset.
    :param processed_variables: Optional list of variable
        name / attributes pairs that will be processed in the given
        order. If omitted, all data variables of *dataset* are
        processed in sorted order.
    :param errors: How to deal with errors while evaluating expressions.
        May be one of "raise", "warn", or "ignore".
    :return: A copy of *dataset* that additionally contains all
        computed and masked variables.
    :raise ValueError: if a ``valid_pixel_expression`` is given for
        a variable that neither exists nor has been computed.
    """
    if not processed_variables:
        # No explicit selection: process all data variables, sorted
        sort_key = functools.partial(_get_var_sort_key, dataset)
        processed_variables = [
            (name, None) for name in sorted(dataset.data_vars, key=sort_key)
        ]
    else:
        processed_variables = to_resolved_name_dict_pairs(processed_variables,
                                                          dataset,
                                                          keep=True)

    # Constants and modules available to every expression
    namespace = dict(NaN=np.nan, PI=math.pi, np=np, xr=xr)
    # Mask sets for flag variables, plain arrays for all others
    for var_name in dataset.data_vars:
        var = dataset[var_name]
        namespace[var_name] = MaskSet(var) if MaskSet.is_flag_var(var) else var

    for var_name, var_props in processed_variables:
        if var_name in dataset.data_vars:
            # Existing variable: given properties override its attributes
            var = dataset[var_name]
            attrs = dict(var.attrs)
            if var_props:
                attrs.update(var_props)
        else:
            # Variable that is yet to be computed
            var = None
            attrs = dict() if var_props is None else var_props

        expression = attrs.get('expression')
        if expression:
            computed_array = compute_array_expr(expression,
                                                namespace=namespace,
                                                result_name=f'{var_name!r}',
                                                errors=errors)
            if computed_array is not None:
                if hasattr(computed_array, 'attrs'):
                    var = computed_array
                    var.attrs.update(attrs)
                namespace[var_name] = computed_array

        valid_pixel_expression = attrs.get('valid_pixel_expression')
        if not valid_pixel_expression:
            continue

        # Masking requires a variable to apply the mask to
        if var is None:
            raise ValueError(f'undefined variable {var_name!r}')

        valid_mask = compute_array_expr(
            valid_pixel_expression,
            namespace=namespace,
            result_name=f'valid mask for {var_name!r}',
            errors=errors)
        if valid_mask is None:
            continue

        masked_var = var.where(valid_mask)
        if hasattr(masked_var, 'attrs'):
            masked_var.attrs.update(attrs)
        namespace[var_name] = masked_var

    # Only data arrays of the namespace become variables of the result
    computed_dataset = dataset.copy()
    for name, value in namespace.items():
        if isinstance(value, xr.DataArray):
            computed_dataset[name] = value

    return computed_dataset
Example #4
0
def evaluate_dataset(dataset: xr.Dataset,
                     processed_variables: NameDictPairList = None,
                     errors: str = 'raise') -> xr.Dataset:
    """
    Compute new variables or mask existing variables in *dataset*
    by the evaluation of Python expressions, that may refer to other
    existing or new variables.
    Returns a new dataset that contains the old and new variables,
    where both may now be masked.

    Expressions may be given by attributes of existing variables in
    *dataset* or passed via the *processed_variables* argument
    which is a sequence of variable name / attributes tuples.

    The following attributes are recognized:

    1. The attribute ``expression`` generates
       a new variable computed from its attribute value.
    2. The attribute ``valid_pixel_expression`` masks out
       invalid variable values by calling ``where()`` on the variable
       with the result of the expression.
    3. The attribute ``load``, if true, causes ``load()`` to be called
       on the computed or masked variable.

    Expressions are evaluated in a namespace that contains the names
    ``NaN``, ``PI``, ``np`` and ``xr`` and all data variables of
    *dataset*. Variables recognized as flag variables are made available
    as ``MaskSet`` objects. Results of preceding expressions replace the
    entries of the same name in that namespace, so later expressions
    can refer to them.

    The attributes of a variable, including the ones listed above,
    are stored in the attributes of the resulting variable.

    :param dataset: A dataset.
    :param processed_variables: Optional list of variable
        name-attributes pairs that will be processed in the given order.
        If omitted, all data variables of *dataset* are processed
        in sorted order.
    :param errors: How to deal with errors while evaluating expressions.
        May be one of "raise", "warn", or "ignore".
    :return: A copy of *dataset* that additionally contains all
        computed and masked variables.
    :raise ValueError: if a ``valid_pixel_expression`` is given for
        a variable that neither exists nor has been computed.
    """
    if not processed_variables:
        # No explicit selection: process all data variables, sorted
        sort_key = functools.partial(_get_var_sort_key, dataset)
        processed_variables = [
            (name, None) for name in sorted(dataset.data_vars, key=sort_key)
        ]
    else:
        processed_variables = to_resolved_name_dict_pairs(processed_variables,
                                                          dataset,
                                                          keep=True)

    # Constants and modules available to every expression
    namespace = dict(NaN=np.nan, PI=math.pi, np=np, xr=xr)
    # Mask sets for flag variables, plain arrays for all others
    for var_name in dataset.data_vars:
        var = dataset[var_name]
        namespace[var_name] = MaskSet(var) if MaskSet.is_flag_var(var) else var

    for var_name, var_props in processed_variables:
        if var_name in dataset.data_vars:
            # Existing variable: given properties override its attributes
            var = dataset[var_name]
            attrs = dict(var.attrs)
            if var_props:
                attrs.update(var_props)
        else:
            # Variable that is yet to be computed
            var = None
            attrs = dict() if var_props is None else var_props

        load_requested = attrs.get('load', False)

        expression = attrs.get('expression')
        if expression:
            computed_array = compute_array_expr(expression,
                                                namespace=namespace,
                                                result_name=f'{var_name!r}',
                                                errors=errors)
            if computed_array is not None:
                if hasattr(computed_array, 'attrs'):
                    var = computed_array
                    var.attrs.update(attrs)
                if load_requested:
                    computed_array.load()
                namespace[var_name] = computed_array

        valid_pixel_expression = attrs.get('valid_pixel_expression')
        if not valid_pixel_expression:
            continue

        # Masking requires a variable to apply the mask to
        if var is None:
            raise ValueError(f'undefined variable {var_name!r}')

        valid_mask = compute_array_expr(
            valid_pixel_expression,
            namespace=namespace,
            result_name=f'valid mask for {var_name!r}',
            errors=errors)
        if valid_mask is None:
            continue

        masked_var = var.where(valid_mask)
        if hasattr(masked_var, 'attrs'):
            masked_var.attrs.update(attrs)
        if load_requested:
            masked_var.load()
        namespace[var_name] = masked_var

    # Only data arrays of the namespace become variables of the result
    computed_dataset = dataset.copy()
    for name, value in namespace.items():
        if isinstance(value, xr.DataArray):
            computed_dataset[name] = value

    return computed_dataset