Ejemplo n.º 1
0
 def test_unpack_column_raises_on_max_value_violation(self):
     """A value above a column's ``max_value`` must raise ``ValueError``.

     The single cell holds 1 while the column description caps it at 0.
     """
     single_cell_table = pd.DataFrame([(1, )])
     capped_column = {
         "index": 0,
         "name": "col0",
         "type": int,
         "max_value": 0
     }
     with self.assertRaises(ValueError):
         sit_parser.unpack_table(table=single_cell_table,
                                 column_descriptions=[capped_column],
                                 table_name="")
Ejemplo n.º 2
0
 def test_unpack_table_raises_on_duplicate_column(self):
     """Two column descriptions sharing one name must raise ``ValueError``.
     """
     two_column_table = pd.DataFrame([("0", "0")])
     # both descriptions use the name "duplicate", at indexes 0 and 1
     clashing_descriptions = [
         {"index": position, "name": "duplicate"} for position in range(2)
     ]
     with self.assertRaises(ValueError):
         sit_parser.unpack_table(table=two_column_table,
                                 column_descriptions=clashing_descriptions,
                                 table_name="")
Ejemplo n.º 3
0
def parse(yield_table, classifiers, classifier_values, age_classes):
    """Parses and validates the CBM SIT growth and yield format.

    Args:
        yield_table (pandas.DataFrame): SIT formatted growth and yield data
        classifiers (pandas.DataFrame): used to validate the classifier
            set columns of the yield data (the ``id`` and ``name`` columns
            are read). Use the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_values (pandas.DataFrame): used to validate the classifier
            set columns of the yield data (the ``classifier_id`` and ``name``
            columns are read). Use the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        age_classes (pandas.DataFrame): used to validate the number of volume
            columns: one volume column is expected per row of this table.
            Use the return value of:
            :py:func:`libcbm.input.sit.sit_age_class_parser.parse`

    Raises:
        ValueError: the unpacked data did not have the correct number of
            columns according to the defined classifiers and age classes
        ValueError: Classifier sets were not valid according to the specified
            classifiers and classifier_values.

    Returns:
        pandas.DataFrame: the table produced by
            ``sit_parser.unpack_table`` for the yield format, once the
            checks above have passed.

    NOTE(review): earlier documentation mentioned validation against a
    species map and substituted species in the result; this function does
    neither itself - confirm whether that happens elsewhere.
    """
    yield_format = sit_format.get_yield_format(
        classifiers.name, len(yield_table.columns))

    unpacked_table = sit_parser.unpack_table(
        yield_table, yield_format, "yield")

    # check that the number of volumes is equal to the number of age classes:
    # the expected layout is one column per classifier, one extra column
    # (leading species), then one volume column per age class
    n_classifiers = len(classifiers)
    n_age_classes = len(age_classes)
    expected_column_count = n_age_classes + n_classifiers + 1
    actual_column_count = len(unpacked_table.columns)
    if expected_column_count != actual_column_count:
        raise ValueError(
            f"expected {expected_column_count} columns but found "
            f"{actual_column_count}. The expected count is defined as "
            f"{n_classifiers} classifiers plus 1 leading species column "
            f"plus {n_age_classes} age classes")

    # the wildcard is accepted for every classifier, so build it only once
    wildcard = np.array([sit_classifier_parser.get_wildcard_keyword()])

    # check that each value in the yield table classifier sets is defined in
    # classifier values (or is the wildcard)
    for row in classifiers.itertuples():
        yield_classifiers = unpacked_table[row.name].unique()
        defined_classifier_values = classifier_values[
            classifier_values["classifier_id"] == row.id]["name"].unique()
        valid_classifiers = np.concatenate(
            [defined_classifier_values, wildcard])

        diff = np.setdiff1d(yield_classifiers, valid_classifiers)
        if len(diff) > 0:
            raise ValueError(
                "Undefined classifier values detected: "
                f"classifier: '{row.name}', values: {diff}")

    return unpacked_table
Ejemplo n.º 4
0
    def test_unpack_column_raises_on_unconvertable_value(self):
        """checks that a value which cannot be converted to the column's
        declared type makes ``unpack_table`` raise ``ValueError``
        """

        cases = [
            ("invalid_integer", int),
            ("1.1", int),
            ("invalid_float", float),
        ]

        for value, constraint_type in cases:
            # subTest names the failing (value, type) pair in the report and
            # lets the remaining cases run instead of stopping at the first
            # failure
            with self.subTest(value=value, constraint_type=constraint_type):
                with self.assertRaises(ValueError):
                    sit_parser.unpack_table(
                        table=pd.DataFrame([(value, )]),
                        column_descriptions=[{
                            "index": 0,
                            "name": "col0",
                            "type": constraint_type
                        }],
                        table_name="")
def parse(disturbance_types_table):
    """Unpack and validate a SIT formatted disturbance type table.

    Args:
        disturbance_types_table (pandas.DataFrame): a table in SIT
            disturbance type format

    Example:

        Input:

            ========  =========
              0         1
            ========  =========
            distid1   fire
            distid2   clearcut
            distid3   clearcut
            ========  =========

        Output:

            ========================  ========  =========
            sit_disturbance_type_id    id         name
            ========================  ========  =========
            1                         distid1   fire
            2                         distid2   clearcut
            3                         distid3   clearcut
            ========================  ========  =========


    Raises:
        ValueError: an id appears on more than one row of the disturbance
            type data.

    Returns:
        pandas.DataFrame: the unpacked table with standardized column
            names, preceded by a 1-based ``sit_disturbance_type_id`` column
    """
    column_format = sit_format.get_disturbance_type_format(
        len(disturbance_types_table.columns))
    result = sit_parser.unpack_table(
        disturbance_types_table, column_format, "disturbance types")

    # count the rows sharing each id; any count above 1 is a duplicate
    rows_per_id = result.groupby("id").size()
    repeated_ids = list(rows_per_id[rows_per_id > 1].index)
    if repeated_ids:
        raise ValueError(
            f"duplicate ids detected in disturbance types {repeated_ids}")

    # number the rows 1..n so each SIT disturbance type has a numeric
    # identifier, stored as the first column
    sequential_ids = np.arange(1, len(result) + 1)
    result.insert(0, "sit_disturbance_type_id", sequential_ids)
    return result
Ejemplo n.º 6
0
    def test_unpack_table_expected_result(self):
        """test that unpack_table names the columns and converts the values
        as requested by the column descriptions
        """
        column_descriptions = [
            {"index": 0, "name": "col0", "type": int},
            {"index": 1, "name": "col1", "type": float},
            # no "type" entry: the value is expected to stay a string
            {"index": 2, "name": "col2"},
        ]
        unpacked = sit_parser.unpack_table(
            table=pd.DataFrame([("1", "2", "3")]),
            column_descriptions=column_descriptions,
            table_name="")

        # assertEqual (rather than assertTrue(a == b)) reports both operands
        # when the comparison fails
        self.assertEqual(list(unpacked.columns), ["col0", "col1", "col2"])

        first_row = next(iter(unpacked.itertuples()))
        self.assertEqual(first_row.col0, 1)
        self.assertEqual(first_row.col1, 2.0)
        self.assertEqual(first_row.col2, "3")
Ejemplo n.º 7
0
def parse(age_class_table):
    """Parse the sit age class table format into a table of age classes with
    fields:

        - name
        - class_size
        - start_year
        - end_year

    Each age class after the first starts the year after the previous class
    ends and spans ``class_size`` years.

    Args:
        age_class_table (pandas.DataFrame): a dataframe

    Raises:
        ValueError: the first row does not have a class size of 0
        ValueError: a row other than the first has a class size that is not
            greater than 0
        ValueError: duplicate values in the first column of the specified
            table were detected

    Example:

        Input:

            ======  ====
             0       1
            ======  ====
            age_0    0
            age_1    10
            age_2    10
            age_3    10
            age_4    10
            age_5    10
            age_6    10
            age_7    10
            age_8    10
            age_9    10
            ======  ====

        Output:

            ======  ===========  ===========  =========
            name    class_size   start_year   end_year
            ======  ===========  ===========  =========
            age_0    0              0           0
            age_1    10             1           10
            age_2    10             11          20
            age_3    10             21          30
            age_4    10             31          40
            age_5    10             41          50
            age_6    10             51          60
            age_7    10             61          70
            age_8    10             71          80
            age_9    10             81          90
            ======  ===========  ===========  =========


    Returns:
        pandas.DataFrame: a dataframe describing the age classes.
    """
    table = sit_parser.unpack_table(age_class_table,
                                    sit_format.get_age_class_format(),
                                    "age classes")

    result = []
    for i, row in enumerate(table.itertuples()):
        size = row.class_size
        if i == 0:
            # the first row is the zero-width class covering year 0 only
            if size != 0:
                raise ValueError("First age class row expected to have 0 size")
            result.append({
                "name": row.id,
                "class_size": 0,
                "start_year": 0,
                "end_year": 0
            })
            continue

        # a zero or negative size would give end_year < start_year
        if size <= 0:
            raise ValueError(
                "All age class rows other than the "
                "first one must have size > 0")
        start_year = result[-1]["end_year"] + 1
        result.append({
            "name": row.id,
            "class_size": size,
            "start_year": start_year,
            "end_year": start_year + size - 1
        })

    age_classes = pd.DataFrame(
        result, columns=["name", "class_size", "start_year", "end_year"])

    # count the rows sharing each name; any count above 1 is a duplicate
    duplicates = age_classes.groupby("name").size()
    duplicates = list(duplicates[duplicates > 1].index)
    if duplicates:
        raise ValueError(
            f"duplicate names detected in age classes {duplicates}")
    return age_classes
def parse(transition_rules, classifiers, classifier_values,
          classifier_aggregates, disturbance_types, age_classes):
    """Unpack and validate CBM SIT formatted transition rules.

    Args:
        transition_rules (pandas.DataFrame): CBM SIT transition rule formatted
            data.
        classifiers (pandas.DataFrame): classifier table (``id`` and ``name``
            are read) used to validate the classifier set columns of the
            transition rule data. Use the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_values (pandas.DataFrame): classifier value table
            (``classifier_id`` and ``name`` are read) used to validate the
            classifier set columns. Use the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_aggregates (iterable): iterated as a sequence of mappings
            from which the ``"classifier_id"`` and ``"name"`` keys are read;
            aggregate names are accepted in the source classifier set only.
            Use the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        disturbance_types (pandas.DataFrame): its ``id`` column is used to
            validate the disturbance_type column of the transition rule data.
            Use the return value of:
            :py:func:`libcbm.input.sit.sit_disturbance_types_parser.parse`
        age_classes (pandas.DataFrame): handed to
            ``sit_parser.substitute_using_age_class_rows`` together with the
            ``using_age_class`` column parser.  Use the return value of:
            :py:func:`libcbm.input.sit.sit_age_class_parser.parse`

    Raises:
        ValueError: undefined classifier values were found in the source or
            destination classifier sets of the transition rules
        ValueError: undefined disturbance types were found in the transition
            rule disturbance_type column
        ValueError: a grouped set of transition rules has a total percent
            greater than 100 plus ``GROUPED_PERCENT_ERR_MAX``.

    Returns:
        pandas.DataFrame: validated transition rules; when the unpacked table
            has no rows it is returned as-is without further processing
    """
    column_format = sit_format.get_transition_rules_format(
        classifiers.name, len(transition_rules.columns))
    transitions = sit_parser.unpack_table(
        transition_rules, column_format, "transitions")

    # an empty rule table needs no validation
    if transitions.shape[0] == 0:
        return transitions

    # every value in the source classifier sets must be a defined classifier
    # value, an aggregate or the wildcard; destination sets additionally
    # exclude aggregates
    for classifier in classifiers.itertuples():
        source_values = transitions[classifier.name].unique()

        # destination columns are named <classifier name><postfix>
        dest_postfix = sit_format.get_tr_classifier_set_postfix()
        dest_values = transitions[f"{classifier.name}{dest_postfix}"]

        is_current = classifier_values["classifier_id"] == classifier.id
        defined_values = classifier_values[is_current]["name"].unique()

        aggregate_names = np.array([
            aggregate["name"] for aggregate in classifier_aggregates
            if aggregate["classifier_id"] == classifier.id])
        wildcard = np.array([sit_classifier_parser.get_wildcard_keyword()])

        unknown_source = np.setdiff1d(
            source_values,
            np.concatenate([defined_values, aggregate_names, wildcard]))
        if unknown_source.size > 0:
            raise ValueError(
                "Undefined classifier values detected: "
                f"classifier: '{classifier.name}', values: {unknown_source}")

        unknown_dest = np.setdiff1d(
            dest_values, np.concatenate([defined_values, wildcard]))
        if unknown_dest.size > 0:
            raise ValueError(
                "Undefined classifier values detected: "
                f"classifier: '{classifier.name}', values: {unknown_dest}")

    parse_bool_func = sit_parser.get_parse_bool_func(
        "transitions", "using_age_class")
    transitions = sit_parser.substitute_using_age_class_rows(
        transitions, parse_bool_func, age_classes)

    # every disturbance type referenced by a rule must be a SIT disturbance
    # type id
    undefined_disturbances = np.setdiff1d(
        transitions.disturbance_type.unique(),
        disturbance_types.id.unique())
    if undefined_disturbances.size > 0:
        raise ValueError(
            "Undefined disturbance type ids (as defined in sit "
            f"disturbance types) detected: {undefined_disturbances}"
        )

    # keep a single min/max age pair, taken from the softwood columns
    softwood_to_generic = {
        "min_softwood_age": "min_age",
        "max_softwood_age": "max_age"}
    obsolete_columns = [
        "using_age_class", "min_hardwood_age", "max_hardwood_age"]
    transitions = transitions.rename(columns=softwood_to_generic)
    transitions = transitions.drop(columns=obsolete_columns)

    # rules sharing classifier set, age range and disturbance type form a
    # group whose percents may not add up to more than 100% (plus tolerance)
    group_cols = [*classifiers.name, "min_age", "max_age", "disturbance_type"]
    percent_totals = transitions[[*group_cols, "percent"]] \
        .groupby(group_cols).sum()
    percent_limit = 100 + GROUPED_PERCENT_ERR_MAX
    over_limit = percent_totals[percent_totals.percent > percent_limit]
    if over_limit.shape[0] > 0:
        # report at most the first few offending groups
        invalid_percents = [
            group.Index for group in over_limit.head().itertuples()]
        raise ValueError(
            "the following groups have a total percent greater than 100%: "
            f"{invalid_percents}")

    return transitions
Ejemplo n.º 9
0
def parse_eligibilities(disturbance_events, disturbance_eligibilities):
    """Unpack and validate disturbance eligibilities, a libcbm-specific
    alternative to the eligibility columns of the cbm-cfs3 sit_disturbance
    events input.

    Compared with the CBM3-SIT format this greatly reduces the number of
    columns in sit_events, and it allows arbitrary boolean expressions of
    stand pool and state values instead of min/max ranges.

    Example disturbance_eligibilities table:

     ==   =====================================  =======================
     id   pool_filter_expression                 state_filter_expression
     ==   =====================================  =======================
     1    (SoftwoodMerch + HardwoodMerch) >= 10  NULL
     2    (SoftwoodMerch + HardwoodMerch) >= 10  (age > 5) & (age < 100)
     3    NULL                                   NULL
     ==   =====================================  =======================

    * The id field in the disturbance_eligibilities corresponds to sit events
    * expressions are parsed by the numexpr library
    * note brackets are required around nested boolean expressions
      joined by a boolean operator (eg &)
    * for both pool_filter_expression, and state_filter_expression,
      the expressions must evaluate to a True or False value.  False
      indicates that the stand records being evaluated for the
      corresponding disturbance event are deemed ineligible for the
      disturbance. True indicates that the expression does not
      eliminate the stand from eligibility.
    * for pool_filter_expression any CBM pool is acceptable.  The pool names
      are defined in the cbm_defaults database tables.
    * for state_filter_expression any of the state values may be used in the
      boolean expression. See:
     :py:func:`libcbm.model.cbm.cbm_variables.initialize_cbm_state_variables`

    The final eligibility is evaluated as follows:

     ====================== ======================= =================
     pool_filter_expression state_filter_expression deemed_ineligible
     ====================== ======================= =================
     NULL or TRUE           NULL or TRUE            FALSE
     NULL or TRUE           FALSE                   TRUE
     FALSE                  NULL or TRUE            TRUE
     FALSE                  FALSE                   TRUE
     ====================== ======================= =================

    Args:
        disturbance_events (pandas.DataFrame): alternate form of CBM-CFS3
            sit_events: the 21 eligibility columns and the using age class
            and min-max columns are omitted. Its
            ``disturbance_eligibility_id`` column is read here.
        disturbance_eligibilities (pandas.DataFrame): table of id (int),
            state_filter expression (str), pool filter expression (str).
            After unpacking, its id column is read as
            ``disturbance_eligibility_id`` and matched against the column of
            the same name in disturbance_events.

    Raises:
        ValueError: disturbance_eligibility_id values found in the specified
            sit_events were not present in the provided
            disturbance_eligibilities table.
        ValueError: at least one null id value was detected in the id column
            of the specified disturbance_eligibilities table.
        ValueError: duplicate id value was detected in the id column of the
            specified disturbance_eligibilities table.

    Returns:
        pandas.DataFrame: the validated event eligibilities table, with
            null values replaced by empty strings
    """
    eligibilities = sit_parser.unpack_table(
        disturbance_eligibilities,
        sit_format.get_disturbance_eligibility_format(),
        "disturbance eligibilities")

    # every eligibility id referenced by the events must exist in the
    # eligibilities table
    event_ids = set(disturbance_events["disturbance_eligibility_id"])
    known_ids = set(eligibilities["disturbance_eligibility_id"])
    missing_ids = event_ids - known_ids
    if missing_ids:
        raise ValueError(
            "disturbance_eligibility_id values found in sit_events "
            f"but not in sit_disturbance_eligibilities {missing_ids}")

    # the id column itself must be fully populated and unique
    id_column = eligibilities.disturbance_eligibility_id
    if id_column.isnull().any():
        raise ValueError(
            "null values detected in eligibilities disturbance_eligibility_id "
            "column")
    if id_column.duplicated().any():
        raise ValueError(
            "duplicated disturbance_eligibility_id values detected in "
            "eligibilities")

    # remaining nulls (e.g. omitted filter expressions) become empty strings
    return eligibilities.fillna("")
Ejemplo n.º 10
0
def parse(disturbance_events, classifiers, classifier_values,
          classifier_aggregates, disturbance_types, age_classes=None,
          separate_eligibilities=False):
    """Unpack and validate the CBM SIT disturbance event format, or
    optionally an extended sit disturbance event format where disturbance
    eligibilities are kept separate from sit_events.

    Args:
        disturbance_events (pandas.DataFrame): CBM SIT disturbance events
            formatted data.
        classifiers (pandas.DataFrame): classifier table (``id`` and ``name``
            are read) used to validate the classifier set columns of the
            disturbance event data. Use the return value
            of: :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_values (pandas.DataFrame): classifier value table
            (``classifier_id`` and ``name`` are read) used to validate the
            classifier set columns. Use the return value
            of: :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_aggregates (iterable): iterated as a sequence of mappings
            from which the ``"classifier_id"`` and ``"name"`` keys are read;
            aggregate names are accepted in the classifier set columns. Use
            the return value of:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        disturbance_types (pandas.DataFrame): its ``id`` column is used to
            validate the disturbance_type column of the disturbance event
            data. Use the return value of:
            :py:func:`libcbm.input.sit.sit_disturbance_types_parser.parse`
        age_classes (pandas.DataFrame, optional): only used when
            ``separate_eligibilities`` is False, where it is handed to
            ``sit_parser.substitute_using_age_class_rows``. Use the return
            value of:
            :py:func:`libcbm.input.sit.sit_age_class_parser.parse`.
        separate_eligibilities (bool, optional): when False (the default)
            the event format includes the eligibility columns and the age
            class substitution / age column clean-up is performed; when True
            the format is requested without eligibility columns and that
            step is skipped.

    Raises:
        ValueError: undefined classifier values were found in the disturbance
            event classifier sets
        ValueError: undefined sort types were found in the disturbance
            event sort_type column. See :py:func:`get_sort_types`
        ValueError: undefined target types were found in the disturbance
            event target_type column. See :py:func:`get_target_types`
        ValueError: undefined disturbance types were found in the disturbance
            event disturbance_type column

    Returns:
        pandas.DataFrame: the validated disturbance events, with sort_type
            and target_type mapped through :py:func:`get_sort_types` and
            :py:func:`get_target_types`
    """
    event_format = sit_format.get_disturbance_event_format(
        classifiers.name, len(disturbance_events.columns),
        include_eligibility_columns=not separate_eligibilities)
    events = sit_parser.unpack_table(
        disturbance_events, event_format, "disturbance events")

    # each value in the event classifier sets must be a defined classifier
    # value, an aggregate or the wildcard
    for classifier in classifiers.itertuples():
        event_values = events[classifier.name].unique()

        is_current = classifier_values["classifier_id"] == classifier.id
        defined_values = classifier_values[is_current]["name"].unique()

        aggregate_names = np.array([
            aggregate["name"] for aggregate in classifier_aggregates
            if aggregate["classifier_id"] == classifier.id])
        wildcard = np.array([sit_classifier_parser.get_wildcard_keyword()])

        unknown_values = np.setdiff1d(
            event_values,
            np.concatenate([defined_values, aggregate_names, wildcard]))
        if unknown_values.size > 0:
            raise ValueError(
                "Undefined classifier values detected: "
                f"classifier: '{classifier.name}', values: {unknown_values}")

    if not separate_eligibilities:
        # resolve rows flagged as using age classes, then keep a single
        # min/max age pair (from the softwood columns) and drop the flag and
        # the hardwood age columns
        parse_bool_func = sit_parser.get_parse_bool_func(
            "events", "using_age_class")
        events = sit_parser.substitute_using_age_class_rows(
            events, parse_bool_func, age_classes)
        softwood_to_generic = {
            "min_softwood_age": "min_age",
            "max_softwood_age": "max_age"}
        obsolete_columns = [
            "using_age_class", "min_hardwood_age", "max_hardwood_age"]
        events = events.rename(columns=softwood_to_generic)
        events = events.drop(columns=obsolete_columns)

    # sort types: compared as integers against the keys of get_sort_types,
    # then replaced by the mapped values
    valid_sort_types = get_sort_types().keys()
    int_sort_type = events.sort_type.astype(int)
    invalid_sort_types = set(int_sort_type.unique()) - set(valid_sort_types)
    if invalid_sort_types:
        raise ValueError(
            f"specified sort types are not valid: {invalid_sort_types}")
    events["sort_type"] = int_sort_type.map(get_sort_types())

    # target types: compared against the keys of get_target_types, then
    # replaced by the mapped values
    valid_target_types = get_target_types().keys()
    invalid_target_types = \
        set(events.target_type.unique()) - set(valid_target_types)
    if invalid_target_types:
        raise ValueError(
            f"specified target types are not valid: {invalid_target_types}")
    events["target_type"] = events.target_type.map(get_target_types())

    # every disturbance type referenced by an event must be a SIT
    # disturbance type id
    undefined_disturbances = np.setdiff1d(
        events.disturbance_type.unique(),
        disturbance_types.id.unique())
    if undefined_disturbances.size > 0:
        raise ValueError(
            "Undefined disturbance type ids (as defined in sit "
            f"disturbance types) detected: {undefined_disturbances}"
        )

    return events
Ejemplo n.º 11
0
def parse(inventory_table, classifiers, classifier_values, disturbance_types,
          age_classes):
    """Unpack and validate SIT formatted inventory data.  The inventory_table
    parameter is the primary data, and the other args act as validation
    metadata.

    Args:
        inventory_table (pandas.DataFrame): SIT formatted inventory
        classifiers (pandas.DataFrame): table of classifiers as returned by
            the function:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        classifier_values (pandas.DataFrame): table of classifier values as
            returned by the function:
            :py:func:`libcbm.input.sit.sit_classifier_parser.parse`
        disturbance_types (pandas.DataFrame): table of disturbance types as
            returned by the function:
            :py:func:`libcbm.input.sit.sit_disturbance_type_parser.parse`
        age_classes (pandas.DataFrame): table of age classes as
            returned by the function:
            :py:func:`libcbm.input.sit.sit_age_class_parser.parse`

    Raises:
        ValueError: Undefined classifier values detected in inventory table
        ValueError: Undefined disturbance types detected in inventory table
        ValueError: a spatial_reference value greater than 0 appears more
            than once

    Example:

        Input:

            SIT_Inventory:

                ===  ===  ======  =======  ===  ===  ===  =====  =====  ===
                0    1    2       3        4    5    6    7       8      9
                ===  ===  ======  =======  ===  ===  ===  =====  =====  ===
                b    a    True    age_2    1    1    1    dist1  dist2  -1
                a    a    False   100      1    0    0    dist2  dist1   0
                a    a    -1      4        1    0    0    dist1  dist1  -1
                ===  ===  ======  =======  ===  ===  ===  =====  =====  ===

            classifiers parameter:

                ===  ===========
                id   name
                ===  ===========
                1    classifier1
                2    classifier2
                ===  ===========

            classifier_values parameter:

                ==============  =====  ============
                classifier_id   name   description
                ==============  =====  ============
                 1               a      a
                 1               b      b
                 2               a      a
                ==============  =====  ============

            disturbance_types parameter:

                ======  =========
                id         name
                ======  =========
                dist1    fire
                dist2    clearcut
                dist3    clearcut
                ======  =========

            age_classes parameter:

                ======  ===========  ===========  =========
                name    class_size   start_year   end_year
                ======  ===========  ===========  =========
                age_0    0              0           0
                age_1    10             1           10
                age_2    10             11          20
                age_3    10             21          30
                age_4    10             31          40
                age_5    10             41          50
                age_6    10             51          60
                age_7    10             61          70
                age_8    10             71          80
                age_9    10             81          90
                ======  ===========  ===========  =========

        Output: (abbreviated column names)

            ==  ===    =====  ====  =====  =====  ==========  =========  =====
            c1  c2     age    area  delay   lc    hist_dist   last_dist  s_ref
            ==  ===    =====  ====  =====  =====  ==========  =========  =====
            a    a      100   1.0    0      lc_1    fire       fire        0
            a    a      4     1.0    0      lc_1    clearcut   clearcut   -1
            b    a      11    0.1    1      lc_2    fire       fire       -1
            b    a      12    0.1    1      lc_2    fire       fire       -1
            b    a      13    0.1    1      lc_2    fire       fire       -1
            b    a      14    0.1    1      lc_2    fire       fire       -1
            b    a      15    0.1    1      lc_2    fire       fire       -1
            b    a      16    0.1    1      lc_2    fire       fire       -1
            b    a      17    0.1    1      lc_2    fire       fire       -1
            b    a      18    0.1    1      lc_2    fire       fire       -1
            b    a      19    0.1    1      lc_2    fire       fire       -1
            b    a      20    0.1    1      lc_2    fire       fire       -1
            ==  ===    =====  ====  =====  =====  ==========  =========  =====

            The actual output column names for this example are:

                - classifier1
                - classifier2
                - age
                - area
                - delay
                - land_class
                - historical_disturbance_type
                - last_pass_disturbance_type
                - spatial_reference

        NOTE(review): the land class names (``lc_1``/``lc_2``) and the
        disturbance type names shown in the output are carried over from
        earlier documentation, which also listed a ``land_classes``
        parameter that this function does not take.  This function only
        validates disturbance type ids; it does not replace them or map
        land classes itself - confirm against the current behaviour.

    Returns:
        pandas.DataFrame: validated inventory, without the
            ``using_age_class`` column and with a fresh 0-based index
    """
    inventory_format = sit_format.get_inventory_format(
        classifiers.name, len(inventory_table.columns))
    inventory = sit_parser.unpack_table(
        inventory_table, inventory_format, "inventory")

    # each classifier column may only hold values defined for that
    # classifier (no wildcard or aggregates here)
    for classifier in classifiers.itertuples():
        inventory_values = inventory[classifier.name].unique()
        is_current = classifier_values["classifier_id"] == classifier.id
        defined_values = classifier_values[is_current]["name"].unique()
        unknown_values = np.setdiff1d(inventory_values, defined_values)
        if unknown_values.size > 0:
            raise ValueError("Undefined classifier values detected: "
                             f"classifier: '{classifier.name}', "
                             f"values: {unknown_values}")

    # when the historical/last pass disturbance columns are present, their
    # values must all be SIT disturbance type ids
    if "historical_disturbance_type" in inventory:
        undefined_historic = np.setdiff1d(
            inventory.historical_disturbance_type.unique(),
            disturbance_types.id.unique())
        undefined_lastpass = np.setdiff1d(
            inventory.last_pass_disturbance_type.unique(),
            disturbance_types.id.unique())
        # historic ids are reported first, then last pass ids
        for undefined_ids in (undefined_historic, undefined_lastpass):
            if undefined_ids.size > 0:
                raise ValueError(
                    "Undefined disturbance type ids (as defined in sit "
                    f"disturbance types) detected: {undefined_ids}")

    to_bool = sit_parser.get_parse_bool_func("inventory", "using_age_class")
    inventory.using_age_class = inventory.using_age_class.map(to_bool)

    # rows that do not use an age class get their age checked against an
    # integer type with a minimum of 0; the result of unpack_column is not
    # kept, the call is made for its validation only
    strict_age_format = [
        column for column in inventory_format
        if column["name"] == "age"][0].copy()
    strict_age_format.update({"type": int, "min_value": 0})
    explicit_age_rows = inventory.loc[~inventory.using_age_class]
    sit_parser.unpack_column(explicit_age_rows, strict_age_format, "inventory")

    if inventory.using_age_class.any():
        inventory = expand_age_class_inventory(inventory, age_classes)

    inventory = inventory.drop(columns=["using_age_class"])
    inventory = inventory.reset_index(drop=True)

    # only spatial references greater than 0 have to be unique
    if "spatial_reference" in inventory:
        positive_references = inventory.spatial_reference[
            inventory.spatial_reference > 0]
        if positive_references.duplicated().any():
            raise ValueError(
                "duplicate value detected in spatial_reference column")
    return inventory
Ejemplo n.º 12
0
def parse(classifiers_table):
    """Parse and validate SIT_Classifiers formatted data.

    Each row of the unpacked table is interpreted as one of:

        - a classifier definition: the name column holds the classifier
          keyword and the description column holds the classifier name
        - a classifier value: no value in the 4th or greater columns
        - a classifier aggregate: at least one value in the 4th or greater
          columns, each of which names a member classifier value

    Args:
        classifiers_table (pandas.DataFrame): a dataFrame in sit classifiers
            format.

    Raises:
        ValueError: the number of classifier keyword rows does not match the
            number of unique ids, a classifier name is duplicated, a
            classifier value is duplicated within a classifier, an aggregate
            lists the same value more than once, an aggregate name is
            duplicated within a classifier, or an aggregate references a
            value that is not defined for its classifier.

    Example Input:

        ==   ===========  ===========  ===  ===
        0     1           2            3    4
        ==   ===========  ===========  ===  ===
        1    _CLASSIFIER  classifier1  NaN  NaN
        1    a            a            NaN  NaN
        1    b            b            NaN  NaN
        1    agg1         agg1         a    b
        1    agg2         agg2         a    b
        2    _CLASSIFIER  classifier2  NaN  NaN
        2    a            a            NaN  NaN
        2    agg1         agg1         a    NaN
        ==   ===========  ===========  ===  ===

    Output based on Example input:

        Classifiers:

            ===  ===========
            id   name
            ===  ===========
            1    classifier1
            2    classifier2
            ===  ===========

        Classifier Values:

            ==============  =====  ============
            classifier_id   name   description
            ==============  =====  ============
             1               a      a
             1               b      b
             2               a      a
            ==============  =====  ============

        Classifier Aggregates::

            [{'classifier_id': 1,
              'name': 'agg1',
              'description': 'agg1',
              'classifier_values': ['a', 'b']},
             {'classifier_id': 1,
              'name': 'agg2',
              'description': 'agg2',
              'classifier_values': ['a', 'b']},
             {'classifier_id': 2,
              'name': 'agg1',
              'description': 'agg1',
              'classifier_values': ['a']}]

    Returns:
        tuple:

            - classifiers - a validated table of classifiers
            - classifier_values - a validated table of classifier values
            - aggregate_values - a list of dictionaries, one per aggregate,
              describing the aggregate and its member classifier values

    """
    classifiers_format = sit_format.get_classifier_format(
        len(classifiers_table.columns))
    unpacked = sit_parser.unpack_table(classifiers_table, classifiers_format,
                                       "classifiers")

    classifier_keyword = get_classifier_keyword()
    # columns at position 3 and beyond hold the members of an aggregate, so a
    # row with nothing in any of them is not an aggregate definition
    has_no_members = pd.isnull(unpacked.iloc[:, 3:]).all(axis=1)

    classifier_rows = unpacked.loc[unpacked["name"] == classifier_keyword]
    classifiers = pd.DataFrame(
        data={
            "id": classifier_rows["id"],
            # for classifiers, the 3rd column is used for the name
            "name": classifier_rows["description"]
        },
        columns=["id", "name"])

    if classifiers.shape[0] != len(unpacked.id.unique()):
        # this can occur if the data isnt formatted correctly
        raise ValueError(
            "number of unique id values must match number of occurrences of "
            f"'{classifier_keyword}'")
    # since the order of classifier ids defines the order of classifier
    # value columns in the other SIT tables, sorting is important
    classifiers = classifiers.sort_values(by="id")

    classifier_name_counts = classifiers.groupby("name").size()
    duplicated_classifier_names = list(
        classifier_name_counts[classifier_name_counts > 1].index)
    if duplicated_classifier_names:
        raise ValueError(
            "The following classifier names appear more than one time:"
            f"{duplicated_classifier_names}")

    # filter out rows that have the classifier keyword and also any rows
    # that have a value on the 4th or greater column.
    # This is the set of classifier values.
    value_rows = unpacked.loc[
        has_no_members & (unpacked["name"] != classifier_keyword)]
    classifier_values = pd.DataFrame({
        "classifier_id": value_rows["id"],
        "name": value_rows["name"],
        "description": value_rows["description"]
    })

    classifier_value_counts = classifier_values.groupby(
        ["classifier_id", "name"]).size()
    duplicate_classifier_values = [
        {"classifier_id": classifier_id, "classifier_value": value_name}
        for classifier_id, value_name
        in classifier_value_counts[classifier_value_counts > 1].index]
    if duplicate_classifier_values:
        raise ValueError(
            "The following classifier values are duplicated for the specified "
            f"classifier ids: {duplicate_classifier_values}")

    aggregate_values = []
    seen_aggregates = set()
    # classifier id -> every classifier value referenced by its aggregates
    members_by_classifier = {}
    for _, row in unpacked.loc[~has_no_members].iterrows():
        classifier_id = row["id"]
        name = row["name"]
        agg_values = list(row.iloc[3:].dropna())
        aggregate_values.append({
            "classifier_id": classifier_id,
            "name": name,
            "description": row["description"],
            "classifier_values": agg_values
        })

        if len(agg_values) > len(set(agg_values)):
            raise ValueError(
                "duplicate classifier values detected in aggregate with "
                f"classifier_id: {classifier_id}, name {name}")
        members_by_classifier.setdefault(
            classifier_id, set()).update(agg_values)
        if (classifier_id, name) in seen_aggregates:
            raise ValueError("duplicate classifier aggregate detected: "
                             f"classifier_id: {classifier_id}, name {name}")
        seen_aggregates.add((classifier_id, name))

    # Check every classifier id in the table, not only those that have at
    # least one classifier value: otherwise an aggregate belonging to a
    # classifier with no defined values would never be validated.
    for classifier_id in unpacked.id.unique():
        referenced_values = members_by_classifier.get(classifier_id)
        if not referenced_values:
            continue
        classifier_id_values_set = set(classifier_values.loc[
            classifier_values["classifier_id"] == classifier_id, "name"])
        missing_aggregate_values = referenced_values.difference(
            classifier_id_values_set)
        if missing_aggregate_values:
            raise ValueError(
                "The following aggregate values that are not defined as "
                f"classifier values in the classifier with id {classifier_id} "
                f"were found: {missing_aggregate_values}.")

    return classifiers, classifier_values, aggregate_values