    def _scale_observations_relative_to_interface(self, processor: Processor,
                                                  interface_name: str,
                                                  scale: Union[int, float]):
        for factor in processor.factors:
            for observation in factor.quantitative_observations:
                relative_to_interface = observation.attributes.get(
                    "relative_to", None)
                if relative_to_interface and strcmp(relative_to_interface.name,
                                                    interface_name):
                    observation.value = float(observation.value) * scale
                    observation.attributes["relative_to"] = None
    def _constrains_interface(self, scale: str, invoking_interface_name: str,
                              requested_interface_name: str,
                              parent_processor: Processor,
                              child_processor: Processor):
        for f in parent_processor.factors:
            if strcmp(f.name, invoking_interface_name):
                origin_factor = f
                break
        else:
            raise Exception("Invoking interface name '" +
                            invoking_interface_name +
                            "' not found for processor '" +
                            parent_processor.name + "'")

        for f in child_processor.factors:
            if strcmp(f.name, requested_interface_name):
                destination_factor = f
                break
        else:
            raise Exception("Requested interface name '" +
                            invoking_interface_name +
                            "' not found for processor '" +
                            parent_processor.name + "'")

        relationship = FactorsRelationScaleObservation.create_and_append(
            origin=origin_factor,
            destination=destination_factor,
            observer=None,
            quantity=scale)

        # relationship = ProcessorsRelationUpscaleObservation.create_and_append(parent=parent_processor,
        #                                                                       child=child_processor,
        #                                                                       observer=None,
        #                                                                       factor_name=interface_name,
        #                                                                       quantity=scale)

        self._glb_idx.put(relationship.key(), relationship)
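
# Both lookups above use Python's for/else idiom: the "else" clause runs only when the
# loop ends without a "break". A minimal, standalone sketch of the same pattern (the
# helper name below is illustrative, not part of this codebase):
def _find_by_name(items, name):
    for item in items:
        if item.name.lower() == name.lower():
            found = item
            break
    else:  # Only reached if no item matched (no "break" executed)
        raise Exception(f"Name '{name}' not found")
    return found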
# Example #3
    def execute(self, state: "State"):
        issues = []

        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
        name = self._content["command_name"]

        # List of available dataset names. The datasets whose values are defined here must already be in this list
        ds_names = [ds.code for ds in datasets.values()]

        # Process parsed information
        for r, line in enumerate(self._content["items"]):
            # A dataset
            dataset_name = line["name"]
            # Find it in the already available datasets. MUST EXIST
            for n in ds_names:
                if strcmp(dataset_name, n):
                    df = pd.read_json(StringIO(line["values"]), orient="split")
                    # Check columns
                    ds = datasets[n]
                    iss = prepare_dataframe_after_external_read(ds, df)
                    for issue in iss:
                        issues.append(
                            Issue(itype=3,
                                  description=issue,
                                  location=IssueLocation(sheet_name=name, row=-1, column=-1)))
                    # Everything ok? Store the dataframe!
                    if len(iss) == 0:
                        ds.data = df
                    break
            else:
                issues.append(
                    Issue(itype=3,
                          description="Metadata for the dataset '"+dataset_name+"' must be defined previously",
                          location=IssueLocation(sheet_name=name, row=-1, column=-1)))

        return issues, None
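
# The loop above deserializes each dataset's values with pandas' "split" JSON orientation
# (pd.read_json(StringIO(...), orient="split")). A standalone round-trip sketch of that
# technique, independent of the registry classes used above:
import pandas as pd
from io import StringIO

def _split_json_roundtrip():
    df = pd.DataFrame({"year": [2015, 2016], "value": [10.5, 11.2]})
    serialized = df.to_json(orient="split")            # what line["values"] could contain
    df2 = pd.read_json(StringIO(serialized), orient="split")
    return df2                                          # same columns and values as df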
# Example #4
        def process_row(item):
            """
            Process a dictionary representing a row of the data input command. The dictionary can come directly from
            the worksheet or from a dataset.

            Implicitly uses "glb_idx"

            :param item: dictionary with the row fields
            """
            # Gather variables in one dictionary
            fields_value = {
                k: item.get(k, v.default_value)
                for k, v in fields.items()
            }

            # Check if mandatory fields with no value exist
            for field in [
                    k for k, v in fields.items()
                    if v.mandatory and not fields_value[k]
            ]:
                add_issue(IType.error(),
                          f"Mandatory field '{field}' is empty. Skipped.")
                return

            # Interface
            f_alias = fields_value.get("alias")
            f_processor_name = fields_value.get("processor")
            f_interface_type_name = fields_value.get("interface_type")
            f_interface_name = fields_value.get(
                "interface")  # A "simple_ident", optional
            f_location = fields_value.get("location")
            f_orientation = fields_value.get("orientation")
            # f_roegen_type = fields_value.get("roegen_type")
            # f_sphere = fields_value.get("sphere")
            # f_opposite_processor_type = fields_value.get("opposite_processor_type")
            # f_geolocation_ref = fields_value.get("geolocation_ref")
            # f_geolocation_code = fields_value.get("geolocation_code")

            # Qualified Quantity
            f_value = fields_value.get("value")
            f_unit = fields_value.get("unit")
            f_uncertainty = fields_value.get("uncertainty")
            f_assessment = fields_value.get("assessment")
            f_pedigree_matrix = fields_value.get("pedigree_matrix")
            f_pedigree = fields_value.get("pedigree")
            f_relative_to = fields_value.get("relative_to")
            f_time = fields_value.get("time")
            f_source = fields_value.get("qq_source")
            f_number_attributes = fields_value.get("number_attributes", {})
            f_comments = fields_value.get("comments")

            # Transform text of "interface_attributes" into a dictionary
            field_val = fields_value.get("interface_attributes")
            if field_val:
                try:
                    fields_value[
                        "interface_attributes"] = dictionary_from_key_value_list(
                            field_val, glb_idx)
                except Exception as e:
                    add_issue(IType.error(), str(e))
                    return
            else:
                fields_value["interface_attributes"] = {}

            # Transform text of "number_attributes" into a dictionary
            if f_number_attributes:
                try:
                    number_attributes = dictionary_from_key_value_list(
                        f_number_attributes, glb_idx)
                except Exception as e:
                    add_issue(IType.error(), str(e))
                    return
            else:
                number_attributes = {}

            # f_processor_name -> p
            # f_interface_type_name -> it
            # f_interface_name -> i
            #
            # IF NOT i AND it AND p => i_name = it.name => get or create "i"
            # IF i AND it AND p => get or create "i", IF "i" exists, i.it MUST BE equal to "it" (IF NOT, error)
            # IF i AND p AND NOT it => get "i" (MUST EXIST)
            if not f_interface_name:
                if not f_interface_type_name:
                    add_issue(
                        IType.error(),
                        "At least one of InterfaceType or Interface must be defined"
                    )
                    return

                possibly_local_interface_name = None
                f_interface_name = f_interface_type_name
            else:
                possibly_local_interface_name = f_interface_name

            # Check existence of PedigreeMatrix, if used
            if f_pedigree_matrix and f_pedigree:
                pm = glb_idx.get(
                    PedigreeMatrix.partial_key(name=f_pedigree_matrix))
                if len(pm) == 0:
                    add_issue(
                        IType.error(), "Could not find Pedigree Matrix '" +
                        f_pedigree_matrix + "'")
                    return
                else:
                    try:
                        lst = pm[0].get_modes_for_code(f_pedigree)
                    except:
                        add_issue(
                            IType.error(), "Could not decode Pedigree '" +
                            f_pedigree + "' for Pedigree Matrix '" +
                            f_pedigree_matrix + "'")
                        return
            elif f_pedigree and not f_pedigree_matrix:
                add_issue(
                    IType.error(),
                    "Pedigree specified without accompanying Pedigree Matrix")
                return

            # Source
            if f_source:
                try:
                    ast = parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, f_source)
                    ref_id = ast["ref_id"]
                    references = glb_idx.get(
                        ProvenanceReference.partial_key(ref_id))
                    if len(references) == 1:
                        source = references[0]
                    else:
                        references = glb_idx.get(
                            BibliographicReference.partial_key(ref_id))
                        if len(references) == 1:
                            source = references[0]
                        else:
                            add_issue(IType.error(),
                                      f"Reference '{f_source}' not found")
                except:
                    # TODO Change when Ref* are implemented
                    source = f_source + " (not found)"
            else:
                source = None

            # Geolocation
            if f_location:
                try:
                    # TODO Change to parser for Location (includes references, but also Codes)
                    ast = parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, f_location)
                    ref_id = ast["ref_id"]
                    references = glb_idx.get(
                        GeographicReference.partial_key(ref_id))
                    if len(references) == 1:
                        geolocation = references[0]
                    else:
                        geolocation = f_location
                except:
                    geolocation = f_location
            else:
                geolocation = None

            # Find Processor
            # TODO Allow creating a basic Processor if it is not found?
            p = glb_idx.get(Processor.partial_key(f_processor_name))
            if len(p) == 0:
                add_issue(
                    IType.error(), "Processor '" + f_processor_name +
                    "' not declared previously")
                return
            elif len(p) > 1:
                add_issue(
                    IType.error(),
                    "Processor '" + f_processor_name + "' found " +
                    str(len(p)) + " times. It must be uniquely identified.")
                return
            else:
                p = p[0]

            # Try to find Interface
            ft: FactorType = None
            f = glb_idx.get(
                Factor.partial_key(processor=p, name=f_interface_name))
            if len(f) == 1:
                f = f[0]
                ft: FactorType = f.taxon
                if f_interface_type_name:
                    if not strcmp(ft.name, f_interface_type_name):
                        add_issue(
                            IType.warning(),
                            f"The InterfaceType of the Interface, {ft.name} "
                            f"is different from the specified InterfaceType, {f_interface_type_name}. Record skipped."
                        )
                        return
            elif len(f) > 1:
                add_issue(
                    IType.error(),
                    f"Interface '{f_interface_name}' found {str(len(f))} times. "
                    f"It must be uniquely identified.")
                return
            elif len(f) == 0:
                f: Factor = None  # Does not exist, create it below
                if not f_orientation:
                    add_issue(
                        IType.error(),
                        f"Orientation must be defined for new Interfaces")
                    return

            # InterfaceType still not found
            if not ft:
                # Find FactorType
                # TODO Allow creating a basic FactorType if it is not found
                ft = glb_idx.get(FactorType.partial_key(f_interface_type_name))
                if len(ft) == 0:
                    add_issue(
                        IType.error(),
                        f"InterfaceType '{f_interface_type_name}' not declared previously"
                    )
                    return
                elif len(ft) > 1:
                    add_issue(
                        IType.error(),
                        f"InterfaceType '{f_interface_type_name}' found {str(len(ft))} times. "
                        f"It must be uniquely identified.")
                    return
                else:
                    ft = ft[0]

            if not f:
                # Get attributes default values taken from Interface Type or Processor attributes
                default_values = {
                    # "orientation": ft.orientation,
                    "sphere": ft.sphere,
                    "roegen_type": ft.roegen_type,
                    "opposite_processor_type": p.subsystem_type
                }

                # Get internal and user-defined attributes in one dictionary
                attributes = {
                    k: ifnull(fields_value[k], default_values.get(k, None))
                    for k, v in fields.items() if v.attribute_of == Factor
                }
                attributes.update(fields_value["interface_attributes"])

                f = Factor.create_and_append(
                    f_interface_name,
                    p,
                    in_processor_type=FactorInProcessorType(external=False,
                                                            incoming=False),
                    taxon=ft,
                    geolocation=f_location,
                    tags=None,
                    attributes=attributes)
                glb_idx.put(f.key(), f)

            # Find Observer
            oer = glb_idx.get(Observer.partial_key(f_source))
            if not oer:
                add_issue(IType.warning(),
                          f"Observer '{f_source}' has not been found.")
            else:
                oer = oer[0]

            if f_relative_to:
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.factor_unit, f_relative_to)
                relative_to_interface_name = ast_to_string(ast["factor"])

                rel_unit_name = ast["unparsed_unit"]
                try:
                    f_unit = str((ureg(f_unit) / ureg(rel_unit_name)).units)
                except (UndefinedUnitError, AttributeError) as ex:
                    add_issue(
                        IType.error(),
                        f"The final unit could not be computed, interface '{f_unit}' / "
                        f"relative_to '{rel_unit_name}': {str(ex)}")
                    return

                f_relative_to = first(
                    f.processor.factors,
                    lambda ifc: strcmp(ifc.name, relative_to_interface_name))

                if not f_relative_to:
                    add_issue(
                        IType.error(),
                        f"Interface specified in 'relative_to' column "
                        f"'{relative_to_interface_name}' has not been found.")
                    return

            # Create quantitative observation
            if f_value:
                # If an observation exists then "time" is mandatory
                if not f_time:
                    add_issue(
                        IType.error(),
                        f"Field 'time' needs to be specified for the given observation."
                    )
                    return

                o = _create_or_append_quantitative_observation(
                    f, f_value, f_unit, f_uncertainty, f_assessment,
                    f_pedigree, f_pedigree_matrix, oer, f_relative_to, f_time,
                    None, f_comments, None, number_attributes)
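
# The "relative_to" handling above computes a derived unit by dividing the interface unit
# by the unit of the relative_to expression with pint ("ureg"). A standalone sketch of that
# unit arithmetic, assuming ureg is a pint UnitRegistry as the code suggests:
import pint

def _relative_unit(interface_unit: str, relative_unit: str) -> str:
    ureg = pint.UnitRegistry()
    # e.g. interface_unit="kg", relative_unit="ha"  ->  "kilogram / hectare"
    return str((ureg(interface_unit) / ureg(relative_unit)).units)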
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name,
                              state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:    Input worksheet
    :param area:  Area of the input worksheet to be analysed
    :param name:  Name of the sheet, used to label the issues
    :param state: Current State, from which the case study registry objects are obtained
    :return:      The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    # Obtain the source
    source = obtain_dataset_source(dataset_name)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={
                k: None
                for k in dims[d].code_list.keys()
            })  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(
                data={
                    to["d"]: None
                    for o in mappings[m].map for to in o["to"] if to["d"]
                })
            cl[mappings[m].destination] = tmp  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.

    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian

    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    filter_ = {}
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions",
                                        "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The dimension specified for output, '"
                            + d +
                            "' is neither a dataset dimension nor a mapped dimension. ["
                            + ', '.join([d2 for d2 in cl]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures",
                                          "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The specified measure, '" + m +
                            "' is not a measure available in the dataset. [" +
                            ', '.join([m2 for m2 in meas]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in [
                "resultmeasuresaggregation", "resultmeasuresaggregator",
                "aggregation"
        ]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue

                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The specified aggregation function, '"
                            + f +
                            "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in [
                "resultmeasurename", "resultmeasuresnames", "resultmeasuresas",
                "measuresas"
        ]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The code '" + cd +
                            "' is not present in the codes declared for dimension '"
                            + col_name + "'. Please, check them.",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "endperiod"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in [
                "outputdatasetname", "outputdataset", "result_name",
                "result name", "resultname"
        ]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append(
                        Issue(itype=3,
                              description="Column '" + col_name +
                              "' has an invalid dataset name '" + result_name +
                              "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=2,
                                                     column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=2,
                  description=
                  "No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(
                    itype=3,
                    description=
                    "The aggregation function must be defined in the first row",
                    location=IssueLocation(sheet_name=name,
                                           row=1,
                                           column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if (measure and not agg_func) or (not measure and agg_func):
            issues.append(
                Issue(
                    itype=3,
                    description=
                    "Each measure must be associated with an aggregation function",
                    location=IssueLocation(sheet_name=name,
                                           row=r,
                                           column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [
        v["measure_as"] for v in out_measures.values() if v["measure_as"]
    ]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=3,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(
                itype=3,
                description=
                "There must be one aggregation function (used for all measures) or one aggregation per measure",
                location=IssueLocation(sheet_name=name,
                                       row=1,
                                       column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            Issue(itype=2,
                  description="No result name specified. Assuming '" +
                  result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2,
                                         column=c + 1)))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": available_at_datetime,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
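
# For reference, the "content" dictionary returned above has roughly this shape
# (every value below is purely illustrative, not taken from a real dataset):
#
# {
#     "dataset_source": "SomeSource",
#     "dataset_name": "some_dataset",
#     "dataset_datetime": None,
#     "where": {"GEO": ["ES", "PT"], "StartPeriod": "2010"},   # dimension -> codes (scalar for periods)
#     "dimensions": ["GEO", "TIME_PERIOD"],
#     "group_by": ["GEO"],
#     "measures": ["OBS_VALUE"],
#     "agg_funcs": ["sum"],
#     "measures_as": ["OBS_VALUE_sum"],
#     "result_name": "SomeSource_some_dataset"
# }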
    def _process_row(self, row: Dict[str, Any]):
        self._current_row_number = row["_row"]
        self._fields_values = self._get_command_fields_values(row)

        self._check_all_mandatory_fields_have_values()

        scaling_type = self._fields_values["scaling_type"]
        scale: str = self._fields_values["scale"]

        # Find processors
        invoking_processor = self._get_processor_from_field(
            "invoking_processor")
        requested_processor = self._get_processor_from_field(
            "requested_processor")

        invoking_interface_name: str = self._fields_values[
            "invoking_interface"]
        requested_interface_name: str = self._fields_values[
            "requested_interface"]

        requested_new_processor_name: str = self._fields_values[
            "new_processor_name"]

        print(
            f"Invoking: {invoking_processor.name}:{invoking_interface_name}, Requested: {requested_processor.name}:{requested_interface_name}"
        )

        if strcmp(scaling_type, "CloneAndScale"):
            # TODO: check “RequestedProcessor” must be an archetype
            # 1. Clones “RequestedProcessor” as a child of “InvokingProcessor”
            requested_processor_clone = self._clone_processor_as_child(
                processor=requested_processor,
                parent_processor=invoking_processor,
                name=requested_new_processor_name)

            # 2. Constrains the value of “RequestedInterface” to the value of “InvokingInterface”, scaled by “Scale”
            self._constrains_interface(
                scale=scale,
                invoking_interface_name=invoking_interface_name,
                requested_interface_name=requested_interface_name,
                parent_processor=invoking_processor,
                child_processor=requested_processor_clone)

        elif strcmp(scaling_type, "Scale"):
            # Processors must be of same type (archetype or instance)
            if not strcmp(invoking_processor.instance_or_archetype,
                          requested_processor.instance_or_archetype):
                raise CommandExecutionError(
                    "Requested and invoking processors should be of the same type "
                    "(both instance or both archetype)")

            # 1. Constrains the value of “RequestedInterface” to the value of “InvokingInterface”, scaled by “Scale”
            self._constrains_interface(
                scale=scale,
                invoking_interface_name=invoking_interface_name,
                requested_interface_name=requested_interface_name,
                parent_processor=invoking_processor,
                child_processor=requested_processor)

        elif strcmp(scaling_type, "CloneScaled"):
            # “RequestedProcessor” must be an archetype
            # if not strcmp(requested_processor.instance_or_archetype, "archetype"):
            #     raise CommandExecutionError(f"Requested processor '{requested_processor.name}' should be of type 'archetype'")

            # “InvokingProcessor” must be an instance
            # if not strcmp(invoking_processor.instance_or_archetype, "instance"):
            #     raise CommandExecutionError(f"Invoking processor '{invoking_processor.name}' should be of type 'instance'")

            # 1. Clones “RequestedProcessor” as a child of “InvokingProcessor”
            # 2. Scales the new processor using “Scale” as the value of “RequestedInterface”
            requested_processor_clone = self._clone_processor_as_child(
                processor=requested_processor,
                parent_processor=invoking_processor)

            # Value Scale, which can be an expression, should be evaluated (ast) because we need a final float number
            scale_value = self._get_scale_value(scale)

            # In the cloned processor search in all interfaces if there are Observations relative_to RequestedInterface
            # and multiply the observation by the computed scale.
            self._scale_observations_relative_to_interface(
                processor=requested_processor_clone,
                interface_name=requested_interface_name,
                scale=scale_value)
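
# Note on "strcmp", used throughout these snippets: from its usage it appears to be a
# case-insensitive equality helper returning True on a match. That is an assumption based
# only on how it is called here; a hypothetical stand-in with that behaviour could be:
def strcmp_sketch(s1: str, s2: str) -> bool:
    # Hypothetical: equality ignoring case, tolerating None/empty values
    return (s1 or "").lower() == (s2 or "").lower()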
# Example #7
    def execute(self, state: "State"):
        """
        First bring the data considering the filter
        Second, group, third aggregate
        Finally, store the result in State
        """
        issues = []
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        # DS Source + DS Name
        source = self._content["dataset_source"]
        dataset_name = self._content["dataset_name"]
        dataset_datetime = self._content["dataset_datetime"]

        # Result name
        result_name = self._content["result_name"]
        if result_name in datasets or state.get(result_name):
            issues.append((2, "A dataset called '" + result_name +
                           "' is already stored in the registry of datasets"))

        # Dataset metadata
        dims, attrs, measures = obtain_dataset_metadata(dataset_name, source)

        # Obtain filter parameters
        # Native dimension name to list of values the filter will allow to pass
        params = create_dictionary()
        joined_dimensions = []
        for dim in self._content["where"]:
            lst = self._content["where"][dim]
            native_dim = None
            if dim.lower() in ["startperiod", "endperiod"]:
                native_dim = dim
                lst = [lst]
            elif dim not in dims:
                # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
                for m in mappings:
                    if strcmp(mappings[m].destination, dim) and \
                            strcmp(mappings[m].source, source) and \
                            strcmp(mappings[m].dataset, dataset_name) and \
                            mappings[m].origin in dims:
                        joined_dimensions.append(
                            mappings[m].destination
                        )  # Store dimension in the original case
                        native_dim = mappings[m].origin
                        lst = obtain_reverse_codes(mappings[m].map, lst)
                        break
            else:
                # Get the dimension name with the original case
                native_dim = dims[dim].name
            if native_dim:
                if native_dim not in params:
                    f = set()
                    params[native_dim] = f
                else:
                    f = params[native_dim]
                f.update(lst)

        # Convert param contents from set to list
        for p in params:
            params[p] = [i for i in params[p]]

        # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        ds = backend.data_source_manager.get_dataset_filtered(
            source, dataset_name, params)
        df = ds.data

        # Join with mapped dimensions (augment it)
        mapping_dict = create_dictionary()
        for m in mappings:
            if strcmp(mappings[m].source, source) and \
                    strcmp(mappings[m].dataset, dataset_name) and \
                    mappings[m].origin in dims:
                # mapping_tuples.append((mappings[m].origin, mappings[m].destination, mappings[m].map))
                mapping_dict[mappings[m].origin] = (mappings[m].destination, {
                    d["o"]: d["to"]
                    for d in mappings[m].map
                })

        df = augment_dataframe_with_mapped_columns(df, mapping_dict, ["value"])

        # Aggregate (If any dimension has been specified)
        if len(self._content["group_by"]) > 0:
            # Column names where data is
            # HACK: for the case where the measure has been named "obs_value", use "value"
            values = [
                m.lower() if m.lower() != "obs_value" else "value"
                for m in self._content["measures"]
            ]
            # TODO: use metadata name (e.g. "OBS_VALUE") instead of hardcoded "value"
            # values = self._content["measures"]
            out_names = self._content["measures_as"]
            group_by_dims = translate_case(self._content["group_by"],
                                           params)  # Group by dimension names
            lcase_group_by_dims = [d.lower() for d in group_by_dims]
            # Now joined_dimensions
            for d in joined_dimensions:
                if d.lower() in lcase_group_by_dims:
                    # Find and replace
                    for i, d2 in enumerate(group_by_dims):
                        if strcmp(d, d2):
                            group_by_dims[i] = d
                            break

            agg_funcs = []  # Aggregation functions
            agg_names = {}
            for f in self._content["agg_funcs"]:
                if f.lower() in ["avg", "average"]:
                    agg_funcs.append(np.average)
                    agg_names[np.average] = "avg"
                elif f.lower() in ["sum"]:
                    agg_funcs.append(np.sum)
                    agg_names[np.sum] = "sum"
                elif f.lower() in ["count"]:
                    agg_funcs.append(np.size)
                    agg_names[np.size] = "count"
                elif f.lower() in ["sumna"]:
                    agg_funcs.append(np.nansum)
                    agg_names[np.nansum] = "sumna"
                elif f.lower() in ["countav"]:
                    agg_funcs.append("count")
                    agg_names["count"] = "countav"
                elif f.lower() in ["avgna"]:
                    agg_funcs.append(np.nanmean)
                    agg_names[np.nanmean] = "avgna"
                elif f.lower() in ["pctna"]:
                    agg_funcs.append(pctna)
                    agg_names[pctna] = "pctna"

            # Calculate Pivot Table. The columns are a combination of values x aggregation functions
            # For instance, if two values ["v1", "v2"] and two agg. functions ["avg", "sum"] are provided
            # the columns will be: [["average", "v1"], ["average", "v2"], ["sum", "v1"], ["sum", "v2"]]
            try:
                # Check that all "group_by_dims" on which pivot table aggregates are present in the input "df"
                # If not either synthesize them (only if there is a single filter value) or remove (if not present
                for r in group_by_dims.copy():
                    df_columns_dict = create_dictionary(
                        data={c: None
                              for c in df.columns})
                    if r not in df_columns_dict:
                        found = False
                        for k in params:
                            if strcmp(k, r):
                                found = True
                                if len(params[k]) == 1:
                                    df[k] = params[k][0]
                                else:
                                    group_by_dims.remove(r)
                                    issues.append((
                                        2, "Dimension '" + r +
                                        "' removed from the list of dimensions because it is not present in the raw input dataset."
                                    ))
                                break
                        if not found:
                            group_by_dims.remove(r)
                            issues.append((
                                2, "Dimension '" + r +
                                "' removed from the list of dimensions because it is not present in the raw input dataset."
                            ))

                # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
                ds_columns_dict = create_dictionary(
                    data={c.code: c.code
                          for c in ds.dimensions})
                for r in group_by_dims:
                    if r in ds_columns_dict:
                        # Create hierarchy local to the dataset
                        for d in ds.dimensions:
                            if strcmp(r, d.code):
                                if d.code_list:
                                    h = convert_code_list_to_hierarchy(
                                        d.code_list)
                                    h.name = result_name + "_" + r
                                    glb_idx.put(h.key(), h)
                                    break

                # Pivot table using Group by
                if True:
                    groups = df.groupby(by=group_by_dims,
                                        as_index=False)  # Split
                    d = OrderedDict([])
                    lst_names = []
                    if len(values) == len(agg_funcs):
                        for i, (value,
                                agg_func) in enumerate(zip(values, agg_funcs)):
                            if len(out_names) == len(values) and out_names[i]:
                                lst_names.append(out_names[i])
                            else:
                                lst_names.append(agg_names[agg_func] + "_" +
                                                 value)
                            lst = d.get(value, [])
                            lst.append(agg_func)
                            d[value] = lst
                    else:
                        for value in values:
                            lst = d.get(value, [])
                            for agg_func in agg_funcs:
                                lst.append(agg_func)
                                lst_names.append(agg_names[agg_func] + "_" +
                                                 value)
                            d[value] = lst
                    # Print the NaN count for each value column
                    for value in set(values):
                        cnt = df[value].isnull().sum()
                        print("NA count for col '" + value + "': " + str(cnt) +
                              " of " + str(df.shape[0]))
                    # AGGREGATE !!
                    df2 = groups.agg(d)

                    # Rename the aggregated columns
                    df2.columns = group_by_dims + lst_names
                else:
                    # Pivot table
                    df2 = pd.pivot_table(df,
                                         values=values,
                                         index=group_by_dims,
                                         aggfunc=[agg_funcs[0]],
                                         fill_value=np.NaN,
                                         margins=False,
                                         dropna=True)
                    # Remove the multiindex in columns
                    df2.columns = [col[-1] for col in df2.columns.values]
                    # Remove the index
                    df2.reset_index(inplace=True)
                # The result, all columns (no index), is stored for later use
                ds.data = df2
            except Exception as e:
                traceback.print_exc()
                issues.append((3, "There was a problem: " + str(e)))

        # Store the dataset in State
        datasets[result_name] = ds

        return issues, None
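
# The aggregation step above builds a {value_column: [agg_functions]} dictionary, applies it
# with DataFrame.groupby(...).agg(...) and then flattens the resulting columns. A standalone
# sketch of that pandas pattern, with illustrative column names only:
import pandas as pd

def _groupby_agg_sketch():
    df = pd.DataFrame({"geo": ["ES", "ES", "PT"], "value": [1.0, 2.0, 3.0]})
    group_by_dims = ["geo"]
    d = {"value": ["sum", "mean"]}                               # value column -> agg functions
    df2 = df.groupby(by=group_by_dims, as_index=False).agg(d)    # split + aggregate
    df2.columns = group_by_dims + ["sum_value", "avg_value"]     # flatten the MultiIndex columns
    return df2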
        def process_line(item):
            sc_src_hierarchy = item.get("source_hierarchy")
            sc_src_interface_type = item.get("source_interface_type")
            sc_tgt_hierarchy = item.get("target_hierarchy")
            sc_tgt_interface_type = item.get("target_interface_type")
            sc_scale = item.get("scale")
            sc_src_context = item.get("source_context")
            sc_tgt_context = item.get("target_context")
            sc_src_unit = item.get("source_unit")
            sc_tgt_unit = item.get("target_unit")

            # Check the existence of the interface types

            force_create = True
            if force_create:
                pass

            # Check if FactorTypes exist
            fts = []
            for i, (hierarchy, interface_type) in enumerate([
                (sc_src_hierarchy, sc_src_interface_type),
                (sc_tgt_hierarchy, sc_tgt_interface_type)
            ]):
                m = "origin" if i == 0 else "destination"
                if not interface_type:
                    issues.append(
                        Issue(itype=3,
                              description="The " + m +
                              " interface type name has not been specified",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return

                # Check if FactorType exists
                ft = glb_idx.get(FactorType.partial_key(interface_type))
                if len(ft) > 0:
                    if len(ft) == 1:
                        fts.append(ft[0])
                    else:
                        if not hierarchy:
                            issues.append(
                                Issue(
                                    itype=3,
                                    description="The hierarchy of the " + m +
                                    "interface type name has not been specified and the interface type name is not unique",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return

                        for ft2 in ft:
                            if strcmp(ft2.hierarchy.name, hierarchy):
                                fts.append(ft2)

            if len(fts) != 2:
                issues.append(
                    Issue(
                        itype=3,
                        description="Found " + str(len(fts)) +
                        " interface types in the specification of a scale change",
                        location=IssueLocation(sheet_name=name,
                                               row=r,
                                               column=None)))
                return

            # Check that the interface types are from different hierarchies (warn if not; not error)
            if fts[0].hierarchy == fts[1].hierarchy:
                issues.append(
                    Issue(itype=2,
                          description="The interface types '" + fts[0].name +
                          "' and '" + fts[1].name +
                          "' are in the same hierarchy",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))

            # Create the directed Scale (Linear "Transformation") Relationship
            origin = fts[0]
            destination = fts[1]
            FactorTypesRelationUnidirectionalLinearTransformObservation.\
                create_and_append(origin, destination, sc_scale,
                                  sc_src_context, sc_tgt_context,
                                  Observer.no_observer_specified)
def parse_scale_conversion_command(
        sh: Worksheet,
        area: AreaTupleType,
        name: str = None) -> IssuesLabelContentTripleType:
    """
    Analyze the input area
    Obtain the numerical part
    Read a row above and a column to the left, looking for source (left col) and target (row above) factor types

    FactorTypes do not need to exist previously, they can be created

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :return: list of issues (issue_type, message), command label, command content
    """
    def get_subrow(r, c1, c2):
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        previous = None
        for c in range(c1, c2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)

        return lst

    def get_subcolumn(c, r1, r2):
        lst = []
        # To deal with combined cell ranges, store "previous" value, and if "" is found, assume it is a merged cell
        # !!! This may not be correct at all times: when a cell is intentionally left blank
        # To solve this, use "sh.merged_cell_ranges" to check if the current cell (r, c) is inside a range
        previous = None
        for r in range(r1, r2):
            v = sh.cell(row=r, column=c).value
            if not v:
                if previous:
                    lst.append(previous)
                else:
                    lst.append("")
            else:
                previous = v
                lst.append(v)
        return lst

    # ---------------------------------------------

    some_error = False
    issues = []

    # Detect the matrix defining scales
    m = binary_mask_from_worksheet(
        sh, True)  # "True" is to focus on cells containing numbers
    # Locate the matrix with numbers. Assume this defines the labels to consider, they will be around the matrix
    t = obtain_rectangular_submatrices(m)[
        0]  # Take just the first tuple: U=t[0], D=t[1], L=t[2], R=t[3]
    t = (
        t[0] + 1, t[1] + 1, t[2] + 1, t[3] + 1
    )  # The previous calculation is done using Numpy, so it is Zero based. Correct this

    # Obtain the factor type names in the subrow on top of the matrix
    subrow = get_subrow(t[0] - 1, t[2], t[3])
    # Obtain the factor type names in the subcolumn to the left of the matrix
    subcol = get_subcolumn(t[2] - 1, t[0], t[1])

    # Check that we have valid factor type names
    for ft in subrow + subcol:
        try:
            parser_field_parsers.string_to_ast(
                parser_field_parsers.simple_h_name, ft)
        except:
            some_error = True
            issues.append((3, "'" + ft + "' is not a valid Factor Type name"))
    if some_error:
        return issues, None, None

    # Scan the matrix, creating scale records
    scales = []
    for i, r in enumerate(range(t[0], t[1])):
        for j, c in enumerate(range(t[2], t[3])):
            v = sh.cell(row=r, column=c).value
            if v:
                if not isinstance(v, str):
                    v = str(v)
                # Origin factor
                origin = subcol[i]
                # Destination factor
                destination = subrow[j]
                if strcmp(origin, destination):
                    issues.append(
                        (3, "A change of scale to the same factor type (" +
                         origin + ") is not allowed"))
                else:
                    try:
                        parser_field_parsers.string_to_ast(
                            parser_field_parsers.expression_with_parameters, v)
                        # Add the scale
                        scales.append(
                            dict(origin=origin,
                                 destination=destination,
                                 scale=v))
                    except:
                        issues.append(
                            (3, "The expression '" + v +
                             "' at the intersection of factor types " +
                             origin + " and " + destination +
                             " is syntactically incorrect"))

    content = {
        "origin_factor_types": subcol,
        "destination_factor_types": subrow,
        "scales": scales
    }

    return issues, None, content
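
# The parser above scans a numeric matrix whose column labels (top subrow) and row labels
# (left subcolumn) are factor type names, producing (origin, destination, scale) records.
# A self-contained openpyxl sketch of that scan; the sheet layout and names are made up:
from openpyxl import Workbook

def _scan_scale_matrix_sketch():
    wb = Workbook()
    ws = wb.active
    ws["B1"], ws["C1"] = "Energy", "Money"      # destination factor types (row above the matrix)
    ws["A2"], ws["A3"] = "Food", "Water"        # origin factor types (column left of the matrix)
    ws["B2"], ws["C3"] = 3.5, "2*param1"        # matrix cells: numbers or expressions
    scales = []
    for r in range(2, 4):
        for c in range(2, 4):
            v = ws.cell(row=r, column=c).value
            if v is not None:
                scales.append(dict(origin=ws.cell(row=r, column=1).value,
                                   destination=ws.cell(row=1, column=c).value,
                                   scale=str(v)))
    return scales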
    def execute(self, state: "State"):
        """
        First bring the data considering the filter
        Second, group, third aggregate
        Finally, store the result in State
        """
        issues = []
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        # DS Source + DS Name
        source = self._content["dataset_source"]
        dataset_name = self._content["dataset_name"]

        # Result name
        result_name = self._content["result_name"]
        if result_name in datasets or state.get(result_name):
            issues.append((2, "A dataset called '" + result_name +
                           "' is already stored in the registry of datasets"))

        # Dataset metadata
        dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)
        # Obtain filter parameters
        # Native dimension name to list of values the filter will allow to pass
        params = create_dictionary()
        joined_dimensions = []
        for dim in self._content["where"]:
            lst = self._content["where"][dim]
            native_dim = None
            if dim.lower() in ["startperiod", "endperiod"]:
                native_dim = dim
                lst = [lst]
            elif dim not in dims:
                # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
                for m in mappings:
                    if strcmp(mappings[m].destination, dim) and \
                            strcmp(mappings[m].source, source) and \
                            strcmp(mappings[m].dataset, dataset_name) and \
                            mappings[m].origin in dims:
                        joined_dimensions.append(
                            mappings[m].destination
                        )  # Store dimension in the original case
                        native_dim = mappings[m].origin
                        lst = obtain_reverse_codes(mappings[m].map, lst)
                        break
            else:
                # Get the dimension name with the original case
                native_dim = dims[dim].name
            if native_dim:
                if native_dim not in params:
                    f = set()
                    params[native_dim] = f
                else:
                    f = params[native_dim]
                f.update(lst)

        # Convert param contents from set to list
        for p in params:
            params[p] = [i for i in params[p]]

        # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        ds = backend.data_source_manager.get_dataset_filtered(
            source, dataset_name, params)
        df = ds.data

        # Join with mapped dimensions (augment it)
        # TODO Prepare an "m" containing ALL the mappings affecting "df"
        # TODO df2 = augment_dataframe_with_mapped_columns(df, m, ["value"])
        # TODO Does it allow adding the new column for the dimension, in case it is requested? Probably yes, but test it
        for m in mappings:
            if strcmp(mappings[m].source, source) and \
                    strcmp(mappings[m].dataset, dataset_name) and \
                    mappings[m].origin in dims:
                # TODO Change by many-to-many mapping
                # TODO augment_dataframe_with_mapped_columns(df, maps, measure_columns)
                # Elaborate a many to one mapping
                tmp = []
                for el in mappings[m].map:
                    for to in el["to"]:
                        if to["d"]:
                            tmp.append([el["o"], to["d"]])
                df_dst = pd.DataFrame(
                    tmp, columns=['sou_rce', mappings[m].destination])
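                # Illustration (hypothetical values): for a map such as
                #   [{"o": "es", "to": [{"d": "Iberia", "w": 1.0}]},
                #    {"o": "pt", "to": [{"d": "Iberia", "w": 1.0}]}]
                # "tmp" becomes [["es", "Iberia"], ["pt", "Iberia"]], so "df_dst" is a two-column
                # lookup table used below to left-join the mapped dimension onto "df".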
                for di in df.columns:
                    if strcmp(mappings[m].origin, di):
                        d = di
                        if not backend.case_sensitive:
                            df[d + "_l"] = df[d].str.lower()
                            d = d + "_l"
                        break
                df = pd.merge(df,
                              df_dst,
                              how='left',
                              left_on=d,
                              right_on='sou_rce')
                del df['sou_rce']
                if not backend.case_sensitive:
                    del df[d]

        # Aggregate (If any dimension has been specified)
        if len(self._content["group_by"]) > 0:
            # Column names where data is
            # HACK: for the case where the measure has been named "obs_value", use "value"
            values = [
                m.lower() if m.lower() != "obs_value" else "value"
                for m in self._content["measures"]
            ]
            out_names = self._content["measures_as"]
            rows = translate_case(self._content["group_by"],
                                  params)  # Group by dimension names
            lcase_rows = [d.lower() for d in rows]
            # Now joined_dimensions
            for d in joined_dimensions:
                if d.lower() in lcase_rows:
                    # Find and replace
                    for i, d2 in enumerate(rows):
                        if strcmp(d, d2):
                            rows[i] = d
                            break

            aggs = []  # Aggregation functions
            agg_names = {}
            for f in self._content["agg_funcs"]:
                if f.lower() in ["avg", "average"]:
                    aggs.append(np.average)
                    agg_names[np.average] = "avg"
                elif f.lower() in ["sum"]:
                    aggs.append(np.sum)
                    agg_names[np.sum] = "sum"
                elif f.lower() in ["count"]:
                    aggs.append(np.size)
                    agg_names[np.size] = "count"
                elif f.lower() in ["sumna"]:
                    aggs.append(np.nansum)
                    agg_names[np.nansum] = "sumna"
                elif f.lower() in ["countav"]:
                    aggs.append("count")
                    agg_names["count"] = "countav"
                elif f.lower() in ["avgna"]:
                    aggs.append(np.nanmean)
                    agg_names[np.nanmean] = "avgna"
                elif f.lower() in ["pctna"]:
                    aggs.append(pctna)
                    agg_names[pctna] = "pctna"

            # Calculate Pivot Table. The columns are a combination of values x aggregation functions
            # For instance, if two values ["v1", "v2"] and two agg. functions ["avg", "sum"] are provided
            # the columns will be: [["average", "v1"], ["average", "v2"], ["sum", "v1"], ["sum", "v2"]]
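            # Illustration (hypothetical names): when len(values) == len(aggs), values and
            # aggregation functions are paired one-to-one, e.g. values=["v1", "v2"] with
            # aggs=[np.average, np.sum] yields the output columns ["avg_v1", "sum_v2"];
            # when the lengths differ, every aggregation function is applied to every value.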
            try:
                # Check that all "rows" on which pivot table aggregates are present in the input "df"
                # If not, either synthesize them (only if there is a single filter value) or remove them from the grouping (if not present)
                df_columns_dict = create_dictionary(
                    data={c: c
                          for c in df.columns})
                for r in rows.copy():
                    if r not in df_columns_dict:
                        found = False
                        for k in params:
                            if strcmp(k, r):
                                found = True
                                if len(params[k]) == 1:
                                    df[r] = params[k][0]
                                else:
                                    rows.remove(r)
                                    issues.append((
                                        2, "Dimension '" + r +
                                        "' removed from the list of dimensions because it is not present in the raw input dataset."
                                    ))
                                break
                        if not found:
                            rows.remove(r)
                            issues.append((
                                2, "Dimension '" + r +
                                "' removed from the list of dimensions because it is not present in the raw input dataset."
                            ))
                # Put proper DIMENSION names
                for ir, r in enumerate(rows):
                    if r in df_columns_dict:
                        rows[ir] = df_columns_dict[r]

                # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
                ds_columns_dict = create_dictionary(
                    data={c.code: c.code
                          for c in ds.dimensions})
                for r in rows:
                    if r in ds_columns_dict:
                        # Create hierarchy local to the dataset
                        for d in ds.dimensions:
                            if strcmp(r, d.code):
                                if d.code_list:
                                    h = convert_code_list_to_hierarchy(
                                        d.code_list)
                                    h.name = result_name + "_" + r
                                    glb_idx.put(h.key(), h)
                                    break

                # Pivot table using Group by
                # if True:
                groups = df.groupby(by=rows, as_index=False)  # Split
                d = OrderedDict([])
                lst_names = []
                if len(values) == len(aggs):
                    for i, t in enumerate(zip(values, aggs)):
                        v, agg = t
                        if len(out_names) == len(values):
                            if out_names[i]:
                                lst_names.append(out_names[i])
                            else:
                                lst_names.append(agg_names[agg] + "_" + v)
                        else:
                            lst_names.append(agg_names[agg] + "_" + v)
                        lst = d.get(v, [])
                        lst.append(agg)
                        d[v] = lst
                else:
                    for v in values:
                        lst = d.get(v, [])
                        for agg in aggs:
                            lst.append(agg)
                            lst_names.append(agg_names[agg] + "_" + v)
                        d[v] = lst
                # Print the NaN count for each value column
                for v in set(values):
                    cnt = df[v].isnull().sum()
                    print("NA count for col '" + v + "': " + str(cnt) +
                          " of " + str(df.shape[0]))
                # AGGREGATE !!
                df2 = groups.agg(d)

                # Rename the aggregated columns
                df2.columns = rows + lst_names
                # else:
                #     # Pivot table
                #     df2 = pd.pivot_table(df,
                #                          values=values,
                #                          index=rows,
                #                          aggfunc=[aggs[0]], fill_value=np.NaN, margins=False,
                #                          dropna=True)
                #     # Remove the multiindex in columns
                #     df2.columns = [col[-1] for col in df2.columns.values]
                #     # Remove the index
                #     df2.reset_index(inplace=True)
                # The result, all columns (no index), is stored for later use
                ds.data = df2
            except Exception as e:
                issues.append(
                    (3, "There was a problem with the grouping: " + repr(e)))

        # Store the dataset in State
        datasets[result_name] = ds

        return issues, None
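# A minimal pandas sketch of the filter -> group -> aggregate flow performed by the execute()
# method above, on hypothetical data (column, dimension and measure names are illustrative):
import numpy as np
import pandas as pd

raw = pd.DataFrame({"geo": ["ES", "ES", "PT"],
                    "year": [2015, 2016, 2015],
                    "value": [1.0, 2.0, 3.0]})
filtered = raw[raw["geo"].isin(["ES"])]                 # "where" filter
grouped = filtered.groupby(by=["geo"], as_index=False)  # "group_by" dimensions
aggregated = grouped.agg({"value": [np.sum]})           # aggregation function(s) per measure
aggregated.columns = ["geo", "sum_value"]               # rename the aggregated columns
# "aggregated" now holds one row per group, ready to be stored as the resulting dataset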
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType, dataset_name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    # Dataset source
    source = obtain_dataset_source(dataset_name)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = [k.lower() for k in dims[d].code_list.keys()]  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True
    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            cl[mappings[m].destination] = set(tmp)  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    measures = []
    out_dims = []
    agg_funcs = []
    measures_as = []
    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue

        if col_name.lower().strip() in ["dimensions_kept", "dims", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for d in lst:
                if not d:
                    continue
                if d not in cl:
                    issues.append((3, "The dimension specified for output, '"+d+"' is neither a dataset dimension nor a mapped dimension. ["+', '.join([d2 for d2 in cl])+"]"))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["aggregation_function", "aggfunc", "agg_func"]:  # "SELECT AGGREGATORS"
            lst = obtain_column(c, area[0] + 1, area[1])
            for f in lst:
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append((3, "The specified aggregation function, '"+f+"' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'"))
                else:
                    agg_funcs.append(f)
        elif col_name.lower().strip() in ["measures"]:  # "SELECT"
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for m in lst:
                if not m:
                    continue
                if m not in meas:
                    issues.append((3, "The specified measure, '"+m+"' is not a measure available in the dataset. ["+', '.join([m2 for m2 in measures])+"]"))
                else:
                    measures.append(m)
        elif col_name.lower().strip() in ["measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for m in lst:
                measures_as.append(m)
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for cd in lst:
                if not cd:
                    continue
                if str(cd).lower() not in cl[col_name]:
                    issues.append((3, "The code '"+cd+"' is not present in the codes declared for dimension '"+col_name+"'. Please, check them."))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "endperiod"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except:
                    issues.append(
                        (3, "Column '" + col_name + "' has an invalid dataset name '" + result_name + "'"))

    if len(measures) == 0:
        issues.append((3, "At least one measure should be specified"))

    if len(agg_funcs) == 0:
        issues.append((2, "No aggregation function specified. Assuming 'average'"))
        agg_funcs.append("average")

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append((2, "No result name specified. Assuming '"+result_name+"'"))

    content = {"dataset_source": source,
               "dataset_name": dataset_name,
               "dataset_datetime": None,
               "where": filter_,
               "dimensions": [d for d in dims],
               "group_by": out_dims,
               "measures": measures,
               "agg_funcs": agg_funcs,
               "measures_as": measures_as,
               "result_name": result_name
               }
    return issues, None, content
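# Shape of the "content" dictionary returned by parse_etl_external_dataset_command, shown with
# hypothetical values (the keys are exactly the ones assembled above):
example_content = {
    "dataset_source": "Eurostat",
    "dataset_name": "nama_10_gdp",
    "dataset_datetime": None,
    "where": {"geo": ["ES", "PT"], "StartPeriod": "2015"},
    "dimensions": ["geo", "unit", "time_period"],
    "group_by": ["geo"],
    "measures": ["obs_value"],
    "agg_funcs": ["sum"],
    "measures_as": ["GDP"],
    "result_name": "Eurostat_nama_10_gdp"
}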
Example #12
        def process_line(item):
            # Read variables
            ft_h_name = item.get(
                "interface_type_hierarchy",
                "_default")  # "_default" InterfaceType Hierarchy NAME <<<<<<
            ft_name = item.get("interface_type", None)
            ft_sphere = item.get("sphere", None)
            ft_roegen_type = item.get("roegen_type", None)
            ft_parent = item.get("parent_interface_type", None)
            ft_formula = item.get("formula", None)
            ft_description = item.get("description", None)
            ft_unit = item.get("unit", None)
            # ft_orientation = item.get("orientation", None)
            ft_attributes = item.get("attributes", {})
            if ft_attributes:
                try:
                    attributes = dictionary_from_key_value_list(
                        ft_attributes, glb_idx)
                except Exception as e:
                    issues.append(
                        Issue(itype=3,
                              description=str(e),
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                attributes = {}

            # Process
            # Mandatory fields
            if not ft_h_name:
                issues.append(
                    Issue(itype=3,
                          description=
                          "Empty interface type hierarchy name. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if not ft_name:
                issues.append(
                    Issue(itype=3,
                          description="Empty interface type name. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            # Check if a hierarchy of interface types by the name <ft_h_name> exists, if not, create it and register it
            hie = glb_idx.get(Hierarchy.partial_key(name=ft_h_name))
            if not hie:
                hie = Hierarchy(name=ft_h_name, type_name="interfacetype")
                glb_idx.put(hie.key(), hie)
            else:
                hie = hie[0]

            # If parent defined, check if it exists
            # (it must be registered both in the global registry AND in the hierarchy)
            if ft_parent:
                parent = glb_idx.get(FactorType.partial_key(ft_parent))
                if len(parent) > 0:
                    for p in parent:
                        if p.hierarchy == hie:
                            parent = p
                            break
                    if not isinstance(parent, FactorType):
                        issues.append(
                            Issue(itype=3,
                                  description="Parent interface type name '" +
                                  ft_parent + "' not found in hierarchy '" +
                                  ft_h_name + "'",
                                  location=IssueLocation(sheet_name=name,
                                                         row=r,
                                                         column=None)))
                        return
                else:
                    issues.append(
                        Issue(itype=3,
                              description="Parent interface type name '" +
                              ft_parent + "' not found",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
                # Double check, it must be defined in "hie"
                if ft_parent not in hie.codes:
                    issues.append(
                        Issue(itype=3,
                              description="Parent interface type name '" +
                              ft_parent +
                              "' not registered in the hierarchy '" +
                              ft_h_name + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                parent = None

            # Check if FactorType exists
            ft = glb_idx.get(FactorType.partial_key(ft_name))
            if len(ft) == 0:
                # TODO Compile and CONSIDER attributes (on the FactorType side)
                roegen_type = None
                if ft_roegen_type:
                    roegen_type = FlowFundRoegenType.flow if strcmp(
                        ft_roegen_type, "flow") else FlowFundRoegenType.fund

                ft = FactorType(
                    ft_name,
                    parent=parent,
                    hierarchy=hie,
                    roegen_type=roegen_type,
                    tags=None,  # No tags
                    attributes=dict(unit=ft_unit,
                                    description=ft_description,
                                    **attributes),
                    expression=ft_formula,
                    # orientation=ft_orientation,
                    sphere=ft_sphere)
                # Simple name
                glb_idx.put(FactorType.partial_key(ft_name, ft.ident), ft)
                if not strcmp(ft_name, ft.full_hierarchy_name()):
                    glb_idx.put(
                        FactorType.partial_key(ft.full_hierarchy_name(),
                                               ft.ident), ft)
            else:
                issues.append(
                    Issue(itype=3,
                          description="Interface type name '" + ft_name +
                          "' already registered",
                          location=IssueLocation(sheet_name=name,
                                                 row=r + 1,
                                                 column=None)))
                return
Example #13
def parse_dataset_data_command(sh: Worksheet, area: AreaTupleType, name: str,
                               state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """

    issues: List[Issue] = []

    # Analyze column names
    col_map = create_dictionary()
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value.strip()
        # Avoid repetitions
        if col_name in col_map:
            issues.append(
                Issue(itype=3,
                      description="The column name '" + col_name +
                      "' is repeated",
                      location=IssueLocation(sheet_name=name, row=1,
                                             column=c)))

        if strcmp(col_name, "DatasetName") or strcmp(col_name, "Dataset"):
            col_map["dataset"] = c
        elif col_name:
            # Concept name
            col_map[col_name] = c

    if "dataset" not in col_map:
        issues.append(
            Issue(
                itype=3,
                description=
                "The column name 'DatasetName' is not defined for command 'DatasetData'",
                location=IssueLocation(sheet_name=name, row=1, column=c)))

    if any([i.itype == 3 for i in issues]):
        return issues, None, None

    # Read all the content into a list of lists
    lines = []
    for r in range(area[0] + 1, area[1]):
        line = []
        for col_name, c in col_map.items():
            v = sh.cell(row=r, column=c).value
            if isinstance(v, str):
                v = v.strip()
            line.append(v)
        lines.append(line)

    # pd.DataFrame
    df = pd.DataFrame(columns=[col_name for col_name in col_map], data=lines)

    # Find the different datasets
    datasets = df["dataset"].unique()
    datasets = set([d.lower() for d in datasets])

    content = []  # The output JSON
    for dataset in datasets:
        # Obtain filtered
        df2 = df.loc[df['dataset'].str.lower() == dataset].copy()
        # Convert to JSON and store in content
        del df2["dataset"]
        s = StringIO()
        df2.to_json(s, orient="split")
        content.append(dict(name=dataset, values=s.getvalue()))

    return issues, None, dict(items=content, command_name=name)
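# A minimal sketch (hypothetical data) of how the block above splits the rows per dataset and
# serializes each slice with the "split" JSON orientation:
from io import StringIO

import pandas as pd

df_example = pd.DataFrame({"dataset": ["ds1", "ds1", "ds2"],
                           "geo": ["ES", "PT", "ES"],
                           "value": [1, 2, 3]})
items = []
for ds_name in set(df_example["dataset"].str.lower()):
    slice_ = df_example.loc[df_example["dataset"].str.lower() == ds_name].copy()
    del slice_["dataset"]
    buf = StringIO()
    slice_.to_json(buf, orient="split")
    items.append(dict(name=ds_name, values=buf.getvalue()))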
Example #14
        def parse_and_unfold_line(item):
            # Consider multiplicity because of:
            # - A dataset (only one). First a list of dataset concepts used in the line is obtained.
            #   Then the unique tuples formed by them are obtained.
            # - Processor name.
            #   - A set of processors (wildcard or filter by attributes)
            #   - A set of interfaces (according to another filter?)
            # - Multiple types of relation
            # - Both (first each dataset record applied -expanded-, then the name evaluation is applied)
            # - UNRESOLVED: expressions are resolved only partially; the parts that depend on
            #   parameters are kept as expressions, and only the parts that vary per record are expanded
            # - The processor name could be a concatenation of multiple literals
            #
            # Look for multiple items in r_source_processor_name, r_source_interface_name,
            #                            r_target_processor_name, r_target_interface_name
            if item["_complex"]:
                asts = parse_line(item, fields)
                if item["_expandable"]:
                    # It is an expandable line
                    # Look for fields which are specified to be variable in order to originate the expansion
                    res = classify_variables(asts, datasets, hh, parameters)
                    ds_list = res["datasets"]
                    ds_concepts = res["ds_concepts"]
                    h_list = res["hierarchies"]
                    if len(ds_list) >= 1 and len(h_list) >= 1:
                        issues.append(
                            Issue(
                                itype=3,
                                description="Dataset(s): " +
                                ", ".join([d.name for d in ds_list]) +
                                ", and hierarchy(ies): " +
                                ", ".join([h.name for h in h_list]) +
                                ", have been specified. Only a single dataset is supported.",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        return
                    elif len(ds_list) > 1:
                        issues.append(
                            Issue(
                                itype=3,
                                description=
                                "More than one dataset has been specified: " +
                                ", ".join([d.name for d in ds_list]) +
                                ", just one dataset is supported.",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        return
                    elif len(h_list) > 0:
                        issues.append(
                            Issue(
                                itype=3,
                                description=
                                "One or more hierarchies have been specified: "
                                + ", ".join([h.name for h in h_list]),
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        return
                    const_dict = obtain_dictionary_with_literal_fields(
                        item, asts)
                    if len(ds_list) == 1:
                        # If a measure is requested and not all dimensions are used, aggregate or
                        # issue an error (because it is not possible to reduce without aggregation).
                        # If only dimensions are used, then obtain all the unique tuples
                        ds = ds_list[0]
                        measure_requested = False
                        all_dimensions = set([
                            c.code for c in ds.dimensions if not c.is_measure
                        ])
                        for con in ds_concepts:
                            for c in ds.dimensions:
                                if strcmp(c.code, con):
                                    if c.is_measure:
                                        measure_requested = True
                                    else:  # Dimension
                                        all_dimensions.remove(c.code)
                        only_dimensions_requested = len(all_dimensions) == 0

                        if measure_requested and not only_dimensions_requested:
                            issues.append(
                                Issue(
                                    itype=3,
                                    description=
                                    "It is not possible to use a measure if not all dataset dimensions are used (cannot assume implicit aggregation)",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        elif not measure_requested and not only_dimensions_requested:
                            # TODO Reduce the dataset to the unique tuples (consider the current case -sensitive or not-sensitive-)
                            data = None
                        else:  # Take the dataset as-is!!!
                            data = ds.data

                        # Each row
                        for _, row in data.iterrows():
                            item2 = const_dict.copy()

                            d = {}
                            for c in ds_concepts:
                                d["{" + ds.code + "." + c + "}"] = row[c]
                            # Expand in all fields
                            for f in fields:
                                if f not in const_dict:
                                    # Replace all
                                    string = item[f]
                                    # TODO Could iterate through the variables in the field (not IN ALL FIELDS of the row)
                                    for k in sorted(d.keys(),
                                                    key=len,
                                                    reverse=True):
                                        string = string.replace(k, str(d[k]))
                                    item2[f] = string

                            print("Multiple by dataset: " + str(item2))
                            yield item2
                    else:  # No dataset, no hierarchy of categories, but it could be still complex, because of wildcards
                        # For now return just the line
                        yield item
                        # wildcard_in_source = ".." in item.get("source_processor", "")
                        # wildcard_in_target = ".." in item.get("target_processor", "")
                        # if wildcard_in_source or wildcard_in_target:
                        #     r_source_processor_name = string_to_ast(processor_names, item.get("source_processor", None))
                        #     r_target_processor_name = string_to_ast(processor_names, item.get("target_processor", None))
                        #     if wildcard_in_source:
                        #         source_processor_names = obtain_matching_processors(r_source_processor_name, all_processors)
                        #     else:
                        #         source_processor_names = [item["source_processor"]]
                        #     if wildcard_in_target:
                        #         target_processor_names = obtain_matching_processors(r_target_processor_name, all_processors)
                        #     else:
                        #         target_processor_names = [item["target_processor"]]
                        #     for s in source_processor_names:
                        #         for t in target_processor_names:
                        #             item3 = const_dict.copy()
                        #             item3["source_processor"] = s
                        #             item3["target_processor"] = t
                        #             print("Multiple by wildcard: "+str(item3))
                        #             yield item3
                        # else:
                        #     # yield item
                        #     raise Exception("If 'complex' is signaled, it should not pass by this line")
            else:
                # print("Single: "+str(item))
                yield item
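# A minimal sketch (illustration only) of the dataset-driven expansion performed by
# parse_and_unfold_line: every "{dataset.concept}" placeholder in a field is replaced by the
# value of that concept in each dataset row. Field names, the dataset code "ds" and the row
# values are hypothetical.
rows_example = [{"geo": "ES", "sector": "AGR"}, {"geo": "PT", "sector": "IND"}]
template_fields = {"source_processor": "Country_{ds.geo}",
                   "target_processor": "Sector_{ds.sector}"}
expanded = []
for row in rows_example:
    substitutions = {"{ds." + k + "}": v for k, v in row.items()}
    item2 = {}
    for field, text in template_fields.items():
        for placeholder in sorted(substitutions, key=len, reverse=True):
            text = text.replace(placeholder, substitutions[placeholder])
        item2[field] = text
    expanded.append(item2)
# expanded -> [{'source_processor': 'Country_ES', 'target_processor': 'Sector_AGR'},
#              {'source_processor': 'Country_PT', 'target_processor': 'Sector_IND'}]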
Example #15
    def execute(self, state: "State"):
        """
        For each parent processor clone all the child processors.
        The cloning process may pass some factor observations, which may result in
        scale relations between the interfaces of the parent and of the cloned child.
        """
        some_error = False
        issues = []

        parent_processor_type = self._content["parent_processor_type"]
        child_processor_type = self._content["child_processor_type"]
        scaled_factor = self._content["scaled_factor"]
        source = self._content["source"]
        # column_headers = self._content["column_headers"]
        # row_headers = self._content["row_headers"]
        scales = self._content["scales"]

        # Find processor sets, for parent and child
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
        if parent_processor_type not in p_sets:
            some_error = True
            issues.append((3, "The processor type '"+parent_processor_type +
                           "' (appointed for parent) has not been found in the commands execute so far"))

        if child_processor_type not in p_sets:
            some_error = True
            issues.append((3, "The processor type '"+child_processor_type +
                           "' (should be child processor) has not been found in the commands execute so far"))

        if some_error:
            return issues, None

        # CREATE the Observer of the Upscaling
        oer = glb_idx.get(Observer.partial_key(source))
        if not oer:
            oer = Observer(source)
            glb_idx.put(oer.key(), oer)
        else:
            oer = oer[0]

        # Processor Sets have associated attributes, and each of them has a code list
        parent = p_sets[parent_processor_type]  # type: ProcessorsSet
        child = p_sets[child_processor_type]  # type: ProcessorsSet

        # Form code lists from the command specification
        code_lists = None
        for sc_dict in scales:
            codes = sc_dict["codes"]
            if not code_lists:
                code_lists = [set() for _ in codes]

            for i, c in enumerate(codes):
                code_lists[i].add(c)

        # Match existing code lists (from Processor attributes) with the ones gathered in the specification of
        # the two (parent and child) processors sets.
        # Form lists of attributes of processors used in the code lists
        parent_attrs = []
        child_attrs = []
        matched = []
        for i, cl in enumerate(code_lists):
            found = False
            for attr, attr_values in parent.attributes.items():
                if set(attr_values).issuperset(cl):
                    parent_attrs.append((attr, i))  # (Attribute, code list index)
                    found = True
                    break
            for attr, attr_values in child.attributes.items():
                if set(attr_values).issuperset(cl):
                    child_attrs.append((attr, i))  # (Attribute, code list index)
                    found = True
                    break
            matched.append(found)
        for i, found in enumerate(matched):
            if not found:
                cl = code_lists[i]
                # TODO Try cl as a list of names of parent or child processors
                if not found:
                    issues.append((2, "The code list: " + ", ".join(cl) + " is not contained in the attributes of the parent processors set '" + parent_processor_type + "' nor in the attributes of the child processors set '" + child_processor_type + "'"))

        # Execute the upscale for each
        cached_processors = {}
        for sc_dict in scales:
            try:
                non_zero_weight = math.fabs(float(sc_dict["weight"])) > 1e-6
            except:
                non_zero_weight = True
            if not non_zero_weight:
                continue

            codes = sc_dict["codes"]
            # Find parent processor
            parent_dict = {attr: codes[i] for attr, i in parent_attrs}
            d2s = str(parent_dict)
            if d2s in cached_processors:
                parent = cached_processors[d2s]
                if not parent:
                    issues.append((3, "Either the tuple (" + d2s + ") did not match any Processor or matched more than one."))
            else:
                parent_dict.update(Processor.partial_key())

                # Obtain Processor matching the attributes <<<<<<<<<<
                # Query the PartialRetrievalDictionary by attributes
                parents = glb_idx.get(parent_dict)

                if len(parents) > 1:
                    issues.append((3, "The tuple ("+str(parent_dict)+") matches "+str(len(parents))+" Processors: "+(", ".join([p.name for p in parents]))))
                    parent = None
                elif len(parents) == 0:
                    issues.append((3, "The tuple (" + str(parent_dict) + ") did not match any Processor"))
                    parent = None
                else:
                    parent = parents[0]

                cached_processors[d2s] = parent

            # Find child processor
            child_dict = {attr: codes[i] for attr, i in child_attrs}
            d2s = str(child_dict)
            if d2s in cached_processors:
                child = cached_processors[d2s]
                if not child:
                    issues.append((3, "Either the tuple (" + d2s + ") did not match any Processor or matched more than one."))
            else:
                child_dict.update(Processor.partial_key())

                # Obtain Processors matching the attributes
                # Query the PartialRetrievalDictionary by attributes
                children = glb_idx.get(child_dict)

                if len(children) > 1:
                    issues.append((3, "The tuple ("+str(child_dict)+") matches "+str(len(parents))+" Processors: "+(", ".join([p.name for p in children]))))
                    child = None
                elif len(children) == 0:
                    issues.append((3, "The tuple (" + str(child_dict) + ") did not match any Processor"))
                    child = None
                else:
                    child = children[0]  # type: Processor

                cached_processors[d2s] = child

            # Clone child processor (and its descendants) and add an upscale relation between "parent" and the clone
            if parent and child:
                if non_zero_weight:
                    # Clone the child processor
                    # TODO
                    cloned_child = child.clone(state=glb_idx)
                    glb_idx.put(cloned_child.key(), cloned_child)

                    # Create the new Relation Observations
                    # - Part-of Relation
                    o1 = ProcessorsRelationPartOfObservation.create_and_append(parent, cloned_child, oer)  # Part-of
                    glb_idx.put(o1.key(), o1)
                    # - Upscale Relation
                    quantity = str(sc_dict["weight"])
                    if True:
                        # Find Interface named "scaled_factor"
                        for f in parent.factors:
                            if strcmp(f.name, scaled_factor):
                                origin = f
                                break
                        else:
                            origin = None
                        for f in cloned_child.factors:
                            if strcmp(f.name, scaled_factor):
                                destination = f
                                break
                        else:
                            destination = None

                        if origin and destination:
                            o3 = FactorsRelationScaleObservation.create_and_append(origin, destination,
                                                                                   observer=None,
                                                                                   quantity=quantity)
                            glb_idx.put(o3.key(), o3)
                        else:
                            raise Exception("Could not find Interfaces to define a Scale relation. Processors: " +
                                            parent.name+", "+cloned_child.name+"; Interface name: "+scaled_factor)
                    else:
                        o3 = ProcessorsRelationUpscaleObservation.create_and_append(parent, cloned_child,
                                                                                    observer=None,
                                                                                    factor_name=scaled_factor,
                                                                                    quantity=quantity)
                        glb_idx.put(o3.key(), o3)
            else:
                # TODO
                parent_dict = str({attr: codes[i] for attr, i in parent_attrs})
                child_dict = str({attr: codes[i] for attr, i in child_attrs})
                if not parent and child:
                    issues.append((2, "Could not find parent Processor matching attributes: "+parent_dict))
                elif not child and parent:
                    issues.append((2, "Could not find child Processor matching attributes: "+child_dict))
                else:
                    issues.append((2, "Could not find parent Processor matching attributes: "+parent_dict+", nor child Processor matching attributes: " + child_dict))

        return issues, None
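# A minimal sketch (hypothetical attributes) of how the upscale command above matches the code
# lists gathered from the "scales" specification against the attributes of a ProcessorsSet:
# each code list is assigned to the first attribute whose set of values is a superset of it.
parent_attributes = {"region": {"ES", "PT", "FR"}, "scenario": {"base", "high"}}
code_lists_example = [{"ES", "PT"}, {"base"}]
matched_attrs = []
for i, cl in enumerate(code_lists_example):
    for attr, attr_values in parent_attributes.items():
        if set(attr_values).issuperset(cl):
            matched_attrs.append((attr, i))  # (attribute name, code list index)
            break
# matched_attrs -> [('region', 0), ('scenario', 1)]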
Example #16
    def execute(self, state: "State"):
        """
        Create a Hierarchy of Taxon. The exact form of this hierarchy is different depending on the concept:
        * FactorTypes and Categories use Hierarchies, which are intrinsic.
            The hierarchy name is passed to the containing Hierarchy object
        * Processors use Part-Of Relations. In this case, the hierarchy name is lost
        Names of Processor and FactorTypes are built both in hierarchical and simple form
        The hierarchical is all the ancestors from root down to the current node, separated by "."
        The simple name is just the current node. If there is already another concept with that name, the simple name
        is not stored (STORE BOTH CONCEPTS by the same name, and design some tie breaking mechanism??)
        """
        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # Process parsed information
        for item in self._content["items"]:
            r = item["_row"]
            # HierarchySource (Optional)
            hsource = item.get("source",
                               None)  # Code of entity defining the Hierarchy
            if hsource:
                tmp = hsource
                hsource = glb_idx.get(
                    HierarchySource.partial_key(name=hsource))
                if len(hsource) == 0:
                    hsource = HierarchySource(name=tmp)
                    glb_idx.put(hsource.key(), hsource)
                else:
                    hsource = hsource[0]

            hname = item.get("hierarchy_name", None)
            if not hname:
                issues.append(
                    Issue(
                        itype=3,
                        description=
                        "The name of the Hierarchy has not been defined. Skipped.",
                        location=IssueLocation(sheet_name=name,
                                               row=r,
                                               column=None)))
                continue

            # HierarchyGroup (equivalent to Hierarchy of Code Lists, HCL)
            hg = item.get("hierarchy_group", None)
            if hg:
                is_code_list = False  # Hierarchy group
            else:
                is_code_list = True  # Hierarchy group for the Code List, with the same name
                hg = hname

            # Check if the HierarchyGroup is previously defined. YES, use it; NO, create new HierarchyGroup
            tmp = hg
            hg = glb_idx.get(HierarchyGroup.partial_key(name=hg))
            if len(hg) == 0:
                hg = HierarchyGroup(name=tmp, source=hsource)
                glb_idx.put(hg.key(), hg)
            else:
                hg = hg[0]

            # Check if the Hierarchy is defined. YES, get it; NO, create it
            tmp = hname
            h = glb_idx.get(Hierarchy.partial_key(name=hname))
            if len(h) == 0:
                h = Hierarchy(name=tmp)
                glb_idx.put(h.key(), h)
                glb_idx.put(h.key(hg.name + "." + h.name),
                            h)  # Register with alternative (full) name
            else:
                h = h[0]

            # Add the Hierarchy to the HierarchyGroup (if not)
            if h not in hg.hierarchies:
                hg.hierarchies.append(h)

            # Level
            level = item.get("level", None)
            if level:
                # Check if the level is defined. YES, get it; NO, create it
                for l in h.levels:
                    if strcmp(l.name, level):
                        level = l
                        break
                else:
                    level = HierarchyLevel(name=level, hierarchy=h)
                    h.levels.append(level)

            code = item.get("code", None)
            label = item.get("label", None)
            description = item.get("description", None)
            attributes = item.get("attributes", None)
            expression = item.get("expression", None)

            # Parent property (what really defines Hierarchies)
            parent_code = item.get("parent_code", None)
            if parent_code:
                ph = h  # Parent Hierarchy is the same as current hierarchy
                pcode = ph.codes.get(parent_code, None)
                if not pcode:
                    issues.append(
                        Issue(itype=3,
                              description="Could not find code '" +
                              parent_code + "' in hierarchy '" + ph.name +
                              "'. Skipped.",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
            else:
                pcode = None

            # ReferredHierarchy. If we are not defining a Code List, the base hierarchy has to be mentioned
            if not is_code_list:
                ref_hierarchy = item.get("referred_hierarchy", None)
                if not ref_hierarchy:
                    issues.append(
                        Issue(
                            itype=3,
                            description=
                            "For HCLs, defining ReferredHierarchy is mandatory",
                            location=IssueLocation(sheet_name=name,
                                                   row=r,
                                                   column=None)))
                    continue

                tmp = ref_hierarchy
                ref_hierarchy = glb_idx.get(
                    Hierarchy.partial_key(name=ref_hierarchy))
                if len(ref_hierarchy) == 0:
                    issues.append(
                        Issue(itype=3,
                              description="ReferredHierarchy '" + tmp +
                              "' not defined previously",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    ref_hierarchy = ref_hierarchy[0]

                ref_code = ref_hierarchy.codes.get(code, None)
                if not ref_code:
                    issues.append(
                        Issue(itype=3,
                              description="Code '" + code +
                              "' not found in referred hierarchy '" +
                              ref_hierarchy.name + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue

                # Ignore: LABEL, DESCRIPTION. Copy them from referred code
                label = ref_code.label
                description = ref_code.description
            else:
                ref_code = None

            c = h.codes.get(code, None)
            if c:
                issues.append(
                    Issue(itype=3,
                          description="Code '" + code + "' in hierarchy '" +
                          h.name + "' redefined.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                continue

            # Finally, create the HierarchyCode with all the gathered attributes, then weave it to other
            # (name, label=None, description=None, referred_node=None, parent=None, parent_weight=1.0, hierarchy=None)
            c = Taxon(name=code,
                      hierarchy=h,
                      level=level,
                      referred_taxon=ref_code,
                      parent=pcode,
                      label=label,
                      description=description,
                      attributes=attributes,
                      expression=expression)
            # Add code to hierarchy
            h.codes[code] = c
            if not c.parent:
                h.roots_append(c)
            # Add code to level
            if level:
                level.codes.add(c)
            # Add child to parent code
            # (DONE BY THE CONSTRUCTOR!!)
            # if pcode:
            #     pcode.children_codes.append(c)

        return issues, None  # Issues, Output
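# A minimal sketch (illustration only) of the naming convention described in the docstring:
# the hierarchical name of a node is the chain of ancestors joined by ".", while the simple
# name is the node itself. The tiny Node class below is hypothetical, not part of the model.
class Node:
    def __init__(self, name, parent=None):
        self.name, self.parent = name, parent

    def full_hierarchy_name(self):
        parts, n = [], self
        while n:
            parts.append(n.name)
            n = n.parent
        return ".".join(reversed(parts))

root = Node("Society")
child = Node("Household", parent=root)
# child.full_hierarchy_name() -> "Society.Household"; simple name -> "Household"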
def parse_mapping_command(sh: Worksheet, area: AreaTupleType, origin, destination) -> IssuesLabelContentTripleType:
    """
    Map from a set of categories from an external dataset into a set of MuSIASEM categories
    If the categories do not exist, they are created flat. Later they can be turned into a hierarchy and the mapping
    will still hold

    The syntax of the mapping allows expressing MANY to ONE and also MANY to MANY correspondence.
    The mapping has to be complete (all elements from left side must be covered, if not "" is assumed on the right side)

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param origin:
    :param destination:
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []
    # Analyze Origin
    cell = sh.cell(row=area[0], column=area[2])
    col_name = cell.value
    if origin:
        if not strcmp(origin, col_name):
            some_error = True
            issues.append((3, "The Origin name is different in the sheet name and in the worksheet ("+origin+", "+col_name+")"))
    else:
        origin = col_name

    #   Obtain the source, the dataset and the dimension of "origin"
    spl = origin.split(".")
    if len(spl) == 3:  # Source.Dataset.Dimension
        s, ds, dim = spl
        s = s + "."
        origin_ok = True
    elif len(spl) == 2:  # Dataset.Dimension
        ds, dim = spl
        s = ""
        origin_ok = True
    else:
        origin_ok = False
        some_error = True
        issues.append((3, "Origin must specify a dataset and a dimension name separated by '.'"))

    if origin_ok:
        origin_dataset = s + ds
        origin_dim = dim

        if not check_dataset_exists(origin_dataset):
            some_error = True
            issues.append((3, "The Origin '" + origin_dataset + "' does not match any registered dataset"))
        else:
            dims, attrs, meas = obtain_dataset_metadata(ds)
            if origin_dim not in dims:
                some_error = True
                issues.append((3, "The Origin dataset '" + origin_dataset + "' does not have a dimension '" + origin_dim + "'"))

    # Analyze Destination
    cell = sh.cell(row=area[0], column=area[2] + 1)
    col_name = cell.value
    if destination:
        if not strcmp(destination, col_name):
            some_error = True
            issues.append((3, "The Destination name is different in the sheet name and in the worksheet (" + destination + ", " + col_name + ")"))
    else:
        destination = col_name

    #  Destination name must be a simple identity
    try:
        parser_field_parsers.simple_ident.parseString(destination, parseAll=True)
    except:
        some_error = True
        issues.append((3, "'" + destination + "' category name has to be a simple identifier"))

    if some_error:  # Issues at this point are errors, return if there are any
        return issues, None, None

    # Read mapping Origin to Destination
    o_dict = create_dictionary()
    for r in range(area[0] + 1, area[1]):
        o_value = sh.cell(row=r, column=area[2]).value  # First column -> Origin
        d_value = sh.cell(row=r, column=area[2] + 1).value  # Second column -> Destination
        try:
            exp_value = sh.cell(row=r, column=area[2] + 2).value  # Third column -> Weight (for Many to Many mappings)
            if exp_value:
                try:
                    exp_value = float(exp_value)
                except:  # If it is not possible, it maybe an expression, postpone conversion until usage
                    pass
            else:
                exp_value = 1.0  # If undefined -> Many to One
        except:
            exp_value = 1.0  # If undefined -> Many to One

        if not o_value and not d_value:
            # issues.append((2, "Row " + str(r) + ": Origin and Destination are not defined. Row skipped."))
            continue
        elif not o_value or not d_value:
            if not o_value and d_value:
                issues.append((2, "Row "+str(r)+": Origin not defined. Row skipped."))
            else:
                issues.append((2, "Row " + str(r) + ": Destination not defined. Row skipped."))
            continue

        o_value = str(o_value).lower()
        d_value = str(d_value).lower()
        if o_value in o_dict:
            lst = o_dict[o_value]
        else:
            lst = []
            o_dict[o_value] = lst
        # Check "d_value" is not being repeated for "o_value"
        if (len(lst) == 0) or (len(lst) >= 1 and d_value not in [d["d"] for d in lst]):
            lst.append({"d": d_value, "w": exp_value})
        else:
            issues.append((3, "Destination category '" + destination + "' has been repeated for origin category '" + o_value + "' at row '"+str(r)+"'"))

    # List of dictionaries, where each dictionary contains the specification of an origin "o"
    # For multiple entries (many to many map), the origin maps a list "to" of dictionaries "d", "e"
    content = {"origin_dataset": origin_dataset,  # Name of the origin dataset (may include the source name)
               "origin_dimension": origin_dim,  # Name of the origin dimension inside the dataset
               "destination": destination,  # Name of the destination hierarchy
               "map": [{"o": k, "to": v} for k, v in o_dict.items()]
               }
    label = ((content["origin_dataset"] + ".") if origin_dataset else "") + content["origin_dimension"] + " -> " + content["destination"]
    return issues, label, content
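# A minimal sketch (hypothetical codes) of the "map" structure produced above and of how a
# many-to-many correspondence carries weights: each origin code "o" maps to a list of
# destination codes "d" with a weight "w" (1.0 when the weight column is left empty).
map_example = [
    {"o": "es", "to": [{"d": "iberia", "w": 1.0}]},
    {"o": "pt", "to": [{"d": "iberia", "w": 1.0}]},
    {"o": "benelux", "to": [{"d": "be", "w": 0.4}, {"d": "nl", "w": 0.4}, {"d": "lu", "w": 0.2}]},
]
# Reverse lookup (destination -> origins), similar in spirit to what obtain_reverse_codes does
# when translating a filter on mapped categories back to native dataset codes:
reverse = {}
for entry in map_example:
    for to in entry["to"]:
        reverse.setdefault(to["d"], []).append(entry["o"])
# reverse -> {'iberia': ['es', 'pt'], 'be': ['benelux'], 'nl': ['benelux'], 'lu': ['benelux']}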