Example #1
    def check_expandable(v, location):
        """
        Check if curly braces match, that what is inside is syntactically correct, (and that the value exists)

        :param v:
        :return:
        """
        import re
        reg = re.compile(r"{.*?}")
        matches = reg.findall(v)
        output = set()
        if len(matches) == 0:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=f"Incorrect syntax, no macro expansion found",
                    location=location))
        else:
            for m in matches:
                h_name = m[1:-1]
                try:
                    parser_field_parsers.string_to_ast(
                        arith_boolean_expression, h_name)  # simple_h_name
                    output.add(h_name)
                except Exception:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            f"The value {m[1:-1]} is not a valid hierarchical name",
                            location=location))
        return output
    def execute(self, state: "State"):
        """
        Process each of the references, simply storing them as Reference objects
        """
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]
        issues = []

        # Receive a list of validated references
        # Store them as objects, which can be referred to later
        for ref in self._content["items"]:
            r = ref["_row"]

            if "ref_id" not in ref:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="'ref_id' field not found: " + str(ref),
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                continue
            else:
                ref_id = ref["ref_id"]
                existing = glb_idx.get(self.ref_type.partial_key(ref_id))
                if len(existing) == 1:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Reference '" + ref_id +
                              "' of type '" + str(self.ref_type) +
                              "' is already defined. Not allowed",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                elif len(existing) > 1:  # This condition should not occur...
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The reference '" + ref_id +
                              "' of type '" + str(self.ref_type) +
                              "' is defined more than one time (" +
                              str(len(existing)) + ")",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue

                # Create and store the Reference
                reference = self.ref_type(ref_id, ref)
                glb_idx.put(reference.key(), reference)

                # BibliographicReference and ProvenanceReference are also Observers
                if isinstance(reference, Observer):
                    glb_idx.put(Observer.key(reference), reference)

        return issues, None
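
A standalone sketch of the macro-detection idea used by check_expandable above: non-greedy "{...}" groups are extracted with a regular expression and the names inside them are collected (the sample string is illustrative, not from the source):

import re

reg = re.compile(r"{.*?}")            # non-greedy: match each {...} group separately
matches = reg.findall("flow_{scenario}_{year}")
names = {m[1:-1] for m in matches}    # strip the surrounding braces
print(names)                          # {'scenario', 'year'} (set order may vary)
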
Example #3
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", 1.0)

            # Mapping name (distinct from the sheet name, "name", in the enclosing scope)
            mapping_name = ((mh_src_dataset + ".") if mh_src_dataset else "") + \
                           mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if mapping_name in mappings:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping '" + mapping_name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if mapping_name in local_mappings:
                d = local_mappings[mapping_name]
            else:
                d = DottedDict()
                local_mappings[mapping_name] = d
                d.name = mapping_name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                # NOTE: This could be an object instead of just a float or an expression
                to_dict[mh_dst_code] = (mh_weight, r)
                d.mapping[mh_src_code] = to_dict
def check_parameter_value(glb_idx, p, value, issues, sheet_name, row):
    retval = True
    if p.range:
        try:  # Try "numeric interval"
            ast = string_to_ast(number_interval, p.range)
            # Try to convert the value to a float
            ast2 = string_to_ast(expression_with_parameters, value)
            evaluation_issues: List[Tuple[int, str]] = []
            s = State()
            value, unresolved_vars = ast_evaluator(exp=ast2, state=s, obj=None, issue_lst=evaluation_issues)
            if value is not None:
                try:
                    value = float(value)
                    left = ast["left"]
                    right = ast["right"]
                    left_number = ast["number_left"]
                    right_number = ast["number_right"]
                    if left == "[":
                        value_meets_left = value >= left_number
                    else:
                        value_meets_left = value > left_number
                    if right == "]":
                        value_meets_right = value <= right_number
                    else:
                        value_meets_right = value < right_number
                    if not value_meets_left or not value_meets_right:
                        issues.append(Issue(itype=IType.ERROR,
                                            description=f"The value {value} specified for the parameter '{p.name}' is out of the range {p.range}",
                                            location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                        retval = False
                except Exception:
                    issues.append(Issue(itype=IType.ERROR,
                                        description=f"The parameter '{p.name}' has a non numeric value '{value}', and has been constrained with a numeric range. Please, either change the Value or the Range",
                                        location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                    retval = False
            else:
                pass  # The parameter depends on other parameters, a valid situation

        except Exception:  # Not a numeric interval: interpret the range as a hierarchy name
            h = glb_idx.get(Hierarchy.partial_key(p.range))
            h = h[0]  # Assumes the hierarchy exists; a missing one raises IndexError here
            if value not in h.codes.keys():
                issues.append(Issue(itype=IType.ERROR,
                                    description=f"The value '{value}' specified for the parameter '{p.name}' is not in the codes of the hierarchy '{p.range}': {', '.join(h.codes.keys())}",
                                    location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                retval = False

    return retval
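
The bracket handling in check_parameter_value can be isolated into a small self-contained sketch; the parameter names mirror the AST keys used above ("left", "right", "number_left", "number_right"), and the sample values are assumed:

def value_in_interval(value: float, left: str, left_number: float,
                      right: str, right_number: float) -> bool:
    # "[" and "]" are inclusive bounds; "(" and ")" are exclusive
    meets_left = value >= left_number if left == "[" else value > left_number
    meets_right = value <= right_number if right == "]" else value < right_number
    return meets_left and meets_right

print(value_in_interval(0.0, "[", 0.0, ")", 1.0))  # True: 0 lies inside [0, 1)
print(value_in_interval(1.0, "[", 0.0, ")", 1.0))  # False: 1 lies outside [0, 1)
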
Example #5
    def execute(self, state: "State"):
        issues = []
        sheet_name = self._content["command_name"]
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        for r, param in enumerate(self._content["items"]):
            name = param["name"]
            p = glb_idx.get(Parameter.partial_key(name))
            if len(p) > 0:
                issues.append(
                    Issue(itype=IType.WARNING,
                          description="The parameter '" + name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=sheet_name,
                                                 row=r,
                                                 column=None)))
                continue
            p = Parameter(name)
            p._default_value = p._current_value = param.get("value")
            p._type = param.get("type")
            p._range = param.get("domain")
            p._description = param.get("description")
            p._group = param.get("group")
            glb_idx.put(p.key(), p)
        return issues, None
Example #6
    def _add_issue(self, itype: IType, description: str) -> None:
        self._issues.append(
            Issue(itype=itype,
                  description=description,
                  location=IssueLocation(sheet_name=self._command_name,
                                         row=self._current_row_number,
                                         column=None)))
Example #7
    def execute(self, state: "State"):
        any_error = False
        issues = []
        sheet_name = self._content["command_name"]
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

        scenarios = create_dictionary()

        for r, param in enumerate(self._content["items"]):
            parameter = param["parameter"]
            scenario = param.get("scenario_name")
            p = glb_idx.get(Parameter.partial_key(parameter))
            if len(p) == 0:
                issues.append(Issue(itype=IType.ERROR,
                                    description="The parameter '" + parameter + "' has not been declared previously.",
                                    location=IssueLocation(sheet_name=sheet_name, row=r, column=None)))
                any_error = True
                continue

            p = p[0]
            name = parameter

            value = param.get("parameter_value")

            check_parameter_value(glb_idx, p, value, issues, sheet_name, r)

            description = param.get("description")  # For readability of the workbook. Not used for solving
            if scenario:
                if scenario in scenarios:
                    sp = scenarios[scenario]
                else:
                    sp = create_dictionary()
                    scenarios[scenario] = sp
                sp[name] = value
            else:
                p.current_value = value
                p.default_value = value

        if not any_error:
            solver_parameters = {}  # {p.name: p.current_value for p in glb_idx.get(Parameter.partial_key()) if p.group and strcmp(p.group, "NISSolverParameters")}
            if len(scenarios) == 0:
                scenarios["default"] = create_dictionary()
            ps = ProblemStatement(solver_parameters, scenarios)
            glb_idx.put(ps.key(), ps)

        return issues, None
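
For orientation, a sketch of the structure the loop above builds before handing it to ProblemStatement; the parameter and scenario names are assumptions for illustration:

scenarios = {
    "default": {"alpha": "1.0"},
    "high_demand": {"alpha": "2.5", "beta": "0.3"},
}
solver_parameters = {}  # solver options; left empty unless NISSolverParameters are declared
# ps = ProblemStatement(solver_parameters, scenarios)  # as in execute() above
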
Example #8
def transform_issues(issues: List[Union[dict, nexinfosys.Issue, tuple, Issue]],
                     cmd, sheet_number: int) -> Tuple[List[Issue], bool]:

    errors_exist = False
    new_issues: List[Issue] = []

    for i in issues:
        if isinstance(i, dict):
            issue = Issue(itype=IType(i["type"]),
                          description=i["message"],
                          ctype=i["c_type"],
                          location=IssueLocation(
                              sheet_name=i["sheet_name"],
                              sheet_number=i["sheet_number"]))
        elif isinstance(i, nexinfosys.Issue):  # namedtuple
            issue = Issue(itype=i.type,
                          description=i.message,
                          ctype=i.c_type,
                          location=IssueLocation(sheet_name=i.sheet_name,
                                                 sheet_number=i.sheet_number))
        elif isinstance(i, tuple):
            issue = Issue(itype=IType(i[0]),
                          description=i[1],
                          location=IssueLocation(sheet_name=""))
        else:  # isinstance(i, Issue):
            issue = i

        if issue.itype == IType.ERROR:
            errors_exist = True

        if not issue.ctype and cmd:  # "cmd" may be None when the Issue is produced by the commands container loop
            issue.ctype = cmd._serialization_type

        if not issue.location.sheet_name or issue.location.sheet_name == "":
            issue.location.sheet_name = cmd._source_block_name if hasattr(cmd, "_source_block_name") else ""

        if not issue.location.sheet_number:
            issue.location.sheet_number = sheet_number

        new_issues.append(issue)

    return new_issues, errors_exist
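
A hedged usage sketch for transform_issues, showing two of the heterogeneous input shapes it normalizes; the field values are assumptions, and the tuple form assumes the value 3 maps to an error-level IType:

raw = [
    (3, "Something failed"),  # tuple form: (itype value, description); 3 assumed to be an error level
    Issue(itype=IType.WARNING, description="Check this",
          location=IssueLocation(sheet_name="Parameters")),
]
new_issues, errors_exist = transform_issues(raw, cmd=None, sheet_number=2)
print(errors_exist)  # True if any normalized issue is an ERROR
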
    def execute(self, state: "State"):
        def process_line(item):
            # Read variables
            dsd_dataset_name = item.get("dataset_name", None)
            dsd_dataset_data_location = item.get("dataset_data_location", None)
            dsd_concept_type = item.get("concept_type", None)
            dsd_concept_name = item.get("concept_name", None)
            dsd_concept_data_type = item.get("concept_data_type", None)
            dsd_concept_domain = item.get("concept_domain", None)
            dsd_concept_description = item.get("concept_description", None)
            dsd_attributes = item.get("concept_attributes", None)
            if dsd_attributes:
                try:
                    attributes = dictionary_from_key_value_list(
                        dsd_attributes, glb_idx)
                except Exception as e:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=str(e),
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                attributes = {}

            if dsd_dataset_name in ds_names:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The dataset '" + dsd_dataset_name +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            # Internal dataset definitions cache
            ds = current_ds.get(dsd_dataset_name, None)
            if True:  # Statistical dataset format
                if not ds:
                    ds = Dataset()
                    ds.code = dsd_dataset_name  # Name
                    ds.database = None
                    ds.attributes = {}
                    current_ds[dsd_dataset_name] = ds
                if not dsd_concept_type:
                    if ds.attributes.get("_location"):
                        issues.append(
                            Issue(
                                itype=IType.WARNING,
                                description=
                                f"Location of data for dataset {ds.code} previously declared. "
                                f"Former: {attributes.get('_location')}, "
                                f"Current: {dsd_dataset_data_location}",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        attributes = ds.attributes
                    else:
                        attributes["_dataset_first_row"] = r
                    attributes[
                        "_location"] = dsd_dataset_data_location  # Location
                    ds.description = dsd_concept_description
                    ds.attributes = attributes  # Set attributes
                else:  # If concept_type is defined => add a concept
                    # Check if the concept name already appears --> Error
                    for d1 in ds.dimensions:
                        if strcmp(d1.code, dsd_concept_name):
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Concept {dsd_concept_name} already declared for dataset {ds.code}",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            break

                    d = Dimension()
                    d.dataset = ds
                    d.description = dsd_concept_description
                    d.code = dsd_concept_name
                    d.is_measure = dsd_concept_type.lower() != "dimension"
                    d.is_time = (not d.is_measure
                                 and dsd_concept_data_type.lower() == "time")
                    attributes["_attribute"] = dsd_concept_type.lower() == "attribute"
                    if dsd_concept_data_type.lower() == "category":
                        # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made)
                        # h = hierarchies.get(dsd_concept_domain, None)
                        h = glb_idx.get(
                            Hierarchy.partial_key(name=dsd_concept_domain))
                        if len(h) == 0:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    "Could not find hierarchy of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        elif len(h) > 1:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    "Found more than one instance of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        else:  # len(h) == 1
                            h = h[0]
                        d.hierarchy = h
                        # Reencode the Hierarchy as a CodeList
                        cl = convert_hierarchy_to_code_list(h)
                        d.code_list = cl

                    attributes["_datatype"] = dsd_concept_data_type
                    attributes["_domain"] = dsd_concept_domain
                    d.attributes = attributes

        # -------------------------------------------------------------------------------------------------------------
        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # List of available dataset names. The newly defined datasets must not be in this list
        ds_names = list(datasets.keys())  # "datasets" maps dataset name to Dataset

        # List of available Category hierarchies
        hierarchies = create_dictionary()
        for h in hh:
            hierarchies[h.name] = h

        # Datasets being defined in this Worksheet
        current_ds = create_dictionary()  # type: Dict[str, Dataset]

        # Process parsed information
        for line in self._content["items"]:
            r = line["_row"]
            # If the line contains a reference to a dataset or hierarchy, expand it
            # If not, process it directly
            is_expansion = False
            if is_expansion:
                pass
            else:
                process_line(line)

        # Any error?
        error = any_error_issue(issues)

        # Load the data for those datasets that are not local (data defined later in the same spreadsheet)
        for ds in current_ds.values():
            if "_location" not in ds.attributes:
                error = True
                issues.append(
                    Issue(itype=IType.ERROR,
                          description=
                          "Location of data not specified, for dataset '" +
                          ds.code + "'",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
            else:
                loc = ds.attributes["_location"]
                ast = parser_field_parsers.string_to_ast(url_parser, loc)
                if ast["scheme"] != "data":
                    df = load_dataset(loc)
                    if df is None:
                        error = True
                        issues.append(
                            Issue(
                                itype=IType.ERROR,
                                description=
                                f"Could not obtain data for dataset '{ds.code}' at '{loc}'",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                    else:
                        iss = prepare_dataframe_after_external_read(
                            ds, df, name)
                        issues.extend(iss)
                        # Everything ok? Store the dataframe!
                        if len(iss) == 0:
                            ds.data = df

        if not error:
            # If no error happened, add the new Datasets to the Datasets in the "global" state
            for ds in current_ds:
                r = current_ds[ds].attributes["_dataset_first_row"]
                df = current_ds[ds].data
                if df is not None:
                    # Loop over "ds" concepts.
                    # - "dimension" concepts of type "string" generate a CodeHierarchy
                    # - Check that the DataFrame contains ALL declared concepts. If not, generate issue
                    cid = create_dictionary(data={col: col for col in df.columns})
                    col_names = list(df.columns)
                    for c in current_ds[ds].dimensions:
                        if c.code in df.columns:
                            col_names[df.columns.get_loc(cid[c.code])] = c.code  # Rename column
                            dsd_concept_data_type = c.attributes["_datatype"]
                            if dsd_concept_data_type.lower() == "string" and not c.is_measure:
                                # Freely defined dimension: build its code list from the data
                                cl = df[cid[c.code]].unique().tolist()
                                c.code_list = CodeList.construct(
                                    c.code, c.code, [""],
                                    codes=[CodeImmutable(cd, cd, "", []) for cd in cl])
                        else:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Concept '{c.code}' not defined for '{ds}' in {loc}",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                    df.columns = col_names
                datasets[ds] = current_ds[ds]

        return issues, None
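
The "_location" dispatch above hinges on the URL scheme: a "data" scheme means the dataset content is defined later in the same spreadsheet, while any other scheme triggers an external load. A minimal sketch of that decision, using plain string handling instead of url_parser (the sample URL is assumed):

loc = "https://example.org/datasets/crops.csv"
scheme = loc.split(":", 1)[0] if ":" in loc else ""
load_externally = scheme != "data"
print(scheme, load_externally)  # https True
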
Example #10
def parse_cmd_row_dict(cmd_name: str, row: Dict[str, str],
                       already_parsed_fields: Set[str],
                       location: IssueLocation) -> Tuple[bool, List[Issue]]:
    """
    Parse a row (as a dictionary) from a command
    It is used after expansion of "macros"

    :param cmd_name: Name of command
    :param row: A dictionary containing the values to parse syntactically. Keys are field names, Values are field values
    :param already_parsed_fields: Set of fields already known to be syntactically valid
    :param location: IssueLocation object to use when creating Issues
    :return: A tuple: a boolean (True if the row can be used, otherwise False) and a list of Issues
    """

    issues: List[Issue] = []

    from nexinfosys.command_field_definitions import command_fields
    field_defs_dict = {f.name: f for f in command_fields[cmd_name]}
    mandatory_not_found = {c.name for c in command_fields[cmd_name]
                           if c.mandatory and isinstance(c.mandatory, bool)}
    complex_mandatory_cols = [
        c for c in command_fields[cmd_name] if isinstance(c.mandatory, str)
    ]
    may_append = True
    complex_row = False
    for field_name, field_value in row.items():
        field_def = field_defs_dict.get(field_name)
        if not field_def:
            raise ParseException(
                f"Field '{field_name}' not found for command '{cmd_name}'")

        if field_value is not None:
            if not isinstance(field_value, str):
                field_value = str(field_value)
            field_value = field_value.strip()
        else:
            continue

        # Parse the field
        if field_def.allowed_values:
            # Case-insensitive check against the list of allowed values
            if field_value.lower() not in [v.lower() for v in field_def.allowed_values]:
                issues.append(
                    Issue(
                        itype=IType.ERROR,
                        description=
                        f"Field '{field_name}' of command '{cmd_name}' has invalid value '{field_value}'."
                        f" Allowed values are: {', '.join(field_def.allowed_values)}.",
                        location=location))
                may_append = False
            else:
                pass  # OK
        else:  # Instead of a list of values, check if a syntactic rule is met by the value
            if field_def.parser:  # Parse, just check syntax (do not store the AST)
                try:
                    if field_name not in already_parsed_fields:
                        ast = parser_field_parsers.string_to_ast(
                            field_def.parser, field_value)
                        # Rules are in charge of informing if the result is expandable and if it is complex
                        if "expandable" in ast and ast["expandable"]:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Field '{field_name}' of command '{cmd_name}' cannot be expandable again.",
                                    location=location))
                            may_append = False
                        if "complex" in ast and ast["complex"]:
                            complex_row = True
                except Exception:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            f"The value in field '{field_name}' of command '{cmd_name}' "
                            f"is not syntactically correct. Entered: {field_value}",
                            location=location))
                    may_append = False
            else:
                pass  # Valid

        if field_def.name in mandatory_not_found:
            mandatory_not_found.discard(field_def.name)

    # MODIFY INPUT Dictionary with this new Key
    if complex_row:
        row["_complex"] = complex_row

    # Append if all mandatory fields have been filled
    if len(mandatory_not_found) > 0:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                f"Mandatory columns: {', '.join(mandatory_not_found)} have not been specified",
                location=location))
        may_append = False

    # Check varying mandatory fields (fields whose mandatory-ness depends on the value of other fields)
    for c in complex_mandatory_cols:
        field_name = c.name
        if isinstance(c.mandatory, str):
            # Evaluate the condition, using the row fields as variables
            mandatory = eval(c.mandatory, None, row)
            if mandatory and field_name not in row:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="Mandatory column: " + field_name +
                          " has not been specified",
                          location=location))
                may_append = False

    return may_append, issues
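
A hypothetical call to parse_cmd_row_dict; the command name, row keys and values are assumptions for illustration, not taken from the source:

loc = IssueLocation(sheet_name="Parameters", row=4, column=None)
ok, row_issues = parse_cmd_row_dict("Parameters",
                                    {"name": "alpha", "type": "Number"},
                                    already_parsed_fields=set(),
                                    location=loc)
if not ok:
    for iss in row_issues:
        print(iss.description)
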
Example #11
    def execute(self, state: "State"):
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", 1.0)

            # Mapping name (distinct from the sheet name, "name", in the enclosing scope)
            mapping_name = ((mh_src_dataset + ".") if mh_src_dataset else "") + \
                           mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if mapping_name in mappings:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping '" + mapping_name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if mapping_name in local_mappings:
                d = local_mappings[mapping_name]
            else:
                d = DottedDict()
                local_mappings[mapping_name] = d
                d.name = mapping_name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                # NOTE: This could be an object instead of just a float or an expression
                to_dict[mh_dst_code] = (mh_weight, r)
                d.mapping[mh_src_code] = to_dict

        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        local_mappings = create_dictionary()

        # Process parsed information
        for line in self._content["items"]:
            r = line["_row"]
            # If the line contains a reference to a dataset or hierarchy, expand it
            # If not, process it directly
            is_expansion = False
            if is_expansion:
                # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items
                pass
            else:
                process_line(line)

        # Mappings post-processing
        for d in local_mappings:
            # Convert the mapping into:
            # [{"o": "", "to": [{"d": "", "w": ""}]}]
            # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
            mapping = []
            ds_rows = []  # Rows in which a dataset is mentioned
            for orig in local_mappings[d].mapping:
                lst = []
                for dst in local_mappings[d].mapping[orig]:
                    t = local_mappings[d].mapping[orig][dst]
                    lst.append(dict(d=dst, w=t[0]))
                    if local_mappings[d].origin_dataset:
                        ds_rows.append(t[1])
                mapping.append(dict(o=orig, to=lst))
            from nexinfosys.ie_imports.data_source_manager import DataSourceManager
            if local_mappings[d].origin_dataset:
                if not DataSourceManager.obtain_dataset_source(
                        local_mappings[d].origin_dataset, datasets):
                    for r in ds_rows:
                        issues.append(
                            Issue(
                                itype=IType.ERROR,
                                description=
                                f"The dataset '{local_mappings[d].origin_dataset}' was not found",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                    continue
                dims, attrs, meas = obtain_dataset_metadata(
                    local_mappings[d].origin_dataset, None, datasets)
                if local_mappings[d].origin_hierarchy not in dims:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The origin dimension '" +
                              local_mappings[d].origin_hierarchy +
                              "' does not exist in dataset '" +
                              local_mappings[d].origin_dataset + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    dim = dims[local_mappings[d].origin_hierarchy]
                    mapping = fill_map_with_all_origin_categories(dim, mapping)
            #
            origin_dataset = local_mappings[d].origin_dataset
            origin_hierarchy = local_mappings[d].origin_hierarchy
            destination_hierarchy = local_mappings[d].destination_hierarchy
            # Create Mapping and add it to Case Study mappings variable
            mappings[d] = Mapping(
                d,
                DataSourceManager.obtain_dataset_source(
                    origin_dataset, datasets), origin_dataset,
                origin_hierarchy, destination_hierarchy, mapping)

        # TODO
        # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns"
        # Put it to work !!!

        # Could one or more mappings in sequence be specified? The key is "source hierarchy + destination hierarchy"
        # Read mapping parameters

        return issues, None
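
The post-processing step above flattens the nested {origin: {destination: (weight, row)}} structure into the documented list form; a self-contained sketch of just that conversion:

raw = {"cereals": {"agriculture": (1.0, 7)}, "fruits": {"agriculture": (0.5, 8)}}
converted = [
    {"o": origin, "to": [{"d": dst, "w": t[0]} for dst, t in dests.items()]}
    for origin, dests in raw.items()
]
print(converted)
# [{'o': 'cereals', 'to': [{'d': 'agriculture', 'w': 1.0}]},
#  {'o': 'fruits', 'to': [{'d': 'agriculture', 'w': 0.5}]}]
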
Example #12
    def execute(self, state: "State"):
        """
        Create a Hierarchy of Taxon. The exact form of this hierarchy is different depending on the concept:
        * FactorTypes and Categories use Hierarchies, which are intrinsic.
            The hierarchy name is passed to the containing Hierarchy object
        * Processors use Part-Of Relations. In this case, the hierarchy name is lost
        Names of Processor and FactorTypes are built both in hierarchical and simple form
        The hierarchical is all the ancestors from root down to the current node, separated by "."
        The simple name is just the current node. If there is already another concept with that name, the simple name
        is not stored (STORE BOTH CONCEPTS by the same name, and design some tie breaking mechanism??)
        """
        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # Process parsed information
        for item in self._content["items"]:
            r = item["_row"]
            # HierarchySource (Optional)
            hsource = item.get("source",
                               None)  # Code of entity defining the Hierarchy
            if hsource:
                tmp = hsource
                hsource = glb_idx.get(
                    HierarchySource.partial_key(name=hsource))
                if len(hsource) == 0:
                    hsource = HierarchySource(name=tmp)
                    glb_idx.put(hsource.key(), hsource)
                else:
                    hsource = hsource[0]

            hname = item.get("hierarchy_name", None)
            if not hname:
                issues.append(
                    Issue(
                        itype=IType.ERROR,
                        description=
                        "The name of the Hierarchy has not been defined. Skipped.",
                        location=IssueLocation(sheet_name=name,
                                               row=r,
                                               column=None)))
                continue

            # HierarchyGroup (equivalent to Hierarchy of Code Lists, HCL)
            hg = item.get("hierarchy_group", None)
            if hg:
                is_code_list = False  # Hierarchy group
            else:
                is_code_list = True  # Hierarchy group for the Code List, with the same name
                hg = hname

            # Check if the HierarchyGroup is previously defined. YES, use it; NO, create new HierarchyGroup
            tmp = hg
            hg = glb_idx.get(HierarchyGroup.partial_key(name=hg))
            if len(hg) == 0:
                hg = HierarchyGroup(name=tmp, source=hsource)
                glb_idx.put(hg.key(), hg)
            else:
                hg = hg[0]

            # Check if the Hierarchy is defined. YES, get it; NO, create it
            tmp = hname
            h = glb_idx.get(Hierarchy.partial_key(name=hname))
            if len(h) == 0:
                h = Hierarchy(name=tmp)
                glb_idx.put(h.key(), h)
                glb_idx.put(h.key(hg.name + "." + h.name), h)  # Register with the alternative (full) name
            else:
                h = h[0]

            # Add the Hierarchy to the HierarchyGroup (if not)
            if h not in hg.hierarchies:
                hg.hierarchies.append(h)

            # Level
            level = item.get("level", None)
            if level:
                # Check if the level is defined. YES, get it; NO, create it
                for lvl in h.levels:
                    if strcmp(lvl.name, level):
                        level = lvl
                        break
                else:
                    level = HierarchyLevel(name=level, hierarchy=h)
                    h.levels.append(level)
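                # Note: the "else" above belongs to the "for" loop, not to an "if":
                # it runs only when the loop finishes without "break", i.e. when no
                # existing level matched and a new HierarchyLevel had to be created.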

            code = item.get("code", None)
            label = item.get("label", None)
            description = item.get("description", None)
            attributes = item.get("attributes", None)
            expression = item.get("expression", None)

            # Parent property (what really defines Hierarchies)
            parent_code = item.get("parent_code", None)
            if parent_code:
                ph = h  # Parent Hierarchy is the same as current hierarchy
                pcode = ph.codes.get(parent_code, None)
                if not pcode:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Could not find code '" +
                              parent_code + "' in hierarchy '" + ph.name +
                              "'. Skipped.",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
            else:
                pcode = None

            # ReferredHierarchy. If we are not defining a Code List, the base hierarchy has to be mentioned
            if not is_code_list:
                ref_hierarchy = item.get("referred_hierarchy", None)
                if not ref_hierarchy:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            "For HCLs, defining ReferredHierarchy is mandatory",
                            location=IssueLocation(sheet_name=name,
                                                   row=r,
                                                   column=None)))
                    continue

                tmp = ref_hierarchy
                ref_hierarchy = glb_idx.get(Hierarchy.partial_key(name=ref_hierarchy))
                if len(ref_hierarchy) == 0:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="ReferredHierarchy '" + tmp +
                              "' not defined previously",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    ref_hierarchy = ref_hierarchy[0]

                ref_code = ref_hierarchy.codes.get(code, None)
                if not ref_code:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Code '" + code +
                              "' not found in referred hierarchy '" +
                              ref_hierarchy.name + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue

                # Ignore: LABEL, DESCRIPTION. Copy them from referred code
                label = ref_code.label
                description = ref_code.description
            else:
                ref_code = None

            c = h.codes.get(code, None)
            if c:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="Code '" + code + "' in hierarchy '" +
                          h.name + "' redefined.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                continue

            # Finally, create the HierarchyCode with all the gathered attributes, then weave it to other
            # (name, label=None, description=None, referred_node=None, parent=None, parent_weight=1.0, hierarchy=None)
            c = Taxon(name=code,
                      hierarchy=h,
                      level=level,
                      referred_taxon=ref_code,
                      parent=pcode,
                      label=label,
                      description=description,
                      attributes=attributes,
                      expression=expression)
            # Add code to hierarchy
            h.codes[code] = c
            if not c.parent:
                h.roots_append(c)
            # Add code to level
            if level:
                level.codes.add(c)
            # Add child to parent code
            # (DONE BY THE CONSTRUCTOR!!)
            # if pcode:
            #     pcode.children_codes.append(c)

        return issues, None  # Issues, Output
Example #13
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name,
                              state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    if dataset_name is None:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                f"The name of the input dataset must be specified under column 'InputDataset'. Skipping {name} command",
                location=IssueLocation(sheet_name=name, row=None,
                                       column=None)))
        return issues, None, None

    # Obtain the source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(
                data={k: None for k in dims[d].code_list.keys()})  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(
                data={to["d"]: None
                      for o in mappings[m].map for to in o["to"] if to["d"]})
            cl[mappings[m].destination] = tmp

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.

    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian

    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions",
                                        "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The dimension specified for output, '"
                            + d +
                            "' is neither a dataset dimension nor a mapped dimension. ["
                            + ', '.join([d2 for d2 in cl]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures",
                                          "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The specified measure, '" + m +
                            "' is not a measure available in the dataset. [" +
                            ', '.join(meas) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in [
                "resultmeasuresaggregation", "resultmeasuresaggregator",
                "aggregation"
        ]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue

                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The specified aggregation function, '"
                            + f +
                            "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in [
                "resultmeasurename", "resultmeasuresnames", "resultmeasuresas",
                "measuresas"
        ]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The code '" + cd +
                            "' is not present in the codes declared for dimension '"
                            + col_name + "'. Please, check them.",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "starttime", "endperiod", "endtime"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                if col_name.lower() == "starttime":
                    col_name = "StartPeriod"
                elif col_name.lower() == "endtime":
                    col_name = "EndPeriod"
                filter_[col_name] = lst[0]  # Not a list in this case: a single number or string
        elif col_name.lower() in [
                "outputdatasetname", "outputdataset", "result_name",
                "result name", "resultname"
        ]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except Exception:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Column '" + col_name +
                              "' has an invalid dataset name '" + result_name +
                              "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=2,
                                                     column=c + 1)))

    # If more than one agg function is defined -> all must be defined
    # If no agg func is defined -> assume AVG
    # If the agg func is defined only in the first row -> extend it to the other rows
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=IType.WARNING,
                  description=
                  "No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=
                    "The aggregation function must be defined in the first row",
                    location=IssueLocation(sheet_name=name,
                                           row=1,
                                           column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Make rows uniform, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if (measure and not agg_func) or (not measure and agg_func):
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=
                    "Each measure must be associated with an aggregation function",
                    location=IssueLocation(sheet_name=name,
                                           row=r,
                                           column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [
        v["measure_as"] for v in out_measures.values() if v["measure_as"]
    ]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                "There must be one aggregation function (used for all measures) or one aggregation per measure",
                location=IssueLocation(sheet_name=name,
                                       row=1,
                                       column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No result name specified. Assuming '" +
                  result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2,
                                         column=c + 1)))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": available_at_datetime,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
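
# A minimal sketch of the aggregation-defaulting rules applied above, on plain
# dicts standing in for the worksheet rows (no real command machinery involved):
# a single aggregation function extends to every measure, none at all defaults
# to "avg", and a missing "measure as" name defaults to "<measure>_<agg_func>".
def complete_measures(rows: dict) -> dict:
    agg_funcs = [v["agg_func"] for v in rows.values() if v.get("agg_func")]
    if len(agg_funcs) == 0:
        agg_for_all = "avg"         # No function specified: assume average
    elif len(agg_funcs) == 1:
        agg_for_all = agg_funcs[0]  # One function: it applies to all measures
    else:
        agg_for_all = None          # Several functions: each row keeps its own
    for v in rows.values():
        if v.get("measure"):
            if agg_for_all:
                v["agg_func"] = agg_for_all
            if not v.get("measure_as") and v.get("agg_func"):
                v["measure_as"] = v["measure"] + "_" + v["agg_func"]
    return rows

# complete_measures({2: {"measure": "flow", "agg_func": "sum"}, 3: {"measure": "stock"}})
# -> row 3 receives agg_func "sum" and measure_as "stock_sum"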
Example #14
def check_columns(sh,
                  name: str,
                  area: Tuple,
                  cols: List[CommandField],
                  command_name: str,
                  ignore_not_found=True):
    """
    When parsing of a command starts, check the columns of the header row.
    Try to match each column with the declared column fields; report an issue
    (a warning or an error) for any column that is not declared.
    Report an error if mandatory columns are missing.

    :param sh: The worksheet being analyzed
    :param name: The name of the worksheet
    :param area: Area inside the worksheet that will be scanned
    :param cols: List of CommandField
    :param command_name: A string with the name of the command
    :param ignore_not_found: True to just warn about a column not matching the declared ones, False to report an error in that case
    :return: The map from column name to column index (or indices, for columns declared multiple times); the issues found
    """

    issues: List[Issue] = []

    # Set of mandatory columns
    mandatory_not_found = set([c.name for c in cols if c.mandatory])

    # Check columns
    col_map = {}  # From CommandField to a list of tuples (column, index)
    for c in range(area[2], area[3]):  # For each column of row 0 (Header Row)
        val = sh.cell(row=area[0], column=c).value
        if not val:
            continue
        col_name = val.strip()
        for col in cols:  # Find matching CommandField from the attribute "regex_allowed_names"
            if col.regex_allowed_names.match(col_name):
                # Found matching CommandField "col". Process
                if "@" in col_name:  # In case of use of "@", remove prefix
                    col_name = col_name[col_name.index("@") + 1:]
                # Column Name to Column Index
                if not col.many_appearances:  # Column appears once
                    if col in col_map:
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description="The column '" + col.name +
                                  "' should not appear more than one time",
                                  location=IssueLocation(sheet_name=name,
                                                         row=1,
                                                         column=c)))
                    col_map[col] = [(col_name, c)]
                else:  # Column appears one or more times
                    if col not in col_map:
                        col_map[col] = []
                    col_map[col].append((col_name, c))
                # Mandatory found (good)
                if col.name in mandatory_not_found:
                    mandatory_not_found.discard(col.name)
                break
        else:  # No match for the column "col_name"
            issues.append(
                Issue(
                    itype=IType.WARNING if ignore_not_found else IType.ERROR,
                    description=
                    f"In Header row, the column name '{col_name}' does not match any of the "
                    f"allowed column names (internal command '{command_name}')",
                    location=IssueLocation(sheet_name=name, row=1, column=c)))

    if len(mandatory_not_found) > 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="In Header row, mandatory columns: " +
                  ", ".join(mandatory_not_found) + " have not been specified",
                  location=IssueLocation(sheet_name=name, row=1, column=None)))

    return col_map, issues
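
# A hypothetical usage sketch for check_columns. "FakeField" is an invented
# stand-in that only mimics the CommandField attributes the function reads
# (name, mandatory, many_appearances, regex_allowed_names); the real
# definitions live in nexinfosys.command_field_definitions.
import re
from collections import namedtuple

FakeField = namedtuple("FakeField",
                       "name mandatory many_appearances regex_allowed_names")
fields = [
    FakeField("DatasetName", True, False, re.compile(r"datasetname", re.I)),
    FakeField("Description", False, False, re.compile(r"description", re.I)),
]
# With an openpyxl worksheet "sh" whose header row is row 1, spanning columns 1..5:
# col_map, issues = check_columns(sh, "Sheet1", (1, 20, 1, 6), fields, "MyCommand")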
    def execute(self, state: "State"):
        issues = []

        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        # List of available dataset names. The newly defined datasets must not be in this list
        ds_names = [ds.code for ds in datasets.values()]

        # List of datasets with local worksheet name
        external_dataset_names = []
        for ds in datasets.values():
            if ds.attributes["_location"].lower().startswith("data://#"):
                worksheet = ds.attributes["_location"][len("data://#"):]
                if not worksheet.lower().startswith("datasetdata "):
                    worksheet = "DatasetData " + worksheet

                if strcmp(worksheet, name):
                    external_dataset_names.append(ds.code)

        # Process parsed information
        for r, line in enumerate(self._content["items"]):
            # A dataset
            dataset_names = line["name"]
            if dataset_names == "":
                if external_dataset_names:
                    dataset_names = external_dataset_names
                else:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            "The column name 'DatasetName' was not defined for command 'DatasetData' and there is no 'location' in a DatasetDef command pointing to it",
                            location=IssueLocation(sheet_name=name,
                                                   row=1,
                                                   column=None)))
            else:
                dataset_names = [dataset_names]

            # Find it in the already available datasets. MUST EXIST
            for n in ds_names:
                for dataset_name in dataset_names:
                    if strcmp(dataset_name, n):
                        df = pd.read_json(StringIO(line["values"]),
                                          orient="split")
                        # Check columns
                        ds = datasets[n]
                        iss = prepare_dataframe_after_external_read(
                            ds, df, name)
                        issues.extend(iss)
                        # Everything ok? Store the dataframe!
                        if not any_error_issue(iss):
                            r = ds.attributes["_dataset_first_row"]
                            # Loop over "ds" concepts.
                            # - "dimension" concepts of type "string" generate a CodeHierarchy
                            # - Check that the DataFrame contains ALL declared concepts. If not, generate issue
                            # dims = translate_case([d.code for d in ds.dimensions], df.columns)
                            cid = create_dictionary(
                                data={col: col
                                      for col in df.columns})
                            col_names = list(df.columns)
                            for c in ds.dimensions:
                                if c.code in cid:
                                    col_names[df.columns.get_loc(
                                        cid[c.code])] = c.code  # Rename column
                                    dsd_concept_data_type = c.attributes[
                                        "_datatype"]
                                    if dsd_concept_data_type.lower(
                                    ) == "string" and not c.is_measure:  # Freely defined dimension
                                        cl = df[cid[c.code]].unique().tolist()
                                        c.code_list = CodeList.construct(
                                            c.code,
                                            c.code, [""],
                                            codes=[
                                                CodeImmutable(code, code, "", [])
                                                for code in cl
                                            ])
                                else:
                                    issues.append(
                                        Issue(
                                            itype=IType.ERROR,
                                            description=
                                            f"Concept '{c.code}' not defined for '{ds.code}'",
                                            location=IssueLocation(
                                                sheet_name=name,
                                                row=r,
                                                column=None)))
                            df.columns = col_names
                            ds.data = df
                        dataset_names.remove(dataset_name)
                        break

            if dataset_names:
                issues.append(
                    Issue(
                        itype=IType.ERROR,
                        description=
                        f"Metadata for the datasets: {','.join(dataset_names)}, must be defined previously",
                        location=IssueLocation(sheet_name=name,
                                               row=-1,
                                               column=-1)))

        return issues, None
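
# The "values" payloads consumed above travel in pandas' "split" orientation
# (separate "columns", "index" and "data" entries). A stand-alone round trip,
# just to show the format pd.read_json expects here:
import pandas as pd
from io import StringIO

df = pd.DataFrame({"Region": ["EU", "US"], "Value": [3, 5]})
payload = df.to_json(orient="split")
df_back = pd.read_json(StringIO(payload), orient="split")
assert df.equals(df_back)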
        def process_line(item):
            # Read variables
            dsd_dataset_name = item.get("dataset_name", None)
            dsd_dataset_data_location = item.get("dataset_data_location", None)
            dsd_concept_type = item.get("concept_type", None)
            dsd_concept_name = item.get("concept_name", None)
            dsd_concept_data_type = item.get("concept_data_type", None)
            dsd_concept_domain = item.get("concept_domain", None)
            dsd_concept_description = item.get("concept_description", None)
            dsd_attributes = item.get("concept_attributes", None)
            if dsd_attributes:
                try:
                    attributes = dictionary_from_key_value_list(
                        dsd_attributes, glb_idx)
                except Exception as e:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=str(e),
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    return
            else:
                attributes = {}

            if dsd_dataset_name in ds_names:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The dataset '" + dsd_dataset_name +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            # Internal dataset definitions cache
            ds = current_ds.get(dsd_dataset_name, None)
            if True:  # Statistical dataset format
                if not ds:
                    ds = Dataset()
                    ds.code = dsd_dataset_name  # Name
                    ds.database = None
                    ds.attributes = {}
                    current_ds[dsd_dataset_name] = ds
                if not dsd_concept_type:
                    if ds.attributes.get("_location"):
                        issues.append(
                            Issue(
                                itype=IType.WARNING,
                                description=
                                f"Location of data for dataset {ds.code} previously declared. "
                                f"Former: {attributes.get('_location')}, "
                                f"Current: {dsd_dataset_data_location}",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                        attributes = ds.attributes
                    else:
                        attributes["_dataset_first_row"] = r
                    attributes[
                        "_location"] = dsd_dataset_data_location  # Location
                    ds.description = dsd_concept_description
                    ds.attributes = attributes  # Set attributes
                else:  # If concept_type is defined => add a concept
                    # Check if the concept name already appears --> Error
                    for d1 in ds.dimensions:
                        if strcmp(d1.code, dsd_concept_name):
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Concept {dsd_concept_name} already declared for dataset {ds.code}",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            break

                    d = Dimension()
                    d.dataset = ds
                    d.description = dsd_concept_description
                    d.code = dsd_concept_name
                    d.is_measure = dsd_concept_type.lower() != "dimension"
                    d.is_time = (not d.is_measure
                                 and dsd_concept_data_type.lower() == "time")
                    attributes["_attribute"] = dsd_concept_type.lower() == "attribute"
                    if dsd_concept_data_type.lower() == "category":
                        # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made)
                        # h = hierarchies.get(dsd_concept_domain, None)
                        h = glb_idx.get(
                            Hierarchy.partial_key(name=dsd_concept_domain))
                        if len(h) == 0:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    "Could not find hierarchy of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        elif len(h) > 1:
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    "Found more than one instance of Categories '"
                                    + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=None)))
                            return
                        else:  # len(h) == 1
                            h = h[0]
                        d.hierarchy = h
                        # Reencode the Hierarchy as a CodeList
                        cl = convert_hierarchy_to_code_list(h)
                        d.code_list = cl

                    attributes["_datatype"] = dsd_concept_data_type
                    attributes["_domain"] = dsd_concept_domain
                    d.attributes = attributes
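
# A compact, side-effect-free restatement of the concept-classification rules
# used in process_line above (a sketch only; the real code also wires up
# hierarchies and code lists): anything but "dimension" is a measure, a
# non-measure whose data type is "time" is the time dimension, and
# "attribute" concepts are flagged apart.
def classify_concept(concept_type: str, data_type: str) -> dict:
    is_measure = concept_type.lower() != "dimension"
    return {
        "is_measure": is_measure,
        "is_time": (not is_measure) and data_type.lower() == "time",
        "is_attribute": concept_type.lower() == "attribute",
    }

# classify_concept("dimension", "Time")
# -> {'is_measure': False, 'is_time': True, 'is_attribute': False}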
Example #17
    def analyze_and_complete(
            self,
            structural_graph: Optional[nx.DiGraph] = None) -> List[Issue]:
        """
        It analyzes the flow graph and completes it with inferrable data.

        First, it checks whether the Flow Graph is a DAG (Directed Acyclic Graph), that is, that no cycles exist.
        Then, it follows these steps:

          How many of the node's output edges lack a weight?
            * More than 1: missing weights (WARNING)
            * Only 1: how many output edges in total?
               * Only 1: is there an opposite weight?
                  * Yes: the weight can be inferred as (1.0 / opposite weight) (INFO)
                  * No: the weight can be inferred as 1.0 (INFO)
               * More than 1: compute the sum of the other, weighted edges
                  * sum > 1: the weight cannot be inferred (WARNING)
                  * sum <= 1: the weight can be inferred as (1 - sum) (INFO)

        :return: a list of messages given by the analysis and completion of type INFO, WARNING or ERROR
        """
        issues: List[Issue] = []

        # Checking if graph is acyclic. Just looking at the direct graph is OK.
        # if not nx.algorithms.dag.is_directed_acyclic_graph(self._direct_graph):
        #     print("Cycles detected. The nodes connected in a cycle are: ")
        #     for n in nx.algorithms.cycles.simple_cycles(self._direct_graph):
        #         print(n)
        #     issues.append(Issue(IType.ERROR, 'The graph contains cycles'))
        #     return issues

        for graph, opposite_graph in [
            (self._direct_graph, self._reverse_graph),
            (self._reverse_graph, self._direct_graph)
        ]:
            for n in nx.dfs_preorder_nodes(graph):

                graph.nodes[n]['split'] = False

                # Working on output edges only of node 'n'
                all_edges = graph.out_edges(n, data=True)

                if len(all_edges) == 0:
                    continue

                # How many of the node's output edges lack a weight?
                edges_without_weight = [
                    e for e in all_edges if e[2]['weight'] is None
                ]

                if len(edges_without_weight) > 1:
                    str_edges = [
                        f'({e[0]}, {e[1]})' for e in edges_without_weight
                    ]
                    issues.append(
                        Issue(
                            IType.WARNING,
                            f'The following edges don\'t have a weight: {", ".join(str_edges)}'
                        ))

                elif len(edges_without_weight) == 1:

                    if len(all_edges) == 1:
                        edge = list(all_edges)[0]
                        u, v, data = edge
                        opposite_weight = opposite_graph[v][u]['weight']
                        if opposite_weight is not None:
                            data['weight'] = Weight(
                                0.0) if opposite_weight == 0.0 else (
                                    Weight(1.0) / opposite_weight)
                            issues.append(
                                Issue(
                                    IType.INFO,
                                    f'The weight of single output edge "{edge}" could be inferred from '
                                    f'opposite weight "{opposite_weight}"'))
                        elif structural_graph and (structural_graph.has_edge(
                                u, v) or structural_graph.has_edge(v, u)):
                            issues.append(
                                Issue(
                                    IType.WARNING,
                                    f'The weight of single output edge "{edge}" will not be inferred '
                                    f'because a structural edge exists'))
                        else:
                            data['weight'] = Weight(1.0)
                            issues.append(
                                Issue(
                                    IType.INFO,
                                    f'The weight of single output edge "{edge}" could be inferred '
                                    f'without opposite weight'))
                    else:
                        sum_other_weights = reduce(add, [
                            e[2]['weight']
                            for e in all_edges if e[2]['weight'] is not None
                        ])

                        if sum_other_weights > 1.0:
                            issues.append(
                                Issue(
                                    IType.WARNING,
                                    f'The weight of edge "{edges_without_weight[0]}" cannot be inferred, '
                                    f'the sum of the other weights is > 1.0: {sum_other_weights}'
                                ))
                        else:
                            edges_without_weight[0][2]['weight'] = Weight(
                                1.0) - sum_other_weights
                            graph.nodes[n]['split'] = True
                            issues.append(
                                Issue(
                                    IType.INFO,
                                    f'The weight of edge "{edges_without_weight[0]}" could be inferred '
                                    f'from the sum of other weights'))

                elif len(all_edges) > 1:
                    # All edges have a weight
                    sum_all_weights = reduce(
                        add, [e[2]['weight'] for e in all_edges])
                    if math.isclose(sum_all_weights, 1.0):
                        graph.nodes[n]['split'] = True

        return issues
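
# A toy run of the "1 - sum of the siblings" inference rule on a bare networkx
# DiGraph. Plain floats replace the Weight class and only one graph direction
# is considered, so this is a sketch of the rule, not of the full analysis.
import networkx as nx

g = nx.DiGraph()
g.add_edge("a", "b", weight=0.3)
g.add_edge("a", "c", weight=0.5)
g.add_edge("a", "d", weight=None)  # The weight to be inferred

for n in nx.dfs_preorder_nodes(g):
    edges = list(g.out_edges(n, data=True))
    missing = [e for e in edges if e[2]["weight"] is None]
    if len(missing) == 1 and len(edges) > 1:
        known = sum(e[2]["weight"] for e in edges if e[2]["weight"] is not None)
        if known <= 1.0:  # Otherwise the weight cannot be inferred
            missing[0][2]["weight"] = 1.0 - known

# g["a"]["d"]["weight"] is now approximately 0.2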
def parse_dataset_data_command(sh: Worksheet, area: AreaTupleType, name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:    Input worksheet
    :param area:  Area of the input worksheet to be analysed
    :param name:  Name of the worksheet
    :param state: Current state (not used by this parser)
    :return:      The command in a dict-list object (JSON ready)
    """

    issues: List[Issue] = []

    # Analyze column names
    col_map = create_dictionary()
    for c in range(area[2], area[3]):
        val = sh.cell(row=area[0], column=c).value
        if val is None:
            continue  # Empty header cell
        col_name = str(val).strip()
        # Avoid repetitions
        if col_name in col_map:
            issues.append(Issue(itype=IType.ERROR,
                                description="The column name '"+col_name+"' is repeated",
                                location=IssueLocation(sheet_name=name, row=1, column=c)))

        if strcmp(col_name, "DatasetName") or strcmp(col_name, "Dataset"):
            col_map["dataset"] = c
        elif col_name:
            # Concept name
            col_map[col_name] = c

    if any([i.itype == IType.ERROR for i in issues]):
        return issues, None, None

    # Read all the content into a list of lists
    lines = []
    for r in range(area[0] + 1, area[1]):
        line = []
        for col_name, c in col_map.items():
            v = sh.cell(row=r, column=c).value
            if isinstance(v, str):
                v = v.strip()
            line.append(v)
        lines.append(line)

    # pd.DataFrame
    df = pd.DataFrame(columns=[col_name for col_name in col_map], data=lines)

    content = []  # The output JSON

    if "dataset" in df:
        # Find the different datasets
        datasets = df["dataset"].unique()
        datasets = set([d.lower() for d in datasets])

        for dataset in datasets:
            # Obtain filtered
            df2 = df.loc[df['dataset'].str.lower() == dataset]
            # Convert to JSON and store in content
            del df2["dataset"]
            s = StringIO()
            df2.to_json(s, orient="split")
            content.append(dict(name=dataset, values=s.getvalue()))
    else:
        s = StringIO()
        df.to_json(s, orient="split")
        content.append(dict(name="", values=s.getvalue()))

    return issues, None, dict(items=content, command_name=name)
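
# The grouping performed above, in isolation: split the frame on the
# lower-cased "dataset" column, drop that column and serialize each group in
# "split" orientation (the format the DatasetData execution step reads back).
import pandas as pd
from io import StringIO

df = pd.DataFrame({"dataset": ["A", "a", "B"], "Value": [1, 2, 3]})
content = []
for ds_name in {d.lower() for d in df["dataset"].unique()}:
    group = df.loc[df["dataset"].str.lower() == ds_name].drop(columns=["dataset"])
    s = StringIO()
    group.to_json(s, orient="split")
    content.append(dict(name=ds_name, values=s.getvalue()))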
Example #19
def parse_command_in_worksheet(sh: Worksheet, area: AreaTupleType,
                               name: Optional[str],
                               cmd_name: str) -> IssuesLabelContentTripleType:
    """
    Parse command in general
    Generate a JSON
    Generate a list of issues

    :param sh: Worksheet to read
    :param area: Area of the worksheet
    :param name: Name of the worksheet
    :param cmd_name: Name of the command. Key to access "command_fields" variable. Also, shown in issue descriptions
    :return: issues List, None, content (JSON)
    """
    def check_expandable(v, location):
        """
        Check that curly braces pair up and that what is inside them is syntactically correct
        (and that the referenced value exists)

        :param v: Cell value to scan for "{...}" macro expansions
        :param location: IssueLocation to attach to any reported problem
        :return: The set of expandable names found
        """
        import re
        reg = re.compile(r"{.*?}")
        matches = reg.findall(v)
        output = set()
        if len(matches) == 0:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=f"Incorrect syntax, no macro expansion found",
                    location=location))
        else:
            for m in matches:
                h_name = m[1:-1]
                try:
                    parser_field_parsers.string_to_ast(
                        arith_boolean_expression, h_name)  # simple_h_name
                    output.add(h_name)
                except Exception:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description=
                            f"The value {m[1:-1]} is not a valid hierarchical name",
                            location=location))
        return output

    def commented_row(rn):
        # A row is commented if the value in its first column starts with "#"
        v = sh.cell(row=rn, column=1).value
        return v is not None and str(v).startswith("#")

    issues: List[Issue] = []

    from nexinfosys.command_field_definitions import command_fields

    cols = command_fields[
        cmd_name]  # List of CommandField that will guide the parsing
    col_map, local_issues = check_columns(sh, name, area, cols, cmd_name)

    if any([i.itype == IType.ERROR for i in local_issues]):
        return local_issues, None, None

    issues.extend(local_issues)

    # The "mandatoriness" of a field may depend on values in other fields (like in RefBibliographic command fields)
    # Elaborate a list of fields having this "complex" mandatory property
    complex_mandatory_cols = [c for c in cols if isinstance(c.mandatory, str)]

    content = []  # The output JSON
    # Parse each Row
    for r in range(area[0] + 1, area[1]):
        line = {}
        expandable = set(
        )  # A set of variables to be expanded. If empty, it is a literal line (not expandable)
        complex = False  # The line contains at least one field with a complex rule (which cannot be evaluated with a simple cast)

        # A row is commented if the value in the first column starts with "#" (a first empty column could be inserted
        # to ease this, just to signal commented rows)
        if commented_row(r):
            continue

        # Constant mandatory values
        mandatory_not_found = set([
            c.name for c in cols
            if c.mandatory and isinstance(c.mandatory, bool)
        ])

        # Each "field"
        for field_def in col_map.keys():
            field_name = field_def.name
            field_defined = False
            # Appearances of field (normally just once, there are attributes allowing more than one appearance)
            for col_name, col_idx in col_map[field_def]:
                # Read and prepare "value"
                value = sh.cell(row=r, column=col_idx).value
                if value is not None:
                    if isinstance(value, float):
                        if value == int(value):
                            value = str(int(value))
                        else:
                            value = str(value)
                    elif not isinstance(value, str):
                        value = str(value)
                    value = value.strip()
                    field_defined = True
                else:
                    continue

                # Check if value contains "{", expansion
                if "{" in value:
                    # Expandable. Do not parse now. Check: curly pairs, and that what is between is a
                    #  simple_h_name and that it exists: as dataset
                    expandable.update(
                        check_expandable(
                            value,
                            IssueLocation(sheet_name=name,
                                          row=r,
                                          column=col_idx)))
                    # With many appearances, just a "Key-Value list" syntax is permitted
                    if field_def.many_appearances:
                        if field_name in line:
                            line[
                                field_name] += ", " + col_name + "='" + value + "'"
                        else:
                            line[field_name] = col_name + "='" + value + "'"
                    else:
                        if field_name in line:
                            line[field_name] += ", " + value
                        else:
                            line[field_name] = value  # Store the value
                else:
                    if field_def.allowed_values:  # If the CommandField checks for a list of allowed values
                        allowed_values_dict: Dict[str, str] = {
                            v.lower(): v
                            for v in field_def.allowed_values
                        }
                        if value.lower(
                        ) not in allowed_values_dict:  # Case-insensitive check
                            issues.append(
                                Issue(
                                    itype=IType.ERROR,
                                    description=
                                    f"Field '{col_name}' of command '{cmd_name}' has invalid category "
                                    f"'{value}'. Allowed values are: {', '.join(field_def.allowed_values)}.",
                                    location=IssueLocation(sheet_name=name,
                                                           row=r,
                                                           column=col_idx)))
                        else:
                            # Use case from allowed values
                            line[field_name] = allowed_values_dict[
                                value.lower()]
                    else:  # Instead of a list of values, check if a syntactic rule is met by the value
                        if field_def.parser:  # Parse, just check syntax (do not store the AST)
                            try:
                                standalone_attribute_value = "@" in field_def.allowed_names[
                                    0]
                                if not standalone_attribute_value:
                                    ast = parser_field_parsers.string_to_ast(
                                        field_def.parser, value)
                                else:
                                    try:
                                        ast = parser_field_parsers.string_to_ast(
                                            field_def.parser, value)
                                    except Exception:
                                        ast = parser_field_parsers.string_to_ast(
                                            unquoted_string, value)

                                # Rules are in charge of informing whether the result is expandable and whether it is complex
                                if "expandable" in ast and ast["expandable"]:
                                    issues.append(
                                        Issue(
                                            itype=IType.ERROR,
                                            description=
                                            f"The value in field '{col_header}' of command "
                                            f"'{cmd_name}' should not be expandable. Entered: {value}",
                                            location=IssueLocation(
                                                sheet_name=name,
                                                row=r,
                                                column=col_idx)))
                                if "complex" in ast and ast["complex"]:
                                    complex = True

                                # With many appearances, just a "Key-Value list" syntax is permitted
                                if field_def.many_appearances:
                                    if field_name in line:
                                        line[
                                            field_name] += ", " + col_name + "='" + value + "'"
                                    else:
                                        line[
                                            field_name] = col_name + "='" + value + "'"
                                else:
                                    if field_name in line:
                                        line[field_name] += ", " + value
                                    else:
                                        line[
                                            field_name] = value  # Store the value
                            except Exception:
                                import traceback
                                traceback.print_exc()
                                col_header = sh.cell(row=1,
                                                     column=col_idx).value
                                issues.append(
                                    Issue(
                                        itype=IType.ERROR,
                                        description=
                                        f"The value in field '{col_header}' of command "
                                        f"'{cmd_name}' is not syntactically correct. Entered: {value}",
                                        location=IssueLocation(
                                            sheet_name=name,
                                            row=r,
                                            column=col_idx)))
                        else:
                            line[
                                field_name] = value  # No parser, just store blindly the value

            if field_defined and field_def.name in mandatory_not_found:
                mandatory_not_found.discard(field_def.name)

        if len(line) == 0:
            continue  # Empty line (allowed)

        # Flags to accelerate the second evaluation, during execution
        line["_row"] = r
        line["_expandable"] = list(expandable)
        line["_complex"] = complex

        # Append if all mandatory fields have been filled
        may_append = True
        if len(mandatory_not_found) > 0:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Mandatory columns: " +
                      ", ".join(mandatory_not_found) +
                      " have not been specified",
                      location=IssueLocation(sheet_name=name,
                                             row=r,
                                             column=None)))
            may_append = False

        # Check varying mandatory fields (fields depending on the value of other fields)
        for c in complex_mandatory_cols:
            field_name = c.name
            if isinstance(c.mandatory, str):
                # Evaluate the mandatoriness rule against the values parsed in this line
                mandatory = eval(c.mandatory, None, line)
                if mandatory and field_name not in line:
                    may_append = False
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Mandatory column: " + field_name +
                              " has not been specified",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))

        if may_append:
            content.append(line)

    return issues, None, {"items": content, "command_name": name}