Example #1
def parse_metadata_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType:
    """
    Most "parse" methods are purely syntactic (as opposed to semantic): they do not check the existence of names.
    In this case, however, the valid field names are fixed beforehand, so they are checked at parse time.
    Some of the fields are also controlled: their values must belong to a predefined list of allowed values.

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []
    controlled = create_dictionary()
    mandatory = create_dictionary()
    keys = create_dictionary()
    for t in metadata_fields:
        controlled[t[4]] = t[3]
        mandatory[t[4]] = t[2]
        keys[t[0]] = t[4]

    # Scan the sheet, the first column must be one of the keys of "k_list", following
    # columns can contain repeating values

    # Map key to a list of values
    content = {}  # Dictionary of lists, one per metadata key
    for r in range(area[0], area[1]):
        label = sh.cell(row=r, column=area[2]).value
        if label in keys:
            key = keys[label]
            for c in range(area[2]+1, area[3]):
                value = sh.cell(row=r, column=c).value
                if value:
                    value = str(value).strip()
                    if controlled[key]:
                        # Control "value" if the field is controllable
                        cl = {"dimensions": ["water", "energy", "food", "land", "climate"],
                              "subject_topic_keywords": None,
                              "geographical_level": ["local", "regional", "region", "country", "europe", "sectoral", "sector"],
                              "geographical_situation": None,  # TODO Read the list of all geographical regions (A long list!!)
                              "restriction_level": ["internal", "confidential", "public"],
                              "language": None,  # TODO Read the list of ALL languages (or just "English"??)
                              }
                        if cl[key] and value.lower() not in cl[key]:
                            issues.append((3, "The key '"+key+"' should be one of: "+",".join(cl[key])))

                    if key not in content:
                        content[key] = []
                    content[key].append(value)
        elif label:  # Ignore blank label cells; report unrecognized labels
            issues.append((2, "Row " + str(r) + ": unknown metadata label '" + str(label) + "'"))

    for key in keys.values():
        if mandatory[key] and key not in content:
            some_error = True
            issues.append((3, "The field '" + key + "' is mandatory in the definition of the metadata"))

    return issues, None, content
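
All of these snippets rely on a create_dictionary() helper that is not shown. Below is a minimal stand-in, assuming it simply returns a dictionary whose string keys are matched case-insensitively (consistent with how the examples mix upper- and lower-case names); the real helper may differ, e.g. by preserving the original key casing.

class CaseInsensitiveDict(dict):
    """Simplified sketch: string keys are lower-cased, so lookups ignore case."""
    @staticmethod
    def _k(key):
        return key.lower() if isinstance(key, str) else key

    def __setitem__(self, key, value):
        super().__setitem__(self._k(key), value)

    def __getitem__(self, key):
        return super().__getitem__(self._k(key))

    def __contains__(self, key):
        return super().__contains__(self._k(key))

    def get(self, key, default=None):
        return super().get(self._k(key), default)


def create_dictionary():
    return CaseInsensitiveDict()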
Example #2
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", None)

            # Mapping name
            name = ((mh_src_dataset + ".") if mh_src_dataset else
                    "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if name in mappings:
                issues.append(
                    Issue(itype=3,
                          description="The mapping '" + name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if name in local_mappings:
                d = local_mappings[name]
            else:
                d = DottedDict()
                local_mappings[name] = d
                d.name = name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=3,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code + "' has been done already",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                to_dict[mh_dst_code] = mh_weight  # NOTE: This could be an object instead of just a FLOAT or expression
                d.mapping[mh_src_code] = to_dict
Example #3
    def execute(self, state: "State"):
        any_error = False
        issues = []
        sheet_name = self._content["command_name"]
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        scenarios = create_dictionary()
        solver_parameters = create_dictionary()

        for r, param in enumerate(self._content["items"]):
            parameter = param["parameter"]
            scenario = param["scenario_name"]
            p = glb_idx.get(Parameter.partial_key(parameter))
            if scenario:
                if len(p) == 0:
                    issues.append(
                        Issue(itype=3,
                              description="The parameter '" + parameter +
                              "' has not been declared previously.",
                              location=IssueLocation(sheet_name=sheet_name,
                                                     row=r,
                                                     column=None)))
                    any_error = True
                    continue
                p = p[0]
                name = p.name
            else:
                name = parameter
            value = param["parameter_value"]
            description = param.get(
                "description",
                None)  # For readability of the workbook. Not used for solving
            if scenario:
                if scenario in scenarios:
                    sp = scenarios[scenario]
                else:
                    sp = create_dictionary()
                    scenarios[scenario] = sp
                sp[name] = value
            else:
                solver_parameters[name] = value

        if not any_error:
            ps = ProblemStatement(solver_parameters, scenarios)
            glb_idx.put(ps.key(), ps)

        return issues, None
Example #4
 def obtain_problem_statement() -> ProblemStatement:
     """
     Obtain a ProblemStatement instance
     Obtain the solver parameters plus a list of scenarios
     :return: The first registered ProblemStatement or, if none exists, a default one with a single empty "default" scenario
     """
     ps_list: List[ProblemStatement] = glb_idx.get(
         ProblemStatement.partial_key())
     if len(ps_list) == 0:
         # No scenarios (dummy), and use the default solver
         scenarios = create_dictionary()
         scenarios["default"] = create_dictionary()
         return ProblemStatement(scenarios=scenarios)
     else:
         return ps_list[0]
Example #5
 def test_003_many_to_many_2(self):
     # Prepare a many to many map from category set to category set
     # Prepare a simple DataFrame containing
     m = create_dictionary()
     m["cat_o_1"] = ("cat_d_1",
             {
               "c11": [{"d": "c21", "w": 0.6},
                       {"d": "c22", "w": 0.4}],
               "c12": [{"d": "c23", "w": 1.0}],
               "c13": [{"d": "c23", "w": 1.0}]
             }
     )
     m["cat_o_2"] = ("cat_d_2",
           {
               "c31": [{"d": "c41", "w": 0.3},
                       {"d": "c42", "w": 0.7}],
               "c32": [{"d": "c43", "w": 1.0}],
               "c33": [{"d": "c43", "w": 1.0}]
           }
     )
     # Prepare a simple DataFrame
     df = pd.DataFrame(data=[["c11", "c31", 4], ["c12", "c32", 3], ["c13", "c31", 1.5]], columns=["cat_o_1", "cat_o_2", "value"])
     # >>>>> Call Cython ACCELERATED Function <<<<<
     df2 = augment_dataframe_with_mapped_columns2(df, m, ["value"])
     # Check result
     self.assertEqual(list(df2.columns), ["cat_o_1", "cat_o_2", "cat_d_1", "cat_d_2", "value"])
     self.assertEqual(df2.shape, (7, 5))
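
augment_dataframe_with_mapped_columns2 is project-specific (and Cython-accelerated), but the row expansion it performs can be reproduced with plain pandas merges. A hedged sketch, with weights omitted for brevity, that yields the same (7, 5) shape the test checks:

import pandas as pd

def expand_with_mapping(df, origin_col, dest_col, mapping):
    # mapping: {origin_code: [{"d": destination_code}, ...]}
    rows = [(o, e["d"]) for o, entries in mapping.items() for e in entries]
    map_df = pd.DataFrame(rows, columns=[origin_col, dest_col])
    return df.merge(map_df, on=origin_col, how="left")

df = pd.DataFrame(data=[["c11", "c31", 4], ["c12", "c32", 3], ["c13", "c31", 1.5]],
                  columns=["cat_o_1", "cat_o_2", "value"])
df = expand_with_mapping(df, "cat_o_1", "cat_d_1",
                         {"c11": [{"d": "c21"}, {"d": "c22"}],
                          "c12": [{"d": "c23"}], "c13": [{"d": "c23"}]})
df = expand_with_mapping(df, "cat_o_2", "cat_d_2",
                         {"c31": [{"d": "c41"}, {"d": "c42"}],
                          "c32": [{"d": "c43"}], "c33": [{"d": "c43"}]})
print(df.shape)  # (7, 5): many-to-many entries multiply the rows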
Example #6
def obtain_dictionary_with_literal_fields(item, asts):
    d = create_dictionary()
    for f in item:
        if not f.startswith("_"):
            ast = asts[f]
            if "complex" not in ast or ("complex" in ast and not ast["complex"]):
                d[f] = item[f]
    return d
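
A hypothetical call, assuming "item" is a parsed row (underscore-prefixed keys are internal) and "asts" flags which fields hold complex expressions rather than literal values; it also assumes the create_dictionary stand-in sketched after Example #1:

item = {"processor": "Farm", "value": "p1 * 2", "_row": 7}
asts = {"processor": {"complex": False}, "value": {"complex": True}}
d = obtain_dictionary_with_literal_fields(item, asts)
# Only "processor" is kept: "_row" is internal and "value" is flagged as complex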
Example #7
    def construct(name: str, description: str, levels: List[str],
                  codes: List[CodeImmutable]):
        """

        :param name: Name of the Code List
        :param description: Description of the Code List
        :param levels: Names of the levels
        :param codes: List of codes, including in each the following tuple: CodeImmutable = namedtuple("CodeTuple", "code description level children")
        :return:
        """

        cl = CodeList()
        cl.code = name
        cl.description = description
        # Levels
        levels_dict = create_dictionary()
        for l in levels:
            cll = CodeListLevel()
            cll.code_list = cl  # Point to the containing CodeList
            cll.code = l
            cll.description = None
            levels_dict[l] = cll
        # Codes
        codes_dict = create_dictionary()
        for ct in codes:
            c = Code()
            c.code = ct.code
            c.description = ct.description
            if ct.level in levels_dict:
                c.level = levels_dict[ct.level]  # Point to the containing CodeListLevel
            else:
                c.level = None
            codes_dict[ct.code] = c
            c.children = []
            c.parents = []
        # Set children & parents
        for ct in codes:
            parent = codes_dict[ct.code]
            for ch in ct.children:
                if ch in codes_dict:
                    parent.children.append(codes_dict[ch])
                    codes_dict[ch].parents.append(parent)

        return cl
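
A hypothetical call of the constructor above (not standalone: it assumes the CodeList/CodeListLevel/Code model classes are importable), using the CodeImmutable namedtuple described in the docstring; the same call pattern appears in Example #8 below:

from collections import namedtuple

CodeImmutable = namedtuple("CodeTuple", "code description level children")

codes = [
    CodeImmutable("EU", "European Union", "level1", ["ES", "IT"]),
    CodeImmutable("ES", "Spain", "level2", []),
    CodeImmutable("IT", "Italy", "level2", []),
]
cl = CodeList.construct("geo", "Example code list", ["level1", "level2"], codes)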
Example #8
    def get_dataset_structure(self, database, dataset) -> Dataset:
        """ Obtain the structure of a dataset: concepts, dimensions, attributes and measures """
        refs = dict(references='all')
        dsd_response = estat.datastructure("DSD_" + dataset, params=refs)
        dsd = dsd_response.datastructure["DSD_" + dataset]
        metadata = dsd_response.write()
        # SDMXConcept = collections.namedtuple('Concept', 'type name istime description code_list')
        # DataSource <- Database <- DATASET <- Dimension(s) (including Measures) <- CodeList
        #                                      |
        #                                      v
        #                                      Concept <- CodeList  (NOT CONSIDERED NOW)
        ds = Dataset()
        ds.code = dataset
        ds.description = None  # How to get description?
        ds.attributes = {}  # Dataset level attributes? (encode them using a dictionary)
        ds.metadata = None  # Metadata for the dataset SDMX (flow, date of production, etc.)
        ds.database = database  # Reference to containing database

        dims = {}

        for d in dsd.dimensions:
            istime = str(dsd.dimensions.get(d)).split("|")[0].strip() == "TimeDimension"
            dd = Dimension()
            dd.code = d
            dd.description = None
            dd.attributes = None
            dd.is_time = istime
            dd.is_measure = False
            dd.dataset = ds
            dims[d] = dd
        for m in dsd.measures:
            dd = Dimension()
            dd.code = m
            dd.description = None
            dd.attributes = None
            dd.is_time = False
            dd.is_measure = True
            dd.dataset = ds
            dims[m] = dd
        for a in dsd.attributes:
            ds.attributes[a] = None  # TODO Get the value
        for l in metadata.codelist.index.levels[0]:
            first = True
            # Read code lists
            cl = create_dictionary()
            for m, v in list(zip(metadata.codelist.loc[l].index, metadata.codelist.loc[l]["name"])):
                if not first:
                    cl[m] = v
                else:
                    first = False
            # Attach it to the Dimension or Measure
            if metadata.codelist.loc[l]["dim_or_attr"][0] == "D":
                # Build Code List from dictionary
                dims[l].code_list = CodeList.construct(l, None, [""], [CodeImmutable(k, cl[k], "", []) for k in cl])

        return ds
Example #9
    def initialize_datasets_registry(self, datasets_list: List[Dataset]):
        """
        Receive a list of the datasets and make a copy

        :param datasets_list:
        :return:
        """
        self._registry = create_dictionary()
        for ds in datasets_list:
            self.register_dataset(ds.code, ds)
Example #10
    def list_all_names(self):
        """
            Returns a list of the names of registered entities considering the scopes
            Start from top level, end in bottom level (the current one, which takes precedence)
            :return:
        """
        t = create_dictionary()
        for scope in self.__scope:
            t.update(scope._registry)

        return t.keys()
Example #11
def serialize_state(state: State):
    """
    Serialization prepared for a given organization of the state

    :return:
    """
    def serialize_dataframe(df):
        return df.to_json(orient="split")  # list(df.index.names), df.to_dict()

    print("  serialize_state IN")

    import copy
    # "_datasets"
    ns_ds = {}
    # Save and nullify before deep copy
    for ns in state.list_namespaces():
        _, _, _, datasets, _ = get_case_study_registry_objects(state, ns)
        ns_ds[ns] = datasets
        state.set("_datasets", create_dictionary(), ns)  # Nullify datasets

    # !!! WARNING: It destroys "state", so a DEEP COPY is performed !!!
    tmp = sys.getrecursionlimit()
    sys.setrecursionlimit(10000)
    state2 = copy.deepcopy(state)
    sys.setrecursionlimit(tmp)

    # Iterate all namespaces
    for ns in state2.list_namespaces():
        glb_idx, p_sets, hh, _, mappings = get_case_study_registry_objects(
            state2, ns)
        if glb_idx:
            tmp = glb_idx.to_pickable()
            state2.set("_glb_idx", tmp, ns)
        datasets = ns_ds[ns]
        # TODO Serialize other DataFrames.
        # Process Datasets
        for ds_name in datasets:
            ds = datasets[ds_name]
            if isinstance(ds.data, pd.DataFrame):
                tmp = serialize_dataframe(ds.data)
            else:
                tmp = None
                # ds.data = None
            # DB serialize the datasets
            lst2 = serialize(ds.get_objects_list())
            lst2.append(tmp)  # Append the serialized DataFrame
            datasets[ds_name] = lst2
        state2.set("_datasets", datasets, ns)
    tmp = serialize_from_object(
        state2)  # <<<<<<<< SLOWEST !!!! (when debugging)
    print("  serialize_state length: " + str(len(tmp)) + " OUT")

    return tmp
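
The DataFrame part of the state is serialized with the "split" orientation. A small round-trip sketch showing how such a payload can be restored later (the rest of the state goes through the project-specific serialize helpers):

import io
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.5]}, index=["x", "y"])
payload = df.to_json(orient="split")            # what serialize_dataframe produces
restored = pd.read_json(io.StringIO(payload), orient="split")
print(restored.equals(df))                      # index, columns and values survive the round trip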
Example #12
def get_case_study_registry_objects(state, namespace=None):
    """
    Obtain the main entries of the state

    :param state: Input state (modified also)
    :param namespace: State supports several namespaces. This one serves to specify which one. Default=None
    :return: Tuple: (global index, processor sets, hierarchies, datasets, mappings)
    """
    # Index of ALL objects
    glb_idx = state.get("_glb_idx", namespace)
    if not glb_idx:
        glb_idx = PartialRetrievalDictionary()
        state.set("_glb_idx", glb_idx, namespace)

    # ProcessorSet dict (dict of sets)
    p_sets = state.get("_processor_sets", namespace)
    if not p_sets:
        p_sets = create_dictionary()
        state.set("_processor_sets", p_sets, namespace)

    # Hierarchies Dict
    hh = state.get("_hierarchies", namespace)
    if not hh:
        hh = create_dictionary()
        state.set("_hierarchies", hh, namespace)
    # Datasets Dict
    datasets = state.get("_datasets", namespace)
    if not datasets:
        datasets = create_dictionary()
        state.set("_datasets", datasets, namespace)
    # Mappings Dict
    mappings = state.get("_mappings", namespace)
    if not mappings:
        mappings = create_dictionary()
        state.set("_mappings", mappings, namespace)

    return glb_idx, p_sets, hh, datasets, mappings
Example #13
    def execute(self, state: "State"):
        """
        Create a set of linear scale conversions, from factor type to factor type
        """
        some_error = False
        issues = []

        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        origin_factor_types = self._content["origin_factor_types"]
        destination_factor_types = self._content["destination_factor_types"]
        scales = self._content["scales"]

        # Check that we have valid factor type names
        fts = create_dictionary()
        for ft_name in origin_factor_types + destination_factor_types:
            # Obtain (maybe Create) the mentioned Factor Types
            p, ft, f = find_or_create_observable(
                glb_idx,
                ft_name,
                Observer.no_observer_specified,
                None,
                proc_external=None,
                proc_attributes=None,
                proc_location=None,
                fact_roegen_type=None,
                fact_attributes=None,
                fact_incoming=None,
                fact_external=None,
                fact_location=None)
            if not ft:
                some_error = True
                issues.append((3, "Could not obtain/create the Factor Type '" +
                               ft_name + "'"))
            fts[ft_name] = ft

        if some_error:
            return issues, None

        for sc in scales:
            origin = fts[sc["origin"]]
            destination = fts[sc["destination"]]
            scale = sc["scale"]
            FactorTypesRelationUnidirectionalLinearTransformObservation.create_and_append(
                origin, destination, scale, Observer.no_observer_specified)

        return issues, None
Example #14
def get_processor_names_to_processors_dictionary(
        state: PartialRetrievalDictionary):
    """
    Obtain a dictionary with all processor names (a processor may have multiple names) and
    the corresponding Processor object

    :param state:
    :return:
    """
    ps = state.get(Processor.partial_key())
    ps = set(ps)  # Avoid repeating Processor objects
    d = create_dictionary()
    for p in ps:
        for n in p.full_hierarchy_names(state):
            d[n] = p
    return d
Example #15
 def test_001_many_to_one_1(self):
     # Prepare a many to one map from category set to category set
     m = create_dictionary()
     m["cat_o_1"] = ("cat_d_1",
           {
               "c11": [{"d": "c21", "w": 1.0}],
               "c12": [{"d": "c23", "w": 1.0}],
               "c13": [{"d": "c23", "w": 1.0}]
           }
     )
     # Prepare a simple DataFrame
     df = pd.DataFrame(data=[["c11", 4], ["c12", 3], ["c13", 1.5]], columns=["cat_o_1", "value"])
     # Call
     df2 = augment_dataframe_with_mapped_columns(df, m, ["value"])
     # Check result
     self.assertEqual(list(df2.columns), ["cat_o_1", "cat_d_1", "value"])
     self.assertEqual(df2.shape, (3, 3))
Example #16
def get_observations_OLD(prd: PartialRetrievalDictionary) \
        -> Tuple[PartialRetrievalDictionary, PartialRetrievalDictionary, Dict[str, int]]:
    """
    Process All QQ observations (intensive or extensive):
    * Store in a compact way (then clear), by Time-period, by Interface, by Observer.
    * Convert to float or prepare AST
    * Store as value the result plus the QQ observation (in a tuple)

    :param prd:
    :return: Two PartialRetrievalDictionary objects (absolute and relative observations) plus the Time Periods (indexed)
    """
    observations_prd = PartialRetrievalDictionary()
    relative_observations_prd = PartialRetrievalDictionary()
    time_periods: Dict[str, int] = create_dictionary(
    )  # Dictionary of time periods and the associated IDX
    state = State()

    next_time_period_idx = 0
    for observation in find_quantitative_observations(
            prd, processor_instances_only=True):

        # Obtain time period index
        time = observation.attributes["time"]
        if time not in time_periods:
            time_periods[time] = next_time_period_idx
            next_time_period_idx += 1

        # Elaborate Key: Interface, Time, Observer
        key = dict(__i=observation.factor,
                   __t=time_periods[time],
                   __o=observation.observer)

        value, ast, _, issues = evaluate_numeric_expression_with_parameters(
            observation.value, state)
        if not value:
            value = ast

        # Store Key: (Value, FactorQuantitativeObservation)
        if observation.is_relative:
            relative_observations_prd.put(key, (value, observation))
        else:
            observations_prd.put(key, (value, observation))

    return observations_prd, relative_observations_prd, time_periods
Example #17
def convert_code_list_to_hierarchy(cl, as_list=False):
    """
    Receives a list of codes. Codes are sorted lexicographically (to include numbers).

    Two types of coding schemes are supported, assuming that trailing zeros can be ignored when matching parent -> child
    relations: uniformly sized codes (padded with trailing zeros) and codes of growing length.

    Codes that are shorter than others but share a common prefix with them are considered their parents.

    :param cl:
    :param as_list: if True, return a flat tree (all nodes are siblings, descending from a single root)
    :return:
    """
    def can_be_child(parent_candidate, child_candidate):
        # Strip zeros to the right, from parent_candidate, and
        # check if the child starts with the resulting substring
        return child_candidate.startswith(parent_candidate.rstrip("0"))

    root = Node("")
    path = [root]
    code_to_node = create_dictionary()
    for c in sorted(cl):
        if as_list:
            n = Node(c, path[-1])
        else:
            found = False
            while len(path) > 0 and not found:
                if can_be_child(path[-1].name, c):
                    found = True
                else:
                    path.pop()
            if c.rstrip("0") == path[-1].name:
                # Just modify (it may enter here only in the root node)
                path[-1].name = c
                n = path[-1]
            else:
                # Create node and append it to the active path
                n = Node(c, path[-1])
                path.append(n)
        code_to_node[c] = n  # Map the code to the node

    return root, code_to_node
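
A hedged usage sketch, assuming Node comes from the anytree package (which matches the Node(name, parent) calls above); RenderTree is used only to print the resulting hierarchy:

from anytree import RenderTree

codes = ["10", "1010", "1020", "20", "2010"]
root, code_to_node = convert_code_list_to_hierarchy(codes)
for pre, _, node in RenderTree(root):
    print(pre + node.name)
# "10" becomes the parent of "1010" and "1020": stripping its trailing zero leaves "1",
# a prefix of both. "20" becomes the parent of "2010" in the same way.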
Example #18
def dictionary_from_key_value_list(kvl, state: State = None):
    """
    From a string containing a list of keys and values, return a dictionary
    Keys must be literals, values can be expressions, to be evaluated at a later moment

    (syntactic validity of expressions is not checked here)

    :param kvl: String containing the list of keys and values
    :raises Exception: if syntactic problems occur
    :return: A dictionary
    """
    pairs = kvl.split(",")
    d = create_dictionary()
    for p in pairs:
        k, v = p.split("=", maxsplit=1)
        if not k:
            raise Exception(
                "Each key-value pair must be separated by '=' and key has to be defined, value can be empty: "
                + kvl)
        else:
            try:
                k = k.strip()
                v = v.strip()
                string_to_ast(simple_ident, k)
                try:
                    # Simplest: string
                    string_to_ast(quotedString, v)
                    v = v[1:-1]
                except:
                    issues = []
                    ast = string_to_ast(expression_with_parameters, v)
                    res, unres = ast_evaluator(ast, state, None, issues)
                    if len(unres) == 0:
                        v = res

                d[k] = v
            except:
                raise Exception("Key must be a string: " + k +
                                " in key-value list: " + kvl)
    return d
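
The function above depends on the project's parser (string_to_ast, ast_evaluator). A minimal standalone sketch of the same idea, not the project's grammar: keys must be identifiers, quoted strings are unwrapped, and any other value is kept verbatim for later evaluation.

def simple_key_value_list(kvl: str) -> dict:
    d = {}
    for pair in kvl.split(","):
        k, _, v = pair.partition("=")
        k, v = k.strip(), v.strip()
        if not k.isidentifier():
            raise Exception("Key must be a valid identifier: " + k + " in key-value list: " + kvl)
        if len(v) >= 2 and v[0] == v[-1] and v[0] in "'\"":
            v = v[1:-1]   # quoted string literal
        d[k] = v          # expressions are stored as text, to be evaluated later
    return d

print(simple_key_value_list('level="n", threshold=p1*2'))
# {'level': 'n', 'threshold': 'p1*2'}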
Example #19
def generate_dublin_core_xml(content):
    """
    Generate an XML string with a Simple Dublin Core Record from a Case Study Metadata Command Content
    :param content:
    :return:
    """
    controlled = create_dictionary()
    for t in metadata_fields:
        controlled[t[4]] = t

    s = """<?xml version="1.0"?>
<caseStudyMetadata xmlns="http://magic-nexus.org/dmp/" xmlns:dc="http://purl.org/dc/elements/1.1/">
"""
    for key in content:
        k = controlled[key][1]
        if k:
            for l in content[key]:
                s += "    <dc:" + k + ">" + escape(
                    str(l)) + "</dc:" + k + ">\n"

    s += "</caseStudyMetadata>\n"

    return s
Example #20
def evaluate_parameters_for_scenario(base_params: List[Parameter],
                                     scenario_params: Dict[str, str]):
    """
    Obtain a dictionary (parameter -> value), where parameter is a string and value is a literal: number, boolean,
    category or string.

    Start from the base parameters then overwrite with the values in the current scenario.

    Parameters may depend on other parameters, so this has to be considered before evaluation.
    No cycles are allowed in the dependencies, i.e., if P2 depends on P1, P1 cannot depend on P2.
    To analyze this, first expressions are evaluated, extracting which parameters appear in each of them. Then a graph
    is elaborated based on this information. Finally, an algorithm to find cycles is executed.

    :param base_params:
    :param scenario_params:
    :return:
    """
    # Create dictionary without evaluation
    result_params = create_dictionary()
    result_params.update(
        {p.name: p.default_value
         for p in base_params if p.default_value})

    # Overwrite with scenario expressions or constants
    result_params.update(scenario_params)

    state = State()
    known_params = create_dictionary()
    unknown_params = create_dictionary()

    # Now, evaluate ALL expressions
    for param, expression in result_params.items():
        value, ast, params, issues = evaluate_numeric_expression_with_parameters(
            expression, state)
        if not value:  # It is not a constant, store the parameters on which this depends
            if case_sensitive:
                unknown_params[param] = (ast, set(params))
            else:
                unknown_params[param] = (ast, set([p.lower() for p in params]))
        else:  # It is a constant, store it
            result_params[param] = value  # Overwrite
            known_params[param] = value

    cycles = get_circular_dependencies(unknown_params)
    if len(cycles) > 0:
        raise Exception(
            f"Parameters cannot have circular dependencies. {len(cycles)} cycles were detected: "
            f"{':: '.join(cycles)}")

    # Initialize state with known parameters
    state.update(known_params)

    # Loop until no new parameters can be evaluated
    previous_len_unknown_params = len(unknown_params) + 1
    while len(unknown_params) < previous_len_unknown_params:
        previous_len_unknown_params = len(unknown_params)

        for param in list(
                unknown_params
        ):  # A list(...) is used because the dictionary can be modified inside
            ast, params = unknown_params[param]
            if params.issubset(known_params):
                value, _, _, issues = evaluate_numeric_expression_with_parameters(
                    ast, state)
                if not value:
                    raise Exception(
                        f"It should be possible to evaluate the parameter '{param}'. "
                        f"Issues: {', '.join(issues)}")
                else:
                    del unknown_params[param]
                    result_params[param] = value
                    state.set(param, value)

    if len(unknown_params) > 0:
        raise Exception(
            f"Could not evaluate the following parameters: {', '.join(unknown_params)}"
        )

    return result_params
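
get_circular_dependencies is not shown. A hedged sketch of the dependency-graph cycle check the docstring describes, assuming networkx is available and simplifying unknown_params to a parameter -> set-of-referenced-parameters map:

import networkx as nx

def find_parameter_cycles(unknown_params):
    g = nx.DiGraph()
    for param, deps in unknown_params.items():
        for dep in deps:
            g.add_edge(dep, param)   # "dep" must be evaluated before "param"
    return list(nx.simple_cycles(g))

print(find_parameter_cycles({"p2": {"p1"}, "p1": {"p2"}, "p3": {"p1"}}))
# e.g. [['p1', 'p2']]: p1 and p2 depend on each other, p3 is fine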
Example #21
 def __init__(self, name=None):
     self._name = name  # A name for the scope itself
     self._registry = create_dictionary()
Example #22
 def __init__(self):
     self._default_namespace = ""
     self._namespaces = create_dictionary()  # type:
Example #23
def prepare_model(state) -> Dict[str, Set[Processor]]:
    """
    Modify the state so that:
    * Implicit references of Interfaces to subcontexts are materialized
      * Creating processors
      * Creating interfaces in these processors
      * Creating relationships in these processors

    :param state:
    :return: A dictionary of systems each containing a set of the Processors inside it ("local" and "environment")
    """
    # Registry and the other objects also
    glb_idx, _, _, _, _ = get_case_study_registry_objects(state)
    # Prepare a Query to obtain ALL interfaces
    query = BasicQuery(state)
    filt = {}
    objs = query.execute([Factor], filt)
    processors_by_system = create_dictionary()
    for iface in objs[Factor]:  # type: Factor
        system = iface.processor.processor_system

        processors = processors_by_system.get(system, set())
        if system not in processors_by_system:
            processors_by_system[system] = processors

        if iface.processor not in processors:
            processors.add(iface.processor)

        # If the Interface is connected to a "Subcontext" different than the owning Processor
        if iface.opposite_processor_type and \
           iface.opposite_processor_type.lower() != iface.processor.subsystem_type.lower():

            # Check if the interface has flow relationships
            # TODO An alternative is to search "observations" of type FactorsRelationDirectedFlowObservation
            #      in the same "iface"

            if iface.orientation.lower() == "input":
                parameter = {"target": iface}
            else:
                parameter = {"source": iface}

            relations = glb_idx.get(
                FactorsRelationDirectedFlowObservation.partial_key(
                    **parameter))

            # If not, define Processor name, check if exists, if not create it
            # Then create an Interface and a Relationship
            if len(relations) == 0:
                # Define the name of a Processor in the same context but in different subcontext
                p_name = system + "_" + iface.opposite_processor_type
                p = glb_idx.get(Processor.partial_key(p_name))
                if len(p) == 0:
                    attributes = {
                        'subsystem_type': iface.opposite_processor_type,
                        'processor_system': iface.processor.processor_system,
                        'functional_or_structural': 'Functional',
                        'instance_or_archetype': 'Instance'
                        # 'stock': None
                    }

                    p = Processor(p_name, attributes=attributes)
                    glb_idx.put(p.key(), p)

                    if p.subsystem_type.lower() in ["local", "environment"]:
                        processors.add(p)
                else:
                    p = p[0]

                attributes = {
                    'sphere':
                    'Technosphere' if iface.opposite_processor_type.lower()
                    in ["local", "external"] else 'Biosphere',
                    'roegen_type':
                    iface.roegen_type,
                    'orientation':
                    "Input"
                    if iface.orientation.lower() == "output" else "Output",
                    'opposite_processor_type':
                    iface.processor.subsystem_type
                }

                # Create Interface
                f = Factor.create_and_append(
                    name=iface.taxon.name,
                    processor=p,
                    in_processor_type=FactorInProcessorType(external=False,
                                                            incoming=False),
                    attributes=attributes,
                    taxon=iface.taxon)

                glb_idx.put(f.key(), f)

                # Create Flow Relationship
                if iface.orientation.lower() == "output":
                    source = iface
                    target = f
                else:
                    source = f
                    target = iface

                fr = FactorsRelationDirectedFlowObservation.create_and_append(
                    source=source, target=target, observer=None)
                glb_idx.put(fr.key(), fr)

    return processors_by_system
Example #24
    def execute(self, state: "State"):
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", None)

            # Mapping name
            name = ((mh_src_dataset + ".") if mh_src_dataset else
                    "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if name in mappings:
                issues.append(
                    Issue(itype=3,
                          description="The mapping '" + name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if name in local_mappings:
                d = local_mappings[name]
            else:
                d = DottedDict()
                local_mappings[name] = d
                d.name = name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=3,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code + "' has been done already",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                to_dict[mh_dst_code] = mh_weight  # NOTE: This could be an object instead of just a FLOAT or expression
                d.mapping[mh_src_code] = to_dict

        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        local_mappings = create_dictionary()

        # Process parsed information
        for line in self._content["items"]:
            r = line["_row"]
            # If the line contains a reference to a dataset or hierarchy, expand it
            # If not, process it directly
            is_expansion = False
            if is_expansion:
                # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items
                pass
            else:
                process_line(line)

        # Mappings post-processing
        for d in local_mappings:
            # Convert the mapping into:
            # [{"o": "", "to": [{"d": "", "w": ""}]}]
            # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
            mapping = []
            for orig in local_mappings[d].mapping:
                lst = []
                for dst in local_mappings[d].mapping[orig]:
                    lst.append(
                        dict(d=dst, w=local_mappings[d].mapping[orig][dst]))
                mapping.append(dict(o=orig, to=lst))
            if local_mappings[d].origin_dataset:
                dims, attrs, meas = obtain_dataset_metadata(
                    local_mappings[d].origin_dataset)
                if local_mappings[d].origin_hierarchy not in dims:
                    issues.append(
                        Issue(itype=3,
                              description="The origin dimension '" +
                              local_mappings[d].origin_hierarchy +
                              "' does not exist in dataset '" +
                              local_mappings[d].origin_dataset + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    dim = dims[local_mappings[d].origin_hierarchy]
                    mapping = fill_map_with_all_origin_categories(dim, mapping)
            #
            origin_dataset = local_mappings[d].origin_dataset
            origin_hierarchy = local_mappings[d].origin_hierarchy
            destination_hierarchy = local_mappings[d].destination_hierarchy
            # Create Mapping and add it to Case Study mappings variable
            mappings[d] = Mapping(d, obtain_dataset_source(origin_dataset),
                                  origin_dataset, origin_hierarchy,
                                  destination_hierarchy, mapping)

        # TODO
        # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns"
        # Put it to work !!!

        # Could one or more mappings be specified in sequence? The key is "source hierarchy + destination hierarchy"
        # Read mapping parameters

        return issues, None
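
The post-processing above turns each nested {origin: {destination: weight}} mapping into the [{"o": ..., "to": [{"d": ..., "w": ...}]}] form. A small standalone illustration of that reshaping:

nested = {"c11": {"c21": 0.6, "c22": 0.4}, "c12": {"c23": 1.0}}
reshaped = [{"o": o, "to": [{"d": d, "w": w} for d, w in dsts.items()]}
            for o, dsts in nested.items()]
print(reshaped)
# [{'o': 'c11', 'to': [{'d': 'c21', 'w': 0.6}, {'d': 'c22', 'w': 0.4}]},
#  {'o': 'c12', 'to': [{'d': 'c23', 'w': 1.0}]}]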
Example #25
 def __init__(self, session_factory):
     self.registry = create_dictionary()
     self._session_factory = session_factory
Example #26
def parse_data_input_command(sh: Worksheet,
                             area: AreaTupleType,
                             processors_type: str,
                             state=None) -> IssuesLabelContentTripleType:
    """
    Scans the "area" of input worksheet "sh" where it is assumed a "data input" command
    is present.

    It obtains a list of observations, a list of processors, a list of observables, a list of tags
    All those are represented in JSON format

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param processors_type: Name for the type of processors. Also label of the command
    :param state: Transient state useful for checking existence of variables
    :return: DataInputCommand, list of issues (issue_type, message)
    """
    some_error = False
    issues = []
    # Define a set of observations (qualified quantities) of observables
    # This set can be replicated; how should each replica be referred to?
    # Regular expression, internal name, Mandatory (True|False)
    known_columns = [
        (r"Name|Processor[_ ]name", "processor", False),
        (r"Level", "level", False),
        (r"Parent", "parent", False),
        (r"FF[_ ]type", "ff_type", True),
        (r"Var|Variable", "factor", True),
        (r"Value|NUSAP\.N", "value",
         False),  # If value is not specified, then just declare the Factor
        (r"Unit|NUSAP\.U", "unit",
         True),  # If blank, a dimensionless amount is assumed
        (r"Relative[_ ]to", "relative_to", False),
        (r"Uncertainty|Spread|NUSAP\.S", "uncertainty", False),
        (r"Assessment|NUSAP\.A", "assessment", False),
        (r"Pedigree[_ ]matrix|NUSAP\.PM", "pedigree_matrix", False),
        (r"Pedigree|NUSAP\.P", "pedigree", False),
        (r"Time|Date", "time", False),
        (r"Geo|Geolocation", "geolocation", False),
        (r"Source", "source", False),
        (r"Comment|Comments", "comments", False)
    ]

    label = "Processors " + processors_type

    # First, examine columns, to know which fields are being specified
    # Special cases:
    #   Open columns: the field is specified in the cell together with the value, like "attr1=whatever", instead of a header "attr1" and, in a row below, a value "whatever"
    #   Complex values: the value has syntactic rules. Like expressions for both quantities AND qualities (like NUSAP)
    #   References: the field refers to additional information in another worksheet. Unique names or ref holder (worksheet name) plus ref inside the worksheet, would be allowed. Also ref type can disambiguate
    mandatory = {t[1]: t[2] for t in known_columns}
    cre = {}  # Column Regular Expression dictionary (K: regular expression; V: RegularExpression object)
    if not case_sensitive:
        flags = re.IGNORECASE
    else:
        flags = 0
    for kc in known_columns:
        cre[kc[0]] = re.compile(kc[0], flags=flags)
    col_names = {}
    standard_cols = {}  # Internal (standardized) column name to column index in the worksheet (freedom in the order of columns)
    attribute_cols = create_dictionary()  # Not recognized columns are considered freely named categories, attributes or tags
    attributes = []  # List of attributes or tags (keys of the previous dictionary)
    col_allows_dataset = create_dictionary()  # If the column allows the reference to a dataset dimension
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=area[0], column=c).value
        if not col_name:
            continue

        col_name = col_name.replace("\n", " ")
        col_names[c] = col_name

        # Match
        found = False
        for kc in known_columns:
            res = cre[kc[0]].search(col_name)
            if res:
                if kc[1] in standard_cols:
                    issues.append(
                        (2, "Cannot repeat column name '" + col_name + "' (" +
                         kc[0] + ") in data input command '" +
                         processors_type + "'"))
                else:
                    standard_cols[kc[1]] = c
                    col_names[c] = kc[1]  # Override column name with pseudo column name for standard columns
                    if col_names[c].lower() in [
                            "factor", "value", "time", "geolocation"
                    ]:
                        col_allows_dataset[col_names[c]] = True
                    else:
                        col_allows_dataset[col_names[c]] = False
                    found = True
                break
        if not found:
            if col_name not in attribute_cols:
                # TODO Check valid col_names. It must be a valid Variable Name
                attribute_cols[col_name] = c
                attributes.append(col_name)
                col_allows_dataset[col_name] = True
            else:
                issues.append(
                    (2, "Cannot repeat column name '" + col_name +
                     "' in data input command '" + processors_type + "'"))

    del cre

    # Check if there are mandatory columns missing

    # TODO There could be combinations of columns which change the character of mandatory of some columns
    # TODO For instance, if we are only specifying structure, Value would not be needed
    for kc in known_columns:
        # "kc[2]" is the flag indicating if the column is mandatory or not
        # col_map contains standard column names present in the worksheet
        if kc[2] and kc[1] not in standard_cols:
            some_error = True
            issues.append((3, "Column name '" + kc[0] +
                           "' must be specified in data input command '" +
                           processors_type + "'"))

    # If there are errors, do not continue
    if some_error:
        return issues, label, None

    processor_attribute_exclusions = create_dictionary()
    processor_attribute_exclusions["scale"] = None  # Exclude these attributes when characterizing the processor
    processor_attributes = [
        t for t in attributes if t not in processor_attribute_exclusions
    ]

    # SCAN rows
    lst_observations = []  # List of ALL observations. -- Main outcome of the parse operation --

    set_pedigree_matrices = create_dictionary()  # List of pedigree templates
    set_processors = create_dictionary()  # List of processor names
    set_factors = create_dictionary()  # List of factors
    set_taxa = create_dictionary()  # Dictionary of taxa with their lists of values. Useful to return CODE LISTS
    set_referenced_datasets = create_dictionary()  # Dictionary of datasets to be embedded into the result (it is a job of the execution part)
    processors_taxa = create_dictionary()  # Correspondence "processor" -> taxa (to avoid changes in this correspondence)

    dataset_column_rule = parser_field_parsers.dataset_with_column
    values = [None] * area[3]
    # LOOP OVER EACH ROW
    for r in range(area[0] + 1, area[1]):  # Scan rows (observations)
        # Each row can specify: the processor, the factor, the quantity and qualities about the factor in the processor
        #                       It can also specify a "flow+containment hierarchy" relation

        row = {}  # Store parsed values of the row

        taxa = create_dictionary()  # Store attributes or taxa of the row

        referenced_dataset = None  # Once defined in a row, it cannot change!!
        # Scan the row first, looking for the dataset. The specification is allowed in certain columns:
        # attribute_cols and some standard_cols
        already_processed = create_dictionary()
        for c in range(area[2], area[3]):
            if c in col_names:
                value = sh.cell(row=r, column=c).value
                if isinstance(value, str) and value.startswith("#"):
                    col_name = col_names[c]
                    if col_allows_dataset[col_name]:
                        if not referenced_dataset:
                            try:
                                ast = parser_field_parsers.string_to_ast(
                                    dataset_column_rule, value[1:])
                                if len(ast["parts"]) == 2:
                                    referenced_dataset = ast["parts"][0]
                                    # Remove the dataset variable. It will be stored in "_referenced_dataset"
                                    value = "#" + ast["parts"][1]
                                else:
                                    some_error = True
                                    issues.append((
                                        3,
                                        "The first dataset reference of the row must contain the "
                                        "dataset variable name and the dimension name, row "
                                        + str(r)))

                                # Mark as processed
                                already_processed[col_name] = None
                            except:
                                some_error = True
                                issues.append(
                                    (3, "Column '" + col_name +
                                     "' has an invalid dataset reference '" +
                                     value + "', in row " + str(r)))
                        else:
                            try:
                                ast = parser_field_parsers.string_to_ast(
                                    simple_ident, value[1:])
                                # Mark as processed
                                already_processed[col_name] = None
                            except:
                                some_error = True
                                issues.append(
                                    (3, "Column '" + col_name +
                                     "' has an invalid dataset reference '" +
                                     value + "', in row " + str(r)))
                        if col_name in standard_cols:
                            row[col_name] = value
                        else:
                            taxa[col_name] = value

                values[c] = value

        # TODO If the flow type is decomposed, compose it first
        for c in standard_cols:
            if c in already_processed:
                continue

            value = values[standard_cols[c]]

            # Empty cell (None or "")
            if value is None or value == "":
                if c == "unit":
                    value = "-"
                if not value:
                    if mandatory[c]:
                        some_error = True
                        issues.append(
                            (3,
                             "Column '" + c + "' is mandatory, row " + str(r)))
                    continue  # Skip the rest of the iteration!

            # Parse the value
            if c in ["processor", "factor"]:
                # Check that it is a variable name, and allow hierarchical names
                parser_field_parsers.string_to_ast(
                    parser_field_parsers.simple_h_name, value)
            elif c == "pedigree_matrix":
                parser_field_parsers.string_to_ast(
                    parser_field_parsers.simple_ident, value)
            elif c == "relative_to":
                # Two elements, the first a hierarchical name, the second a unit name
                s = value.split(" ")
                if len(s) != 2:
                    some_error = True
                    issues.append((
                        3,
                        "The Relative To value has to have two parts, factor name and unit, separated by a whitespace (specified '"
                        + value + "'), in row " + str(r)))
                else:
                    try:
                        parser_field_parsers.string_to_ast(
                            parser_field_parsers.simple_h_name, s[0])
                    except:
                        some_error = True
                        issues.append((
                            3,
                            "The name specified for the relative to factor '" +
                            s[0] + "' is not valid, in row " + str(r)))

                    # It must be a recognized unit. Check with Pint
                    try:
                        ureg(s[1])
                        ureg.parse_unit_name(s[1], case_sensitive)
                    except UndefinedUnitError:
                        some_error = True
                        issues.append((
                            3, "The unit name '" + s[1] +
                            "' is not registered in the units processing package, in row "
                            + str(r)))
            elif c == "level":
                # A valid level name
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.level_name, value)
                except:
                    some_error = True
                    issues.append((3, "The level '" + value +
                                   "' syntax is not valid, in row " + str(r)))

            elif c == "parent":
                # Check that value is a valid parent name. It can be either a list of tags OR
                # a processor name, something defining a single processor
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.simple_h_name, value)
                except:
                    try:
                        parser_field_parsers.string_to_ast(
                            parser_field_parsers.named_parameters_list, value)
                    except:
                        some_error = True
                        issues.append((3, "Could not parse '" + value +
                                       "' as 'parent' in row " + str(r)))
            elif c == "ff_type":
                # The type of flow/fund must be one of a set of possible values. DEFINE THE LIST
                if value.lower() not in allowed_ff_types:
                    some_error = True
                    issues.append(
                        (3, "ff_type must be one of: " +
                         ', '.join(allowed_ff_types) + ", in row " + str(r)))
            elif c == "value":
                if not isinstance(value, str):
                    value = str(value)
                # Expression allowed. Check syntax only. It can refer to parameters.
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.expression, value)
                # TODO Check existence of used variables
                # TODO basic_elements_parser.ast_evaluator(ast, state, None, issues, "static")
            elif c == "unit":
                # It must be a recognized unit. Check with Pint
                try:
                    value = value.replace("€", "Euro").replace("$", "Dollar")
                    if value == "-":
                        value = ""  # Dimensionless
                    ureg(value)
                    ureg.parse_unit_name(value, case_sensitive)
                except:
                    some_error = True
                    issues.append((
                        3, "The unit name '" + value +
                        "' is not registered in the units processing package, in row "
                        + str(r)))
            elif c == "uncertainty":
                # TODO It must be a valid uncertainty specifier
                pass
            elif c == "assessment":
                # See page 135 of Funtowicz S., Ravetz J., "Uncertainty and Quality in Science for Policy"
                # "c" is "cognitive" assessment, "p" is pragmatic assessment.
                allowed = [
                    "nil", "low", "medium", "high", "total", "nil_c", "low_c",
                    "medium_c", "high_c", "total_c", "nil_p", "low_p",
                    "medium_p", "high_p", "total_p"
                ]
                if value and value.lower() not in allowed:
                    issues.append((3, "Assessment must be empty or one of: " +
                                   ", ".join(allowed)))
            elif c == "pedigree":
                # A valid pedigree specification is just an integer
                try:
                    int(value)
                except:
                    issues.append((3, "The pedigree specification '" + value +
                                   "' must be an integer"))
            elif c == "time":
                # A valid time specification. Possibilities: Year, Month-Year / Year-Month, Time span (two dates)
                if not isinstance(value, str):
                    value = str(value)
                ast = parser_field_parsers.string_to_ast(
                    parser_field_parsers.time_expression, value)
            elif c == "geolocation":
                # A reference to a geolocation
                try:
                    parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, value)
                except:
                    some_error = True
                    issues.append((3, "The geolocation must be a reference"))
            elif c == "source":
                # Who or what provided the information. It can be formal or informal. Formal can be references (but evaluated later)
                pass
            elif c == "comments":
                # Free text
                pass

            # Store the parsed value
            row[c] = value

        for c in attribute_cols:
            if c in already_processed:
                continue

            value = values[attribute_cols[c]]

            # != "" or not
            if not value:
                taxa[c] = None
                continue  # Skip the rest of the iteration!

            # TODO Check value. Valid identifier, no whitespace
            # Validate "value", it has to be a simple ID
            try:
                if not isinstance(value, str):
                    value = str(value)
                parser_field_parsers.simple_ident.parseString(value,
                                                              parseAll=True)
            except:
                value = None
                some_error = True
                issues.append((
                    3, "The value in column '" + c +
                    "' has to be a simple identifier: start with letter, then letters, numbers and '_', no whitespace, in row "
                    + str(r)))

            taxa[c] = value

            # NOTE on registering taxa: if a Dataset reference is used, taxa cannot be registered
            # at parse time (the dataset has not been obtained yet); that is left for execution time
            if c not in set_taxa:
                set_taxa[c] = create_dictionary()
            if value is not None:
                set_taxa[c][value] = None

        # Now that individual columns have been parsed, do other things

        if referenced_dataset:
            row["_referenced_dataset"] = referenced_dataset

        # If "processor" not specified, concatenate taxa columns in order to generate an automatic name
        # (excluding the processor type)
        p_taxa = taxa.copy()
        for k in processor_attribute_exclusions:
            if k in p_taxa: del p_taxa[k]

        if "processor" not in row:
            row["processor"] = "_".join(
                [str(taxa[t]) for t in processor_attributes]
            )  # TODO Which order? (currently "order of appearance"; maybe "alphabetical order" would be a better option)
        # Add as "taxa" the processor type (which is an optional input parameter to this function)
        if processors_type:
            taxa["_processors_type"] = processors_type
        # Store taxa (attributes and taxa)
        row["taxa"] = taxa
        # Store taxa if the processor still does not have it
        if row["processor"] not in processors_taxa:
            processors_taxa[row["processor"]] = p_taxa  # "::".join([taxa[t] for t in lst_taxa_cols])
        else:
            # Taxa should be the same for each "processor". Error if different
            t = processors_taxa[row["processor"]]
            if t != p_taxa:
                issues.append(
                    (3, "The processor '" + row["processor"] +
                     "' has different taxa assigned, in row " + str(r)))

        # Register new processor names, pedigree templates, and variable names
        if "processor" in row:
            set_processors[row["processor"]] = None
        if "pedigree_matrix" in row:
            set_pedigree_matrices[row["pedigree_matrix"]] = None
        if "factor" in row:
            set_factors[row["factor"]] = None
        if referenced_dataset:
            set_referenced_datasets[referenced_dataset] = None

        lst_observations.append(row)

    content = {
        "factor_observations": lst_observations,
        "processor_attributes": processor_attributes,
        "processors": [k for k in set_processors],
        "pedigree_matrices": [k for k in set_pedigree_matrices],
        "factors": [k for k in set_factors],
        "referenced_datasets": [ds for ds in set_referenced_datasets],
        "code_lists": {k: [k2 for k2 in set_taxa[k]]
                       for k in set_taxa}
    }
    return issues, label, content
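The "unit" check above relies on Pint (the ureg object). A minimal, standalone sketch of that check, assuming only the pint package and defining "Euro"/"Dollar" locally because they are not in Pint's default registry (the project-level ureg is assumed to behave similarly):

import pint

ureg = pint.UnitRegistry()
ureg.define("Euro = [currency]")    # assumption: the project registry knows currencies
ureg.define("Dollar = 1.1 * Euro")  # illustrative conversion only

def is_valid_unit(text: str) -> bool:
    """Return True if 'text' parses as a unit; '-' is treated as dimensionless."""
    text = text.replace("€", "Euro").replace("$", "Dollar")
    if text == "-":
        return True  # Dimensionless
    try:
        ureg(text)  # parse_expression; raises UndefinedUnitError for unknown names
        return True
    except Exception:
        return False

for u in ["kg", "m**2/year", "€", "-", "not_a_unit"]:
    print(u, is_valid_unit(u))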
Example #27
0
def commands_generator_from_ooxml_file(
        input, state, sublist,
        stack) -> backend.ExecutableCommandIssuesPairType:
    """
    Reads an Office Open XML (.xlsx) input and yields a sequence of command_executors

    :param input: A bytes input
    :param state: State used to check variables
    :param sublist: List of worksheets to consider
    :param stack: Stack of nested files. Just pass it...
    :return:
    """
    # Start the Excel reader
    workbook = openpyxl.load_workbook(io.BytesIO(input), data_only=True)

    # Command names (for the "list of commands" command)
    command_names = create_dictionary(
        data={cmd_name: None
              for cmd_name in valid_v2_command_names})

    worksheet_to_command = create_dictionary()  # A dictionary to translate a worksheet to an equivalent command
    if sublist:
        # Force reading "ListOfCommands" commands
        for sheet_name in workbook.sheetnames:
            if first(commands,
                     condition=lambda c: c.name == "list_of_commands" and c.regex.search(sheet_name)):
                sublist.append(sheet_name)

    # For each worksheet, get the command type, convert into primitive JSON
    for sheet_number, sheet_name in enumerate(workbook.sheetnames):
        if sublist:
            if sheet_name not in sublist:
                continue

        issues = []
        total_issues: List[Issue] = []
        sheet = workbook[sheet_name]

        c_label: Optional[str] = None
        c_content = None

        name = sheet.title

        # Use an equivalent command name
        if name in worksheet_to_command:
            name = worksheet_to_command[name]

        # Extract worksheet matrices
        m = binary_mask_from_worksheet(sheet, False)
        t = obtain_rectangular_submatrices(m, only_remove_empty_bottom=True)
        if len(t) == 0:  # No data
            continue

        t = t[0]  # Take just the first element, a tuple (top, bottom, left, right) representing a rectangular region
        t = (t[0] + 1, t[1] + 1, t[2] + 1, t[3] + 1)  # Indices start at 1

        # v = worksheet_to_numpy_array(sheet)

        # Find which COMMAND to parse, then parse it
        cmd: Optional[backend.Command] = first(
            commands, condition=lambda c: c.regex.search(name))

        c_type: str = cmd.name if cmd else None
        if not c_type:
            total_issues.append(
                Issue(
                    sheet_number, sheet_name, None, 2,
                    f"The worksheet name '{sheet_name}' has not a supported command associated. Skipped."
                ))

        elif c_type == "etl_dataset":
            if sheet.cell(row=t[0], column=t[2]).value:
                t = (1, m.shape[0] + 1, 1, m.shape[1] + 1)
                # Parse to read parameters
                dataset_name = cmd.regex.search(name).group(2)
                issues, c_label, c_content = cmd.parse_function(
                    sheet, t, dataset_name, state)
            else:
                total_issues.append(
                    Issue(
                        sheet_number, sheet_name, c_type, 3,
                        f"It seems there are no parameters for the dataset import command at worksheet '{sheet_name}'"
                    ))

        elif c_type == "list_of_commands":
            issues, c_label, c_content = parse_command(sheet, t, None,
                                                       cmd.name)
            c_type = None
            if 3 not in [issue.itype for issue in issues]:
                for r in c_content["items"]:
                    worksheet = r.get("worksheet", None)
                    command = r.get("command", None)
                    # Check if valid command
                    if command not in command_names:
                        total_issues.append(
                            Issue(
                                sheet_number, sheet_name, None, 3,
                                "Command '" + command +
                                "' not recognized in List of Commands."))
                    else:
                        worksheet_to_command[worksheet] = command

        elif c_type == "import_commands":
            issues, c_label, c_content = parse_command(sheet, t, None,
                                                       cmd.name)
            if 3 not in [issue.itype for issue in issues]:
                # Declared at this point to avoid circular reference ("parsers_factory" imports "parsers_spreadsheet")
                from backend.command_generators.parsers_factory import commands_container_parser_factory
                # For each line, repeat the import
                for r in c_content["items"]:
                    generator_type, file2, sublist2 = handle_import_commands(r)
                    yield from commands_container_parser_factory(
                        generator_type,
                        None,
                        file2,
                        state,
                        sublist=sublist2,
                        stack=stack)
                    print("Done")

        elif c_type == "mapping":
            groups = cmd.regex.search(name).groups()
            origin = None
            destination = None
            if groups[2] and groups[8]:
                origin = groups[2]
                destination = groups[8]
            elif groups[2] or groups[8]:
                # Only one of the two was given: report it (origin and destination stay None)
                total_issues.append(
                    Issue(
                        sheet_number, sheet_name, c_type, 3,
                        f"Either the origin or the destination is not correctly specified in the sheet name '{sheet_name}'"
                    ))

            issues, c_label, c_content = cmd.parse_function(
                sheet, t, origin, destination)

        elif c_type in ["datasetqry", "datasetdata"]:
            issues, c_label, c_content = cmd.parse_function(
                sheet, t, sheet_name, state)

        elif c_type == "hierarchy":
            res = cmd.regex.search(name)
            h_type = res.group(2)
            c_label = res.group(3)
            issues, _, c_content = cmd.parse_function(sheet, t, c_label,
                                                      h_type)

        elif c_type == "data_input":
            group2_name = cmd.regex.search(name).group(2)
            issues, c_label, c_content = cmd.parse_function(
                sheet, t, group2_name)

        else:
            # GENERIC command parser
            if cmd.parse_function:
                issues, c_label, c_content = cmd.parse_function(
                    sheet, t, sheet_name)
            else:
                issues, c_label, c_content = parse_command(
                    sheet, t, sheet_name, cmd.name)

        # -------------------------------------------------------------------------------------------------------------
        # Command parsed, now append "issues"
        errors = 0
        if len(issues) > 0:
            for i in issues:
                if isinstance(i, backend.command_generators.Issue):
                    if i.itype == 3:
                        errors += 1
                    issue = Issue(sheet_number, sheet_name, c_type, i.itype,
                                  i.description)
                else:
                    if i[0] == 3:
                        errors += 1
                    issue = Issue(sheet_number, sheet_name, c_type, i[0], i[1])
                total_issues.append(issue)

        if errors == 0:
            try:
                if c_type:
                    cmd, issues = create_command(c_type, c_label, c_content,
                                                 sheet_name)
                else:
                    cmd = None
                    issues = []
            except:
                cmd = None
                issues = [
                    (3, "Could not create command of type '" + c_type + "'")
                ]
            if issues:
                for i in issues:
                    if isinstance(i, backend.command_generators.Issue):
                        issue = Issue(sheet_number, sheet_name, c_type,
                                      i.itype, i.description)
                    else:
                        issue = Issue(sheet_number, sheet_name, c_type, i[0],
                                      i[1])

                    total_issues.append(issue)

        else:
            print(issues)  # Convenient for debugging purposes
            cmd = None  # cmd, _ = create_command(c_type, c_label, {}, sh_name)

        yield cmd, total_issues
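A minimal driver sketch for the generator above; the State object and the file name are assumptions (the real case-study file and execution context are project-specific):

# Hypothetical driver: read an .xlsx as bytes and iterate the yielded (command, issues) pairs
with open("case_study.xlsx", "rb") as f:   # hypothetical file name
    xlsx_bytes = f.read()

state = State()  # assumed: the project's State container used to resolve variables
for cmd, sheet_issues in commands_generator_from_ooxml_file(xlsx_bytes, state, sublist=None, stack=[]):
    for issue in sheet_issues:
        print(issue)
    if cmd is not None:
        pass  # hand the executable command over to the execution stage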
Example #28
0
    def __init__(self):
        self.registry = create_dictionary()
Example #29
0
        def process_row(row):
            """
            Process a dictionary representing a row of the data input command. The dictionary can come directly from
            the worksheet or from a dataset.

            Implicitly uses "glb_idx"

            :param row: dictionary
            """
            # From "ff_type" extract: flow/fund, external/internal, incoming/outgoing
            # ecosystem/society?
            ft = row["ff_type"].lower()
            if ft == "int_in_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = True
                incoming = True
            elif ft == "int_in_fund":
                roegen_type = FlowFundRoegenType.fund
                internal = True
                incoming = True
            elif ft == "ext_in_fund":
                roegen_type = FlowFundRoegenType.fund
                internal = False
                incoming = True
            elif ft == "int_out_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = True
                incoming = False
            elif ft == "ext_in_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = False
                incoming = True
            elif ft == "ext_out_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = False
                incoming = False
            elif ft == "env_out_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = False
                incoming = False
            elif ft == "env_in_flow":
                roegen_type = FlowFundRoegenType.flow
                internal = False
                incoming = True
            elif ft == "env_in_fund":
                roegen_type = FlowFundRoegenType.fund
                internal = False
                incoming = True

            # Split "taxa" attributes. "scale" corresponds to the observation
            p_attributes = row["taxa"].copy()
            if "scale" in p_attributes:
                other_attrs = create_dictionary()
                other_attrs["scale"] = p_attributes["scale"]
                del p_attributes["scale"]
            else:
                other_attrs = None

            # Check existence of PedigreeMatrix, if used
            if "pedigree_matrix" in row:
                pm = glb_idx.get(
                    PedigreeMatrix.partial_key(name=row["pedigree_matrix"]))
                if len(pm) != 1:
                    issues.append((3, "Could not find Pedigree Matrix '" +
                                   row["pedigree_matrix"] + "'"))
                    del row["pedigree_matrix"]
                else:
                    try:
                        lst = pm[0].get_modes_for_code(row["pedigree"])
                    except:
                        issues.append(
                            (3, "Could not decode Pedigree '" +
                             row["pedigree"] + "' for Pedigree Matrix '" +
                             row["pedigree_matrix"] + "'"))
                        del row["pedigree"]
                        del row["pedigree_matrix"]
            else:
                if "pedigree" in row:
                    issues.append((
                        3,
                        "Pedigree specified without accompanying Pedigree Matrix"
                    ))
                    del row["pedigree"]

            # Source: who or what provided the information; resolve a provenance Reference if possible
            if "source" in row:
                source = row["source"]  # Fallback: keep the raw text if it cannot be resolved to a Reference
                try:
                    ast = parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, row["source"])
                    ref_id = ast["ref_id"]
                    references = glb_idx.get(Reference.partial_key(ref_id),
                                             ref_type="provenance")
                    if len(references) == 1:
                        source = references[0]
                except:
                    pass
            else:
                source = None

            # Geolocation: resolve a geographic Reference if possible
            if "geolocation" in row:
                geolocation = row["geolocation"]  # Fallback: keep the raw text if it cannot be resolved to a Reference
                try:
                    ast = parser_field_parsers.string_to_ast(
                        parser_field_parsers.reference, row["geolocation"])
                    ref_id = ast["ref_id"]
                    references = glb_idx.get(Reference.partial_key(ref_id),
                                             ref_type="geographic")
                    if len(references) == 1:
                        geolocation = references[0]
                except:
                    pass
            else:
                geolocation = None

            # CREATE FactorType, A Type of Observable, IF it does not exist
            # AND ADD Quantitative Observation
            p, ft, f, o = create_or_append_quantitative_observation(
                glb_idx,
                factor=row["processor"] + ":" + row["factor"],
                value=row["value"] if "value" in row else None,
                unit=row["unit"],
                observer=source,
                spread=row["uncertainty"] if "uncertainty" in row else None,
                assessment=row["assessment"] if "assessment" in row else None,
                pedigree=row["pedigree"] if "pedigree" in row else None,
                pedigree_template=row["pedigree_matrix"]
                if "pedigree_matrix" in row else None,
                relative_to=row["relative_to"]
                if "relative_to" in row else None,
                time=row["time"] if "time" in row else None,
                geolocation=None,
                comments=row["comments"] if "comments" in row else None,
                tags=None,
                other_attributes=other_attrs,
                proc_aliases=None,
                proc_external=False,  # TODO
                proc_attributes=p_attributes,
                proc_location=None,
                ftype_roegen_type=roegen_type,
                ftype_attributes=None,
                fact_external=not internal,
                fact_incoming=incoming,
                fact_location=geolocation)
            # Append codes to the pset only if the processor was not already a member of the pset
            if p_set.append(p, glb_idx):
                p_set.append_attributes_codes(row["taxa"])
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name,
                              state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    # Obtain the source
    source = obtain_dataset_source(dataset_name)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={
                k: None
                for k in dims[d].code_list.keys()
            })  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(
                data={
                    to["d"]: None
                    for o in mappings[m].map for to in o["to"] if to["d"]
                })
            cl[mappings[m].destination] = tmp  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # A Pivot Table is a visualization, so it is not part of this command; there will be a separate command for it.

    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian

    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions",
                                        "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The dimension specified for output, '"
                            + d +
                            "' is neither a dataset dimension nor a mapped dimension. ["
                            + ', '.join([d2 for d2 in cl]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures",
                                          "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The specified measure, '" + m +
                            "' is not a measure available in the dataset. [" +
                            ', '.join([m2 for m2 in measures]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in [
                "resultmeasuresaggregation", "resultmeasuresaggregator",
                "aggregation"
        ]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue

                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The specified aggregation function, '"
                            + f +
                            "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in [
                "resultmeasurename", "resultmeasuresnames", "resultmeasuresas",
                "measuresas"
        ]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(
                            itype=3,
                            description="The code '" + cd +
                            "' is not present in the codes declared for dimension '"
                            + col_name + "'. Please, check them.",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "endperiod"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string!
        elif col_name.lower() in [
                "outputdatasetname", "outputdataset", "result_name",
                "result name", "resultname"
        ]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append(
                        Issue(itype=3,
                              description="Column '" + col_name +
                              "' has an invalid dataset name '" + result_name +
                              "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=2,
                                                     column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=2,
                  description=
                  "No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(
                    itype=3,
                    description=
                    "The aggregation function must be defined in the first row",
                    location=IssueLocation(sheet_name=name,
                                           row=1,
                                           column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if (measure and not agg_func) or (not measure and agg_func):
            issues.append(
                Issue(
                    itype=3,
                    description=
                    "Each measure must be associated with an aggregation function",
                    location=IssueLocation(sheet_name=name,
                                           row=r,
                                           column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [
        v["measure_as"] for v in out_measures.values() if v["measure_as"]
    ]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=3,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=measure_names_column)))

    # len(measures) != len(agg_funcs) is acceptable only when there is a single aggregation function (applied to all measures)
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(
                itype=3,
                description=
                "There must be one aggregation function (used for all measures) or one aggregation per measure",
                location=IssueLocation(sheet_name=name,
                                       row=1,
                                       column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            Issue(itype=2,
                  description="No result name specified. Assuming '" +
                  result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2,
                                         column=c + 1)))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": available_at_datetime,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
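For reference, the shape of the "content" dictionary returned by parse_dataset_qry_command; the dataset, dimension and code names below are made up for illustration only:

example_content = {
    "dataset_source": "Eurostat",            # hypothetical source
    "dataset_name": "nrg_bal_c",             # hypothetical dataset
    "dataset_datetime": None,
    "where": {"geo": ["ES", "PT"], "StartPeriod": 2010, "EndPeriod": 2015},
    "dimensions": ["geo", "siec", "time_period"],
    "group_by": ["geo"],                     # from the "ResultDimensions" column
    "measures": ["obs_value"],               # from the "ResultMeasures" column
    "agg_funcs": ["sum"],                    # from the "ResultMeasuresAggregation" column
    "measures_as": ["obs_value_sum"],        # defaults to <measure>_<agg_func>
    "result_name": "Eurostat_nrg_bal_c"      # defaults to <source>_<dataset_name>
}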