def parse_metadata_command(sh: Worksheet, area: AreaTupleType, name: str = None) -> IssuesLabelContentTripleType:
    """
    Most "parse" methods are mostly syntactic (as opposed to semantic): they do not check the existence of names.
    In this case, however, the valid field names are fixed beforehand, so they are checked at parse time.
    Some of the fields are also controlled against a closed vocabulary (see the "cl" dictionary below).

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where
                 the command is present
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []
    controlled = create_dictionary()
    mandatory = create_dictionary()
    keys = create_dictionary()
    for t in metadata_fields:
        controlled[t[4]] = t[3]
        mandatory[t[4]] = t[2]
        keys[t[0]] = t[4]

    # Scan the sheet: the first column must contain one of the labels in "keys"; the
    # following columns can contain repeating values
    # Map key to a list of values
    content = {}  # Dictionary of lists, one per metadata key
    for r in range(area[0], area[1]):
        label = sh.cell(row=r, column=area[2]).value
        if label in keys:
            key = keys[label]
            for c in range(area[2] + 1, area[3]):
                value = sh.cell(row=r, column=c).value
                if value:
                    value = str(value).strip()
                    if controlled[key]:
                        # Control "value" if the field is controllable
                        cl = {"dimensions": ["water", "energy", "food", "land", "climate"],
                              "subject_topic_keywords": None,
                              "geographical_level": ["local", "regional", "region", "country", "europe",
                                                     "sectoral", "sector"],
                              "geographical_situation": None,  # TODO Read the list of all geographical regions (A long list!!)
                              "restriction_level": ["internal", "confidential", "public"],
                              "language": None  # TODO Read the list of ALL languages (or just "English"??)
                              }
                        if cl[key] and value.lower() not in cl[key]:
                            issues.append((3, "The key '" + key + "' should be one of: " + ",".join(cl[key])))
                    if key not in content:
                        content[key] = []
                    content[key].append(value)
        else:
            issues.append((2, "Row " + str(r) + ": unknown metadata label '" + label + "'"))

    for key in keys.values():
        if mandatory[key] and key not in content:
            some_error = True
            issues.append((3, "The value '" + key + "' is mandatory in the definition of the metadata"))

    return issues, None, content
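# Illustrative sketch (an assumption, not taken from the source): judging from how parse_metadata_command
# and generate_dublin_core_xml index "metadata_fields" (t[0] = worksheet label, t[1] = Dublin Core element,
# t[2] = mandatory flag, t[3] = controlled-vocabulary flag, t[4] = internal key), each entry is a tuple
# shaped roughly like the one below. The real list is defined elsewhere in the repository.
_example_metadata_field = (
    "Case study name",   # t[0]: label expected in the first column of the worksheet
    "title",             # t[1]: Dublin Core element used when exporting (see generate_dublin_core_xml)
    True,                # t[2]: mandatory?
    False,               # t[3]: controlled vocabulary?
    "case_study_name",   # t[4]: internal key used in the parsed "content" dictionary
)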
def process_line(item):
    # Read variables
    mh_src_dataset = item.get("source_dataset", None)
    mh_src_hierarchy = item.get("source_hierarchy", None)
    mh_src_code = item.get("source_code", None)
    mh_dst_hierarchy = item.get("destination_hierarchy", None)
    mh_dst_code = item.get("destination_code", None)
    mh_weight = item.get("weight", None)

    # Mapping name: "[<source dataset>.]<source hierarchy> -> <destination hierarchy>"
    name = ((mh_src_dataset + ".") if mh_src_dataset else "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

    if name in mappings:
        issues.append(Issue(itype=3,
                            description="The mapping '" + name + "' has been declared previously. Skipped.",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return

    if name in local_mappings:
        d = local_mappings[name]
    else:
        d = DottedDict()
        local_mappings[name] = d
        d.name = name
        d.origin_dataset = mh_src_dataset
        d.origin_hierarchy = mh_src_hierarchy
        d.destination_hierarchy = mh_dst_hierarchy
        d.mapping = create_dictionary()

    # Specific code
    if mh_src_code in d.mapping:
        to_dict = d.mapping[mh_src_code]
    else:
        to_dict = create_dictionary()
    if mh_dst_code in to_dict:
        issues.append(Issue(itype=3,
                            description="The mapping of '" + mh_src_code + "' into '" + mh_dst_code + "' has been done already",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return
    else:
        to_dict[mh_dst_code] = mh_weight  # NOTE: This could be an object instead of just a FLOAT or expression
        d.mapping[mh_src_code] = to_dict
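# Minimal sketch (not from the source) of the nested structure that process_line builds up in d.mapping:
# a dictionary of {source_code: {destination_code: weight}}. The literal below is plain Python data,
# just to illustrate the shape; weights may also be expressions rather than floats, as the NOTE above says.
_example_mapping_shape = {
    "c11": {"c21": 0.6, "c22": 0.4},  # one source code split into two destination codes
    "c12": {"c23": 1.0},              # plain one-to-one correspondence
}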
def execute(self, state: "State"): any_error = False issues = [] sheet_name = self._content["command_name"] # Obtain global variables in state glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) scenarios = create_dictionary() solver_parameters = create_dictionary() for r, param in enumerate(self._content["items"]): parameter = param["parameter"] scenario = param["scenario_name"] p = glb_idx.get(Parameter.partial_key(parameter)) if scenario: if len(p) == 0: issues.append( Issue(itype=3, description="The parameter '" + parameter + "' has not been declared previously.", location=IssueLocation(sheet_name=sheet_name, row=r, column=None))) any_error = True continue p = p[0] name = p.name else: name = parameter value = param["parameter_value"] description = param.get( "description", None) # For readability of the workbook. Not used for solving if scenario: if scenario in scenarios: sp = scenarios[scenario] else: sp = create_dictionary() scenarios[scenario] = sp sp[name] = value else: solver_parameters[name] = value if not any_error: ps = ProblemStatement(solver_parameters, scenarios) glb_idx.put(ps.key(), ps) return issues, None
def obtain_problem_statement() -> ProblemStatement:
    """
    Obtain a ProblemStatement instance

    Obtain the solver parameters plus a list of scenarios
    :return:
    """
    ps_list: List[ProblemStatement] = glb_idx.get(ProblemStatement.partial_key())
    if len(ps_list) == 0:
        # No scenarios (dummy), and use the default solver
        scenarios = create_dictionary()
        scenarios["default"] = create_dictionary()
        return ProblemStatement(scenarios=scenarios)
    else:
        return ps_list[0]
def test_003_many_to_many_2(self):
    # Prepare a many-to-many map, from category set to category set
    m = create_dictionary()
    m["cat_o_1"] = ("cat_d_1",
                    {"c11": [{"d": "c21", "w": 0.6}, {"d": "c22", "w": 0.4}],
                     "c12": [{"d": "c23", "w": 1.0}],
                     "c13": [{"d": "c23", "w": 1.0}]
                     })
    m["cat_o_2"] = ("cat_d_2",
                    {"c31": [{"d": "c41", "w": 0.3}, {"d": "c42", "w": 0.7}],
                     "c32": [{"d": "c43", "w": 1.0}],
                     "c33": [{"d": "c43", "w": 1.0}]
                     })
    # Prepare a simple DataFrame
    df = pd.DataFrame(data=[["c11", "c31", 4], ["c12", "c32", 3], ["c13", "c31", 1.5]],
                      columns=["cat_o_1", "cat_o_2", "value"])
    # >>>>> Call Cython ACCELERATED Function <<<<<
    df2 = augment_dataframe_with_mapped_columns2(df, m, ["value"])
    # Check result
    self.assertEqual(list(df2.columns), ["cat_o_1", "cat_o_2", "cat_d_1", "cat_d_2", "value"])
    self.assertEqual(df2.shape, (7, 5))
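    # Why (7, 5)? Counting combinations, on the assumption that the mapping function emits one row per
    # combination of destination codes of the two mapped columns (weights scale "value"):
    #   ("c11", "c31", 4):   c11 -> {c21, c22}, c31 -> {c41, c42}  => 2 * 2 = 4 rows
    #   ("c12", "c32", 3):   c12 -> {c23},      c32 -> {c43}       => 1 * 1 = 1 row
    #   ("c13", "c31", 1.5): c13 -> {c23},      c31 -> {c41, c42}  => 1 * 2 = 2 rows
    # Total: 4 + 1 + 2 = 7 rows; columns: 2 original categories + 2 mapped categories + "value" = 5.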
def obtain_dictionary_with_literal_fields(item, asts):
    d = create_dictionary()
    for f in item:
        if not f.startswith("_"):
            ast = asts[f]
            if "complex" not in ast or ("complex" in ast and not ast["complex"]):
                d[f] = item[f]
    return d
def construct(name: str, description: str, levels: List[str], codes: List[CodeImmutable]):
    """
    :param name: Name of the Code List
    :param description: Description of the Code List
    :param levels: Names of the levels
    :param codes: List of codes, each one a tuple:
                  CodeImmutable = namedtuple("CodeTuple", "code description level children")
    :return:
    """
    cl = CodeList()
    cl.code = name
    cl.description = description
    # Levels
    levels_dict = create_dictionary()
    for l in levels:
        cll = CodeListLevel()
        cll.code_list = cl  # Point to the containing CodeList
        cll.code = l
        cll.description = None
        levels_dict[l] = cll
    # Codes
    codes_dict = create_dictionary()
    for ct in codes:
        c = Code()
        c.code = ct.code
        c.description = ct.description
        if ct.level in levels_dict:
            c.level = levels_dict[ct.level]  # Point to the containing CodeListLevel
        else:
            c.level = None
        codes_dict[ct.code] = c
        c.children = []
        c.parents = []
    # Set children & parents
    for ct in codes:
        c = codes_dict[ct.code]  # The Code whose children are being linked
        for ch in ct.children:
            if ch in codes_dict:
                c.children.append(codes_dict[ch])
                codes_dict[ch].parents.append(c)
    return cl
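# Hedged usage sketch for CodeList.construct (CodeImmutable is the namedtuple documented in the docstring;
# CodeList/Code/CodeListLevel are repository classes, so this only shows the call shape, mirroring the call
# made in get_dataset_structure below):
#   codes = [CodeImmutable("EU", "European Union", "Level1", ["ES", "IT"]),
#            CodeImmutable("ES", "Spain", "Level2", []),
#            CodeImmutable("IT", "Italy", "Level2", [])]
#   cl = CodeList.construct("geo", "Example geographic code list", ["Level1", "Level2"], codes)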
def get_dataset_structure(self, database, dataset) -> Dataset: """ Obtain the structure of a dataset: concepts, dimensions, attributes and measures """ refs = dict(references='all') dsd_response = estat.datastructure("DSD_" + dataset, params=refs) dsd = dsd_response.datastructure["DSD_" + dataset] metadata = dsd_response.write() # SDMXConcept = collections.namedtuple('Concept', 'type name istime description code_list') # DataSource <- Database <- DATASET <- Dimension(s) (including Measures) <- CodeList # | # v # Concept <- CodeList (NOT CONSIDERED NOW) ds = Dataset() ds.code = dataset ds.description = None # How to get description? ds.attributes = {} # Dataset level attributes? (encode them using a dictionary) ds.metadata = None # Metadata for the dataset SDMX (flow, date of production, etc.) ds.database = database # Reference to containing database dims = {} for d in dsd.dimensions: istime = str(dsd.dimensions.get(d)).split("|")[0].strip() == "TimeDimension" dd = Dimension() dd.code = d dd.description = None dd.attributes = None dd.is_time = istime dd.is_measure = False dd.dataset = ds dims[d] = dd for m in dsd.measures: dd = Dimension() dd.code = m dd.description = None dd.attributes = None dd.is_time = False dd.is_measure = True dd.dataset = ds dims[m] = dd for a in dsd.attributes: ds.attributes[a] = None # TODO Get the value for l in metadata.codelist.index.levels[0]: first = True # Read code lists cl = create_dictionary() for m, v in list(zip(metadata.codelist.loc[l].index, metadata.codelist.loc[l]["name"])): if not first: cl[m] = v else: first = False # Attach it to the Dimension or Measure if metadata.codelist.loc[l]["dim_or_attr"][0] == "D": # Build Code List from dictionary dims[l].code_list = CodeList.construct(l, None, [""], [CodeImmutable(k, cl[k], "", []) for k in cl]) return ds
def initialize_datasets_registry(self, datasets_list: List[Dataset]):
    """
    Receive a list of the datasets and make a copy

    :param datasets_list:
    :return:
    """
    self._registry = create_dictionary()
    for ds in datasets_list:
        self.register_dataset(ds.code, ds)
def list_all_names(self):
    """
    Returns a list of the names of registered entities, considering the scopes.
    Start from the top level, end in the bottom level (the current one, which takes precedence)

    :return:
    """
    t = create_dictionary()
    for scope in self.__scope:
        t.update(scope._registry)
    return t.keys()
def serialize_state(state: State): """ Serialization prepared for a given organization of the state :return: """ def serialize_dataframe(df): return df.to_json(orient="split") # list(df.index.names), df.to_dict() print(" serialize_state IN") import copy # "_datasets" ns_ds = {} # Save and nullify before deep copy for ns in state.list_namespaces(): _, _, _, datasets, _ = get_case_study_registry_objects(state, ns) ns_ds[ns] = datasets state.set("_datasets", create_dictionary(), ns) # Nullify datasets # !!! WARNING: It destroys "state", so a DEEP COPY is performed !!! tmp = sys.getrecursionlimit() sys.setrecursionlimit(10000) state2 = copy.deepcopy(state) sys.setrecursionlimit(tmp) # Iterate all namespaces for ns in state2.list_namespaces(): glb_idx, p_sets, hh, _, mappings = get_case_study_registry_objects( state2, ns) if glb_idx: tmp = glb_idx.to_pickable() state2.set("_glb_idx", tmp, ns) datasets = ns_ds[ns] # TODO Serialize other DataFrames. # Process Datasets for ds_name in datasets: ds = datasets[ds_name] if isinstance(ds.data, pd.DataFrame): tmp = serialize_dataframe(ds.data) else: tmp = None # ds.data = None # DB serialize the datasets lst2 = serialize(ds.get_objects_list()) lst2.append(tmp) # Append the serialized DataFrame datasets[ds_name] = lst2 state2.set("_datasets", datasets, ns) tmp = serialize_from_object( state2) # <<<<<<<< SLOWEST !!!! (when debugging) print(" serialize_state length: " + str(len(tmp)) + " OUT") return tmp
def get_case_study_registry_objects(state, namespace=None): """ Obtain the main entries of the state :param state: Input state (modified also) :param namespace: State supports several namespaces. This one serves to specify which one. Default=None :return: Tuple: (global index, processor sets, hierarchies, datasets, mappings) """ # Index of ALL objects glb_idx = state.get("_glb_idx", namespace) if not glb_idx: glb_idx = PartialRetrievalDictionary() state.set("_glb_idx", glb_idx, namespace) # ProcessorSet dict (dict of sets) p_sets = state.get("_processor_sets", namespace) if not p_sets: p_sets = create_dictionary() state.set("_processor_sets", p_sets, namespace) # Hierarchies Dict hh = state.get("_hierarchies", namespace) if not hh: hh = create_dictionary() state.set("_hierarchies", hh, namespace) # Datasets Dict datasets = state.get("_datasets", namespace) if not datasets: datasets = create_dictionary() state.set("_datasets", datasets, namespace) # Mappings Dict mappings = state.get("_mappings", namespace) if not mappings: mappings = create_dictionary() state.set("_mappings", mappings, namespace) return glb_idx, p_sets, hh, datasets, mappings
def execute(self, state: "State"):
    """
    Create a set of linear scale conversions, from factor type to factor type
    """
    some_error = False
    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    origin_factor_types = self._content["origin_factor_types"]
    destination_factor_types = self._content["destination_factor_types"]
    scales = self._content["scales"]

    # Check that we have valid factor type names
    fts = create_dictionary()
    for ft_name in origin_factor_types + destination_factor_types:
        # Obtain (maybe Create) the mentioned Factor Types
        p, ft, f = find_or_create_observable(glb_idx, ft_name, Observer.no_observer_specified, None,
                                             proc_external=None, proc_attributes=None, proc_location=None,
                                             fact_roegen_type=None, fact_attributes=None,
                                             fact_incoming=None, fact_external=None, fact_location=None)
        if not ft:
            some_error = True
            issues.append((3, "Could not obtain/create the Factor Type '" + ft_name + "'"))
        fts[ft_name] = ft

    if some_error:
        return issues, None

    for sc in scales:
        origin = fts[sc["origin"]]
        destination = fts[sc["destination"]]
        scale = sc["scale"]
        FactorTypesRelationUnidirectionalLinearTransformObservation.create_and_append(
            origin, destination, scale, Observer.no_observer_specified)

    return issues, None
def get_processor_names_to_processors_dictionary(state: PartialRetrievalDictionary):
    """
    Obtain a dictionary with all processor names (a processor may have multiple names) and
    the corresponding Processor object

    :param state:
    :return:
    """
    ps = state.get(Processor.partial_key())
    ps = set(ps)  # Avoid repeating Processor objects
    d = create_dictionary()
    for p in ps:
        for n in p.full_hierarchy_names(state):
            d[n] = p
    return d
def test_001_many_to_one_1(self):
    # Prepare a many-to-one map, from category set to category set
    m = create_dictionary()
    m["cat_o_1"] = ("cat_d_1",
                    {"c11": [{"d": "c21", "w": 1.0}],
                     "c12": [{"d": "c23", "w": 1.0}],
                     "c13": [{"d": "c23", "w": 1.0}]
                     })
    # Prepare a simple DataFrame
    df = pd.DataFrame(data=[["c11", 4], ["c12", 3], ["c13", 1.5]], columns=["cat_o_1", "value"])
    # Call
    df2 = augment_dataframe_with_mapped_columns(df, m, ["value"])
    # Check result
    self.assertEqual(list(df2.columns), ["cat_o_1", "cat_d_1", "value"])
    self.assertEqual(df2.shape, (3, 3))
def get_observations_OLD(prd: PartialRetrievalDictionary) \ -> Tuple[PartialRetrievalDictionary, PartialRetrievalDictionary, Dict[str, int]]: """ Process All QQ observations (intensive or extensive): * Store in a compact way (then clear), by Time-period, by Interface, by Observer. * Convert to float or prepare AST * Store as value the result plus the QQ observation (in a tuple) :param prd: :param relative: True->a QQ observation relative to the value of another interface :return: another PartialRetrievalDictionary, the Observers and the Time Periods (indexed) """ observations_prd = PartialRetrievalDictionary() relative_observations_prd = PartialRetrievalDictionary() time_periods: Dict[str, int] = create_dictionary( ) # Dictionary of time periods and the associated IDX state = State() next_time_period_idx = 0 for observation in find_quantitative_observations( prd, processor_instances_only=True): # Obtain time period index time = observation.attributes["time"] if time not in time_periods: time_periods[time] = next_time_period_idx next_time_period_idx += 1 # Elaborate Key: Interface, Time, Observer key = dict(__i=observation.factor, __t=time_periods[time], __o=observation.observer) value, ast, _, issues = evaluate_numeric_expression_with_parameters( observation.value, state) if not value: value = ast # Store Key: (Value, FactorQuantitativeObservation) if observation.is_relative: relative_observations_prd.put(key, (value, observation)) else: observations_prd.put(key, (value, observation)) return observations_prd, relative_observations_prd, time_periods
def convert_code_list_to_hierarchy(cl, as_list=False):
    """
    Receives a list of codes. Codes are sorted lexicographically (to include numbers).
    Two types of coding schemes are supported by assuming that trailing zeros can be ignored to match
    parent -> child relations. The first is uniformly sized codes (those with trailing zeros). The second is
    growing length codes: those with length less than others but a common prefix are parents

    :param cl:
    :param as_list: if True, return a flat tree (all nodes are siblings, descending from a single root)
    :return:
    """
    def can_be_child(parent_candidate, child_candidate):
        # Strip zeros to the right, from parent_candidate, and
        # check if the child starts with the resulting substring
        return child_candidate.startswith(parent_candidate.rstrip("0"))

    root = Node("")
    path = [root]
    code_to_node = create_dictionary()
    for c in sorted(cl):
        if as_list:
            n = Node(c, path[-1])
        else:
            found = False
            while len(path) > 0 and not found:
                if can_be_child(path[-1].name, c):
                    found = True
                else:
                    path.pop()
            if c.rstrip("0") == path[-1].name:
                # Just modify (it may enter here only in the root node)
                path[-1].name = c
                n = path[-1]
            else:
                # Create node and append it to the active path
                n = Node(c, path[-1])
                path.append(n)
        code_to_node[c] = n  # Map the code to the node
    return root, code_to_node
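# Worked example (illustrative, assuming the "uniformly sized codes" scheme described in the docstring):
# for cl = ["1000", "1100", "1110", "1200", "2000"] the loop above yields
#   root
#   ├── 1000
#   │   ├── 1100
#   │   │   └── 1110
#   │   └── 1200
#   └── 2000
# because can_be_child("1000", "1100") is True ("1100".startswith("1000".rstrip("0")) -> "1"),
# while "2000" does not start with "1", so the path is popped back to the root.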
def dictionary_from_key_value_list(kvl, state: State = None):
    """
    From a string containing a list of keys and values, return a dictionary
    Keys must be literals, values can be expressions, to be evaluated at a later moment
    (syntactic validity of expressions is not checked here)

    :param kvl: String containing the list of keys and values
    :except If syntactic problems occur
    :return: A dictionary
    """
    pairs = kvl.split(",")
    d = create_dictionary()
    for p in pairs:
        k, v = p.split("=", maxsplit=1)
        if not k:
            raise Exception("Each key-value pair must be separated by '=' and key has to be defined, "
                            "value can be empty: " + kvl)
        else:
            try:
                k = k.strip()
                v = v.strip()
                string_to_ast(simple_ident, k)
                try:
                    # Simplest: string
                    string_to_ast(quotedString, v)
                    v = v[1:-1]
                except:
                    issues = []
                    ast = string_to_ast(expression_with_parameters, v)
                    res, unres = ast_evaluator(ast, state, None, issues)
                    if len(unres) == 0:
                        v = res
                d[k] = v
            except:
                raise Exception("Key must be a string: " + k + " in key-value list: " + kvl)
    return d
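# Hedged usage sketch (the parsing/evaluation helpers are repository-internal, so this only illustrates
# the intent): a call such as
#   dictionary_from_key_value_list("level='n-1', share=0.5")
# would return something like {"level": "n-1", "share": 0.5}; a value that references a parameter,
# e.g. "share=p1*2", is kept as the original expression string until the parameter values are known.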
def generate_dublin_core_xml(content):
    """
    Generate an XML string with a Simple Dublin Core Record from a Case Study Metadata Command Content

    :param content:
    :return:
    """
    controlled = create_dictionary()
    for t in metadata_fields:
        controlled[t[4]] = t
    s = """<?xml version="1.0"?>
<caseStudyMetadata xmlns="http://magic-nexus.org/dmp/" xmlns:dc="http://purl.org/dc/elements/1.1/">
"""
    for key in content:
        k = controlled[key][1]
        if k:
            for l in content[key]:
                s += "  <dc:" + k + ">" + escape(str(l)) + "</dc:" + k + ">\n"
    s += "</caseStudyMetadata>\n"
    return s
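# Sketch of the kind of record produced, assuming metadata_fields maps, e.g., the case study name and the
# subject/topic keys to the "title" and "subject" Dublin Core elements (controlled[key][1]):
#   <?xml version="1.0"?>
#   <caseStudyMetadata xmlns="http://magic-nexus.org/dmp/" xmlns:dc="http://purl.org/dc/elements/1.1/">
#     <dc:title>An example case study</dc:title>
#     <dc:subject>water</dc:subject>
#     <dc:subject>energy</dc:subject>
#   </caseStudyMetadata>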
def evaluate_parameters_for_scenario(base_params: List[Parameter], scenario_params: Dict[str, str]): """ Obtain a dictionary (parameter -> value), where parameter is a string and value is a literal: number, boolean, category or string. Start from the base parameters then overwrite with the values in the current scenario. Parameters may depend on other parameters, so this has to be considered before evaluation. No cycles are allowed in the dependencies, i.e., if P2 depends on P1, P1 cannot depend on P2. To analyze this, first expressions are evaluated, extracting which parameters appear in each of them. Then a graph is elaborated based on this information. Finally, an algorithm to find cycles is executed. :param base_params: :param scenario_params: :return: """ # Create dictionary without evaluation result_params = create_dictionary() result_params.update( {p.name: p.default_value for p in base_params if p.default_value}) # Overwrite with scenario expressions or constants result_params.update(scenario_params) state = State() known_params = create_dictionary() unknown_params = create_dictionary() # Now, evaluate ALL expressions for param, expression in result_params.items(): value, ast, params, issues = evaluate_numeric_expression_with_parameters( expression, state) if not value: # It is not a constant, store the parameters on which this depends if case_sensitive: unknown_params[param] = (ast, set(params)) else: unknown_params[param] = (ast, set([p.lower() for p in params])) else: # It is a constant, store it result_params[param] = value # Overwrite known_params[param] = value cycles = get_circular_dependencies(unknown_params) if len(cycles) > 0: raise Exception( f"Parameters cannot have circular dependencies. {len(cycles)} cycles were detected: " f"{':: '.join(cycles)}") # Initialize state with known parameters state.update(known_params) # Loop until no new parameters can be evaluated previous_len_unknown_params = len(unknown_params) + 1 while len(unknown_params) < previous_len_unknown_params: previous_len_unknown_params = len(unknown_params) for param in list( unknown_params ): # A list(...) is used because the dictionary can be modified inside ast, params = unknown_params[param] if params.issubset(known_params): value, _, _, issues = evaluate_numeric_expression_with_parameters( ast, state) if not value: raise Exception( f"It should be possible to evaluate the parameter '{param}'. " f"Issues: {', '.join(issues)}") else: del unknown_params[param] result_params[param] = value state.set(param, value) if len(unknown_params) > 0: raise Exception( f"Could not evaluate the following parameters: {', '.join(unknown_params)}" ) return result_params
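# Worked example of the evaluation loop above (values are illustrative only):
#   base parameters: p1 default 3, p2 default "p1 * 2", p3 default "p1 + p2"
#   scenario:        {"p1": 5}
# First pass: p1 evaluates to the constant 5; p2 and p3 stay in unknown_params with dependency sets
# {p1} and {p1, p2}. Loop iteration 1 resolves p2 = 10 (all of its dependencies are known);
# iteration 2 resolves p3 = 15. If instead p2 depended on p3 and p3 on p2, get_circular_dependencies
# would detect the cycle and the function would raise.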
def __init__(self, name=None):
    self._name = name  # A name for the scope itself
    self._registry = create_dictionary()
def __init__(self):
    self._default_namespace = ""
    self._namespaces = create_dictionary()
def prepare_model(state) -> Dict[str, Set[Processor]]: """ Modify the state so that: * Implicit references of Interfaces to subcontexts are materialized * Creating processors * Creating interfaces in these processors * Creating relationships in these processors :param state: :return: A dictionary of systems each containing a set of the Processors inside it ("local" and "environment") """ # Registry and the other objects also glb_idx, _, _, _, _ = get_case_study_registry_objects(state) # Prepare a Query to obtain ALL interfaces query = BasicQuery(state) filt = {} objs = query.execute([Factor], filt) processors_by_system = create_dictionary() for iface in objs[Factor]: # type: Factor system = iface.processor.processor_system processors = processors_by_system.get(system, set()) if system not in processors_by_system: processors_by_system[system] = processors if iface.processor not in processors: processors.add(iface.processor) # If the Interface is connected to a "Subcontext" different than the owning Processor if iface.opposite_processor_type and \ iface.opposite_processor_type.lower() != iface.processor.subsystem_type.lower(): # Check if the interface has flow relationships # TODO An alternative is to search "observations" of type FactorsRelationDirectedFlowObservation # in the same "iface" if iface.orientation.lower() == "input": parameter = {"target": iface} else: parameter = {"source": iface} relations = glb_idx.get( FactorsRelationDirectedFlowObservation.partial_key( **parameter)) # If not, define Processor name, check if exists, if not create it # Then create an Interface and a Relationship if len(relations) == 0: # Define the name of a Processor in the same context but in different subcontext p_name = system + "_" + iface.opposite_processor_type p = glb_idx.get(Processor.partial_key(p_name)) if len(p) == 0: attributes = { 'subsystem_type': iface.opposite_processor_type, 'processor_system': iface.processor.processor_system, 'functional_or_structural': 'Functional', 'instance_or_archetype': 'Instance' # 'stock': None } p = Processor(p_name, attributes=attributes) glb_idx.put(p.key(), p) if p.subsystem_type.lower() in ["local", "environment"]: processors.add(p) else: p = p[0] attributes = { 'sphere': 'Technosphere' if iface.opposite_processor_type.lower() in ["local", "external"] else 'Biosphere', 'roegen_type': iface.roegen_type, 'orientation': "Input" if iface.orientation.lower() == "output" else "Output", 'opposite_processor_type': iface.processor.subsystem_type } # Create Interface f = Factor.create_and_append( name=iface.taxon.name, processor=p, in_processor_type=FactorInProcessorType(external=False, incoming=False), attributes=attributes, taxon=iface.taxon) glb_idx.put(f.key(), f) # Create Flow Relationship if iface.orientation.lower() == "output": source = iface target = f else: source = f target = iface fr = FactorsRelationDirectedFlowObservation.create_and_append( source=source, target=target, observer=None) glb_idx.put(fr.key(), fr) return processors_by_system
def execute(self, state: "State"): def process_line(item): # Read variables mh_src_dataset = item.get("source_dataset", None) mh_src_hierarchy = item.get("source_hierarchy", None) mh_src_code = item.get("source_code", None) mh_dst_hierarchy = item.get("destination_hierarchy", None) mh_dst_code = item.get("destination_code", None) mh_weight = item.get("weight", None) # Mapping name name = ((mh_src_dataset + ".") if mh_src_dataset else "") + mh_dst_hierarchy + " -> " + mh_dst_hierarchy if name in mappings: issues.append( Issue(itype=3, description="The mapping '" + name + "' has been declared previously. Skipped.", location=IssueLocation(sheet_name=name, row=r, column=None))) return if name in local_mappings: d = local_mappings[name] else: d = DottedDict() local_mappings[name] = d d.name = name d.origin_dataset = mh_src_dataset d.origin_hierarchy = mh_src_hierarchy d.destination_hierarchy = mh_dst_hierarchy d.mapping = create_dictionary() # Specific code if mh_src_code in d.mapping: to_dict = d.mapping[mh_src_code] else: to_dict = create_dictionary() if mh_dst_code in to_dict: issues.append( Issue(itype=3, description="The mapping of '" + mh_src_code + "' into '" + mh_dst_code + "' has been done already", location=IssueLocation(sheet_name=name, row=r, column=None))) return else: to_dict[ mh_dst_code] = mh_weight # NOTE: This could be an object instead of just a FLOAT or expression d.mapping[mh_src_code] = to_dict issues = [] glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) name = self._content["command_name"] local_mappings = create_dictionary() # Process parsed information for line in self._content["items"]: r = line["_row"] # If the line contains a reference to a dataset or hierarchy, expand it # If not, process it directly is_expansion = False if is_expansion: # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items pass else: process_line(line) # Mappings post-processing for d in local_mappings: # Convert the mapping into: # [{"o": "", "to": [{"d": "", "w": ""}]}] # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ] mapping = [] for orig in local_mappings[d].mapping: lst = [] for dst in local_mappings[d].mapping[orig]: lst.append( dict(d=dst, w=local_mappings[d].mapping[orig][dst])) mapping.append(dict(o=orig, to=lst)) if local_mappings[d].origin_dataset: dims, attrs, meas = obtain_dataset_metadata( local_mappings[d].origin_dataset) if local_mappings[d].origin_hierarchy not in dims: issues.append( Issue(itype=3, description="The origin dimension '" + local_mappings[d].origin_hierarchy + "' does not exist in dataset '" + local_mappings[d].origin_dataset + "'", location=IssueLocation(sheet_name=name, row=r, column=None))) continue else: dim = dims[local_mappings[d].origin_hierarchy] mapping = fill_map_with_all_origin_categories(dim, mapping) # origin_dataset = local_mappings[d].origin_dataset origin_hierarchy = local_mappings[d].origin_hierarchy destination_hierarchy = local_mappings[d].destination_hierarchy # Create Mapping and add it to Case Study mappings variable mappings[d] = Mapping(d, obtain_dataset_source(origin_dataset), origin_dataset, origin_hierarchy, destination_hierarchy, mapping) # TODO # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns" # Put it to work !!! # One or more mapping in sequence could be specified?. The key is "source hierarchy+dest hierarchy" # Read mapping parameters return issues, None
def __init__(self, session_factory):
    self.registry = create_dictionary()
    self._session_factory = session_factory
def parse_data_input_command(sh: Worksheet, area: AreaTupleType, processors_type: str, state=None) -> IssuesLabelContentTripleType: """ Scans the "area" of input worksheet "sh" where it is assumed a "data input" command is present. It obtains a list of observations, a list of processors, a list of observables, a list of tags All those are represented in JSON format :param sh: Input worksheet :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the command is present :param processors_type: Name for the type of processors. Also label of the command :param state: Transient state useful for checking existence of variables :return: DataInputCommand, list of issues (issue_type, message) """ some_error = False issues = [] # Define a set of observations (qualified quantities) of observables # This set can be replicated. So, ?how to refer to each replica? # Regular expression, internal name, Mandatory (True|False) known_columns = [ (r"Name|Processor[_ ]name", "processor", False), (r"Level", "level", False), (r"Parent", "parent", False), (r"FF[_ ]type", "ff_type", True), (r"Var|Variable", "factor", True), (r"Value|NUSAP\.N", "value", False), # If value is not specified, then just declare the Factor (r"Unit|NUSAP\.U", "unit", True), # If blank, a dimensionless amount is assumed (r"Relative[_ ]to", "relative_to", False), (r"Uncertainty|Spread|NUSAP\.S", "uncertainty", False), (r"Assessment|NUSAP\.A", "assessment", False), (r"Pedigree[_ ]matrix|NUSAP\.PM", "pedigree_matrix", False), (r"Pedigree|NUSAP\.P", "pedigree", False), (r"Time|Date", "time", False), (r"Geo|Geolocation", "geolocation", False), (r"Source", "source", False), (r"Comment|Comments", "comments", False) ] label = "Processors " + processors_type # First, examine columns, to know which fields are being specified # Special cases: # Open columns: the field is specified in the cell togheter with the value. Like "attr1=whatever", instead of a header "attr1" and in a row below, a value "whatever" # Complex values: the value has syntactic rules. Like expressions for both quantities AND qualities (like NUSAP) # References: the field refers to additional information in another worksheet. Unique names or ref holder (worksheet name) plus ref inside the worksheet, would be allowed. 
Also ref type can disambiguate mandatory = {t[1]: t[2] for t in known_columns} cre = { } # Column Regular Expression dictionary (K: regular expression; V: RegularExpression object) if not case_sensitive: flags = re.IGNORECASE else: flags = 0 for kc in known_columns: cre[kc[0]] = re.compile(kc[0], flags=flags) col_names = {} standard_cols = { } # Internal (standardized) column name to column index in the worksheet (freedom in the order of columns) attribute_cols = create_dictionary( ) # Not recognized columns are considered freely named categories, attributes or tags attributes = [ ] # List of attributes or tags (keys of the previous dictionary) col_allows_dataset = create_dictionary( ) # If the column allows the reference to a dataset dimension for c in range(area[2], area[3]): col_name = sh.cell(row=area[0], column=c).value if not col_name: continue col_name = col_name.replace("\n", " ") col_names[c] = col_name # Match found = False for kc in known_columns: res = cre[kc[0]].search(col_name) if res: if kc[1] in standard_cols: issues.append( (2, "Cannot repeat column name '" + col_name + "' (" + kc[0] + ") in data input command '" + processors_type + "'")) else: standard_cols[kc[1]] = c col_names[c] = kc[ 1] # Override column name with pseudo column name for standard columns if col_names[c].lower() in [ "factor", "value", "time", "geolocation" ]: col_allows_dataset[col_names[c]] = True else: col_allows_dataset[col_names[c]] = False found = True break if not found: if col_name not in attribute_cols: # TODO Check valid col_names. It must be a valid Variable Name attribute_cols[col_name] = c attributes.append(col_name) col_allows_dataset[col_name] = True else: issues.append( (2, "Cannot repeat column name '" + col_name + "' in data input command '" + processors_type + "'")) del cre # Check if there are mandatory columns missing # TODO There could be combinations of columns which change the character of mandatory of some columns # TODO For instance, if we are only specifying structure, Value would not be needed print("BORRAME - " + str(known_columns)) print("BORRAME 2 - " + str(standard_cols)) for kc in known_columns: # "kc[2]" is the flag indicating if the column is mandatory or not # col_map contains standard column names present in the worksheet if kc[2] and kc[1] not in standard_cols: some_error = True issues.append((3, "Column name '" + kc[0] + "' must be specified in data input command '" + processors_type + "'")) # If there are errors, do not continue if some_error: return issues, label, None processor_attribute_exclusions = create_dictionary() processor_attribute_exclusions[ "scale"] = None # Exclude these attributes when characterizing the processor processor_attributes = [ t for t in attributes if t not in processor_attribute_exclusions ] # SCAN rows lst_observations = [ ] # List of ALL observations. -- Main outcome of the parse operation -- set_pedigree_matrices = create_dictionary() # List of pedigree templates set_processors = create_dictionary() # List of processor names set_factors = create_dictionary() # List of factors set_taxa = create_dictionary( ) # Dictionary of taxa with their lists of values. 
Useful to return CODE LISTS set_referenced_datasets = create_dictionary( ) # Dictionary of datasets to be embedded into the result (it is a job of the execution part) processors_taxa = create_dictionary( ) # Correspondence "processor" -> taxa (to avoid changes in this correspondence) dataset_column_rule = parser_field_parsers.dataset_with_column values = [None] * area[3] # LOOP OVER EACH ROW for r in range(area[0] + 1, area[1]): # Scan rows (observations) # Each row can specify: the processor, the factor, the quantity and qualities about the factor in the processor # It can also specify a "flow+containment hierarchy" relation row = {} # Store parsed values of the row taxa = create_dictionary() # Store attributes or taxa of the row referenced_dataset = None # Once defined in a row, it cannot change!! # Scan the row first, looking for the dataset. The specification is allowed in certain columns: # attribute_cols and some standard_cols already_processed = create_dictionary() for c in range(area[2], area[3]): if c in col_names: value = sh.cell(row=r, column=c).value if isinstance(value, str) and value.startswith("#"): col_name = col_names[c] if col_allows_dataset[col_name]: if not referenced_dataset: try: ast = parser_field_parsers.string_to_ast( dataset_column_rule, value[1:]) if len(ast["parts"]) == 2: referenced_dataset = ast["parts"][0] # Remove the dataset variable. It will be stored in "_referenced_dataset" value = "#" + ast["parts"][1] else: some_error = True issues.append(( 3, "The first dataset reference of the row must contain the " "dataset variable name and the dimension name, row " + str(r))) # Mark as processed already_processed[col_name] = None except: some_error = True issues.append( (3, "Column '" + col_name + "' has an invalid dataset reference '" + value + "', in row " + str(r))) else: try: ast = parser_field_parsers.string_to_ast( simple_ident, value[1:]) # Mark as processed already_processed[col_name] = None except: some_error = True issues.append( (3, "Column '" + col_name + "' has an invalid dataset reference '" + value + "', in row " + str(r))) if col_name in standard_cols: row[col_name] = value else: taxa[col_name] = value values[c] = value # TODO If the flow type is decomposed, compose it first for c in standard_cols: if c in already_processed: continue value = values[standard_cols[c]] # != "" or not if value is None or (value is not None and value == ""): if c == "unit": value = "-" if not value: if mandatory[c]: some_error = True issues.append( (3, "Column '" + c + "' is mandatory, row " + str(r))) continue # Skip the rest of the iteration! # Parse the value if c in ["processor", "factor"]: # Check that it is a variable name, and allow hierarchical names parser_field_parsers.string_to_ast( parser_field_parsers.simple_h_name, value) elif c == "pedigree_matrix": parser_field_parsers.string_to_ast( parser_field_parsers.simple_ident, value) elif c == "relative_to": # Two elements, the first a hierarchical name, the second a unit name s = value.split(" ") if len(s) != 2: some_error = True issues.append(( 3, "The Relative To value has to have two parts, factor name and unit, separated by a whitespace (specified '" + value + "'), in row " + str(r))) else: try: parser_field_parsers.string_to_ast( parser_field_parsers.simple_h_name, s[0]) except: some_error = True issues.append(( 3, "The name specified for the relative to factor '" + s[0] + "' is not valid, in row " + str(r))) # It must be a recognized unit. 
Check with Pint try: ureg(s[1]) ureg.parse_unit_name(s[1], case_sensitive) except UndefinedUnitError: some_error = True issues.append(( 3, "The unit name '" + s[1] + "' is not registered in the units processing package, in row " + str(r))) elif c == "level": # A valid level name try: parser_field_parsers.string_to_ast( parser_field_parsers.level_name, value) except: some_error = True issues.append((3, "The level '" + value + "' syntax is not valid, in row " + str(r))) elif c == "parent": # Check that value is a valid parent name. It can be either a list of tags OR # a processor name, something defining a single processor try: parser_field_parsers.string_to_ast( parser_field_parsers.simple_h_name, value) except: try: parser_field_parsers.string_to_ast( parser_field_parsers.named_parameters_list, value) except: some_error = True issues.append((3, "Could not parse '" + value + "' as 'parent' in row " + str(r))) elif c == "ff_type": # The type of flow/fund must be one of a set of possible values. DEFINE THE LIST if value.lower() not in allowed_ff_types: some_error = True issues.append( (3, "ff_type must be one of :" + ', '.join(allowed_ff_types) + ", in row " + str(r))) elif c == "value": if not isinstance(value, str): value = str(value) # Expression allowed. Check syntax only. It can refer to parameters. ast = parser_field_parsers.string_to_ast( parser_field_parsers.expression, value) # TODO Check existence of used variables # TODO basic_elements_parser.ast_evaluator(ast, state, None, issues, "static") elif c == "unit": # It must be a recognized unit. Check with Pint try: value = value.replace("€", "Euro").replace("$", "Dollar") if value == "-": value = "" # Dimensionless ureg(value) ureg.parse_unit_name(value, case_sensitive) except: some_error = True issues.append(( 3, "The unit name '" + value + "' is not registered in the units processing package, in row " + str(r))) elif c == "uncertainty": # TODO It must be a valid uncertainty specifier pass elif c == "assessment": # See page 135 of Funtowicz S., Ravetz J., "Uncertainty and Quality in Science for Policy" # "c" is "cognitive" assessment, "p" is pragmatic assessment. allowed = [ "nil", "low", "medium", "high", "total", "nil_c", "low_c", "medium_c", "high_c", "total_c", "nil_p", "low_p", "medium_p", "high_p", "total_p" ] if value and value.lower() not in allowed: issues.append((3, "Assessment must be empty or one of: " + ", ".join(allowed))) elif c == "pedigree": # A valid pedigree specification is just an integer try: int(value) except: issues.append((3, "The pedigree specification '" + value + "' must be an integer")) elif c == "time": # A valid time specification. Possibilities: Year, Month-Year / Year-Month, Time span (two dates) if not isinstance(value, str): value = str(value) ast = parser_field_parsers.string_to_ast( parser_field_parsers.time_expression, value) elif c == "geolocation": # A reference to a geolocation try: parser_field_parsers.string_to_ast( parser_field_parsers.reference, value) except: some_error = True issues.append((3, "The geolocation must be a reference")) elif c == "source": # Who or what provided the information. It can be formal or informal. Formal can be references (but evaluated later) pass elif c == "comments": # Free text pass # Store the parsed value row[c] = value for c in attribute_cols: if c in already_processed: continue value = values[attribute_cols[c]] # != "" or not if not value: taxa[c] = None continue # Skip the rest of the iteration! # TODO Check value. 
Valid identifier, no whitespace # Validate "value", it has to be a simple ID try: if not isinstance(value, str): value = str(value) parser_field_parsers.simple_ident.parseString(value, parseAll=True) except: value = None some_error = True issues.append(( 3, "The value in column '" + c + "' has to be a simple identifier: start with letter, then letters, numbers and '_', no whitespace, in row " + str(r))) taxa[c] = value # Disable the registration of taxa. If a Dataset reference is used, there is no way to register # taxa at parse time (the dataset is still not obtained). Leave it for the execution if c not in set_taxa: set_taxa[c] = create_dictionary() if value is not None: set_taxa[c][value] = None # Now that individual columns have been parsed, do other things if referenced_dataset: row["_referenced_dataset"] = referenced_dataset # If "processor" not specified, concatenate taxa columns in order to generate an automatic name # (excluding the processor type) p_taxa = taxa.copy() for k in processor_attribute_exclusions: if k in p_taxa: del p_taxa[k] if "processor" not in row: row["processor"] = "_".join( [str(taxa[t]) for t in processor_attributes] ) # TODO Which order? (the current is "order of appearance"; maybe "alphabetical order" would be better option) # Add as "taxa" the processor type (which is an optional input parameter to this function) if processors_type: taxa["_processors_type"] = processors_type # Store taxa (attributes and taxa) row["taxa"] = taxa # Store taxa if the processor still does not have it if row["processor"] not in processors_taxa: processors_taxa[row[ "processor"]] = p_taxa # "::".join([taxa[t] for t in lst_taxa_cols]) else: # Taxa should be the same for each "processor". Error if different t = processors_taxa[row["processor"]] if t != p_taxa: issues.append( (3, "The processor '" + row["processor"] + "' has different taxa assigned, in row " + str(r))) # Register new processor names, pedigree templates, and variable names if "processor" in row: set_processors[row["processor"]] = None if "pedigree_matrix" in row: set_pedigree_matrices[row["pedigree_matrix"]] = None if "factor" in row: set_factors[row["factor"]] = None if referenced_dataset: set_referenced_datasets[referenced_dataset] = None lst_observations.append(row) content = { "factor_observations": lst_observations, "processor_attributes": processor_attributes, "processors": [k for k in set_processors], "pedigree_matrices": [k for k in set_pedigree_matrices], "factors": [k for k in set_factors], "referenced_datasets": [ds for ds in set_referenced_datasets], "code_lists": {k: [k2 for k2 in set_taxa[k]] for k in set_taxa} } return issues, label, content
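# Sketch of one entry of content["factor_observations"], with keys inferred from the parsing code above
# (values are invented; freely named attribute columns end up inside "taxa"):
#   {
#       "processor": "Farm",
#       "factor": "WaterUse",
#       "ff_type": "int_in_flow",
#       "value": "30.3",
#       "unit": "m**3",
#       "time": "2016",
#       "taxa": {"Region": "Andalusia", "_processors_type": "Agriculture"}
#   }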
def commands_generator_from_ooxml_file( input, state, sublist, stack) -> backend.ExecutableCommandIssuesPairType: """ It reads an Office Open XML input Yields a sequence of command_executors :param input: A bytes input :param state: State used to check variables :param sublist: List of worksheets to consider :param stack: Stack of nested files. Just pass it... :return: """ # Start the Excel reader workbook = openpyxl.load_workbook(io.BytesIO(input), data_only=True) # Command names (for the "list of commands" command) command_names = create_dictionary( data={cmd_name: None for cmd_name in valid_v2_command_names}) worksheet_to_command = create_dictionary( ) # A dictionary to translate a worksheet to an equivalent command if sublist: # Force reading "ListOfCommands" commands for sheet_name in workbook.sheetnames: if first(commands, condition=lambda c: c.name == "list_of_commands" and c. regex.search(sheet_name)): sublist.append(sheet_name) # For each worksheet, get the command type, convert into primitive JSON for sheet_number, sheet_name in enumerate(workbook.sheetnames): if sublist: if sheet_name not in sublist: continue issues = [] total_issues: List[Issue] = [] sheet = workbook[sheet_name] c_label: str = None c_content = None name = sheet.title # Use an equivalent command name if name in worksheet_to_command: name = worksheet_to_command[name] # Extract worksheet matrices m = binary_mask_from_worksheet(sheet, False) t = obtain_rectangular_submatrices(m, only_remove_empty_bottom=True) if len(t) == 0: # No data continue t = t[ 0] # Take just the first element, a tuple (top, bottom, left, right) representing a rectangular region t = (t[0] + 1, t[1] + 1, t[2] + 1, t[3] + 1) # Indices start at 1 # v = worksheet_to_numpy_array(sheet) # Find which COMMAND to parse, then parse it cmd: Optional[backend.Command] = first( commands, condition=lambda c: c.regex.search(name)) c_type: str = cmd.name if cmd else None if not c_type: total_issues.append( Issue( sheet_number, sheet_name, None, 2, f"The worksheet name '{sheet_name}' has not a supported command associated. Skipped." 
)) elif c_type == "etl_dataset": if sheet.cell(row=t[0], column=t[2]).value: t = (1, m.shape[0] + 1, 1, m.shape[1] + 1) # Parse to read parameters dataset_name = cmd.regex.search(name).group(2) issues, c_label, c_content = cmd.parse_function( sheet, t, dataset_name, state) else: total_issues.append( Issue( sheet_number, sheet_name, c_type, 3, f"It seems there are no parameters for the dataset import command at worksheet '{sheet_name}'" )) elif c_type == "list_of_commands": issues, c_label, c_content = parse_command(sheet, t, None, cmd.name) c_type = None if 3 not in [issue.itype for issue in issues]: for r in c_content["items"]: worksheet = r.get("worksheet", None) command = r.get("command", None) # Check if valid command if command not in command_names: total_issues.append( Issue( sheet_number, sheet_name, None, 3, "Command '" + command + "' not recognized in List of Commands.")) else: worksheet_to_command[worksheet] = command elif c_type == "import_commands": issues, c_label, c_content = parse_command(sheet, t, None, cmd.name) if 3 not in [issue.itype for issue in issues]: # Declared at this point to avoid circular reference ("parsers_factory" imports "parsers_spreadsheet") from backend.command_generators.parsers_factory import commands_container_parser_factory # For each line, repeat the import for r in c_content["items"]: generator_type, file2, sublist2 = handle_import_commands(r) yield from commands_container_parser_factory( generator_type, None, file2, state, sublist=sublist2, stack=stack) print("Done") elif c_type == "mapping": groups = cmd.regex.search(name).groups() if groups[2] and groups[8]: origin = groups[2] destination = groups[8] elif not groups[2] and not groups[8]: origin = None destination = None else: total_issues.append( Issue( sheet_number, sheet_name, c_type, 3, f"Either origin or destination are not correctly specified in the sheet name '{sheet_name}'" )) issues, c_label, c_content = cmd.parse_function( sheet, t, origin, destination) elif c_type in ["datasetqry", "datasetdata"]: issues, c_label, c_content = cmd.parse_function( sheet, t, sheet_name, state) elif c_type == "hierarchy": res = cmd.regex.search(name) h_type = res.group(2) c_label = res.group(3) issues, _, c_content = cmd.parse_function(sheet, t, c_label, h_type) elif c_type == "data_input": group2_name = cmd.regex.search(name).group(2) issues, c_label, c_content = cmd.parse_function( sheet, t, group2_name) else: # GENERIC command parser if cmd.parse_function: issues, c_label, c_content = cmd.parse_function( sheet, t, sheet_name) else: issues, c_label, c_content = parse_command( sheet, t, sheet_name, cmd.name) # ------------------------------------------------------------------------------------------------------------- # Command parsed, now append "issues" errors = 0 if len(issues) > 0: for i in issues: if isinstance(i, backend.command_generators.Issue): if i.itype == 3: errors += 1 issue = Issue(sheet_number, sheet_name, c_type, i.itype, i.description) else: if i[0] == 3: errors += 1 issue = Issue(sheet_number, sheet_name, c_type, i[0], i[1]) total_issues.append(issue) if errors == 0: try: if c_type: cmd, issues = create_command(c_type, c_label, c_content, sheet_name) else: cmd = None issues = [] except: cmd = None issues = [ (3, "Could not create command of type '" + c_type + "'") ] if issues: for i in issues: if isinstance(i, backend.command_generators.Issue): issue = Issue(sheet_number, sheet_name, c_type, i.itype, i.description) else: issue = Issue(sheet_number, sheet_name, c_type, i[0], i[1]) 
total_issues.append(issue) else: print(issues) # Convenient for debugging purposes cmd = None # cmd, _ = create_command(c_type, c_label, {}, sh_name) yield cmd, total_issues
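# Hedged usage sketch: the generator yields one (command, issues) pair per processed worksheet, so a
# caller (the names of the surrounding objects are assumptions, not the actual driver code) would do
# something like:
#   total_issues = []
#   for cmd, issues in commands_generator_from_ooxml_file(xlsx_bytes, state, sublist=None, stack=[]):
#       total_issues.extend(issues)
#       if cmd:
#           cmd.execute(state)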
def __init__(self):
    self.registry = create_dictionary()
def process_row(row): """ Process a dictionary representing a row of the data input command. The dictionary can come directly from the worksheet or from a dataset. Implicitly uses "glb_idx" :param row: dictionary """ # From "ff_type" extract: flow/fund, external/internal, incoming/outgoing # ecosystem/society? ft = row["ff_type"].lower() if ft == "int_in_flow": roegen_type = FlowFundRoegenType.flow internal = True incoming = True elif ft == "int_in_fund": roegen_type = FlowFundRoegenType.fund internal = True incoming = True elif ft == "ext_in_fund": roegen_type = FlowFundRoegenType.fund internal = False incoming = True elif ft == "int_out_flow": roegen_type = FlowFundRoegenType.flow internal = True incoming = False elif ft == "ext_in_flow": roegen_type = FlowFundRoegenType.flow internal = False incoming = True elif ft == "ext_out_flow": roegen_type = FlowFundRoegenType.flow internal = False incoming = False elif ft == "env_out_flow": roegen_type = FlowFundRoegenType.flow internal = False incoming = False elif ft == "env_in_flow": roegen_type = FlowFundRoegenType.flow internal = False incoming = True elif ft == "env_in_fund": roegen_type = FlowFundRoegenType.fund internal = False incoming = True # Split "taxa" attributes. "scale" corresponds to the observation p_attributes = row["taxa"].copy() if "scale" in p_attributes: other_attrs = create_dictionary() other_attrs["scale"] = p_attributes["scale"] del p_attributes["scale"] else: other_attrs = None # Check existence of PedigreeMatrix, if used if "pedigree_matrix" in row: pm = glb_idx.get( PedigreeMatrix.partial_key(name=row["pedigree_matrix"])) if len(pm) != 1: issues.append((3, "Could not find Pedigree Matrix '" + row["pedigree_matrix"] + "'")) del row["pedigree_matrix"] else: try: lst = pm[0].get_modes_for_code(row["pedigree"]) except: issues.append( (3, "Could not decode Pedigree '" + row["pedigree"] + "' for Pedigree Matrix '" + row["pedigree_matrix"] + "'")) del row["pedigree"] del row["pedigree_matrix"] else: if "pedigree" in row: issues.append(( 3, "Pedigree specified without accompanying Pedigree Matrix" )) del row["pedigree"] # Source if "source" in row: try: ast = parser_field_parsers.string_to_ast( parser_field_parsers.reference, row["source"]) ref_id = ast["ref_id"] references = glb_idx.get(Reference.partial_key(ref_id), ref_type="provenance") if len(references) == 1: source = references[0] except: source = row["source"] else: source = None # Geolocation if "geolocation" in row: try: ast = parser_field_parsers.string_to_ast( parser_field_parsers.reference, row["geolocation"]) ref_id = ast["ref_id"] references = glb_idx.get(Reference.partial_key(ref_id), ref_type="geographic") if len(references) == 1: geolocation = references[0] except: geolocation = row["geolocation"] else: geolocation = None # CREATE FactorType, A Type of Observable, IF it does not exist # AND ADD Quantitative Observation p, ft, f, o = create_or_append_quantitative_observation( glb_idx, factor=row["processor"] + ":" + row["factor"], value=row["value"] if "value" in row else None, unit=row["unit"], observer=source, spread=row["uncertainty"] if "uncertainty" in row else None, assessment=row["assessment"] if "assessment" in row else None, pedigree=row["pedigree"] if "pedigree" in row else None, pedigree_template=row["pedigree_matrix"] if "pedigree_matrix" in row else None, relative_to=row["relative_to"] if "relative_to" in row else None, time=row["time"] if "time" in row else None, geolocation=None, comments=row["comments"] if "comments" in row else None, 
tags=None, other_attributes=other_attrs, proc_aliases=None, proc_external=False, # TODO proc_attributes=p_attributes, proc_location=None, ftype_roegen_type=roegen_type, ftype_attributes=None, fact_external=not internal, fact_incoming=incoming, fact_location=geolocation) if p_set.append( p, glb_idx ): # Appends codes to the pset if the processor was not member of the pset p_set.append_attributes_codes(row["taxa"])
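# Summary of the ff_type decoding at the top of process_row (derived from the if/elif chain; the "env_*"
# variants are treated as external):
#   ff_type        -> (roegen_type, internal, incoming)
#   int_in_flow    -> (flow, True,  True)
#   int_in_fund    -> (fund, True,  True)
#   int_out_flow   -> (flow, True,  False)
#   ext_in_flow    -> (flow, False, True)
#   ext_in_fund    -> (fund, False, True)
#   ext_out_flow   -> (flow, False, False)
#   env_in_flow    -> (flow, False, True)
#   env_in_fund    -> (fund, False, True)
#   env_out_flow   -> (flow, False, False)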
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name, state) -> IssuesLabelContentTripleType: """ Check that the syntax of the input spreadsheet is correct Return the analysis in JSON compatible format, for execution :param sh: Input worksheet :param area: Area of the input worksheet to be analysed :return: The command in a dict-list object (JSON ready) """ def obtain_column(cn, r1, r2): """ Obtain a list with the values of a column, in the range of rows [r1, r2) :param cn: Column number :param r1: Starting row :param r2: End+1 row :return: list with the cell values """ lst = [] for row in range(r1, r2): value = sh.cell(row=row, column=cn).value if value is None: continue lst.append(value) return lst issues = [] # Global variables (at parse time they may not be defined, so process carefully...) glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects( state) # Look for the name of the input Dataset dataset_name = None available_at_datetime = None for c in range(area[2], area[3]): col_name = sh.cell(row=1, column=c).value if not col_name: continue if col_name.lower().strip() in ["inputdataset"]: lst = obtain_column(c, area[0] + 1, area[1]) for v in lst: if v: dataset_name = v break # Stop on first definition elif col_name.lower().strip() in ["availableatdatetime"]: lst = obtain_column(c, area[0] + 1, area[1]) for v in lst: if v: available_at_datetime = v break # Stop on first definition # Obtain the source source = obtain_dataset_source(dataset_name) # Obtain metadata dims, attrs, meas = obtain_dataset_metadata(dataset_name, source) # Load all code lists in a temporary dictionary of sets # Also check if there is a TIME dimension in the dataset cl = create_dictionary() we_have_time = False for d in dims: if dims[d].code_list: cl[d] = create_dictionary(data={ k: None for k in dims[d].code_list.keys() }) # Attach the code list else: cl[d] = None # No code list (TIME_PERIOD for instance) if dims[d].istime: we_have_time = True # Add matching mappings as more dimensions for m in mappings: if strcmp(mappings[m].source, source) and \ strcmp(mappings[m].dataset, dataset_name) and \ mappings[m].origin in dims: # Add a dictionary entry for the new dimension, add also the codes present in the map # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]] tmp = create_dictionary( data={ to["d"]: None for o in mappings[m].map for to in o["to"] if to["d"] }) cl[mappings[m]. destination] = tmp # [t[1] for t in mappings[m].map] # Scan columns for Dimensions, Measures and Aggregation. # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside. # TODO The result COULD be an automatic BI cube (with a separate field) # TODO - Write into a set of tables in Mondrian # TODO - Generate Schema for Mondrian # TODO - Write the Schema for Mondrian out_dims = [] out_measures = OrderedDict() for r in range(area[0] + 1, area[1] + 1): out_measures[r] = dict(measure=None, agg_func=None, measure_as=None) filter_ = { } # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement) result_name = None # By default, no name for the result. 
It will be dynamically obtained measure_names_column = None aggregations_column = None for c in range(area[2], area[3]): # Each column col_name = sh.cell(row=1, column=c).value if not col_name: continue if col_name.lower().strip() in ["resultdimensions", "dimensions"]: # "GROUP BY" lst = obtain_column(c, area[0] + 1, area[1]) for r, d in enumerate(lst): if not d: continue if d not in cl: issues.append( Issue( itype=3, description="The dimension specified for output, '" + d + "' is neither a dataset dimension nor a mapped dimension. [" + ', '.join([d2 for d2 in cl]) + "]", location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1))) else: out_dims.append(d) elif col_name.lower().strip() in ["resultmeasures", "measures"]: # "SELECT" measure_names_column = c lst = obtain_column(c, area[0] + 1, area[1]) # Check for measures # TODO (and attributes?) for r, m in enumerate(lst): if not m: continue if m not in meas: issues.append( Issue( itype=3, description="The specified measure, '" + m + "' is not a measure available in the dataset. [" + ', '.join([m2 for m2 in measures]) + "]", location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1))) else: out_measures[r + area[0] + 1]["measure"] = m elif col_name.lower().strip() in [ "resultmeasuresaggregation", "resultmeasuresaggregator", "aggregation" ]: # "SELECT AGGREGATORS" aggregations_column = c lst = obtain_column(c, area[0] + 1, area[1]) for r, f in enumerate(lst): if not f: continue if f.lower() not in [ "sum", "avg", "count", "sumna", "countav", "avgna", "pctna" ]: issues.append( Issue( itype=3, description="The specified aggregation function, '" + f + "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'", location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1))) else: out_measures[r + area[0] + 1]["agg_func"] = f elif col_name.lower().strip() in [ "resultmeasurename", "resultmeasuresnames", "resultmeasuresas", "measuresas" ]: # "AS <name>" lst = obtain_column(c, area[0] + 1, area[1]) for r, m in enumerate(lst): out_measures[r + area[0] + 1]["measure_as"] = m elif col_name in cl: # A dimension -> "WHERE" # Check codes, and add them to the "filter" lst = obtain_column(c, area[0] + 1, area[1]) for r, cd in enumerate(lst): if not cd: continue if str(cd) not in cl[col_name]: issues.append( Issue( itype=3, description="The code '" + cd + "' is not present in the codes declared for dimension '" + col_name + "'. Please, check them.", location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1))) else: if col_name not in filter_: lst2 = [] filter_[col_name] = lst2 else: lst2 = filter_[col_name] lst2.append(cd) elif we_have_time and col_name.lower() in [ "startperiod", "endperiod" ]: # SPECIAL "WHERE" FOR TIME # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command # Interval of time periods lst = obtain_column(c, area[0] + 1, area[1]) if len(lst) > 0: filter_[col_name] = lst[ 0] # In this case it is not a list, but a number or string !!!! 
elif col_name.lower() in [ "outputdatasetname", "outputdataset", "result_name", "result name", "resultname" ]: lst = obtain_column(c, area[0] + 1, area[1]) if len(lst) > 0: result_name = lst[0] try: parser_field_parsers.string_to_ast(simple_ident, result_name) except: issues.append( Issue(itype=3, description="Column '" + col_name + "' has an invalid dataset name '" + result_name + "'", location=IssueLocation(sheet_name=name, row=2, column=c + 1))) # If more than one agg function defined -> all must be defined # If no agg func defined -> assume AVG # If agg func defined only in first row -> extend to other columns agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]] if len(agg_funcs) > 1: first_agg_func = None elif len(agg_funcs) == 0: issues.append( Issue(itype=2, description= "No aggregation function specified. Assuming 'average'", location=IssueLocation(sheet_name=name, row=1, column=aggregations_column))) first_agg_func = "avg" else: # One aggregation function first_agg_func = out_measures[area[0] + 1]["agg_func"] if not first_agg_func: issues.append( Issue( itype=3, description= "The aggregation function must be defined in the first row", location=IssueLocation(sheet_name=name, row=1, column=aggregations_column))) if first_agg_func: for v in out_measures.values(): if v.get("measure", None): v["agg_func"] = first_agg_func # Uniform rows, with the three values defined: measure, aggregation function and "measure as" for r, v in out_measures.items(): measure = v.get("measure", None) agg_func = v.get("agg_func", None) measure_as = v.get("measure_as", None) if measure and not agg_func or not measure and agg_func: issues.append( Issue( itype=3, description= "Each measure must be associated with an aggregation function", location=IssueLocation(sheet_name=name, row=r, column=measure_names_column))) elif measure and not measure_as: v["measure_as"] = measure + "_" + agg_func measures = [v["measure"] for v in out_measures.values() if v["measure"]] measures_as = [ v["measure_as"] for v in out_measures.values() if v["measure_as"] ] agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]] if len(measures) == 0: issues.append( Issue(itype=3, description="At least one measure should be specified", location=IssueLocation(sheet_name=name, row=1, column=measure_names_column))) # measures != agg_funcs && len(agg_funcs) == 1 --> OK if len(measures) != len(agg_funcs) and len(agg_funcs) != 1: issues.append( Issue( itype=3, description= "There must be one aggregation function (used for all measures) or one aggregation per measure", location=IssueLocation(sheet_name=name, row=1, column=aggregations_column))) if not result_name: result_name = source + "_" + dataset_name issues.append( Issue(itype=2, description="No result name specified. Assuming '" + result_name + "'", location=IssueLocation(sheet_name=name, row=2, column=c + 1))) content = { "dataset_source": source, "dataset_name": dataset_name, "dataset_datetime": available_at_datetime, "where": filter_, "dimensions": [d for d in dims], "group_by": out_dims, "measures": measures, "agg_funcs": agg_funcs, "measures_as": measures_as, "result_name": result_name } return issues, None, content
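# Sketch of the "content" returned by parse_dataset_qry_command for a simple query (all values are
# illustrative; the real ones come from the worksheet): a dataset name, optional per-dimension filters,
# grouping dimensions, measures with their aggregators, and the name of the output dataset:
#   {
#       "dataset_source": "Eurostat", "dataset_name": "nama_10_gdp", "dataset_datetime": None,
#       "where": {"GEO": ["ES", "PT"], "StartPeriod": "2010", "EndPeriod": "2015"},
#       "dimensions": ["GEO", "TIME_PERIOD", ...], "group_by": ["GEO"],
#       "measures": ["OBS_VALUE"], "agg_funcs": ["sum"], "measures_as": ["OBS_VALUE_sum"],
#       "result_name": "Eurostat_nama_10_gdp"
#   }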