def check_expandable(v, location):
    """
    Check the "{...}" macro expansions embedded in *v*.

    Every curly-brace occurrence is extracted and its content parsed as an
    arithmetic/boolean expression; valid contents are collected and returned.
    Problems (no macro at all, or an unparseable macro body) are reported by
    appending Issue objects to the enclosing-scope "issues" list.

    :param v: String possibly containing "{...}" macro expansions
    :param location: IssueLocation attached to any generated Issue
    :return: Set of syntactically valid macro contents (braces stripped)
    """
    import re
    macro_pattern = re.compile(r"{.*?}")
    matches = macro_pattern.findall(v)
    output = set()
    if len(matches) == 0:
        # The caller expected at least one expansion in the value
        issues.append(
            Issue(itype=IType.ERROR,
                  description="Incorrect syntax, no macro expansion found",
                  location=location))
    else:
        for m in matches:
            h_name = m[1:-1]  # Strip the surrounding curly braces
            try:
                parser_field_parsers.string_to_ast(
                    arith_boolean_expression, h_name)  # simple_h_name
                output.add(h_name)
            except Exception:
                # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt propagate
                issues.append(
                    Issue(itype=IType.ERROR,
                          description=f"The value {h_name} is not a valid hierarchical name",
                          location=location))
    return output
def execute(self, state: "State"):
    """
    Process each of the validated references, storing them as Reference
    objects in the global registry so they can be referred to later.

    :param state: Case-study State holding the global registry
    :return: Tuple (list of Issues, None)
    """
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    command_name = self._content["command_name"]
    issues = []

    for ref in self._content["items"]:
        row_number = ref["_row"]

        # Guard: a reference without its identifier cannot be stored
        if "ref_id" not in ref:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="'ref_id' field not found: " + str(ref),
                      location=IssueLocation(sheet_name=command_name, row=row_number, column=None)))
            continue

        ref_id = ref["ref_id"]
        existing = glb_idx.get(self.ref_type.partial_key(ref_id))
        n_existing = len(existing)
        if n_existing == 1:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Reference '" + ref_id + "' of type '" + str(self.ref_type) + "' is already defined. Not allowed",
                      location=IssueLocation(sheet_name=command_name, row=row_number, column=None)))
            continue
        if n_existing > 1:  # This condition should not occur...
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The reference '" + ref_id + "' of type '" + str(self.ref_type) + "' is defined more than one time (" + str(n_existing) + ")",
                      location=IssueLocation(sheet_name=command_name, row=row_number, column=None)))
            continue

        # Create and store the Reference
        reference = self.ref_type(ref_id, ref)
        glb_idx.put(reference.key(), reference)
        # BibliographicReference and ProvenanceReference are also Observer
        if isinstance(reference, Observer):
            glb_idx.put(Observer.key(reference), reference)

    return issues, None
def process_line(item):
    """
    Parse one mapping row and accumulate it into `local_mappings`.

    NOTE(review): depends on names from an enclosing scope not visible here:
    `mappings`, `local_mappings`, `issues`, `r` (current row number),
    `create_dictionary` and `DottedDict` — confirm where this is embedded.
    """
    # Read variables
    mh_src_dataset = item.get("source_dataset", None)
    mh_src_hierarchy = item.get("source_hierarchy", None)
    mh_src_code = item.get("source_code", None)
    mh_dst_hierarchy = item.get("destination_hierarchy", None)
    mh_dst_code = item.get("destination_code", None)
    mh_weight = item.get("weight", 1.0)  # Defaults to weight 1.0 when the column is absent
    # Mapping name: "[dataset.]source_hierarchy -> destination_hierarchy"
    name = ((mh_src_dataset + ".") if mh_src_dataset else "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy
    if name in mappings:
        # NOTE(review): here `name` is the *mapping* name, yet it is passed as
        # `sheet_name` in the IssueLocation — confirm this is intended
        issues.append(
            Issue(itype=IType.ERROR,
                  description="The mapping '" + name + "' has been declared previously. Skipped.",
                  location=IssueLocation(sheet_name=name, row=r, column=None)))
        return
    # Get (or lazily create) the accumulator entry for this mapping name
    if name in local_mappings:
        d = local_mappings[name]
    else:
        d = DottedDict()
        local_mappings[name] = d
        d.name = name
        d.origin_dataset = mh_src_dataset
        d.origin_hierarchy = mh_src_hierarchy
        d.destination_hierarchy = mh_dst_hierarchy
        d.mapping = create_dictionary()
    # Specific code: register destination code and weight under the source code
    if mh_src_code in d.mapping:
        to_dict = d.mapping[mh_src_code]
    else:
        to_dict = create_dictionary()
    if mh_dst_code in to_dict:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="The mapping of '" + mh_src_code + "' into '" + mh_dst_code + "' has been already defined",
                  location=IssueLocation(sheet_name=name, row=r, column=None)))
        return
    else:
        to_dict[mh_dst_code] = (mh_weight, r)  # NOTE: This could be an object instead of just a FLOAT or expression
    d.mapping[mh_src_code] = to_dict
def check_parameter_value(glb_idx, p, value, issues, sheet_name, row):
    """
    Check that `value` is admissible for parameter `p`.

    If `p.range` parses as a numeric interval, `value` (possibly an
    expression over other parameters) is evaluated and compared against the
    interval bounds. Otherwise `p.range` is interpreted as the name of a
    hierarchy of codes, and `value` must be one of its codes.

    :param glb_idx: Global registry, used to look up Hierarchy objects
    :param p: Parameter object (provides .name and .range)
    :param issues: List of Issue objects, mutated in place
    :param sheet_name: Worksheet name for issue locations
    :param row: Row number for issue locations
    :return: True if acceptable (or not yet resolvable), False otherwise
    """
    retval = True
    if p.range:
        try:
            # Try "numeric interval"
            ast = string_to_ast(number_interval, p.range)
            # Try to resolve the value (it may reference other parameters)
            ast2 = string_to_ast(expression_with_parameters, value)
            evaluation_issues: List[Tuple[int, str]] = []
            s = State()
            value, unresolved_vars = ast_evaluator(exp=ast2, state=s, obj=None,
                                                   issue_lst=evaluation_issues)
            if value is not None:
                try:
                    value = float(value)
                    left = ast["left"]
                    right = ast["right"]
                    left_number = ast["number_left"]
                    right_number = ast["number_right"]
                    # "[" / "]" are closed (inclusive) bounds; "(" / ")" open ones
                    if left == "[":
                        value_meets_left = value >= left_number
                    else:
                        value_meets_left = value > left_number
                    if right == "]":
                        value_meets_right = value <= right_number
                    else:
                        value_meets_right = value < right_number
                    if not value_meets_left or not value_meets_right:
                        issues.append(Issue(itype=IType.ERROR,
                                            description=f"The value {value} specified for the parameter '{p.name}' is out of the range {p.range}",
                                            location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                        retval = False
                except (TypeError, ValueError):
                    # Narrowed from a bare "except:"; float() raises exactly these
                    issues.append(Issue(itype=IType.ERROR,
                                        description=f"The parameter '{p.name}' has a non numeric value '{value}', and has been constrained with a numeric range. Please, either change the Value or the Range",
                                        location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                    retval = False
            else:
                pass  # The parameter depends on other parameters, a valid situation
        except Exception:
            # The range is not a numeric interval: interpret it as a hierarchy name
            h = glb_idx.get(Hierarchy.partial_key(p.range))
            if len(h) == 0:
                # Robustness fix: previously "h[0]" raised IndexError here when
                # the range matched no hierarchy
                issues.append(Issue(itype=IType.ERROR,
                                    description=f"The range '{p.range}' of parameter '{p.name}' is neither a numeric interval nor a declared hierarchy",
                                    location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                return False
            h = h[0]
            if value not in h.codes.keys():
                issues.append(Issue(itype=IType.ERROR,
                                    description=f"The value '{value}' specified for the parameter '{p.name}' is not in the codes of the hierarchy '{p.range}': {', '.join(h.codes.keys())}",
                                    location=IssueLocation(sheet_name=sheet_name, row=row, column=None)))
                retval = False
    return retval
def execute(self, state: "State"):
    """
    Register the Parameters declared in this worksheet.

    Each row becomes a Parameter object in the global registry; rows whose
    name was already registered are skipped with a warning.

    :param state: Case-study State holding the global registry
    :return: Tuple (list of Issues, None)
    """
    issues = []
    sheet_name = self._content["command_name"]
    # Obtain global variables in state
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    for row_number, fields in enumerate(self._content["items"]):
        param_name = fields["name"]
        already_registered = glb_idx.get(Parameter.partial_key(param_name))
        if already_registered:
            issues.append(
                Issue(itype=IType.WARNING,
                      description="The parameter '" + param_name + "' has been declared previously. Skipped.",
                      location=IssueLocation(sheet_name=sheet_name, row=row_number, column=None)))
            continue

        new_param = Parameter(param_name)
        new_param._default_value = new_param._current_value = fields.get("value")
        new_param._type = fields.get("type")
        new_param._range = fields.get("domain")
        new_param._description = fields.get("description")
        new_param._group = fields.get("group")
        glb_idx.put(new_param.key(), new_param)

    return issues, None
def _add_issue(self, itype: IType, description: str) -> None:
    """Record an Issue located at the row this command is currently processing."""
    where = IssueLocation(sheet_name=self._command_name,
                          row=self._current_row_number,
                          column=None)
    self._issues.append(Issue(itype=itype, description=description, location=where))
def execute(self, state: "State"):
    """
    Register scenario parameter values and build the ProblemStatement.

    Each row assigns a value to a previously declared Parameter, either
    inside a named scenario or globally (current and default value). When
    no row failed, a ProblemStatement holding the scenarios is stored in
    the global registry (a single "default" scenario if none was named).

    :param state: Case-study State holding the global registry
    :return: Tuple (list of Issues, None)
    """
    any_error = False
    issues = []
    sheet_name = self._content["command_name"]
    # Obtain global variables in state
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    scenarios = create_dictionary()

    for row_number, fields in enumerate(self._content["items"]):
        param_name = fields["parameter"]
        scenario_name = fields.get("scenario_name")
        found = glb_idx.get(Parameter.partial_key(param_name))
        if not found:
            issues.append(Issue(itype=IType.ERROR,
                                description="The parameter '" + param_name + "' has not been declared previously.",
                                location=IssueLocation(sheet_name=sheet_name, row=row_number, column=None)))
            any_error = True
            continue
        parameter_obj = found[0]
        value = fields.get("parameter_value")
        # NOTE(review): the checker's boolean verdict is discarded here; only
        # the Issues it appends are kept — confirm this is intentional
        check_parameter_value(glb_idx, parameter_obj, value, issues, sheet_name, row_number)
        description = fields.get("description")  # For readability of the workbook. Not used for solving

        if scenario_name:
            if scenario_name in scenarios:
                scenario_params = scenarios[scenario_name]
            else:
                scenario_params = create_dictionary()
                scenarios[scenario_name] = scenario_params
            scenario_params[param_name] = value
        else:
            parameter_obj.current_value = value
            parameter_obj.default_value = value

    if not any_error:
        solver_parameters = {}  # {p.name: p.current_value for p in glb_idx.get(Parameter.partial_key()) if p.group and strcmp(p.group, "NISSolverParameters")}
        if len(scenarios) == 0:
            scenarios["default"] = create_dictionary()
        ps = ProblemStatement(solver_parameters, scenarios)
        glb_idx.put(ps.key(), ps)

    return issues, None
def transform_issues(issues: List[Union[dict, nexinfosys.Issue, tuple, Issue]],
                     cmd, sheet_number: int) -> Tuple[List[Issue], bool]:
    """
    Normalize heterogeneous issue representations into Issue objects.

    Issues may arrive as dicts, nexinfosys.Issue namedtuples, bare tuples or
    already-built Issue objects. Missing context (command type, sheet name,
    sheet number) is filled in from `cmd` and `sheet_number`.

    Fix: the return annotation was the tuple *instance* `(List[Issue], bool)`;
    it now uses the proper `Tuple[...]` type.

    :param issues: Mixed list of issue representations
    :param cmd: Command that produced the issues (may be None for issues
                raised by the commands container loop)
    :param sheet_number: Sheet number assigned when an issue lacks one
    :return: Tuple (normalized Issues, True if any of them is an ERROR)
    """
    errors_exist = False
    new_issues: List[Issue] = []
    for i in issues:
        if isinstance(i, dict):
            issue = Issue(itype=IType(i["type"]), description=i["message"], ctype=i["c_type"],
                          location=IssueLocation(sheet_name=i["sheet_name"],
                                                 sheet_number=i["sheet_number"]))
        elif isinstance(i, nexinfosys.Issue):  # namedtuple — MUST be tested before plain tuple
            issue = Issue(itype=i.type, description=i.message, ctype=i.c_type,
                          location=IssueLocation(sheet_name=i.sheet_name,
                                                 sheet_number=i.sheet_number))
        elif isinstance(i, tuple):
            issue = Issue(itype=IType(i[0]), description=i[1],
                          location=IssueLocation(sheet_name=""))
        else:  # isinstance(i, Issue):
            issue = i
        if issue.itype == IType.ERROR:
            errors_exist = True
        if not issue.ctype and cmd:  # "cmd" may be "None", in case the Issue is produced by the commands container loop
            issue.ctype = cmd._serialization_type
        if not issue.location.sheet_name or issue.location.sheet_name == "":
            # hasattr guards both missing attribute and cmd being None
            issue.location.sheet_name = cmd._source_block_name if hasattr(cmd, "_source_block_name") else ""
        if not issue.location.sheet_number:
            issue.location.sheet_number = sheet_number
        new_issues.append(issue)
    return new_issues, errors_exist
def execute(self, state: "State"):
    """
    Define datasets and their structure (concepts: dimensions, measures, attributes).

    Each worksheet row either declares a dataset (with the location of its
    data) or adds one concept to a dataset being defined. Afterwards, data
    for non-local datasets is loaded, columns are matched/renamed against the
    declared concepts, and the datasets are registered in the global state.

    :param state: Case-study State holding the global registry
    :return: Tuple (list of Issues, None)
    """
    def process_line(item):
        # Parse one row, mutating `current_ds` and `issues` from the
        # enclosing scope (also uses `name`, `r`, `glb_idx`, `ds_names`).
        # Read variables
        dsd_dataset_name = item.get("dataset_name", None)
        dsd_dataset_data_location = item.get("dataset_data_location", None)
        dsd_concept_type = item.get("concept_type", None)
        dsd_concept_name = item.get("concept_name", None)
        dsd_concept_data_type = item.get("concept_data_type", None)
        dsd_concept_domain = item.get("concept_domain", None)
        dsd_concept_description = item.get("concept_description", None)
        dsd_attributes = item.get("concept_attributes", None)
        if dsd_attributes:
            try:
                attributes = dictionary_from_key_value_list(dsd_attributes, glb_idx)
            except Exception as e:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description=str(e),
                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
        else:
            attributes = {}

        if dsd_dataset_name in ds_names:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The dataset '" + dsd_dataset_name + "' has been already defined",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            return

        # Internal dataset definitions cache
        ds = current_ds.get(dsd_dataset_name, None)
        if True:  # Statistical dataset format  # NOTE(review): leftover always-true toggle
            if not ds:
                ds = Dataset()
                ds.code = dsd_dataset_name  # Name
                ds.database = None
                ds.attributes = {}
                current_ds[dsd_dataset_name] = ds
            if not dsd_concept_type:
                # Row declares the dataset itself (its data location)
                if ds.attributes.get("_location"):
                    # NOTE(review): "Former" reads the new row's `attributes`,
                    # not `ds.attributes` — confirm which was intended
                    issues.append(
                        Issue(itype=IType.WARNING,
                              description=f"Location of data for dataset {ds.code} previously declared. "
                                          f"Former: {attributes.get('_location')}, "
                                          f"Current: {dsd_dataset_data_location}",
                              location=IssueLocation(sheet_name=name, row=r, column=None)))
                    attributes = ds.attributes
                else:
                    attributes["_dataset_first_row"] = r
                attributes["_location"] = dsd_dataset_data_location  # Location
                ds.description = dsd_concept_description
                ds.attributes = attributes  # Set attributes
            else:  # If concept_type is defined => add a concept
                # Check if the concept name already appears --> Error
                for d1 in ds.dimensions:
                    if strcmp(d1.code, dsd_concept_name):
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description=f"Concept {dsd_concept_name} already declared for dataset {ds.code}",
                                  location=IssueLocation(sheet_name=name, row=r, column=None)))
                        break
                d = Dimension()
                d.dataset = ds
                d.description = dsd_concept_description
                d.code = dsd_concept_name
                # Everything that is not a "dimension" counts as a measure
                d.is_measure = False if dsd_concept_type.lower() == "dimension" else True
                if not d.is_measure and dsd_concept_data_type.lower() == "time":
                    d.is_time = True
                else:
                    d.is_time = False
                if dsd_concept_type.lower() == "attribute":
                    attributes["_attribute"] = True
                else:
                    attributes["_attribute"] = False
                if dsd_concept_data_type.lower() == "category":
                    # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or "hierarchy_categories_command.py", no insertion is made)
                    # h = hierarchies.get(dsd_concept_domain, None)
                    h = glb_idx.get(Hierarchy.partial_key(name=dsd_concept_domain))
                    if len(h) == 0:
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description="Could not find hierarchy of Categories '" + dsd_concept_domain + "'",
                                  location=IssueLocation(sheet_name=name, row=r, column=None)))
                        return
                    elif len(h) > 1:
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description="Found more than one instance of Categories '" + dsd_concept_domain + "'",
                                  location=IssueLocation(sheet_name=name, row=r, column=None)))
                        return
                    else:  # len(h) == 1
                        h = h[0]
                    d.hierarchy = h
                    # Reencode the Hierarchy as a CodeList
                    cl = convert_hierarchy_to_code_list(h)
                    d.code_list = cl
                attributes["_datatype"] = dsd_concept_data_type
                attributes["_domain"] = dsd_concept_domain
                d.attributes = attributes

    # -------------------------------------------------------------------------------------------------------------
    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # List of available dataset names. The newly defined datasets must not be in this list
    ds_names = [ds.name for ds in datasets]

    # List of available Category hierarchies
    hierarchies = create_dictionary()
    for h in hh:
        # NOTE(review): stores the whole `hh` collection under every name,
        # not the individual `h` — confirm; `hierarchies` is unused below
        hierarchies[h.name] = hh

    # Datasets being defined in this Worksheet
    current_ds = create_dictionary()  # type: Dict[str, Dataset]

    # Process parsed information
    for line in self._content["items"]:
        r = line["_row"]
        # If the line contains a reference to a dataset or hierarchy, expand it
        # If not, process it directly
        is_expansion = False
        if is_expansion:
            pass
        else:
            process_line(line)

    # Any error?
    error = any_error_issue(issues)

    # Load the data for those datasets that are not local (data defined later in the same spreadsheet)
    for ds in current_ds.values():
        if "_location" not in ds.attributes:
            error = True
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Location of data not specified, for dataset '" + ds.code + "'",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
        else:
            loc = ds.attributes["_location"]
            ast = parser_field_parsers.string_to_ast(url_parser, loc)
            if ast["scheme"] != "data":
                df = load_dataset(loc)
                if df is None:
                    error = True
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=f"Could not obtain data for dataset '{ds.code}' at '{loc}'",
                              location=IssueLocation(sheet_name=name, row=r, column=None)))
                else:
                    iss = prepare_dataframe_after_external_read(ds, df, name)
                    issues.extend(iss)
                    # Everything ok? Store the dataframe!
                    if len(iss) == 0:
                        ds.data = df

    if not error:
        # If no error happened, add the new Datasets to the Datasets in the "global" state
        for ds in current_ds:
            # Here `ds` is a dataset NAME (dict key), not a Dataset object
            r = current_ds[ds].attributes["_dataset_first_row"]
            df = current_ds[ds].data
            if df is not None:
                # Loop over "ds" concepts.
                # - "dimension" concepts of type "string" generate a CodeHierarchy
                # - Check that the DataFrame contains ALL declared concepts. If not, generate issue
                cid = create_dictionary(data={col: col for col in df.columns})
                col_names = list(df.columns)
                for c in current_ds[ds].dimensions:
                    if c.code in df.columns:
                        col_names[df.columns.get_loc(cid[c.code])] = c.code  # Rename column
                        dsd_concept_data_type = c.attributes["_datatype"]
                        if dsd_concept_data_type.lower() == "string" and not c.is_measure:
                            # Freely defined dimension
                            cl = df[cid[c.code]].unique().tolist()
                            c.code_list = CodeList.construct(
                                c.code, c.code, [""],
                                codes=[CodeImmutable(c, c, "", []) for c in cl])
                    else:
                        # NOTE(review): `loc` here is whatever the previous loop
                        # left bound (last dataset's location) — may be stale or
                        # unbound; confirm intended value
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description=f"Concept '{c.code}' not defined for '{ds}' in {loc}",
                                  location=IssueLocation(sheet_name=name, row=r, column=None)))
                df.columns = col_names
            datasets[ds] = current_ds[ds]

    return issues, None
def parse_cmd_row_dict(cmd_name: str, row: Dict[str, str], already_parsed_fields: Set[str],
                       location: IssueLocation) -> Tuple[bool, List[Issue]]:
    """
    Parse a row (as a dictionary) from a command
    It is used after expansion of "macros"

    :param cmd_name: Name of command
    :param row: A dictionary containing the values to parse syntactically. Keys are field names, Values are field values
    :param already_parsed_fields: Set of fields already known to be syntactically valid
    :param location: IssueLocation object to use when creating Issues
    :return: A tuple: a boolean (True if the row can be used, otherwise False) and a list of Issues
    :raises ParseException: if the row contains a field not defined for the command
    """
    issues: List[Issue] = []

    from nexinfosys.command_field_definitions import command_fields

    field_defs_dict = {f.name: f for f in command_fields[cmd_name]}
    # Fields whose "mandatory" flag is a plain boolean; discarded as they are found
    mandatory_not_found = {c.name for c in command_fields[cmd_name]
                           if c.mandatory and isinstance(c.mandatory, bool)}
    # Fields whose "mandatory" flag is an expression over other fields
    complex_mandatory_cols = [c for c in command_fields[cmd_name] if isinstance(c.mandatory, str)]

    may_append = True
    complex_row = False
    for field_name, field_value in row.items():
        field_def = field_defs_dict.get(field_name)
        if not field_def:
            # Fix: the exception was previously *returned*, not raised, which
            # silently violated the declared Tuple[bool, List[Issue]] contract
            raise ParseException(f"Field {field_name} not found for command {cmd_name}")

        if field_value is not None:
            if not isinstance(field_value, str):
                field_value = str(field_value)
            field_value = field_value.strip()
        else:
            continue

        # Parse the field
        if field_def.allowed_values:
            if field_value.lower() not in [v.lower() for v in field_def.allowed_values]:  # TODO Case insensitive CI
                issues.append(
                    Issue(itype=IType.ERROR,
                          description=f"Field '{field_name}' of command '{cmd_name}' has invalid value '{field_value}'."
                                      f" Allowed values are: {', '.join(field_def.allowed_values)}.",
                          location=location))
                may_append = False
        else:
            # Instead of a list of values, check if a syntactic rule is met by the value
            if field_def.parser:  # Parse, just check syntax (do not store the AST)
                try:
                    if field_name not in already_parsed_fields:
                        ast = parser_field_parsers.string_to_ast(field_def.parser, field_value)
                        # Rules are in charge of informing if the result is expandable and if it complex
                        if "expandable" in ast and ast["expandable"]:
                            issues.append(
                                Issue(itype=IType.ERROR,
                                      description=f"Field '{field_name}' of command '{cmd_name}' cannot be expandable again.",
                                      location=location))
                            may_append = False
                        if "complex" in ast and ast["complex"]:
                            complex_row = True
                except Exception:  # Narrowed from a bare "except:"
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=f"The value in field '{field_name}' of command '{cmd_name}' "
                                          f"is not syntactically correct. Entered: {field_value}",
                              location=location))
                    may_append = False

        if field_def.name in mandatory_not_found:
            mandatory_not_found.discard(field_def.name)

    # MODIFY INPUT Dictionary with this new Key
    if complex_row:
        row["_complex"] = complex_row

    # Append if all mandatory fields have been filled
    if len(mandatory_not_found) > 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description=f"Mandatory columns: {', '.join(mandatory_not_found)} have not been specified",
                  location=location))
        may_append = False

    # Check varying mandatory fields (fields depending on the value of other fields)
    for c in complex_mandatory_cols:
        field_def = c.name  # next(c2 for c2 in col_map if strcmp(c.name, c2.name))
        if isinstance(c.mandatory, str):
            # Evaluate. NOTE: eval() runs a definition-provided expression, not
            # end-user input — flagged, not replaced
            mandatory = eval(c.mandatory, None, row)
            # Fix: combine with the accumulated verdict instead of overwriting
            # earlier failures (previously a passing last column reset it)
            may_append = may_append and ((mandatory and field_def in row) or (not mandatory))
            if mandatory and field_def not in row:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="Mandatory column: " + field_def + " has not been specified",
                          location=location))

    return may_append, issues
def execute(self, state: "State"):
    """
    Register hierarchy mappings declared in this worksheet.

    Each row maps a source code (optionally from a dataset dimension) into a
    destination code with a weight. Rows are accumulated per mapping name in
    `local_mappings`, then converted into Mapping objects stored in the
    case-study `mappings` registry.

    :param state: Case-study State holding the global registry
    :return: Tuple (list of Issues, None)
    """
    def process_line(item):
        # Read variables
        mh_src_dataset = item.get("source_dataset", None)
        mh_src_hierarchy = item.get("source_hierarchy", None)
        mh_src_code = item.get("source_code", None)
        mh_dst_hierarchy = item.get("destination_hierarchy", None)
        mh_dst_code = item.get("destination_code", None)
        mh_weight = item.get("weight", 1.0)

        # Mapping name. (Fix: this assignment previously shadowed the outer
        # "name" — the worksheet name — so Issues reported the mapping name
        # as their sheet_name)
        map_name = ((mh_src_dataset + ".") if mh_src_dataset else "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy
        if map_name in mappings:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The mapping '" + map_name + "' has been declared previously. Skipped.",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            return

        # Get (or lazily create) the accumulator entry for this mapping
        if map_name in local_mappings:
            d = local_mappings[map_name]
        else:
            d = DottedDict()
            local_mappings[map_name] = d
            d.name = map_name
            d.origin_dataset = mh_src_dataset
            d.origin_hierarchy = mh_src_hierarchy
            d.destination_hierarchy = mh_dst_hierarchy
            d.mapping = create_dictionary()

        # Specific code
        if mh_src_code in d.mapping:
            to_dict = d.mapping[mh_src_code]
        else:
            to_dict = create_dictionary()
        if mh_dst_code in to_dict:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The mapping of '" + mh_src_code + "' into '" + mh_dst_code + "' has been already defined",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
        to_dict[mh_dst_code] = (mh_weight, r)  # NOTE: This could be an object instead of just a FLOAT or expression
        d.mapping[mh_src_code] = to_dict

    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    local_mappings = create_dictionary()

    # Process parsed information
    for line in self._content["items"]:
        r = line["_row"]
        # If the line contains a reference to a dataset or hierarchy, expand it
        # If not, process it directly
        is_expansion = False
        if is_expansion:
            # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items
            pass
        else:
            process_line(line)

    # Mappings post-processing
    for d in local_mappings:
        # Convert the mapping into:
        # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
        mapping = []
        ds_rows = []  # Rows in which a dataset is mentioned
        for orig in local_mappings[d].mapping:
            lst = []
            for dst in local_mappings[d].mapping[orig]:
                t = local_mappings[d].mapping[orig][dst]
                lst.append(dict(d=dst, w=t[0]))
                if local_mappings[d].origin_dataset:
                    ds_rows.append(t[1])
            mapping.append(dict(o=orig, to=lst))
        from nexinfosys.ie_imports.data_source_manager import DataSourceManager
        if local_mappings[d].origin_dataset:
            if not DataSourceManager.obtain_dataset_source(local_mappings[d].origin_dataset, datasets):
                for r in ds_rows:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=f"The dataset '{local_mappings[d].origin_dataset}' was not found",
                              location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            dims, attrs, meas = obtain_dataset_metadata(local_mappings[d].origin_dataset, None, datasets)
            if local_mappings[d].origin_hierarchy not in dims:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The origin dimension '" + local_mappings[d].origin_hierarchy + "' does not exist in dataset '" + local_mappings[d].origin_dataset + "'",
                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            else:
                dim = dims[local_mappings[d].origin_hierarchy]
                mapping = fill_map_with_all_origin_categories(dim, mapping)
        # Fix: this assignment was commented out, leaving "origin_dataset"
        # undefined (NameError) when building the Mapping below
        origin_dataset = local_mappings[d].origin_dataset
        origin_hierarchy = local_mappings[d].origin_hierarchy
        destination_hierarchy = local_mappings[d].destination_hierarchy
        # Create Mapping and add it to Case Study mappings variable
        mappings[d] = Mapping(d,
                              DataSourceManager.obtain_dataset_source(origin_dataset, datasets),
                              origin_dataset, origin_hierarchy, destination_hierarchy, mapping)

    # TODO
    # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns"
    # Put it to work !!!
    # One or more mapping in sequence could be specified?. The key is "source hierarchy+dest hierarchy"
    # Read mapping parameters

    return issues, None
def execute(self, state: "State"):
    """ Create a Hierarchy of Taxon. The exact form of this hierarchy is different depending on the concept:
    * FactorTypes and Categories use Hierarchies, which are intrinsic.
        The hierarchy name is passed to the containing Hierarchy object
    * Processors use Part-Of Relations. In this case, the hierarchy name is lost
    Names of Processor and FactorTypes are built both in hierarchical and simple form
    The hierarchical is all the ancestors from root down to the current node, separated by "."
    The simple name is just the current node. If there is already another concept with that name, the simple name
    is not stored (STORE BOTH CONCEPTS by the same name, and design some tie breaking mechanism??)
    """
    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # Process parsed information
    for item in self._content["items"]:
        r = item["_row"]
        # HierarchySource (Optional)
        hsource = item.get("source", None)  # Code of entity defining the Hierarchy
        if hsource:
            tmp = hsource
            hsource = glb_idx.get(HierarchySource.partial_key(name=hsource))
            if len(hsource) == 0:
                # Not registered yet: create and store it
                hsource = HierarchySource(name=tmp)
                glb_idx.put(hsource.key(), hsource)
            else:
                hsource = hsource[0]

        hname = item.get("hierarchy_name", None)
        if not hname:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The name of the Hierarchy has not been defined. Skipped.",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            continue

        # HierarchyGroup (equivalent to Hierarchy of Code Lists, HCL)
        hg = item.get("hierarchy_group", None)
        if hg:
            is_code_list = False  # Hierarchy group
        else:
            is_code_list = True  # Hierarchy group for the Code List, with the same name
            hg = hname

        # Check if the HierarchyGroup is previously defined. YES, use it; NO, create new HierarchyGroup
        tmp = hg
        hg = glb_idx.get(HierarchyGroup.partial_key(name=hg))
        if len(hg) == 0:
            hg = HierarchyGroup(name=tmp, source=hsource)
            glb_idx.put(hg.key(), hg)
        else:
            hg = hg[0]

        # Check if the Hierarchy is defined. YES, get it; NO, create it
        tmp = hname
        h = glb_idx.get(Hierarchy.partial_key(name=hname))
        if len(h) == 0:
            h = Hierarchy(name=tmp)
            glb_idx.put(h.key(), h)
            glb_idx.put(h.key(hg.name + "." + h.name), h)  # Register with alternative (full) name
        else:
            h = h[0]

        # Add the Hierarchy to the HierarchyGroup (if not)
        if h not in hg.hierarchies:
            hg.hierarchies.append(h)

        # Level
        level = item.get("level", None)
        if level:
            # Check if the level is defined. YES, get it; NO, create it
            # (for/else: the else branch runs only if no existing level matched)
            for l in h.levels:
                if strcmp(l.name, level):
                    level = l
                    break
            else:
                level = HierarchyLevel(name=level, hierarchy=h)
                h.levels.append(level)

        code = item.get("code", None)
        label = item.get("label", None)
        description = item.get("description", None)
        attributes = item.get("attributes", None)
        expression = item.get("expression", None)

        # Parent property (what really defines Hierarchies)
        parent_code = item.get("parent_code", None)
        if parent_code:
            ph = h  # Parent Hierarchy is the same as current hierarchy
            pcode = ph.codes.get(parent_code, None)
            if not pcode:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="Could not find code '" + parent_code + "' in hierarchy '" + ph.name + "'. Skipped.",
                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
        else:
            pcode = None

        # ReferredHierarchy. If we are not defining a Code List, the base hierarchy has to be mentioned
        if not is_code_list:
            ref_hierarchy = item.get("referred_hierarchy", None)
            if not ref_hierarchy:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="For HCLs, defining ReferredHierarchy is mandatory",
                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue

            tmp = ref_hierarchy
            ref_hierarchy = glb_idx.get(Hierarchy.partial_key(name=ref_hierarchy))
            if len(ref_hierarchy) == 0:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="ReferredHierarchy '" + tmp + "' not defined previously",
                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            else:
                ref_hierarchy = ref_hierarchy[0]

            ref_code = ref_hierarchy.codes.get(code, None)
            if not ref_code:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="Code '" + code + "' not found in referred hierarchy '" + ref_hierarchy.name + "'",
                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue

            # Ignore: LABEL, DESCRIPTION. Copy them from referred code
            label = ref_code.label
            description = ref_code.description
        else:
            ref_code = None

        c = h.codes.get(code, None)
        if c:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Code '" + code + "' in hierarchy '" + h.name + "' redefined.",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            continue

        # Finally, create the HierarchyCode with all the gathered attributes, then weave it to other
        # (name, label=None, description=None, referred_node=None, parent=None, parent_weight=1.0, hierarchy=None)
        c = Taxon(name=code,
                  hierarchy=h,
                  level=level,
                  referred_taxon=ref_code,
                  parent=pcode,
                  label=label,
                  description=description,
                  attributes=attributes,
                  expression=expression)

        # Add code to hierarchy
        h.codes[code] = c
        if not c.parent:
            h.roots_append(c)
        # Add code to level
        if level:
            level.codes.add(c)
        # Add child to parent code
        # (DONE BY THE CONSTRUCTOR!!)
        # if pcode:
        #     pcode.children_codes.append(c)

    return issues, None  # Issues, Output
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh: Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param name: Name of the worksheet (used for issue locations)
    :param state: Case study state, providing the registry (datasets, mappings, ...)
    :return: (issues, None, content) -- the command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values (None cells skipped, strings stripped)
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    if dataset_name is None:
        issues.append(
            Issue(itype=IType.ERROR,
                  description=f"The name of the input dataset must be specified under column 'InputDataset'. Skipping {name} command",
                  location=IssueLocation(sheet_name=name, row=None, column=None)))
        return issues, None, None

    # Obtain the source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={k: None for k in dims[d].code_list.keys()})  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = create_dictionary(
                data={to["d"]: None for o in mappings[m].map for to in o["to"] if to["d"]})
            cl[mappings[m].destination] = tmp

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    out_dims = []
    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The dimension specified for output, '" + d +
                                          "' is neither a dataset dimension nor a mapped dimension. [" +
                                          ', '.join([d2 for d2 in cl]) + "]",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures", "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    # FIX: list the measures available in the dataset ("meas").
                    # The original joined "out_measures.values" without calling it (TypeError)
                    # and would have listed the output measures, not the available ones.
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The specified measure, '" + m +
                                          "' is not a measure available in the dataset. [" +
                                          ', '.join([m2 for m2 in meas]) + "]",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in ["resultmeasuresaggregation", "resultmeasuresaggregator", "aggregation"]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The specified aggregation function, '" + f +
                                          "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in ["resultmeasurename", "resultmeasuresnames", "resultmeasuresas", "measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The code '" + cd +
                                          "' is not present in the codes declared for dimension '" +
                                          col_name + "'. Please, check them.",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "starttime", "endperiod", "endtime"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                if col_name.lower() == "starttime":
                    col_name = "StartPeriod"
                elif col_name.lower() == "endtime":
                    col_name = "EndPeriod"
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["outputdatasetname", "outputdataset", "result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except Exception:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Column '" + col_name + "' has an invalid dataset name '" + result_name + "'",
                              location=IssueLocation(sheet_name=name, row=2, column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The aggregation function must be defined in the first row",
                      location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if measure and not agg_func or not measure and agg_func:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Each measure must be associated with an aggregation function",
                      location=IssueLocation(sheet_name=name, row=r, column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [v["measure_as"] for v in out_measures.values() if v["measure_as"]]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name, row=1, column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="There must be one aggregation function (used for all measures) or one aggregation per measure",
                  location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        # FIX: the original used "column=c + 1" with "c" left over from the column scan
        # loop above, pointing at an unrelated column; no specific column applies here
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No result name specified. Assuming '" + result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2, column=None)))

    content = {"dataset_source": source,
               "dataset_name": dataset_name,
               "dataset_datetime": available_at_datetime,
               "where": filter_,
               "dimensions": [d for d in dims],
               "group_by": out_dims,
               "measures": measures,
               "agg_funcs": agg_funcs,
               "measures_as": measures_as,
               "result_name": result_name}
    return issues, None, content
def check_columns(sh, name: str, area: Tuple, cols: List[CommandField], command_name: str, ignore_not_found=True):
    """
    When parsing of a command starts, check columns

    Try to match each column with declared column fields. If a column is not declared, raise an error (or ignore it)
    If mandatory columns are not found, raise an error

    :param sh: The worksheet being analyzed
    :param name: The name of the worksheet
    :param area: Area inside the worksheet that will be scanned
    :param cols: List of CommandField
    :param command_name: A string with the name of the command
    :param ignore_not_found: True if a column not matching declared ones has to be ignored, False if an error has
           to be raised in this case
    :return: The map column name to column index (or indices for multiply declared columns); The issues found
    """
    issues: List[Issue] = []
    # Names of mandatory fields for which no matching column has been seen yet
    pending_mandatory = {f.name for f in cols if f.mandatory}
    # Map: CommandField -> list of (column name, column index) tuples
    field_columns = {}

    for col_idx in range(area[2], area[3]):  # Scan each column of the Header Row
        ##val = sh.get((area[0], c), None)
        cell_value = sh.cell(row=area[0], column=col_idx).value
        if not cell_value:
            continue
        header = cell_value.strip()

        # First CommandField whose "regex_allowed_names" accepts this header (None if no match)
        field = next((f for f in cols if f.regex_allowed_names.match(header)), None)

        if field is None:
            # The header does not correspond to any declared column
            issues.append(
                Issue(itype=IType.WARNING if ignore_not_found else IType.ERROR,
                      description=f"In Header row, the column name '{header}' does not match any of the "
                                  f"allowed column names (internal command '{command_name}')",
                      location=IssueLocation(sheet_name=name, row=1, column=col_idx)))
            continue

        # In case of use of "@", remove prefix
        if "@" in header:
            header = header[header.index("@") + 1:]

        # Column Name to Column Index
        if field.many_appearances:
            # Column appears one or more times
            field_columns.setdefault(field, []).append((header, col_idx))
        else:
            # Column appears once
            if field in field_columns:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The column '" + field.name + "' should not appear more than one time",
                          location=IssueLocation(sheet_name=name, row=1, column=col_idx)))
            field_columns[field] = [(header, col_idx)]

        # Mandatory found (good)
        pending_mandatory.discard(field.name)

    if len(pending_mandatory) > 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="In Header row, mandatory columns: " + ", ".join(pending_mandatory) +
                              " have not been specified",
                  location=IssueLocation(sheet_name=name, row=1, column=None)))

    return field_columns, issues
def execute(self, state: "State"):
    """
    Attach the data parsed from a "DatasetData" worksheet (JSON-serialized DataFrames in
    self._content["items"]) to Datasets whose metadata was declared previously.

    :param state: Case study state, providing the registry of datasets
    :return: (issues, None)
    """
    issues = []

    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # List of available dataset names. The newly defined datasets must not be in this list
    ds_names = [ds.code for ds in datasets.values()]

    # List of datasets with local worksheet name
    # (a "_location" of the form "data://#<worksheet>" points to a worksheet in this workbook)
    external_dataset_names = []
    for ds in datasets.values():
        if ds.attributes["_location"].lower().startswith("data://#"):
            worksheet = ds.attributes["_location"][len("data://#"):]
            if not worksheet.lower().startswith("datasetdata "):
                worksheet = "DatasetData " + worksheet
            if strcmp(worksheet, name):
                external_dataset_names.append(ds.code)

    # Process parsed information
    for r, line in enumerate(self._content["items"]):
        # A dataset
        dataset_names = line["name"]
        if dataset_names == "":
            # No explicit name in the worksheet: fall back to datasets pointing here
            if external_dataset_names:
                dataset_names = external_dataset_names
            else:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The column name 'DatasetName' was not defined for command 'DatasetData' and there is no 'location' in a DatasetDef command pointing to it",
                          location=IssueLocation(sheet_name=name, row=1, column=None)))
        else:
            dataset_names = [dataset_names]

        # Find it in the already available datasets. MUST EXIST
        for n in ds_names:
            for dataset_name in dataset_names:
                if strcmp(dataset_name, n):
                    df = pd.read_json(StringIO(line["values"]), orient="split")
                    # Check columns
                    ds = datasets[n]
                    iss = prepare_dataframe_after_external_read(ds, df, name)
                    issues.extend(iss)
                    # Everything ok? Store the dataframe!
                    if not any_error_issue(iss):
                        # NOTE(review): "r" is rebound here from the item index to the dataset's
                        # first worksheet row, so later issue locations point at the sheet — confirm intended
                        r = ds.attributes["_dataset_first_row"]
                        # Loop over "ds" concepts.
                        # - "dimension" concepts of type "string" generate a CodeHierarchy
                        # - Check that the DataFrame contains ALL declared concepts. If not, generate issue
                        # dims = translate_case([d.code for d in ds.dimensions], df.columns)
                        cid = create_dictionary(data={col: col for col in df.columns})
                        col_names = list(df.columns)
                        for c in ds.dimensions:
                            if c.code in cid:
                                col_names[df.columns.get_loc(cid[c.code])] = c.code  # Rename column
                                dsd_concept_data_type = c.attributes["_datatype"]
                                if dsd_concept_data_type.lower() == "string" and not c.is_measure:  # Freely defined dimension
                                    # Build a code list from the distinct values of the column
                                    # NOTE(review): the comprehension variable "c" below shadows the
                                    # Dimension loop variable inside the comprehension only (Python 3 scoping)
                                    cl = df[cid[c.code]].unique().tolist()
                                    c.code_list = CodeList.construct(
                                        c.code, c.code, [""],
                                        codes=[CodeImmutable(c, c, "", []) for c in cl])
                            else:
                                issues.append(
                                    Issue(itype=IType.ERROR,
                                          description=f"Concept '{c.code}' not defined for '{ds.code}'",
                                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                        df.columns = col_names
                        ds.data = df
                    dataset_names.remove(dataset_name)
                    break

        # Any name left unmatched means its metadata was never declared
        if dataset_names:
            issues.append(
                Issue(itype=IType.ERROR,
                      description=f"Metadata for the datasets: {','.join(dataset_names)}, must be defined previously",
                      location=IssueLocation(sheet_name=name, row=-1, column=-1)))

    return issues, None
def process_line(item):
    """
    Process one parsed DatasetDef row ("item").

    A row without ConceptType declares the dataset itself (data location, description);
    a row with ConceptType adds a concept (dimension / measure / attribute) to the
    dataset being defined. Appends to the enclosing "issues" list and updates the
    enclosing "current_ds" cache; returns early (None) on error.
    """
    # Read variables
    dsd_dataset_name = item.get("dataset_name", None)
    dsd_dataset_data_location = item.get("dataset_data_location", None)
    dsd_concept_type = item.get("concept_type", None)
    dsd_concept_name = item.get("concept_name", None)
    dsd_concept_data_type = item.get("concept_data_type", None)
    dsd_concept_domain = item.get("concept_domain", None)
    dsd_concept_description = item.get("concept_description", None)
    dsd_attributes = item.get("concept_attributes", None)

    # Parse the "attributes" key-value list, if present
    if dsd_attributes:
        try:
            attributes = dictionary_from_key_value_list(dsd_attributes, glb_idx)
        except Exception as e:
            issues.append(Issue(itype=IType.ERROR, description=str(e),
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
    else:
        attributes = {}

    # The dataset must not exist already in the registry
    if dsd_dataset_name in ds_names:
        issues.append(Issue(itype=IType.ERROR,
                            description="The dataset '" + dsd_dataset_name + "' has been already defined",
                            location=IssueLocation(sheet_name=name, row=r, column=None)))
        return

    # Internal dataset definitions cache (statistical dataset format)
    ds = current_ds.get(dsd_dataset_name, None)
    if not ds:
        ds = Dataset()
        ds.code = dsd_dataset_name  # Name
        ds.database = None
        ds.attributes = {}
        current_ds[dsd_dataset_name] = ds

    if not dsd_concept_type:
        # No ConceptType: the row declares the dataset itself (location of its data)
        if ds.attributes.get("_location"):
            # FIX: report the PREVIOUS location from ds.attributes; the original read the
            # fresh row-level "attributes" dict, which cannot contain "_location" yet
            issues.append(Issue(itype=IType.WARNING,
                                description=f"Location of data for dataset {ds.code} previously declared. "
                                            f"Former: {ds.attributes.get('_location')}, "
                                            f"Current: {dsd_dataset_data_location}",
                                location=IssueLocation(sheet_name=name, row=r, column=None)))
            attributes = ds.attributes
        else:
            attributes["_dataset_first_row"] = r
            attributes["_location"] = dsd_dataset_data_location  # Location
        ds.description = dsd_concept_description
        ds.attributes = attributes  # Set attributes
    else:
        # If concept_type is defined => add a concept
        # Check if the concept name already appears --> Error
        for d1 in ds.dimensions:
            if strcmp(d1.code, dsd_concept_name):
                issues.append(Issue(itype=IType.ERROR,
                                    description=f"Concept {dsd_concept_name} already declared for dataset {ds.code}",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                # NOTE(review): execution continues and the concept is re-declared below,
                # matching the original behavior — confirm whether an early return is intended
                break

        d = Dimension()
        d.dataset = ds
        d.description = dsd_concept_description
        d.code = dsd_concept_name
        # Anything that is not a "dimension" (measure, attribute) counts as a measure here
        d.is_measure = dsd_concept_type.lower() != "dimension"
        # Only non-measure concepts of data type "time" are time dimensions
        d.is_time = not d.is_measure and dsd_concept_data_type.lower() == "time"
        attributes["_attribute"] = dsd_concept_type.lower() == "attribute"
        if dsd_concept_data_type.lower() == "category":
            # TODO "hierarchies" variable really does not register hierarchies (see "hierarchy_command.py" or
            #  "hierarchy_categories_command.py", no insertion is made)
            # h = hierarchies.get(dsd_concept_domain, None)
            h = glb_idx.get(Hierarchy.partial_key(name=dsd_concept_domain))
            if len(h) == 0:
                issues.append(Issue(itype=IType.ERROR,
                                    description="Could not find hierarchy of Categories '" + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
            elif len(h) > 1:
                issues.append(Issue(itype=IType.ERROR,
                                    description="Found more than one instance of Categories '" + dsd_concept_domain + "'",
                                    location=IssueLocation(sheet_name=name, row=r, column=None)))
                return
            else:  # len(h) == 1
                h = h[0]
            d.hierarchy = h
            # Reencode the Hierarchy as a CodeList
            cl = convert_hierarchy_to_code_list(h)
            d.code_list = cl
        attributes["_datatype"] = dsd_concept_data_type
        attributes["_domain"] = dsd_concept_domain
        d.attributes = attributes
def analyze_and_complete(self, structural_graph: Optional[nx.DiGraph] = None) -> List[Issue]:
    """
    It analyzes the flow graph and completes it with inferrable data.

    First, it checks if Flow Graph is DAG (Directed Acyclic Graph), this is, no cycles exist
    Then, it follows these steps:

    How many output edges without weight has the node?
    * More than 1: missing weights (WARNING)
    * Only 1: how many output edges in total?
      * Only 1: is there an opposite weight?
        * Yes: weight can be inferred as (1.0 / opposite weight) (INFO)
        * No: weight can be inferred as 1.0 (INFO)
      * More than 1: compute sum of other edges with weight
        * sum > 1: weight cannot be inferred (WARNING)
        * sum <= 1: weight can be inferred as (1 - sum) (INFO)

    :param structural_graph: optional structural graph; if it has an edge (either direction)
                             matching a weightless single output edge, the 1.0 default is NOT applied
    :return: a list of messages given by the analysis and completion of type INFO, WARNING or ERROR
    """
    issues: List[Issue] = []

    # Checking if graph is acyclic. Just looking at the direct graph is OK.
    # if not nx.algorithms.dag.is_directed_acyclic_graph(self._direct_graph):
    #     print("Cycles detected. The nodes connected in a cycle are: ")
    #     for n in nx.algorithms.cycles.simple_cycles(self._direct_graph):
    #         print(n)
    #     issues.append(Issue(IType.ERROR, 'The graph contains cycles'))
    #     return issues

    # Run the same inference in both directions: over the direct graph using the reverse
    # graph for "opposite" weights, and vice versa
    for graph, opposite_graph in [(self._direct_graph, self._reverse_graph),
                                  (self._reverse_graph, self._direct_graph)]:
        for n in nx.dfs_preorder_nodes(graph):
            graph.nodes[n]['split'] = False
            # Working on output edges only of node 'n'
            all_edges = graph.out_edges(n, data=True)
            if len(all_edges) == 0:
                continue

            # How many output edges without weight has the node?
            edges_without_weight = [e for e in all_edges if e[2]['weight'] is None]
            if len(edges_without_weight) > 1:
                # More than one weightless output edge: nothing can be inferred
                str_edges = [f'({e[0]}, {e[1]})' for e in edges_without_weight]
                issues.append(
                    Issue(IType.WARNING,
                          f'The following edges don\'t have a weight: {", ".join(str_edges)}'))
            elif len(edges_without_weight) == 1:
                if len(all_edges) == 1:
                    # The single output edge lacks a weight: try the opposite-direction edge
                    edge = list(all_edges)[0]
                    u, v, data = edge
                    opposite_weight = opposite_graph[v][u]['weight']
                    if opposite_weight is not None:
                        # Invert the opposite weight (0.0 stays 0.0, avoiding division by zero)
                        data['weight'] = Weight(0.0) if opposite_weight == 0.0 else (Weight(1.0) / opposite_weight)
                        issues.append(
                            Issue(IType.INFO,
                                  f'The weight of single output edge "{edge}" could be inferred from '
                                  f'opposite weight "{opposite_weight}"'))
                    elif structural_graph and (structural_graph.has_edge(u, v) or structural_graph.has_edge(v, u)):
                        # A structural edge exists, so do not default the weight to 1.0
                        issues.append(
                            Issue(IType.WARNING,
                                  f'The weight of single output edge "{edge}" will not be inferred '
                                  f'because a structural edge exist'))
                    else:
                        data['weight'] = Weight(1.0)
                        issues.append(
                            Issue(IType.INFO,
                                  f'The weight of single output edge "{edge}" could be inferred '
                                  f'without opposite weight'))
                else:
                    # Several output edges, exactly one weightless: complete with (1 - sum of the rest)
                    # (the list is non-empty here: all_edges > 1 and only one edge lacks a weight)
                    sum_other_weights = reduce(add, [
                        e[2]['weight'] for e in all_edges if e[2]['weight'] is not None
                    ])
                    if sum_other_weights > 1.0:
                        issues.append(
                            Issue(IType.WARNING,
                                  f'The weight of edge "{edges_without_weight[0]}" cannot be inferred, '
                                  f'the sum of other weights is >= 1.0: {sum_other_weights}'))
                    else:
                        edges_without_weight[0][2]['weight'] = Weight(1.0) - sum_other_weights
                        graph.nodes[n]['split'] = True
                        issues.append(
                            Issue(IType.INFO,
                                  f'The weight of edge "{edges_without_weight[0]}" could be inferred '
                                  f'from the sum of other weights'))
            elif len(all_edges) > 1:  # All edges have a weight
                # Mark as "split" a node whose outgoing weights distribute exactly to 1.0
                sum_all_weights = reduce(add, [e[2]['weight'] for e in all_edges])
                if math.isclose(sum_all_weights, 1.0):
                    graph.nodes[n]['split'] = True

    return issues
def parse_dataset_data_command(sh: Worksheet, area: AreaTupleType, name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh: Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param name: Name of the worksheet (used for issue locations)
    :param state: Case study state (unused here, kept for interface uniformity)
    :return: (issues, None, content) -- one JSON-serialized DataFrame per dataset
    """
    issues: List[Issue] = []

    # Analyze column names
    col_map = create_dictionary()
    for c in range(area[2], area[3]):
        cell_value = sh.cell(row=area[0], column=c).value
        # FIX: the original called .strip() directly on the cell value and crashed
        # (AttributeError) on empty or non-string header cells
        if cell_value is None:
            continue
        col_name = str(cell_value).strip()
        # Avoid repetitions
        if col_name in col_map:
            issues.append(Issue(itype=IType.ERROR,
                                description="The column name '" + col_name + "' is repeated",
                                location=IssueLocation(sheet_name=name, row=1, column=c)))

        if strcmp(col_name, "DatasetName") or strcmp(col_name, "Dataset"):
            col_map["dataset"] = c
        elif col_name:  # Concept name
            col_map[col_name] = c

    if any([i.itype == IType.ERROR for i in issues]):
        return issues, None, None

    # Read all the content into a list of lists
    lines = []
    for r in range(area[0] + 1, area[1]):
        line = []
        for col_name, c in col_map.items():
            v = sh.cell(row=r, column=c).value
            if isinstance(v, str):
                v = v.strip()
            line.append(v)
        lines.append(line)

    # pd.DataFrame
    df = pd.DataFrame(columns=[col_name for col_name in col_map], data=lines)

    content = []  # The output JSON
    if "dataset" in df:
        # Find the different datasets (case-insensitive)
        dataset_names = set([d.lower() for d in df["dataset"].unique()])
        for dataset in dataset_names:
            # Obtain filtered rows for this dataset, without the "dataset" column.
            # FIX: use .drop() instead of "del" on a .loc slice, which mutated a view
            # of the original DataFrame (chained-assignment hazard)
            df2 = df.loc[df['dataset'].str.lower() == dataset].drop(columns=["dataset"])
            # Convert to JSON and store in content
            s = StringIO()
            df2.to_json(s, orient="split")
            content.append(dict(name=dataset, values=s.getvalue()))
    else:
        # No dataset column: a single anonymous dataset with all the rows
        s = StringIO()
        df.to_json(s, orient="split")
        content.append(dict(name="", values=s.getvalue()))

    return issues, None, dict(items=content, command_name=name)
def parse_command_in_worksheet(sh: Worksheet, area: AreaTupleType, name: Optional[str],
                               cmd_name: str) -> IssuesLabelContentTripleType:
    """
    Parse command in general
    Generate a JSON
    Generate a list of issues

    :param sh: Worksheet to read
    :param area: Area of the worksheet
    :param name: Name of the worksheet
    :param cmd_name: Name of the command. Key to access "command_fields" variable. Also, shown in issue descriptions
    :return: issues List, None, content (JSON)
    """
    def check_expandable(v, location):
        """
        Check if curly braces match, that what is inside is syntactically correct, (and that the value exists)

        :param v: value possibly containing "{...}" macro expansions
        :param location: IssueLocation used for issues generated here
        :return: set of the names found inside the curly braces
        """
        import re
        reg = re.compile(r"{.*?}")
        matches = reg.findall(v)
        output = set()
        if len(matches) == 0:
            issues.append(
                Issue(itype=IType.ERROR,
                      description=f"Incorrect syntax, no macro expansion found",
                      location=location))
        else:
            for m in matches:
                h_name = m[1:-1]
                try:
                    parser_field_parsers.string_to_ast(arith_boolean_expression, h_name)  # simple_h_name
                    output.add(h_name)
                except Exception:  # FIX: narrowed from a bare "except:"
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=f"The value {m[1:-1]} is not a valid hierarchical name",
                              location=location))
        return output

    def commented_row(rn):
        # A row is commented if the value in its first column starts with "#"
        # FIX: read the row given by the parameter "rn" (the original read the
        # enclosing loop variable "r", leaving "rn" unused)
        commented = False
        v = sh.cell(row=rn, column=1).value
        if v is not None:
            if str(v).startswith("#"):
                commented = True
        return commented

    issues: List[Issue] = []

    from nexinfosys.command_field_definitions import command_fields

    cols = command_fields[cmd_name]  # List of CommandField that will guide the parsing
    col_map, local_issues = check_columns(sh, name, area, cols, cmd_name)
    if any([i.itype == IType.ERROR for i in local_issues]):
        return local_issues, None, None
    issues.extend(local_issues)

    # The "mandatoriness" of a field may depend on values in other fields (like in RefBibliographic command fields)
    # Elaborate a list of fields having this "complex" mandatory property
    complex_mandatory_cols = [c for c in cols if isinstance(c.mandatory, str)]

    content = []  # The output JSON
    # Parse each Row
    for r in range(area[0] + 1, area[1]):
        line = {}
        expandable = set()  # A set of variables to be expanded. If empty, it is a literal line (not expandable)
        complex = False  # The line contains at least one field with a complex rule (which cannot be evaluated with a simple cast)

        # A row is commented if the value in the first column starts with "#" (a first empty column could be inserted
        # to ease this, just to signal commented rows)
        if commented_row(r):
            continue

        # Constant mandatory values
        mandatory_not_found = set([c.name for c in cols if c.mandatory and isinstance(c.mandatory, bool)])

        # Each "field"
        for field_def in col_map.keys():
            field_name = field_def.name
            field_defined = False
            # Appearances of field (normally just once, there are attributes allowing more than one appearance)
            for col_name, col_idx in col_map[field_def]:
                # Read and prepare "value"
                value = sh.cell(row=r, column=col_idx).value
                if value is None:
                    continue
                if isinstance(value, float):
                    # Render integral floats without the trailing ".0"
                    if value == int(value):
                        value = str(int(value))
                    else:
                        value = str(value)
                elif not isinstance(value, str):
                    value = str(value)
                value = value.strip()
                field_defined = True

                # Check if value contains "{", expansion
                if "{" in value:
                    # Expandable. Do not parse now. Check: curly pairs, and that what is between is a
                    # simple_h_name and that it exists: as dataset
                    expandable.update(
                        check_expandable(value, IssueLocation(sheet_name=name, row=r, column=col_idx)))
                    # With many appearances, just a "Key-Value list" syntax is permitted
                    if field_def.many_appearances:
                        if field_name in line:
                            line[field_name] += ", " + col_name + "='" + value + "'"
                        else:
                            line[field_name] = col_name + "='" + value + "'"
                    else:
                        if field_name in line:
                            line[field_name] += ", " + value
                        else:
                            line[field_name] = value  # Store the value
                elif field_def.allowed_values:  # If the CommandField checks for a list of allowed values
                    allowed_values_dict: Dict[str, str] = {v.lower(): v for v in field_def.allowed_values}
                    if value.lower() not in allowed_values_dict:  # TODO Case insensitive CI
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description=f"Field '{col_name}' of command '{cmd_name}' has invalid category "
                                              f"'{value}'. Allowed values are: {', '.join(field_def.allowed_values)}.",
                                  location=IssueLocation(sheet_name=name, row=r, column=col_idx)))
                    else:  # Use case from allowed values
                        line[field_name] = allowed_values_dict[value.lower()]
                elif field_def.parser:  # Parse, just check syntax (do not store the AST)
                    try:
                        standalone_attribute_value = "@" in field_def.allowed_names[0]
                        if not standalone_attribute_value:
                            ast = parser_field_parsers.string_to_ast(field_def.parser, value)
                        else:
                            try:
                                ast = parser_field_parsers.string_to_ast(field_def.parser, value)
                            except Exception:
                                ast = parser_field_parsers.string_to_ast(unquoted_string, value)
                        # Rules are in charge of informing if the result is expandable and if it complex
                        if "expandable" in ast and ast["expandable"]:
                            # FIX: the original referenced "col_header" before assignment here
                            # (a NameError, silently masked by the outer except clause)
                            col_header = sh.cell(row=1, column=col_idx).value
                            issues.append(
                                Issue(itype=IType.ERROR,
                                      description=f"The value in field '{col_header}' of command "
                                                  f"'{cmd_name}' should not be expandable. Entered: {value}",
                                      location=IssueLocation(sheet_name=name, row=r, column=col_idx)))
                        if "complex" in ast and ast["complex"]:
                            complex = True

                        # With many appearances, just a "Key-Value list" syntax is permitted
                        if field_def.many_appearances:
                            if field_name in line:
                                line[field_name] += ", " + col_name + "='" + value + "'"
                            else:
                                line[field_name] = col_name + "='" + value + "'"
                        else:
                            if field_name in line:
                                line[field_name] += ", " + value
                            else:
                                line[field_name] = value  # Store the value
                    except Exception:  # FIX: narrowed from a bare "except:"
                        import traceback
                        traceback.print_exc()
                        col_header = sh.cell(row=1, column=col_idx).value
                        issues.append(
                            Issue(itype=IType.ERROR,
                                  description=f"The value in field '{col_header}' of command "
                                              f"'{cmd_name}' is not syntactically correct. Entered: {value}",
                                  location=IssueLocation(sheet_name=name, row=r, column=col_idx)))
                else:
                    line[field_name] = value  # No parser, just store blindly the value

            if field_defined and field_def.name in mandatory_not_found:
                mandatory_not_found.discard(field_def.name)

        if len(line) == 0:
            continue  # Empty line (allowed)

        # Flags to accelerate the second evaluation, during execution
        line["_row"] = r
        line["_expandable"] = list(expandable)
        line["_complex"] = complex

        # Append if all mandatory fields have been filled
        may_append = True
        if len(mandatory_not_found) > 0:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Mandatory columns: " + ", ".join(mandatory_not_found) +
                                  " have not been specified",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            may_append = False

        # Check varying mandatory fields (fields depending on the value of other fields)
        for c in complex_mandatory_cols:
            field_def = c.name  # next(c2 for c2 in col_map if strcmp(c.name, c2.name))
            if isinstance(c.mandatory, str):
                # Evaluate
                # NOTE: "eval" runs an expression from the internal command_fields
                # definitions (trusted), not from user-supplied spreadsheet content
                mandatory = eval(c.mandatory, None, line)
                may_append = (mandatory and field_def in line) or (not mandatory)
                if mandatory and field_def not in line:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Mandatory column: " + field_def + " has not been specified",
                              location=IssueLocation(sheet_name=name, row=r, column=None)))

        if may_append:
            content.append(line)

    return issues, None, {"items": content, "command_name": name}