def execute(self, state: "State"):
    """
    Register a Mapping (origin dataset dimension -> destination hierarchy) in the
    case study registry, after validating that the origin dimension exists.

    :param state: Case study State, used to obtain the registry objects
    :return: (issues, None) when validation fails; (None, None) on success
    """
    some_error = False
    issues = []

    # Read mapping parameters
    origin_dataset = self._content["origin_dataset"]
    origin_dimension = self._content["origin_dimension"]
    destination = self._content["destination"]
    # Shape of the map:
    # [{"o": "", "to": [{"d": "", "w": ""}]}]
    # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
    # NOTE: renamed from "map" to avoid shadowing the builtin
    category_map = self._content["map"]

    # Obtain the origin dataset Metadata, obtain the code list
    dims, attrs, meas = obtain_dataset_metadata(origin_dataset)
    if origin_dimension not in dims:
        some_error = True
        issues.append((3, "The origin dimension '"+origin_dimension+"' does not exist in dataset '"+origin_dataset+"'"))
    else:
        dim = dims[origin_dimension]
        # Complete the map so every origin category is covered
        category_map = fill_map_with_all_origin_categories(dim, category_map)

    if some_error:  # Issues at this point are errors, return if there are any
        return issues, None

    # Create and store the mapping
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(origin_dataset, datasets)
    mappings[self._name] = Mapping(self._name, source, origin_dataset, origin_dimension, destination, category_map)

    # TODO If the categories to the left are not totally covered, what to do?
    # TODO - If a non-listed category appears, remove the line
    # TODO - If a non-listed category appears, leave the target column NA
    # TODO - If there are datasets matching the origin, JOIN
    return None, None
def execute(self, state: "State"):
    """
    Build Mapping objects from the parsed rows of a mapping command and register
    them in the case study registry.

    Each row contributes one (source code -> destination code, weight) entry to a
    mapping identified by "source hierarchy -> destination hierarchy".

    :param state: Case study State, used to obtain the registry objects
    :return: (issues, None)
    """

    def process_line(item):
        # Read the fields of one parsed row
        mh_src_dataset = item.get("source_dataset", None)
        mh_src_hierarchy = item.get("source_hierarchy", None)
        mh_src_code = item.get("source_code", None)
        mh_dst_hierarchy = item.get("destination_hierarchy", None)
        mh_dst_code = item.get("destination_code", None)
        mh_weight = item.get("weight", 1.0)

        # Mapping name: "<dataset>.<src hierarchy> -> <dst hierarchy>"
        name = ((mh_src_dataset + ".") if mh_src_dataset else "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

        # A mapping already registered globally cannot be redefined
        if name in mappings:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The mapping '" + name + "' has been declared previously. Skipped.",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            return

        # Obtain (or create) the accumulator for this mapping name
        if name in local_mappings:
            d = local_mappings[name]
        else:
            d = DottedDict()
            local_mappings[name] = d
            d.name = name
            d.origin_dataset = mh_src_dataset
            d.origin_hierarchy = mh_src_hierarchy
            d.destination_hierarchy = mh_dst_hierarchy
            d.mapping = create_dictionary()

        # Accumulate the specific code correspondence
        to_dict = d.mapping[mh_src_code] if mh_src_code in d.mapping else create_dictionary()
        if mh_dst_code in to_dict:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The mapping of '" + mh_src_code + "' into '" + mh_dst_code + "' has been already defined",
                      location=IssueLocation(sheet_name=name, row=r, column=None)))
            return
        else:
            # NOTE: This could be an object instead of just a FLOAT or expression
            to_dict[mh_dst_code] = (mh_weight, r)
            d.mapping[mh_src_code] = to_dict

    issues = []
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)
    name = self._content["command_name"]

    # Process parsed information, row by row
    local_mappings = create_dictionary()
    for line in self._content["items"]:
        r = line["_row"]
        # If the line contains a reference to a dataset or hierarchy, expand it
        # If not, process it directly
        is_expansion = False
        if is_expansion:
            # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items
            pass
        else:
            process_line(line)

    # Mappings post-processing
    for d in local_mappings:
        # Convert the accumulated mapping into:
        # [{"o": "", "to": [{"d": "", "w": ""}]}]
        # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
        mapping = []
        ds_rows = []  # Rows in which a dataset is mentioned
        for orig in local_mappings[d].mapping:
            lst = []
            for dst in local_mappings[d].mapping[orig]:
                t = local_mappings[d].mapping[orig][dst]
                lst.append(dict(d=dst, w=t[0]))
                if local_mappings[d].origin_dataset:
                    ds_rows.append(t[1])
            mapping.append(dict(o=orig, to=lst))
        from nexinfosys.ie_imports.data_source_manager import DataSourceManager
        if local_mappings[d].origin_dataset:
            # The referenced dataset must resolve to a known source
            if not DataSourceManager.obtain_dataset_source(local_mappings[d].origin_dataset, datasets):
                for r in ds_rows:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description=f"The dataset '{local_mappings[d].origin_dataset}' was not found",
                              location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            dims, attrs, meas = obtain_dataset_metadata(local_mappings[d].origin_dataset, None, datasets)
            if local_mappings[d].origin_hierarchy not in dims:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The origin dimension '" + local_mappings[d].origin_hierarchy +
                                      "' does not exist in dataset '" + local_mappings[d].origin_dataset + "'",
                          location=IssueLocation(sheet_name=name, row=r, column=None)))
                continue
            else:
                dim = dims[local_mappings[d].origin_hierarchy]
                mapping = fill_map_with_all_origin_categories(dim, mapping)
        #
        origin_dataset = local_mappings[d].origin_dataset
        origin_hierarchy = local_mappings[d].origin_hierarchy
        destination_hierarchy = local_mappings[d].destination_hierarchy
        # Create Mapping and add it to Case Study mappings variable
        mappings[d] = Mapping(d,
                              DataSourceManager.obtain_dataset_source(origin_dataset, datasets),
                              origin_dataset, origin_hierarchy, destination_hierarchy, mapping)

    # TODO
    # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns"
    # Put it to work !!!
    # One or more mapping in sequence could be specified?. The key is "source hierarchy+dest hierarchy"
    # Read mapping parameters
    return issues, None
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType, dataset_name: str, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh: Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param dataset_name: Name of the input dataset
    :param state: Case study State (registry objects may not be fully defined at parse time)
    :return: (issues, label, content) -- the command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # Dataset source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)

    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            # Codes are stored lower-cased; membership tests below lower-case the candidate
            cl[d] = [k.lower() for k in dims[d].code_list.keys()]  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            # FIX: lower-case the codes so membership tests (which lower-case the
            # candidate) behave the same as for native dataset dimensions
            cl[mappings[m].destination] = {c.lower() for c in tmp}

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    measures = []
    out_dims = []
    agg_funcs = []
    measures_as = []
    filter_ = {}  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["dimensions_kept", "dims", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for d in lst:
                if not d:
                    continue
                if d not in cl:
                    issues.append((3, "The dimension specified for output, '" + d +
                                   "' is neither a dataset dimension nor a mapped dimension. [" +
                                   ', '.join([d2 for d2 in cl]) + "]"))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["aggregation_function", "aggfunc", "agg_func"]:  # "SELECT AGGREGATORS"
            lst = obtain_column(c, area[0] + 1, area[1])
            for f in lst:
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append((3, "The specified aggregation function, '" + f +
                                   "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'"))
                else:
                    agg_funcs.append(f)
        elif col_name.lower().strip() in ["measures"]:  # "SELECT"
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for m in lst:
                if not m:
                    continue
                if m not in meas:
                    # FIX: list the AVAILABLE measures ("meas"), not the already-accepted ones ("measures")
                    issues.append((3, "The specified measure, '" + m +
                                   "' is not a measure available in the dataset. [" +
                                   ', '.join([m2 for m2 in meas]) + "]"))
                else:
                    measures.append(m)
        elif col_name.lower().strip() in ["measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for m in lst:
                measures_as.append(m)
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for cd in lst:
                if not cd:
                    continue
                if str(cd).lower() not in cl[col_name]:
                    issues.append((3, "The code '" + cd + "' is not present in the codes declared for dimension '" +
                                   col_name + "'. Please, check them."))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "endperiod"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except:
                    issues.append((3, "Column '" + col_name + "' has an invalid dataset name '" + result_name + "'"))

    if len(measures) == 0:
        issues.append((3, "At least one measure should be specified"))

    if len(agg_funcs) == 0:
        issues.append((2, "No aggregation function specified. Assuming 'average'"))
        agg_funcs.append("average")

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append((2, "No result name specified. Assuming '" + result_name + "'"))

    content = {"dataset_source": source,
               "dataset_name": dataset_name,
               "dataset_datetime": None,
               "where": filter_,
               "dimensions": [d for d in dims],
               "group_by": out_dims,
               "measures": measures,
               "agg_funcs": agg_funcs,
               "measures_as": measures_as,
               "result_name": result_name
               }
    return issues, None, content
def execute(self, state: "State"):
    """
    First bring the data considering the filter
    Second, group, third aggregate
    Finally, store the result in State

    :param state: Case study State, used to obtain the registry objects
    :return: (issues, None)
    """
    issues = []

    # Obtain global variables in state
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # DS Source + DS Name
    source = self._content["dataset_source"]
    dataset_name = self._content["dataset_name"]
    dataset_datetime = self._content["dataset_datetime"]

    # Result name
    result_name = self._content["result_name"]
    if result_name in datasets or state.get(result_name):
        issues.append((2, "A dataset called '" + result_name + "' is already stored in the registry of datasets"))

    # Dataset metadata
    dims, attrs, measures = obtain_dataset_metadata(dataset_name, source, datasets)

    # Obtain filter parameters
    params = create_dictionary()  # Native dimension name to list of values the filter will allow to pass
    joined_dimensions = []
    for dim in self._content["where"]:
        lst = self._content["where"][dim]
        native_dim = None
        if dim.lower() in ["startperiod", "starttime", "endperiod", "endtime"]:
            native_dim = dim
            lst = [lst]  # Time bounds are scalars, not lists
        elif dim not in dims:
            # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
            for m in mappings:
                if strcmp(mappings[m].destination, dim) and \
                        strcmp(mappings[m].source, source) and \
                        strcmp(mappings[m].dataset, dataset_name) and \
                        mappings[m].origin in dims:
                    joined_dimensions.append(mappings[m].destination)  # Store dimension in the original case
                    native_dim = mappings[m].origin
                    lst = obtain_reverse_codes(mappings[m].map, lst)
                    break
        else:
            # Get the dimension name with the original case
            native_dim = dims[dim].name
        if native_dim:
            if native_dim not in params:
                f = set()
                params[native_dim] = f
            else:
                f = params[native_dim]
            f.update(lst)

    # Convert param contents from set to list
    for p in params:
        params[p] = [i for i in params[p]]

    # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    ds = nexinfosys.data_source_manager.get_dataset_filtered(source, dataset_name, params, datasets)
    df = ds.data

    # Join with mapped dimensions (augment it)
    mapping_dict = create_dictionary()
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # mapping_tuples.append((mappings[m].origin, mappings[m].destination, mappings[m].map))
            mapping_dict[mappings[m].origin] = (mappings[m].destination,
                                                {d["o"]: d["to"] for d in mappings[m].map})

    # If accelerated version not available, use slow version
    try:
        if nexinfosys.get_global_configuration_variable("ENABLE_CYTHON_OPTIMIZATIONS") == "True":
            from nexinfosys.restful_service.helper_accel import augment_dataframe_with_mapped_columns2 as augment_df
        else:
            raise Exception("Just to import the slow version")
    except:
        from nexinfosys.common.helper import augment_dataframe_with_mapped_columns as augment_df
    df = augment_df(df, mapping_dict, ["value"])

    # Aggregate (If any dimension has been specified)
    if len(self._content["group_by"]) > 0:
        # Column names where data is
        # HACK: for the case where the measure has been named "obs_value", use "value"
        values = [m.lower() if m.lower() != "obs_value" else "value" for m in self._content["measures"]]
        # Match requested measure names to actual dataframe column names (case-insensitive)
        v2 = []
        for v in values:
            for c in df.columns:
                if v.lower() == c.lower():
                    v2.append(c)
                    break
        values = v2
        # TODO: use metadata name (e.g. "OBS_VALUE") instead of hardcoded "value"
        # values = self._content["measures"]
        out_names = self._content["measures_as"]
        group_by_dims = translate_case(self._content["group_by"], df.columns)  # Group by dimension names
        lcase_group_by_dims = [d.lower() for d in group_by_dims]
        # Now joined_dimensions
        for d in joined_dimensions:
            if d.lower() in lcase_group_by_dims:
                # Find and replace
                for i, d2 in enumerate(group_by_dims):
                    if strcmp(d, d2):
                        group_by_dims[i] = d
                        break

        agg_funcs = []  # Aggregation functions
        agg_names = {}
        for f in self._content["agg_funcs"]:
            if f.lower() in ["avg", "average"]:
                agg_funcs.append(np.average)
                agg_names[np.average] = "avg"
            elif f.lower() in ["sum"]:
                agg_funcs.append(np.sum)
                agg_names[np.sum] = "sum"
            elif f.lower() in ["count"]:
                agg_funcs.append(np.size)
                agg_names[np.size] = "count"
            elif f.lower() in ["sumna"]:
                agg_funcs.append(np.nansum)
                agg_names[np.nansum] = "sumna"
            elif f.lower() in ["countav"]:
                agg_funcs.append("count")
                agg_names["count"] = "countav"
            elif f.lower() in ["avgna"]:
                agg_funcs.append(np.nanmean)
                agg_names[np.nanmean] = "avgna"
            elif f.lower() in ["pctna"]:
                agg_funcs.append(pctna)
                agg_names[pctna] = "pctna"

        # Calculate Pivot Table. The columns are a combination of values x aggregation functions
        # For instance, if two values ["v2", "v2"] and two agg. functions ["avg", "sum"] are provided
        # The columns will be: [["average", "v2"], ["average", "v2"], ["sum", "v2"], ["sum", "v2"]]
        try:
            # Check that all "group_by_dims" on which pivot table aggregates are present in the input "df"
            # If not either synthesize them (only if there is a single filter value) or remove (if not present
            # FIX: build the column lookup ONCE, before the loop (it is loop-invariant)
            df_columns_dict = create_dictionary(data={c: None for c in df.columns})
            for r in group_by_dims.copy():
                if r not in df_columns_dict:
                    found = False
                    for k in params:
                        if strcmp(k, r):
                            found = True
                            if len(params[k]) == 1:
                                # FIX: create the column under the group-by name "r" (was "df[k]");
                                # pandas columns are case-sensitive and the later groupby uses "r"
                                df[r] = params[k][0]
                            else:
                                group_by_dims.remove(r)
                                issues.append((2, "Dimension '" + r +
                                               "' removed from the list of dimensions because it is not present in the raw input dataset."))
                            break
                    if not found:
                        group_by_dims.remove(r)
                        issues.append((2, "Dimension '" + r +
                                       "' removed from the list of dimensions because it is not present in the raw input dataset."))

            # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
            ds_columns_dict = create_dictionary(data={c.code: c.code for c in ds.dimensions})
            for r in group_by_dims:
                if r in ds_columns_dict:
                    # Create hierarchy local to the dataset
                    for d in ds.dimensions:
                        if strcmp(r, d.code):
                            if d.code_list:
                                h = convert_code_list_to_hierarchy(d.code_list)
                                h.name = result_name + "_" + r
                                glb_idx.put(h.key(), h)
                            break

            # Pivot table using Group by
            groups = df.groupby(by=group_by_dims, as_index=False)  # Split
            d = OrderedDict([])
            lst_names = []
            if len(values) == len(agg_funcs):
                for i, (value, agg_func) in enumerate(zip(values, agg_funcs)):
                    if len(out_names) == len(values) and out_names[i]:
                        lst_names.append(out_names[i])
                    else:
                        lst_names.append(agg_names[agg_func] + "_" + value)
                    lst = d.get(value, [])
                    lst.append(agg_func)
                    d[value] = lst
            else:
                for value in values:
                    lst = d.get(value, [])
                    for agg_func in agg_funcs:
                        lst.append(agg_func)
                        lst_names.append(agg_names[agg_func] + "_" + value)
                    d[value] = lst
            # Print NaN values for each value column
            for value in set(values):
                cnt = df[value].isnull().sum()
                print("NA count for col '" + value + "': " + str(cnt) + " of " + str(df.shape[0]))
            # AGGREGATE !!
            df2 = groups.agg(d)
            # Rename the aggregated columns
            df2.columns = group_by_dims + lst_names

            # The result, all columns (no index), is stored for later use
            ds = self._create_new_dataset(result_name, ds, df2, group_by_dims, out_names)
        except Exception as e:
            traceback.print_exc()
            issues.append((3, "There was a problem: " + str(e)))

    # Store the dataset in State
    datasets[result_name] = ds

    return issues, None
def execute(self, state: "State"):
    """
    First bring the data considering the filter
    Second, group, third aggregate
    Finally, store the result in State

    :param state: Case study State, used to obtain the registry objects
    :return: (issues, None)
    """
    issues = []

    # Obtain global variables in state
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # DS Source + DS Name
    source = self._content["dataset_source"]
    dataset_name = self._content["dataset_name"]

    # Result name
    result_name = self._content["result_name"]
    if result_name in datasets or state.get(result_name):
        issues.append((2, "A dataset called '" + result_name + "' is already stored in the registry of datasets"))

    # Dataset metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)

    # Obtain filter parameters
    params = create_dictionary()  # Native dimension name to list of values the filter will allow to pass
    joined_dimensions = []
    for dim in self._content["where"]:
        lst = self._content["where"][dim]
        native_dim = None
        if dim.lower() in ["startperiod", "endperiod"]:
            native_dim = dim
            lst = [lst]  # Time bounds are scalars, wrap in a list
        elif dim not in dims:
            # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
            for m in mappings:
                if strcmp(mappings[m].destination, dim) and \
                        strcmp(mappings[m].source, source) and \
                        strcmp(mappings[m].dataset, dataset_name) and \
                        mappings[m].origin in dims:
                    joined_dimensions.append(mappings[m].destination)  # Store dimension in the original case
                    native_dim = mappings[m].origin
                    lst = obtain_reverse_codes(mappings[m].map, lst)
                    break
        else:
            # Get the dimension name with the original case
            native_dim = dims[dim].name
        if native_dim:
            if native_dim not in params:
                f = set()
                params[native_dim] = f
            else:
                f = params[native_dim]
            f.update(lst)

    # Convert param contents from set to list
    for p in params:
        params[p] = [i for i in params[p]]

    # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
    ds = nexinfosys.data_source_manager.get_dataset_filtered(source, dataset_name, params)
    df = ds.data

    # Join with mapped dimensions (augment it)
    # TODO Prepare an "m" containing ALL the mappings affecting "df"
    # TODO df2 = augment_dataframe_with_mapped_columns(df, m, ["value"])
    # TODO Does it allow adding the new column for the dimension, in case it is requested? Probably yes, but test it
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # TODO Change by many-to-many mapping
            # TODO augment_dataframe_with_mapped_columns(df, maps, measure_columns)
            # Elaborate a many to one mapping
            tmp = []
            for el in mappings[m].map:
                for to in el["to"]:
                    if to["d"]:
                        tmp.append([el["o"], to["d"]])
            df_dst = pd.DataFrame(tmp, columns=['sou_rce', mappings[m].destination])
            # Locate the origin column in "df" (case-insensitively when configured so)
            for di in df.columns:
                if strcmp(mappings[m].origin, di):
                    d = di
                    if not nexinfosys.case_sensitive:
                        df[d + "_l"] = df[d].str.lower()
                        d = d + "_l"
                    break
            df = pd.merge(df, df_dst, how='left', left_on=d, right_on='sou_rce')
            del df['sou_rce']
            if not nexinfosys.case_sensitive:
                del df[d]

    # Aggregate (If any dimension has been specified)
    if len(self._content["group_by"]) > 0:
        # Column names where data is
        # HACK: for the case where the measure has been named "obs_value", use "value"
        values = [m.lower() if m.lower() != "obs_value" else "value" for m in self._content["measures"]]
        out_names = self._content["measures_as"]
        rows = translate_case(self._content["group_by"], params)  # Group by dimension names
        lcase_rows = [d.lower() for d in rows]
        # Now joined_dimensions
        for d in joined_dimensions:
            if d.lower() in lcase_rows:
                # Find and replace
                for i, d2 in enumerate(rows):
                    if strcmp(d, d2):
                        rows[i] = d
                        break

        aggs = []  # Aggregation functions
        agg_names = {}
        for f in self._content["agg_funcs"]:
            if f.lower() in ["avg", "average"]:
                aggs.append(np.average)
                agg_names[np.average] = "avg"
            elif f.lower() in ["sum"]:
                aggs.append(np.sum)
                agg_names[np.sum] = "sum"
            elif f.lower() in ["count"]:
                aggs.append(np.size)
                agg_names[np.size] = "count"
            elif f.lower() in ["sumna"]:
                aggs.append(np.nansum)
                agg_names[np.nansum] = "sumna"
            elif f.lower() in ["countav"]:  # countav=="Count Available"
                aggs.append("count")  # Count number of non-NaN elements
                agg_names["count"] = "countav"
            elif f.lower() in ["avgav", "avgna"]:  # avgna=="Average without
                aggs.append(np.nanmean)
                agg_names[np.nanmean] = "avgna"
            elif f.lower() in ["pctna"]:  # % of NaN vs total elements
                aggs.append(pctna)
                agg_names[pctna] = "pctna"

        # Calculate Pivot Table. The columns are a combination of values x aggregation functions
        # For instance, if two values ["v2", "v2"] and two agg. functions ["avg", "sum"] are provided
        # The columns will be: [["average", "v2"], ["average", "v2"], ["sum", "v2"], ["sum", "v2"]]
        try:
            # Check that all "rows" on which pivot table aggregates are present in the input "df"
            # If not either synthesize them (only if there is a single filter value) or remove (if not present
            df_columns_dict = create_dictionary(data={c: c for c in df.columns})
            for r in rows.copy():
                if r not in df_columns_dict:
                    found = False
                    for k in params:
                        if strcmp(k, r):
                            found = True
                            if len(params[k]) == 1:
                                df[r] = params[k][0]
                            else:
                                rows.remove(r)
                                issues.append((2, "Dimension '" + r +
                                               "' removed from the list of dimensions because it is not present in the raw input dataset."))
                            break
                    if not found:
                        rows.remove(r)
                        issues.append((2, "Dimension '" + r +
                                       "' removed from the list of dimensions because it is not present in the raw input dataset."))

            # Put proper DIMENSION names
            for ir, r in enumerate(rows):
                if r in df_columns_dict:
                    rows[ir] = df_columns_dict[r]

            # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
            ds_columns_dict = create_dictionary(data={c.code: c.code for c in ds.dimensions})
            for r in rows:
                if r in ds_columns_dict:
                    # Create hierarchy local to the dataset
                    for d in ds.dimensions:
                        if strcmp(r, d.code):
                            if d.code_list:
                                h = convert_code_list_to_hierarchy(d.code_list)
                                h.name = result_name + "_" + r
                                glb_idx.put(h.key(), h)
                            break

            # Pivot table using Group by
            # if True:
            groups = df.groupby(by=rows, as_index=False)  # Split
            d = OrderedDict([])
            lst_names = []
            if len(values) == len(aggs):
                for i, t in enumerate(zip(values, aggs)):
                    v, agg = t
                    if len(out_names) == len(values):
                        if out_names[i]:
                            lst_names.append(out_names[i])
                        else:
                            lst_names.append(agg_names[agg] + "_" + v)
                    else:
                        lst_names.append(agg_names[agg] + "_" + v)
                    lst = d.get(v, [])
                    lst.append(agg)
                    d[v] = lst
            else:
                for v in values:
                    lst = d.get(v, [])
                    for agg in aggs:
                        lst.append(agg)
                        lst_names.append(agg_names[agg] + "_" + v)
                    d[v] = lst
            # Print NaN values for each value column
            for v in set(values):
                cnt = df[v].isnull().sum()
                print("NA count for col '" + v + "': " + str(cnt) + " of " + str(df.shape[0]))
            # AGGREGATE !!
            df2 = groups.agg(d)
            # Rename the aggregated columns
            df2.columns = rows + lst_names

            # The result, all columns (no index), is stored for later use
            ds.data = df2
        except Exception as e:
            issues.append((3, "There was a problem with the grouping: " + repr(e)))

    # Store the dataset in State
    datasets[result_name] = ds

    return issues, None
def parse_mapping_command(sh: Worksheet, area: AreaTupleType, origin, destination) -> IssuesLabelContentTripleType:
    """
    Map from a set of categories from an external dataset into a set of MuSIASEM categories
    If the categories do not exist, they are created flat. Later they can be turned into a hierarchy and the mapping
    will still hold

    The syntax of the mapping allows expressing MANY to ONE and also MANY to MANY correspondence.
    The mapping has to be complete (all elements from left side must be covered, if not "" is assumed on the right side)

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param origin:
    :param destination:
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []

    # Analyze Origin
    cell = sh.cell(row=area[0], column=area[2])
    col_name = cell.value
    if origin:
        if not strcmp(origin, col_name):
            some_error = True
            issues.append((3, "The Origin name is different in the sheet name and in the worksheet (" +
                           origin + ", " + col_name + ")"))
    else:
        origin = col_name
    # Obtain the source, the dataset and the dimension of "origin"
    spl = origin.split(".")
    if len(spl) == 3:  # Source.Dataset.Dimension
        s, ds, dim = spl
        s = s + "."
        origin_ok = True
    elif len(spl) == 2:  # Dataset.Dimension
        ds, dim = spl
        s = ""
        origin_ok = True
    else:
        origin_ok = False
        some_error = True
        issues.append((3, "Origin must specify a dataset and a dimension name separated by '.'"))

    if origin_ok:
        origin_dataset = s + ds
        origin_dim = dim

        if not check_dataset_exists(origin_dataset):
            some_error = True
            issues.append((3, "The Origin '" + origin_dataset + "' does not match any registered dataset"))
        else:
            dims, attrs, meas = obtain_dataset_metadata(ds)
            if origin_dim not in dims:
                some_error = True
                issues.append((3, "The Origin dataset '" + origin_dataset + "' does not have a dimension '" +
                               origin_dim + "'"))

    # Analyze Destination
    cell = sh.cell(row=area[0], column=area[2] + 1)
    col_name = cell.value
    if destination:
        if not strcmp(destination, col_name):
            some_error = True
            issues.append((3, "The Destination name is different in the sheet name and in the worksheet (" +
                           destination + ", " + col_name + ")"))
    else:
        destination = col_name
    # Destination name must be a simple identity
    try:
        parser_field_parsers.simple_ident.parseString(destination, parseAll=True)
    except:
        some_error = True
        issues.append((3, "'" + destination + "' category name has to be a simple identifier"))

    if some_error:  # Issues at this point are errors, return if there are any
        return issues, None, None

    # Read mapping Origin to Destination
    o_dict = create_dictionary()
    for r in range(area[0] + 1, area[1]):
        o_value = sh.cell(row=r, column=area[2]).value  # First column -> Origin
        d_value = sh.cell(row=r, column=area[2] + 1).value  # Second column -> Destination
        try:
            exp_value = sh.cell(row=r, column=area[2] + 2).value  # Third column -> Weight (for Many to Many mappings)
            if exp_value:
                try:
                    exp_value = float(exp_value)
                except:  # If it is not possible, it maybe an expression, postpone conversion until usage
                    pass
            else:
                exp_value = 1.0  # If undefined -> Many to One
        except:
            exp_value = 1.0  # If undefined -> Many to One

        if not o_value and not d_value:
            # issues.append((2, "Row " + str(r) + ": Origin and Destination are not defined. Row skipped."))
            continue
        elif not o_value or not d_value:
            if not o_value and d_value:
                issues.append((2, "Row " + str(r) + ": Origin not defined. Row skipped."))
            else:
                issues.append((2, "Row " + str(r) + ": Destination not defined. Row skipped."))
            continue

        o_value = str(o_value).lower()
        d_value = str(d_value).lower()
        if o_value in o_dict:
            lst = o_dict[o_value]
        else:
            lst = []
            o_dict[o_value] = lst
        # Check "d_value" is not being repeated for "o_value"
        if (len(lst) == 0) or (len(lst) >= 1 and d_value not in [d["d"] for d in lst]):
            lst.append({"d": d_value, "w": exp_value})
        else:
            # FIX: report the repeated destination CATEGORY ("d_value"), not the
            # destination hierarchy name ("destination")
            issues.append((3, "Destination category '" + d_value + "' has been repeated for origin category '" +
                           o_value + "' at row '" + str(r) + "'"))

    # List of dictionaries, where each dictionary contains the specification of an origin "o"
    # For multiple entries (many to many map), the origin maps a list "to" of dictionaries "d", "e"
    content = {"origin_dataset": origin_dataset,  # Name of the origin dataset (may include the source name)
               "origin_dimension": origin_dim,  # Name of the origin dimension inside the dataset
               "destination": destination,  # Name of the destination hierarchy
               "map": [{"o": k, "to": v} for k, v in o_dict.items()]
               }
    label = ((content["origin_dataset"] + ".") if origin_dataset else "") + \
        content["origin_dimension"] + " -> " + content["destination"]
    return issues, label, content
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name, state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct.
    Return the analysis in JSON compatible format, for execution.

    The worksheet is scanned column by column (header in row 1). Recognized
    columns: InputDataset, AvailableAtDateTime, ResultDimensions ("GROUP BY"),
    ResultMeasures ("SELECT"), ResultMeasuresAggregation ("SELECT" aggregators),
    ResultMeasuresNames ("AS <name>"), OutputDatasetName, StartPeriod/EndPeriod
    (time filter), plus any dataset/mapped dimension name (a "WHERE" filter).

    :param sh: Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param name: Worksheet (command) name, used to label issues
    :param state: Case study state, providing registered datasets and mappings
    :return: (issues list, label, content dict); label is always None
    """

    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values (None cells skipped, strings stripped)
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

    # Look for the name of the input Dataset and the availability datetime
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition
    if dataset_name is None:
        issues.append(
            Issue(itype=IType.ERROR,
                  description=f"The name of the input dataset must be specified under column 'InputDataset'. Skipping {name} command",
                  location=IssueLocation(sheet_name=name, row=None, column=None)))
        return issues, None, None

    # Obtain the source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)

    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            # Attach the code list
            cl[d] = create_dictionary(data={k: None for k in dims[d].code_list.keys()})
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = create_dictionary(
                data={to["d"]: None
                      for o in mappings[m].map for to in o["to"] if to["d"]})
            cl[mappings[m].destination] = tmp

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so now it is not in the command, there will be a command aside.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    out_dims = []
    # One row of (measure, agg_func, measure_as) per worksheet data row
    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    filter_ = {}
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions", "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The dimension specified for output, '" + d +
                                          "' is neither a dataset dimension nor a mapped dimension. [" +
                                          ', '.join([d2 for d2 in cl]) + "]",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures", "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    # FIX: list the measures available in the dataset ("meas"); the
                    # previous code iterated "out_measures.values" (method object,
                    # TypeError) and would have listed still-empty entries anyway
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The specified measure, '" + m +
                                          "' is not a measure available in the dataset. [" +
                                          ', '.join([m2 for m2 in meas]) + "]",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in ["resultmeasuresaggregation", "resultmeasuresaggregator",
                                          "aggregation"]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue
                if f.lower() not in ["sum", "avg", "count", "sumna", "countav", "avgna", "pctna"]:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The specified aggregation function, '" + f +
                                          "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in ["resultmeasurename", "resultmeasuresnames", "resultmeasuresas",
                                          "measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    # FIX: str(cd) — a numeric cell value would raise TypeError
                    # when concatenated raw into the message
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The code '" + str(cd) +
                                          "' is not present in the codes declared for dimension '" +
                                          col_name + "'. Please, check them.",
                              location=IssueLocation(sheet_name=name, row=r + 1, column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in ["startperiod", "starttime", "endperiod",
                                                   "endtime"]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                # Normalize aliases to the canonical SDMX parameter names
                if col_name.lower() == "starttime":
                    col_name = "StartPeriod"
                elif col_name.lower() == "endtime":
                    col_name = "EndPeriod"
                filter_[col_name] = lst[0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["outputdatasetname", "outputdataset", "result_name", "result name",
                                  "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident, result_name)
                except Exception:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Column '" + col_name + "' has an invalid dataset name '" +
                                          result_name + "'",
                              location=IssueLocation(sheet_name=name, row=2, column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(itype=IType.ERROR,
                      description="The aggregation function must be defined in the first row",
                      location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))
    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Uniform rows, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if (measure and not agg_func) or (not measure and agg_func):
            issues.append(
                Issue(itype=IType.ERROR,
                      description="Each measure must be associated with an aggregation function",
                      location=IssueLocation(sheet_name=name, row=r, column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [v["measure_as"] for v in out_measures.values() if v["measure_as"]]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name, row=1, column=measure_names_column)))

    # measures != agg_funcs && len(agg_funcs) == 1 --> OK
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="There must be one aggregation function (used for all measures) or one aggregation per measure",
                  location=IssueLocation(sheet_name=name, row=1, column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        # NOTE(review): "c" here is the last column index of the scan loop above,
        # so the issue location points at the last scanned column — confirm intent
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No result name specified. Assuming '" + result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2, column=c + 1)))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": available_at_datetime,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content