Example #1
    def execute(self, state: "State"):
        some_error = False
        issues = []
        # Read mapping parameters
        origin_dataset = self._content["origin_dataset"]
        origin_dimension = self._content["origin_dimension"]
        destination = self._content["destination"]
        # [{"o": "", "to": [{"d": "", "w": ""}]}]
        # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
        map = self._content["map"]
        # Obtain the origin dataset Metadata, obtain the code list
        dims, attrs, meas = obtain_dataset_metadata(origin_dataset)
        if origin_dimension not in dims:
            some_error = True
            issues.append((3, "The origin dimension '"+origin_dimension+"' does not exist in dataset '"+origin_dataset+"'"))
        else:
            dim = dims[origin_dimension]
            map = fill_map_with_all_origin_categories(dim, map)
            # # Check all codes exist
            # src_code_list = [c for c in dim.code_list]
            # dst_code_set = set()
            # many_to_one_list = []
            # for i in map:
            #     o = i["o"]
            #     for j in i["to"]:
            #         d = j["d"]
            #         dst_code_set.add(d)
            #         many_to_one_list.append((o, d))
            # hierarchical_code = True
            # if hierarchical_code:
            #     mapped, unmapped = map_codelists(src_code_list, list(dst_code_set), many_to_one_list)
            # else:
            #     # Literal. All codes on the left MUST exist
            #     mapped = many_to_one_list
            #     for i in mapped:
            #         o = i["o"]
            #         if o not in dim.code_list:
            #             some_error = True
            #             issues.append((3, "The origin category '" + o + "' does not exist in dataset dimension '" + origin_dataset + "." +origin_dimension + "'"))

        if some_error:  # Issues at this point are errors, return if there are any
            return issues, None

        # Create and store the mapping
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(state)

        from nexinfosys.ie_imports.data_source_manager import DataSourceManager
        source = DataSourceManager.obtain_dataset_source(origin_dataset, datasets)

        mappings[self._name] = Mapping(self._name, source, origin_dataset, origin_dimension, destination, map)

        # TODO If the categories to the left are not totally covered, what to do?
        # TODO - If a non-listed category appears, remove the line
        # TODO - If a non-listed category appears, leave the target column NA

        # TODO - If there are datasets matching the origin, JOIN

        return None, None
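A minimal sketch of the "map" structure this command consumes; the category names and weights below are invented for illustration and are not taken from any real dataset:

example_map = [
    {"o": "crops",  "to": [{"d": "agriculture", "w": 1.0}]},       # many-to-one: one origin, one destination
    {"o": "forest", "to": [{"d": "forestry", "w": 0.7},
                           {"d": "conservation", "w": 0.3}]},      # many-to-many: weights split the origin category
]

Given a structure like this in self._content["map"], fill_map_with_all_origin_categories presumably completes it with any origin categories of the dimension that are not listed.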
Example #2
    def execute(self, state: "State"):
        def process_line(item):
            # Read variables
            mh_src_dataset = item.get("source_dataset", None)
            mh_src_hierarchy = item.get("source_hierarchy", None)
            mh_src_code = item.get("source_code", None)
            mh_dst_hierarchy = item.get("destination_hierarchy", None)
            mh_dst_code = item.get("destination_code", None)
            mh_weight = item.get("weight", 1.0)

            # Mapping name
            name = ((mh_src_dataset + ".") if mh_src_dataset else
                    "") + mh_src_hierarchy + " -> " + mh_dst_hierarchy

            if name in mappings:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping '" + name +
                          "' has been declared previously. Skipped.",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return

            if name in local_mappings:
                d = local_mappings[name]
            else:
                d = DottedDict()
                local_mappings[name] = d
                d.name = name
                d.origin_dataset = mh_src_dataset
                d.origin_hierarchy = mh_src_hierarchy
                d.destination_hierarchy = mh_dst_hierarchy
                d.mapping = create_dictionary()

            # Specific code
            if mh_src_code in d.mapping:
                to_dict = d.mapping[mh_src_code]
            else:
                to_dict = create_dictionary()
            if mh_dst_code in to_dict:
                issues.append(
                    Issue(itype=IType.ERROR,
                          description="The mapping of '" + mh_src_code +
                          "' into '" + mh_dst_code +
                          "' has been already defined",
                          location=IssueLocation(sheet_name=name,
                                                 row=r,
                                                 column=None)))
                return
            else:
                to_dict[mh_dst_code] = (
                    mh_weight, r
                )  # NOTE: This could be an object instead of just a FLOAT or expression
                d.mapping[mh_src_code] = to_dict

        issues = []
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)
        name = self._content["command_name"]

        local_mappings = create_dictionary()

        # Process parsed information
        for line in self._content["items"]:
            r = line["_row"]
            # If the line contains a reference to a dataset or hierarchy, expand it
            # If not, process it directly
            is_expansion = False
            if is_expansion:
                # TODO Iterate through dataset and/or hierarchy elements, producing a list of new items
                pass
            else:
                process_line(line)

        # Mappings post-processing
        for d in local_mappings:
            # Convert the mapping into:
            # [{"o": "", "to": [{"d": "", "w": ""}]}]
            # [ {o: origin category, to: [{d: destination category, w: weight assigned to destination category}] } ]
            mapping = []
            ds_rows = []  # Rows in which a dataset is mentioned
            for orig in local_mappings[d].mapping:
                lst = []
                for dst in local_mappings[d].mapping[orig]:
                    t = local_mappings[d].mapping[orig][dst]
                    lst.append(dict(d=dst, w=t[0]))
                    if local_mappings[d].origin_dataset:
                        ds_rows.append(t[1])
                mapping.append(dict(o=orig, to=lst))
            from nexinfosys.ie_imports.data_source_manager import DataSourceManager
            if local_mappings[d].origin_dataset:
                if not DataSourceManager.obtain_dataset_source(
                        local_mappings[d].origin_dataset, datasets):
                    for r in ds_rows:
                        issues.append(
                            Issue(
                                itype=IType.ERROR,
                                description=
                                f"The dataset '{local_mappings[d].origin_dataset}' was not found",
                                location=IssueLocation(sheet_name=name,
                                                       row=r,
                                                       column=None)))
                    continue
                dims, attrs, meas = obtain_dataset_metadata(
                    local_mappings[d].origin_dataset, None, datasets)
                if local_mappings[d].origin_hierarchy not in dims:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="The origin dimension '" +
                              local_mappings[d].origin_hierarchy +
                              "' does not exist in dataset '" +
                              local_mappings[d].origin_dataset + "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=r,
                                                     column=None)))
                    continue
                else:
                    dim = dims[local_mappings[d].origin_hierarchy]
                    mapping = fill_map_with_all_origin_categories(dim, mapping)
            #
            origin_dataset = local_mappings[d].origin_dataset
            origin_hierarchy = local_mappings[d].origin_hierarchy
            destination_hierarchy = local_mappings[d].destination_hierarchy
            # Create Mapping and add it to Case Study mappings variable
            mappings[d] = Mapping(
                d,
                DataSourceManager.obtain_dataset_source(
                    origin_dataset, datasets), origin_dataset,
                origin_hierarchy, destination_hierarchy, mapping)

        # TODO
        # Use the function to perform many to many mappings, "augment_dataframe_with_mapped_columns"
        # Put it to work !!!

        # One or more mappings in sequence could be specified? The key is "source hierarchy + dest hierarchy"
        # Read mapping parameters

        return issues, None
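For reference, a sketch of the parsed "items" that process_line expects; only the field names come from the code above, all values are invented:

example_items = [
    {"_row": 2, "source_dataset": "ds1", "source_hierarchy": "geo",
     "source_code": "es", "destination_hierarchy": "Region", "destination_code": "Iberia", "weight": 1.0},
    {"_row": 3, "source_dataset": "ds1", "source_hierarchy": "geo",
     "source_code": "pt", "destination_hierarchy": "Region", "destination_code": "Iberia", "weight": 1.0},
]
# Both rows accumulate under the same mapping name, "ds1.geo -> Region".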
def parse_etl_external_dataset_command(sh: Worksheet, area: AreaTupleType,
                                       dataset_name: str,
                                       state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param dataset_name: Name of the dataset to be queried
    :param state: Current case study state (registry of datasets, mappings, ...)
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)
    # Dataset source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)

    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)

    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = [k.lower()
                     for k in dims[d].code_list.keys()]  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True
    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            tmp = [
                to["d"] for o in mappings[m].map for to in o["to"] if to["d"]
            ]
            cl[mappings[m].destination] = set(
                tmp)  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so it is not part of this command; there will be a separate command for it.
    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian
    measures = []
    out_dims = []
    agg_funcs = []
    measures_as = []
    filter_ = {
    }  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue

        if col_name.lower().strip() in [
                "dimensions_kept", "dims", "dimensions"
        ]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for d in lst:
                if not d:
                    continue
                if d not in cl:
                    issues.append((
                        3, "The dimension specified for output, '" + d +
                        "' is neither a dataset dimension nor a mapped dimension. ["
                        + ', '.join([d2 for d2 in cl]) + "]"))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in [
                "aggregation_function", "aggfunc", "agg_func"
        ]:  # "SELECT AGGREGATORS"
            lst = obtain_column(c, area[0] + 1, area[1])
            for f in lst:
                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append((
                        3, "The specified aggregation function, '" + f +
                        "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'"
                    ))
                else:
                    agg_funcs.append(f)
        elif col_name.lower().strip() in ["measures"]:  # "SELECT"
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for m in lst:
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        (3, "The specified measure, '" + m +
                         "' is not a measure available in the dataset. [" +
                         ', '.join([m2 for m2 in meas]) + "]"))
                else:
                    measures.append(m)
        elif col_name.lower().strip() in ["measuresas"]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for m in lst:
                measures_as.append(m)
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for cd in lst:
                if not cd:
                    continue
                if str(cd).lower() not in cl[col_name]:
                    issues.append((
                        3, "The code '" + cd +
                        "' is not present in the codes declared for dimension '"
                        + col_name + "'. Please, check them."))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "endperiod"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                filter_[col_name] = lst[
                    0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in ["result_name", "result name", "resultname"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append((3, "Column '" + col_name +
                                   "' has an invalid dataset name '" +
                                   result_name + "'"))

    if len(measures) == 0:
        issues.append((3, "At least one measure should be specified"))

    if len(agg_funcs) == 0:
        issues.append(
            (2, "No aggregation function specified. Assuming 'average'"))
        agg_funcs.append("average")

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            (2, "No result name specified. Assuming '" + result_name + "'"))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": None,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
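As an illustration, a hypothetical "content" value returned by this parser for a worksheet that groups one dataset by a single dimension; every name and code below is made up:

example_content = {
    "dataset_source": "SRC",
    "dataset_name": "ds_example",
    "dataset_datetime": None,
    "where": {"geo": ["es", "pt"], "StartPeriod": "2010", "EndPeriod": "2015"},
    "dimensions": ["geo", "time_period", "unit"],
    "group_by": ["geo"],
    "measures": ["obs_value"],
    "agg_funcs": ["sum"],
    "measures_as": ["value_sum"],
    "result_name": "SRC_ds_example"
}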
Example #4
    def execute(self, state: "State"):
        """
        First, bring in the data, applying the filter
        Second, group; third, aggregate
        Finally, store the result in State
        """
        issues = []
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        # DS Source + DS Name
        source = self._content["dataset_source"]
        dataset_name = self._content["dataset_name"]
        dataset_datetime = self._content["dataset_datetime"]

        # Result name
        result_name = self._content["result_name"]
        if result_name in datasets or state.get(result_name):
            issues.append((2, "A dataset called '" + result_name +
                           "' is already stored in the registry of datasets"))

        # Dataset metadata
        dims, attrs, measures = obtain_dataset_metadata(
            dataset_name, source, datasets)

        # Obtain filter parameters
        params = create_dictionary(
        )  # Native dimension name to list of values the filter will allow to pass
        joined_dimensions = []
        for dim in self._content["where"]:
            lst = self._content["where"][dim]
            native_dim = None
            if dim.lower() in [
                    "startperiod", "starttime", "endperiod", "endtime"
            ]:
                native_dim = dim
                lst = [lst]
            elif dim not in dims:
                # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
                for m in mappings:
                    if strcmp(mappings[m].destination, dim) and \
                            strcmp(mappings[m].source, source) and \
                            strcmp(mappings[m].dataset, dataset_name) and \
                            mappings[m].origin in dims:
                        joined_dimensions.append(
                            mappings[m].destination
                        )  # Store dimension in the original case
                        native_dim = mappings[m].origin
                        lst = obtain_reverse_codes(mappings[m].map, lst)
                        break
            else:
                # Get the dimension name with the original case
                native_dim = dims[dim].name
            if native_dim:
                if native_dim not in params:
                    f = set()
                    params[native_dim] = f
                else:
                    f = params[native_dim]
                f.update(lst)

        # Convert param contents from set to list
        for p in params:
            params[p] = [i for i in params[p]]

        # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        ds = nexinfosys.data_source_manager.get_dataset_filtered(
            source, dataset_name, params, datasets)
        df = ds.data

        # Join with mapped dimensions (augment it)
        mapping_dict = create_dictionary()
        for m in mappings:
            if strcmp(mappings[m].source, source) and \
                    strcmp(mappings[m].dataset, dataset_name) and \
                    mappings[m].origin in dims:
                # mapping_tuples.append((mappings[m].origin, mappings[m].destination, mappings[m].map))
                mapping_dict[mappings[m].origin] = (mappings[m].destination, {
                    d["o"]: d["to"]
                    for d in mappings[m].map
                })

        # If accelerated version not available, use slow version
        try:
            if nexinfosys.get_global_configuration_variable(
                    "ENABLE_CYTHON_OPTIMIZATIONS") == "True":
                from nexinfosys.restful_service.helper_accel import augment_dataframe_with_mapped_columns2 as augment_df
            else:
                raise Exception("Just to import the slow version")
        except:
            from nexinfosys.common.helper import augment_dataframe_with_mapped_columns as augment_df

        df = augment_df(df, mapping_dict, ["value"])

        # Aggregate (If any dimension has been specified)
        if len(self._content["group_by"]) > 0:
            # Column names where data is
            # HACK: for the case where the measure has been named "obs_value", use "value"
            values = [
                m.lower() if m.lower() != "obs_value" else "value"
                for m in self._content["measures"]
            ]
            v2 = []
            for v in values:
                for c in df.columns:
                    if v.lower() == c.lower():
                        v2.append(c)
                        break
            values = v2

            # TODO: use metadata name (e.g. "OBS_VALUE") instead of hardcoded "value"
            # values = self._content["measures"]
            out_names = self._content["measures_as"]
            group_by_dims = translate_case(
                self._content["group_by"],
                df.columns)  # Group by dimension names
            lcase_group_by_dims = [d.lower() for d in group_by_dims]
            # Now joined_dimensions
            for d in joined_dimensions:
                if d.lower() in lcase_group_by_dims:
                    # Find and replace
                    for i, d2 in enumerate(group_by_dims):
                        if strcmp(d, d2):
                            group_by_dims[i] = d
                            break

            agg_funcs = []  # Aggregation functions
            agg_names = {}
            for f in self._content["agg_funcs"]:
                if f.lower() in ["avg", "average"]:
                    agg_funcs.append(np.average)
                    agg_names[np.average] = "avg"
                elif f.lower() in ["sum"]:
                    agg_funcs.append(np.sum)
                    agg_names[np.sum] = "sum"
                elif f.lower() in ["count"]:
                    agg_funcs.append(np.size)
                    agg_names[np.size] = "count"
                elif f.lower() in ["sumna"]:
                    agg_funcs.append(np.nansum)
                    agg_names[np.nansum] = "sumna"
                elif f.lower() in ["countav"]:
                    agg_funcs.append("count")
                    agg_names["count"] = "countav"
                elif f.lower() in ["avgna"]:
                    agg_funcs.append(np.nanmean)
                    agg_names[np.nanmean] = "avgna"
                elif f.lower() in ["pctna"]:
                    agg_funcs.append(pctna)
                    agg_names[pctna] = "pctna"

            # Calculate Pivot Table. The columns are a combination of values x aggregation functions
            # For instance, if two values ["v1", "v2"] and two agg. functions ["avg", "sum"] are provided
            # The columns will be: [["average", "v1"], ["average", "v2"], ["sum", "v1"], ["sum", "v2"]]
            try:
                # Check that all "group_by_dims" on which pivot table aggregates are present in the input "df"
                # If not either synthesize them (only if there is a single filter value) or remove (if not present
                for r in group_by_dims.copy():
                    df_columns_dict = create_dictionary(
                        data={c: None
                              for c in df.columns})
                    if r not in df_columns_dict:
                        found = False
                        for k in params:
                            if strcmp(k, r):
                                found = True
                                if len(params[k]) == 1:
                                    df[r] = params[k][0]
                                else:
                                    group_by_dims.remove(r)
                                    issues.append((
                                        2, "Dimension '" + r +
                                        "' removed from the list of dimensions because it is not present in the raw input dataset."
                                    ))
                                break
                        if not found:
                            group_by_dims.remove(r)
                            issues.append((
                                2, "Dimension '" + r +
                                "' removed from the list of dimensions because it is not present in the raw input dataset."
                            ))

                # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
                ds_columns_dict = create_dictionary(
                    data={c.code: c.code
                          for c in ds.dimensions})
                for r in group_by_dims:
                    if r in ds_columns_dict:
                        # Create hierarchy local to the dataset
                        for d in ds.dimensions:
                            if strcmp(r, d.code):
                                if d.code_list:
                                    h = convert_code_list_to_hierarchy(
                                        d.code_list)
                                    h.name = result_name + "_" + r
                                    glb_idx.put(h.key(), h)
                                    break

                # Pivot table using Group by
                if True:
                    groups = df.groupby(by=group_by_dims,
                                        as_index=False)  # Split
                    d = OrderedDict([])
                    lst_names = []
                    if len(values) == len(agg_funcs):
                        for i, (value,
                                agg_func) in enumerate(zip(values, agg_funcs)):
                            if len(out_names) == len(values) and out_names[i]:
                                lst_names.append(out_names[i])
                            else:
                                lst_names.append(agg_names[agg_func] + "_" +
                                                 value)
                            lst = d.get(value, [])
                            lst.append(agg_func)
                            d[value] = lst
                    else:
                        for value in values:
                            lst = d.get(value, [])
                            for agg_func in agg_funcs:
                                lst.append(agg_func)
                                lst_names.append(agg_names[agg_func] + "_" +
                                                 value)
                            d[value] = lst
                    # Print NaN counts for each value column
                    for value in set(values):
                        cnt = df[value].isnull().sum()
                        print("NA count for col '" + value + "': " + str(cnt) +
                              " of " + str(df.shape[0]))
                    # AGGREGATE !!
                    df2 = groups.agg(d)

                    # Rename the aggregated columns
                    df2.columns = group_by_dims + lst_names
                # else:
                #     # Pivot table
                #     df2 = pd.pivot_table(df,
                #                          values=values,
                #                          index=group_by_dims,
                #                          aggfunc=[agg_funcs[0]], fill_value=np.NaN, margins=False,
                #                          dropna=True)
                #     # Remove the multiindex in columns
                #     df2.columns = [col[-1] for col in df2.columns.values]
                #     # Remove the index
                #     df2.reset_index(inplace=True)
                # The result, all columns (no index), is stored for later use
                ds = self._create_new_dataset(result_name, ds, df2,
                                              group_by_dims, out_names)
            except Exception as e:
                traceback.print_exc()
                issues.append((3, "There was a problem: " + str(e)))

        # Store the dataset in State
        datasets[result_name] = ds

        return issues, None
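The core of the aggregation above is a standard pandas split-apply-combine; a self-contained sketch with toy data (column names and values are invented):

import numpy as np
import pandas as pd

df = pd.DataFrame({"region": ["a", "a", "b"], "value": [1.0, 3.0, 5.0]})
group_by_dims = ["region"]
groups = df.groupby(by=group_by_dims, as_index=False)        # Split
d = {"value": [np.sum, np.average]}                          # value column -> list of aggregation functions
df2 = groups.agg(d)                                          # Aggregate
df2.columns = group_by_dims + ["sum_value", "avg_value"]     # Flatten the MultiIndex produced by agg()
print(df2)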
Example #5
    def execute(self, state: "State"):
        """
        First, bring in the data, applying the filter
        Second, group; third, aggregate
        Finally, store the result in State
        """
        issues = []
        # Obtain global variables in state
        glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
            state)

        # DS Source + DS Name
        source = self._content["dataset_source"]
        dataset_name = self._content["dataset_name"]

        # Result name
        result_name = self._content["result_name"]
        if result_name in datasets or state.get(result_name):
            issues.append((2, "A dataset called '" + result_name +
                           "' is already stored in the registry of datasets"))

        # Dataset metadata
        dims, attrs, meas = obtain_dataset_metadata(dataset_name, source)
        # Obtain filter parameters
        params = create_dictionary(
        )  # Native dimension name to list of values the filter will allow to pass
        joined_dimensions = []
        for dim in self._content["where"]:
            lst = self._content["where"][dim]
            native_dim = None
            if dim.lower() in ["startperiod", "endperiod"]:
                native_dim = dim
                lst = [lst]
            elif dim not in dims:
                # Check if there is a mapping. If so, obtain the native equivalent(s). If not, ERROR
                for m in mappings:
                    if strcmp(mappings[m].destination, dim) and \
                            strcmp(mappings[m].source, source) and \
                            strcmp(mappings[m].dataset, dataset_name) and \
                            mappings[m].origin in dims:
                        joined_dimensions.append(
                            mappings[m].destination
                        )  # Store dimension in the original case
                        native_dim = mappings[m].origin
                        lst = obtain_reverse_codes(mappings[m].map, lst)
                        break
            else:
                # Get the dimension name with the original case
                native_dim = dims[dim].name
            if native_dim:
                if native_dim not in params:
                    f = set()
                    params[native_dim] = f
                else:
                    f = params[native_dim]
                f.update(lst)

        # Convert param contents from set to list
        for p in params:
            params[p] = [i for i in params[p]]

        # Obtain the filtered Dataset <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        ds = nexinfosys.data_source_manager.get_dataset_filtered(
            source, dataset_name, params)
        df = ds.data

        # Join with mapped dimensions (augment it)
        # TODO Prepare an "m" containing ALL the mappings affecting "df"
        # TODO df2 = augment_dataframe_with_mapped_columns(df, m, ["value"])
        # TODO Does it allow adding the new column for the dimension, in case it is requested? Probably yes, but test it
        for m in mappings:
            if strcmp(mappings[m].source, source) and \
                    strcmp(mappings[m].dataset, dataset_name) and \
                    mappings[m].origin in dims:
                # TODO Change by many-to-many mapping
                # TODO augment_dataframe_with_mapped_columns(df, maps, measure_columns)
                # Elaborate a many to one mapping
                tmp = []
                for el in mappings[m].map:
                    for to in el["to"]:
                        if to["d"]:
                            tmp.append([el["o"], to["d"]])
                df_dst = pd.DataFrame(
                    tmp, columns=['sou_rce', mappings[m].destination])
                for di in df.columns:
                    if strcmp(mappings[m].origin, di):
                        d = di
                        if not nexinfosys.case_sensitive:
                            df[d + "_l"] = df[d].str.lower()
                            d = d + "_l"
                        break
                df = pd.merge(df,
                              df_dst,
                              how='left',
                              left_on=d,
                              right_on='sou_rce')
                del df['sou_rce']
                if not nexinfosys.case_sensitive:
                    del df[d]

        # Aggregate (If any dimension has been specified)
        if len(self._content["group_by"]) > 0:
            # Column names where data is
            # HACK: for the case where the measure has been named "obs_value", use "value"
            values = [
                m.lower() if m.lower() != "obs_value" else "value"
                for m in self._content["measures"]
            ]
            out_names = self._content["measures_as"]
            rows = translate_case(self._content["group_by"],
                                  params)  # Group by dimension names
            lcase_rows = [d.lower() for d in rows]
            # Now joined_dimensions
            for d in joined_dimensions:
                if d.lower() in lcase_rows:
                    # Find and replace
                    for i, d2 in enumerate(rows):
                        if strcmp(d, d2):
                            rows[i] = d
                            break

            aggs = []  # Aggregation functions
            agg_names = {}
            for f in self._content["agg_funcs"]:
                if f.lower() in ["avg", "average"]:
                    aggs.append(np.average)
                    agg_names[np.average] = "avg"
                elif f.lower() in ["sum"]:
                    aggs.append(np.sum)
                    agg_names[np.sum] = "sum"
                elif f.lower() in ["count"]:
                    aggs.append(np.size)
                    agg_names[np.size] = "count"
                elif f.lower() in ["sumna"]:
                    aggs.append(np.nansum)
                    agg_names[np.nansum] = "sumna"
                elif f.lower() in ["countav"]:  # countav=="Count Available"
                    aggs.append("count")  # Count number of non-NaN elements
                    agg_names["count"] = "countav"
                elif f.lower() in ["avgav",
                                   "avgna"]:  # avgna=="Average without
                    aggs.append(np.nanmean)
                    agg_names[np.nanmean] = "avgna"
                elif f.lower() in ["pctna"]:  # % of NaN vs total elements
                    aggs.append(pctna)
                    agg_names[pctna] = "pctna"

            # Calculate Pivot Table. The columns are a combination of values x aggregation functions
            # For instance, if two values ["v1", "v2"] and two agg. functions ["avg", "sum"] are provided
            # The columns will be: [["average", "v1"], ["average", "v2"], ["sum", "v1"], ["sum", "v2"]]
            try:
                # Check that all "rows" on which pivot table aggregates are present in the input "df"
                # If not either synthesize them (only if there is a single filter value) or remove (if not present
                df_columns_dict = create_dictionary(
                    data={c: c
                          for c in df.columns})
                for r in rows.copy():
                    if r not in df_columns_dict:
                        found = False
                        for k in params:
                            if strcmp(k, r):
                                found = True
                                if len(params[k]) == 1:
                                    df[r] = params[k][0]
                                else:
                                    rows.remove(r)
                                    issues.append((
                                        2, "Dimension '" + r +
                                        "' removed from the list of dimensions because it is not present in the raw input dataset."
                                    ))
                                break
                        if not found:
                            rows.remove(r)
                            issues.append((
                                2, "Dimension '" + r +
                                "' removed from the list of dimensions because it is not present in the raw input dataset."
                            ))
                # Put proper DIMENSION names
                for ir, r in enumerate(rows):
                    if r in df_columns_dict:
                        rows[ir] = df_columns_dict[r]

                # Create and register Hierarchy objects from origin Dataset dimensions: state, ds
                ds_columns_dict = create_dictionary(
                    data={c.code: c.code
                          for c in ds.dimensions})
                for r in rows:
                    if r in ds_columns_dict:
                        # Create hierarchy local to the dataset
                        for d in ds.dimensions:
                            if strcmp(r, d.code):
                                if d.code_list:
                                    h = convert_code_list_to_hierarchy(
                                        d.code_list)
                                    h.name = result_name + "_" + r
                                    glb_idx.put(h.key(), h)
                                    break

                # Pivot table using Group by
                # if True:
                groups = df.groupby(by=rows, as_index=False)  # Split
                d = OrderedDict([])
                lst_names = []
                if len(values) == len(aggs):
                    for i, t in enumerate(zip(values, aggs)):
                        v, agg = t
                        if len(out_names) == len(values):
                            if out_names[i]:
                                lst_names.append(out_names[i])
                            else:
                                lst_names.append(agg_names[agg] + "_" + v)
                        else:
                            lst_names.append(agg_names[agg] + "_" + v)
                        lst = d.get(v, [])
                        lst.append(agg)
                        d[v] = lst
                else:
                    for v in values:
                        lst = d.get(v, [])
                        for agg in aggs:
                            lst.append(agg)
                            lst_names.append(agg_names[agg] + "_" + v)
                        d[v] = lst
                # Print NaN counts for each value column
                for v in set(values):
                    cnt = df[v].isnull().sum()
                    print("NA count for col '" + v + "': " + str(cnt) +
                          " of " + str(df.shape[0]))
                # AGGREGATE !!
                df2 = groups.agg(d)

                # Rename the aggregated columns
                df2.columns = rows + lst_names
                # else:
                #     # Pivot table
                #     df2 = pd.pivot_table(df,
                #                          values=values,
                #                          index=rows,
                #                          aggfunc=[aggs[0]], fill_value=np.NaN, margins=False,
                #                          dropna=True)
                #     # Remove the multiindex in columns
                #     df2.columns = [col[-1] for col in df2.columns.values]
                #     # Remove the index
                #     df2.reset_index(inplace=True)
                # The result, all columns (no index), is stored for later use
                ds.data = df2
            except Exception as e:
                issues.append(
                    (3, "There was a problem with the grouping: " + repr(e)))

        # Store the dataset in State
        datasets[result_name] = ds

        return issues, None
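The mapping join inside the loop above boils down to a left merge against a small correspondence table; a self-contained sketch with invented codes:

import pandas as pd

df = pd.DataFrame({"geo": ["es", "pt", "fr"], "value": [1, 2, 3]})
df_dst = pd.DataFrame([["es", "iberia"], ["pt", "iberia"], ["fr", "france"]],
                      columns=["sou_rce", "region"])          # origin code -> destination category
df = pd.merge(df, df_dst, how="left", left_on="geo", right_on="sou_rce")
del df["sou_rce"]                                             # drop the helper join column, as the command does
print(df)  # "region" is now available as an additional (mapped) dimension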
def parse_mapping_command(sh: Worksheet, area: AreaTupleType, origin,
                          destination) -> IssuesLabelContentTripleType:
    """
    Map from a set of categories from an external dataset into a set of MuSIASEM categories
    If the categories do not exist, they are created flat. Later they can be turned into a hierarchy and the mapping
    will still hold

    The syntax of the mapping allows expressing MANY to ONE and also MANY to MANY correspondences.
    The mapping has to be complete (all elements from the left side must be covered; if not, "" is assumed on the right side)

    :param sh: Input worksheet
    :param area: Tuple (top, bottom, left, right) representing the rectangular area of the input worksheet where the
    command is present
    :param origin: Origin specification ("[Source.]Dataset.Dimension"), usually taken from the sheet name (may be None)
    :param destination: Name of the destination hierarchy, usually taken from the sheet name (may be None)
    :return: list of issues (issue_type, message), command label, command content
    """
    some_error = False
    issues = []
    # Analyze Origin
    cell = sh.cell(row=area[0], column=area[2])
    col_name = cell.value
    if origin:
        if not strcmp(origin, col_name):
            some_error = True
            issues.append((
                3,
                "The Origin name is different in the sheet name and in the worksheet ("
                + origin + ", " + col_name + ")"))
    else:
        origin = col_name

    #   Obtain the source, the dataset and the dimension of "origin"
    spl = origin.split(".")
    if len(spl) == 3:  # Source.Dataset.Dimension
        s, ds, dim = spl
        s = s + "."
        origin_ok = True
    elif len(spl) == 2:  # Dataset.Dimension
        ds, dim = spl
        s = ""
        origin_ok = True
    else:
        origin_ok = False
        some_error = True
        issues.append((
            3,
            "Origin must specify a dataset and a dimension name separated by '.'"
        ))

    if origin_ok:
        origin_dataset = s + ds
        origin_dim = dim

        if not check_dataset_exists(origin_dataset):
            some_error = True
            issues.append((3, "The Origin '" + origin_dataset +
                           "' does not match any registered dataset"))
        else:
            dims, attrs, meas = obtain_dataset_metadata(ds)
            if origin_dim not in dims:
                some_error = True
                issues.append(
                    (3, "The Origin dataset '" + origin_dataset +
                     "' does not have a dimension '" + origin_dim + "'"))

    # Analyze Destination
    cell = sh.cell(row=area[0], column=area[2] + 1)
    col_name = cell.value
    if destination:
        if not strcmp(destination, col_name):
            some_error = True
            issues.append((
                3,
                "The Destination name is different in the sheet name and in the worksheet ("
                + destination + ", " + col_name + ")"))
    else:
        destination = col_name

    #  Destination name must be a simple identifier
    try:
        parser_field_parsers.simple_ident.parseString(destination,
                                                      parseAll=True)
    except:
        some_error = True
        issues.append((3, "'" + destination +
                       "' category name has to be a simple identifier"))

    if some_error:  # Issues at this point are errors, return if there are any
        return issues, None, None

    # Read mapping Origin to Destination
    o_dict = create_dictionary()
    for r in range(area[0] + 1, area[1]):
        o_value = sh.cell(row=r,
                          column=area[2]).value  # First column -> Origin
        d_value = sh.cell(row=r, column=area[2] +
                          1).value  # Second column -> Destination
        try:
            exp_value = sh.cell(
                row=r, column=area[2] +
                2).value  # Third column -> Weight (for Many to Many mappings)
            if exp_value:
                try:
                    exp_value = float(exp_value)
                except:  # If it is not possible, it may be an expression; postpone conversion until usage
                    pass
            else:
                exp_value = 1.0  # If undefined -> Many to One
        except:
            exp_value = 1.0  # If undefined -> Many to One

        if not o_value and not d_value:
            # issues.append((2, "Row " + str(r) + ": Origin and Destination are not defined. Row skipped."))
            continue
        elif not o_value or not d_value:
            if not o_value and d_value:
                issues.append(
                    (2,
                     "Row " + str(r) + ": Origin not defined. Row skipped."))
            else:
                issues.append((2, "Row " + str(r) +
                               ": Destination not defined. Row skipped."))
            continue

        o_value = str(o_value).lower()
        d_value = str(d_value).lower()
        if o_value in o_dict:
            lst = o_dict[o_value]
        else:
            lst = []
            o_dict[o_value] = lst
        # Check "d_value" is not being repeated for "o_value"
        if (len(lst) == 0) or (len(lst) >= 1
                               and d_value not in [d["d"] for d in lst]):
            lst.append({"d": d_value, "w": exp_value})
        else:
            issues.append((3, "Destination category '" + d_value +
                           "' has been repeated for origin category '" +
                           o_value + "' at row '" + str(r) + "'"))

    # List of dictionaries, where each dictionary contains the specification of an origin "o"
    # For multiple entries (many-to-many map), the origin maps to a list "to" of dictionaries with keys "d" and "w"
    content = {
        "origin_dataset":
        origin_dataset,  # Name of the origin dataset (may include the source name)
        "origin_dimension":
        origin_dim,  # Name of the origin dimension inside the dataset
        "destination": destination,  # Name of the destination hierarchy
        "map": [{
            "o": k,
            "to": v
        } for k, v in o_dict.items()]
    }
    label = ((content["origin_dataset"] + ".") if origin_dataset else ""
             ) + content["origin_dimension"] + " -> " + content["destination"]
    return issues, label, content
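A compact sketch of how the worksheet rows end up in the "map" field of "content"; the origin, destination and weight values are invented:

rows = [("crops", "agriculture", 1.0),        # origin category, destination category, weight
        ("forest", "forestry", 0.7),
        ("forest", "conservation", 0.3)]
o_dict = {}
for o_value, d_value, w in rows:
    o_dict.setdefault(o_value.lower(), []).append({"d": d_value.lower(), "w": w})
map_field = [{"o": k, "to": v} for k, v in o_dict.items()]
# -> [{'o': 'crops',  'to': [{'d': 'agriculture', 'w': 1.0}]},
#     {'o': 'forest', 'to': [{'d': 'forestry', 'w': 0.7}, {'d': 'conservation', 'w': 0.3}]}]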
Example #7
def parse_dataset_qry_command(sh: Worksheet, area: AreaTupleType, name,
                              state) -> IssuesLabelContentTripleType:
    """
    Check that the syntax of the input spreadsheet is correct
    Return the analysis in JSON compatible format, for execution

    :param sh:   Input worksheet
    :param area: Area of the input worksheet to be analysed
    :param name: Name of the command (the sheet name), used to report issue locations
    :param state: Current case study state (registry of datasets, mappings, ...)
    :return:     The command in a dict-list object (JSON ready)
    """
    def obtain_column(cn, r1, r2):
        """
        Obtain a list with the values of a column, in the range of rows [r1, r2)

        :param cn: Column number
        :param r1: Starting row
        :param r2: End+1 row
        :return: list with the cell values
        """
        lst = []
        for row in range(r1, r2):
            value = sh.cell(row=row, column=cn).value
            if value is None:
                continue
            if isinstance(value, str):
                lst.append(value.strip())
            else:
                lst.append(value)
        return lst

    issues = []
    # Global variables (at parse time they may not be defined, so process carefully...)
    glb_idx, p_sets, hh, datasets, mappings = get_case_study_registry_objects(
        state)

    # Look for the name of the input Dataset
    dataset_name = None
    available_at_datetime = None
    for c in range(area[2], area[3]):
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["inputdataset"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    dataset_name = v
                    break  # Stop on first definition
        elif col_name.lower().strip() in ["availableatdatetime"]:
            lst = obtain_column(c, area[0] + 1, area[1])
            for v in lst:
                if v:
                    available_at_datetime = v
                    break  # Stop on first definition

    if dataset_name is None:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                f"The name of the input dataset must be specified under column 'InputDataset'. Skipping {name} command",
                location=IssueLocation(sheet_name=name, row=None,
                                       column=None)))
        return issues, None, None

    # Obtain the source
    from nexinfosys.ie_imports.data_source_manager import DataSourceManager
    source = DataSourceManager.obtain_dataset_source(dataset_name, datasets)
    # Obtain metadata
    dims, attrs, meas = obtain_dataset_metadata(dataset_name, source, datasets)
    # Load all code lists in a temporary dictionary of sets
    # Also check if there is a TIME dimension in the dataset
    cl = create_dictionary()
    we_have_time = False
    for d in dims:
        if dims[d].code_list:
            cl[d] = create_dictionary(data={
                k: None
                for k in dims[d].code_list.keys()
            })  # Attach the code list
        else:
            cl[d] = None  # No code list (TIME_PERIOD for instance)
        if dims[d].istime:
            we_have_time = True

    # Add matching mappings as more dimensions
    for m in mappings:
        if strcmp(mappings[m].source, source) and \
                strcmp(mappings[m].dataset, dataset_name) and \
                mappings[m].origin in dims:
            # Add a dictionary entry for the new dimension, add also the codes present in the map
            # tmp = [to["d"] for o in mappings[m].map for to in o["to"] if to["d"]]
            tmp = create_dictionary(
                data={
                    to["d"]: None
                    for o in mappings[m].map for to in o["to"] if to["d"]
                })
            cl[mappings[m].
               destination] = tmp  # [t[1] for t in mappings[m].map]

    # Scan columns for Dimensions, Measures and Aggregation.
    # Pivot Table is a Visualization, so it is not part of this command; there will be a separate command for it.

    # TODO The result COULD be an automatic BI cube (with a separate field)
    # TODO - Write into a set of tables in Mondrian
    # TODO - Generate Schema for Mondrian
    # TODO - Write the Schema for Mondrian

    out_dims = []

    out_measures = OrderedDict()
    for r in range(area[0] + 1, area[1] + 1):
        out_measures[r] = dict(measure=None, agg_func=None, measure_as=None)

    filter_ = {
    }  # Cannot use "create_dictionary()" because CaseInsensitiveDict is NOT serializable (which is a requirement)
    result_name = None  # By default, no name for the result. It will be dynamically obtained
    measure_names_column = None
    aggregations_column = None
    for c in range(area[2], area[3]):  # Each column
        col_name = sh.cell(row=1, column=c).value
        if not col_name:
            continue
        if col_name.lower().strip() in ["resultdimensions",
                                        "dimensions"]:  # "GROUP BY"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, d in enumerate(lst):
                if not d:
                    continue
                if d not in cl:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The dimension specified for output, '"
                            + d +
                            "' is neither a dataset dimension nor a mapped dimension. ["
                            + ', '.join([d2 for d2 in cl]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_dims.append(d)
        elif col_name.lower().strip() in ["resultmeasures",
                                          "measures"]:  # "SELECT"
            measure_names_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            # Check for measures
            # TODO (and attributes?)
            for r, m in enumerate(lst):
                if not m:
                    continue
                if m not in meas:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The specified measure, '" + m +
                            "' is not a measure available in the dataset. [" +
                            ', '.join([m2 for m2 in meas]) + "]",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["measure"] = m
        elif col_name.lower().strip() in [
                "resultmeasuresaggregation", "resultmeasuresaggregator",
                "aggregation"
        ]:  # "SELECT AGGREGATORS"
            aggregations_column = c
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, f in enumerate(lst):
                if not f:
                    continue

                if f.lower() not in [
                        "sum", "avg", "count", "sumna", "countav", "avgna",
                        "pctna"
                ]:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The specified aggregation function, '"
                            + f +
                            "' is not one of the supported ones: 'sum', 'avg', 'count', 'sumna', 'avgna', 'countav', 'pctna'",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    out_measures[r + area[0] + 1]["agg_func"] = f
        elif col_name.lower().strip() in [
                "resultmeasurename", "resultmeasuresnames", "resultmeasuresas",
                "measuresas"
        ]:  # "AS <name>"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, m in enumerate(lst):
                out_measures[r + area[0] + 1]["measure_as"] = m
        elif col_name in cl:  # A dimension -> "WHERE"
            # Check codes, and add them to the "filter"
            lst = obtain_column(c, area[0] + 1, area[1])
            for r, cd in enumerate(lst):
                if not cd:
                    continue
                if str(cd) not in cl[col_name]:
                    issues.append(
                        Issue(
                            itype=IType.ERROR,
                            description="The code '" + cd +
                            "' is not present in the codes declared for dimension '"
                            + col_name + "'. Please, check them.",
                            location=IssueLocation(sheet_name=name,
                                                   row=r + 1,
                                                   column=c + 1)))
                else:
                    if col_name not in filter_:
                        lst2 = []
                        filter_[col_name] = lst2
                    else:
                        lst2 = filter_[col_name]
                    lst2.append(cd)
        elif we_have_time and col_name.lower() in [
                "startperiod", "starttime", "endperiod", "endtime"
        ]:  # SPECIAL "WHERE" FOR TIME
            # TODO Instead, should use a single column, "Time", using the interval syntax of the Time column in the Data Input command
            # Interval of time periods
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                if col_name.lower() == "starttime":
                    col_name = "StartPeriod"
                elif col_name.lower() == "endtime":
                    col_name = "EndPeriod"
                filter_[col_name] = lst[
                    0]  # In this case it is not a list, but a number or string !!!!
        elif col_name.lower() in [
                "outputdatasetname", "outputdataset", "result_name",
                "result name", "resultname"
        ]:
            lst = obtain_column(c, area[0] + 1, area[1])
            if len(lst) > 0:
                result_name = lst[0]
                try:
                    parser_field_parsers.string_to_ast(simple_ident,
                                                       result_name)
                except:
                    issues.append(
                        Issue(itype=IType.ERROR,
                              description="Column '" + col_name +
                              "' has an invalid dataset name '" + result_name +
                              "'",
                              location=IssueLocation(sheet_name=name,
                                                     row=2,
                                                     column=c + 1)))

    # If more than one agg function defined -> all must be defined
    # If no agg func defined -> assume AVG
    # If agg func defined only in first row -> extend to other columns
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]
    if len(agg_funcs) > 1:
        first_agg_func = None
    elif len(agg_funcs) == 0:
        issues.append(
            Issue(itype=IType.WARNING,
                  description=
                  "No aggregation function specified. Assuming 'average'",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=aggregations_column)))
        first_agg_func = "avg"
    else:  # One aggregation function
        first_agg_func = out_measures[area[0] + 1]["agg_func"]
        if not first_agg_func:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=
                    "The aggregation function must be defined in the first row",
                    location=IssueLocation(sheet_name=name,
                                           row=1,
                                           column=aggregations_column)))

    if first_agg_func:
        for v in out_measures.values():
            if v.get("measure", None):
                v["agg_func"] = first_agg_func

    # Make rows uniform, with the three values defined: measure, aggregation function and "measure as"
    for r, v in out_measures.items():
        measure = v.get("measure", None)
        agg_func = v.get("agg_func", None)
        measure_as = v.get("measure_as", None)
        if measure and not agg_func or not measure and agg_func:
            issues.append(
                Issue(
                    itype=IType.ERROR,
                    description=
                    "Each measure must be associated with an aggregation function",
                    location=IssueLocation(sheet_name=name,
                                           row=r,
                                           column=measure_names_column)))
        elif measure and not measure_as:
            v["measure_as"] = measure + "_" + agg_func

    measures = [v["measure"] for v in out_measures.values() if v["measure"]]
    measures_as = [
        v["measure_as"] for v in out_measures.values() if v["measure_as"]
    ]
    agg_funcs = [v["agg_func"] for v in out_measures.values() if v["agg_func"]]

    if len(measures) == 0:
        issues.append(
            Issue(itype=IType.ERROR,
                  description="At least one measure should be specified",
                  location=IssueLocation(sheet_name=name,
                                         row=1,
                                         column=measure_names_column)))

    # len(measures) != len(agg_funcs) is acceptable only when len(agg_funcs) == 1 (one aggregator applied to all measures)
    if len(measures) != len(agg_funcs) and len(agg_funcs) != 1:
        issues.append(
            Issue(
                itype=IType.ERROR,
                description=
                "There must be one aggregation function (used for all measures) or one aggregation per measure",
                location=IssueLocation(sheet_name=name,
                                       row=1,
                                       column=aggregations_column)))

    if not result_name:
        result_name = source + "_" + dataset_name
        issues.append(
            Issue(itype=IType.WARNING,
                  description="No result name specified. Assuming '" +
                  result_name + "'",
                  location=IssueLocation(sheet_name=name, row=2,
                                         column=c + 1)))

    content = {
        "dataset_source": source,
        "dataset_name": dataset_name,
        "dataset_datetime": available_at_datetime,
        "where": filter_,
        "dimensions": [d for d in dims],
        "group_by": out_dims,
        "measures": measures,
        "agg_funcs": agg_funcs,
        "measures_as": measures_as,
        "result_name": result_name
    }
    return issues, None, content
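Finally, a condensed sketch of the measure/aggregator normalisation above: when a single aggregation function is declared it is propagated to every measure, and a default "measure as" name is derived (row numbers and measure names are invented):

out_measures = {2: {"measure": "flow",  "agg_func": "sum", "measure_as": None},
                3: {"measure": "stock", "agg_func": None,  "measure_as": None}}
first_agg_func = "sum"                       # the only aggregator defined (taken from the first row)
for v in out_measures.values():
    if v["measure"]:
        v["agg_func"] = first_agg_func       # extend the single aggregator to all measures
        if not v["measure_as"]:
            v["measure_as"] = v["measure"] + "_" + v["agg_func"]
# -> measures ["flow", "stock"], agg_funcs ["sum", "sum"], measures_as ["flow_sum", "stock_sum"]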