def build_dimension_df(pid_meta, ind_theme_id, next_dim_id):
    """Assemble the gis.Dimension insert dataframe for one product.

    A synthetic "Date" dimension is prepended to the EN/FR dimension names
    found in the product metadata (pid_meta). ind_theme_id is the owning
    IndicatorThemeId; next_dim_id is the next free DimensionId in the db.
    """
    en_names = ["Date"] + pid_meta["dimension_names"]["en"]
    fr_names = ["Date"] + pid_meta["dimension_names"]["fr"]
    dims = pd.DataFrame({"Dimension_EN": en_names, "Dimension_FR": fr_names})

    dims["IndicatorThemeId"] = int(ind_theme_id)
    dims["DisplayOrder"] = h.create_id_series(dims, 1)  # counter col for each dimension
    dims["DimensionId"] = h.create_id_series(dims, next_dim_id)

    # every dimension is a "Filter" except the last one, which is the "Value"
    dims["DimensionType"] = "Filter"
    dims.loc[dims.index[-1], "DimensionType"] = "Value"

    # column order expected by the insert statement
    insert_cols = ["DimensionId", "IndicatorThemeId", "Dimension_EN",
                   "Dimension_FR", "DisplayOrder", "DimensionType"]
    return dims.loc[:, insert_cols]
def build_date_dimension_values_df(file_dates, existing_dates, dim_id, next_dim_val_id, next_dim_val_order):
    """Build DimensionValues rows for file reference dates not yet in the db.

    file_dates     -- reference dates found in the source file (join column is
                      normalized in place)
    existing_dates -- date dimension values already in the db (a join column is
                      added in place)
    dim_id         -- DimensionId of the "Date" dimension
    next_dim_val_id / next_dim_val_order -- next free id / display order to
                      populate in the table
    Returns an empty dataframe when every file date already exists.
    """
    # normalize the join column on both sides (trimmed strings)
    file_dates["ReferencePeriod"] = file_dates["ReferencePeriod"].astype("str").str.strip()
    existing_dates["Display_EN"] = existing_dates["Display_EN"].astype("str").str.strip()

    # reformat any oddball db dates so they match the reference periods
    # created from the file (yyyy-mm-dd)
    existing_dates["ReferencePeriod"] = ""
    if not existing_dates.empty:
        existing_dates["ReferencePeriod"] = existing_dates.apply(
            lambda r: h.fix_ref_date(r["Display_EN"], "%Y-%m-%d"), axis=1)

    merged = pd.merge(file_dates, existing_dates, on="ReferencePeriod", how="left")
    # rows without a DimensionId had no match in the db -> these dates are new
    new_dates = merged[merged['DimensionId'].isnull()].copy()

    result = pd.DataFrame()
    if not new_dates.empty:
        # new reference dates exist: build the remaining insert columns
        new_dates["DimensionValueId"] = h.create_id_series(new_dates, next_dim_val_id)
        new_dates["DimensionId"] = dim_id
        new_dates["Display_EN"] = new_dates["REF_DATE"]  # raw file date becomes the EN display
        new_dates["Display_FR"] = new_dates["Display_EN"]  # duplicate date to FR
        new_dates["ValueDisplayOrder"] = h.create_id_series(new_dates, next_dim_val_order)
        result = build_dimension_values_df_subset(new_dates)
    return result
def build_dimension_values_df(pid_meta, df_dims, next_dim_val_id):
    """Build the gis.DimensionValues dataframe for all non-geography members.

    df_dims supplies the DimensionIds (output of build_dimension_df);
    next_dim_val_id is the next free DimensionValueId for the db.
    """
    dv = create_dimension_member_df(pid_meta["dimensions_and_members"])
    dv = dv.rename(columns={"MemberNameEn": "Display_EN",
                            "MemberNameFr": "Display_FR"})  # match db column names
    # the geography dimension is not stored as a dimension value
    dv = dv[dv["DimNameEn"].str.lower() != "geography"].copy()

    dv["DimensionValueId"] = h.create_id_series(dv, next_dim_val_id)
    # attach DimensionIds from the dimension dataframe
    dv = pd.merge(dv, df_dims, how="left",
                  left_on="DimNameEn", right_on="Dimension_EN")
    dv.sort_values(by=["DimPosId", "MemberId"], inplace=True)

    # display order counter restarts at 1 within each dimension
    dv["ValueDisplayOrder"] = dv.groupby(["DimensionId"]).cumcount() + 1
    # zero-padded prefix ("01. ", "02. ", ...) shown by the web app
    dv["MemberPrefix"] = dv["ValueDisplayOrder"].astype(str).str.zfill(2) + ". "
    for col in ("Display_EN", "Display_FR"):
        dv[col] = dv["MemberPrefix"] + dv[col]

    dv["ValueDisplayParent"] = None  # unable to determine whether field is being used, set null for now

    # enforce db column lengths, then order columns for insert
    for col in ("Display_EN", "Display_FR"):
        dv[col] = dv[col].astype("str").str[:255]
    return build_dimension_values_df_subset(dv)
def build_indicator_values_df(edf, gdf, ndf, next_id, prod_id, mixed_geo_justice_pids, is_sibling):
    """Build the dataframe for gis.IndicatorValues.

    edf  -- dataframe of the english csv file (NOTE: rows are dropped from it
            in place for justice products)
    gdf  -- GeographyReference ids (DGUID lookup)
    ndf  -- NullReason ids (Symbol -> NullReasonId lookup)
    next_id -- first IndicatorValueId to assign
    prod_id -- product id; products listed in mixed_geo_justice_pids get
            special date/geo handling, and is_sibling marks sibling justice
            tables whose top-level geos already exist in the master table
    """
    # Justice products with mixed geos
    if int(prod_id) in mixed_geo_justice_pids:
        # remove rows < 2017 if geolevel is not national/provincial/regional
        edf.drop(
            edf[(edf["RefYear"].astype("int16") < 2017)
                & (~edf["GeographicLevelId"].isin(["A0000", "A0001", "A0002"]))].index,
            inplace=True)
        # for sibling tables with mixed geos, remove these same geolevels
        # b/c they already exist in the master
        if is_sibling:
            edf.drop(edf[edf["GeographicLevelId"].isin(
                ["A0000", "A0001", "A0002"])].index,
                     inplace=True)

    df_iv = edf.loc[:, ["DGUID", "IndicatorCode", "STATUS", "VALUE"]]  # subset of full en dataset
    df_iv["IndicatorValueId"] = h.create_id_series(edf, next_id)  # populate IDs
    df_iv = pd.merge(df_iv, gdf, left_on="DGUID",
                     right_on="GeographyReferenceId",
                     how="left")  # join to geoRef for id
    df_iv.dropna(subset=["GeographyReferenceId"], inplace=True)  # drop empty ids
    df_iv.drop(["GeographyReferenceId"], axis=1, inplace=True)
    df_iv["IndicatorValueCode"] = df_iv["DGUID"] + "." + df_iv[
        "IndicatorCode"]  # combine DGUID and IndicatorCode
    df_iv.drop(["DGUID", "IndicatorCode"], axis=1, inplace=True)
    df_iv = pd.merge(df_iv, ndf, left_on="STATUS", right_on="Symbol",
                     how="left")  # join to NullReasonId for Symbol
    df_iv.drop(["STATUS", "Symbol"], axis=1, inplace=True)

    # format for locale while preserving decimals from datapoints
    if df_iv.shape[0] > 0:
        df_iv["Value_Dec"] = df_iv.apply(
            lambda x: h.format_number_preserve_decimals(x["VALUE"]),
            axis=1)  # temp column, dropped by the final column subset
        orig_locale = h.get_locale()
        try:
            h.set_locale("en_ca")
            df_iv["FormattedValue_EN"] = df_iv.apply(
                lambda x: h.format_number_for_locale(x["Value_Dec"]), axis=1)
            h.set_locale("fr_ca")
            df_iv["FormattedValue_FR"] = df_iv.apply(
                lambda x: h.format_number_for_locale(x["Value_Dec"]), axis=1)
        finally:
            # BUGFIX: restore the original locale even if formatting raises —
            # previously an exception left the process stuck in fr_ca/en_ca
            h.set_locale(orig_locale)
    else:
        df_iv["FormattedValue_EN"] = df_iv[
            "VALUE"]  # work around to prevent error on empty chunk
        df_iv["FormattedValue_FR"] = df_iv["VALUE"]

    # set datatypes for db
    df_iv = df_iv.fillna(np.nan).replace(
        [np.nan],
        [None])  # workaround to set nan/na=None (prevents sql error 22003)
    df_iv["IndicatorValueCode"] = df_iv["IndicatorValueCode"].str[:100]
    df_iv["VALUE"] = df_iv["VALUE"].astype("float64")
    # keep only the columns needed for insert
    df_iv = df_iv.loc[:, [
        "IndicatorValueId", "VALUE", "NullReasonId", "IndicatorValueCode",
        "FormattedValue_EN", "FormattedValue_FR"
    ]]
    return df_iv
def build_indicator_df(product_id, release_dt, dim_members, uom_codeset,
                       ref_date_list, next_id, min_ref_year,
                       mixed_geo_justice_pids, freq_code):
    """Build the data frame for gis.Indicator.

    product_id  -- product whose indicators are being generated
    release_dt  -- release date stamped on every row
    dim_members -- dimension/member metadata (fed to create_dimension_member_df)
    uom_codeset -- unit of measure code set for EN/FR descriptions
    ref_date_list -- list of possible reference dates
    next_id     -- next available indicator id
    min_ref_year -- generate rows from this year onward
    mixed_geo_justice_pids -- justice tables w/ mixed geo levels needing
                   special date handling
    freq_code   -- publication frequency, used to format the popup date
    """
    df = create_dimension_member_df(
        dim_members)  # turn dimension/member data into dataframe
    df.sort_values(
        by=["DimPosId", "MemberId"],
        inplace=True)  # Important to allow recombining columns in df later

    # prepare dictionaries for creating member combinations, keyed by DimPosId
    dim_mem_ids = {}  # for coordinates
    dim_mem_names_en = {}  # for english indicator name
    dim_mem_names_fr = {}  # for french indicator name
    dim_mem_uoms = {}  # for unit of measure (will only occur one per member)
    for index, row in df.iterrows():
        dim_id = row["DimPosId"]
        # skip dimension 1 (geography)
        if row["DimNameEn"] != "Geography":
            if dim_id not in dim_mem_names_en:
                dim_mem_names_en[dim_id] = []
            if dim_id not in dim_mem_names_fr:
                dim_mem_names_fr[dim_id] = []
            if dim_id not in dim_mem_ids:
                dim_mem_ids[dim_id] = []
            if dim_id not in dim_mem_uoms:
                dim_mem_uoms[dim_id] = []
            dim_mem_names_en[dim_id].append(row["MemberNameEn"])
            dim_mem_names_fr[dim_id].append(row["MemberNameFr"])
            dim_mem_ids[dim_id].append(row["MemberId"])
            app_uom = str(row["MemberUomCode"]) if row[
                "DimHasUom"] else ""  # keeps "nan" from ending up in the combo
            dim_mem_uoms[dim_id].append(app_uom)

    # build all possible member combinations (cartesian product across dims)
    mem_names_en = build_dimension_member_combos(dim_mem_names_en, " _ ")
    mem_names_fr = build_dimension_member_combos(dim_mem_names_fr, " _ ")
    mem_ids = build_dimension_member_combos(dim_mem_ids, ".")
    mem_uoms = build_dimension_member_combos(dim_mem_uoms, " ")

    pre_df = False
    # because the dicts are already sorted we can safely stick them together
    # as columns in a dataframe at the end.
    if len(mem_names_en) == len(mem_names_fr) == len(mem_ids) == len(mem_uoms):
        pre_df = pd.DataFrame(
            {
                "IndicatorNameLong_EN": mem_names_en,
                "IndicatorNameLong_FR": mem_names_fr,
                "Coordinate": mem_ids,
                "UOM_ID": mem_uoms
            }, dtype=str)
    # NOTE(review): if the length sanity check above ever failed, pre_df would
    # still be False and the subscripts below would raise — TODO confirm this
    # degenerate path is intentionally fail-fast.

    # UOM - Combining members may result in the uom field looking like
    # "nan nan 229.0", we only want the 229 part.
    # Must go to float before int to prevent conversion error
    pre_df["UOM_ID"] = pre_df["UOM_ID"].str.replace("nan", "").str.replace(
        " ", "").astype("float").astype("int16")
    # Turn off inspection next 2 lines, false-positives from pycharm:
    # see https://youtrack.jetbrains.com/issue/PY-43841
    # noinspection PyTypeChecker
    pre_df["UOM_EN"] = pre_df.apply(
        lambda x: h.get_uom_desc_from_code_set(x["UOM_ID"], uom_codeset, "en"),
        axis=1)
    # noinspection PyTypeChecker
    pre_df["UOM_FR"] = pre_df.apply(
        lambda x: h.get_uom_desc_from_code_set(x["UOM_ID"], uom_codeset, "fr"),
        axis=1)
    pre_df["IndicatorThemeID"] = product_id
    pre_df["ReleaseIndicatorDate"] = release_dt
    pre_df[
        "Vector"] = np.NaN  # Vector field exists in gis.Indicator but is not used. We will insert nulls.

    # IndicatorNames seem to only be used for populating titles on related
    # charts - 2nd last member for legend
    pre_df["IndicatorName_EN"] = pre_df.apply(
        lambda x: h.get_nth_item_from_string_list(x["IndicatorNameLong_EN"],
                                                  " _ ", -2),
        axis=1)
    pre_df["IndicatorName_FR"] = pre_df.apply(
        lambda x: h.get_nth_item_from_string_list(x["IndicatorNameLong_FR"],
                                                  " _ ", -2),
        axis=1)

    # Create new indicator data frame with a row for each year in the
    # reference period
    ind_df = copy_data_frames_for_date_range(pre_df, ref_date_list,
                                             min_ref_year, product_id,
                                             mixed_geo_justice_pids)

    # add the remaining fields that required RefYear to be built first
    ind_df["RefYear"] = ind_df["RefYear"].astype("str")
    ind_df["IndicatorCode"] = str(product_id) + "." + ind_df[
        "Coordinate"] + "." + ind_df["ReferencePeriod"]

    # This field becomes the popup on the web app. Reformat date depending on
    # publication frequency (defaults to showing the full date).
    freq_dict = h.build_freq_code_to_pd_dict()
    fmt = freq_dict[freq_code][
        "py_fmt"] if freq_code in freq_dict else "%Y-%m-%d"  # default to show full date
    ind_df["IndicatorDisplay_EN"] = build_dimension_ul(
        ind_df["ReferencePeriod"], fmt, ind_df["IndicatorNameLong_EN"])
    ind_df["IndicatorDisplay_FR"] = build_dimension_ul(
        ind_df["ReferencePeriod"], fmt, ind_df["IndicatorNameLong_FR"])
    ind_df["IndicatorId"] = h.create_id_series(ind_df, next_id)  # populate IDs

    # build fields needed later for IndicatorMetaData DimensionUniqueKey
    # matching and RelatedCharts
    ind_df["IndicatorFmt"] = ind_df["ReferencePeriod"] + "-" + ind_df[
        "IndicatorNameLong_EN"].str.replace(" _ ", "-")
    ind_df["LastIndicatorMember_EN"] = ind_df.apply(
        lambda x: h.get_nth_item_from_string_list(x["IndicatorNameLong_EN"],
                                                  " _ "),
        axis=1)
    ind_df["LastIndicatorMember_FR"] = ind_df.apply(
        lambda x: h.get_nth_item_from_string_list(x["IndicatorNameLong_FR"],
                                                  " _ "),
        axis=1)

    # set datatypes for db
    ind_df["ReleaseIndicatorDate"] = ind_df["ReleaseIndicatorDate"].astype(
        "datetime64[ns]")
    ind_df["ReferencePeriod"] = ind_df["ReferencePeriod"].astype(
        "datetime64[ns]")
    ind_df["IndicatorCode"] = ind_df["IndicatorCode"].str[:100]
    return ind_df