def schema_info_for_UID(self, parent_trace, a_uid):
    '''
    Returns a pair:

    * An AcronymInfo object representing the schema properties corresponding to the given UID `a_uid`
    * A string corresponding to the name of the UID column that should be used in a DataFrame or Excel
      representation for that `a_uid`. Typically these strings are like "UID", "UID-1", "UID-2", etc.

    @param a_uid A string representing a UID. It may be a full UID like "BR3.MR2.SR1" or just a leaf
                UID like "SR1".
    '''
    leaf_uid = a_uid.split(".")[-1]
    acronym = UID_Utils().parseToken(parent_trace, leaf_uid)[0]
    acronyminfo_guesses = [info for info in self.acronyminfo_list if info.acronym == acronym]
    if len(acronyminfo_guesses) != 1:
        raise ApodeixiError(parent_trace, "UID Acronym schema is either not initialized or corrupted: "
                                + "it does not recognize a unique acronym for entity's UID",
                            data = {"entity_UID": str(leaf_uid),
                                    "inferred acronyms": str(self.acronyminfo_list)})
    acronyminfo = acronyminfo_guesses[0]
    level = self.acronyminfo_list.index(acronyminfo)
    UID = Interval.UID
    if level == 0:
        UID_COL = UID
    else:
        UID_COL = UID + '-' + str(level)    # We start at "UID-1", "UID-2", etc. "UID" is only used at level 0
    return acronyminfo, UID_COL
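# Minimal standalone sketch of the UID-column naming rule used above (a hypothetical helper, not
# part of the class): level 0 maps to "UID", and deeper levels map to "UID-1", "UID-2", etc.
def _uid_col_for_level_sketch(level):
    return "UID" if level == 0 else "UID-" + str(level)

# _uid_col_for_level_sketch(0) -> "UID";  _uid_col_for_level_sketch(2) -> "UID-2"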
def _retrieve_referenced_uids(self, parent_trace):
    '''
    Returns a list of UID strings that the referencing manifest (i.e., the manifest identified by
    self.referencing_handle) has in the path given by self.referencing_path.

    If the path is not valid, or if it points to something that is not a UID or a list of UIDs,
    this method raises an ApodeixiError.
    '''
    referencing_dict, ref_path = self.store.retrieveManifest(parent_trace, self.referencing_handle)

    val = DictionaryUtils().get_val(parent_trace = parent_trace,
                                    root_dict = referencing_dict,
                                    root_dict_name = "Referencing Manifest",
                                    path_list = self.referencing_path,
                                    valid_types = [str, list])

    my_trace = parent_trace.doing("Validating referenced UIDs are well-formed")
    # To make a uniform check, operate on lists regardless of whether val is a str (1 UID) or a list (multiple UIDs)
    if type(val) == str:
        alleged_uids = [val]
    else:
        alleged_uids = val

    # We leverage the UID_Utils method tokenize to validate that the UIDs are well-formed
    for au in alleged_uids:
        loop_trace = my_trace.doing("Validating that '" + str(au) + "' is a well-formed UID")
        UID_Utils().tokenize(loop_trace, au)    # Will error out if au is not well-formed

    # If we get this far without erroring out, the UIDs are all well-formed, so we can return them
    return alleged_uids
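# Illustrative walk-through (hypothetical manifest content, not from the source): if the referencing
# manifest contains
#
#   assertion:
#     big-rock-ref: ["BR1", "BR2.SR1"]
#
# and self.referencing_path == ["assertion", "big-rock-ref"], this method would return
# ["BR1", "BR2.SR1"] after checking that each entry tokenizes cleanly; a malformed entry like "BR-x"
# would instead cause UID_Utils().tokenize to raise an ApodeixiError.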
def attempt_tokenize(self, parent_trace, uid):
    my_trace = parent_trace.doing("Attempting to tokenize '" + uid + "'")
    try:
        x = UID_Utils().tokenize(my_trace, uid)
    except ApodeixiError:
        x = 'error'
    return x
def pad_uid(self, parent_trace, a_full_uid):
    '''
    Utility method that can be used by callers that need to compare the padded and unpadded full UIDs.

    Some explanation: unpadded UIDs are used in the paths through the manifest trees. For an acronym
    schema like [BR (big-rock), SR (Sub rock), TR (Tiny rock)], unpadded UIDs are things like
    BR2.SR3 and BR1.TR1.

    However, the UID fields themselves inside the manifests must be "padded" if the user skips an
    entity, so that the knowledge of having skipped an entity is available later when the manifest
    is represented as a DataFrame or in Excel.

    In our example, padding BR1.TR1 results in BR1.SR0.TR1, since the end-user skipped the sub-rock
    entity on that path. Padding BR2.SR3 yields no change (still BR2.SR3), since in that case the
    user skipped no entity.
    '''
    tokens = a_full_uid.split(".")
    padded_tokens = []
    all_acronyms = [info.acronym for info in self.acronym_infos()]
    for idx in range(len(tokens)):
        acronym, nb = UID_Utils().parseToken(parent_trace, tokens[idx])
        if not acronym in all_acronyms:
            raise ApodeixiError(parent_trace, "Can't pad UID because it uses an acronym not in the schema",
                                data = {"bad acronym": str(acronym),
                                        "uid": str(a_full_uid),
                                        "schema": str(self)})
        schema_idx = all_acronyms.index(acronym)
        if schema_idx < len(padded_tokens):
            raise ApodeixiError(parent_trace, "Can't pad UID because it has an acronym out of order with "
                                    + "regards to the acronym schema. It must appear at index "
                                    + str(schema_idx) + " but that is already taken by the partially "
                                    + "constructed padded tokens so far",
                                data = {"bad acronym": str(acronym),
                                        "uid": str(a_full_uid),
                                        "schema": str(self),
                                        "padded tokens so far": str(padded_tokens)})
        # Now pad, if required
        for pad_idx in range(len(padded_tokens), schema_idx):
            pad_acronym = all_acronyms[pad_idx]
            padded_tokens.append(pad_acronym + "0")
        # Now add our payload
        padded_tokens.append(tokens[idx])

    return ".".join(padded_tokens)
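# Minimal standalone sketch of the padding idea above (a hypothetical helper, not the class method):
# given an acronym order and a full UID, insert "<ACRONYM>0" tokens for any skipped entities.
# Assumes well-formed tokens, i.e., an alphabetic acronym followed by digits.
import re

def _pad_uid_sketch(acronym_order, full_uid):
    tokens = full_uid.split(".")
    padded = []
    for token in tokens:
        acronym = re.match(r"[A-Za-z]+", token).group(0)
        schema_idx = acronym_order.index(acronym)
        # Insert "<acronym>0" placeholders for every schema level skipped before this token
        for pad_idx in range(len(padded), schema_idx):
            padded.append(acronym_order[pad_idx] + "0")
        padded.append(token)
    return ".".join(padded)

# _pad_uid_sketch(["BR", "SR", "TR"], "BR1.TR1")  -> "BR1.SR0.TR1"
# _pad_uid_sketch(["BR", "SR", "TR"], "BR2.SR3")  -> "BR2.SR3"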
def _build_df_rows(self, parent_trace, content_dict, acronym_schema, parent_path, parent_uid, sparse,
                    abbreviate_uids):
    '''
    Recursive method that creates the data from which a Pandas DataFrame can be easily created by the
    caller, based on the dictionary `content_dict`. It returns:

    * The list of intervals whose concatenation would yield the columns for such a DataFrame-to-be
      (see example below for an explanation)
    * The list of rows (each row represented as a dict, whose keys are DataFrame columns, at least
      those columns for which the row has a non-NaN value)
    * A list of UID_Info objects, built as a by-product of this method's processing and which some
      callers may find useful. Refer to the documentation of UID_Info for explanation and example
      use cases.

    The rows might be populated to be "sparse" or not, depending on the `sparse` parameter (a boolean).
    This is best explained in an example. The following is a non-sparse DataFrame:

        UID |   Big Rock         | UID-1 |  Sub rock
        ==============================================================
        BR1 |   New UX           | BR1.1 |   FX UI
        BR1 |   New UX           | BR1.2 |   Lending UI
        BR2 |   Containerization | BR2.1 |   Pricing service
        BR2 |   Containerization | BR2.2 |   Market data svc

    This "non-sparse" representation is well suited for doing data analysis. In contrast, the "sparse"
    representation is geared towards visualization in Excel, where usability calls for leaving out
    repetitive text, replacing it by blanks since humans can easily infer what the values should be
    by glancing at preceding rows:

        UID |   Big Rock         | UID-1 |  Sub rock
        =============================================================
        BR1 |   New UX           |       |
            |                    | BR1.1 |   FX UI
            |                    | BR1.2 |   Lending UI
        BR2 |   Containerization |       |
            |                    | BR2.1 |   Pricing service
            |                    | BR2.2 |   Market data svc

    In particular, the sparse representation has exactly 1 UID per row, using more rows if needed
    (in the example, the non-sparse representation has 4 rows but the sparse representation has 6 rows).

    This example also helps us comment on some other nuances of the algorithm:

    1. The first object returned by this method is a "list of intervals", which in the example would be

            [["UID", "Big Rock"], ["UID-1", "Sub rock"]]

    2. The second object returned by this method is "all the rows as dictionaries", which for a sparse
       situation would be:

            [{"UID": "BR1", "Big Rock": "New UX"},
             {"UID-1": "BR1.1", "Sub rock": "FX UI"},
             {"UID-1": "BR1.2", "Sub rock": "Lending UI"},
             {"UID": "BR2", "Big Rock": "Containerization"},
             {"UID-1": "BR2.1", "Sub rock": "Pricing service"},
             {"UID-1": "BR2.2", "Sub rock": "Market data svc"},
            ]

    3. The dictionaries representing rows don't need to have all columns present as keys. Pandas can
       still create a DataFrame from that, and will just put a null (NaN or such) as the value of that
       column for the row in question. The algorithm makes use of this.

    4. The UIDs are "abbreviated". For example, UID-1 has a value like "BR1.1" instead of "BR1.SR1".
       So only the first acronym "BR" (for "Big Rock") is displayed, not the second acronym "SR"
       (for "Sub rock"). This is for usability. The `content_dict` parameter is expected to contain
       non-abbreviated UIDs.

    5. The `content_dict` representing the manifest's content uses "incremental" non-abbreviated UIDs
       for its recursive structure ("recursive" as in: a dict that contains some children that are
       sub-dictionaries, not just scalars). By "incremental" we mean that content_dict["BR1"] would be
       a dict and the children are accessed by keys like "SR1" and "SR2", not "BR1.SR1" and "BR1.SR2".
       Thus, in our example, content_dict["BR1"]["SR1"] and content_dict["BR1"]["SR2"] are the
       expected way to navigate the "recursive" nature of the content_dict.

    6. Because of the conventions in apodeixi.xli.breakdown_builder that were used to build such a
       `content_dict`, there are columns like "BR1-name" and "SR1-name". These are ignored by this
       method.

    7. This algorithm operates recursively, one interval at a time. In the example, we first process
       interval ["UID", "Big Rock"], and first identify this row fragment:

            {"UID": "BR1", "Big Rock": "New UX"}

       The algorithm then makes a recursive call on content_dict["BR1"], which returns two objects:

       * A list of intervals: [["UID-1", "Sub rock"]], which is then merged with the caller's interval
         list to date and results in [["UID", "Big Rock"], ["UID-1", "Sub rock"]]
       * A list of rows:

            [{"UID-1": "BR1.1", "Sub rock": "FX UI"},
             {"UID-1": "BR1.2", "Sub rock": "Lending UI"}]

       These then need to be "merged" with the caller's rows, and the merging varies depending on
       whether sparse=True or not. In the sparse case, the merging would look like 3 rows:

            [{"UID": "BR1", "Big Rock": "New UX"},
             {"UID-1": "BR1.1", "Sub rock": "FX UI"},
             {"UID-1": "BR1.2", "Sub rock": "Lending UI"}]

       In the case sparse=False, the merging would look like 2 rows:

            [{"UID": "BR1", "Big Rock": "New UX", "UID-1": "BR1.1", "Sub rock": "FX UI"},
             {"UID": "BR1", "Big Rock": "New UX", "UID-1": "BR1.2", "Sub rock": "Lending UI"}]

    8. Apart from sub-dictionaries, `content_dict` usually has scalar attributes. These need to be
       included in the rows when they have a value.

    9. Scalar attributes introduce a nuance with the merging of intervals: as subsequent rows of the
       result are gradually created, the algorithm only has a partial view of what the intervals are,
       since it infers intervals' columns based on what it has seen so far. It thus may happen that,
       when dealing with a later row, it will encounter additional columns for an entity's interval
       that had been previously seen. This must be taken into consideration in the merging of
       intervals.

       For example, perhaps we are processing a manifest dict that arose from parsing a posting
       like this:

            UID |   Big rock    | Asset classes            | Intended user
            ===========================================================================
            BR1 |   Lending UI  | Mortgages, commercial    |
            BR2 |   Treasury UI |                          | FX traders

       Then the manifest will have a branch for the first row that will make us infer an interval like

            [UID, Big rock, Asset classes]

       Later, when we process the manifest's branch that arose from the second row, we will instead
       get an interval like

            [UID, Big rock, Intended user]

       The correct "merge" behavior in such a case is not to treat these as separate intervals, but to
       merge them as

            [UID, Big rock, Asset classes, Intended user]

    @param content_dict A dict object representing the contents of a manifest, as opposed to the
                    entire manifest. In the first example above, if manifest_dict represents a full
                    manifest, then content_dict = manifest_dict['assertion']['big-rock']
    @param acronym_schema An AcronymSchema object that captures all the acronyms for `content_dict`
                    as well as their corresponding entity names. This should be "global", i.e., it is
                    not for a "sub-tree" but for the full manifest. Logically speaking, the schema
                    contains information that specifies the order of the acronyms and their entities.
                    While an object and not a list, logically it is as in this example:

                        [("BR", "big-rock"), ("MR", "medium_rock"), ("SR", "small rock")]

    @param parent_path A string using 'dot notation' for the path in the original YAML file that led
                    to the `content_dict`. In the first example above, that would be
                    "assertion.big-rock" when this method is first called, and
                    "assertion.big-rock.BR1.Sub rock" when it recursively calls itself.
    @param parent_uid A string used to assist in recursive calls. In the first example above, that
                    would be None when this method is first called, and "BR1" on a 1st recursion, or
                    "BR1.SR1" on a 2nd nested recursion.
    @param sparse A boolean. If True, it returns a "sparse" representation suitable for Excel
                    rendering, with exactly 1 UID per row. Otherwise it returns the non-sparse
                    representation, which is better suited for data analysis (e.g., when making joins).
    @param abbreviate_uids A boolean. If True, UIDs will only keep the top acronym. For example, a
                    UID like "BR2.MR2.SM4" in the manifest would be transformed to "BR2.2.4" in the
                    DataFrame returned by this method.
    '''
    my_trace = parent_trace.doing("Validating parent_path '" + parent_path + "'",
                                    data = {'signaledFrom': __file__})
    if True:
        if parent_path == None or len(parent_path.strip()) == 0:
            raise ApodeixiError(my_trace, "Can't process a parent_path that is null or blank")

    # parent_path is something like "assertion.big-rock" when this method is first called, and
    # like "assertion.big-rock.BR1.Sub rock" when this method calls itself recursively
    entity_uids = [key for key in content_dict.keys() if not key.endswith('-name')]

    # Will be one per entity_value, a dictionary of level_1_columns -> scalar value. By "level 1 columns"
    # we mean columns for the interval being processed here (subsequent intervals would be processed
    # in recursive invocations of this method). See method documentation for explanation of algorithm.
    all_rows = []

    # Some manifest field names that have fixed, hard-coded values in Apodeixi
    UID = Interval.UID
    NAME = 'name'
    SYNTHETIC_COLUMNS = [UID, NAME]    # These are added when parsing Excel, so not "real" content

    all_intervals = []
    uid_info_list = []    # We'll build this as a by-product and return it

    # On a first call we loop through something like e_uid = "BR1", "BR2", "BR3", .... For that call
    # parent_uid = None and parent_path = "assertion.big-rock"
    # On a recursive call with parent_uid = "BR1" we loop through e_uid = "SR1", "SR2", "SR3", .... In this case
    # parent_path = "assertion.big-rock.BR1.Sub rock"
    for e_uid in entity_uids:
        if parent_uid == None:
            full_e_uid = e_uid
        else:
            full_e_uid = parent_uid + '.' + e_uid

        e_path = parent_path + '.' + e_uid
        e_dict = content_dict[e_uid]

        loop_trace = parent_trace.doing("Looping on entity with path '" + e_path + "'",
                                        data = {'signaledFrom': __file__})

        inner_trace = loop_trace.doing("Determining name to give to UID column in DataFrame for a UID",
                                        data = {"entity UID": str(full_e_uid)})
        e_acronyminfo, UID_COL = acronym_schema.schema_info_for_UID(parent_trace, e_uid)

        # Check if we already have an interval for this acronym info, and if not, create one
        my_prior_interval = [interval for interval in all_intervals
                                if e_acronyminfo.entity_name in interval.columns]
        if len(my_prior_interval) == 0:
            my_interval = Interval(parent_trace = my_trace,
                                    columns = [UID_COL, e_acronyminfo.entity_name],
                                    entity_name = e_acronyminfo.entity_name)
            all_intervals.append(my_interval)
        else:
            my_interval = my_prior_interval[0]    # Re-use the interval created for a "previous row"

        inner_trace = loop_trace.doing("Checking tree under '" + e_path + "' is well formed",
                                        data = {'signaledFrom': __file__})
        if True:
            # Check e.g. if content_dict = manifest_dict["assertion"]["big-rock"]["BR1"]["SubRock"]
            # that content_dict["SR2"] exists
            if e_dict == None:
                raise ApodeixiError(inner_trace, "Badly formatted tree: found nothing under '"
                                        + e_path + "'")
            # Check e.g. content_dict["SR2"] is a dictionary
            if type(e_dict) != dict:
                raise ApodeixiError(inner_trace, "Badly formatted tree: expected dictionary at '"
                                        + e_path + "' but instead found a " + str(type(e_dict)))
            # Check e.g. content_dict["SR2"]["UID"] exists
            if not UID in e_dict.keys():
                raise ApodeixiError(inner_trace, "Badly formatted tree: expected a child called '"
                                        + UID + "' under '" + e_path + "'")
            # Check e.g. content_dict["SR2"]["UID"] == "SR2", except possibly for padding (this occurs
            # when the end user skips an entity). Thus, content_dict["SR2"]["UID"] = "MR0.SR2" would be OK
            if e_dict[UID] != acronym_schema.pad_uid(parent_trace, full_e_uid):
                raise ApodeixiError(inner_trace, "Badly formatted tree: expected '" + e_path + "["
                                        + UID + "] = " + full_e_uid + "'",
                                    data = {"expected": full_e_uid,
                                            "actual": str(e_dict[UID])})
            # Check e.g. content_dict["SR2"]["name"] exists
            if not NAME in e_dict.keys():
                raise ApodeixiError(inner_trace, "Badly formatted tree: expected a child called '"
                                        + NAME + "' under '" + e_path + "'")

        inner_trace = loop_trace.doing("Building DataFrame row",
                                        data = {"entity path": str(e_path)})

        # We call it "level 1" because it is for my_interval. Recursive calls would handle the subsequent
        # intervals, which are "levels 2, 3, ..." in the content_dict "tree"
        new_level_1_row = {}

        # Add the entity column to the level_1 row.
        # But first replace by a "friendly" UID like "BR1.2" instead of "BR1.SR2", if we are thus configured
        if abbreviate_uids == True:
            abbreviated_full_e_uid = UID_Utils().abbreviate_uid(parent_trace = inner_trace,
                                                                uid = full_e_uid,
                                                                acronym_schema = acronym_schema)
            new_level_1_row[UID_COL] = abbreviated_full_e_uid
        else:
            # Remember to pad if needed, i.e., maybe full_e_uid is BR1.TR1, but if the acronym schema
            # is [BR, MR, TR], we should put a BR1.MR0.TR1 in the new_level_1_row, not a BR1.TR1
            new_level_1_row[UID_COL] = acronym_schema.pad_uid(parent_trace, full_e_uid)

        new_level_1_row[e_acronyminfo.entity_name] = e_dict[NAME]

        uid_info_list.append(UID_Info(uid = new_level_1_row[UID_COL],
                                        entity_value = new_level_1_row[e_acronyminfo.entity_name]))

        sub_entities = acronym_schema.find_entities(inner_trace, e_dict)    # Something like "Sub rock"

        # Now add the "scalar" attributes to the row and, if needed, also to the interval. A reason they may
        # not be in the interval already arises if we are creating the "first row" (i.e., entity e_uid)
        # or if that attribute was not present in "previous rows"
        for attrib in [a for a in e_dict.keys() if not a in sub_entities and a not in SYNTHETIC_COLUMNS]:
            if not attrib in my_interval.columns:
                my_interval.columns.append(attrib)
            new_level_1_row[attrib] = e_dict[attrib]

        # Now we gear up to make a recursive call. For example, if we have been processing the interval
        # ["UID", "big-rock"] and e_dict = content_dict["BR1"], we are now going to take the plunge into
        # the unique sub-entity "Sub rock" and make a recursive call to process interval
        # ["UID-1", "Sub rock"] passing content_dict["BR1"]["Sub rock"] as the content to process.
        #
        # For our e_path = "assertion.big-rock.BR1" we pass a path of "assertion.big-rock.BR1.Sub rock",
        # and we set "ourselves" ("BR1") as the parent_uid in the recursive call
        sub_rows_across_subentities = []
        for sub_entity in sub_entities:
            sub_rows = []
            sub_intervals = []
            inner_trace = loop_trace.doing("Making a recursive call for '" + sub_entity + "'",
                                            data = {'signaledFrom': __file__})

            sub_intervals, sub_rows, sub_uid_info_list = self._build_df_rows(
                                                            parent_trace = inner_trace,
                                                            content_dict = e_dict[sub_entity],
                                                            acronym_schema = acronym_schema,
                                                            parent_path = e_path + '.' + sub_entity,
                                                            parent_uid = full_e_uid,
                                                            sparse = sparse,
                                                            abbreviate_uids = abbreviate_uids)
            uid_info_list.extend(sub_uid_info_list)

            # Post-processing recursive call: handle the columns
            #
            # The recursive call discovered what other columns pertain to the sub-entity. We need to merge
            # this from two perspectives:
            # - If this was the first time we created an interval for the sub-entity (e.g., for "Sub rock"),
            #   then add it to the list of intervals.
            # - However, if we already had an interval for "Sub rock" from having processed "previous rows",
            #   then it may be that the recursive call we just made uncovered additional columns to be added
            #   to that pre-existing interval. See documentation for self._merge_interval
            self._merge_interval_lists(loop_trace, all_intervals, sub_intervals)

            sub_rows_across_subentities.extend(sub_rows)

        # Post-processing recursive call: handle the rows
        #
        # This is where the logic of sparse or non-sparse applies. See the documentation of this method
        # for an explanation of that algorithm
        if sparse == True:    # Add (1 + N) rows, where N = len(sub_rows_across_subentities)
            all_rows.append(new_level_1_row)
            for idx in range(len(sub_rows_across_subentities)):
                all_rows.append(sub_rows_across_subentities[idx])
        else:    # Add N rows, where N = max(len(sub_rows_across_subentities), 1)
            if len(sub_rows_across_subentities) > 0:
                for r in sub_rows_across_subentities:
                    # Copy the "level 1" data to all sub-rows and add them
                    for k in new_level_1_row.keys():
                        r[k] = new_level_1_row[k]
                    all_rows.append(r)
            else:
                all_rows.append(new_level_1_row)

    # Before returning, we need to sort the intervals to be order-consistent with the acronym_schema.
    # This is needed because, due to the possibility of some rows having skipped an entity, it is possible
    # that in `all_intervals` we have the interval for UID-3 before the interval for UID-2.
    my_trace = parent_trace.doing("Sorting intervals as per Acronym Schema",
                                    data = {"parent_uid": str(parent_uid),
                                            "acronym schema": str(acronym_schema)})
    sorted_intervals = []
    for acronyminfo in acronym_schema.acronym_infos():
        # Find the interval for this acronym, if there is one in our list so far
        entity_name = acronyminfo.entity_name
        matches = [interval for interval in all_intervals if entity_name in interval.columns]
        if len(matches) > 1:
            raise ApodeixiError(my_trace, "Found multiple intervals for the same entity, and there should "
                                    + "be at most 1")
        elif len(matches) == 1:
            sorted_intervals.append(matches[0])

    # Check we sorted everything
    if len(sorted_intervals) != len(all_intervals):
        raise ApodeixiError(my_trace, "Was not able to sort all intervals based on the Acronym Schema. "
                                + "Some intervals did not seem to correspond to anything recognized "
                                + "in the Acronym Schema. The sorted intervals should have been exactly "
                                + "as long as all_intervals",
                            data = {"len(all_intervals)": str(len(all_intervals)),
                                    "len(sorted_intervals)": str(len(sorted_intervals))})

    # Return value to caller (which for a recursive call was this method itself, processing the interval
    # to the left of ours)
    return sorted_intervals, all_rows, uid_info_list
def _next_level_acronym_info(self, parent_trace, e_uid, content_dict, entity_name, level):
    '''
    Helper method for self._map_acronyminfo_lists. The latter is a recursive method, and this method
    is used when the recursion "hits bottom", or when aggregating results from a recursive call.

    It returns a list of AcronymInfo objects, which normally would be a singleton: the AcronymInfo for
    the very next level in the tree:

        [AcronymInfo(e_acronym, entity_name)]

    where e_acronym is e_uid's acronym.

    *HOWEVER*, there is a boundary case that could lead to a bug unless we return a list with more
    than one element: when the user skipped some intermediate entity.

    Consider this example: the acronym schema should be [BR (big rock), SR (sub rock), TR (tiny rock)]
    but the user skipped sub-rock sometimes. Anticipating this might happen, full UIDs were generated
    in the UID Store that put a "0" whenever an entity is skipped. For example, BR1.SR0.TR1 instead
    of BR1.TR1

    Assume further that we are at a point in the recursion where
    content_dict = manifest_dict["assertion"][big-rocks][BR1][Tiny rocks], and
    content_dict[TR1][UID] = BR1.SR0.TR1

    Now, another path of the recursion would be perhaps
    content_dict2 = manifest_dict["assertion"][big-rocks][BR2][Sub rocks], and
    content_dict2[SR3][UID] = BR2.SR3

    In this situation, it would be wrong for us to return [AcronymInfo(TR, Tiny rocks)] because the
    other path, which is at the same level in the manifest_dict tree, would return
    [AcronymInfo(SR, Sub rocks)], which will trigger an error in our subsequent processing, since we
    would think that at this level there are two valid acronyms for the next level down: TR and SR,
    and only one acronym is allowed per level.

    Therefore, the *correct* behaviour is to *pad* the list returned from this method to ensure that
    TR is never at the same level as SR. That means returning

        [AcronymInfo(SR, None), AcronymInfo(TR, "Tiny rocks")]

    The level of padding can be determined by looking at content_dict[e_uid][UID]
    '''
    padded_uid = content_dict[e_uid][Interval.UID]
    padded_tokens = padded_uid.split(".")

    # Check consistency of the UID field with the path UID
    my_trace = parent_trace.doing("Checking that the UID field is for a UID that extends the prior level",
                                    data = {"path incremental UID": str(e_uid),
                                            "level": str(level),
                                            "dict['UID']": str(padded_uid)})
    if len(padded_tokens) < level + 1:
        raise ApodeixiError(my_trace, "UID field lacks the required tokens: expected at least "
                                + str(level + 1) + " tokens, but found only "
                                + str(len(padded_tokens)) + " in the UID field")

    # Check any extra tokens are only padding, as we add that padding
    my_trace = parent_trace.doing("Padding a list of AcronymInfos below a node in the manifest tree "
                                        + "due to user skipping entities",
                                    data = {"path incremental UID": str(e_uid),
                                            "level": str(level),
                                            "dict['UID']": str(padded_uid)})
    result = []
    for idx in range(level, len(padded_tokens) - 1):
        some_acronym, some_val = UID_Utils().parseToken(my_trace, padded_tokens[idx])
        if some_val != 0:
            raise ApodeixiError(my_trace, "Corrupted manifest: token '" + str(padded_tokens[idx])
                                    + "' in UID field '" + str(padded_uid)
                                    + "' should have only been padding, i.e., a value of 0")
        # Add the padding
        result.append(AcronymInfo(some_acronym, None))    # We put None for the entity because we don't know it, but that's OK

    # Any required padding is in, so now we can safely add the e_uid's acronym info
    e_acronym = UID_Utils().parseToken(my_trace, e_uid)[0]
    result.append(AcronymInfo(e_acronym, entity_name))
    return result
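# Illustrative walk-through (hypothetical values, not from the source): with level=1, e_uid="TR1",
# entity_name="Tiny rocks", and content_dict["TR1"]["UID"] == "BR1.SR0.TR1", the loop above inspects
# the tokens between position 1 and the leaf, finds "SR0" (value 0, so valid padding), and returns
#
#   [AcronymInfo("SR", None), AcronymInfo("TR", "Tiny rocks")]
#
# whereas content_dict["TR1"]["UID"] == "BR1.SR2.TR1" would raise, since "SR2" is not padding.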
def _map_acronyminfo_lists(self, parent_trace, content_dict, parent_path, parent_uid, level=0):
    '''
    This is a recursive helper method to the "map-reduce" algorithm used by method _find_acronym_list.
    Refer to the documentation of that method for an explanation of the context for the algorithm.

    This method returns a list of lists, where the inner lists consist of _AcronymInfo objects.

    @param level An integer that tells us where we are in the recursion. Starts at 0, so it must equal
                the number of tokens in parent_uid. Helpful to disambiguate the index to use for an
                AcronymInfo object in the returned value, particularly when the user skipped some
                intermediate entities, so we can't rely on the length of parent_path for such a
                determination.
    '''
    my_trace = parent_trace.doing("Mapping acronym lists for '" + parent_path + "'",
                                    data = {'signaledFrom': __file__})
    if True:
        if parent_path == None or len(parent_path.strip()) == 0:
            raise ApodeixiError(my_trace, "Can't process a parent_path that is null or blank")

    # parent_path is something like "assertion.big-rock" when this method is first called, and
    # like "assertion.big-rock.BR1.Sub rock" when this method calls itself recursively
    path_tokens = parent_path.split('.')
    entity_name = path_tokens[-1]    # Like "big-rock" on 1st call, and "Sub rock" on recursive call

    entity_uids = [key for key in content_dict.keys() if not key.endswith('-name')]

    # Will be one per "path" within the "tree" represented by `content_dict`, consisting of the acronyms
    # encountered along that path, in order.
    all_acronyms_result = []

    my_trace = parent_trace.doing("Mapping acronyms under '" + str(parent_path) + "'",
                                    data = {'signaledFrom': __file__})

    # On a first call we loop through something like e_uid = "BR1", "BR2", "BR3", .... For that call
    # parent_uid = None and parent_path = "assertion.big-rock"
    # On a recursive call with parent_uid = "BR1" we loop through e_uid = "SR1", "SR2", "SR3", .... In this case
    # parent_path = "assertion.big-rock.BR1.Sub rock"
    for e_uid in entity_uids:
        loop_trace = parent_trace.doing("Looping on entity with UID '" + str(e_uid) + "'",
                                        data = {'signaledFrom': __file__})
        if parent_uid == None:
            full_e_uid = e_uid
        else:
            full_e_uid = parent_uid + '.' + e_uid

        e_path = parent_path + '.' + e_uid
        e_dict = content_dict[e_uid]

        inner_trace = loop_trace.doing("Checking tree under '" + e_path + "' is well formed",
                                        data = {'signaledFrom': __file__})
        if True:
            # Check e.g. if content_dict = manifest_dict["assertion"]["big-rock"]["BR1"]["SubRock"]
            # and e_uid = "SR2", that content_dict["SR2"] exists and is a dictionary
            if e_dict == None:
                raise ApodeixiError(inner_trace, "Badly formatted tree: found nothing under '"
                                        + e_path + "'")
            if type(e_dict) != dict:
                raise ApodeixiError(inner_trace, "Badly formatted tree: expected dictionary at '"
                                        + e_path + "' but instead found a " + str(type(e_dict)))

        inner_trace = loop_trace.doing("Getting acronym lists under '" + e_path + "'",
                                        data = {'signaledFrom': __file__})

        sub_entities = self.find_entities(inner_trace, e_dict)    # Something like "Sub rock"

        # Now we gear up to make a recursive call. For example, if we have been processing the interval
        # ["UID", "big-rock"] and e_dict = content_dict["BR1"], we are now going to take the plunge into
        # the unique sub-entity "Sub rock" and make a recursive call to process interval
        # ["UID-1", "Sub rock"] passing content_dict["BR1"]["Sub rock"] as the content to process.
        #
        # For our e_path = "assertion.big-rock.BR1" we pass a path of "assertion.big-rock.BR1.Sub rock",
        # and we set "ourselves" ("BR1") as the parent_uid in the recursive call
        next_level_infos = self._next_level_acronym_info(parent_trace = inner_trace,
                                                            e_uid = e_uid,
                                                            content_dict = content_dict,
                                                            entity_name = entity_name,
                                                            level = level)
        if len(sub_entities) == 0:
            all_acronyms_result.append(next_level_infos)
        else:
            for sub_entity in sub_entities:
                inner_trace = loop_trace.doing("Making a recursive call for '" + sub_entity + "'",
                                                data = {'signaledFrom': __file__})
                acronyminfos_subresult = self._map_acronyminfo_lists(
                                                parent_trace = inner_trace,
                                                content_dict = e_dict[sub_entity],
                                                parent_path = e_path + '.' + sub_entity,
                                                parent_uid = full_e_uid,
                                                level = level + len(next_level_infos))
                e_acronym = UID_Utils().parseToken(loop_trace, e_uid)[0]
                for acronyminfos_sublist in acronyminfos_subresult:
                    # Check we are not about to put in duplicate acronyms - if so, that is an error
                    # with the `content_dict`
                    if e_acronym in [info.acronym for info in acronyminfos_sublist]:
                        raise ApodeixiError(inner_trace, "Looks like the manifest is corrupted because "
                                                + "the same acronym is used at different levels. An "
                                                + "acronym should be used in only 1 level",
                                            data = {"Problem at UID": str(full_e_uid),
                                                    "Acronyms below UID": ListUtils().print(inner_trace,
                                                                                acronyminfos_sublist)})
                    acronyms_list = next_level_infos.copy()
                    acronyms_list.extend(acronyminfos_sublist)
                    all_acronyms_result.append(acronyms_list)

    return all_acronyms_result
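# Illustrative sketch (hypothetical manifest, not from the source): for a tree with paths
# BR1 -> SR1 and BR2 -> (skipped SR) -> TR1, this "map" phase would produce something like
#
#   [[AcronymInfo("BR", "big-rock"), AcronymInfo("SR", "Sub rock")],
#    [AcronymInfo("BR", "big-rock"), AcronymInfo("SR", None), AcronymInfo("TR", "Tiny rock")]]
#
# i.e., one acronym list per path, padded so that each acronym always appears at the same level;
# the "reduce" phase in _find_acronym_list can then merge these lists into a single consistent schema.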