    def schema_info_for_UID(self, parent_trace, a_uid):
        '''
        Returns a pair:

        * An AcronymInfo object representing the schema properties corresponding to the given UID `a_uid`
        * A string with the name of the UID column that should be used in a DataFrame or Excel representation
          for that `a_uid`. Typically these strings are like "UID", "UID-1", "UID-2", etc.

        @param a_uid A string representing a UID. It may be a full UID like "BR3.MR2.SR1" or just a leaf UID like "SR1".
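
        Illustrative example (a sketch, assuming a hypothetical acronym schema [BR(big-rock), MR(Mid rock), SR(Sub rock)]):

            schema.schema_info_for_UID(parent_trace, "BR3.MR2.SR1")
            # returns (AcronymInfo("SR", "Sub rock"), "UID-2"), since "SR" sits at level 2 (0-based) of the schema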

        '''
        leaf_uid = a_uid.split(".")[-1]
        acronym = UID_Utils().parseToken(parent_trace, leaf_uid)[0]
        acronyminfo_guesses = [
            info for info in self.acronyminfo_list if info.acronym == acronym
        ]
        if len(acronyminfo_guesses) != 1:
            raise ApodeixiError(
                parent_trace,
                "UID Acronym schema is either not initialized or corrupted: "
                " it does not recognize a unique acronym for entity's UID",
                data={
                    "entity_UID": str(leaf_uid),
                    "inferred acronyms": str(self.acronyminfo_list)
                })
        acronyminfo = acronyminfo_guesses[0]
        level = self.acronyminfo_list.index(acronyminfo)
        UID = Interval.UID
        if level == 0:
            UID_COL = UID
        else:
            UID_COL = UID + '-' + str(
                level)  # We start at "UID-1", "UID-2", etc.; plain "UID" is reserved for level 0

        return acronyminfo, UID_COL
    def _retrieve_referenced_uids(self, parent_trace):
        '''
        Returns a list of UID strings that the referencing manifest (i.e., the manifest identified by 
        self.referencing_handle) has in the path given by self.referencing_path.

        If the path is not valid or if it points to something that is not a UID or list of UIDs, this method raises an
        ApodeixiError
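
        For example (an illustrative sketch; the path and field names are hypothetical), if the referencing
        manifest contains

            assertion:
              linked-big-rocks: ["BR1", "BR3.SR2"]

        and self.referencing_path == ["assertion", "linked-big-rocks"], this method returns
        ["BR1", "BR3.SR2"].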
        '''
        referencing_dict, ref_path = self.store.retrieveManifest(
            parent_trace, self.referencing_handle)
        val = DictionaryUtils().get_val(parent_trace=parent_trace,
                                        root_dict=referencing_dict,
                                        root_dict_name="Referencing Manifest",
                                        path_list=self.referencing_path,
                                        valid_types=[str, list])
        my_trace = parent_trace.doing(
            "Validating referenced UIDs are well-formed")
        # To make a uniform check, operate on lists regardless of whether val is str (1 UID) or a list (multiple UIDs)
        if type(val) == str:
            alleged_uids = [val]
        else:
            alleged_uids = val

        # We leverage the UID_Utils method tokenize to validate that UIDs are well formed
        for au in alleged_uids:
            loop_trace = my_trace.doing("Validating that '" + str(au) +
                                        "' is a well-formed UID")
            au_tokens = UID_Utils().tokenize(
                loop_trace, au)  # Will error out if au is not well formed

        # If we get this far without erroring out, the UIDs are all well-formed, so we can return them
        return alleged_uids
    def attempt_tokenize(self, parent_trace, uid):
        '''
        Helper that attempts to tokenize `uid`. Returns the list of tokens if `uid` is well formed,
        and the string 'error' otherwise.
        '''
        my_trace = parent_trace.doing("Attempting to tokenize '" + uid + "'")
        try:
            x = UID_Utils().tokenize(my_trace, uid)
        except ApodeixiError:
            x = 'error'
        return x
    def pad_uid(self, parent_trace, a_full_uid):
        '''
        Utility method that can be used by callers that need to compare the padded and unpadded full UIDs.
        Some explanation:

        Unpadded UIDs are used in the paths through the manifest trees. For an acronym schema like
        [BR(big-rock), SR(Sub rock), and TR(Tiny rock)], unpadded UIDs are things like BR2.SR3 and BR1.TR1

        However, the UID fields themselves inside the manifests must be "padded" if the user skips an entity, so that
        the knowledge of having skipped an entity is available later when the manifest is represented as a DataFrame
        or in Excel.

        In our example, padding BR1.TR1 results in BR1.SR0.TR1, since the end-user skipped the sub-rock entity on that path.

        Padding BR2.SR3 yields no change (still BR2.SR3), since in that case the user skipped no entity.
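
        Illustrative usage (a sketch; `schema` and `parent_trace` are assumed to exist):

            schema.pad_uid(parent_trace, "BR1.TR1")     # returns "BR1.SR0.TR1"
            schema.pad_uid(parent_trace, "BR2.SR3")     # returns "BR2.SR3" (nothing was skipped)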
        '''
        tokens = a_full_uid.split(".")
        padded_tokens = []
        all_acronyms = [info.acronym for info in self.acronym_infos()]
        for idx in range(len(tokens)):

            acronym, nb = UID_Utils().parseToken(parent_trace, tokens[idx])
            if not acronym in all_acronyms:
                raise ApodeixiError(
                    parent_trace,
                    "Can't pad UID because it uses an acronym not in the schema",
                    data={
                        "bad acronym": str(acronym),
                        "uid": str(a_full_uid),
                        "schema": str(self)
                    })
            schema_idx = all_acronyms.index(acronym)
            if schema_idx < len(padded_tokens):
                raise ApodeixiError(
                    parent_trace,
                    "Can't pad UID because it has an acronym out of order with regards to the "
                    + "acronym schema. It must appear at index " +
                    str(schema_idx) + " but " +
                    "that is already taken by the partially constructed padded tokens so far",
                    data={
                        "bad acronym": str(acronym),
                        "uid": str(a_full_uid),
                        "schema": str(self),
                        "padded tokens so far": str(padded_tokens)
                    })
            # Now pad, if required
            for pad_idx in range(len(padded_tokens), schema_idx):
                pad_acronym = all_acronyms[pad_idx]
                padded_tokens.append(pad_acronym + "0")

            # Now add our payload
            padded_tokens.append(tokens[idx])
        return ".".join(padded_tokens)
    def _build_df_rows(self, parent_trace, content_dict, acronym_schema,
                       parent_path, parent_uid, sparse, abbreviate_uids):
        '''
        Recursive method that creates the data from which a Pandas DataFrame can be easily created by the caller,
        based on the dictionary `content_dict`. It returns:
        
        * The list of intervals whose concatenation would yield the columns for such a DataFrame-to-be (see 
            example below for an explanation)
        * The list of rows (each row represented as a dict, whose keys are DataFrame columns, at least those
            columns for which the row has a non-NaN value)
        * A list of UID_Info objects, built as a by-product of this method's processing and which some callers
            may find useful. Refer to the documentation of UID_Info for explanation and example use cases.

        The rows might be populated to be "sparse" or not, depending on the `sparse` parameter (a boolean).
        This is best explained in an example. The following is a non-sparse DataFrame:

                UID |      Big Rock         | UID-1 |     Sub rock
            ==============================================================
                BR1 |   New UX              | BR1.1 |   FX UI
                BR1 |   New UX              | BR1.2 |   Lending UI
                BR2 |   Containerization    | BR2.1 |   Pricing service
                BR2 |   Containerization    | BR2.2 |   Market data svc

        This "non sparse" representation is well suited for making data analysis.

        In contrast, the "sparse" representation is geared towards visualization in Excel, where usability
        calls for leaving out repetitive text, replacing it by blanks since humans can easily infer
        what the values should be by glancing at preceding rows:

                UID |      Big Rock         | UID-1 |     Sub rock
            =============================================================
                BR1 |   New UX              |       |   
                    |                       | BR1.1 |   FX UI
                    |                       | BR1.2 |   Lending UI
                BR2 |   Containerization    |       |   
                    |                       | BR2.1 |   Pricing service
                    |                       | BR2.2 |   Market data svc

        In particular, the sparse representation has exactly 1 UID per row, using more rows if needed
        (in the example, the non-sparse representation has 4 rows but the sparse representation has 6 rows)

        This example also helps us comment some other nuances of the algorithm:

        1. The first object returned by this method is a "list of intervals", which in the example would be

                [["UID", "Big Rock"], ["UID-1", "Sub rock"]]

        2. The second object returned by this method is "all the rows as dictionaries", which for a sparse
            situation would be:

                [{  "UID": "BR1",   "Big Rock": "New UX"                                                                },          
                                                                  { "UID-1", "BR1.1",   "Sub rock": "FX UI"             },
                                                                  { "UID-1", "BR1.2",   "Sub rock": "Lending UI"        },
                 {  "UID": "BR2",   "Big Rock": "Contanerization"                                                       },  
                                                                  { "UID-1", "BR2.1",   "Sub rock": "Pricing service"   },
                                                                  { "UID-1", "BR2.2",   "Sub rock": "Market data svc"   },
                ]

        3. The dictionaries representing rows don't need to have all columns present as keys. Pandas can still
            create a DataFrame from that, and will just put a null (nan or such) as the value of that column for
            the row in question. The algorithm makes use of this.
        
        4. The UIDs are "abbreviated". For example, UID-1 has a value like "BR1.1" instead of "BR1.SR1". So only
            the first acronym "BR" (for "Big Rock") is displayed, not the second acronym "SR" (for "Sub rock").
            This is for usability. The `content_dict` parameter is expected to contain non-abbreviated UIDs.

        5. The `content_dict` representing the manifest's content uses "incremental" non-abbreviated UIDs
            for its recursive structure ("recursive" as in: a dict that contains some children that are sub
            dictionaries, not just scalars). By "incremental" we mean that content_dict["BR1"] would be
            a dict and the children are accessed by keys like "SR1" and "SR2", not "BR1.SR1" and "BR1.SR2".
            Thus, in our example content_dict["BR1"]["SR1"] and content_dict["BR1"]["SR2"] are the
            expected way to navigate the "recursive" nature of the content_dict.

        6. Because of the conventions in apodeixi.xli.breakdown_builder that were used to build 
            such `content_dict`, there are columns like "BR1-name" and "SR1-name". These are ignored by this
            method.

        7. This algorithm operates recursively, one interval at a time. In the example, we first process
            interval ["UID", "Big Rock"], first identify this row fragment:

             {  "UID": "BR1",   "Big Rock": "New UX"}

            The algorithm then makes a recursive call on content_dict["BR1"], which returns two objects:

                * A list of intervals: [["UID-1", "Sub rock"]], which is then merged with the caller's interval
                    list to date and results in [["UID", "Big Rock"], ["UID-1", "Sub rock"]]

                * A list of rows:
                                                                [ { "UID-1", "BR1.1",   "Sub rock": "FX UI"             },
                                                                  { "UID-1", "BR1.2",   "Sub rock": "Lending UI"        }
                                                                ]
                    These then need to be "merged" with the caller, and the merging varies depending on whether
                    sparse=True or not. In the sparse case, the merging would look like 3 rows:

                [{  "UID": "BR1",   "Big Rock": "New UX"                                                                },          
                                                                  { "UID-1", "BR1.1",   "Sub rock": "FX UI"             },
                                                                  { "UID-1", "BR1.2",   "Sub rock": "Lending UI"        }
                ]

                    In the case sparse=False, the merging would look like 2 rows:

                [{  "UID": "BR1",   "Big Rock": "New UX",           "UID-1", "BR1.1",   "Sub rock": "FX UI"             },
                 {  "UID": "BR1",   "Big Rock": "New UX",           "UID-1", "BR1.2",   "Sub rock": "Lending UI"        }
                ]

        8. Apart from sub-dictionaries, `content_dict` usually has scalar attributes. These need to be included
            in the rows when they have a value. 

        9. Scalar attributes introduce a nuance with the merging of intervals: as subsequent rows of the result
            are gradually created, the algorithm only has a partial view of what the intervals are, since it infers
            intervals' columns based on what it has seen so far. It thus may happen that when dealing with 
            a later row it will encounter additional columns for an entity's interval that had been previously
            seen. This must be taken into consideration in the merging of intervals.

            For example, perhaps we are processing a manifest dict that arose from parsing a posting like this:

                    UID         |   Big rock        |   Asset classes           | Intended user
                    ===========================================================================
                    BR1         | Lending UI        |  Mortgages, commercial    |
                    BR2         | Treasury UI       |                           | FX traders

            Then the manifest will have a branch for the first row that will make us infer an
            interval like 
            
                    [UID, Big rock, Asset classes]
                    
            Later when we process the manifest's branch that arose from the second row we will instead get 
            an interval like 
            
                    [UID, Big rock, Intended user]
            
            The correct "merge" behavior in such case is not to treat these as separate intervals, but to merge them 
            as 
                    [UID, Big rock, Asset classes, Intended user]
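
            A minimal sketch of that merge rule (a hypothetical helper for illustration only; the real
            logic lives in self._merge_interval_lists):

                def merge_columns(prior_columns, new_columns):
                    # Keep the prior order, appending any column seen for the first time
                    merged = list(prior_columns)
                    for col in new_columns:
                        if col not in merged:
                            merged.append(col)
                    return merged

                merge_columns(["UID", "Big rock", "Asset classes"], ["UID", "Big rock", "Intended user"])
                # -> ["UID", "Big rock", "Asset classes", "Intended user"]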


        @param content_dict A dict object representing the contents of a manifest, as opposed to the
                            entire manifest. 
                            In the first example above, if manifest_dict represents a full manifest, 
                            then content_dict = manifest_dict['assertion']['big-rock']
        @param acronym_schema An AcronymSchema object that captures all the acronyms for `content_dict`
                            as well as their corresponding entity names. This should be
                            "global", i.e., is not for a "sub tree" but for the full manifest. 
                            Logically speaking the schema contains information that specifies the order
                            of the acronyms and their entities. While an object and not a list, logically
                            it is as in this example:

                                [("BR", "big-rock"), ("MR", "medium_rock"), ("SR", "small rock")] 
                            
        @param parent_path A string using 'dot notation' for the path in the original YAML file that led
                          to the `content_dict`. 
                          In the first example above, that would be "assertion.big-rock" when this method is
                          first called, and "assertion.big-rock.BR1.Sub rock" when it recursively
                          calls itself.
        @param parent_uid   A string used to assist in recursive calls.
                            In the first example above, that would be None when this method is first called,
                            and "BR1" on a 1st recursion, or "BR1.SR1" on a 2nd nested recursion.
        @param sparse A boolean. If True, it returns a "sparse" representation suitable for Excel rendering,
                    with exactly 1 UID per row (helpful when making joins)
        @param abbreviate_uids A boolean. If True, UIDs will only keep the top acronym. For example, 
                    a UID like "BR2.MR2.SM4" in the manifest would be transformed to "BR2.2.4" in the
                    DataFrame returned by this method
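
        Illustrative use by a caller (a sketch; `manifest_dict`, `acronym_schema` and `my_trace` are
        assumed to exist):

            import pandas as pd

            intervals, rows, uid_info_list = self._build_df_rows(
                                                parent_trace        = my_trace,
                                                content_dict        = manifest_dict['assertion']['big-rock'],
                                                acronym_schema      = acronym_schema,
                                                parent_path         = "assertion.big-rock",
                                                parent_uid          = None,
                                                sparse              = True,
                                                abbreviate_uids     = True)
            columns = [col for interval in intervals for col in interval.columns]
            df      = pd.DataFrame(data=rows, columns=columns)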
        '''
        my_trace = parent_trace.doing("Validating parent_path '" +
                                      parent_path + "''",
                                      data={'signaledFrom': __file__})
        if True:
            if parent_path == None or len(parent_path.strip()) == 0:
                raise ApodeixiError(
                    my_trace,
                    "Can't process a parent_path that is null or blank")

        # parent_path is something like "assertion.big-rock" when this method is first called, and
        # like "assertion.big-rock.BR1.Sub rock" when this method calls itself recursively

        entity_uids = [
            key for key in content_dict.keys() if not key.endswith('-name')
        ]

        # Will be one per entity_value, a dictionary of level_1_columns -> scalar value. By "level 1 columns"
        # we mean columns for the interval being processed here (subsequent intervals would be processed
        # in recursive invocations of this method). See method documentation for explanation of algorithm.
        all_rows = []

        # Some manifest field names that have fixed, hard-coded values in Apodeixi
        UID = Interval.UID
        NAME = 'name'
        SYNTHETIC_COLUMNS = [
            UID, NAME
        ]  # These are added when parsing Excel, so not "real" content

        all_intervals = []

        uid_info_list = []  # We'll build this as a by-product and return it

        # On a first call we loop through something like e_uid = "BR1", "BR2", "BR3", .... For that call
        #       parent_uid = None and parent_path = "assertion.big-rock"
        # On a recursive call with parent_uid = "BR1" we loop through e_uid = "SR1", "SR2", "SR3", .... In this case
        #       parent_path = "assertion.big-rock.BR1.Sub rock"
        for e_uid in entity_uids:
            if parent_uid == None:
                full_e_uid = e_uid
            else:
                full_e_uid = parent_uid + '.' + e_uid

            e_path = parent_path + '.' + e_uid
            e_dict = content_dict[e_uid]
            loop_trace = parent_trace.doing("Looping on entity with path '" +
                                            e_path + "'",
                                            data={'signaledFrom': __file__})

            inner_trace = loop_trace.doing(
                "Determining name to give to UID column in DataFrame for a UID",
                data={"entity UID": str(full_e_uid)})

            e_acronyminfo, UID_COL = acronym_schema.schema_info_for_UID(
                parent_trace, e_uid)

            # Check if we already have an interval for this acronym info, and if not, create one
            my_prior_interval = [
                interval for interval in all_intervals
                if e_acronyminfo.entity_name in interval.columns
            ]
            if len(my_prior_interval) == 0:
                my_interval = Interval(
                    parent_trace=my_trace,
                    columns=[UID_COL, e_acronyminfo.entity_name],
                    entity_name=e_acronyminfo.entity_name)
                all_intervals.append(my_interval)

            inner_trace = loop_trace.doing("Checking tree under '" + e_path +
                                           "' is well formed",
                                           data={'signaledFrom': __file__})
            if True:
                # Check e.g. if content_dict = manifest_dict["assertion"]["big-rock"]["BR1"]["SubRock"]
                # that content_dict["SR2"] exists
                if e_dict == None:
                    raise ApodeixiError(
                        inner_trace,
                        "Badly formatted tree: found nothing under '" +
                        e_path + "'")
                # Check e.g. content_dict["SR2"] is a dictionary
                if type(e_dict) != dict:
                    raise ApodeixiError(
                        inner_trace,
                        "Badly formatted tree: expected dictionary at '" +
                        e_path + "' but instead found a " + str(type(e_dict)))
                # Check e.g. content_dict["SR2"]["UID"] exists
                if not UID in e_dict.keys():
                    raise ApodeixiError(
                        inner_trace,
                        "Badly formatted tree: expected a child called '" +
                        UID + "' under '" + e_path + "'")
                # Check e.g. content_dict["SR2"]["UID"] == "SR2", except possibly for padding (this occurs
                # when the end user skips an entity). Thus, content_dict["SR2"]["UID"] = "MR0.SR2" would be OK
                if e_dict[UID] != acronym_schema.pad_uid(
                        parent_trace, full_e_uid):
                    raise ApodeixiError(inner_trace,
                                        "Badly formatted tree: expected '" +
                                        e_path + "[" + UID + "] = " +
                                        full_e_uid + "'",
                                        data={
                                            "expected": full_e_uid,
                                            "actual": str(e_dict[UID])
                                        })
                # Check e.g. content_dict["SR2"]["UID"]['name'] exists
                if not NAME in e_dict.keys():
                    raise ApodeixiError(
                        inner_trace,
                        "Badly formatted tree: expected a child called '" +
                        NAME + "' under '" + e_path + "'")

            inner_trace = loop_trace.doing("Building DataFrame row",
                                           data={"entity path": str(e_path)})
            # We call it "level 1" because it is for my_interval. Recursive calls would be the subsequent
            # intervals, which are "level 2, 3, ..." in the content_df "tree"
            new_level_1_row = {}
            # Add the entity column to the level_1 row
            # But first replace by "friendly" UID like 'BR1.2' instead of "BR1.SR2", if we are thus configured
            if abbreviate_uids == True:
                abbreviated_full_e_uid = UID_Utils().abbreviate_uid(
                    parent_trace=inner_trace,
                    uid=full_e_uid,
                    acronym_schema=acronym_schema)
                new_level_1_row[UID_COL] = abbreviated_full_e_uid
            else:
                # Remember to pad if needed, i.e., maybe full_e_uid is BR1.TR1, but if the acronym schema
                # is [BR, MR, TR], we should put a BR1.MR0.TR1 in the new_level_1_row, not a BR1.TR1
                new_level_1_row[UID_COL] = acronym_schema.pad_uid(
                    parent_trace, full_e_uid)
            new_level_1_row[e_acronyminfo.entity_name] = e_dict[NAME]

            uid_info_list.append(
                UID_Info(
                    uid=new_level_1_row[UID_COL],
                    entity_value=new_level_1_row[e_acronyminfo.entity_name]))

            sub_entities = acronym_schema.find_entities(
                inner_trace, e_dict)  # Something like "Sub rock"
            # Now add the "scalar" attributes to the row and if needed also to the interval. A reason they may
            # not be in the interval already arises if we are creating the "first row" (i.e., entity e_uid)
            # or if that attribute was not present in "previous rows"
            for attrib in [
                    a for a in e_dict.keys()
                    if not a in sub_entities and a not in SYNTHETIC_COLUMNS
            ]:
                if not attrib in my_interval.columns:
                    my_interval.columns.append(attrib)
                new_level_1_row[attrib] = e_dict[attrib]

            # Now we gear up to make a recursive call. For example, if we have been processing the interval
            # ["UID", "big-rock"] and e_dict = content_dict["BR1"], we are now going to take the plunge into
            # the unique sub-entity "Sub rock" and make a recursive call to process interval
            # ["UID-1", "Sub rock"], passing content_dict["BR1"]["Sub rock"] as the content to process.
            #
            # For our e_path = "assertion"."big-rock"."BR1" we pass a path of "assertion"."big-rock"."BR1"."Sub rock"
            # and we set "ourselves" ("BR1") as the parent_uid in the recursive call
            sub_rows_across_subentities = []
            for sub_entity in sub_entities:
                sub_rows = []
                sub_intervals = []
                inner_trace = loop_trace.doing(
                    "Making a recursive call for '" + sub_entity + "'",
                    data={'signaledFrom': __file__})

                sub_intervals, sub_rows, sub_uid_info_list = self._build_df_rows(
                    parent_trace=inner_trace,
                    content_dict=e_dict[sub_entity],
                    acronym_schema=acronym_schema,
                    parent_path=e_path + '.' + sub_entity,
                    parent_uid=full_e_uid,
                    sparse=sparse,
                    abbreviate_uids=abbreviate_uids)

                uid_info_list.extend(sub_uid_info_list)

                # Post-processing recursive call: handle the columns
                #
                # The recursive call discovered what other columns pertain to the sub-entity. We need to merge this
                # from two perspectives:
                # -If this was the first time we created an interval for the sub-entity (e.g., for "Sub rock"),
                #  then add it to the list of intervals.
                # -However, if we already had an interval for "Sub rock" from having processed "previous rows",
                #  then it may be that the recursive call we just made uncovered additional columns to be added to
                #  that pre-existing row. See documentation for self._merge_interval
                self._merge_interval_lists(loop_trace, all_intervals,
                                           sub_intervals)

                sub_rows_across_subentities.extend(sub_rows)

            # Post-processing recursive call: handle the rows
            #
            # This is where the logic of sparse or non-sparse applies. See documentation to this method to explain
            # that algorithm
            if sparse == True:  # Add (1 + N) rows, here N = len(sub_rows)
                all_rows.append(new_level_1_row)
                for idx in range(len(sub_rows_across_subentities)):
                    all_rows.append(sub_rows_across_subentities[idx])

            else:  # Add N rows, where N = max(len(sub_rows), 1)
                if len(sub_rows_across_subentities) > 0:
                    for r in sub_rows_across_subentities:  # Copy the "level 1" data to all sub-rows and add them
                        for k in new_level_1_row.keys():
                            r[k] = new_level_1_row[k]
                        all_rows.append(r)
                else:
                    all_rows.append(new_level_1_row)

        # Before returning, we need to sort the intervals to be order-consistent with the acronym_schema.
        # This is needed because due to the possibility of some rows having skipped an entity, it is possible
        # that in `all_intervals` we have the interval for UID-3 before the interval for UID-2.
        my_trace = parent_trace.doing(
            "Sorting intervals as per Acronym Schema",
            data={
                "parent_uid": str(parent_uid),
                "acronym schema": str(acronym_schema)
            })
        sorted_intervals = []
        for acronyminfo in acronym_schema.acronym_infos():
            # Find the interval for this acronym, if there is one in our list so far
            entity_name = acronyminfo.entity_name
            matches = [
                interval for interval in all_intervals
                if entity_name in interval.columns
            ]
            if len(matches) > 1:
                raise ApodeixiError(
                    my_trace,
                    "Found multiple intervals for the same entity, and there shoud be at most 1"
                )
            elif len(matches) == 1:
                sorted_intervals.append(matches[0])

        # Check we sorted everything
        if len(sorted_intervals) != len(all_intervals):

            raise ApodeixiError(
                my_trace,
                "Was not able to sort all intervals based on the Acronym Schema. Some intervals "
                +
                "did not seem to correspond to anything recognized in the Acronym Schema. "
                +
                "The sorted intervals should have been exactly as long as all_intervals",
                data={
                    "len(all_intervals)": str(len(all_intervals)),
                    "len(sorted_intervals)": str(len(sorted_intervals))
                })

        # Return value to caller (which for recursive call was this method itself, processing left interval to ours)
        return sorted_intervals, all_rows, uid_info_list
    def _next_level_acronym_info(self, parent_trace, e_uid, content_dict,
                                 entity_name, level):
        '''
        Helper method for self._map_acronyminfo_lists. The latter is a recursive method, and this method is
        used when the recursion "hits bottom", or when aggregating results from a recursive call.

        It returns a list of AcronymInfo objects, which normally would be a singleton: the AcronymInfo for the
        very next level in the tree:

                        [AcronymInfo(e_acronym, entity_name)] where e_acronym is e_uid's acronym.

        *HOWEVER*, there is a boundary case that could lead to a bug unless we return a list with more than one 
        element: when the user skipped some intermediate entity.

        Consider this example: the acronym schema is [BR(big rock), SR(sub rock), TR(tiny rock)]
        but the user sometimes skipped sub-rock.

        Anticipating this might happen, full UIDs were generated in the UID Store that put a "0" whenever
        an entity is skipped. For example, BR1.SR0.TR1 instead of BR1.TR1

        Assume further that we are at a point in the recursion where 
        
            content_dict = manifest_dict["assertion"][big-rocks][BR1][Tiny rocks], and content_dict[TR1][UID] = BR1.SR0.TR1

        Now, another path of the recursion would be perhaps

            content_dict2 = manifest_dict["assertion"][big-rocks][BR2][Sub rocks], and content_dict2[SR3][UID] = BR2.SR3
        
        In this situation, it would be wrong for us to return

            [AcronymInfo(TR, Tiny rocks)]

        because the other path, which is at the same level in the manifest_dict tree, would return

            [AcronymInfo(SR, Sub rocks)]
        
        which will trigger an error in our subsequent processing, since we would think that at this level there are two valid
        acronyms for the next level down: TR and SR, and only one acronym is allowed per level.

        Therefore, the *correct* behaviour is to *pad* that list returned from this method to ensure that TR is never
        at the same level as SR.

        That means returning [AcronymInfo(SR, None), AcronymInfo(TR, "Tiny rocks")]
        
        The level of padding can be determined by looking at content_dict[e_uid][UID]
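
        Illustrative example (a sketch using the schema above): if level == 1 and
        content_dict["TR1"]["UID"] == "BR1.SR0.TR1", this method returns

            [AcronymInfo("SR", None), AcronymInfo("TR", entity_name)]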
        '''
        padded_uid = content_dict[e_uid][Interval.UID]

        padded_tokens = padded_uid.split(".")

        # Check consistency of UID field with the path UID
        my_trace = parent_trace.doing(
            "Checking that the UID field is for a UID that extends the prior level",
            data={
                "path incremental UID": str(e_uid),
                "level": str(level),
                "dict['UID']": str(padded_uid)
            })
        if len(padded_tokens) < level + 1:
            raise ApodeixiError(
                my_trace,
                "UID field lacks the required tokens: expected at least " +
                str(level + 1) + " tokens, "
                "but found only " + str(len(padded_tokens)) + " in the UID field")
        # Check that any extra tokens are only padding, as we add that padding
        my_trace = parent_trace.doing(
            "Padding a list of AcronymInfos below a node in the manifest tree "
            + " due to user skipping entities",
            data={
                "path incremental UID": str(e_uid),
                "level": str(level),
                "dict['UID']": str(padded_uid)
            })
        result = []
        for idx in range(level, len(padded_tokens) - 1):
            some_acronym, some_val = UID_Utils().parseToken(
                my_trace, padded_tokens[idx])
            if some_val != 0:
                raise ApodeixiError(
                    my_trace,
                    "Corrupted manifest: token '" + str(padded_tokens[idx]) +
                    "' in UID field '" + str(padded_uid) +
                    "' should have only been padding, i.e. a value of 0")
            # Add the padding
            result.append(
                AcronymInfo(some_acronym, None)
            )  # We put None for the entity because we don't know it, but that's OK

        # Any required padding is in, so now we can safely add the e_uid's acronym info
        e_acronym = UID_Utils().parseToken(my_trace, e_uid)[0]
        result.append(AcronymInfo(e_acronym, entity_name))

        return result
    def _map_acronyminfo_lists(self,
                               parent_trace,
                               content_dict,
                               parent_path,
                               parent_uid,
                               level=0):
        '''
        This is a recursive helper method to the "map-reduce" algorithm used by method _find_acronym_list. 
        Refer to the documentation of that method for an explanation of the context for the algorithm.

        This method returns a list of lists, where the inner list consist of _AcronymInfo objects.

        @param level An integer that tells us where we are in the recursion. It starts at 0 and must equal
                the number of tokens in parent_uid.
                Helpful to disambiguate the index to use
                for an AcronymInfo object in the returned value, particularly when the user skipped some 
                intermediate entities, so we can't rely on the length of parent_path for such a determination.
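
        Illustrative return value (a sketch for a hypothetical manifest with two leaf paths, BR1.SR1 and
        BR1.SR0.TR1, under a schema [BR, SR, TR]):

            [[AcronymInfo("BR", "big-rock"), AcronymInfo("SR", "Sub rock")],
             [AcronymInfo("BR", "big-rock"), AcronymInfo("SR", None), AcronymInfo("TR", "Tiny rock")]]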
        '''
        my_trace = parent_trace.doing("Mapping acronym lists for '" +
                                      parent_path + "''",
                                      data={'signaledFrom': __file__})
        if True:
            if parent_path == None or len(parent_path.strip()) == 0:
                raise ApodeixiError(
                    my_trace,
                    "Can't process a parent_path that is null or blank")

        # parent_path is something like "assertion.big-rock" when this method is first called, and
        # like "assertion.big-rock.BR1.Sub rock" when this method calls itself recursively
        path_tokens = parent_path.split('.')
        entity_name = path_tokens[
            -1]  # like "big-rock" on 1st call, and "Sub rock" on recursive call

        entity_uids = [
            key for key in content_dict.keys() if not key.endswith('-name')
        ]

        # Will be one per "path" within the "tree" represented by `content_dict`, consisting of the acronyms
        # encountered along that path, in order.
        all_acronyms_result = []

        my_trace = parent_trace.doing("Mapping acronyms under of '" +
                                      str(parent_path) + "'",
                                      data={'signaledFrom': __file__})

        # On a first call we loop through something like e_uid = "BR1", "BR2", "BR3", .... For that call
        #       parent_uid = None and parent_path = "assertion.big-rock"
        # On a recursive call with parent_uid = "BR1" we loop through e_uid = "SR1", "SR2", "SR3", .... In this case
        #       parent_path = "assertion.big-rock.BR1.Sub rock"
        for e_uid in entity_uids:
            loop_trace = parent_trace.doing("Looping on entity with UID '" +
                                            str(e_uid) + "'",
                                            data={'signaledFrom': __file__})
            if parent_uid == None:
                full_e_uid = e_uid
            else:
                full_e_uid = parent_uid + '.' + e_uid

            e_path = parent_path + '.' + e_uid

            e_dict = content_dict[e_uid]

            inner_trace = loop_trace.doing("Checking tree under '" + e_path +
                                           "' is well formed",
                                           data={'signaledFrom': __file__})
            if True:
                # Check e.g. if content_dict = manifest_dict["assertion"]["big-rock"]["BR1"]["SubRock"]
                # and e_uid = "SR2", that content_dict["SR2"] exists and is a dictionary
                if e_dict == None:
                    raise ApodeixiError(
                        inner_trace,
                        "Badly formatted tree: found nothing under '" +
                        e_path + "'")
                if type(e_dict) != dict:
                    raise ApodeixiError(
                        inner_trace,
                        "Badly formatted tree: expected dictionary at '" +
                        e_path + "' but instead found a " + str(type(e_dict)))

            inner_trace = loop_trace.doing("Getting acronym lists under '" +
                                           e_path + "'",
                                           data={'signaledFrom': __file__})
            sub_entities = self.find_entities(
                inner_trace, e_dict)  # Something like "Sub rock"
            # Now we gear up to make a recursive call. For example, if we have been processing the interval
            # ["UID", "big-rock"] and e_dict = content_dict["BR1"], we are now going to take the plunge into
            # the unique sub-entity "Sub rock" and make a recursive call to process interval
            # ["UID-1", "Sub rock"], passing content_dict["BR1"]["Sub rock"] as the content to process.
            #
            # For our e_path = "assertion"."big-rock"."BR1" we pass a path of "assertion"."big-rock"."BR1"."Sub rock"
            # and we set "ourselves" ("BR1") as the parent_uid in the recursive call
            next_level_infos = self._next_level_acronym_info(
                parent_trace=inner_trace,
                e_uid=e_uid,
                content_dict=content_dict,
                entity_name=entity_name,
                level=level)
            if len(sub_entities) == 0:
                all_acronyms_result.append(next_level_infos)
            else:
                for sub_entity in sub_entities:
                    inner_trace = loop_trace.doing(
                        "Making a recursive call for '" + sub_entity + "'",
                        data={'signaledFrom': __file__})

                    acronyminfos_subresult = self._map_acronyminfo_lists(
                        parent_trace=inner_trace,
                        content_dict=e_dict[sub_entity],
                        parent_path=e_path + '.' + sub_entity,
                        parent_uid=full_e_uid,
                        level=level + len(next_level_infos))
                    e_acronym = UID_Utils().parseToken(loop_trace, e_uid)[0]
                    for acronyminfos_sublist in acronyminfos_subresult:
                        # Check we are not about to put duplicate acronyms - if so, that is an error with the `content_dict`
                        if e_acronym in [
                                info.acronym for info in acronyminfos_sublist
                        ]:
                            raise ApodeixiError(
                                inner_trace,
                                "Looks like manifest is corrupted because the same acronym is "
                                +
                                " used at different levels. An acronym should be used in only 1 level",
                                data={
                                    "Problem at UID":
                                    str(full_e_uid),
                                    "Acronyms below UID":
                                    ListUtils().print(inner_trace,
                                                      acronyminfos_sublist)
                                })
                        acronyms_list = next_level_infos.copy()
                        acronyms_list.extend(acronyminfos_sublist)
                        all_acronyms_result.append(acronyms_list)

        return all_acronyms_result