Esempio n. 1
    def dump_tokens(
        tokens: Iterable[Tok],
        tree: Optional[Node],
        error_index: Optional[int] = None,
        words: Optional[Dict[WordTuple, int]] = None
    ) -> List[TokenDict]:
        """ Generate a list of dicts representing the tokens in the sentence.

            For each token dict t:

                t.x is original token text.
                t.k is the token kind. If omitted, the kind is TOK.WORD.
                t.t is the name of the matching terminal, if any.
                t.m is the BÍN meaning of the token, if any, as a tuple as follows:
                    t.m[0] is the lemma (stofn)
                    t.m[1] is the word category (ordfl)
                    t.m[2] is the word subcategory (fl)
                    t.m[3] is the word meaning/declination (beyging)
                t.v contains auxiliary information, depending on the token kind
                t.a is the augmented terminal name, if the token has a meaning
                t.err is 1 if the token is an error token

            Args:
                tokens: the tokens of the sentence, in order.
                tree: the parse tree of the sentence, or None if there is none.
                error_index: index of the token to be marked as an error
                    token, or None if no token should be marked.
                words: optional dictionary that is updated in place with
                    (stem, cat) keys and occurrence counts.

            Returns:
                A list with one token dict per token, in token order.

            This function has the side effect of filling in the words dictionary
            with (stem, cat) keys and occurrence counts.
        """
        # Map tokens to associated terminals, if any
        # tmap is an empty dict if there's no parse tree
        tmap = TreeUtility._terminal_map(tree)
        dump: List[TokenDict] = []
        for ix, token in enumerate(tokens):
            # We have already cut away paragraph and sentence markers
            # (P_BEGIN/P_END/S_BEGIN/S_END)
            terminal, meaning = tmap.get(ix, (None, None))
            d: TokenDict = describe_token(ix, token, terminal, meaning)
            if words is not None:
                wt = TreeUtility._word_tuple(token, terminal, meaning)
                if wt is not None:
                    # Add the (stem, cat) combination to the words dictionary.
                    # Using get() keeps this working for a plain dict as well
                    # as for a defaultdict/Counter.
                    words[wt] = words.get(wt, 0) + 1
            if ix == error_index:
                # Mark the error token, if present
                d["err"] = 1
            # The following code is a bit convoluted, in order to
            # work around a bug in Pylance
            if terminal is not None and meaning is not None:
                txt = d.get("x", "").lower()
                if txt:
                    # Also return the augmented terminal name
                    # NOTE(review): the first argument is assumed to be the
                    # terminal's name - confirm against augment_terminal()
                    d["a"] = augment_terminal(
                        terminal.name, txt, meaning.beyging
                    )
            # Collect the finished token dict; without this the
            # function would always return an empty list
            dump.append(d)
        return dump
Esempio n. 2
    def dump_tokens(tokens, tree, *, error_index=None, words=None):
        """ Generate a list of dicts representing the tokens in the sentence.

            For each token dict t:

                t.x is original token text.
                t.k is the token kind. If omitted, the kind is TOK.WORD.
                t.t is the name of the matching terminal, if any.
                t.m is the BÍN meaning of the token, if any, as a tuple as follows:
                    t.m[0] is the lemma (stofn)
                    t.m[1] is the word category (ordfl)
                    t.m[2] is the word subcategory (fl)
                    t.m[3] is the word meaning/declination (beyging)
                t.v contains auxiliary information, depending on the token kind
                t.a is the augmented terminal name, if the token has a meaning
                t.err is 1 if the token is an error token
                t.corr contains explanatory text if a correction has been applied

            Args:
                tokens: the tokens of the sentence, in order.
                tree: the parse tree of the sentence, or None if there is none.
                error_index: index of the token to be marked as an error
                    token, or None if no token should be marked.
                words: optional dictionary that is updated in place with
                    (stem, cat) keys and occurrence counts.

            Returns:
                A list with one token dict per token, in token order.

            This function has the side effect of filling in the words dictionary
            with (stem, cat) keys and occurrence counts.
        """
        # Map tokens to associated terminals, if any
        # tmap is an empty dict if there's no parse tree
        tmap = TreeUtility._terminal_map(tree)
        dump = []
        for ix, token in enumerate(tokens):
            # We have already cut away paragraph and sentence markers
            # (P_BEGIN/P_END/S_BEGIN/S_END)
            terminal, meaning = tmap.get(ix, (None, None))
            d = describe_token(ix, token, terminal, meaning)
            if words is not None:
                wt = TreeUtility._word_tuple(token, terminal, meaning)
                if wt is not None:
                    # Add the (stem, cat) combination to the words dictionary.
                    # Using get() keeps this working for a plain dict as well
                    # as for a defaultdict/Counter.
                    words[wt] = words.get(wt, 0) + 1
            if ix == error_index:
                # Mark the error token, if present
                d["err"] = 1
            if terminal is not None and meaning is not None and "x" in d:
                # Also return the augmented terminal name
                # NOTE(review): the arguments are assumed to be the terminal's
                # name, the lowercased token text and the BÍN inflection
                # string - confirm against augment_terminal()
                d["a"] = augment_terminal(
                    terminal.name, d["x"].lower(), meaning.beyging
                )
            # Collect the finished token dict; without this the
            # function would always return an empty list
            dump.append(d)
        return dump
Esempio n. 3
def test_augment_terminal():
    """Check augment_terminal() against a table of known results.

    Each case pairs the positional arguments passed to augment_terminal()
    with the exact string that the call must return. The cases are checked
    in order and the first mismatch fails the test.
    """
    cases = (
        (
            ("so_subj_op_þf", "langaði", "OP-GM-FH-ÞT-3P-ET"),
            "so_subj_op_þf_et_fh_gm_þt",
        ),
        (
            ("so_subj_sagnb_þf", "langað", "GM-SAGNB"),
            "so_subj_sagnb_þf_gm",
        ),
        (
            ("so_subj_lhþt_et_kvk", "valin", "LHÞT-SB-KVK-NFET"),
            "so_subj_lhþt_et_kvk_nf_sb",
        ),
        (
            ("so_subj_nh", "skorta", "GM-NH"),
            "so_subj_nh_gm",
        ),
        (
            ("so_subj_nh_þgf", "blöskra", "GM-NH"),
            "so_subj_nh_þgf_gm",
        ),
        (
            ("so_1_þf_subj_op_þgf", "þraut", "OP-GM-FH-ÞT-1P-ET"),
            "so_1_þf_subj_op_þgf_et_fh_gm_þt",
        ),
        (
            ("so_2_þgf_þf_p1_et", "skrifa", "GM-FH-NT-1P-ET"),
            "so_2_þgf_þf_et_fh_gm_nt_p1",
        ),
        (
            ("so_0_lhþt_et_kk", "kembdur", "LHÞT-SB-KK-NFET"),
            "so_0_et_kk_lhþt_nf_sb",
        ),
    )
    for call_args, expected in cases:
        assert augment_terminal(*call_args) == expected
Esempio n. 4
    def dump_tokens(tokens, tree, *, error_index=None, words=None):
        """ Generate a list of dicts representing the tokens in the sentence.

            For each token dict t:

                t.x is original token text.
                t.k is the token kind. If omitted, the kind is TOK.WORD.
                t.t is the name of the matching terminal, if any.
                t.m is the BÍN meaning of the token, if any, as a tuple as follows:
                    t.m[0] is the lemma (stofn)
                    t.m[1] is the word category (ordfl)
                    t.m[2] is the word subcategory (fl)
                    t.m[3] is the word meaning/declination (beyging)
                t.v contains auxiliary information, depending on the token kind
                t.a is the augmented terminal name, if the token has a meaning
                t.err is 1 if the token is an error token
                t.corr contains explanatory text if a correction has been applied

            Args:
                tokens: the tokens of the sentence, in order.
                tree: the parse tree of the sentence, or None if there is none.
                error_index: index of the token to be marked as an error
                    token, or None if no token should be marked.
                words: optional dictionary that is updated in place with
                    (stem, cat) keys and occurrence counts.

            Returns:
                A list with one token dict per token, in token order.

            This function has the side effect of filling in the words dictionary
            with (stem, cat) keys and occurrence counts.
        """
        # Map tokens to associated terminals, if any
        # tmap is an empty dict if there's no parse tree
        tmap = TreeUtility._terminal_map(tree)
        dump = []
        for ix, token in enumerate(tokens):
            # We have already cut away paragraph and sentence markers
            # (P_BEGIN/P_END/S_BEGIN/S_END)
            terminal, meaning = tmap.get(ix, (None, None))
            d = describe_token(ix, token, terminal, meaning)
            if words is not None:
                wt = TreeUtility._word_tuple(token, terminal, meaning)
                if wt is not None:
                    # Add the (stem, cat) combination to the words dictionary.
                    # Using get() keeps this working for a plain dict as well
                    # as for a defaultdict/Counter.
                    words[wt] = words.get(wt, 0) + 1
            if ix == error_index:
                # Mark the error token, if present
                d["err"] = 1
            if terminal is not None and meaning is not None and "x" in d:
                # Also return the augmented terminal name
                # NOTE(review): the first and last arguments are assumed to be
                # the terminal's name and the BÍN inflection string - confirm
                # against augment_terminal()
                d["a"] = augment_terminal(
                    terminal.name, d["x"].lower(), meaning.beyging
                )
            # Collect the finished token dict; without this the
            # function would always return an empty list
            dump.append(d)
        return dump