Esempio n. 1
0
    def dump_tokens(
        tokens: Iterable[Tok],
        tree: Optional[Node],
        *,
        error_index: Optional[int] = None,
        words: Optional[Dict[WordTuple, int]] = None
    ) -> List[TokenDict]:
        """ Generate a list of dicts representing the tokens in the sentence.

            For each token dict t:

                t.x is original token text.
                t.k is the token kind (TOK.xxx). If omitted, the kind is TOK.WORD.
                t.t is the name of the matching terminal, if any.
                t.m is the BÍN meaning of the token, if any, as a tuple as follows:
                    t.m[0] is the lemma (stofn)
                    t.m[1] is the word category (ordfl)
                    t.m[2] is the word subcategory (fl)
                    t.m[3] is the word meaning/declination (beyging)
                t.v contains auxiliary information, depending on the token kind
                t.err is 1 if the token is an error token
                t.a is the augmented terminal name; it is only added when the
                    token has a meaning and a non-empty text

            Args:
                tokens: the tokens of the sentence, in order.
                tree: the parse tree, or None if there is no parse tree.
                error_index: index of the token to be marked with err=1, if any.
                words: optional dictionary that is updated in place (see below).

            Returns:
                A list containing one dict per token, in token order.

            This function has the side effect of filling in the words dictionary
            with (stem, cat) keys and occurrence counts. Any mapping that supports
            get() and item assignment works, including a plain dict.

        """

        # Map tokens to associated terminals, if any
        # tmap is an empty dict if there's no parse tree
        tmap = TreeUtility._terminal_map(tree)
        dump: List[TokenDict] = []
        for ix, token in enumerate(tokens):
            # We have already cut away paragraph and sentence markers
            # (P_BEGIN/P_END/S_BEGIN/S_END)
            terminal, meaning = tmap.get(ix, (None, None))
            d: TokenDict = describe_token(ix, token, terminal, meaning)
            if words is not None:
                wt = TreeUtility._word_tuple(token, terminal, meaning)
                if wt is not None:
                    # Add the (stem, cat) combination to the words dictionary.
                    # Use get() with a default so that the first occurrence of
                    # a key does not raise KeyError when words is a plain dict;
                    # for a defaultdict(int) or Counter the result is the same.
                    words[wt] = words.get(wt, 0) + 1
            if ix == error_index:
                # Mark the error token, if present
                d["err"] = 1
            # The following code is a bit convoluted, in order to
            # work around a bug in Pylance
            if meaning is not None:
                txt = d.get("x", "").lower()
                if txt:
                    # Also return the augmented terminal name
                    d["a"] = augment_terminal(
                        terminal.name, txt, meaning.beyging
                    )
            dump.append(d)
        return dump
Esempio n. 2
0
    def dump_tokens(tokens, tree, *, error_index=None, words=None):
        """ Generate a list of dicts representing the tokens in the sentence.

            For each token dict t:

                t.x is original token text.
                t.k is the token kind (TOK.xxx). If omitted, the kind is TOK.WORD.
                t.t is the name of the matching terminal, if any.
                t.m is the BÍN meaning of the token, if any, as a tuple as follows:
                    t.m[0] is the lemma (stofn)
                    t.m[1] is the word category (ordfl)
                    t.m[2] is the word subcategory (fl)
                    t.m[3] is the word meaning/declination (beyging)
                t.v contains auxiliary information, depending on the token kind
                t.err is 1 if the token is an error token
                t.corr contains explanatory text if a correction has been applied
                t.a is the augmented terminal name; it is only added when the
                    token has a meaning and the dict has an "x" entry

            Args:
                tokens: the tokens of the sentence, in order.
                tree: the parse tree, or None if there is no parse tree.
                error_index: index of the token to be marked with err=1, if any.
                words: optional dictionary that is updated in place (see below).

            Returns:
                A list containing one dict per token, in token order.

            This function has the side effect of filling in the words dictionary
            with (stem, cat) keys and occurrence counts. Any mapping that supports
            get() and item assignment works, including a plain dict.

        """

        # Map tokens to associated terminals, if any
        # tmap is an empty dict if there's no parse tree
        tmap = TreeUtility._terminal_map(tree)
        dump = []
        for ix, token in enumerate(tokens):
            # We have already cut away paragraph and sentence markers
            # (P_BEGIN/P_END/S_BEGIN/S_END)
            terminal, meaning = tmap.get(ix, (None, None))
            d = describe_token(ix, token, terminal, meaning)
            if words is not None:
                wt = TreeUtility._word_tuple(token, terminal, meaning)
                if wt is not None:
                    # Add the (stem, cat) combination to the words dictionary.
                    # Use get() with a default so that the first occurrence of
                    # a key does not raise KeyError when words is a plain dict;
                    # for a defaultdict(int) or Counter the result is the same.
                    words[wt] = words.get(wt, 0) + 1
            if ix == error_index:
                # Mark the error token, if present
                d["err"] = 1
            if meaning is not None and "x" in d:
                # Also return the augmented terminal name
                d["a"] = augment_terminal(
                    terminal.name,
                    d["x"].lower(),
                    meaning.beyging
                )
            dump.append(d)
        return dump
Esempio n. 3
0
def test_augment_terminal():
    """ Check that augment_terminal() returns the expected augmented
        terminal name for a series of "so_" terminals. """
    # Each case is a tuple of
    # (terminal name, lowercase token text, beyging string, expected result)
    cases = (
        (
            "so_subj_op_þf", "langaði", "OP-GM-FH-ÞT-3P-ET",
            "so_subj_op_þf_et_fh_gm_þt",
        ),
        (
            "so_subj_sagnb_þf", "langað", "GM-SAGNB",
            "so_subj_sagnb_þf_gm",
        ),
        (
            "so_subj_lhþt_et_kvk", "valin", "LHÞT-SB-KVK-NFET",
            "so_subj_lhþt_et_kvk_nf_sb",
        ),
        (
            "so_subj_nh", "skorta", "GM-NH",
            "so_subj_nh_gm",
        ),
        (
            "so_subj_nh_þgf", "blöskra", "GM-NH",
            "so_subj_nh_þgf_gm",
        ),
        (
            "so_1_þf_subj_op_þgf", "þraut", "OP-GM-FH-ÞT-1P-ET",
            "so_1_þf_subj_op_þgf_et_fh_gm_þt",
        ),
        (
            "so_2_þgf_þf_p1_et", "skrifa", "GM-FH-NT-1P-ET",
            "so_2_þgf_þf_et_fh_gm_nt_p1",
        ),
        (
            "so_0_lhþt_et_kk", "kembdur", "LHÞT-SB-KK-NFET",
            "so_0_et_kk_lhþt_nf_sb",
        ),
    )
    # The cases are checked in order; the first mismatch fails the test
    for terminal_name, text, beyging, expected in cases:
        assert augment_terminal(terminal_name, text, beyging) == expected
Esempio n. 4
0
    def dump_tokens(tokens, tree, *, error_index=None, words=None):
        """ Generate a list of dicts representing the tokens in the sentence.

            For each token dict t:

                t.x is original token text.
                t.k is the token kind (TOK.xxx). If omitted, the kind is TOK.WORD.
                t.t is the name of the matching terminal, if any.
                t.m is the BÍN meaning of the token, if any, as a tuple as follows:
                    t.m[0] is the lemma (stofn)
                    t.m[1] is the word category (ordfl)
                    t.m[2] is the word subcategory (fl)
                    t.m[3] is the word meaning/declination (beyging)
                t.v contains auxiliary information, depending on the token kind
                t.err is 1 if the token is an error token
                t.corr contains explanatory text if a correction has been applied
                t.a is the augmented terminal name; it is only added when the
                    token has a meaning and the dict has an "x" entry

            Args:
                tokens: the tokens of the sentence, in order.
                tree: the parse tree, or None if there is no parse tree.
                error_index: index of the token to be marked with err=1, if any.
                words: optional dictionary that is updated in place (see below).

            Returns:
                A list containing one dict per token, in token order.

            This function has the side effect of filling in the words dictionary
            with (stem, cat) keys and occurrence counts. Any mapping that supports
            get() and item assignment works, including a plain dict.

        """

        # Map tokens to associated terminals, if any
        # tmap is an empty dict if there's no parse tree
        tmap = TreeUtility._terminal_map(tree)
        dump = []
        for ix, token in enumerate(tokens):
            # We have already cut away paragraph and sentence markers
            # (P_BEGIN/P_END/S_BEGIN/S_END)
            terminal, meaning = tmap.get(ix, (None, None))
            d = describe_token(ix, token, terminal, meaning)
            if words is not None:
                wt = TreeUtility._word_tuple(token, terminal, meaning)
                if wt is not None:
                    # Add the (stem, cat) combination to the words dictionary.
                    # Use get() with a default so that the first occurrence of
                    # a key does not raise KeyError when words is a plain dict;
                    # for a defaultdict(int) or Counter the result is the same.
                    words[wt] = words.get(wt, 0) + 1
            if ix == error_index:
                # Mark the error token, if present
                d["err"] = 1
            if meaning is not None and "x" in d:
                # Also return the augmented terminal name
                d["a"] = augment_terminal(terminal.name, d["x"].lower(),
                                          meaning.beyging)
            dump.append(d)
        return dump