Python sent_tagsの例

プログラミング言語: Python

名前空間/パッケージ名: tagging

メソッド/関数: sent_tags

hotexamples.comのコード掲載数: 4

Python の sent_tags のコード例が4件見つかりました。いずれもオープンソースプロジェクトから抽出した tagging.sent_tags の実際の使用例で、評価の高いものを厳選しています。コード例を評価していただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: UDlextag2json.py プロジェクト: ryanamannion/streusle

    def _postproc_sent(sent):
        nonlocal lc_tbd

        assert 'sent_id' in sent, sent

        # check that tokens are numbered from 1, in order
        for i, tok in enumerate(sent['toks'], 1):
            assert tok['#'] == i

        # check that MWEs are numbered from 1 based on first token offset
        xmwes = [(e["toknums"][0], 's', mwenum)
                 for mwenum, e in sent['smwes'].items()]
        xmwes += [(e["toknums"][0], 'w', mwenum)
                  for mwenum, e in sent['wmwes'].items()]
        xmwes.sort()
        for k, mwe in chain(sent['smwes'].items(), sent['wmwes'].items()):
            assert xmwes[int(k) - 1][
                2] == k, f"In {sent['sent_id']}, MWEs are not numbered in the correct order: use normalize_mwe_numbering.py to fix"

        # check that lexical & weak MWE lemmas are correct
        lexes_to_validate = chain(
            sent['swes'].values(),
            sent['smwes'].values()) if validate_type else []
        for lexe in lexes_to_validate:
            sent['toks'][lexe['toknums'][0] - 1]
            assert lexe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma'] for i in lexe['toknums']
            ), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
            lc = lexe['lexcat']
            if lc.endswith('!@'): lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            if lc == 'V':
                assert len(
                    lexe['toknums']
                ) == 1, f'In {sent["sent_id"]}, Verbal MWE "{lexe["lexlemma"]}" lexcat must be subtyped (V.VID, etc., not V)'
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss == '??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in (
                        'N', 'V') or lc.startswith('V.')) != (ss2 is None) or (
                            ss2 is not None and ss2 not in valid_ss):
                    assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {
                        'p.Experiencer', 'p.Stimulus', 'p.Originator',
                        'p.Recipient', 'p.SocialRel', 'p.OrgRole'
                    }, (f'{ss2} should never be function', lexe)
                    if ss != ss2:
                        ssA, ss2A = ancestors(ss), ancestors(ss2)
                        # there are just a few permissible combinations where one is the ancestor of the other
                        if (ss, ss2) not in {('p.Whole', 'p.Gestalt'),
                                             ('p.Goal', 'p.Locus'),
                                             ('p.Circumstance', 'p.Locus'),
                                             ('p.Circumstance', 'p.Path'),
                                             ('p.Locus', 'p.Goal'),
                                             ('p.Locus', 'p.Source'),
                                             ('p.Characteristic', 'p.Stuff')}:
                            assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                            assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
            else:
                assert ss is None and ss2 is None and lexe not in (
                    'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0] - 1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'): continue
            if lc not in ALL_LEXCATS:
                assert not validate_type, f"In {sent['sent_id']}, invalid lexcat {lc} for single-word expression '{tok['word']}'"
                continue
            if validate_pos and upos != lc and lc != 'PP' and (
                    upos, lc) not in {('NOUN', 'N'), ('PROPN', 'N'),
                                      ('VERB', 'V'), ('ADP', 'P'),
                                      ('ADV', 'P'), ('SCONJ', 'P'),
                                      ('ADP', 'DISC'), ('ADV', 'DISC'),
                                      ('SCONJ', 'DISC'), ('PART', 'POSS')}:
                # most often, the single-word lexcat should match its upos
                # check a list of exceptions
                mismatchOK = False
                if xpos == 'TO' and lc.startswith('INF'):
                    mismatchOK = True
                elif (xpos == 'TO') != lc.startswith('INF'):
                    assert upos == 'SCONJ' and swe['lexlemma'] == 'for', (
                        sent['sent_id'], swe, tok)
                    mismatchOK = True

                if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                    try:
                        assert upos in ('SYM', 'X') or (lc in (
                            'PRON', 'DISC')), (sent['sent_id'], swe, tok)
                    except AssertionError:
                        print('Suspicious lexcat/POS combination:',
                              sent['sent_id'],
                              swe,
                              tok,
                              file=sys.stderr)
                    mismatchOK = True
                message = f"In {sent['sent_id']}, single-word expression '{tok['word']}' has lexcat {lc}, which is incompatible with its upos {upos}"
                if (upos == 'AUX') != (lc == 'AUX'):
                    assert tok[
                        'lemma'] == 'be' and lc == 'V', message  # copula has upos=AUX
                    mismatchOK = True
                if (upos == 'VERB') != (lc == 'V'):
                    if lc == 'ADJ':
                        print(
                            'Word treated as VERB in UD, ADJ for supersenses:',
                            sent['sent_id'],
                            tok['word'],
                            file=sys.stderr)
                    else:
                        assert tok[
                            'lemma'] == 'be' and lc == 'V', message  # copula has upos=AUX
                    mismatchOK = True
                if upos == 'PRON':
                    assert lc == 'PRON' or lc == 'PRON.POSS', message
                    mismatchOK = True
                if lc == 'ADV':
                    assert upos == 'ADV' or upos == 'PART', message  # PART is for negations
                    mismatchOK = True
                if upos == 'ADP' and lc == 'CCONJ':
                    assert tok['lemma'] == 'versus'
                    mismatchOK = True

                assert mismatchOK, message
            if validate_type:
                assert lc != 'PP', f"In {sent['sent_id']}, PP should only apply to strong MWEs, but occurs for single-word expression '{tok['word']}'"
        for smwe in sent['smwes'].values():
            assert len(smwe['toknums']) > 1
        for wmwe in sent['wmwes'].values():
            assert len(
                wmwe['toknums']
            ) > 1, f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
            assert wmwe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma']
                for i in wmwe['toknums']), (wmwe,
                                            sent['toks'][wmwe['toknums'][0] -
                                                         1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        if 'mwe' not in sent:
            sent['mwe'] = render_sent(sent, False, False)
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups,
                            wmweGroups)
        for tok, tag in zip(sent['toks'], tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            else:
                position = None
                lexe = sent['swes'][tok['#']]

            if position is None or position == 1:
                lexcat = lexe['lexcat']
                fulllextag += '-' + lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-' + ss1
                    if ss2 is not None and ss2 != ss1:
                        assert ss2
                        fulllextag += '|' + ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position == 1:
                        fulllextag += '+' + wcat

            assert tok[
                'lextag'] == fulllextag, f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"

        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']], smweGroups,
                   wmweGroups)
        if sent['mwe'] != s:
            caveat = ' (may be due to simplification)' if '$1' in sent[
                'mwe'] else ''
            print(f'MWE string mismatch{caveat}:',
                  s,
                  sent['mwe'],
                  sent['sent_id'],
                  file=sys.stderr)

コード例 #2

ファイルを表示

            smweGroupToks[o] = sg
            lexLemmas[o] = '_'  # lexlemma for strong MWE should be _
        lexLemmas[sg[0]] = ' '.join(udLemmas[j - 1]
                                    for j in sg)  # ...except the first token
    for h, wg in enumerate(data["~"], i + 1):
        wg = sorted(wg)
        for j, o in enumerate(wg, 1):
            assert o not in wmweGroup
            wmweGroup[o] = f'{h}:{j}'
            wmweGroupToks[o] = wg
            wLemmas[o] = '_'  # wlemma for MWE tokens should be _
        wLemmas[wg[0]] = ' '.join(udLemmas[j - 1]
                                  for j in wg)  # ...except the first token

    tagging = sent_tags(len(data["words"]), mweMarkup,
                        set(map(tuple, smweGroupToks.values())),
                        set(map(tuple, wmweGroupToks.values())))

    for ln in udTokLines:
        tokNum, form, lemma, upos, xpos, feats, head, deprel, deps, misc = ln.split(
            '\t')
        if re.match(r'^\d+$', tokNum):
            tokNum = int(tokNum)
            offset0 = tokNum - 1
            if data["words"][offset0] != [form, xpos]:
                # Most of the time the UD wordform and XPOS will match
                # what is stored in the .sst file.
                # Exceptions: ASCII normalization and tag fixes.
                print(data["words"][offset0], [form, xpos], file=sys.stderr)
            # TODO: if data["lemmas"]: assert data["lemmas"][offset0] == lemma
            smwe = smweGroup.get(tokNum, '_')

コード例 #3

ファイルを表示

ファイル: conllulex2json.py プロジェクト: nschneid/pss-nn

    def _postproc_sent(sent):
        nonlocal lc_tbd

        sent['autoid_swes'] = sent.get('autoid_swes') or {}
        sent['autoid_smwes'] = sent.get('autoid_smwes') or {}

        # autoid/goldid - pick one according to args. For autoid, fill in gold ss,ss2 if there's an exact match in gold id
        if identification == 'autoid':
            for auto_we in chain(sent['autoid_swes'].values(),
                                 sent['autoid_smwes'].values()):
                matching_gold_wes = [
                    we for we in chain(sent['swes'].values(),
                                       sent['smwes'].values())
                    if set(we['toknums']) == set(auto_we['toknums'])
                ]
                gold_we = (matching_gold_wes + [None])[0]
                if gold_we and all([
                        ss is None or '.' in ss
                        for ss in [gold_we['ss'], gold_we['ss2']]
                ]):
                    auto_we['ss'], auto_we['ss2'] = gold_we['ss'], gold_we[
                        'ss2']
                else:
                    auto_we['ss'], auto_we['ss2'] = None, None
            sent['swes'], sent['smwes'] = sent['autoid_swes'], sent[
                'autoid_smwes']
            for tok in sent['toks']:
                tok['smwe'] = tok.get('autoid_smwe')
                if 'autoid_smwe' in tok:
                    del tok['autoid_smwe']
                tok['wmwe'] = None
            sent['wmwes'] = {}

        del sent['autoid_smwes']
        del sent['autoid_swes']

        # check that tokens are numbered from 1, in order
        for i, tok in enumerate(sent['toks'], 1):
            assert tok['#'] == i

        # check that MWEs are numbered from 1
        # fix_mwe_numbering.py was written to correct this
        for i, (k, mwe) in enumerate(
                sorted(chain(sent['smwes'].items(), sent['wmwes'].items()),
                       key=lambda x: int(x[0])), 1):
            assert int(k) == i, (sent['sent_id'], i, k, mwe)

        # check that lexical & weak MWE lemmas are correct
        for lexe in chain(sent['swes'].values(), sent['smwes'].values()):
            lexe['lexlemma'] = ' '.join(sent['toks'][i - 1]['lemma']
                                        for i in lexe['toknums'])
            lc = lexe['lexcat']
            if lc.endswith('!@'): lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss == '??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in ('N', 'V')) != (
                        ss2 is None) or (ss2 is not None
                                         and ss2 not in valid_ss):
                    print('Invalid supersense(s) in lexical entry:',
                          lexe,
                          file=sys.stderr)
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {
                        'p.Experiencer', 'p.Stimulus', 'p.Originator',
                        'p.Recipient', 'p.SocialRel', 'p.OrgRole'
                    }, (ss2 + ' should never be function', lexe)
            else:
                assert ss is None and ss2 is None and lexe not in (
                    'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0] - 1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'): continue
            assert lc in ALL_LEXCATS, (sent['sent_id'], tok)
            if (xpos == 'TO') != lc.startswith('INF'):
                # assert upos=='SCONJ' and swe['lexlemma']=='for',(sent['sent_id'],swe,tok)
                pass
            if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                try:
                    assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), (
                        sent['sent_id'], swe, tok)
                except AssertionError:
                    print('Suspicious lexcat/POS combination:',
                          sent['sent_id'],
                          swe,
                          tok,
                          file=sys.stderr)
            if (upos == 'AUX') != (lc == 'AUX'):
                # assert tok['lemma']=='be' and lc=='V',(sent['sent_id'],tok)    # copula has upos=AUX
                pass
            if (upos == 'VERB') != (lc == 'V'):
                if lc == 'ADJ':
                    print('Word treated as VERB in UD, ADJ for supersenses:',
                          sent['sent_id'],
                          tok['word'],
                          file=sys.stderr)
                else:
                    # assert tok['lemma']=='be' and lc=='V',(sent['sent_id'],tok)    # copula has upos=AUX
                    pass
            if upos == 'PRON':
                # assert lc=='PRON' or lc=='PRON.POSS',(sent['sent_id'],tok)
                pass
            if lc == 'ADV':
                # assert upos=='ADV' or upos=='PART',(sent['sent_id'],tok)    # PART is for negations
                pass
            assert lc != 'PP', ('PP should only apply to strong MWEs',
                                sent['sent_id'], tok)
        for smwe in sent['smwes'].values():
            assert len(smwe['toknums']) > 1
        for wmwe in sent['wmwes'].values():
            assert len(wmwe['toknums']) > 1, (sent['sent_id'], wmwe)
            # assert wmwe['lexlemma']==' '.join(sent['toks'][i-1]['lemma'] for i in wmwe['toknums']),(wmwe,sent['toks'][wmwe['toknums'][0]-1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups,
                            wmweGroups)
        for tok, tag in zip(sent['toks'], tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            elif tok['#'] in sent['swes']:
                position = None
                lexe = sent['swes'][tok['#']]
            else:
                lexe = None

            if lexe and (position is None or position == 1):
                lexcat = lexe['lexcat']
                fulllextag += '-' + lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-' + ss1
                    if ss2 is not None and ss2 != ss1:
                        assert ss2
                        fulllextag += '|' + ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position == 1:
                        fulllextag += '+' + wcat

            # assert tok['lextag']==fulllextag,(sent['sent_id'],fulllextag,tok)

        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']], smweGroups,
                   wmweGroups)
        if sent['mwe'] != s:
            caveat = ' (may be due to simplification)' if '$1' in sent[
                'mwe'] else ''
            print('MWE string mismatch' + caveat + ':',
                  s,
                  sent['mwe'],
                  sent['sent_id'],
                  file=sys.stderr)

コード例 #4

ファイルを表示

    def _postproc_sent(sent):
        nonlocal lc_tbd

        # check that tokens are numbered from 1, in order
        for i, tok in enumerate(sent['toks'], 1):
            assert tok['#'] == i

        # check that MWEs are numbered from 1
        # fix_mwe_numbering.py was written to correct this
        for i, (k, mwe) in enumerate(
                sorted(chain(sent['smwes'].items(), sent['wmwes'].items()),
                       key=lambda x: int(x[0])), 1):
            assert int(k) == i, (sent['sent_id'], i, k, mwe)

        # check that lexical & weak MWE lemmas are correct
        for lexe in chain(sent['swes'].values(), sent['smwes'].values()):
            assert lexe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma'] for i in lexe['toknums']
            ), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
            lc = lexe['lexcat']
            if lc.endswith('!@'): lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            if lc == 'V':
                assert len(
                    lexe['toknums']
                ) == 1, f'Verbal MWE lexcat must be subtyped (V.VID, etc., not V): {lexe}'
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss == '??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in (
                        'N', 'V') or lc.startswith('V.')) != (ss2 is None) or (
                            ss2 is not None and ss2 not in valid_ss):
                    assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {
                        'p.Experiencer', 'p.Stimulus', 'p.Originator',
                        'p.Recipient', 'p.SocialRel', 'p.OrgRole'
                    }, (f'{ss2} should never be function', lexe)
                    if ss != ss2:
                        ssA, ss2A = ancestors(ss), ancestors(ss2)
                        # there are just a few permissible combinations where one is the ancestor of the other
                        if (ss, ss2) not in {('p.Whole', 'p.Gestalt'),
                                             ('p.Goal', 'p.Locus'),
                                             ('p.Circumstance', 'p.Locus'),
                                             ('p.Circumstance', 'p.Path'),
                                             ('p.Locus', 'p.Goal'),
                                             ('p.Locus', 'p.Source'),
                                             ('p.Characteristic', 'p.Stuff')}:
                            assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                            assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
            else:
                assert ss is None and ss2 is None and lexe not in (
                    'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0] - 1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'): continue
            assert lc in ALL_LEXCATS, f"In {sent['sent_id']}, invalid lexcat for single-word expression: {lc} in {tok}"
            if (xpos == 'TO') != lc.startswith('INF'):
                assert upos == 'SCONJ' and swe['lexlemma'] == 'for', (
                    sent['sent_id'], swe, tok)
            if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                try:
                    assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), (
                        sent['sent_id'], swe, tok)
                except AssertionError:
                    print('Suspicious lexcat/POS combination:',
                          sent['sent_id'],
                          swe,
                          tok,
                          file=sys.stderr)
            if (upos == 'AUX') != (lc == 'AUX'):
                assert tok['lemma'] == 'be' and lc == 'V', (
                    sent['sent_id'], tok)  # copula has upos=AUX
            if (upos == 'VERB') != (lc == 'V'):
                if lc == 'ADJ':
                    print('Word treated as VERB in UD, ADJ for supersenses:',
                          sent['sent_id'],
                          tok['word'],
                          file=sys.stderr)
                else:
                    assert tok['lemma'] == 'be' and lc == 'V', (
                        sent['sent_id'], tok)  # copula has upos=AUX
            if upos == 'PRON':
                assert lc == 'PRON' or lc == 'PRON.POSS', (sent['sent_id'],
                                                           tok)
            if lc == 'ADV':
                assert upos == 'ADV' or upos == 'PART', (
                    sent['sent_id'], tok)  # PART is for negations
            assert lc != 'PP', ('PP should only apply to strong MWEs',
                                sent['sent_id'], tok)
        for smwe in sent['smwes'].values():
            assert len(smwe['toknums']) > 1
        for wmwe in sent['wmwes'].values():
            assert len(
                wmwe['toknums']
            ) > 1, f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
            assert wmwe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma']
                for i in wmwe['toknums']), (wmwe,
                                            sent['toks'][wmwe['toknums'][0] -
                                                         1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups,
                            wmweGroups)
        for tok, tag in zip(sent['toks'], tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            else:
                position = None
                lexe = sent['swes'][tok['#']]

            if position is None or position == 1:
                lexcat = lexe['lexcat']
                fulllextag += '-' + lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-' + ss1
                    if ss2 is not None and ss2 != ss1:
                        assert ss2
                        fulllextag += '|' + ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position == 1:
                        fulllextag += '+' + wcat

            assert tok[
                'lextag'] == fulllextag, f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"

        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']], smweGroups,
                   wmweGroups)
        if sent['mwe'] != s:
            caveat = ' (may be due to simplification)' if '$1' in sent[
                'mwe'] else ''
            print(f'MWE string mismatch{caveat}:',
                  s,
                  sent['mwe'],
                  sent['sent_id'],
                  file=sys.stderr)