def _postproc_sent(sent): nonlocal lc_tbd assert 'sent_id' in sent, sent # check that tokens are numbered from 1, in order for i, tok in enumerate(sent['toks'], 1): assert tok['#'] == i # check that MWEs are numbered from 1 based on first token offset xmwes = [(e["toknums"][0], 's', mwenum) for mwenum, e in sent['smwes'].items()] xmwes += [(e["toknums"][0], 'w', mwenum) for mwenum, e in sent['wmwes'].items()] xmwes.sort() for k, mwe in chain(sent['smwes'].items(), sent['wmwes'].items()): assert xmwes[int(k) - 1][ 2] == k, f"In {sent['sent_id']}, MWEs are not numbered in the correct order: use normalize_mwe_numbering.py to fix" # check that lexical & weak MWE lemmas are correct lexes_to_validate = chain( sent['swes'].values(), sent['smwes'].values()) if validate_type else [] for lexe in lexes_to_validate: sent['toks'][lexe['toknums'][0] - 1] assert lexe['lexlemma'] == ' '.join( sent['toks'][i - 1]['lemma'] for i in lexe['toknums'] ), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}" lc = lexe['lexcat'] if lc.endswith('!@'): lc_tbd += 1 valid_ss = supersenses_for_lexcat(lc) if lc == 'V': assert len( lexe['toknums'] ) == 1, f'In {sent["sent_id"]}, Verbal MWE "{lexe["lexlemma"]}" lexcat must be subtyped (V.VID, etc., not V)' ss, ss2 = lexe['ss'], lexe['ss2'] if valid_ss: if ss == '??': assert ss2 is None elif ss not in valid_ss or (lc in ( 'N', 'V') or lc.startswith('V.')) != (ss2 is None) or ( ss2 is not None and ss2 not in valid_ss): assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}" elif ss.startswith('p.'): assert ss2.startswith('p.') assert ss2 not in { 'p.Experiencer', 'p.Stimulus', 'p.Originator', 'p.Recipient', 'p.SocialRel', 'p.OrgRole' }, (f'{ss2} should never be function', lexe) if ss != ss2: ssA, ss2A = ancestors(ss), ancestors(ss2) # there are just a few permissible combinations where one is the ancestor of the other if (ss, ss2) not in {('p.Whole', 'p.Gestalt'), ('p.Goal', 'p.Locus'), ('p.Circumstance', 'p.Locus'), ('p.Circumstance', 'p.Path'), ('p.Locus', 'p.Goal'), ('p.Locus', 'p.Source'), ('p.Characteristic', 'p.Stuff')}: assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}" assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}" else: assert ss is None and ss2 is None and lexe not in ( 'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe # check lexcat on single-word expressions for swe in sent['swes'].values(): tok = sent['toks'][swe['toknums'][0] - 1] upos, xpos = tok['upos'], tok['xpos'] lc = swe['lexcat'] if lc.endswith('!@'): continue if lc not in ALL_LEXCATS: assert not validate_type, f"In {sent['sent_id']}, invalid lexcat {lc} for single-word expression '{tok['word']}'" continue if validate_pos and upos != lc and lc != 'PP' and ( upos, lc) not in {('NOUN', 'N'), ('PROPN', 'N'), ('VERB', 'V'), ('ADP', 'P'), ('ADV', 'P'), ('SCONJ', 'P'), ('ADP', 'DISC'), ('ADV', 'DISC'), ('SCONJ', 'DISC'), ('PART', 'POSS')}: # most often, the single-word lexcat should match its upos # check a list of exceptions mismatchOK = False if xpos == 'TO' and lc.startswith('INF'): mismatchOK = True elif (xpos == 'TO') != lc.startswith('INF'): assert upos == 'SCONJ' and swe['lexlemma'] == 'for', ( sent['sent_id'], swe, tok) mismatchOK = True if (upos in ('NOUN', 'PROPN')) != (lc == 'N'): try: assert upos in ('SYM', 'X') or (lc in ( 'PRON', 'DISC')), (sent['sent_id'], swe, tok) except AssertionError: print('Suspicious lexcat/POS combination:', sent['sent_id'], swe, tok, file=sys.stderr) mismatchOK = True message = f"In {sent['sent_id']}, single-word expression '{tok['word']}' has lexcat {lc}, which is incompatible with its upos {upos}" if (upos == 'AUX') != (lc == 'AUX'): assert tok[ 'lemma'] == 'be' and lc == 'V', message # copula has upos=AUX mismatchOK = True if (upos == 'VERB') != (lc == 'V'): if lc == 'ADJ': print( 'Word treated as VERB in UD, ADJ for supersenses:', sent['sent_id'], tok['word'], file=sys.stderr) else: assert tok[ 'lemma'] == 'be' and lc == 'V', message # copula has upos=AUX mismatchOK = True if upos == 'PRON': assert lc == 'PRON' or lc == 'PRON.POSS', message mismatchOK = True if lc == 'ADV': assert upos == 'ADV' or upos == 'PART', message # PART is for negations mismatchOK = True if upos == 'ADP' and lc == 'CCONJ': assert tok['lemma'] == 'versus' mismatchOK = True assert mismatchOK, message if validate_type: assert lc != 'PP', f"In {sent['sent_id']}, PP should only apply to strong MWEs, but occurs for single-word expression '{tok['word']}'" for smwe in sent['smwes'].values(): assert len(smwe['toknums']) > 1 for wmwe in sent['wmwes'].values(): assert len( wmwe['toknums'] ) > 1, f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}" assert wmwe['lexlemma'] == ' '.join( sent['toks'][i - 1]['lemma'] for i in wmwe['toknums']), (wmwe, sent['toks'][wmwe['toknums'][0] - 1]) # we already checked that noninitial tokens in an MWE have _ as their lemma # check lextags smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()] wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()] if 'mwe' not in sent: sent['mwe'] = render_sent(sent, False, False) tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups, wmweGroups) for tok, tag in zip(sent['toks'], tagging): fulllextag = tag if tok['smwe']: smweNum, position = tok['smwe'] lexe = sent['smwes'][smweNum] else: position = None lexe = sent['swes'][tok['#']] if position is None or position == 1: lexcat = lexe['lexcat'] fulllextag += '-' + lexcat ss1, ss2 = lexe['ss'], lexe['ss2'] if ss1 is not None: assert ss1 fulllextag += '-' + ss1 if ss2 is not None and ss2 != ss1: assert ss2 fulllextag += '|' + ss2 if tok['wmwe']: wmweNum, position = tok['wmwe'] wmwe = sent['wmwes'][wmweNum] wcat = wmwe['lexcat'] if wcat and position == 1: fulllextag += '+' + wcat assert tok[ 'lextag'] == fulllextag, f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}" # check rendered MWE string s = render([tok['word'] for tok in sent['toks']], smweGroups, wmweGroups) if sent['mwe'] != s: caveat = ' (may be due to simplification)' if '$1' in sent[ 'mwe'] else '' print(f'MWE string mismatch{caveat}:', s, sent['mwe'], sent['sent_id'], file=sys.stderr)
smweGroupToks[o] = sg lexLemmas[o] = '_' # lexlemma for strong MWE should be _ lexLemmas[sg[0]] = ' '.join(udLemmas[j - 1] for j in sg) # ...except the first token for h, wg in enumerate(data["~"], i + 1): wg = sorted(wg) for j, o in enumerate(wg, 1): assert o not in wmweGroup wmweGroup[o] = f'{h}:{j}' wmweGroupToks[o] = wg wLemmas[o] = '_' # wlemma for MWE tokens should be _ wLemmas[wg[0]] = ' '.join(udLemmas[j - 1] for j in wg) # ...except the first token tagging = sent_tags(len(data["words"]), mweMarkup, set(map(tuple, smweGroupToks.values())), set(map(tuple, wmweGroupToks.values()))) for ln in udTokLines: tokNum, form, lemma, upos, xpos, feats, head, deprel, deps, misc = ln.split( '\t') if re.match(r'^\d+$', tokNum): tokNum = int(tokNum) offset0 = tokNum - 1 if data["words"][offset0] != [form, xpos]: # Most of the time the UD wordform and XPOS will match # what is stored in the .sst file. # Exceptions: ASCII normalization and tag fixes. print(data["words"][offset0], [form, xpos], file=sys.stderr) # TODO: if data["lemmas"]: assert data["lemmas"][offset0] == lemma smwe = smweGroup.get(tokNum, '_')
def _postproc_sent(sent): nonlocal lc_tbd sent['autoid_swes'] = sent.get('autoid_swes') or {} sent['autoid_smwes'] = sent.get('autoid_smwes') or {} # autoid/goldid - pick one according to args. For autoid, fill in gold ss,ss2 if there's an exact match in gold id if identification == 'autoid': for auto_we in chain(sent['autoid_swes'].values(), sent['autoid_smwes'].values()): matching_gold_wes = [ we for we in chain(sent['swes'].values(), sent['smwes'].values()) if set(we['toknums']) == set(auto_we['toknums']) ] gold_we = (matching_gold_wes + [None])[0] if gold_we and all([ ss is None or '.' in ss for ss in [gold_we['ss'], gold_we['ss2']] ]): auto_we['ss'], auto_we['ss2'] = gold_we['ss'], gold_we[ 'ss2'] else: auto_we['ss'], auto_we['ss2'] = None, None sent['swes'], sent['smwes'] = sent['autoid_swes'], sent[ 'autoid_smwes'] for tok in sent['toks']: tok['smwe'] = tok.get('autoid_smwe') if 'autoid_smwe' in tok: del tok['autoid_smwe'] tok['wmwe'] = None sent['wmwes'] = {} del sent['autoid_smwes'] del sent['autoid_swes'] # check that tokens are numbered from 1, in order for i, tok in enumerate(sent['toks'], 1): assert tok['#'] == i # check that MWEs are numbered from 1 # fix_mwe_numbering.py was written to correct this for i, (k, mwe) in enumerate( sorted(chain(sent['smwes'].items(), sent['wmwes'].items()), key=lambda x: int(x[0])), 1): assert int(k) == i, (sent['sent_id'], i, k, mwe) # check that lexical & weak MWE lemmas are correct for lexe in chain(sent['swes'].values(), sent['smwes'].values()): lexe['lexlemma'] = ' '.join(sent['toks'][i - 1]['lemma'] for i in lexe['toknums']) lc = lexe['lexcat'] if lc.endswith('!@'): lc_tbd += 1 valid_ss = supersenses_for_lexcat(lc) ss, ss2 = lexe['ss'], lexe['ss2'] if valid_ss: if ss == '??': assert ss2 is None elif ss not in valid_ss or (lc in ('N', 'V')) != ( ss2 is None) or (ss2 is not None and ss2 not in valid_ss): print('Invalid supersense(s) in lexical entry:', lexe, file=sys.stderr) elif ss.startswith('p.'): assert ss2.startswith('p.') assert ss2 not in { 'p.Experiencer', 'p.Stimulus', 'p.Originator', 'p.Recipient', 'p.SocialRel', 'p.OrgRole' }, (ss2 + ' should never be function', lexe) else: assert ss is None and ss2 is None and lexe not in ( 'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe # check lexcat on single-word expressions for swe in sent['swes'].values(): tok = sent['toks'][swe['toknums'][0] - 1] upos, xpos = tok['upos'], tok['xpos'] lc = swe['lexcat'] if lc.endswith('!@'): continue assert lc in ALL_LEXCATS, (sent['sent_id'], tok) if (xpos == 'TO') != lc.startswith('INF'): # assert upos=='SCONJ' and swe['lexlemma']=='for',(sent['sent_id'],swe,tok) pass if (upos in ('NOUN', 'PROPN')) != (lc == 'N'): try: assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), ( sent['sent_id'], swe, tok) except AssertionError: print('Suspicious lexcat/POS combination:', sent['sent_id'], swe, tok, file=sys.stderr) if (upos == 'AUX') != (lc == 'AUX'): # assert tok['lemma']=='be' and lc=='V',(sent['sent_id'],tok) # copula has upos=AUX pass if (upos == 'VERB') != (lc == 'V'): if lc == 'ADJ': print('Word treated as VERB in UD, ADJ for supersenses:', sent['sent_id'], tok['word'], file=sys.stderr) else: # assert tok['lemma']=='be' and lc=='V',(sent['sent_id'],tok) # copula has upos=AUX pass if upos == 'PRON': # assert lc=='PRON' or lc=='PRON.POSS',(sent['sent_id'],tok) pass if lc == 'ADV': # assert upos=='ADV' or upos=='PART',(sent['sent_id'],tok) # PART is for negations pass assert lc != 'PP', ('PP should only apply to strong MWEs', sent['sent_id'], tok) for smwe in sent['smwes'].values(): assert len(smwe['toknums']) > 1 for wmwe in sent['wmwes'].values(): assert len(wmwe['toknums']) > 1, (sent['sent_id'], wmwe) # assert wmwe['lexlemma']==' '.join(sent['toks'][i-1]['lemma'] for i in wmwe['toknums']),(wmwe,sent['toks'][wmwe['toknums'][0]-1]) # we already checked that noninitial tokens in an MWE have _ as their lemma # check lextags smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()] wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()] tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups, wmweGroups) for tok, tag in zip(sent['toks'], tagging): fulllextag = tag if tok['smwe']: smweNum, position = tok['smwe'] lexe = sent['smwes'][smweNum] elif tok['#'] in sent['swes']: position = None lexe = sent['swes'][tok['#']] else: lexe = None if lexe and (position is None or position == 1): lexcat = lexe['lexcat'] fulllextag += '-' + lexcat ss1, ss2 = lexe['ss'], lexe['ss2'] if ss1 is not None: assert ss1 fulllextag += '-' + ss1 if ss2 is not None and ss2 != ss1: assert ss2 fulllextag += '|' + ss2 if tok['wmwe']: wmweNum, position = tok['wmwe'] wmwe = sent['wmwes'][wmweNum] wcat = wmwe['lexcat'] if wcat and position == 1: fulllextag += '+' + wcat # assert tok['lextag']==fulllextag,(sent['sent_id'],fulllextag,tok) # check rendered MWE string s = render([tok['word'] for tok in sent['toks']], smweGroups, wmweGroups) if sent['mwe'] != s: caveat = ' (may be due to simplification)' if '$1' in sent[ 'mwe'] else '' print('MWE string mismatch' + caveat + ':', s, sent['mwe'], sent['sent_id'], file=sys.stderr)
def _postproc_sent(sent): nonlocal lc_tbd # check that tokens are numbered from 1, in order for i, tok in enumerate(sent['toks'], 1): assert tok['#'] == i # check that MWEs are numbered from 1 # fix_mwe_numbering.py was written to correct this for i, (k, mwe) in enumerate( sorted(chain(sent['smwes'].items(), sent['wmwes'].items()), key=lambda x: int(x[0])), 1): assert int(k) == i, (sent['sent_id'], i, k, mwe) # check that lexical & weak MWE lemmas are correct for lexe in chain(sent['swes'].values(), sent['smwes'].values()): assert lexe['lexlemma'] == ' '.join( sent['toks'][i - 1]['lemma'] for i in lexe['toknums'] ), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}" lc = lexe['lexcat'] if lc.endswith('!@'): lc_tbd += 1 valid_ss = supersenses_for_lexcat(lc) if lc == 'V': assert len( lexe['toknums'] ) == 1, f'Verbal MWE lexcat must be subtyped (V.VID, etc., not V): {lexe}' ss, ss2 = lexe['ss'], lexe['ss2'] if valid_ss: if ss == '??': assert ss2 is None elif ss not in valid_ss or (lc in ( 'N', 'V') or lc.startswith('V.')) != (ss2 is None) or ( ss2 is not None and ss2 not in valid_ss): assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}" elif ss.startswith('p.'): assert ss2.startswith('p.') assert ss2 not in { 'p.Experiencer', 'p.Stimulus', 'p.Originator', 'p.Recipient', 'p.SocialRel', 'p.OrgRole' }, (f'{ss2} should never be function', lexe) if ss != ss2: ssA, ss2A = ancestors(ss), ancestors(ss2) # there are just a few permissible combinations where one is the ancestor of the other if (ss, ss2) not in {('p.Whole', 'p.Gestalt'), ('p.Goal', 'p.Locus'), ('p.Circumstance', 'p.Locus'), ('p.Circumstance', 'p.Path'), ('p.Locus', 'p.Goal'), ('p.Locus', 'p.Source'), ('p.Characteristic', 'p.Stuff')}: assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}" assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}" else: assert ss is None and ss2 is None and lexe not in ( 'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe # check lexcat on single-word expressions for swe in sent['swes'].values(): tok = sent['toks'][swe['toknums'][0] - 1] upos, xpos = tok['upos'], tok['xpos'] lc = swe['lexcat'] if lc.endswith('!@'): continue assert lc in ALL_LEXCATS, f"In {sent['sent_id']}, invalid lexcat for single-word expression: {lc} in {tok}" if (xpos == 'TO') != lc.startswith('INF'): assert upos == 'SCONJ' and swe['lexlemma'] == 'for', ( sent['sent_id'], swe, tok) if (upos in ('NOUN', 'PROPN')) != (lc == 'N'): try: assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), ( sent['sent_id'], swe, tok) except AssertionError: print('Suspicious lexcat/POS combination:', sent['sent_id'], swe, tok, file=sys.stderr) if (upos == 'AUX') != (lc == 'AUX'): assert tok['lemma'] == 'be' and lc == 'V', ( sent['sent_id'], tok) # copula has upos=AUX if (upos == 'VERB') != (lc == 'V'): if lc == 'ADJ': print('Word treated as VERB in UD, ADJ for supersenses:', sent['sent_id'], tok['word'], file=sys.stderr) else: assert tok['lemma'] == 'be' and lc == 'V', ( sent['sent_id'], tok) # copula has upos=AUX if upos == 'PRON': assert lc == 'PRON' or lc == 'PRON.POSS', (sent['sent_id'], tok) if lc == 'ADV': assert upos == 'ADV' or upos == 'PART', ( sent['sent_id'], tok) # PART is for negations assert lc != 'PP', ('PP should only apply to strong MWEs', sent['sent_id'], tok) for smwe in sent['smwes'].values(): assert len(smwe['toknums']) > 1 for wmwe in sent['wmwes'].values(): assert len( wmwe['toknums'] ) > 1, f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}" assert wmwe['lexlemma'] == ' '.join( sent['toks'][i - 1]['lemma'] for i in wmwe['toknums']), (wmwe, sent['toks'][wmwe['toknums'][0] - 1]) # we already checked that noninitial tokens in an MWE have _ as their lemma # check lextags smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()] wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()] tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups, wmweGroups) for tok, tag in zip(sent['toks'], tagging): fulllextag = tag if tok['smwe']: smweNum, position = tok['smwe'] lexe = sent['smwes'][smweNum] else: position = None lexe = sent['swes'][tok['#']] if position is None or position == 1: lexcat = lexe['lexcat'] fulllextag += '-' + lexcat ss1, ss2 = lexe['ss'], lexe['ss2'] if ss1 is not None: assert ss1 fulllextag += '-' + ss1 if ss2 is not None and ss2 != ss1: assert ss2 fulllextag += '|' + ss2 if tok['wmwe']: wmweNum, position = tok['wmwe'] wmwe = sent['wmwes'][wmweNum] wcat = wmwe['lexcat'] if wcat and position == 1: fulllextag += '+' + wcat assert tok[ 'lextag'] == fulllextag, f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}" # check rendered MWE string s = render([tok['word'] for tok in sent['toks']], smweGroups, wmweGroups) if sent['mwe'] != s: caveat = ' (may be due to simplification)' if '$1' in sent[ 'mwe'] else '' print(f'MWE string mismatch{caveat}:', s, sent['mwe'], sent['sent_id'], file=sys.stderr)