def main(args):
    with open(args.fname,
              encoding="utf-8") as f, open(args.lextags,
                                           encoding="utf-8") as lextags_lines:
        print_json(
            swap_lextags(load_sents(f, ss_mapper=SSMapper(args.depth)),
                         map(ast.literal_eval, lextags_lines)))
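Each of these main(args) snippets expects an argparse namespace. A minimal sketch of the CLI wiring for the example above, covering only the attributes it actually reads (the flag names, help strings, and default depth are guesses, not taken from the original):

import argparse

parser = argparse.ArgumentParser(description='Swap predicted lextags into STREUSLE JSON')
parser.add_argument('fname', help='path to the input sentences file')
parser.add_argument('lextags', help="file with one Python-literal tag list per line, "
                                    "e.g. ['B-p.Locus|p.Goal', 'I_', 'O']")
parser.add_argument('--depth', type=int, default=4,
                    help='supersense coarsening depth passed to SSMapper (assumed default)')

if __name__ == '__main__':
    main(parser.parse_args())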
Example #2
def main(args):
    goldF = args.goldfile
    sysFs = args.sysfile

    ss_mapper = lambda ss: coarsen_pss(ss, args.depth) if ss.startswith(
        'p.') else ss

    # Load gold data
    gold_sents = list(load_sents(goldF, ss_mapper=ss_mapper))
    for sent in gold_sents:
        sent['punits'] = {
            tuple(e['toknums']): (e['lexcat'], e['ss'], e['ss2'])
            for e in list(sent['swes'].values()) + list(sent['smwes'].values())
            if e['ss'] and (e['ss'].startswith('p.') or e['ss'] == '??')
        }

    all_sys_scores = {}
    for sysF in sysFs:
        sysscores = eval_sys(sysF, gold_sents, ss_mapper)
        syspath = sysF.name
        basename = syspath.rsplit('.', 2)[0]
        if basename not in all_sys_scores:
            all_sys_scores[basename] = [
                defaultdict(lambda: defaultdict(Counter)),
                defaultdict(lambda: defaultdict(Counter))
            ]
        if syspath.split('.')[-2] == 'goldid':
            all_sys_scores[basename][0] = sysscores
        else:
            all_sys_scores[basename][1] = sysscores

    # Print output
    args.output_format(all_sys_scores, depth=args.depth)
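args.output_format above is a callable stored directly on the namespace, invoked with the scores and a depth keyword. One plausible way to set that up, with hypothetical formatter functions write_json_scores and write_tsv_scores (the real CLI for this snippet is not shown):

# Hypothetical: map mutually exclusive flags onto formatting callables.
fmt = parser.add_mutually_exclusive_group()
fmt.add_argument('--json', dest='output_format', action='store_const',
                 const=write_json_scores, help='emit scores as JSON')
fmt.add_argument('--tsv', dest='output_format', action='store_const',
                 const=write_tsv_scores, help='emit scores as TSV tables')
parser.set_defaults(output_format=write_tsv_scores)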
Example #3
def __call__(self, batch_tags: List[List[str]],
             batch_gold_tags: List[List[str]],
             batch_upos: List[List[str]]):
    tempdir = tempfile.mkdtemp()
    gold_path = os.path.join(tempdir, "gold.json")
    predicted_path = os.path.join(tempdir, "predicted.autoid.json")
    # TODO(danielhers): Unused variable:
    # unpacked_predicted_path = os.path.join(tempdir, "unpacked_predicted.autoid.json")
    with open(predicted_path, "w", encoding="utf-8") as predicted_file, \
            open(gold_path, "w", encoding="utf-8") as gold_file:
        for tags, gold_tags, upos in zip(batch_tags, batch_gold_tags,
                                         batch_upos):
            write_conllulex_formatted_tags_to_file(predicted_file,
                                                   gold_file, tags,
                                                   gold_tags, upos)
    with open(predicted_path, encoding="utf-8") as predicted_file:
        # TODO(danielhers): Unused variable:
        # \ open(unpacked_predicted_path, "w", encoding="utf-8") as unpacked_predicted_file:
        print_json(unpack_sents(predicted_file))
    with open(gold_path, encoding="utf-8") as gold_file:
        # TODO(danielhers): Unused variable:
        # \ open(unpacked_predicted_path, encoding="utf-8") as unpacked_predicted_file:
        gold_sents = list(load_sents(gold_file, ss_mapper=ss_mapper))
    # reopen the predictions: the handle above was exhausted by print_json
    # and closed when its `with` block ended
    with open(predicted_path, encoding="utf-8") as predicted_file:
        self._scores = eval_sys(predicted_file, gold_sents,
                                ss_mapper)  # TODO accumulate
Example #4
def main(args):
    goldF = args.goldfile
    sysFs = args.sysfile

    ss_mapper = lambda ss: coarsen_pss(ss, args.depth) if ss.startswith(
        'p.') else ss

    # Load gold data
    gold_sents = list(load_sents(goldF, ss_mapper=ss_mapper))

    all_sys_scores = {}
    for sysF in sysFs:
        sysscores = eval_sys(sysF, gold_sents, ss_mapper)
        syspath = sysF.name
        basename = syspath.rsplit('.', 2)[0]
        if basename not in all_sys_scores:
            all_sys_scores[basename] = [
                defaultdict(lambda: defaultdict(Counter)),
                defaultdict(lambda: defaultdict(Counter))
            ]
        if syspath.split('.')[-2] == 'goldid':
            all_sys_scores[basename][0] = sysscores
        else:
            all_sys_scores[basename][1] = sysscores

    # Print output
    args.output_format(all_sys_scores, depth=args.depth, mode=args.output_mode)
Example #5
def main(args):
    ss_mapper = SSMapper(args.depth)

    # Load gold data
    gold_sents = list(
        tqdm(load_sents(args.goldfile, ss_mapper=ss_mapper),
             desc="Reading " + args.goldfile.name,
             unit=" lines"))

    all_sys_scores = {}
    for lextags_file in args.lextags:
        # Load predictions
        with open(lextags_file, encoding="utf-8") as f:
            pred_sents = list(
                tqdm(swap_lextags(gold_sents, map(ast.literal_eval, f)),
                     desc="Reading " + lextags_file,
                     unit=" lines"))
        s = StringIO()
        print_json(pred_sents, fh=s)
        s = BytesIO(s.getvalue().encode("utf-8"))
        s.name = "autoid.json"
        scores = eval_sys(s, gold_sents, ss_mapper)
        basename = lextags_file.rsplit('.', 2)[0]
        if basename not in all_sys_scores:
            all_sys_scores[basename] = [
                defaultdict(lambda: defaultdict(Counter)),
                defaultdict(lambda: defaultdict(Counter))
            ]
        if lextags_file.split('.')[-2] == 'goldid':
            all_sys_scores[basename][0] = scores
        else:
            all_sys_scores[basename][1] = scores

    # Print output
    args.output_format(all_sys_scores, depth=args.depth, mode=args.output_mode)
Example #6
def main(args):
    with open(args.fname,
              encoding="utf-8") as f, open(args.lextags,
                                           encoding="utf-8") as tags_lines:
        sents = load_sents(f,
                           ss_mapper=SSMapper(args.depth),
                           validate_type=False,
                           validate_pos=False)
        preds = load_tags(tags_lines)
        print_json(swap_tags(sents, preds))
Example #7
def eval_sys(sysF, gold_sents, ss_mapper):
    goldid = (sysF.name.split('.')[-2]=='goldid')
    if not goldid and sysF.name.split('.')[-2]!='autoid':
        raise ValueError(f'File path of system output not specified for gold vs. auto identification of units to be labeled: {sysF.name}')

    compare_sets = compare_sets_Acc if goldid else compare_sets_PRF

    scores = defaultdict(lambda: defaultdict(Counter))

    for iSent,syssent in enumerate(load_sents(sysF, ss_mapper=ss_mapper)):
        sent = gold_sents[iSent]
        assert sent['sent_id']==syssent['sent_id']

        eval_sent_tagging(sent, syssent, scores)
        for shapeclass in SHAPE_CLASSES:
            for ssclass in SS_CLASSES:
                eval_sent_by_classes(sent, syssent, shapeclass, ssclass, scores, compare_sets)

    for k in scores:
        if k[1] =='Tags':
            if k[0]=='*':   # k is ('*', 'Tags')
                for subscore in ('Full', '-Lexcat', '-SS', '-Lexcat -SS'):
                    c = scores[k][subscore]
                    assert scores[k][subscore]['N']>0,(k,subscore,scores[k][subscore])
                    c['Acc'] = Ratio(c['correct'], c['N'])
            elif k[0] in ('MWE', 'GappyMWE'):
                for subscore in ('Link+', 'Link-'):
                    c = scores[k][subscore]
                    c['P'] = Ratio(c['PNumer'], c['PDenom'])
                    c['R'] = Ratio(c['RNumer'], c['RDenom'])
                    c['F'] = f1(c['P'], c['R'])
                for m in ('P', 'R', 'F'):
                    # strength averaging
                    avg = (scores[k]['Link+'][m]+scores[k]['Link-'][m])/2   # float
                    # construct a ratio by averaging the denominators (this gives insight into underlying recall-denominators)
                    denom = (scores[k]['Link+'][m].denominator+scores[k]['Link-'][m].denominator)/2   # float
                    scores[k]['LinkAvg'][m] = Ratio(avg*denom, denom)
        elif goldid:  # assuming goldid means gold identification of spans & kind of supersense
            for subscore in ('Role','Fxn','Labeled'):
                c = scores[k][subscore]
                assert scores[k][subscore]['N']>0,(k,subscore,scores[k][subscore])
                c['Acc'] = Ratio(c['correct'], c['N'])
        else:
            for subscore in ('ID','Role','Fxn','Labeled'):
                c = scores[k][subscore]
                c['P'] = Ratio(c['correct'], c['Pdenom'])
                c['R'] = Ratio(c['correct'], c['Rdenom'])
                c['F'] = f1(c['P'], c['R'])

    assert len(gold_sents)==iSent+1,f'Mismatch in number of sentences: {len(gold_sents)} gold, {iSent+1} system from {sysF.name}'

    return scores
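eval_sys relies on helpers the snippet never defines: Ratio appears to be a fraction-like value that also behaves as a float, and f1 is presumably the usual harmonic mean of precision and recall. A sketch of f1 under that assumption:

def f1(p, r):
    # Harmonic mean of precision and recall; defined as 0 when both are 0.
    p, r = float(p), float(r)
    return 2 * p * r / (p + r) if p + r else 0.0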
Example #8
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        logger.info("Reading instances from lines in file at: %s", file_path)

        with open(file_path, 'r', encoding='utf-8') as tagging_file:
            tagging_data = load_sents(tagging_file)
            for instance in tagging_data:
                # Get the tokens
                tokens = [x["word"] for x in instance["toks"]]
                # Get their associated upos
                upos_tags = [x["upos"] for x in instance["toks"]]

                # Get their associated lemma
                lemmas = [x["lemma"] for x in instance["toks"]]
                # Get their associated lextag
                labels = [x["lextag"] for x in instance["toks"]]
                yield self.text_to_instance(tokens=tokens,
                                            upos_tags=upos_tags,
                                            lemmas=lemmas,
                                            streusle_lextags=labels)
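The text_to_instance counterpart is not shown. Assuming the AllenNLP DatasetReader API (the field names and self._token_indexers are illustrative, not taken from the original), it would look roughly like:

from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField, SequenceLabelField, MetadataField

    def text_to_instance(self, tokens, upos_tags, lemmas, streusle_lextags=None):
        # Wrap the raw strings as AllenNLP fields; tags are aligned to the tokens.
        text_field = TextField([Token(t) for t in tokens], self._token_indexers)
        fields = {
            "tokens": text_field,
            "metadata": MetadataField({"words": tokens, "upos": upos_tags,
                                       "lemmas": lemmas}),
        }
        if streusle_lextags is not None:
            fields["tags"] = SequenceLabelField(streusle_lextags, text_field)
        return Instance(fields)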
Example #9
            updates[sentid] = r
        else:   # continuation of second column from previous line
            assert sentid
            updates[sentid] += ' ' + r

"""
2. Scan the full corpus .conllulex for sentences with their original annotations.
If there was a change, parse the rendered lexical semantic analysis into tags,
substitute the tags in the UDlextag format, and parse the sentence to JSON in
order to update the fields: 'mwe', 'toks', 'swes', 'smwes', 'wmwes'
('etoks' etc. will be unaffected).
"""
sents = []
with open(conllulexFP, encoding='utf-8') as conllulexF:
    nUpdatedSents = 0
    for sent in load_sents(conllulexF, store_conllulex='toks'):
        sentid = sent['sent_id']
        if sentid in updates:
            # compare rendered strings to see whether there has been a change
            rendered_old = render_sent(sent, lexcats=True, supersenses=True)
            rendered_new = updates[sentid]
            if rendered_old!=rendered_new:  # there has been a change
                # parse the new rendered string
                toks = [tok['word'] for tok in sent['toks']]
                tagging = unrender(rendered_new, toks)  # this should fail if tokens have changed
                toks2, bios, lbls = zip(*tagging)
                assert toks==list(toks2),(toks,toks2)  # be super-duper sure tokens haven't changed
                labeled_bio = [bio+('-'+lbl.replace(':','|') if lbl else '') for bio,lbl in zip(bios,lbls)]

                # substitute new tagging in UDlextag format
                conllulex = sent['conllulex'].strip().split('\n')
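A tiny worked illustration of the labeled-BIO construction in the loop above (the label values are made up; real ones come out of unrender):

bios = ['B', 'I_', 'O']
lbls = ['p.Locus:p.Goal', '', '']
labeled_bio = [bio + ('-' + lbl.replace(':', '|') if lbl else '')
               for bio, lbl in zip(bios, lbls)]
assert labeled_bio == ['B-p.Locus|p.Goal', 'I_', 'O']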
Example #10
def eval_sys(sysF, gold_sents, ss_mapper):
    goldid = (sysF.name.split('.')[-2] == 'goldid')
    if not goldid and sysF.name.split('.')[-2] != 'autoid':
        raise ValueError(
            f'File path of system output not specified for gold vs. auto identification of units to be labeled: {sysF.name}'
        )

    compare_sets = compare_sets_Acc if goldid else compare_sets_PRF

    scores = {
        'All': defaultdict(Counter),
        'MWE': defaultdict(Counter),
        'MWP': defaultdict(Counter)
    }

    for iSent, syssent in enumerate(load_sents(sysF, ss_mapper=ss_mapper)):
        sent = gold_sents[iSent]
        assert sent['sent_id'] == syssent['sent_id']

        # all units with a PSS label
        c = scores['All']
        goldunits = dict(
            sent['punits']
        )  # make a copy so we can delete stuff locally for gold=?? and not have it affect other results
        predunits = {
            tuple(e['toknums']): (e['lexcat'], e['ss'], e['ss2'])
            for e in list(syssent['swes'].values()) +
            list(syssent['smwes'].values())
            if e['ss'] and e['ss'].startswith('p.')
        }

        # special case: discard gold=?? tokens regardless of their predicted label
        for k, (lc, r, f) in list(goldunits.items()):
            if r == '??':
                if k in predunits:
                    del predunits[k]
                del goldunits[k]

        c['ID'] += compare_sets(set(goldunits.keys()), set(predunits.keys()))
        c['Role,Fxn'] += compare_sets(
            {(k, r, f)
             for k, (lc, r, f) in goldunits.items()},
            {(k, r, f)
             for k, (lc, r, f) in predunits.items()})
        c['Role'] += compare_sets(
            {(k, r)
             for k, (lc, r, f) in goldunits.items()},
            {(k, r)
             for k, (lc, r, f) in predunits.items()})
        c['Fxn'] += compare_sets(
            {(k, f)
             for k, (lc, r, f) in goldunits.items()},
            {(k, f)
             for k, (lc, r, f) in predunits.items()})

        # MWEs only
        c = scores['MWE']
        goldunits = {k: v for k, v in goldunits.items() if len(k) > 1}
        predunits = {k: v for k, v in predunits.items() if len(k) > 1}
        c['ID'] += compare_sets(set(goldunits.keys()), set(predunits.keys()))
        c['Role,Fxn'] += compare_sets(
            {(k, r, f)
             for k, (lc, r, f) in goldunits.items()},
            {(k, r, f)
             for k, (lc, r, f) in predunits.items()})
        c['Role'] += compare_sets(
            {(k, r)
             for k, (lc, r, f) in goldunits.items()},
            {(k, r)
             for k, (lc, r, f) in predunits.items()})
        c['Fxn'] += compare_sets(
            {(k, f)
             for k, (lc, r, f) in goldunits.items()},
            {(k, f)
             for k, (lc, r, f) in predunits.items()})

        # multiword adpositions only: note this requires the lexcat to be predicted
        c = scores['MWP']
        goldunits = {k: v for k, v in goldunits.items() if v[0] != 'PP'}
        predunits = {k: v for k, v in predunits.items() if v[0] != 'PP'}
        c['ID'] += compare_sets(set(goldunits.keys()), set(predunits.keys()))
        c['Role,Fxn'] += compare_sets(
            {(k, r, f)
             for k, (lc, r, f) in goldunits.items()},
            {(k, r, f)
             for k, (lc, r, f) in predunits.items()})
        c['Role'] += compare_sets(
            {(k, r)
             for k, (lc, r, f) in goldunits.items()},
            {(k, r)
             for k, (lc, r, f) in predunits.items()})
        c['Fxn'] += compare_sets(
            {(k, f)
             for k, (lc, r, f) in goldunits.items()},
            {(k, f)
             for k, (lc, r, f) in predunits.items()})

    for k in ('All', 'MWE', 'MWP'):
        if goldid:
            for criterion in ('Role', 'Fxn', 'Role,Fxn'):
                c = scores[k][criterion]
                assert scores[k][criterion]['N'] > 0, (k, criterion,
                                                       scores[k][criterion])
                c['Acc'] = c['correct'] / c['N']
        else:
            for criterion in ('ID', 'Role', 'Fxn', 'Role,Fxn'):
                c = scores[k][criterion]
                c['P'] = c['correct'] / c['Pdenom']
                c['R'] = c['correct'] / c['Rdenom']
                c['F'] = f1(c['P'], c['R'])

    assert len(
        gold_sents
    ) == iSent + 1, f'Mismatch in number of sentences: {len(gold_sents)} gold, {iSent+1} system from {sysF.name}'

    return scores
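The compare_sets_* helpers are also undefined in this snippet. Judging from the keys the scoring loop reads (correct, Pdenom, Rdenom, N), plausible sketches are:

from collections import Counter

def compare_sets_PRF(gold, pred):
    # Precision/recall counts over exact-match set overlap.
    return Counter(correct=len(gold & pred), Pdenom=len(pred), Rdenom=len(gold))

def compare_sets_Acc(gold, pred):
    # With gold identification every gold unit is scored, so plain accuracy.
    return Counter(correct=len(gold & pred), N=len(gold))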
Example #11
def main(args):
    if args.colorless or not sys.stdin.isatty():
        for c in dir(Colors):
            if not c.startswith('_'):
                setattr(Colors, c, '')
        for s in dir(Styles):
            if not s.startswith('_'):
                setattr(Styles, s, '')


    goldF = args.goldfile
    sysFs = args.sysfile

    ss_mapper = lambda ss: coarsen_pss(ss, args.depth) if ss.startswith('p.') else ss

    # Load gold data
    gold_sents = list(load_sents(goldF, ss_mapper=ss_mapper))

    predFs = [load_sents(predFP, ss_mapper=ss_mapper) for predFP in sysFs]

    all_sys_scores = {}

    def filter_labels(ll):
        result = dict(ll)
        for k,l in ll.items():
            if l.startswith('n.') and args.no_noun: del result[k]
            elif l.startswith('v.') and args.no_verb: del result[k]
            elif l.startswith('p.') and args.no_snacs: del result[k]
        return result

    R = lambda ww,sg,wg,ll: render(ww, sg if not args.no_mwe else [], wg if not args.no_mwe else [], filter_labels(ll))

    for i,sent in enumerate(gold_sents):
        # gold analysis
        words = [t["word"] for t in sent["toks"]]
        rendered = []
        rendered.append(R(words,
                           [e["toknums"] for e in sent["smwes"].values()],
                           [e["toknums"] for e in sent["wmwes"].values()],
                           makelabelmap(sent, include_lexcat=args.lexcats, include_supersenses=True)))
        for predF in predFs:
            psent = next(predF)
            assert psent['sent_id']==sent['sent_id']
            rendered.append(R(words,
                               [e["toknums"] for e in psent["smwes"].values()],
                               [e["toknums"] for e in psent["wmwes"].values()],
                               makelabelmap(psent, include_lexcat=args.lexcats, include_supersenses=True)))

        diff_classes = set()
        if not args.no_diff:
            diff_classes.add('special')
            if not args.no_mwe_diff: diff_classes.add('mwe')
            if not args.no_noun_diff: diff_classes.add('n')
            if not args.no_snacs_diff: diff_classes.add('p')
            if not args.no_verb_diff: diff_classes.add('v')

        if args.sent_ids:
            print(sent['sent_id'], end='\t')
        print(color_rendered(words, rendered, diff_classes))
        #assert False,(color_rendered(words, rendered),words,rendered)

    # restore the terminal's default colors
    print(Colors.ENDC, end='')
Example #12
"""
@author: Nathan Schneider (@nschneid)
"""

import os, sys, fileinput, re, json, csv
from collections import defaultdict
from itertools import chain

from conllulex2json import load_sents, print_json

inFname, = sys.argv[1:]

nSentsRenumbered = 0
nMWEsRenumbered = 0

with open(inFname, encoding='utf-8') as inF:
    sents = list(load_sents(inF))
    for sent in sents:
        smwes = sent["smwes"]
        wmwes = sent["wmwes"]
        allmwes = []
        for oldnum, e in smwes.items():
            allmwes.append((e["toknums"][0], 's', oldnum))
        for oldnum, e in wmwes.items():
            allmwes.append((e["toknums"][0], 'w', oldnum))
        allmwes.sort()
        current_sort = sorted(allmwes, key=lambda x: x[2])
        if allmwes != current_sort:
            nSentsRenumbered += 1
            # renumber
            new_smwes = {}
            new_wmwes = {}
Example #13
import argparse, collections, json

from conllulex2json import load_sents
# CompanionToConllulex and add_token_ranges are project helpers not shown here

parser = argparse.ArgumentParser(description='Augment Data')
parser.add_argument("conllulex",
                    type=str,
                    help="Augment CoNLL-U/CoNLL-U-Lex/JSON file")
parser.add_argument("mrp", type=str, help="Input MRP file")
parser.add_argument("output", type=str, help="Output Augmented file")
args = parser.parse_args()

conllulex_file = args.conllulex
mrp_file = args.mrp
out_file = args.output

with open(conllulex_file, 'r', encoding='utf8') as f_c:
    augs = {
        sent["sent_id"].replace("reviews-", ""): sent
        for sent in load_sents(CompanionToConllulex(f_c))
    }
with open(mrp_file, 'r',
          encoding='utf8') as f_in, open(out_file, 'w',
                                         encoding='utf8') as f_out:
    for line in f_in:
        mrp = json.loads(line, object_pairs_hook=collections.OrderedDict)
        sent_id = mrp['id']
        aug = augs.get(sent_id)
        if aug is None:
            print("id:{} not in companion".format(sent_id))
        else:
            add_token_ranges(aug["toks"], aug["text"])
            mrp['companion'] = aug
            f_out.write((json.dumps(mrp) + '\n'))
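add_token_ranges is not shown either. A plausible sketch that attaches MRP-style character anchors to each token by scanning the raw sentence text (the "anchors" key name is an assumption):

def add_token_ranges(toks, text):
    # Advance a cursor so repeated words map to successive positions.
    offset = 0
    for tok in toks:
        start = text.index(tok["word"], offset)
        offset = start + len(tok["word"])
        tok["anchors"] = [{"from": start, "to": offset}]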