Example #1
def evaluate(eval_dic):
    # Evaluates system outputs against gold files using the conll18_ud_eval
    # module distributed with Universal Dependencies.
    # eval_dic maps gold filenames (keys) to system filenames (values).
    import os
    import pandas
    import conll18_ud_eval

    table = []
    for gold_path, system_path in eval_dic.items():
        with open(gold_path, 'r', encoding='utf-8') as gfile:
            g = conll18_ud_eval.load_conllu(gfile)
        with open(system_path, 'r', encoding='utf-8') as sfile:
            s = conll18_ud_eval.load_conllu(sfile)
        evaluation = conll18_ud_eval.evaluate(g, s)

        newdic = {}
        name = os.path.basename(gold_path)  # strip the directory part
        name = name.replace('-ud-', '_').replace('.conllu', '')
        newdic['name'] = name
        for component in ['Sentences', 'Tokens', 'Words', 'Lemmas', 'UPOS',
                          'XPOS', 'UFeats', 'UAS', 'LAS']:
            newdic[component + ' precision'] = evaluation[component].precision
            newdic[component + ' recall'] = evaluation[component].recall
            newdic[component + ' f1'] = evaluation[component].f1
            newdic[component + ' aligned_accuracy'] = \
                evaluation[component].aligned_accuracy
        newdic['tokens'] = evaluation['Tokens'].gold_total
        newdic['words'] = evaluation['Words'].gold_total
        newdic['sentences'] = evaluation['Sentences'].gold_total
        table.append(newdic)
    return pandas.DataFrame(table)
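A minimal usage sketch for the helper above; the gold/system paths are hypothetical placeholders:

# Hypothetical CoNLL-U file pairs; any gold/system pairs will do.
pairs = {'gold/en_ewt-ud-test.conllu': 'output/en_ewt-ud-test.conllu'}
df = evaluate(pairs)
print(df[['name', 'UAS f1', 'LAS f1']])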
Example #2
    def evaluate(self, dataset_name, dataset, dataset_conllu, args):
        import io
        import conll18_ud_eval

        conllu, sentences = io.StringIO(), 0

        self.session.run(self.reset_metrics)
        while not dataset.epoch_finished():
            sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens = \
                dataset.next_batch(args.batch_size)

            feeds = {self.is_training: False, self.sentence_lens: sentence_lens,
                     self.charseqs: charseqs[train.FORMS], self.charseq_lens: charseq_lens[train.FORMS],
                     self.word_ids: word_ids[train.FORMS], self.charseq_ids: charseq_ids[train.FORMS]}
            for tag in args.tags:
                feeds[self.tags[tag]] = word_ids[train.FACTORS_MAP[tag]]
            predictions, _ = self.session.run([self.predictions, self.update_loss], feeds)

            for i in range(len(sentence_lens)):
                overrides = [None] * dataset.FACTORS
                for tag in args.tags:
                    overrides[dataset.FACTORS_MAP[tag]] = predictions[tag][i]
                dataset.write_sentence(conllu, sentences, overrides)
                sentences += 1

        metrics = conll18_ud_eval.evaluate(dataset_conllu, conll18_ud_eval.load_conllu(io.StringIO(conllu.getvalue())))
        self.session.run(self.summaries[dataset_name],
                         dict((self.metrics[metric], metrics[metric].f1) for metric in self.METRICS))

        return metrics["LAS"].f1 if metrics["LAS"].f1 < 1 else metrics["AllTags"].f1, metrics
Example #3
    def test(self, test_sentences, conllu_ud=None):
        import io

        # Split the test set into roughly 64 equal chunks and parse each part.
        splits = 64
        step = max(1, len(test_sentences) // splits)  # integer step, at least 1 so the loop terminates
        start = 0
        uas = []
        las = []
        parsed_sents = []

        while start + step < len(test_sentences):
            u, l, p = self._test_part(test_sentences[start:start + step])
            uas.append(u)
            las.append(l)
            parsed_sents += p
            start += step
        u, l, p = self._test_part(test_sentences[start:])
        parsed_sents += p
        uas.append(u)
        las.append(l)
        avgu = sum(uas) / len(uas)
        avgl = sum(las) / len(las)
        if self.conll_format is const.CONLLU:
            res = sents_to_conll(parsed_sents)
            system_ud2 = load_conllu(io.StringIO(res))
            ev = evaluate(conllu_ud, system_ud2)
            avgu = ev['UAS'].f1
            avgl = ev['LAS'].f1
        print('Test results:')
        print('uas (no punctuation):', avgu)
        print('las (no punctuation):', avgl)
        return avgu, avgl
Example #4
def get_UPOS(gold, syst):
    # Returns the UPOS F1 score of the system CoNLL-U file against the gold file.
    with open(gold, 'r', encoding='utf-8') as file1:
        goldfile = load_conllu(file1)
    with open(syst, 'r', encoding='utf-8') as file2:
        systemfile = load_conllu(file2)
    return evaluate(goldfile, systemfile)["UPOS"].f1
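A quick usage sketch; both file names below are hypothetical placeholders:

# Hypothetical paths to a gold treebank and a tagger's system output.
score = get_UPOS('en_ewt-ud-test.conllu', 'system_output.conllu')
print('UPOS F1: {:.2f}%'.format(100 * score))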
Example #5
    def evaluate(self, dataset_name, dataset, dataset_conllu, args):
        import io
        import conll18_ud_eval

        self.session.run(self.reset_metrics)
        conllu = self.predict(dataset, args)
        metrics = conll18_ud_eval.evaluate(
            dataset_conllu, conll18_ud_eval.load_conllu(io.StringIO(conllu)))
        self.session.run(
            self.summaries[dataset_name],
            dict((self.metrics[metric], metrics[metric].f1)
                 for metric in self.METRICS))

        return metrics["LAS"].f1 if metrics["LAS"].f1 < 1 else metrics[
            "AllTags"].f1, metrics
Example #6
    def evaluate(self, dataset_name, dataset, dataset_conllu, args):
        import io
        import conll18_ud_eval

        conllu = self.predict(dataset, True, args)
        metrics = conll18_ud_eval.evaluate(
            dataset_conllu, conll18_ud_eval.load_conllu(io.StringIO(conllu)))
        self.session.run(
            self.summaries[dataset_name],
            dict((self.metrics[metric], metrics[metric].f1)
                 for metric in self.METRICS))

        if args.parse:
            return (metrics["LAS"].f1 + metrics["MLAS"].f1 +
                    metrics["BLEX"].f1) / 3., metrics
        else:
            return metrics["AllTags"].f1, metrics
Example #7
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("truth",
                        type=str,
                        help="Directory name of the truth dataset.")
    parser.add_argument("system",
                        type=str,
                        help="Directory name of system output.")
    parser.add_argument("output",
                        type=str,
                        help="Directory name of the output directory.")
    args = parser.parse_args()

    # Load input dataset metadata.json
    with open(args.truth + "/metadata.json", "r") as metadata_file:
        metadata = json.load(metadata_file)

    # Evaluate and compute sum of all treebanks
    metrics = [
        "Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags",
        "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"
    ]
    treebanks = 0
    summation = {}
    results = []
    results_las, results_mlas, results_blex = {}, {}, {}
    for entry in metadata:
        treebanks += 1

        ltcode, goldfile, outfile = "_".join(
            (entry['lcode'],
             entry['tcode'])), entry['goldfile'], entry['outfile']

        # Load gold data
        try:
            gold = load_conllu_file(args.truth + "/" + goldfile)
        except Exception:
            results.append(
                (ltcode + "-Status", "Error: Cannot load gold file"))
            continue

        # Load system data
        try:
            system = load_conllu_file(args.system + "/" + outfile)
        except UDError as e:
            if e.args[0].startswith("There is a cycle"):
                results.append(
                    (ltcode + "-Status",
                     "Error: There is a cycle in generated CoNLL-U file"))
                continue
            if e.args[0].startswith("There are multiple roots"):
                results.append((
                    ltcode + "-Status",
                    "Error: There are multiple roots in a sentence in generated CoNLL-U file"
                ))
                continue
            results.append((
                ltcode + "-Status",
                "Error: There is a format error (tabs, ID values, etc) in generated CoNLL-U file"
            ))
            continue
        except Exception:
            results.append((ltcode + "-Status",
                            "Error: Cannot open generated CoNLL-U file"))
            continue

        # Check for correctness
        if not system.characters:
            results.append(
                (ltcode + "-Status", "Error: The system file is empty"))
            continue
        if system.characters != gold.characters:
            results.append((
                ltcode + "-Status",
                "Error: The concatenations of tokens in the gold file and in the system file differ; the system file has {} nonspace characters, approximately {}% of the gold file"
                .format(
                    len(system.characters),
                    int(100 * len(system.characters) / len(gold.characters)))))
            continue

        # Evaluate
        try:
            evaluation = evaluate(gold, system)
        except Exception:
            # Should not happen
            results.append((
                ltcode + "-Status",
                "Error: Cannot evaluate generated CoNLL-U file, internal error"
            ))
            continue

        # Generate output metrics and compute sum
        results.append((
            ltcode + "-Status",
            "OK: Result F1 scores rounded to 5% are LAS={:.0f}% MLAS={:.0f}% BLEX={:.0f}%"
            .format(100 * round_score(evaluation["LAS"].f1),
                    100 * round_score(evaluation["MLAS"].f1),
                    100 * round_score(evaluation["BLEX"].f1))))

        for metric in metrics:
            results.append((ltcode + "-" + metric + "-F1",
                            "{:.9f}".format(100 * evaluation[metric].f1)))
            summation[metric] = summation.get(metric,
                                              0) + evaluation[metric].f1
        results_las[ltcode] = evaluation["LAS"].f1
        results_mlas[ltcode] = evaluation["MLAS"].f1
        results_blex[ltcode] = evaluation["BLEX"].f1

    # Compute averages
    for metric in reversed(metrics):
        results.insert(0, ("total-" + metric + "-F1", "{:.9f}".format(
            100 * summation.get(metric, 0) / treebanks)))

    # Generate evaluation.prototext
    with open(args.output + "/evaluation.prototext", "w") as evaluation:
        for key, value in results:
            print('measure{{\n  key: "{}"\n  value: "{}"\n}}'.format(
                key, value),
                  file=evaluation)

    # Generate LAS-F1, MLAS-F1, BLEX-F1 + Status on stdout, Status on stderr
    for key, value in results:
        if not key.endswith("-Status"):
            continue

        ltcode = key[:-len("-Status")]
        print("{:13} LAS={:10.6f}% MLAS={:10.6f}% BLEX={:10.6f}% ({})".format(
            ltcode, 100 * results_las.get(ltcode, 0.),
            100 * results_mlas.get(ltcode, 0.),
            100 * results_blex.get(ltcode, 0.), value),
              file=sys.stdout)
        print("{:13} {}".format(ltcode, value), file=sys.stderr)
Example #8
    for s in nltk.corpus.dependency_treebank.sents()[:200]:
        sent = detok.detokenize(s)
        corenlp_model = client.annotate(sent)
        corenlp_conll_en += corenlp_model  # + '\r\n'

f_corenlp_en = io.StringIO(
    corenlp_conll_en.replace("Corp.", "Corp").replace("Conn.", "Conn").replace(
        "Â", "").replace("Ltd.", "Ltd"))
corenlp_en_eval = load_conllu(f_corenlp_en)

f_gold_en = io.StringIO(
    gold_conll_en.replace("Corp.",
                          "Corp").replace("Conn.",
                                          "Conn").replace("Ltd.", "Ltd"))
gold_en_eval = load_conllu(f_gold_en)
corenlp_en_evaluation = evaluate(gold_en_eval, corenlp_en_eval)

print_results(
    corenlp_en_evaluation,
    "Results for Penn Treebank dataset using CoreNLP Dependency Parser")

# Spanish
spanish_dep_file = '../../dependency/UD_Spanish-AnCora-master/es_ancora-ud-test.conllu'

with open(spanish_dep_file, 'r') as ancora_f:
    ancora_text = conll_text_reader(ancora_f)

corenlp_conll_es = ""
with CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'parse', 'depparse'],
        output_format="conllu") as client:
    # The original snippet is truncated here; this completion is an assumption
    # that mirrors the English block above.
    corenlp_conll_es += client.annotate(ancora_text)
def conll_eval(system_file, gold_file):
    # Loads both CoNLL-U files and returns the full conll18_ud_eval metrics dict.
    gold_ud = load_conllu_file(gold_file)
    system_ud = load_conllu_file(system_file)
    return evaluate(gold_ud, system_ud)
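A short usage sketch; the paths are hypothetical, and each value in the returned dict is a conll18_ud_eval Score object with precision, recall, and f1 attributes:

# Hypothetical file paths for illustration.
metrics = conll_eval('system_output.conllu', 'gold.conllu')
for name in ('UAS', 'LAS', 'UPOS'):
    s = metrics[name]
    print('{}: P={:.2f} R={:.2f} F1={:.2f}'.format(
        name, 100 * s.precision, 100 * s.recall, 100 * s.f1))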
    
Example #10
    sent = detok.detokenize(s)
    doc = nlp(sent)
    for s in CoNLL.convert_dict(doc.to_dict()):
        for w in s:
            for i, content in enumerate(w):
                stanza_conll_en += content + '\t'
            stanza_conll_en = stanza_conll_en[:-1] + '\r\n'
        stanza_conll_en += '\r\n'

f_gold_en = io.StringIO(gold_conll_en)
f_stanza_en = io.StringIO(stanza_conll_en)

gold_en_eval = load_conllu(f_gold_en)
stanza_en_eval = load_conllu(f_stanza_en)

stanza_en_evaluation = evaluate(gold_en_eval, stanza_en_eval)

print_results(stanza_en_evaluation,
              "Results for Penn Treebank dataset using Stanza Dependency Parser")

# Spanish
spanish_dep_file = '../../dependency/UD_Spanish-AnCora-master/es_ancora-ud-test.conllu'

with open(spanish_dep_file, 'r') as ancora_f:
    ancora_text = conll_text_reader(ancora_f)

nlp = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,depparse',
                      tokenize_pretokenized=True, lang='es')
doc = nlp(ancora_text)
stanza_conll_es = ""
for s in CoNLL.convert_dict(doc.to_dict()):
    for w in s: