def __init__(self):
    """Create the empty counters and the list of accuracy field names."""
    # Two-level counters, one for dependents and one for heads.
    self.dep_acc_by_pos, self.head_acc_by_pos = (
        TwoLevelCountDict(),
        TwoLevelCountDict(),
    )

    # Flat counters for the "long" and "short" sentence statistics.
    self.long_sent_stats, self.short_sent_stats = CountDict(), CountDict()

    # Names of the accuracy counters stored on this object.
    self.fields = ["pos_acc", "ul_acc", "l_acc"]
def parse_test(self):
    """Count token labels in one WSJ file and check the expected totals.

    Passes a callback to ``process_wsj_file`` that tallies the ``label``
    of every token it receives, then asserts the overall token count and
    the count for the ``'.'`` label.
    """
    path = '/Users/rgeorgi/Documents/treebanks/LDC95T07/RAW/combined/wsj/00/wsj_0001.mrg'
    label_counts = CountDict()

    def tally_labels(tokens):
        # Record the label of every token handed to the callback.
        for tok in tokens:
            label_counts.add(tok.label)

    process_wsj_file(path, tally_labels)

    # There should be 31 total tokens in this file.
    self.assertEqual(31, label_counts.total())
    self.assertEqual(label_counts['.'], 2)
class ConllEval(object):
    """Accumulate evaluation counters and report them as percentages.

    Two flat counters are kept: ``long_sent_stats`` receives every key
    passed to :meth:`add`, while ``short_sent_stats`` receives it only
    when the accompanying sentence has fewer than ``SHORT_SENT_LIMIT``
    items.  Accuracy figures are computed relative to the ``"words"``
    entry of the respective counter.
    """

    # Sentences with fewer items than this also count as "short".
    SHORT_SENT_LIMIT = 10

    def __init__(self):
        """Create the empty counters and the list of accuracy field names."""
        self.dep_acc_by_pos = TwoLevelCountDict()
        self.head_acc_by_pos = TwoLevelCountDict()
        self.long_sent_stats = CountDict()
        self.short_sent_stats = CountDict()
        # Counter keys reported by long_stats() / short_stats(), in order.
        self.fields = ["pos_acc", "ul_acc", "l_acc"]

    def add(self, k, sent):
        """Count key ``k`` once for the sentence ``sent``.

        The key is always added to ``long_sent_stats`` (which therefore
        covers sentences of every length) and additionally to
        ``short_sent_stats`` when ``len(sent)`` is below the limit.
        """
        self.long_sent_stats.add(k)
        if len(sent) < self.SHORT_SENT_LIMIT:
            self.short_sent_stats.add(k)

    def pos_stats(self):
        """Print one comma-separated line per POS key, in sorted order.

        Each line holds the key followed by the ``True`` entry (default
        ``0.0``) of that key's sub-distribution in ``dep_acc_by_pos`` and
        in ``head_acc_by_pos``.
        """
        all_pos = set(self.dep_acc_by_pos.keys()) | set(self.head_acc_by_pos.keys())
        for pos in sorted(all_pos):
            dep_true = self.dep_acc_by_pos.sub_distribution(pos).get(True, 0.0)
            head_true = self.head_acc_by_pos.sub_distribution(pos).get(True, 0.0)
            print(",".join([pos, str(dep_true), str(head_true)]))

    def acc(self, d, k):
        """Return ``d[k]`` as a percentage of ``d["words"]``.

        Returns ``0.0`` when the word count is zero, so that an empty
        counter (e.g. no short sentences were seen) yields a report
        instead of a ``ZeroDivisionError``.
        """
        hits = d[k]
        words = d["words"]
        if not words:
            return 0.0
        return hits / words * 100

    def long_stats(self):
        """Return the percentage for every field, over all sentences."""
        return [self.acc(self.long_sent_stats, k) for k in self.fields]

    def short_stats(self):
        """Return the percentage for every field, over short sentences."""
        return [self.acc(self.short_sent_stats, k) for k in self.fields]

    def short_ul(self):
        """Return the ``"ul_acc"`` percentage over short sentences."""
        return self.acc(self.short_sent_stats, "ul_acc")

    def short_ul_count(self):
        """Return the raw ``"ul_acc"`` count for short sentences (0 if absent)."""
        return self.short_sent_stats.get("ul_acc", 0)

    def short_words(self):
        """Return the raw ``"words"`` count for short sentences (0 if absent)."""
        return self.short_sent_stats.get("words", 0)

    def long_ul(self):
        """Return the ``"ul_acc"`` percentage over all sentences."""
        return self.acc(self.long_sent_stats, "ul_acc")

    def long_ul_count(self):
        """Return the raw ``"ul_acc"`` count for all sentences (0 if absent)."""
        return self.long_sent_stats.get("ul_acc", 0)

    def long_words(self):
        """Return the raw ``"words"`` count for all sentences (0 if absent)."""
        return self.long_sent_stats.get("words", 0)
def __init__(self, gold=None):
    """Run CountDict's initializer on this instance and store ``gold``.

    :param gold: kept unchanged on ``self._gold``; defaults to ``None``.
        NOTE(review): its meaning is not visible here -- presumably a
        gold-standard reference; confirm against callers.
    """
    # Initialise the counter state first, then attach the extra attribute.
    CountDict.__init__(self)
    self._gold = gold