class ConllEval(object):
    """
    Accumulator for CoNLL dependency-evaluation statistics (POS accuracy,
    unlabeled and labeled attachment accuracy), with a separate tally for
    short (< 10 word) sentences and per-POS dependency/head correctness.
    """

    def __init__(self):
        self.dep_acc_by_pos = TwoLevelCountDict()
        self.head_acc_by_pos = TwoLevelCountDict()
        self.long_sent_stats = CountDict()
        self.short_sent_stats = CountDict()
        self.fields = ["pos_acc", "ul_acc", "l_acc"]

    def add(self, k, sent):
        # Every count goes into long_sent_stats; sentences under 10 tokens
        # are additionally tallied in short_sent_stats.
        self.long_sent_stats.add(k)
        if len(sent) < 10:
            self.short_sent_stats.add(k)

    def pos_stats(self):
        for pos in sorted(set(self.dep_acc_by_pos.keys()).union(set(self.head_acc_by_pos.keys()))):
            print(",".join([pos,
                            str(self.dep_acc_by_pos.sub_distribution(pos).get(True, 0.0)),
                            str(self.head_acc_by_pos.sub_distribution(pos).get(True, 0.0))]))

    def acc(self, d, k):
        return d[k] / d["words"] * 100

    def long_stats(self):
        return [self.acc(self.long_sent_stats, k) for k in self.fields]

    def short_stats(self):
        return [self.acc(self.short_sent_stats, k) for k in self.fields]

    def short_ul(self):
        return self.acc(self.short_sent_stats, "ul_acc")

    def short_ul_count(self):
        return self.short_sent_stats.get("ul_acc", 0)

    def short_words(self):
        return self.short_sent_stats.get("words", 0)

    def long_ul(self):
        return self.acc(self.long_sent_stats, "ul_acc")

    def long_ul_count(self):
        return self.long_sent_stats.get("ul_acc", 0)

    def long_words(self):
        return self.long_sent_stats.get("words", 0)
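
# A minimal, self-contained sketch (hypothetical, not part of the class above)
# of the accuracy computation that acc() performs: each counter key holds a raw
# count, and accuracy is that count divided by the "words" total, times 100.
def _demo_attachment_accuracy():
    stats = {"words": 0, "ul_acc": 0, "l_acc": 0}
    gold_heads = [2, 0, 2]                    # hypothetical gold head indices
    pred_heads = [2, 0, 1]                    # hypothetical predicted head indices
    gold_labels = ["det", "root", "dobj"]
    pred_labels = ["det", "nsubj", "dobj"]
    for g_h, p_h, g_l, p_l in zip(gold_heads, pred_heads, gold_labels, pred_labels):
        stats["words"] += 1
        if g_h == p_h:
            stats["ul_acc"] += 1              # unlabeled attachment correct
            if g_l == p_l:
                stats["l_acc"] += 1           # labeled attachment also correct
    return {k: stats[k] / stats["words"] * 100 for k in ("ul_acc", "l_acc")}

# >>> _demo_attachment_accuracy()
# approximately {'ul_acc': 66.7, 'l_acc': 33.3}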
def info(self):
    """
    Print the feature statistics for the given model. (Assumes MaxEnt)
    """
    mallet = c['mallet']
    env = set_env_lang_utf8()

    info_bin = os.path.join(os.path.join(mallet, 'bin'), 'classifier2info')
    info_p = sub.Popen([info_bin, '--classifier', self._model],
                       stdout=sub.PIPE, stdin=sub.PIPE, stderr=sub.PIPE, env=env)

    cur_class = None
    feats = TwoLevelCountDict()

    # Go through and pick out what the features are for
    for line in info_p.stdout:
        content = line.decode(encoding='utf-8')

        class_change = re.search('FEATURES FOR CLASS (.*)', content)
        # Set the current class if the section changes
        if class_change:
            cur_class = class_change.group(1).strip()
            continue

        # Otherwise, let's catalog the features.
        word, prob = content.split()
        feats.add(cur_class, word, float(prob))

    # Now, print some info
    for cur_class in feats.keys():
        print(cur_class, end='\t')
        print('%s:%.4f' % ('<default>', feats[cur_class]['<default>']), end='\t')
        top_10 = feats.top_n(cur_class, n=10, key2_re='^nom')
        print('\t'.join(['%s:%.4f' % (w, p) for w, p in top_10]))
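
# A standalone sketch (hypothetical) of the same parsing logic using only the
# standard library, assuming the "FEATURES FOR CLASS <name>" header plus
# "feature weight" line format consumed above.
import re
from collections import defaultdict

def parse_classifier2info(text):
    feats = defaultdict(dict)
    cur_class = None
    for content in text.splitlines():
        class_change = re.search('FEATURES FOR CLASS (.*)', content)
        if class_change:
            cur_class = class_change.group(1).strip()
            continue
        if cur_class is None or not content.strip():
            continue
        word, prob = content.split()
        feats[cur_class][word] = float(prob)
    return feats

# Example with a fabricated two-class dump:
# sample = ("FEATURES FOR CLASS NOUN\n<default> 0.1\nnom-suffix 1.2\n"
#           "FEATURES FOR CLASS VERB\n<default> -0.3\nnom-suffix 0.4\n")
# parse_classifier2info(sample)["NOUN"]["nom-suffix"]  # -> 1.2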
import codecs
import re
from collections import defaultdict

# NOTE: getencoding() and TwoLevelCountDict are project-level utilities
# assumed to be importable from elsewhere in the package.

def get_prototypes(tagged_path, proto_out, delimeter, ignoretags=[], unambiguous=False, maxproto=0):
    encoding = getencoding(tagged_path)
    tagged_file = codecs.open(tagged_path, "r", encoding=encoding)

    tag_word_dict = TwoLevelCountDict()
    word_tag_dict = TwoLevelCountDict()
    proto_dict = defaultdict(set)

    # Count word/tag co-occurrences in both directions.
    for line in tagged_file:
        tokens = line.split()
        for token in tokens:
            word, pos = re.search("(^.*)%s(.*?)$" % delimeter, token).groups()
            if pos not in ignoretags:
                word = word.lower()
                tag_word_dict.add(pos, word)
                word_tag_dict.add(word, pos)

    numproto = 0

    # First, let's pick the maxproto most frequent words for a tag.
    for tag in tag_word_dict.keys():
        words = tag_word_dict[tag].most_frequent(minimum=1, num=None)

        found_words = 0
        for word in words:
            freq_tag = word_tag_dict[word].most_frequent(minimum=1)
            # Only keep the word as a prototype if its own most frequent tag
            # is the tag we are collecting prototypes for.
            if freq_tag and freq_tag[0] == tag:
                proto_dict[freq_tag[0]].add(word)
                numproto += 1
                found_words += 1
            if maxproto and found_words == maxproto:
                break

    print("%s Prototypes found." % numproto)

    # Now, set up the proto file for writing.
    proto_file = open(proto_out, "w")
    for tag in proto_dict:
        proto_file.write(tag)
        for word in proto_dict[tag]:
            proto_file.write("\t" + word.lower())  # LOWERCASE for testing
        proto_file.write("\n")
    proto_file.close()
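
# A minimal, self-contained sketch (hypothetical) of the prototype-selection
# idea above, using collections.Counter in place of the project's
# TwoLevelCountDict: a word counts as a prototype for a tag only if that tag
# is also the word's own most frequent tag.
from collections import Counter, defaultdict

def pick_prototypes(token_pos_pairs, maxproto=3):
    tag_words = defaultdict(Counter)
    word_tags = defaultdict(Counter)
    for word, pos in token_pos_pairs:
        tag_words[pos][word.lower()] += 1
        word_tags[word.lower()][pos] += 1

    protos = defaultdict(set)
    for tag, words in tag_words.items():
        found = 0
        for word, _ in words.most_common():
            best_tag, _ = word_tags[word].most_common(1)[0]
            if best_tag == tag:
                protos[tag].add(word)
                found += 1
            if maxproto and found == maxproto:
                break
    return protos

# >>> pick_prototypes([("the", "DET"), ("dog", "NOUN"), ("dog", "NOUN"),
# ...                  ("runs", "VERB"), ("run", "VERB"), ("run", "NOUN")])
# yields prototypes like {'DET': {'the'}, 'NOUN': {'dog'}, 'VERB': {'runs', 'run'}}
# ("run" is ambiguous, so it is only kept for its own most frequent tag).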