import os
import re
import codecs
import subprocess as sub
from collections import defaultdict

# Project-internal names used below (the config dict `c`, set_env_lang_utf8,
# TwoLevelCountDict, getencoding, xc_load, tokenize_item, morpheme_tokenizer,
# LOG, and the xigt constants GLOSS_WORD_ID, POS_TIER_TYPE, ALIGNMENT) are
# assumed to be imported from elsewhere in the package; this file is an
# excerpt.


def info(self):
    """
    Print the feature statistics for the given model. (Assumes MaxEnt)
    """
    mallet = c['mallet']
    env = set_env_lang_utf8()

    info_bin = os.path.join(mallet, 'bin', 'classifier2info')
    info_p = sub.Popen([info_bin, '--classifier', self._model],
                       stdout=sub.PIPE, stdin=sub.PIPE, stderr=sub.PIPE,
                       env=env)

    cur_class = None
    feats = TwoLevelCountDict()

    # Go through and pick out what the features are for
    for line in info_p.stdout:
        content = line.decode(encoding='utf-8')

        # Set the current class if the section changes
        class_change = re.search('FEATURES FOR CLASS (.*)', content)
        if class_change:
            cur_class = class_change.group(1).strip()
            continue

        # Otherwise, let's catalog the features, skipping blank or
        # malformed lines.
        fields = content.split()
        if len(fields) != 2:
            continue
        word, prob = fields
        feats.add(cur_class, word, float(prob))

    # Now, print some info: each class, its <default> feature weight, and
    # its ten strongest 'nom'-prefixed features.
    for cur_class in feats.keys():
        print(cur_class, end='\t')
        print('%s:%.4f' % ('<default>', feats[cur_class]['<default>']), end='\t')
        top_10 = feats.top_n(cur_class, n=10, key2_re='^nom')
        print('\t'.join(['%s:%.4f' % (w, p) for w, p in top_10]))
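# ---------------------------------------------------------------------------
# NOTE: TwoLevelCountDict is a project-internal helper, not defined in this
# excerpt. The sketch below reconstructs only the interface that info(),
# get_prototypes(), and _process_file() rely on -- add(), keys(), two-level
# indexing, most_frequent(), and top_n(). It is illustrative, not the
# project's actual implementation; the underscore-prefixed names mark it as
# hypothetical.
# ---------------------------------------------------------------------------
class _CountDictSketch(defaultdict):
    """Inner level: maps key2 -> count, with frequency-ordered access."""

    def __init__(self):
        defaultdict.__init__(self, float)

    def most_frequent(self, minimum=1, num=None):
        """Return the keys with count >= minimum, most frequent first."""
        keys = [k for k, v in sorted(self.items(), key=lambda kv: kv[1],
                                     reverse=True)
                if v >= minimum]
        return keys[:num] if num is not None else keys


class _TwoLevelCountDictSketch(object):
    """Outer level: maps key1 -> (key2 -> count), e.g. POS tag -> word -> freq."""

    def __init__(self):
        self._d = defaultdict(_CountDictSketch)

    def add(self, key1, key2, n=1):
        """Increment the count stored under (key1, key2) by n."""
        self._d[key1][key2] += n

    def keys(self):
        return self._d.keys()

    def __getitem__(self, key1):
        return self._d[key1]

    def top_n(self, key1, n=10, key2_re=None):
        """Top-n (key2, count) pairs under key1, optionally filtered by regex."""
        pairs = sorted(self._d[key1].items(), key=lambda kv: kv[1], reverse=True)
        if key2_re:
            pairs = [(k, v) for k, v in pairs if re.search(key2_re, k)]
        return pairs[:n]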
def get_prototypes(tagged_path, proto_out, delimiter, ignoretags=[], unambiguous=False, maxproto=0):
    encoding = getencoding(tagged_path)

    tag_word_dict = TwoLevelCountDict()
    word_tag_dict = TwoLevelCountDict()
    proto_dict = defaultdict(set)

    # Count (tag, word) and (word, tag) co-occurrences over the tagged file.
    with codecs.open(tagged_path, "r", encoding=encoding) as tagged_file:
        for line in tagged_file:
            for token in line.split():
                match = re.search("(^.*)%s(.*?)$" % delimiter, token)
                # Skip tokens that don't match word<delimiter>tag.
                if match is None:
                    continue
                word, pos = match.groups()
                if pos not in ignoretags:
                    word = word.lower()
                    tag_word_dict.add(pos, word)
                    word_tag_dict.add(word, pos)

    numproto = 0

    # First, let's pick the maxproto most frequent words for a tag.
    for tag in tag_word_dict.keys():
        words = tag_word_dict[tag].most_frequent(minimum=1, num=None)

        found_words = 0
        for word in words:
            freq_tag = word_tag_dict[word].most_frequent(minimum=1)
            # Only keep the word as a prototype if this tag is also the
            # word's own most frequent tag.
            if freq_tag and freq_tag[0] == tag:
                proto_dict[freq_tag[0]].add(word)
                numproto += 1
                found_words += 1

                if maxproto and found_words == maxproto:
                    break

    print("%s Prototypes found." % numproto)

    # Now, write the proto file: one line per tag, with the tag followed by
    # its prototype words, tab-separated.
    with open(proto_out, "w") as proto_file:
        for tag in proto_dict:
            proto_file.write(tag)
            for word in proto_dict[tag]:
                proto_file.write("\t" + word.lower())  # LOWERCASE for testing
            proto_file.write("\n")
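# Example usage of get_prototypes() (illustrative; the file names and tag
# set here are hypothetical):
#
#     get_prototypes("corpus.tagged.txt", "protos.txt", delimiter="/",
#                    ignoretags=["PUNC"], maxproto=5)
#
# With delimiter="/", input lines are expected to look like:
#
#     The/DT dog/NN barked/VBD ./PUNC
#
# and each line of the resulting proto file is a tag followed by its
# tab-separated prototype words:
#
#     NN\tdog\tcat\thouse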
def _process_file(f):
    """
    Collect (POS tag -> word), (word -> POS tag), and (gram -> POS tag)
    count dictionaries from a single xigt file.
    """
    c = TwoLevelCountDict()
    d = TwoLevelCountDict()
    m = TwoLevelCountDict()

    print("Processing file {}".format(f))
    xc = xc_load(f)
    for inst in xc:
        LOG.info("Now on instance {}".format(inst.id))

        # Search for the gloss POS tier, if it exists.
        gpos = inst.find(alignment=GLOSS_WORD_ID, type=POS_TIER_TYPE)

        # If a gloss POS tier was found...
        if gpos:
            # Iterate through the projected tags.
            for gp in gpos:
                word = gp.igt.find(id=gp.attributes[ALIGNMENT])
                grams = tokenize_item(word, morpheme_tokenizer)

                # Add the (gram, POSTag) pair as something that was
                # encountered.
                for gram in grams:
                    m.add(gram.content.lower(), gp.value())

                c.add(gp.value(), word.value().lower())
                d.add(word.value().lower(), gp.value())

    return (c, d, m)
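# _process_file() returns per-file counts; a caller presumably merges the
# (c, d, m) triples across a corpus. A minimal merging sketch, assuming the
# TwoLevelCountDict interface outlined above (add() accepting an explicit
# count, as info() uses); the helper name is hypothetical and the project's
# real combine step may differ:
def _merge_counts(results):
    """Merge an iterable of (c, d, m) triples into corpus-wide totals."""
    totals = (TwoLevelCountDict(), TwoLevelCountDict(), TwoLevelCountDict())
    for triple in results:
        for total, part in zip(totals, triple):
            for k1 in part.keys():
                for k2 in part[k1].keys():
                    total.add(k1, k2, part[k1][k2])
    return totals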