Example #1
    def info(self):
        """
        Print the feature statistics for the given model. (Assumes MaxEnt)
        """
        # 'c' is an external config mapping giving the Mallet install path;
        # 'sub' is the subprocess module.
        mallet = c['mallet']
        env = set_env_lang_utf8()
        info_bin = os.path.join(mallet, 'bin', 'classifier2info')
        info_p = sub.Popen([info_bin, '--classifier', self._model],
                           stdout=sub.PIPE, stdin=sub.PIPE, stderr=sub.PIPE, env=env)

        cur_class = None
        feats = TwoLevelCountDict()

        # Walk the classifier2info output and collect per-class feature weights.
        for line in info_p.stdout:
            content = line.decode(encoding='utf-8')

            class_change = re.search('FEATURES FOR CLASS (.*)', content)
            # Set the current class if the section changes
            if class_change:
                cur_class = class_change.group(1).strip()
                continue

            # Otherwise, catalog the feature weights; skip lines that are not
            # "feature weight" pairs (e.g., blank lines).
            fields = content.split()
            if len(fields) != 2:
                continue
            word, prob = fields
            feats.add(cur_class, word, float(prob))

        # Now, print some info
        for cur_class in feats.keys():
            print(cur_class, end='\t')
            print('%s:%.4f' % ('<default>', feats[cur_class]['<default>']), end='\t')
            top_10 = feats.top_n(cur_class, n=10, key2_re='^nom')
            print('\t'.join(['%s:%.4f' % (w,p) for w,p in top_10]))
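All three examples lean on TwoLevelCountDict without showing it. For reference, here is a minimal sketch of the interface they imply (add, keys, and indexing plus top_n on the outer level; most_frequent on the inner level). The real class lives elsewhere in the project, so everything below is an illustrative reconstruction, not the actual implementation.

import re
from collections import defaultdict

class CountDict(defaultdict):
    """Inner level: key -> count (or weight). Sketch only."""
    def __init__(self):
        super().__init__(float)

    def most_frequent(self, minimum=1, num=1):
        # Keys sorted by descending count, keeping those at or above `minimum`.
        keys = [k for k in sorted(self, key=self.get, reverse=True)
                if self[k] >= minimum]
        return keys if num is None else keys[:num]

class TwoLevelCountDict(defaultdict):
    """Outer level: key_a -> CountDict mapping key_b -> count. Sketch only."""
    def __init__(self):
        super().__init__(CountDict)

    def add(self, key_a, key_b, amount=1):
        self[key_a][key_b] += amount

    def top_n(self, key_a, n=1, key2_re=None):
        # The n highest-weighted (key_b, count) pairs under key_a,
        # optionally restricted to inner keys matching the regex key2_re.
        pairs = self[key_a].items()
        if key2_re:
            pairs = [(k, v) for k, v in pairs if re.search(key2_re, k)]
        return sorted(pairs, key=lambda kv: kv[1], reverse=True)[:n]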
Example #2
import codecs
import re
from collections import defaultdict

# getencoding() and TwoLevelCountDict are project-local helpers assumed in scope.
def get_prototypes(tagged_path, proto_out, delimiter, ignoretags=(), unambiguous=False, maxproto=0):

    encoding = getencoding(tagged_path)

    tag_word_dict = TwoLevelCountDict()
    word_tag_dict = TwoLevelCountDict()

    proto_dict = defaultdict(set)

    # Count (tag -> word) and (word -> tag) co-occurrences over the corpus.
    with codecs.open(tagged_path, "r", encoding=encoding) as tagged_file:
        for line in tagged_file:
            for token in line.split():
                # re.escape() guards against delimiters that are regex metacharacters.
                word, pos = re.search("(^.*)%s(.*?)$" % re.escape(delimiter), token).groups()
                if pos not in ignoretags:
                    word = word.lower()
                    tag_word_dict.add(pos, word)
                    word_tag_dict.add(word, pos)

    numproto = 0
    # First, take each tag's words in descending frequency as prototype candidates.
    for tag in tag_word_dict.keys():
        words = tag_word_dict[tag].most_frequent(minimum=1, num=None)
        found_words = 0
        for word in words:

            freq_tag = word_tag_dict[word].most_frequent(minimum=1)

            # Keep the word only if this tag is also the word's most frequent tag.
            if freq_tag and freq_tag[0] == tag:
                proto_dict[freq_tag[0]].add(word)
                numproto += 1
                found_words += 1

            if maxproto and found_words == maxproto:
                break

    print("%s Prototypes found." % numproto)

    # Now, write the proto file: one tab-separated line of prototypes per tag.
    with open(proto_out, "w", encoding=encoding) as proto_file:
        for tag in proto_dict:
            proto_file.write(tag)
            for word in proto_dict[tag]:
                proto_file.write("\t" + word)  # words were lowercased above
            proto_file.write("\n")
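For context, a hypothetical invocation might look like the following; the file names, the "/" delimiter, and the PUNC tag are assumptions for illustration, not taken from the source.

# Corpus lines are assumed to look like: "the/DT dog/NN barks/VBZ ./PUNC"
get_prototypes("corpus.tagged", "prototypes.txt", "/",
               ignoretags=["PUNC"], maxproto=5)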
Example #3
def _process_file(f):
    c = TwoLevelCountDict()  # POS tag -> word counts
    d = TwoLevelCountDict()  # word -> POS tag counts
    m = TwoLevelCountDict()  # morpheme gram -> POS tag counts

    print("Processing file {}".format(f))
    xc = xc_load(f)
    for inst in xc:
        LOG.info("Now on instance {}".format(inst.id))

        # Search for the gloss POS tier, if it exists.
        gpos = inst.find(alignment=GLOSS_WORD_ID, type=POS_TIER_TYPE)

        # If a gloss POS tier was found...
        if gpos:

            # Iterate through the projected tags.
            for gp in gpos:

                # Retrieve the gloss word this projected tag is aligned to.
                word = gp.igt.find(id=gp.attributes[ALIGNMENT])

                grams = tokenize_item(word, morpheme_tokenizer)

                # Add the (gram, POSTag) pair as something that was encountered.
                for gram in grams:
                    m.add(gram.content.lower(), gp.value())

                c.add(gp.value(), word.value().lower())
                d.add(word.value().lower(), gp.value())

    return (c, d, m)
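The leading underscore and the per-file return tuple suggest _process_file is meant to be mapped over many files and merged afterward. A hedged sketch of such a driver, using the dict-style interface sketched after Example #1; the merge strategy and the process_all name are assumptions, not part of the source.

def process_all(files):
    # Hypothetical aggregator: merge per-file counts into corpus-wide totals.
    c_all, d_all, m_all = TwoLevelCountDict(), TwoLevelCountDict(), TwoLevelCountDict()
    for f in files:
        c, d, m = _process_file(f)
        for total, part in ((c_all, c), (d_all, d), (m_all, m)):
            for key_a, inner in part.items():
                for key_b, count in inner.items():
                    total.add(key_a, key_b, count)
    return c_all, d_all, m_all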