Exemple #1
0
def tag_test(test_feature_file, trained_model_file, tag_test_set_file):
    fin = codecs.open(test_feature_file, 'r', 'utf-8')
    fout = codecs.open(tag_test_set_file, 'w', 'utf-8')
    m = MaxentModel()
    m.load(trained_model_file)
    contents = fin.read()
    feature_list = contents.split('\r')
    feature_list.remove('\n')
    for feature in feature_list:
        if (feature == 'split'):
            fout.write('\n\n\n')
            continue
        str_feature = []
        u_feature = feature.split(' ')
        for item in u_feature:
            str_feature.append(item.encode('utf-8'))
        label_prob_list = m.eval_all(str_feature)
        label = max_prob(label_prob_list)

        try:
            new_tag = str_feature[2].split('=')[1] + '/' + label
        except IndexError:
            print
            str_feature
        fout.write(new_tag.decode('utf-8'))
        pre_tag = label

    return feature_list
Exemple #2
0
 def __init__(self, restrictFeatures=False):
     """Create a maximum-entropy classifier wrapper.

     restrictFeatures: flag stored on the instance; presumably limits
     which features callers feed to the model -- TODO confirm against
     the rest of the class (not visible here).
     """
     Classifier.__init__(self)
     print "MaximumEntropy: Creating model"
     self.model = MaxentModel()
     # verbose=1 makes the underlying maxent toolkit print progress.
     self.model.verbose = 1
     self.restrictFeatures = restrictFeatures
     # Open the model for event collection; a later call must invoke
     # end_add_event() before training.
     self.model.begin_add_event()
Exemple #3
0
def simple_train(event_list):
    """Train a maxent model on (context, outcome) event pairs.

    Uses 30 L-BFGS iterations with a gaussian prior of 2 and returns
    the trained model.
    """
    model = MaxentModel()
    model.begin_add_event()
    for event in event_list:
        context, outcome = event[0], event[1]
        model.add_event(context, outcome)
    model.end_add_event()
    model.train(30, 'lbfgs', 2)
    return model
def train(corpus, *args):
	"""Train a maxent model over a corpus of datum groups.

	corpus: mapping whose values are lists of datum objects.
	*args:  forwarded verbatim to MaxentModel.train().
	Returns (model, projections) where projections maps each datum's
	row_in_corpus to its feature projection.
	"""
	projections = {}
	model = MaxentModel()
	model.begin_add_event()
	for datums in corpus.values():
		for datum in datums:
			# PERF FIX: the original called datum2features(datum) twice
			# per datum (once for the projection, once for add_event).
			projection = datum2features(datum)
			# Event weight scales the datum's trust score to an integer.
			weight = long(100 * float(datum._trust))
			model.add_event(projection, datum.is_related, weight)
			projections[datum.row_in_corpus] = projection
	model.end_add_event()
	model.train(*args)
	return model, projections
Exemple #5
0
def train_model(options, iterable):
    """Train a sequence-labelling maxent model over sentences.

    Each sentence is an iterable of (word, pos, label) triples.  Two
    passes: the first collects word frequencies and per-word label
    counts; the second collects features and training events.  The
    trained model is saved to <options.model>.maxent and the gathered
    statistics pickled to <options.model>.data.
    """
    model = MaxentModel()
    data = {}

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >> sys.stderr, "*** Training options are:"
    print >> sys.stderr, "   ", options

    print >> sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                # BUG FIX: the original incremented the per-label count only
                # when the word was already present, so the first labelled
                # occurrence of every word was silently dropped.
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >> sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            # "^" stands in for the label preceding the first token.
            features = compute_features(data, words, poses, i,
                                        labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >> sys.stderr, "*** Collected {0} features.".format(
        len(data["feature_set"]))

    print >> sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >> sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
def main():
    """Tag input with a trained POS-tagger model.

    Reads from the first positional argument (or stdin) and writes
    tagged output to --output (or stdout).  Exits with status 1 when
    no model name is supplied.
    """
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-o",
                      "--output",
                      type="string",
                      help="write tagged result to OUTPUT")
    parser.add_option("-m",
                      "--model",
                      type="string",
                      help="load trained model from MODEL")
    parser.add_option("-t",
                      "--test",
                      action="store_true",
                      default=0,
                      help="test mode, include original tag in output")
    parser.add_option("-v",
                      "--verbose",
                      action="store_true",
                      dest="verbose",
                      default=1)
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose")
    parser.add_option("-T",
                      "--type",
                      type="int",
                      default=None,
                      help="choose context type")

    (options, args) = parser.parse_args()

    if not options.model:
        print >> sys.stderr, 'Tagger model name not given!'
        parser.print_usage()
        sys.exit(1)

    model = options.model
    # The tag dictionary is pickled next to the model file.
    tag_dict = cPickle.load(open(model + '.tagdict'))

    me = MaxentModel()
    me.load(model)
    tagger = postagger.PosTagger(me, tag_dict, options.type)

    tag_in_file = sys.stdin
    if len(args) >= 1:
        tag_in_file = open(args[0])

    tag_out_file = sys.stdout
    if options.output:
        # BUG FIX: the original passed the undefined name `out` to open(),
        # raising NameError whenever --output was given.
        tag_out_file = open(options.output, 'w')

    tag_file(tagger, tag_in_file, tag_out_file, options.test)
Exemple #7
0
def main():
    if len(sys.argv) != 2:
        print "Usage: MaxentTest.py modelName"
        sys.exit(1)

    model = MaxentModel()
    model.load(sys.argv[1])
    context = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0
    ]
    label = model.eval(context, str(0))
    #result = model.eval_all(context)
    print "Result: ", label
Exemple #8
0
def predict_tags(best_1_name, best_1_org, best_3_name, best_5_org, sentences,
                 f, op):
    """Predict a person/organisation relation for each sentence.

    For every sentence, scores each (name, org) candidate pair (top-3
    names x top-5 orgs) against every known relation with a pre-trained
    maxent model and keeps the single highest-probability non-'others'
    pair, which resolve_conflicts() merges into (n1, o1).  Writes the
    (possibly updated) (n1, o1) tuple to `op` and the best judgement
    tuple to `f`.
    """
    rel = [
        'others', 'director', 'analyst', 'advisor', 'head', 'manager',
        'spokesperson', 'founder', 'professor', 'leave', 'lawyer'
    ]
    me = MaxentModel()
    me.load('../training/models/lbfgs/model3')
    count = 0
    for n1, o1, n3, o5, sent in zip(best_1_name, best_1_org, best_3_name,
                                    best_5_org, sentences):
        if len(n3) == 0 or len(o5) == 0:
            # No candidates at all: pass the best-1 pair through unchanged.
            op.write(str((n1, o1)) + '\n')
        else:
            # Best judgement so far: (name, org, relation, probability).
            j = ('', '', '', 0.0)
            for name in n3:
                for org in o5:
                    context = get_context(name, org, sent)

                    relation = ''
                    prob = 0.0
                    # IDIOM FIX: compare against None with `is not None`
                    # (was `!= None`).  Also removed the local dict `d`,
                    # which was written but never read.
                    if context is not None:
                        # Pick the highest-scoring relation for this pair.
                        for r in rel:
                            y = me.eval(context, r)
                            if y > prob:
                                prob = y
                                relation = r
                        if prob > j[3] and relation != 'others':
                            j = (name, org, relation, prob)
            resolve_conflicts(n1, o1, j)
            op.write(str((n1, o1)) + '\n')
            f.write(str(j) + '\n')
        count = count + 1
Exemple #9
0
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    print >> sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >> sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)

        for word, pos, label in zip(words, poses, labels):
            print label
        print
Exemple #10
0
def training(feature_file_path, trained_model_file, times):
    m = MaxentModel()
    fin = codecs.open(feature_file_path, 'r', 'utf-8')
    all_list = []
    m.begin_add_event()
    for line in fin:
        line = line.rstrip()
        line_list = line.split(' ')
        str_list = []
        for item in line_list:
            str_list.append(item.encode('utf-8'))
        all_list.append(str_list)
        m.add_event(str_list[1:], str_list[0], 1)
    m.end_add_event()
    print 'begin training'
    m.train(times, "lbfgs")
    print 'end training'
    m.save(trained_model_file)
    return all_list
Exemple #11
0
def main():
    if len(sys.argv) != 4:
        print "Usage: MaxentTrain.py features.mat labels.mat modelName"
        sys.exit(1)

    features = featureMatrice(sys.argv[1])
    labels = labelLst(sys.argv[2])

    model = MaxentModel()
    # add data into model
    model.begin_add_event()
    for i in range(len(labels)):
        model.add_event(features[i], str(labels[i]), 1)

    model.end_add_event()

    # start training
    #model.train()
    model.train(1000, "gis", 2)
    #model.train(30, "lbfgs")

    # save the model
    model.save(sys.argv[3])
Exemple #12
0
def main():
    global feat_dict, me
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f",
                      "--file",
                      type="string",
                      dest="filename",
                      metavar="FILE",
                      help="train a ME model with data from FILE")
    parser.add_option("--heldout",
                      type="string",
                      metavar="FILE",
                      help="use heldout events from FILE")
    parser.add_option("--extract",
                      type="string",
                      metavar="FILE",
                      help="extract training data to file")
    parser.add_option("--events_out",
                      type="string",
                      help="write training(heldout) events to file")
    parser.add_option(
        "-c",
        "--cutoff",
        type="int",
        default=10,
        help="discard feature with frequency < CUTOFF when training\
            [default=10]")
    parser.add_option(
        "-r",
        "--rare",
        type="int",
        default=5,
        help="use special feature for rare word with frequency < RARE \
            [default=5]")
    parser.add_option("-g",
                      "--gaussian",
                      type="float",
                      default=0.0,
                      help="apply Gaussian penality when training \
            [default=0.0]")
    parser.add_option(
        "-b",
        "--binary",
        action="store_true",
        default=0,
        help="save events in binary format for fast loading [default=off]")
    parser.add_option(
        "--ev_cutoff",
        type="int",
        default=1,
        help="discard event with frequency < CUTOFF when training \
            [default=1]")
    parser.add_option(
        "--iters",
        type="int",
        default=15,
        help="how many iterations are required for training[default=15]")

    parser.add_option("-T",
                      "--type",
                      type="int",
                      default=None,
                      help="choose context type [default for English]")
    (options, args) = parser.parse_args()
    #}}}

    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    global rare_freq
    rare_freq = options.rare

    global get_context

    get_context = postagger.choose_context(options.type)

    # First pass: gather word frequency information {{{
    print 'First pass: gather word frequency information'
    gather_word_freq(file)
    print '%d words found in training data' % len(word_freq)
    word_freq_file = options.filename + '.wordfreq'
    print 'Saving word frequence information to %s' % col(
        word_freq_file, 'lgreen')
    save_word_freq(word_freq_file)
    print
    # }}}

    # Second pass: gather features and tag dict {{{
    file.seek(0)
    print 'Second pass: gather features and tag dict to be used in tagger'
    print 'feature cutoff:%d' % options.cutoff
    print 'rare word freq:%d' % options.rare
    extract_feature(file, gather_feature)
    print '%d features found' % len(feat_dict)
    print '%d words found in pos dict' % len(tag_dict)
    print 'Applying cutoff %d to features' % options.cutoff
    cutoff_feature(options.cutoff, options.rare)
    print '%d features remained after cutoff' % len(feat_dict)
    feature_file = model_name + '.features'
    print 'saving features to file %s' % feature_file
    save_features(feature_file)
    #    tag_dict_file = options.filename + '.tagdict'
    #    print 'Saving tag dict to file %s' % (col(tag_dict_file, 'lgreen'))
    #    save_tag_dict(tag_dict_file)
    tagdict_file = model_name + '.tagdict'
    print 'Saving tag dict object to %s' % col(tagdict_file, 'lgreen'),
    import cPickle
    cPickle.dump(tag_dict, open(tagdict_file, 'w'))
    print 'done'
    #}}}

    if options.extract:
        global training_data
        training_data = open(options.extract, 'w')
        print 'Saving training data to %s' % options.extract
        file.seek(0)
        extract_feature(file, save_training_data)
        sys.exit(0)

    # Third pass:training ME model...{{{
    print 'Third pass:training ME model...'
    me = MaxentModel()
    me.begin_add_event()
    file.seek(0)
    extract_feature(file, add_event)
    #import profile
    #profile.run('me.end_training()','proflog')
    if options.heldout:
        raise 'not tested'
        print 'adding heldout events from %s' % col(options.heldout, 'yellow')
        extract_feature(open(options.heldout), add_heldout_event, True)
    me.end_add_event(options.ev_cutoff)
    if options.events_out:
        raise 'not tested'
        print 'dumping training events to', col(options.events_out, 'lgreen')
        #        import hotshot,  hotshot.stats
        #        prof = hotshot.Profile("dump_events.prof", 1)
        #        prof.runcall(me.dump_events, options.events_out)
        me.dump_events(options.events_out, options.binary)
        sys.exit(0)

    me.train(options.iters, 'lbfgs', options.gaussian)

    print 'training finished'

    print 'saving tagger model to %s' % model_name,
    me.save(model_name)
    print 'done'
Exemple #13
0
from maxent import MaxentModel

# Train five relation models, one per contexts/contextsN.txt file.
# Each usable line looks like "relation:<python-literal-context>".
for i in range(5):
    m = MaxentModel()
    m.begin_add_event()
    with open('contexts/contexts' + str(i + 1) + '.txt', 'r') as f:
        for line in f:
            line = line.rstrip()
            try:
                ind = line.index(':')
                if line[:ind] != '':
                    rel = line[:ind]
                    # NOTE(review): eval() on file content is unsafe if the
                    # contexts files are not trusted -- consider
                    # ast.literal_eval instead.
                    l = eval(line[ind + 1:])
                    m.add_event(l, rel, 1)
            except Exception:
                # BUG FIX: was a bare `except:`; keep the best-effort skip
                # of malformed lines but no longer swallow
                # KeyboardInterrupt / SystemExit.
                pass
    m.end_add_event()

    m.train(100, 'lbfgs')
    s_name = "models/lbfgs/model" + str(i + 1)
    m.save(s_name)
Exemple #14
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Imports
import sys, os

# Load MaxEnt models
# NOTE(review): CORPUS_PATH must be set in the environment;
# os.environ.get returns None when missing and the path concatenation
# below would then raise TypeError.
corpusPath = os.environ.get('CORPUS_PATH')
from maxent import MaxentModel

maxEntModel = MaxentModel()
maxEntModel.load(corpusPath + '/model_markers.txt')

# For each tab-separated stdin line: column 0 is presumably an id/label
# (verify against the producer); the remaining columns form the context
# passed to eval_all.  Emit "marker:probability" pairs above a small
# threshold, tab-joined after the original first column.
for trainLine in sys.stdin.readlines():
    trainCols = trainLine.split('\t')
    modelMarkerProbas = maxEntModel.eval_all(trainCols[1:])
    probaFeats = []
    for modelMarkerProba in modelMarkerProbas:
        # Drop vanishing probabilities to keep the output sparse.
        if modelMarkerProba[1] > 0.00001:
            probaFeats.append(modelMarkerProba[0] + ':' +
                              str(modelMarkerProba[1]))
    print trainCols[0] + '\t' + '\t'.join(probaFeats)