Exemple #1
0
def main():
    global feat_dict, me
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f",
                      "--file",
                      type="string",
                      dest="filename",
                      metavar="FILE",
                      help="train a ME model with data from FILE")
    parser.add_option("--heldout",
                      type="string",
                      metavar="FILE",
                      help="use heldout events from FILE")
    parser.add_option("--extract",
                      type="string",
                      metavar="FILE",
                      help="extract training data to file")
    parser.add_option("--events_out",
                      type="string",
                      help="write training(heldout) events to file")
    parser.add_option(
        "-c",
        "--cutoff",
        type="int",
        default=10,
        help="discard feature with frequency < CUTOFF when training\
            [default=10]")
    parser.add_option(
        "-r",
        "--rare",
        type="int",
        default=5,
        help="use special feature for rare word with frequency < RARE \
            [default=5]")
    parser.add_option("-g",
                      "--gaussian",
                      type="float",
                      default=0.0,
                      help="apply Gaussian penality when training \
            [default=0.0]")
    parser.add_option(
        "-b",
        "--binary",
        action="store_true",
        default=0,
        help="save events in binary format for fast loading [default=off]")
    parser.add_option(
        "--ev_cutoff",
        type="int",
        default=1,
        help="discard event with frequency < CUTOFF when training \
            [default=1]")
    parser.add_option(
        "--iters",
        type="int",
        default=15,
        help="how many iterations are required for training[default=15]")

    parser.add_option("-T",
                      "--type",
                      type="int",
                      default=None,
                      help="choose context type [default for English]")
    (options, args) = parser.parse_args()
    #}}}

    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    global rare_freq
    rare_freq = options.rare

    global get_context

    get_context = postagger.choose_context(options.type)

    # First pass: gather word frequency information {{{
    print 'First pass: gather word frequency information'
    gather_word_freq(file)
    print '%d words found in training data' % len(word_freq)
    word_freq_file = options.filename + '.wordfreq'
    print 'Saving word frequence information to %s' % col(
        word_freq_file, 'lgreen')
    save_word_freq(word_freq_file)
    print
    # }}}

    # Second pass: gather features and tag dict {{{
    file.seek(0)
    print 'Second pass: gather features and tag dict to be used in tagger'
    print 'feature cutoff:%d' % options.cutoff
    print 'rare word freq:%d' % options.rare
    extract_feature(file, gather_feature)
    print '%d features found' % len(feat_dict)
    print '%d words found in pos dict' % len(tag_dict)
    print 'Applying cutoff %d to features' % options.cutoff
    cutoff_feature(options.cutoff, options.rare)
    print '%d features remained after cutoff' % len(feat_dict)
    feature_file = model_name + '.features'
    print 'saving features to file %s' % feature_file
    save_features(feature_file)
    #    tag_dict_file = options.filename + '.tagdict'
    #    print 'Saving tag dict to file %s' % (col(tag_dict_file, 'lgreen'))
    #    save_tag_dict(tag_dict_file)
    tagdict_file = model_name + '.tagdict'
    print 'Saving tag dict object to %s' % col(tagdict_file, 'lgreen'),
    import cPickle
    cPickle.dump(tag_dict, open(tagdict_file, 'w'))
    print 'done'
    #}}}

    if options.extract:
        global training_data
        training_data = open(options.extract, 'w')
        print 'Saving training data to %s' % options.extract
        file.seek(0)
        extract_feature(file, save_training_data)
        sys.exit(0)

    # Third pass:training ME model...{{{
    print 'Third pass:training ME model...'
    me = MaxentModel()
    me.begin_add_event()
    file.seek(0)
    extract_feature(file, add_event)
    #import profile
    #profile.run('me.end_training()','proflog')
    if options.heldout:
        raise 'not tested'
        print 'adding heldout events from %s' % col(options.heldout, 'yellow')
        extract_feature(open(options.heldout), add_heldout_event, True)
    me.end_add_event(options.ev_cutoff)
    if options.events_out:
        raise 'not tested'
        print 'dumping training events to', col(options.events_out, 'lgreen')
        #        import hotshot,  hotshot.stats
        #        prof = hotshot.Profile("dump_events.prof", 1)
        #        prof.runcall(me.dump_events, options.events_out)
        me.dump_events(options.events_out, options.binary)
        sys.exit(0)

    me.train(options.iters, 'lbfgs', options.gaussian)

    print 'training finished'

    print 'saving tagger model to %s' % model_name,
    me.save(model_name)
    print 'done'
Exemple #2
0
def main ():
    global feat_dict,me
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", type="string", dest="filename",
                    metavar="FILE",
                    help="train a ME model with data from FILE")
    parser.add_option("--heldout", type = "string" , metavar="FILE", 
            help="use heldout events from FILE")
    parser.add_option("--extract", type = "string", metavar="FILE", 
            help="extract training data to file")
    parser.add_option("--events_out", type="string",
            help="write training(heldout) events to file")
    parser.add_option("-c", "--cutoff", type="int", default=10,
            help="discard feature with frequency < CUTOFF when training\
            [default=10]")
    parser.add_option("-r", "--rare", type="int", default=5, 
            help="use special feature for rare word with frequency < RARE \
            [default=5]")
    parser.add_option("-g", "--gaussian", type="float", default=0.0, 
            help="apply Gaussian penality when training \
            [default=0.0]")
    parser.add_option("-b", "--binary", action="store_true", default=0, 
            help="save events in binary format for fast loading [default=off]")
    parser.add_option("--ev_cutoff", type="int", default=1,
            help="discard event with frequency < CUTOFF when training \
            [default=1]")
    parser.add_option("--iters", type="int", default=15,
                    help="how many iterations are required for training[default=15]")

    parser.add_option("-T","--type",  type="int", default=None, 
            help="choose context type [default for English]")
    (options, args) = parser.parse_args()
    #}}}

    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) !=1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    global rare_freq
    rare_freq = options.rare

    global get_context
    
    get_context = postagger.choose_context(options.type)

    # First pass: gather word frequency information {{{
    print 'First pass: gather word frequency information'
    gather_word_freq(file)
    print '%d words found in training data' % len(word_freq)
    word_freq_file = options.filename + '.wordfreq'
    print 'Saving word frequence information to %s' % col(word_freq_file,
    'lgreen')
    save_word_freq(word_freq_file)
    print
    # }}}

    # Second pass: gather features and tag dict {{{
    file.seek(0)
    print 'Second pass: gather features and tag dict to be used in tagger'
    print 'feature cutoff:%d' % options.cutoff
    print 'rare word freq:%d' % options.rare
    extract_feature(file, gather_feature)
    print '%d features found' % len(feat_dict)
    print '%d words found in pos dict' % len(tag_dict)
    print 'Applying cutoff %d to features' % options.cutoff
    cutoff_feature(options.cutoff, options.rare)
    print '%d features remained after cutoff' % len(feat_dict)
    feature_file = model_name + '.features'
    print 'saving features to file %s' % feature_file
    save_features(feature_file)
#    tag_dict_file = options.filename + '.tagdict'
#    print 'Saving tag dict to file %s' % (col(tag_dict_file, 'lgreen'))
#    save_tag_dict(tag_dict_file)
    tagdict_file = model_name + '.tagdict'
    print 'Saving tag dict object to %s' % col(tagdict_file, 'lgreen'), 
    import cPickle
    cPickle.dump(tag_dict, open(tagdict_file,'w'))
    print 'done'
    #}}}

    if options.extract:
        global training_data
        training_data = open(options.extract, 'w')
        print 'Saving training data to %s' % options.extract
        file.seek(0)
        extract_feature(file, save_training_data)
        sys.exit(0)

    # Third pass:training ME model...{{{
    print 'Third pass:training ME model...'
    me = MaxentModel()
    me.begin_add_event()
    file.seek(0)
    extract_feature(file, add_event)
    #import profile
    #profile.run('me.end_training()','proflog')
    if options.heldout:
        raise 'not tested'
        print 'adding heldout events from %s' % col(options.heldout, 'yellow')
        extract_feature(open(options.heldout), add_heldout_event, True)
    me.end_add_event(options.ev_cutoff)
    if options.events_out:
        raise 'not tested'
        print 'dumping training events to', col(options.events_out, 'lgreen')
#        import hotshot,  hotshot.stats
#        prof = hotshot.Profile("dump_events.prof", 1)
#        prof.runcall(me.dump_events, options.events_out)
        me.dump_events(options.events_out, options.binary)
        sys.exit(0)

    me.train(options.iters, 'lbfgs', options.gaussian)
    
    print 'training finished'

    print 'saving tagger model to %s' % model_name,
    me.save(model_name)
    print 'done'