def _build_option_parser():
    """Return the OptionParser describing the trainer's command line."""
    parser = OptionParser("usage: %prog [options] model")
    parser.add_option(
        "-f", "--file", type="string", dest="filename", metavar="FILE",
        help="train a ME model with data from FILE")
    parser.add_option(
        "--heldout", type="string", metavar="FILE",
        help="use heldout events from FILE")
    parser.add_option(
        "--extract", type="string", metavar="FILE",
        help="extract training data to file")
    parser.add_option(
        "--events_out", type="string",
        help="write training(heldout) events to file")
    # Long help texts are joined from adjacent literals: a backslash
    # continuation inside a string literal leaks the continuation (and the
    # next line's indentation) into the text shown by --help.
    parser.add_option(
        "-c", "--cutoff", type="int", default=10,
        help="discard feature with frequency < CUTOFF when training "
             "[default=10]")
    parser.add_option(
        "-r", "--rare", type="int", default=5,
        help="use special feature for rare word with frequency < RARE "
             "[default=5]")
    parser.add_option(
        "-g", "--gaussian", type="float", default=0.0,
        help="apply Gaussian penality when training "
             "[default=0.0]")
    parser.add_option(
        "-b", "--binary", action="store_true", default=False,
        help="save events in binary format for fast loading [default=off]")
    parser.add_option(
        "--ev_cutoff", type="int", default=1,
        help="discard event with frequency < CUTOFF when training "
             "[default=1]")
    parser.add_option(
        "--iters", type="int", default=15,
        help="how many iterations are required for training[default=15]")
    parser.add_option(
        "-T", "--type", type="int", default=None,
        help="choose context type [default for English]")
    return parser


def main():
    """Train a maximum-entropy tagger model from the command line.

    Parses the options, then makes up to three passes over the training
    file given with -f/--file:

    1. gather word frequencies and save them to ``<file>.wordfreq``;
    2. gather features and the tag dict, apply the feature cutoff, save the
       features to ``<model>.features`` and pickle the tag dict to
       ``<model>.tagdict``;
    3. add the events to a ``MaxentModel``, train it with the 'lbfgs'
       method and save it under the model name.

    With --extract the training data is written to the requested file after
    the second pass and the process exits with status 0 without training.

    Side effects: assigns the module globals ``me``, ``rare_freq``,
    ``get_context`` and, with --extract, ``training_data``.

    Raises:
        SystemExit: status 1 when the training file or the model name is
            missing; status 0 after --extract.
        NotImplementedError: for --heldout and --events_out, which are
            disabled as untested.
    """
    global feat_dict, me, rare_freq, get_context, training_data

    # parsing options{{{
    parser = _build_option_parser()
    (options, args) = parser.parse_args()
    #}}}

    # Validate every argument before opening anything, so a usage error
    # never leaves an open handle behind.
    if not options.filename:
        sys.stderr.write('training file not given\n')
        parser.print_usage()
        sys.exit(1)
    if len(args) != 1:
        sys.stderr.write('model name not given\n')
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    rare_freq = options.rare
    get_context = postagger.choose_context(options.type)

    train_file = open(options.filename)
    try:
        # First pass: gather word frequency information {{{
        print('First pass: gather word frequency information')
        gather_word_freq(train_file)
        print('%d words found in training data' % len(word_freq))
        word_freq_file = options.filename + '.wordfreq'
        print('Saving word frequence information to %s'
              % col(word_freq_file, 'lgreen'))
        save_word_freq(word_freq_file)
        print('')
        # }}}

        # Second pass: gather features and tag dict {{{
        train_file.seek(0)
        print('Second pass: gather features and tag dict to be used in tagger')
        print('feature cutoff:%d' % options.cutoff)
        print('rare word freq:%d' % options.rare)
        extract_feature(train_file, gather_feature)
        print('%d features found' % len(feat_dict))
        print('%d words found in pos dict' % len(tag_dict))
        print('Applying cutoff %d to features' % options.cutoff)
        cutoff_feature(options.cutoff, options.rare)
        print('%d features remained after cutoff' % len(feat_dict))
        feature_file = model_name + '.features'
        print('saving features to file %s' % feature_file)
        save_features(feature_file)

        tagdict_file = model_name + '.tagdict'
        # No newline yet: 'done' completes this line once the dump succeeds.
        sys.stdout.write('Saving tag dict object to %s '
                         % col(tagdict_file, 'lgreen'))
        try:
            import cPickle as pickle
        except ImportError:
            import pickle
        # Binary mode writes the pickle bytes verbatim on every platform;
        # the handle is closed explicitly so the data is flushed to disk.
        tagdict_out = open(tagdict_file, 'wb')
        try:
            pickle.dump(tag_dict, tagdict_out)
        finally:
            tagdict_out.close()
        print('done')
        #}}}

        if options.extract:
            training_data = open(options.extract, 'w')
            try:
                print('Saving training data to %s' % options.extract)
                train_file.seek(0)
                extract_feature(train_file, save_training_data)
            finally:
                # Close before exiting so buffered output reaches the file.
                training_data.close()
            sys.exit(0)

        # Third pass:training ME model...{{{
        print('Third pass:training ME model...')
        me = MaxentModel()
        me.begin_add_event()
        train_file.seek(0)
        extract_feature(train_file, add_event)

        if options.heldout:
            # Disabled as untested. A real exception class is required:
            # string exceptions are rejected by Python 2.6 and later.
            # The statements below stay unreachable on purpose; delete the
            # raise to re-enable them once the path has been tested.
            raise NotImplementedError('not tested')
            print('adding heldout events from %s'
                  % col(options.heldout, 'yellow'))
            extract_feature(open(options.heldout), add_heldout_event, True)

        me.end_add_event(options.ev_cutoff)

        if options.events_out:
            # Disabled as untested; see the note on --heldout above.
            raise NotImplementedError('not tested')
            print('dumping training events to %s'
                  % col(options.events_out, 'lgreen'))
            me.dump_events(options.events_out, options.binary)
            sys.exit(0)

        me.train(options.iters, 'lbfgs', options.gaussian)
        print('training finished')

        sys.stdout.write('saving tagger model to %s ' % model_name)
        me.save(model_name)
        print('done')
        #}}}
    finally:
        train_file.close()
def _build_option_parser():
    """Return the OptionParser describing the trainer's command line."""
    parser = OptionParser("usage: %prog [options] model")
    parser.add_option(
        "-f", "--file", type="string", dest="filename", metavar="FILE",
        help="train a ME model with data from FILE")
    parser.add_option(
        "--heldout", type="string", metavar="FILE",
        help="use heldout events from FILE")
    parser.add_option(
        "--extract", type="string", metavar="FILE",
        help="extract training data to file")
    parser.add_option(
        "--events_out", type="string",
        help="write training(heldout) events to file")
    # Long help texts are joined from adjacent literals: a backslash
    # continuation inside a string literal leaks the continuation (and the
    # next line's indentation) into the text shown by --help.
    parser.add_option(
        "-c", "--cutoff", type="int", default=10,
        help="discard feature with frequency < CUTOFF when training "
             "[default=10]")
    parser.add_option(
        "-r", "--rare", type="int", default=5,
        help="use special feature for rare word with frequency < RARE "
             "[default=5]")
    parser.add_option(
        "-g", "--gaussian", type="float", default=0.0,
        help="apply Gaussian penality when training "
             "[default=0.0]")
    parser.add_option(
        "-b", "--binary", action="store_true", default=False,
        help="save events in binary format for fast loading [default=off]")
    parser.add_option(
        "--ev_cutoff", type="int", default=1,
        help="discard event with frequency < CUTOFF when training "
             "[default=1]")
    parser.add_option(
        "--iters", type="int", default=15,
        help="how many iterations are required for training[default=15]")
    parser.add_option(
        "-T", "--type", type="int", default=None,
        help="choose context type [default for English]")
    return parser


def main():
    """Train a maximum-entropy tagger model from the command line.

    Parses the options, then makes up to three passes over the training
    file given with -f/--file:

    1. gather word frequencies and save them to ``<file>.wordfreq``;
    2. gather features and the tag dict, apply the feature cutoff, save the
       features to ``<model>.features`` and pickle the tag dict to
       ``<model>.tagdict``;
    3. add the events to a ``MaxentModel``, train it with the 'lbfgs'
       method and save it under the model name.

    With --extract the training data is written to the requested file after
    the second pass and the process exits with status 0 without training.

    Side effects: assigns the module globals ``me``, ``rare_freq``,
    ``get_context`` and, with --extract, ``training_data``.

    Raises:
        SystemExit: status 1 when the training file or the model name is
            missing; status 0 after --extract.
        NotImplementedError: for --heldout and --events_out, which are
            disabled as untested.
    """
    global feat_dict, me, rare_freq, get_context, training_data

    # parsing options{{{
    parser = _build_option_parser()
    (options, args) = parser.parse_args()
    #}}}

    # Validate every argument before opening anything, so a usage error
    # never leaves an open handle behind.
    if not options.filename:
        sys.stderr.write('training file not given\n')
        parser.print_usage()
        sys.exit(1)
    if len(args) != 1:
        sys.stderr.write('model name not given\n')
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    rare_freq = options.rare
    get_context = postagger.choose_context(options.type)

    train_file = open(options.filename)
    try:
        # First pass: gather word frequency information {{{
        print('First pass: gather word frequency information')
        gather_word_freq(train_file)
        print('%d words found in training data' % len(word_freq))
        word_freq_file = options.filename + '.wordfreq'
        print('Saving word frequence information to %s'
              % col(word_freq_file, 'lgreen'))
        save_word_freq(word_freq_file)
        print('')
        # }}}

        # Second pass: gather features and tag dict {{{
        train_file.seek(0)
        print('Second pass: gather features and tag dict to be used in tagger')
        print('feature cutoff:%d' % options.cutoff)
        print('rare word freq:%d' % options.rare)
        extract_feature(train_file, gather_feature)
        print('%d features found' % len(feat_dict))
        print('%d words found in pos dict' % len(tag_dict))
        print('Applying cutoff %d to features' % options.cutoff)
        cutoff_feature(options.cutoff, options.rare)
        print('%d features remained after cutoff' % len(feat_dict))
        feature_file = model_name + '.features'
        print('saving features to file %s' % feature_file)
        save_features(feature_file)

        tagdict_file = model_name + '.tagdict'
        # No newline yet: 'done' completes this line once the dump succeeds.
        sys.stdout.write('Saving tag dict object to %s '
                         % col(tagdict_file, 'lgreen'))
        try:
            import cPickle as pickle
        except ImportError:
            import pickle
        # Binary mode writes the pickle bytes verbatim on every platform;
        # the handle is closed explicitly so the data is flushed to disk.
        tagdict_out = open(tagdict_file, 'wb')
        try:
            pickle.dump(tag_dict, tagdict_out)
        finally:
            tagdict_out.close()
        print('done')
        #}}}

        if options.extract:
            training_data = open(options.extract, 'w')
            try:
                print('Saving training data to %s' % options.extract)
                train_file.seek(0)
                extract_feature(train_file, save_training_data)
            finally:
                # Close before exiting so buffered output reaches the file.
                training_data.close()
            sys.exit(0)

        # Third pass:training ME model...{{{
        print('Third pass:training ME model...')
        me = MaxentModel()
        me.begin_add_event()
        train_file.seek(0)
        extract_feature(train_file, add_event)

        if options.heldout:
            # Disabled as untested. A real exception class is required:
            # string exceptions are rejected by Python 2.6 and later.
            # The statements below stay unreachable on purpose; delete the
            # raise to re-enable them once the path has been tested.
            raise NotImplementedError('not tested')
            print('adding heldout events from %s'
                  % col(options.heldout, 'yellow'))
            extract_feature(open(options.heldout), add_heldout_event, True)

        me.end_add_event(options.ev_cutoff)

        if options.events_out:
            # Disabled as untested; see the note on --heldout above.
            raise NotImplementedError('not tested')
            print('dumping training events to %s'
                  % col(options.events_out, 'lgreen'))
            me.dump_events(options.events_out, options.binary)
            sys.exit(0)

        me.train(options.iters, 'lbfgs', options.gaussian)
        print('training finished')

        sys.stdout.write('saving tagger model to %s ' % model_name)
        me.save(model_name)
        print('done')
        #}}}
    finally:
        train_file.close()