class MaximumEntropyClassifier(Classifier):

    def __init__(self, restrictFeatures=False):
        Classifier.__init__(self)
        print "MaximumEntropy: Creating model"
        self.model = MaxentModel()
        self.model.verbose = 1
        self.restrictFeatures = restrictFeatures
        self.model.begin_add_event()

    def addToIndex(self, trainingset):
        for (vec, cls) in trainingset:
            self.addFeatureVector(vec, cls)

    def addFeatureVector(self, vec, cls, value=1, binary=False):
        for key in vec.keys():
            if key not in self.restrictFeatures:
                del vec[key]
        context = vec.keys()
        label = "%s" % cls
        self.model.add_event(context, label, value)

    def compile(self):
        self.model.end_add_event()
        self.model.train(30, "lbfgs", 2, 1E-03)
        #self.model.train(100, 'gis', 2)
        print "> Models trained"

    def classify(self, point, label='1', binary=False):
        result = self.model.eval(point.keys(), label)
        if result >= 0.5:
            return 1
        return -1
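The class above is a thin wrapper around the MaxentModel calls, so a minimal driving sketch may help. Everything below is invented for illustration: the feature dictionaries, the 1/-1 labels, and the whitelist passed as restrictFeatures (which addFeatureVector treats as a collection of allowed feature keys).

# Hypothetical usage sketch; feature dicts, labels and the whitelist are made up.
clf = MaximumEntropyClassifier(restrictFeatures=["good", "great", "bad"])
trainingset = [({"good": 1, "great": 1}, 1), ({"bad": 1}, -1)]
clf.addToIndex(trainingset)
clf.compile()
print clf.classify({"good": 1})  # prints 1 or -1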
def baseline(sentences, labels):
    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()
    with open(sentences) as file_content:
        sentences = file_content.readlines()
    with open(labels) as file_content:
        labels = file_content.readlines()
    for i in xrange(0, 3000):
        m.add_event(sentences[i].split(" "), labels[i].strip())
    m.end_add_event()
    m.train()
    correct = 0
    false = 0
    for i in xrange(3000, len(sentences)):
        result = m.eval(sentences[i].split(" "), "1")
        result = int(round(result))
        label = int(labels[i])
        if result == label:
            correct = correct + 1
        else:
            false = false + 1
    print "correct :", correct
    print "false :", false
    print("accuracy : {:.2f}%".format(correct * 100.0 / (correct + false)))
def main():
    global feat_dict, m
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", type="string", dest="filename",
                      metavar="FILE", help="train a Maxent model with data from FILE")
    parser.add_option("-g", "--gaussian", type="float", default=0.0,
                      help="apply Gaussian penality when training [default=0.0]")
    parser.add_option("--iters", type="int", default=15,
                      help="how many iterations are required for training[default=15]")
    (options, args) = parser.parse_args()
    #}}}
    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)
    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    global get_context
    get_context = Generator.get_context_wordform  # change this to use different features

    print 'First pass: gather features'
    extract_feature(file, gather_feature)
    feature_file = model_name + '.features'
    print 'save features to file %s' % feature_file
    save_features(feature_file)
    print 'feat_dict: ', feat_dict

    file.seek(0)
    print 'Second pass: training model...'
    m = MaxentModel()
    m.begin_add_event()
    extract_feature(file, add_event)
    m.end_add_event()
    m.train(options.iters, 'lbfgs', options.gaussian)
    print 'training finished'
    print 'saving tagger model to %s' % model_name,
    m.save(model_name)
    print 'done'
def simple_train(event_list):
    m = MaxentModel()
    m.begin_add_event()
    for e in event_list:
        m.add_event(e[0], e[1])
    m.end_add_event()
    #maxent.set_verbose(1)
    m.train(30, 'lbfgs', 2)
    return m
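A quick, hypothetical smoke test for simple_train. The event list is made up; eval_all, used elsewhere in these examples, returns (label, probability) pairs.

# Hypothetical smoke test; the events below are invented.
events = [(["sunny", "warm"], "beach"), (["rainy", "cold"], "home")]
m = simple_train(events)
print m.eval_all(["sunny", "warm"])  # e.g. [('beach', p1), ('home', p2)]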
def train_ne_binary_model(options, iterable):
    model = MaxentModel()
    data = {}
    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >>sys.stderr, "*** Training options are:"
    print >>sys.stderr, " ", options

    print >>sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, " {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >>sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, " {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_ne_features(data, words, poses, i,
                                           labels[i - 1] if i >= 1 else "^")
            features = list(features)
            if labels[i].startswith("B-") or labels[i].startswith("I-"):
                model.add_event(features, "NE")
            else:
                model.add_event(features, "O")
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >>sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))

    print >>sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >>sys.stderr, "*** Saving..."
    model.save(options.model + ".ne.binary.maxent")
    with open(options.model + ".ne.binary.data", "w") as handle:
        cPickle.dump(data, handle)
def train(corpus, *args):
    projections = {}
    model = MaxentModel()
    model.begin_add_event()
    for datums in corpus.values():
        for datum in datums:
            projection = datum2features(datum)
            model.add_event(datum2features(datum), datum.is_related,
                            long(100 * float(datum._trust)))
            projections[datum.row_in_corpus] = projection
    model.end_add_event()
    model.train(*args)
    return model, projections
def train_model(options, iterable):
    model = MaxentModel()
    data = {}
    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >> sys.stderr, "*** Training options are:"
    print >> sys.stderr, " ", options

    print >> sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, " {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                # Create the per-word counter on first sight, then count the label
                # (the original skipped counting on the first occurrence).
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >> sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, " {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_features(data, words, poses, i,
                                        labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >> sys.stderr, "*** Collected {0} features.".format(
        len(data["feature_set"]))

    print >> sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >> sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
def test():
    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()
    m.add_event(['1'], '1')
    m.add_event(['2'], '2')
    m.add_event(['3'], '3')
    m.end_add_event()
    m.train(30, 'lbfgs', 2, 1e-03)
    for x in map(str, range(1, 4)):
        print "tested on:", x, "predicted:", m.eval_all([x])
def trainOn(self, train_groups):
    '''
    Train on the train set and return the trained model
    '''
    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()
    for pair in train_groups:
        m.add_event(pair[0], pair[1])
    m.end_add_event()
    m.train(20, 'lbfgs', 1e-04, 1e-03)
    return m
def training(feature_file_path, trained_model_file, times):
    m = MaxentModel()
    fr = codecs.open(feature_file_path, 'r', 'utf-8')
    all_list = []
    m.begin_add_event()
    for line in fr:
        line = line.rstrip()
        line_list = line.split(' ')
        str_list = []
        for item in line_list:
            str_list.append(item.encode('utf-8'))
        all_list.append(str_list)
        m.add_event(str_list[1:], str_list[0], 1)
    m.end_add_event()
    print 'begin training'
    m.train(times, "lbfgs")
    print 'end training'
    m.save(trained_model_file)
    return all_list
def trainOn(self, train_groups, n_itr=15, var=1, tol=1e-5):
    '''
    Train on the train set and return the trained model
    '''
    print "training set:", Counter(zip(*train_groups)[1]).most_common()
    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()
    for pair in train_groups:
        m.add_event(pair[0], pair[1])
    n_cutoff = 1
    m.end_add_event(n_cutoff)
    m.train(n_itr, 'lbfgs', var, tol)
    return m
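One way a model returned by trainOn might be scored on held-out groups, assuming test_groups uses the same (features, label) layout as train_groups. This helper is a sketch, not part of the original class.

# Hypothetical helper: pick the most probable label from eval_all() and count matches.
def accuracy(m, test_groups):
    correct = 0
    for features, label in test_groups:
        predicted = max(m.eval_all(features), key=lambda pair: pair[1])[0]
        if predicted == label:
            correct += 1
    return float(correct) / len(test_groups)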
def main():
    if len(sys.argv) != 4:
        print "Usage: MaxentTrain.py features.mat labels.mat modelName"
        sys.exit(1)
    features = featureMatrice(sys.argv[1])
    labels = labelLst(sys.argv[2])

    model = MaxentModel()
    # add data into model
    model.begin_add_event()
    for i in range(len(labels)):
        model.add_event(features[i], str(labels[i]), 1)
    model.end_add_event()
    # start training
    #model.train()
    model.train(1000, "gis", 2)
    #model.train(30, "lbfgs")
    # save the model
    model.save(sys.argv[3])
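A hedged follow-up sketch: reload the model saved by the script above and score one training vector. The literal "modelName" stands in for sys.argv[3], and the assumption that features and labels are still in scope is illustrative only.

# Hypothetical check after training; "modelName" stands in for sys.argv[3].
m = MaxentModel()
m.load("modelName")
print m.eval_all(features[0])              # full (label, probability) distribution
print m.eval(features[0], str(labels[0]))  # probability of the gold label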
def main():
    global feat_dict, me
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", type="string", dest="filename",
                      metavar="FILE", help="train a ME model with data from FILE")
    parser.add_option("--heldout", type="string", metavar="FILE",
                      help="use heldout events from FILE")
    parser.add_option("--extract", type="string", metavar="FILE",
                      help="extract training data to file")
    parser.add_option("--events_out", type="string",
                      help="write training(heldout) events to file")
    parser.add_option("-c", "--cutoff", type="int", default=10,
                      help="discard feature with frequency < CUTOFF when training [default=10]")
    parser.add_option("-r", "--rare", type="int", default=5,
                      help="use special feature for rare word with frequency < RARE [default=5]")
    parser.add_option("-g", "--gaussian", type="float", default=0.0,
                      help="apply Gaussian penality when training [default=0.0]")
    parser.add_option("-b", "--binary", action="store_true", default=0,
                      help="save events in binary format for fast loading [default=off]")
    parser.add_option("--ev_cutoff", type="int", default=1,
                      help="discard event with frequency < CUTOFF when training [default=1]")
    parser.add_option("--iters", type="int", default=15,
                      help="how many iterations are required for training[default=15]")
    parser.add_option("-T", "--type", type="int", default=None,
                      help="choose context type [default for English]")
    (options, args) = parser.parse_args()
    #}}}
    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)
    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    global rare_freq
    rare_freq = options.rare
    global get_context
    get_context = postagger.choose_context(options.type)

    # First pass: gather word frequency information {{{
    print 'First pass: gather word frequency information'
    gather_word_freq(file)
    print '%d words found in training data' % len(word_freq)
    word_freq_file = options.filename + '.wordfreq'
    print 'Saving word frequence information to %s' % col(word_freq_file, 'lgreen')
    save_word_freq(word_freq_file)
    print
    # }}}

    # Second pass: gather features and tag dict {{{
    file.seek(0)
    print 'Second pass: gather features and tag dict to be used in tagger'
    print 'feature cutoff:%d' % options.cutoff
    print 'rare word freq:%d' % options.rare
    extract_feature(file, gather_feature)
    print '%d features found' % len(feat_dict)
    print '%d words found in pos dict' % len(tag_dict)
    print 'Applying cutoff %d to features' % options.cutoff
    cutoff_feature(options.cutoff, options.rare)
    print '%d features remained after cutoff' % len(feat_dict)
    feature_file = model_name + '.features'
    print 'saving features to file %s' % feature_file
    save_features(feature_file)
    # tag_dict_file = options.filename + '.tagdict'
    # print 'Saving tag dict to file %s' % (col(tag_dict_file, 'lgreen'))
    # save_tag_dict(tag_dict_file)
    tagdict_file = model_name + '.tagdict'
    print 'Saving tag dict object to %s' % col(tagdict_file, 'lgreen'),
    import cPickle
    cPickle.dump(tag_dict, open(tagdict_file, 'w'))
    print 'done'
    #}}}

    if options.extract:
        global training_data
        training_data = open(options.extract, 'w')
        print 'Saving training data to %s' % options.extract
        file.seek(0)
        extract_feature(file, save_training_data)
        sys.exit(0)

    # Third pass: training ME model...{{{
    print 'Third pass:training ME model...'
    me = MaxentModel()
    me.begin_add_event()
    file.seek(0)
    extract_feature(file, add_event)
    #import profile
    #profile.run('me.end_training()','proflog')
    if options.heldout:
        raise 'not tested'
        print 'adding heldout events from %s' % col(options.heldout, 'yellow')
        extract_feature(open(options.heldout), add_heldout_event, True)
    me.end_add_event(options.ev_cutoff)
    if options.events_out:
        raise 'not tested'
        print 'dumping training events to', col(options.events_out, 'lgreen')
        # import hotshot, hotshot.stats
        # prof = hotshot.Profile("dump_events.prof", 1)
        # prof.runcall(me.dump_events, options.events_out)
        me.dump_events(options.events_out, options.binary)
        sys.exit(0)
    me.train(options.iters, 'lbfgs', options.gaussian)
    print 'training finished'
    print 'saving tagger model to %s' % model_name,
    me.save(model_name)
    print 'done'
SUN = 'sun'
RAIN = 'rain'

train_data = [(SUN, 10), (SUN, 8), (SUN, 11), (RAIN, 3), (RAIN, 2),
              (SUN, 6), (SUN, 10), (RAIN, 1)]
labels_train = [i[0] for i in train_data]
icecream_train = [i[1] for i in train_data]

me = MaxentModel()
me.begin_add_event()
for i, data in enumerate(train_data):
    features = list(compute_features(icecream_train, i,
                                     labels_train[i - 1] if i > 0 else None))
    me.add_event(features, labels_train[i])
me.end_add_event()
me.train()

Y = set([SUN, RAIN])
print eval_model_sentence(observations=[1, 6, 1, 6], model=me)
print get_viterbi_path_memm(me=me, x=[1, 6, 1, 6], Y=Y)

me.save('sunny.dat')
class MMEMAlgorithm(object):  # implementation of the algorithm based on HMM

    def __init__(self, compute_features, N_filter_func=N_default):
        self.filter_func = N_filter_func
        self.me = MaxentModel()
        self.num_train_iters = 2000
        self.compute_features = compute_features

    def load_memm_model(self, filename):
        self.me.load(filename)

    def init(self):
        pass

    def train_model_file_list(self, corpus_filelist, ambiguity_dir):
        self.me.begin_add_event()
        for corpus_file in corpus_filelist:
            print "Training on file {0}".format(corpus_file)
            sentence = []
            morph_analys_file = os.path.join(ambiguity_dir, os.path.basename(corpus_file))
            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func=self.filter_func) \
                if os.path.exists(morph_analys_file) else None
            if morph_analys_tokens:
                print "Using mystem features on file {0}".format(morph_analys_file)
            gold_tokens = get_tokens_from_file(corpus_file, N_filter_func=self.filter_func)
            for corpus_token in gold_tokens:
                morph_analys_token = morph_analys_tokens.next() if morph_analys_tokens else None
                gold_token_word = corpus_token[0].word
                morph_analys_token_word = morph_analys_token[0].word if morph_analys_token else None
                if morph_analys_token_word:
                    if gold_token_word != morph_analys_token_word:
                        '''
                        if ('-' in gold_token_word and '-' not in morph_analys_token_word) or \
                           ('\'' in gold_token_word and '\'' not in morph_analys_token_word):
                            morph_analys_token = morph_analys_tokens.next()
                        if ('.' in gold_token_word):
                            cnt_dots = '.'.count(gold_token_word)
                            for i in xrange(0, cnt_dots):
                                morph_analys_token = morph_analys_tokens.next()
                        '''
                        print >>sys.stderr, u"Start skipping sentence. Gold token wordform {0} morph token wordform {1}".format(gold_token_word, morph_analys_token_word)
                        sentence = []
                        # Skip the rest of this sentence in the gold stream, then
                        # re-align the morphological analysis stream with it.
                        try:
                            next_gold = gold_tokens.next()
                            while next_gold != [EOS_TOKEN]:
                                next_gold = gold_tokens.next()
                            next_gold = gold_tokens.next()
                            next_morph = morph_analys_tokens.next()
                            while next_morph[0].word != next_gold[0].word:
                                next_morph = morph_analys_tokens.next()
                        except StopIteration:
                            break
                if corpus_token[0] == EOS_TOKEN and len(sentence) > 0:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i, token_info in enumerate(sentence):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]] \
                            if token_info[1] and morph_analys_token else None
                        if token_info[1] is not None:
                            if gold_token.word != token_info[1][0].word:
                                print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0} morph analysis token : {1}".format(gold_token.word, token_info[1][0].word)
                                morph_analysises = None
                        word_features = list(self.compute_features(
                            sentence=words, i=i,
                            prev_label=labels[i - 1] if i > 0 else None,
                            analysises=morph_analysises, labels=labels))
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram)
                    sentence = []
                else:
                    sentence.append((corpus_token[0], morph_analys_token))
        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train(self.num_train_iters, 'lbfgs', 0.0)
        maxent.set_verbose(0)

    def train_model(self, corpus_dir, ambiguity_dir):
        self.me.begin_add_event()
        #self.B = train_B_corpus(corpus_dir = corpus_dir,N_filter_func = N_filter_func)
        sentence = []
        corpus_files = get_corpus_files(corpus_dir)
        for corpus_file in corpus_files:
            morph_analys_file = os.path.join(ambiguity_dir, os.path.basename(corpus_file))
            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func=self.filter_func)
            for corpus_token in get_tokens_from_file(corpus_file, N_filter_func=self.filter_func):
                morph_analys_token = morph_analys_tokens.next()
                if corpus_token[0] == EOS_TOKEN:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i, token_info in enumerate(sentence):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]]
                        if gold_token.word != token_info[1][0].word:
                            print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0} morph analysis token : {1}".format(gold_token.word, token_info[1][0].word)
                            morph_analysises = None
                        word_features = list(self.compute_features(
                            sentence=words, i=i,
                            prev_label=labels[i - 1] if i > 0 else None,
                            analysises=morph_analysises, labels=labels))
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram)
                    sentence = []
                else:
                    sentence.append((corpus_token[0], morph_analys_token))
        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train(50, 'lbfgs', 0.0)
        maxent.set_verbose(0)

    def load_model(self, memm_filename):
        self.me.load(memm_filename)

    def save_model(self, memm_filename):
        self.me.save(memm_filename)
        #dump_object( B_stat_filename, self.B )

    def remove_ambiguity_file(self, file, outfile):
        out_f = codecs.open(outfile, 'w', 'utf-8')
        sentence = []
        for token in get_tokens_from_file(file, N_filter_func=self.filter_func):
            if len(token) == 1 and token[0] == EOS_TOKEN:
                if len(sentence) > 0:
                    no_ambig_tokens = self.remove_ambiguity(sentence)
                    for no_ambig_token in no_ambig_tokens:
                        out_f.write(u"{0}\t{1}={2}\r\n".format(no_ambig_token[0], 'nolemma', no_ambig_token[1]))
                    out_f.write('\r\n')
                    sentence = []
                    continue
                else:
                    sentence = []
                    continue
            sentence.append((token[0].word, token))
        out_f.close()

    def remove_ambiguity_dir(self, dir):
        pass

    def remove_ambiguity(self, variants):
        """
        Structure: variants = [ (word_form, [tokens]), (...), () ]
        """
        words = [variant[0] for variant in variants]
        analysises = [[token.gram for token in variant[1]] for variant in variants]

        viterbi_layers = [None for i in xrange(len(words))]
        viterbi_backpointers = [None for i in xrange(len(words) + 1)]

        # Compute first layer directly.
        viterbi_layers[0] = self.me.eval_all(list(self.compute_features(
            sentence=words, i=0, prev_label=None, analysises=analysises[0], labels=None)))
        filtered_viterbi_layer = dict((k, v) for k, v in viterbi_layers[0] if k in analysises[0])
        viterbi_layer_0_prob = sum([v for v in filtered_viterbi_layer.values()])
        viterbi_layers[0] = dict((k, math.log(v / viterbi_layer_0_prob))
                                 for k, v in filtered_viterbi_layer.items())
        viterbi_backpointers[0] = dict((k, None) for k, v in viterbi_layers[0].iteritems())

        # Compute intermediate layers.
        for i in xrange(1, len(words)):
            viterbi_layers[i] = defaultdict(lambda: float("-inf"))
            viterbi_backpointers[i] = defaultdict(lambda: None)
            for prev_label, prev_logprob in viterbi_layers[i - 1].iteritems():
                features = self.compute_features(sentence=words, i=i, prev_label=prev_label,
                                                 analysises=analysises[i], labels=None)
                features = list(features)
                distribution = self.me.eval_all(features)
                distribution = dict((label, prob) for label, prob in distribution
                                    if label in analysises[i])
                distribution_sum = sum([v for v in distribution.values()])
                distribution = dict((k, v / distribution_sum) for k, v in distribution.items())
                for label, prob in distribution.items():
                    logprob = math.log(prob)
                    if prev_logprob + logprob > viterbi_layers[i][label]:
                        viterbi_layers[i][label] = prev_logprob + logprob
                        viterbi_backpointers[i][label] = prev_label

        # Most probable endpoint.
        max_logprob = float("-inf")
        max_label = None
        for label, logprob in viterbi_layers[len(words) - 1].iteritems():
            if logprob > max_logprob:
                max_logprob = logprob
                max_label = label

        # Most probable sequence.
        path = []
        label = max_label
        for i in reversed(xrange(len(words))):
            path.insert(0, label)
            try:
                label = viterbi_backpointers[i][label]
            except KeyError:
                pass

        return zip(words, path)
def train_model(options, iterable):
    model = MaxentModel()
    data = {}
    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()
    data["unigrams"] = dict()

    print >>sys.stderr, "*** Training options are:"
    print >>sys.stderr, " ", options

    print >>sys.stderr, "*** First pass: Computing statistics..."
    unigrams = dict()
    unigrams["B-ORG"] = defaultdict(long)
    unigrams["B-MISC"] = defaultdict(long)
    unigrams["B-LOC"] = defaultdict(long)
    unigrams["B-PER"] = defaultdict(long)
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, " {0:6d} sentences...".format(n)
        previous_word = "^"
        previous_label = "^"
        for word, pos, label in sentence:
            data["word_frequencies"][string.lower(word)] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word in data["labelled_words"]:
                    data["labelled_words"][string.lower(word)][label] += 1
                else:
                    data["labelled_words"][string.lower(word)] = defaultdict(long)
                    data["labelled_words"][string.lower(word)][label] = 1
            if label.startswith("B-") and (previous_word != "^"):
                unigrams[label][string.lower(previous_word)] += 1
            previous_label = label
            previous_word = word

    unigram_counters = [Counter(unigrams[key]) for key in unigrams]
    total_count = Counter()
    for counter in unigram_counters:
        total_count += counter
    total_count = dict(total_count)
    inv_total_freq = dict([[key, (math.log(sum(total_count.values()) / total_count[key]) ** 3)]
                           for key in total_count])
    for label in unigrams:
        all_sum = sum([unigrams[label][word] for word in unigrams[label]])
        uni = sorted([[(1.0 * unigrams[label][word] * inv_total_freq[word] / all_sum), word]
                      for word in unigrams[label]])
        uni = [word[1] for word in uni]
        data["unigrams"][label] = uni[-50:]
        # print >>sys.stderr, "*** Collected {0} unigrams for {1}".format(len(data["unigrams"][label]), label)

    print >>sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, " {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_features(data, words, poses, i,
                                        labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >>sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))

    print >>sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >>sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
from maxent import MaxentModel

for i in range(5):
    m = MaxentModel()
    context = []
    m.begin_add_event()
    with open('contexts/contexts' + str(i + 1) + '.txt', 'r') as f:
        for line in f:
            line = line.rstrip()
            try:
                ind = line.index(':')
                if line[:ind] != '':
                    rel = line[:ind]
                    l = eval(line[ind + 1:])
                    m.add_event(l, rel, 1)
            except:
                pass
    m.end_add_event()
    m.train(100, 'lbfgs')
    s_name = "models/lbfgs/model" + str(i + 1)
    m.save(s_name)
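A small, assumed follow-up: reload one of the per-fold models saved above and score a context. The feature strings are placeholders for whatever the contextsN.txt lines actually contain.

# Hypothetical scoring pass with one of the models saved above.
m = MaxentModel()
m.load("models/lbfgs/model1")
print m.eval_all(["feat_a", "feat_b"])  # (relation, probability) pairs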