Example #1
def main():
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-i", "--input", type="string",
                      help="test data as input")
    parser.add_option("-o", "--output", type="string",
                      help="write detector result to OUTPUT")
    parser.add_option("-m", "--model", type="string",
                      help="load trained model from MODEL")
    (options, args) = parser.parse_args()

    global m
    model = options.model
    m = MaxentModel()
    m.load(model)

    #in_file = sys.stdin
    if options.input:
        in_file = open(options.input)
    else:
        print >> sys.stderr, 'input test data not given'
        sys.exit(1)

    if len(args) >= 1:
        tag_in_file = open(args[0])

    out_file = sys.stdout

    if options.output:
        out_file = open(options.output, 'w')

    predict_file(in_file, out_file)
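predict_file is not defined in this snippet. A minimal, hypothetical version, assuming each input line holds space-separated feature strings and reusing the global model m loaded above, could look like this:

def predict_file(in_file, out_file):
    # hypothetical helper: write the most probable label for each feature line
    for line in in_file:
        features = line.split()
        if not features:
            out_file.write('\n')
            continue
        label, prob = max(m.eval_all(features), key=lambda pair: pair[1])
        out_file.write('%s\n' % label)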
Example #2
 def __init__(self, restrictFeatures=False):
     Classifier.__init__(self)
     print "MaximumEntropy: Creating model"
     self.model = MaxentModel()
     self.model.verbose = 1
     self.restrictFeatures = restrictFeatures
     self.model.begin_add_event()
Example #3
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    # ne_labels = eval_ne_binary_model(options, iterable)

    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >>sys.stderr, "*** Evaluating..."
    data["sentences"] = iterable
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        # data["ne_labels"] = ne_labels[n]
        data["sentence_number"] = n
        data["double_quotes"] = False
        data["single_quotes"] = False

        labels = eval_model_sentence(options, data, model, words, poses)

        for word, pos, label in zip(words, poses, labels):
            print label
        print
Example #4
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)

        ## some post-processing to remove sequences like: O I-ORG O
        previous_label = '^'

        for i in xrange(0, len(words)):
            label = labels[i]            
            if (label.startswith('I-')) and ((previous_label == 'O') or (previous_label == '^')):
                label = 'B' + label[1:]
            # if (i + 1 < len(words)) and (labels[i + 1] != 'O') and (labels[i] != 'O') and (labels[i + 1][0] != 'B') and (labels[i + 1][2:] != labels[i][2:]):
                # label = labels[i][:1] + labels[i + 1][2:]
            print label
            previous_label = label
        print
Example #5
def tag_test(test_feature_file, trained_model_file,  tag_test_set_file):
  fr = codecs.open(test_feature_file, 'r', 'utf-8')
  fw = codecs.open(tag_test_set_file, 'w', 'utf-8')
  m = MaxentModel()
  m.load(trained_model_file)
  contents = fr.read()
  feature_list = contents.split('\r')
  feature_list.remove('\n')
  for feature in feature_list:
    if (feature == 'split'):
      fw.write('\n\n\n')
      continue
    str_feature = []
    u_feature = feature.split(' ')
    for item in u_feature:
      str_feature.append(item.encode('utf-8'))
    label_prob_list = m.eval_all(str_feature)
    label = max_prob(label_prob_list)

    try:
      new_tag = str_feature[2].split('=')[1] + '/' + label
    except IndexError:
      print str_feature
    fw.write(new_tag.decode('utf-8'))
    pre_tag = label
  return feature_list
Example #6
def tag_test(test_feature_file, trained_model_file, tag_test_set_file):
    fin = codecs.open(test_feature_file, 'r', 'utf-8')
    fout = codecs.open(tag_test_set_file, 'w', 'utf-8')
    m = MaxentModel()
    m.load(trained_model_file)
    contents = fin.read()
    feature_list = contents.split('\r')
    feature_list.remove('\n')
    for feature in feature_list:
        if (feature == 'split'):
            fout.write('\n\n\n')
            continue
        str_feature = []
        u_feature = feature.split(' ')
        for item in u_feature:
            str_feature.append(item.encode('utf-8'))
        label_prob_list = m.eval_all(str_feature)
        label = max_prob(label_prob_list)

        try:
            new_tag = str_feature[2].split('=')[1] + '/' + label
        except IndexError:
            print str_feature
        fout.write(new_tag.decode('utf-8'))
        pre_tag = label

    return feature_list
Example #7
def baseline(sentences, labels):

    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()

    with open(sentences) as file_content:
        sentences = file_content.readlines()
    with open(labels) as file_content:
        labels = file_content.readlines()

    for i in xrange(0, 3000):
        m.add_event(sentences[i].split(" "), labels[i].strip())

    m.end_add_event()

    m.train()

    correct = 0
    false = 0

    for i in xrange(3000, len(sentences)):
        result = m.eval(sentences[i].split(" "), "1")
        result = int(round(result))
        label = int(labels[i])
        if result == label:
            correct = correct + 1
        else:
            false = false + 1

    print "correct   :", correct
    print "false     :", false

    print("accuracy  : {:.2f}%".format(correct * 100.0 / (correct + false)))
Example #8
def simple_train(event_list):
    m = MaxentModel()
    m.begin_add_event()
    for e in event_list:
        m.add_event(e[0], e[1])
    m.end_add_event()
    #maxent.set_verbose(1)
    m.train(30, 'lbfgs', 2)
    return m
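A small usage sketch for simple_train above; the feature strings and labels below are invented purely for illustration, and eval_all is used as in the other examples to get (label, probability) pairs back.

# illustrative (context, label) pairs; any list of feature-string lists works
events = [(['w=lunes', 'prev=^'], 'O'),
          (['w=Madrid', 'prev=O'], 'B-LOC'),
          (['w=es', 'prev=B-LOC'], 'O')]
m = simple_train(events)
# rank the labels for an unseen context
print m.eval_all(['w=Madrid', 'prev=O'])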
Example #9
class MaximumEntropyClassifier(Classifier):
    def __init__(self, restrictFeatures=False):
        Classifier.__init__(self)
        print "MaximumEntropy: Creating model"
        self.model = MaxentModel()
        self.model.verbose = 1
        self.restrictFeatures = restrictFeatures
        self.model.begin_add_event()

    def addToIndex(self, trainingset):
        for (vec,cls) in trainingset:
            self.addFeatureVector(vec,cls)
        
    def addFeatureVector(self, vec, cls, value=1, binary=False):
        for key in vec.keys():
            if key not in self.restrictFeatures:
                del vec[key]
        context = vec.keys()
        label = "%s" % cls
        self.model.add_event(context,label,value)

    def compile(self):
        self.model.end_add_event()
        self.model.train(30, "lbfgs", 2, 1E-03)
        #self.model.train(100, 'gis', 2)
        print "> Models trained"

    def classify(self, point, label='1', binary=False):
        result = self.model.eval(point.keys(), label)
        if result >= 0.5:
            return 1
        return -1
Example #10
class MaximumEntropyClassifier(Classifier):
    def __init__(self, restrictFeatures=False):
        Classifier.__init__(self)
        print "MaximumEntropy: Creating model"
        self.model = MaxentModel()
        self.model.verbose = 1
        self.restrictFeatures = restrictFeatures
        self.model.begin_add_event()

    def addToIndex(self, trainingset):
        for (vec, cls) in trainingset:
            self.addFeatureVector(vec, cls)

    def addFeatureVector(self, vec, cls, value=1, binary=False):
        for key in vec.keys():
            if key not in self.restrictFeatures:
                del vec[key]
        context = vec.keys()
        label = "%s" % cls
        self.model.add_event(context, label, value)

    def compile(self):
        self.model.end_add_event()
        self.model.train(30, "lbfgs", 2, 1E-03)
        #self.model.train(100, 'gis', 2)
        print "> Models trained"

    def classify(self, point, label='1', binary=False):
        result = self.model.eval(point.keys(), label)
        if result >= 0.5:
            return 1
        return -1
Example #11
def main():
  if len(sys.argv) != 2:
    print "Usage: MaxentTest.py modelName"
    sys.exit(1)
  
  model = MaxentModel()
  model.load(sys.argv[1])
  context = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
  label = model.eval(context, str(0))
  #result = model.eval_all(context)
  print "Result: ", label
Example #12
def train(corpus, *args):
	projections = {}
	model = MaxentModel()
	model.begin_add_event()
	for datums in corpus.values():
		for datum in datums:
			projection = datum2features(datum)
			model.add_event(datum2features(datum), datum.is_related, long(100 * float(datum._trust)))
			projections[datum.row_in_corpus] = projection
	model.end_add_event()
	model.train(*args)
	return model, projections
Example #13
def main():
    global feat_dict, m

    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f", "--file", type="string", dest="filename",
                      metavar="FILE",
                      help="train a Maxent model with data from FILE")
    parser.add_option("-g", "--gaussian", type="float", default=0.0,
                      help="apply Gaussian penalty when training [default=0.0]")
    parser.add_option("--iters", type="int", default=15,
                      help="how many iterations are required for training [default=15]")
    (options, args) = parser.parse_args()
    #}}}

    if options.filename:
        file = open(options.filename)
    else:
        print 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)

    model_name = args[0]

    global get_context
    get_context = Generator.get_context_wordform  # change this to use different features

    print 'First pass: gather features'
    extract_feature(file, gather_feature)
    feature_file = model_name + '.features'
    print 'save features to file %s' % feature_file
    save_features(feature_file)

    print 'feat_dict: ', feat_dict

    file.seek(0)
    print 'Second pass: training model...'
    m = MaxentModel()
    m.begin_add_event()
    extract_feature(file, add_event)
    m.end_add_event()

    m.train(options.iters, 'lbfgs', options.gaussian)
    print 'training finished'

    print 'saving tagger model to %s' % model_name,
    m.save(model_name)
    print 'done'
Example #14
def main():
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-o",
                      "--output",
                      type="string",
                      help="write tagged result to OUTPUT")
    parser.add_option("-m",
                      "--model",
                      type="string",
                      help="load trained model from MODEL")
    parser.add_option("-t",
                      "--test",
                      action="store_true",
                      default=0,
                      help="test mode, include original tag in output")
    parser.add_option("-v",
                      "--verbose",
                      action="store_true",
                      dest="verbose",
                      default=1)
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose")
    parser.add_option("-T",
                      "--type",
                      type="int",
                      default=None,
                      help="choose context type")

    (options, args) = parser.parse_args()

    if not options.model:
        print >> sys.stderr, 'Tagger model name not given!'
        parser.print_usage()
        sys.exit(1)

    model = options.model
    tag_dict = cPickle.load(open(model + '.tagdict'))

    me = MaxentModel()
    me.load(model)
    tagger = postagger.PosTagger(me, tag_dict, options.type)

    tag_in_file = sys.stdin
    if len(args) >= 1:
        tag_in_file = open(args[0])

    tag_out_file = sys.stdout
    if options.output:
        tag_out_file = open(options.output, 'w')

    tag_file(tagger, tag_in_file, tag_out_file, options.test)
Example #15
def main():
    if len(sys.argv) != 2:
        print "Usage: MaxentTest.py modelName"
        sys.exit(1)

    model = MaxentModel()
    model.load(sys.argv[1])
    context = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0
    ]
    label = model.eval(context, str(0))
    #result = model.eval_all(context)
    print "Result: ", label
Example #16
def train(corpus, *args):
    projections = {}
    model = MaxentModel()
    model.begin_add_event()
    for datums in corpus.values():
        for datum in datums:
            projection = datum2features(datum)
            model.add_event(datum2features(datum), datum.is_related, long(100 * float(datum._trust)))
            projections[datum.row_in_corpus] = projection
    model.end_add_event()
    model.train(*args)
    return model, projections
Example #17
 def __init__(self, restrictFeatures=False):
     Classifier.__init__(self)
     print "MaximumEntropy: Creating model"
     self.model = MaxentModel()
     self.model.verbose = 1
     self.restrictFeatures = restrictFeatures
     self.model.begin_add_event()
Example #18
def predict_tags(best_1_name, best_1_org, best_3_name, best_5_org, sentences,
                 f, op):
    rel = [
        'others', 'director', 'analyst', 'advisor', 'head', 'manager',
        'spokesperson', 'founder', 'professor', 'leave', 'lawyer'
    ]
    me = MaxentModel()
    me.load('../training/models/lbfgs/model3')
    count = 0
    for n1, o1, n3, o5, sent in zip(best_1_name, best_1_org, best_3_name,
                                    best_5_org, sentences):
        if len(n3) == 0 or len(o5) == 0:
            op.write(str((n1, o1)) + '\n')
        else:
            j = ('', '', '', 0.0)
            d = {}
            for name in n3:
                for org in o5:
                    context = get_context(name, org, sent)

                    relation = ''
                    prob = 0.0
                    if context != None:

                        for r in rel:
                            y = me.eval(context, r)
                            if y > prob:
                                prob = y
                                relation = r
                        #set_r.append((name,org,relation,prob))
                        d[(name, org)] = relation
                        if prob > j[3] and relation != 'others':
                            j = (name, org, relation, prob)

                    else:

                        d[(name, org)] = 'others'
            #print str(count)+' before : '+str(n1)+'\t'+str(o1)
            resolve_conflicts(n1, o1, j)
            #print str(count)+' after : '+str(n1)+'\t'+str(o1)
            #x = raw_input()
            op.write(str((n1, o1)) + '\n')
            f.write(str(j) + '\n')
        count = count + 1
Example #19
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)

        for word, pos, label in zip(words, poses, labels):
            print label
        print
Example #20
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    print >> sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >> sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)

        for word, pos, label in zip(words, poses, labels):
            print label
        print
Example #21
    def trainOn(self, train_groups):
        ''' Train on the train set and return the trained model '''
        maxent.set_verbose(1)

        m = MaxentModel()

        m.begin_add_event()

        for pair in train_groups:
            m.add_event(pair[0], pair[1])

        m.end_add_event()

        m.train(20, 'lbfgs', 1e-04, 1e-03)

        return m
Example #22
def eval_ne_binary_model(options, iterable):
    model = MaxentModel()
    data = {}

    predicted_labels = []

    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".ne.binary.maxent")
    with open(options.model + ".ne.binary.data", "r") as handle:
        data = cPickle.load(handle)

    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_ne_binary_model_sentence(options, data, model, words, poses)
        predicted_labels += [labels]

    return predicted_labels
Example #23
 def __init__(self, dictpath):
     self.tokenizer = Tokenizer.Tokenizer()
     self.documents = defaultdict(Document.Document)
     self.statistics = defaultdict(int)
     self.dictionary = Dictionary.Dictionary(dictpath)
     self.dictionary.load('syllable')
     self.dictionary.load('token')
     self.dictionary.load('type')
     self.dictionary.load('length')
     self.model = MaxentModel()
     self.threshold = 0.0
Example #24
 def __init__(self, dictpath):
     util.Logger.info('Initializing sbd instance...')
     self.tokenizer = Tokenizer.Tokenizer()
     self.statistics = defaultdict(int)
     self.dictionary = Dictionary.Dictionary(dictpath)
     self.dictionary.load('syllable')
     self.dictionary.load('token')
     self.dictionary.load('type')
     self.dictionary.load('length')
     self.model = MaxentModel()
     self.threshold = 0.0
     util.Logger.info('sbd instance Initialized.')
Example #25
def main ():
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-o", "--output", type="string",
            help="write tagged result to OUTPUT")
    parser.add_option("-m", "--model", type="string", 
            help="load trained model from MODEL")
    parser.add_option("-t", "--test", action="store_true",
            default=0, help="test mode, include original tag in output")
    parser.add_option("-v", "--verbose",
                    action="store_true", dest="verbose", default=1)
    parser.add_option("-q", "--quiet",
                    action="store_false", dest="verbose")
    parser.add_option("-T","--type",  type="int", default=None, 
            help="choose context type")

    (options, args) = parser.parse_args()

    if not options.model:
        print >> sys.stderr, 'Tagger model name not given!'
        parser.print_usage()
        sys.exit(1)

    model = options.model
    tag_dict = cPickle.load(open(model + '.tagdict'))

    me = MaxentModel()
    me.load(model)
    tagger = postagger.PosTagger(me, tag_dict, options.type)

    tag_in_file = sys.stdin
    if len(args) >=1:
        tag_in_file = open(args[0])

    tag_out_file = sys.stdout
    if options.output:
        tag_out_file = open(options.output, 'w')

    tag_file(tagger, tag_in_file, tag_out_file, options.test)
Example #26
    def trainOn(self, train_groups, n_itr = 15, var = 1, tol = 1e-5):
        ''' Train on the train set and return the trained model '''

        print "training set:", Counter(zip(*train_groups)[1]).most_common()

        maxent.set_verbose(1)

        m = MaxentModel()

        m.begin_add_event()

        for pair in train_groups:
            m.add_event(pair[0], pair[1])

        n_cutoff = 1
        m.end_add_event(n_cutoff)

        m.train(n_itr, 'lbfgs', var, tol)

        return m
Example #27
from maxent import MaxentModel

for i in range(5):
    m = MaxentModel()
    context = []
    m.begin_add_event()
    with open('contexts/contexts' + str(i + 1) + '.txt', 'r') as f:
        for line in f:
            line = line.rstrip()
            try:
                ind = line.index(':')
                if line[:ind] != '':
                    rel = line[:ind]
                    l = eval(line[ind + 1:])
                    m.add_event(l, rel, 1)
            except:
                pass
    m.end_add_event()

    m.train(100, 'lbfgs')
    s_name = "models/lbfgs/model" + str(i + 1)
    m.save(s_name)
Example #28
def training(feature_file_path, trained_model_file, times):
  m = MaxentModel()
  fr = codecs.open(feature_file_path, 'r', 'utf-8')
  all_list = []
  m.begin_add_event()
  for line in fr:
    line = line.rstrip()
    line_list = line.split(' ')
    str_list = []
    for item in line_list:
      str_list.append(item.encode('utf-8'))
    all_list.append(str_list)
    m.add_event(str_list[1:], str_list[0], 1)
  m.end_add_event()
  print 'begin training'
  m.train(times, "lbfgs")
  print 'end training'
  m.save(trained_model_file)
  return all_list
Example #29
def test():
    maxent.set_verbose(1)

    m = MaxentModel()

    m.begin_add_event()
    m.add_event(['1'], '1')
    m.add_event(['2'], '2')
    m.add_event(['3'], '3')
    m.end_add_event()

    m.train(30, 'lbfgs', 2, 1e-03)

    for x in map(str, range(1,4)):
        print "tested on:", x, "predicted:", m.eval_all([x])
Example #30
def train_model(options, iterable):
    model = MaxentModel()
    data = {}

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >> sys.stderr, "*** Training options are:"
    print >> sys.stderr, "   ", options

    print >> sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word in data["labelled_words"]:
                    data["labelled_words"][word][label] += 1
                else:
                    data["labelled_words"][word] = defaultdict(long)

    print >> sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_features(data, words, poses, i,
                                        labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >> sys.stderr, "*** Collected {0} features.".format(
        len(data["feature_set"]))

    print >> sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >> sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
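Regarding the XXX(sandello) note above: a defaultdict built from a lambda cannot be pickled because the lambda has no importable name. A minimal sketch of the usual workaround is a module-level factory function; the name _long_counter below is invented purely for illustration.

from collections import defaultdict
import cPickle

def _long_counter():
    # illustrative factory: module-level functions pickle by reference, unlike lambdas
    return defaultdict(long)

labelled_words = defaultdict(_long_counter)
labelled_words["Madrid"]["B-LOC"] += 1
# picklable, since both defaultdict and _long_counter can be looked up by name
serialized = cPickle.dumps(labelled_words)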
Example #31
class BatchSBD:
    def __init__(self, dictpath):
        util.Logger.info('Initializing sbd instance...')
        self.tokenizer = Tokenizer.Tokenizer()
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0
        util.Logger.info('sbd instance Initialized.')

    def load(self, modelname=None, threshold=0.0):
        util.Logger.info('Loading model...')
        assert(modelname != None)
        assert(modelname.strip() != '')
        try:
            util.Logger.debug("Started to load model...")
            self.model.load(modelname)
            self.threshold = threshold
            util.Logger.debug("Completed to load model '%s'" % modelname)
        except:
            raise
        util.Logger.info('Model loaded.')

    def run(self, input=None, output=None, syllable_length=1, merged_use=False):
        util.Logger.info('run ' + input + ',' + output)
        assert(input != None)
        assert(input.strip() != '')
        assert(output != None)
        assert(output.strip() != '')
        try:
            # load document 
            util.Logger.info("Started to load document.")
            document = Document.Document()
            ifile = open(input)
            # build document
            util.Logger.info("Adding token to document.")
            self.tokenizer.clear()
            for token in self.tokenizer.tokenize(ifile):
                document.add(token)
            ifile.close()
            # detect sentence boundaries
            util.Logger.info("Detecting sentence boundaries.")
            ofile = open(output, "w+")
            line = ''
            lineno = 1
            for id in range(document.length()):
                prev = document.prev(id)
                curr = document.token(id)
                next = document.next(id)
                eos = False
                # check every position
                eos = self.eval(document, id, prev, curr, next, syllable_length, merged_use)
                if eos == None:
                    continue; # null field found
                line += curr.value
                if curr.isEoe():
                    line += ' '
                if eos and len(line.strip()) > 0:
                    if line[0:1] == ' ':
                        ofile.write('\n')
                    ofile.write(line.strip() + '\n')
                    line = ''
            ofile.write(line.strip() + '\n')
            ofile.close()
            document.clear()
            util.Logger.info("Detecting '%s' document completed." % input)
        except:
            raise

    def eos(self, context):
        label = 'yes'
        prob = self.model.eval(context, label)
        buf = ''
        if prob >= self.threshold:
            return True
        else:
            return False

    # append property into list-buf
    def append_maxent_parameter(self, list, i, property):
        i += 1
        list.append(str(i) + ':' + str(property))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken, syllable_length=0, merged_use=False):
        dict = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0: prefixName = default * syllable_length
            else: prefixName = prevToken.syllable(-1*(length+1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dict.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0: suffixName = default * syllable_length
            else: suffixName = nextToken.syllable(length+1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dict.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dict.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0: current_token_name = default
        else: current_token_name = currToken.value
        current_token_prob = dict.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0: prefix_token_name = default
        else: prefix_token_name = prevToken.value
        prefix_token_prob = dict.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0: suffix_token_name = default
        else: suffix_token_name = nextToken.value
        suffix_token_prob = dict.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        eos = self.eos(context)
        return eos

    def calc(self, answer, rule):
        if answer == True and rule == True:
            result = 'TP'
        elif answer == True and rule == False:
            result = 'TN'
        elif answer == False and rule == True:
            result = 'FP'
        else:
            result = 'FN'
        self.statistics[result] += 1

    def summary(self):
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        tn = self.statistics['TN']
        fp = self.statistics['FP']
        util.Logger.info("tp:", tp, "tn:", tn, "fp:", fp)
        if (tp + tn) > 0:
            precision = tp * 1.0 / (tp + tn)
        if (tp + fp) > 0:
            recall = tp * 1.0 / (tp + fp)
        if (precision+recall) > 0:
            fscore = (2*precision*recall) / (precision+recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
Example #32
def main():
  if len(sys.argv) != 4:
    print "Usage: MaxentTrain.py features.mat labels.mat modelName"
    sys.exit(1)
  
  features = featureMatrice(sys.argv[1])
  labels = labelLst(sys.argv[2])
  
  model = MaxentModel()
  # add data into model
  model.begin_add_event()
  for i in range(len(labels)):
    model.add_event(features[i], str(labels[i]), 1)
  
  model.end_add_event()
  
  # start training
  #model.train()
  model.train(1000, "gis", 2)
  #model.train(30, "lbfgs")
  
  # save the model
  model.save(sys.argv[3])
Example #33
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Imports
import sys, os

# Load MaxEnt models
corpusPath = os.environ.get('CORPUS_PATH')
from maxent import MaxentModel

maxEntModel = MaxentModel()
maxEntModel.load(corpusPath + '/model_markers.txt')

for trainLine in sys.stdin.readlines():
    trainCols = trainLine.split('\t')
    modelMarkerProbas = maxEntModel.eval_all(trainCols[1:])
    probaFeats = []
    for modelMarkerProba in modelMarkerProbas:
        if modelMarkerProba[1] > 0.00001:
            probaFeats.append(modelMarkerProba[0] + ':' +
                              str(modelMarkerProba[1]))
    print trainCols[0] + '\t' + '\t'.join(probaFeats)
Example #34
def train_model(options, iterable):
    model = MaxentModel()
    data = {}

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    data["geo"] = set()
    data["names"] = set()
    data["surnames"] = set()

    with open("geo_spanish.txt", "r") as geo:
        for name in geo:
            data["geo"].add(name.rstrip())

    with open("name_spanish.txt", "r") as names:
        for name in names:
            data["names"].add(name.rstrip())

    with open("surname_spanish.txt", "r") as surnames:
        for surname in surnames:
            data["surnames"].add(surname.rstrip())

    print >> sys.stderr, "*** Training options are:"
    print >> sys.stderr, "   ", options

    print >> sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word in data["labelled_words"]:
                    data["labelled_words"][word][label] += 1
                else:
                    data["labelled_words"][word] = defaultdict(long)

    print >> sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_features(data, words, poses, i, labels[i - 1] if i >= 1 else "^", labels[i])
            features = list(features)
            if len(features) > 0:
                model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >> sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))

    print >> sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >> sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
Example #35
def training(feature_file_path, trained_model_file, times):
    m = MaxentModel()
    fin = codecs.open(feature_file_path, 'r', 'utf-8')
    all_list = []
    m.begin_add_event()
    for line in fin:
        line = line.rstrip()
        line_list = line.split(' ')
        str_list = []
        for item in line_list:
            str_list.append(item.encode('utf-8'))
        all_list.append(str_list)
        m.add_event(str_list[1:], str_list[0], 1)
    m.end_add_event()
    print 'begin training'
    m.train(times, "lbfgs")
    print 'end training'
    m.save(trained_model_file)
    return all_list
Example #36
class MaxentBasedSBD:
    def __init__(self, dictpath):
        self.tokenizer = Tokenizer.Tokenizer()
        self.documents = defaultdict(Document.Document)
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0

    def set(self, modelname=None, threshold=0.0, filename=None):
        assert(modelname != None)
        assert(modelname.strip() != '')
        assert(filename != None)
        assert(filename.strip() != '')
        try:
            util.Logger.debug("Started to load model...")
            self.model.load(modelname)
            self.threshold = threshold
            util.Logger.debug("Completed to load model '%s'" % modelname)
        except:
            raise
        try:
            util.Logger.debug("Started to load document...")
            document = Document.Document()
            file = open(filename)
            for token in self.tokenizer.tokenize(file):
                document.add(token)
            file.close()
            self.documents[filename] = document
            util.Logger.debug("Competed to load document '%s'" % filename)
        except:
            raise

    def get(self, filename=None):
        assert(filename != None)
        assert(filename.strip() != '')
        if filename in self.documents:
            return self.documents[filename]
        else:
            return Document.Document()

    def eos(self, context):
        label = 'yes'
        prob = self.model.eval(context, label)
        buf = ''
        if prob >= self.threshold:
            return True
        else:
            return False

    # append property into list-buf
    def append_maxent_parameter(self, list, i, property):
        i += 1
        list.append(str(i) + ':' + str(property))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken, syllable_length=0, merged_use=False):
        dict = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0: prefixName = default * syllable_length
            else: prefixName = prevToken.syllable(-1*(length+1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dict.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0: suffixName = default * syllable_length
            else: suffixName = nextToken.syllable(length+1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dict.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dict.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0: current_token_name = default
        else: current_token_name = currToken.value
        current_token_prob = dict.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0: prefix_token_name = default
        else: prefix_token_name = prevToken.value
        prefix_token_prob = dict.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0: suffix_token_name = default
        else: suffix_token_name = nextToken.value
        suffix_token_prob = dict.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        eos = self.eos(context)
        return eos

    def calc(self, answer, rule):
        if answer == True and rule == True:
            result = 'TP'
        elif answer == True and rule == False:
            result = 'TN'
        elif answer == False and rule == True:
            result = 'FP'
        else:
            result = 'FN'
        self.statistics[result] += 1

    def summary(self):
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        tn = self.statistics['TN']
        fp = self.statistics['FP']
        util.Logger.info("tp:", tp, "tn:", tn, "fp:", fp)
        if (tp + tn) > 0:
            precision = tp * 1.0 / (tp + tn)
        if (tp + fp) > 0:
            recall = tp * 1.0 / (tp + fp)
        if (precision+recall) > 0:
            fscore = (2*precision*recall) / (precision+recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
Example #37
#!/usr/bin/python2.5

from maxent import MaxentModel

import sys

model_file = sys.argv[1]

m = MaxentModel()
m.load(model_file)
m.save(model_file+'.txt')

Example #38
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Imports
import sys, os

# Load MaxEnt models
corpusPath = os.environ.get('CORPUS_PATH')
from maxent import MaxentModel
maxEntModel = MaxentModel()
maxEntModel.load(corpusPath+'/model_markers.txt')

for trainLine in sys.stdin.readlines():
	trainCols = trainLine.split('\t')
	modelMarkerProbas = maxEntModel.eval_all(trainCols[1:])
	probaFeats = []
	for modelMarkerProba in modelMarkerProbas:
		if modelMarkerProba[1] > 0.00001:
			probaFeats.append(modelMarkerProba[0] + ':' + str(modelMarkerProba[1]))
	print trainCols[0] + '\t' + '\t'.join(probaFeats)

Example #39
    def __init__(self, compute_features, N_filter_func=N_default):
        self.filter_func = N_filter_func
        self.me = MaxentModel()
        self.num_train_iters = 2000
        self.compute_features = compute_features
Example #40
        it_state = backtrace[index][it_state]


    return path[::-1]  # compute max_y P(y|x)


SUN = 'sun'
RAIN = 'rain'
train_data = [(SUN, 10),(SUN,8),(SUN,11), (RAIN,3),(RAIN,2),(SUN,6),(SUN,10),(RAIN,1)]


labels_train = [ i[0] for i in train_data]
icecream_train = [ i[1] for i in train_data]


me = MaxentModel()

me.begin_add_event()

for i,data in enumerate( train_data ):
    features = list(compute_features( icecream_train,  i , labels_train[ i - 1] if i > 0 else None )  )
    me.add_event(features, labels_train[i] )
me.end_add_event()

me.train()


Y = set([ SUN, RAIN ])


print eval_model_sentence( observations = [1,6,1,6], model = me)
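compute_features and eval_model_sentence are not shown in this snippet. A hypothetical compute_features for this ice-cream toy data might simply emit the current observation and the previous label as string features (the feature names are illustrative only):

def compute_features(observations, i, prev_label):
    # toy MEMM feature function: current observation plus previous label
    yield 'icecream=%d' % observations[i]
    yield 'prev_label=%s' % (prev_label if prev_label is not None else '^')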
Example #41
def train_model(options, iterable):
    model = MaxentModel()
    data = {}
    

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()
    data["unigrams"] = dict()

    
    print >>sys.stderr, "*** Training options are:"
    print >>sys.stderr, "   ", options

    print >>sys.stderr, "*** First pass: Computing statistics..."
    
    unigrams = dict()
    unigrams["B-ORG"] = defaultdict(long)
    unigrams["B-MISC"] = defaultdict(long)
    unigrams["B-LOC"] = defaultdict(long)
    unigrams["B-PER"] = defaultdict(long)

    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        previous_word = "^"
        previous_label = "^"
        for word, pos, label in sentence:
            data["word_frequencies"][string.lower(word)] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if string.lower(word) in data["labelled_words"]:
                    data["labelled_words"][string.lower(word)][label] += 1
                else:
                    data["labelled_words"][string.lower(word)] = defaultdict(long)
                    data["labelled_words"][string.lower(word)][label] = 1
            if label.startswith("B-") and (previous_word != "^"):
                unigrams[label][string.lower(previous_word)] += 1
                
            previous_label = label
            previous_word = word
    
    unigram_counters = [Counter(unigrams[key]) for key in unigrams]
    total_count = Counter()
    for counter in unigram_counters:
        total_count += counter

    total_count = dict(total_count)
    inv_total_freq  = dict([[key, (math.log(sum(total_count.values()) /  total_count[key]) ** 3)] for key in total_count])
    
    for label in unigrams:
        all_sum = sum([unigrams[label][word] for word in unigrams[label]])
        uni = sorted([[(1.0 * unigrams[label][word] * inv_total_freq[word] / all_sum ), word] for word in unigrams[label]])
        uni = [word[1] for word in uni]
        data["unigrams"][label] = uni[-50:]
        # print >>sys.stderr, "*** Collected {0} unigrams for {1}".format(len(data["unigrams"][label]), label)

    print >>sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_features(data, words, poses, i, labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >>sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))

    print >>sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >>sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
Example #42
class MMEMAlgorithm(object):

    # implementation of the algorithm based on an HMM (MEMM)
    def __init__(self, compute_features, N_filter_func=N_default):
        self.filter_func = N_filter_func
        self.me = MaxentModel()
        self.num_train_iters = 2000
        self.compute_features = compute_features

    def load_memm_model(self, filename):
        self.me.load( filename  )

    def init(self):
        pass

    


    def train_model_file_list(self, corpus_filelist, ambiguity_dir ):
        self.me.begin_add_event()

        for corpus_file in corpus_filelist:
            print "Training on file {0}".format( corpus_file )
            sentence = []
            morph_analys_file = os.path.join( ambiguity_dir, os.path.basename( corpus_file ) )

            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func = self.filter_func ) if os.path.exists( morph_analys_file ) else None
            if morph_analys_tokens:
                print "Using mystem features on file {0}".format( morph_analys_file )

            gold_tokens = get_tokens_from_file(corpus_file, N_filter_func = self.filter_func )
            for corpus_token in gold_tokens:

                morph_analys_token = morph_analys_tokens.next() if morph_analys_tokens else None


                gold_token_word = corpus_token[0].word
                morph_analys_token_word = morph_analys_token[0].word if morph_analys_token else None
                if morph_analys_token_word:
                    if gold_token_word != morph_analys_token_word:
                        '''
                        if ('-' in gold_token_word and '-' not in morph_analys_token_word) or ('\'' in gold_token_word and '\'' not in morph_analys_token_word):
                            morph_analys_token = morph_analys_tokens.next()
                        if ('.' in gold_token_word):
                            cnt_dots = '.'.count( gold_token_word )
                            for i in xrange( 0, cnt_dots ):
                                morph_analys_token = morph_analys_tokens.next()
                        '''
                        print >>sys.stderr, u"Start skipping sentence. Gold token wordform {0} morph token wordform {1}".format( gold_token_word, morph_analys_token_word )

                        sentence = []
                        try:
                            next_gold = gold_tokens.next()
                            while( next_gold !=  [EOS_TOKEN] ):
                                next_gold = gold_tokens.next()

                            next_gold = gold_tokens.next()
                            next_morph = morph_analys_tokens.next()
                            while( next_morph[0].word != next_gold[0].word ):
                                next_morph = morph_analys_tokens.next()

                        except StopIteration:
                            break



                if corpus_token[0] == EOS_TOKEN and len(sentence) > 0:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i,token_info in enumerate( sentence ):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]] if token_info[1] and morph_analys_token else None

                        if token_info[1] is not None:
                            if gold_token.word != token_info[1][0].word:
                                print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0}     morph analysis token : {1}".format( gold_token.word, token_info[1][0].word )
                                morph_analysises = None

                        word_features = list( self.compute_features( sentence = words, i = i , prev_label= labels[ i - 1 ] if i >0 else None, analysises = morph_analysises, labels = labels) )
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram )
                    sentence = []
                else:
                    sentence.append( (corpus_token[0], morph_analys_token)  )


        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train( self.num_train_iters, 'lbfgs', 0.0 )
        maxent.set_verbose(0)

    def train_model(self, corpus_dir, ambiguity_dir ):
        self.me.begin_add_event()
        #self.B = train_B_corpus(corpus_dir = corpus_dir,N_filter_func = N_filter_func)
        sentence = []

        corpus_files = get_corpus_files(corpus_dir)
        for corpus_file in corpus_files:

            morph_analys_file = os.path.join( ambiguity_dir, os.path.basename( corpus_file ) )
            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func = self.filter_func )

            for corpus_token in get_tokens_from_file(corpus_file, N_filter_func = self.filter_func ):

                morph_analys_token = morph_analys_tokens.next()
                if corpus_token[0] == EOS_TOKEN:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i,token_info in enumerate( sentence ):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]]
                        if gold_token.word != token_info[1][0].word:
                            print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0}     morph analysis token : {1}".format( gold_token.word, token_info[1][0].word )
                            morph_analysises = None
                        word_features = list( self.compute_features( sentence = words, i = i , prev_label= labels[ i - 1 ] if i >0 else None, analysises = morph_analysises, labels = labels) )
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram )
                    sentence = []
                else:
                    sentence.append( (corpus_token[0], morph_analys_token)  )

        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train( 50, 'lbfgs', 0.0 )
        maxent.set_verbose(0)

    def load_model(self, memm_filename):
        self.me.load( memm_filename )

    def save_model(self, memm_filename):
        self.me.save( memm_filename )
        #dump_object( B_stat_filename, self.B )

    def remove_ambiguity_file(self, file, outfile):
        out_f =  codecs.open( outfile, 'w', 'utf-8' )
        sentence = []
        for token in get_tokens_from_file(file, N_filter_func= self.filter_func):
            if len(token) == 1 and token[0] == EOS_TOKEN:
                if len(sentence)>0:
                    no_ambig_tokens = self.remove_ambiguity( sentence )
                    for no_ambig_token in no_ambig_tokens:
                        out_f.write( u"{0}\t{1}={2}\r\n".format(no_ambig_token[0], 'nolemma', no_ambig_token[1] ) )
                    out_f.write('\r\n')
                    sentence = []
                    continue
                else:
                    sentence = []
                    continue

            sentence.append( (token[0].word, token) )
        out_f.close()

    def remove_ambiguity_dir(self, dir):
        pass

    def remove_ambiguity(self, variants):
        """
        variants structure: [ (word_form, [tokens]), (...), (...) ]
        """
        words = [variant[0]  for variant in variants]
        analysises = [[token.gram for token in variant[1]]  for variant in variants ]
        viterbi_layers = [ None for i in xrange(len(words)) ]

        viterbi_backpointers = [ None for i in xrange(len(words) + 1) ]

        # Compute first layer directly.
        viterbi_layers[0] = self.me.eval_all(list(self.compute_features(sentence=words, i = 0 , prev_label= None, analysises = analysises[0], labels = None ) ) )

        filtered_viterbi_layer = dict( (k, v) for k, v in viterbi_layers[0] if k in analysises[0] )
        viterbi_layer_0_prob = sum( [v for v in filtered_viterbi_layer.values() ]  )
        viterbi_layers[0] = dict( (k, math.log(v/viterbi_layer_0_prob) ) for k, v in filtered_viterbi_layer.items() )


        viterbi_backpointers[0] = dict( (k, None) for k, v in viterbi_layers[0].iteritems() )

        # Compute intermediate layers.
        for i in xrange(1, len(words)):
            viterbi_layers[i] = defaultdict(lambda: float("-inf"))
            viterbi_backpointers[i] = defaultdict(lambda: None)
            for prev_label, prev_logprob in viterbi_layers[i - 1].iteritems():
                features = self.compute_features(sentence=words,i= i, prev_label= prev_label, analysises = analysises[i], labels = None)
                features = list(features)
                distribution =  self.me.eval_all(features)
                distribution = dict( (label, prob) for label, prob in  distribution if label in analysises[i])

                distribution_sum = sum( [v for v in distribution.values() ]  )
                distribution = dict( (k, v/ distribution_sum) for k, v in distribution.items() )
                for label, prob in distribution.items():
                    logprob = math.log(prob)
                    if prev_logprob + logprob > viterbi_layers[i][label]:
                        viterbi_layers[i][label] = prev_logprob + logprob
                        viterbi_backpointers[i][label] = prev_label

        # Most probable endpoint.
        max_logprob = float("-inf")
        max_label = None
        for label, logprob in viterbi_layers[len(words) - 1].iteritems():
            if logprob > max_logprob:
                max_logprob = logprob
                max_label = label

        # Most probable sequence.
        path = []
        label = max_label
        for i in reversed(xrange(len(words))):
            path.insert(0, label)
            try:
                label = viterbi_backpointers[i][label]
            except KeyError:
                pass

        return zip(words,path)
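The remove_ambiguity method above is a log-space Viterbi decoder: at every position it keeps, for each candidate gram, the best-scoring predecessor under the renormalized MaxEnt distribution. A minimal self-contained sketch of the same recurrence, with a hypothetical score(prev_label, label, i) standing in for the model's restricted distribution:

import math

def viterbi_decode(words, candidates, score):
    """Log-space Viterbi over per-position candidate labels.

    words      -- word forms
    candidates -- candidates[i] is the list of labels allowed at position i
    score      -- score(prev_label, label, i) -> probability in (0, 1];
                  a stand-in for the renormalized MaxEnt distribution.
    """
    # First layer: no previous label.
    layers = [dict((label, math.log(score(None, label, 0)))
                   for label in candidates[0])]
    backpointers = [dict((label, None) for label in candidates[0])]

    # Intermediate layers: keep the best predecessor for every label.
    for i in range(1, len(words)):
        layer, back = {}, {}
        for label in candidates[i]:
            best_prev, best_logprob = None, float("-inf")
            for prev_label, prev_logprob in layers[i - 1].items():
                logprob = prev_logprob + math.log(score(prev_label, label, i))
                if logprob > best_logprob:
                    best_prev, best_logprob = prev_label, logprob
            layer[label], back[label] = best_logprob, best_prev
        layers.append(layer)
        backpointers.append(back)

    # Follow backpointers from the most probable endpoint.
    label = max(layers[-1], key=layers[-1].get)
    path = [label]
    for i in range(len(words) - 1, 0, -1):
        label = backpointers[i][label]
        path.insert(0, label)
    return zip(words, path)

Unlike the method above, this sketch assumes score is already restricted to the word's allowed analyses, so no filtering of the eval_all output is needed.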
Ejemplo n.º 43
0
def main():
    global feat_dict, me
    # parsing options{{{
    usage = "usage: %prog [options] model"
    parser = OptionParser(usage)
    parser.add_option("-f",
                      "--file",
                      type="string",
                      dest="filename",
                      metavar="FILE",
                      help="train a ME model with data from FILE")
    parser.add_option("--heldout",
                      type="string",
                      metavar="FILE",
                      help="use heldout events from FILE")
    parser.add_option("--extract",
                      type="string",
                      metavar="FILE",
                      help="extract training data to file")
    parser.add_option("--events_out",
                      type="string",
                      help="write training(heldout) events to file")
    parser.add_option(
        "-c",
        "--cutoff",
        type="int",
        default=10,
        help="discard feature with frequency < CUTOFF when training\
            [default=10]")
    parser.add_option(
        "-r",
        "--rare",
        type="int",
        default=5,
        help="use special feature for rare word with frequency < RARE \
            [default=5]")
    parser.add_option("-g",
                      "--gaussian",
                      type="float",
                      default=0.0,
                      help="apply Gaussian penality when training \
            [default=0.0]")
    parser.add_option(
        "-b",
        "--binary",
        action="store_true",
        default=0,
        help="save events in binary format for fast loading [default=off]")
    parser.add_option(
        "--ev_cutoff",
        type="int",
        default=1,
        help="discard event with frequency < CUTOFF when training \
            [default=1]")
    parser.add_option(
        "--iters",
        type="int",
        default=15,
        help="how many iterations are required for training[default=15]")

    parser.add_option("-T",
                      "--type",
                      type="int",
                      default=None,
                      help="choose context type [default for English]")
    (options, args) = parser.parse_args()
    #}}}

    if options.filename:
        file = open(options.filename)
    else:
        print >> sys.stderr, 'training file not given'
        parser.print_usage()
        sys.exit(1)

    if len(args) != 1:
        print >> sys.stderr, 'model name not given'
        parser.print_usage()
        sys.exit(1)
    model_name = args[0]

    global rare_freq
    rare_freq = options.rare

    global get_context

    get_context = postagger.choose_context(options.type)

    # First pass: gather word frequency information {{{
    print 'First pass: gather word frequency information'
    gather_word_freq(file)
    print '%d words found in training data' % len(word_freq)
    word_freq_file = options.filename + '.wordfreq'
    print 'Saving word frequency information to %s' % col(
        word_freq_file, 'lgreen')
    save_word_freq(word_freq_file)
    print
    # }}}

    # Second pass: gather features and tag dict {{{
    file.seek(0)
    print 'Second pass: gather features and tag dict to be used in tagger'
    print 'feature cutoff: %d' % options.cutoff
    print 'rare word freq: %d' % options.rare
    extract_feature(file, gather_feature)
    print '%d features found' % len(feat_dict)
    print '%d words found in pos dict' % len(tag_dict)
    print 'Applying cutoff %d to features' % options.cutoff
    cutoff_feature(options.cutoff, options.rare)
    print '%d features remained after cutoff' % len(feat_dict)
    feature_file = model_name + '.features'
    print 'saving features to file %s' % feature_file
    save_features(feature_file)
    #    tag_dict_file = options.filename + '.tagdict'
    #    print 'Saving tag dict to file %s' % (col(tag_dict_file, 'lgreen'))
    #    save_tag_dict(tag_dict_file)
    tagdict_file = model_name + '.tagdict'
    print 'Saving tag dict object to %s' % col(tagdict_file, 'lgreen'),
    import cPickle
    cPickle.dump(tag_dict, open(tagdict_file, 'w'))
    print 'done'
    #}}}

    if options.extract:
        global training_data
        training_data = open(options.extract, 'w')
        print 'Saving training data to %s' % options.extract
        file.seek(0)
        extract_feature(file, save_training_data)
        sys.exit(0)

    # Third pass: training ME model...{{{
    print 'Third pass: training ME model...'
    me = MaxentModel()
    me.begin_add_event()
    file.seek(0)
    extract_feature(file, add_event)
    #import profile
    #profile.run('me.end_training()','proflog')
    if options.heldout:
        raise NotImplementedError('not tested')
        print 'adding heldout events from %s' % col(options.heldout, 'yellow')
        extract_feature(open(options.heldout), add_heldout_event, True)
    me.end_add_event(options.ev_cutoff)
    if options.events_out:
        raise NotImplementedError('not tested')
        print 'dumping training events to', col(options.events_out, 'lgreen')
        #        import hotshot,  hotshot.stats
        #        prof = hotshot.Profile("dump_events.prof", 1)
        #        prof.runcall(me.dump_events, options.events_out)
        me.dump_events(options.events_out, options.binary)
        sys.exit(0)

    me.train(options.iters, 'lbfgs', options.gaussian)

    print 'training finished'

    print 'saving tagger model to %s' % model_name,
    me.save(model_name)
    print 'done'
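Once training finishes, the same model name points at everything a tagger needs: the parameters saved by me.save(model_name) and the pickled tag dict in model_name + '.tagdict'. A minimal loading sketch, assuming the same maxent binding used above and that context_features is built exactly like the training contexts:

import cPickle
from maxent import MaxentModel

def load_tagger(model_name):
    me = MaxentModel()
    me.load(model_name)                                   # parameters saved above
    tag_dict = cPickle.load(open(model_name + '.tagdict'))
    return me, tag_dict

def best_tag(me, context_features):
    # eval_all returns (tag, probability) pairs; pick the most probable tag.
    return max(me.eval_all(context_features), key=lambda pair: pair[1])[0]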
Ejemplo n.º 44
0
def main():
    if len(sys.argv) != 4:
        print "Usage: MaxentTrain.py features.mat labels.mat modelName"
        sys.exit(1)

    features = featureMatrice(sys.argv[1])
    labels = labelLst(sys.argv[2])

    model = MaxentModel()
    # add data into model
    model.begin_add_event()
    for i in range(len(labels)):
        model.add_event(features[i], str(labels[i]), 1)

    model.end_add_event()

    # start training
    #model.train()
    model.train(1000, "gis", 2)
    #model.train(30, "lbfgs")

    # save the model
    model.save(sys.argv[3])
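Each row fed to add_event above is expected to be a list of feature strings for one instance, with str(labels[i]) as its class. A toy sketch of the same add/train/save cycle with inline data instead of the .mat files (the feature names are made up):

from maxent import MaxentModel

toy_features = [["f:low", "f:short"], ["f:high", "f:long"], ["f:high", "f:short"]]
toy_labels = ["0", "1", "1"]

model = MaxentModel()
model.begin_add_event()
for feats, label in zip(toy_features, toy_labels):
    model.add_event(feats, label, 1)
model.end_add_event()

model.train(30, "lbfgs")   # the script above uses GIS with a Gaussian prior instead
model.save("toy.model")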
Ejemplo n.º 45
0
def train_model(options, iterable):

    # train_ne_binary_model(options, iterable)
    # ne_labels = eval_ne_binary_model_train(options, iterable)

    model = MaxentModel()
    data = {}

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >>sys.stderr, "*** Training options are:"
    print >>sys.stderr, "   ", options

    print >>sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)

        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >>sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()

    data["sentences"] = iterable
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        # sentence_ne_labels = ne_labels[n]
        # data["ne_labels"] = sentence_ne_labels
        data["sentence_number"] = n
        data["double_quotes"] = False
        data["single_quotes"] = False

        for i in xrange(len(labels)):
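            # "^" is the sentinel "previous label" for the first token of a sentence.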
            features = compute_features(data, words, poses, i, labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >>sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))

    print >>sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >>sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
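At evaluation time the features have to be rebuilt with the same convention used above, including the "^" sentinel for the first token. A minimal greedy decoding sketch (compute_features and data are the objects defined above; full Viterbi over the sentence, as in the MEMM example earlier, would normally replace the per-token argmax):

def greedy_decode(data, model, words, poses):
    labels = []
    for i in xrange(len(words)):
        prev_label = labels[i - 1] if i >= 1 else "^"
        features = list(compute_features(data, words, poses, i, prev_label))
        # eval_all returns (label, probability) pairs for the current token.
        best = max(model.eval_all(features), key=lambda pair: pair[1])
        labels.append(best[0])
    return labels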