Example #1
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    # ne_labels = eval_ne_binary_model(options, iterable)

    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >>sys.stderr, "*** Evaluating..."
    data["sentences"] = iterable
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        # data["ne_labels"] = ne_labels[n]
        data["sentence_number"] = n
        data["double_quotes"] = False
        data["single_quotes"] = False

        labels = eval_model_sentence(options, data, model, words, poses)

        for word, pos, label in zip(words, poses, labels):
            print label
        print
Example #2
def tag_test(test_feature_file, trained_model_file, tag_test_set_file):
    fin = codecs.open(test_feature_file, 'r', 'utf-8')
    fout = codecs.open(tag_test_set_file, 'w', 'utf-8')
    m = MaxentModel()
    m.load(trained_model_file)
    contents = fin.read()
    feature_list = contents.split('\r')
    feature_list.remove('\n')
    for feature in feature_list:
        if (feature == 'split'):
            fout.write('\n\n\n')
            continue
        str_feature = []
        u_feature = feature.split(' ')
        for item in u_feature:
            str_feature.append(item.encode('utf-8'))
        label_prob_list = m.eval_all(str_feature)
        label = max_prob(label_prob_list)

        try:
            new_tag = str_feature[2].split('=')[1] + '/' + label
        except IndexError:
            print str_feature
        fout.write(new_tag.decode('utf-8'))
        pre_tag = label

    return feature_list
Example #3
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)

        ## some post-processing to remove sequences like: O I-ORG O
        previous_label = '^'

        for i in xrange(0, len(words)):
            label = labels[i]            
            if (label.startswith('I-')) and ((previous_label == 'O') or (previous_label == '^')):
                label = 'B' + label[1:]
            # if (i + 1 < len(words)) and (labels[i + 1] != 'O') and (labels[i] != 'O') and (labels[i + 1][0] != 'B') and (labels[i + 1][2:] != labels[i][2:]):
                # label = labels[i][:1] + labels[i + 1][2:]
            print label
            previous_label = label
        print
Example #4
def tag_test(test_feature_file, trained_model_file,  tag_test_set_file):
  fr = codecs.open(test_feature_file, 'r', 'utf-8')
  fw = codecs.open(tag_test_set_file, 'w', 'utf-8')
  m = MaxentModel()
  m.load(trained_model_file)
  contents = fr.read()
  feature_list = contents.split('\r')
  feature_list.remove('\n')
  for feature in feature_list:
    if (feature == 'split'):
      fw.write('\n\n\n')
      continue
    str_feature = []
    u_feature = feature.split(' ')
    for item in u_feature:
      str_feature.append(item.encode('utf-8'))
    label_prob_list = m.eval_all(str_feature)
    label = max_prob(label_prob_list)

    try:
      new_tag = str_feature[2].split('=')[1] + '/' + label
    except IndexError:
      print str_feature
    fw.write(new_tag.decode('utf-8'))
    pre_tag = label
  return feature_list
Example #5
def main():
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-i", "--input", type="string",
            help="test data as input")
    parser.add_option("-o", "--output", type="string",
            help="write detector result to OUTPUT")
    parser.add_option("-m", "--model", type="string",
            help="load trained model from MODEL")
    (options, args) = parser.parse_args()

    global m
    model = options.model
    m = MaxentModel()
    m.load(model)

    #in_file = sys.stdin
    if options.input:
        in_file = open(options.input)
    else:
        print >> sys.stderr, 'no input test data given'
        sys.exit(1)

    if len(args) >= 1:
        tag_in_file = open(args[0])

    out_file = sys.stdout

    if options.output:
        out_file = open(options.output, 'w')

    predict_file(in_file, out_file)
Example #6
def main():
  if len(sys.argv) != 2:
    print "Usage: MaxentTest.py modelName"
    sys.exit(1)
  
  model = MaxentModel()
  model.load(sys.argv[1])
  context = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
  label = model.eval(context, str(0))
  #result = model.eval_all(context)
  print "Result: ", label
Example #7
def main():
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-o",
                      "--output",
                      type="string",
                      help="write tagged result to OUTPUT")
    parser.add_option("-m",
                      "--model",
                      type="string",
                      help="load trained model from MODEL")
    parser.add_option("-t",
                      "--test",
                      action="store_true",
                      default=0,
                      help="test mode, include original tag in output")
    parser.add_option("-v",
                      "--verbose",
                      action="store_true",
                      dest="verbose",
                      default=1)
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose")
    parser.add_option("-T",
                      "--type",
                      type="int",
                      default=None,
                      help="choose context type")

    (options, args) = parser.parse_args()

    if not options.model:
        print >> sys.stderr, 'Tagger model name not given!'
        parser.print_usage()
        sys.exit(1)

    model = options.model
    tag_dict = cPickle.load(open(model + '.tagdict'))

    me = MaxentModel()
    me.load(model)
    tagger = postagger.PosTagger(me, tag_dict, options.type)

    tag_in_file = sys.stdin
    if len(args) >= 1:
        tag_in_file = open(args[0])

    tag_out_file = sys.stdout
    if options.output:
        tag_out_file = open(options.output, 'w')

    tag_file(tagger, tag_in_file, tag_out_file, options.test)
Example #8
def main():
    if len(sys.argv) != 2:
        print "Usage: MaxentTest.py modelName"
        sys.exit(1)

    model = MaxentModel()
    model.load(sys.argv[1])
    context = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0
    ]
    label = model.eval(context, str(0))
    #result = model.eval_all(context)
    print "Result: ", label
Example #9
def predict_tags(best_1_name, best_1_org, best_3_name, best_5_org, sentences,
                 f, op):
    rel = [
        'others', 'director', 'analyst', 'advisor', 'head', 'manager',
        'spokesperson', 'founder', 'professor', 'leave', 'lawyer'
    ]
    me = MaxentModel()
    me.load('../training/models/lbfgs/model3')
    count = 0
    for n1, o1, n3, o5, sent in zip(best_1_name, best_1_org, best_3_name,
                                    best_5_org, sentences):
        if len(n3) == 0 or len(o5) == 0:
            op.write(str((n1, o1)) + '\n')
        else:
            j = ('', '', '', 0.0)
            d = {}
            for name in n3:
                for org in o5:
                    context = get_context(name, org, sent)

                    relation = ''
                    prob = 0.0
                    if context != None:

                        for r in rel:
                            y = me.eval(context, r)
                            if y > prob:
                                prob = y
                                relation = r
                        #set_r.append((name,org,relation,prob))
                        d[(name, org)] = relation
                        if prob > j[3] and relation != 'others':
                            j = (name, org, relation, prob)

                    else:

                        d[(name, org)] = 'others'
            #print str(count)+' before : '+str(n1)+'\t'+str(o1)
            resolve_conflicts(n1, o1, j)
            #print str(count)+' after : '+str(n1)+'\t'+str(o1)
            #x = raw_input()
            op.write(str((n1, o1)) + '\n')
            f.write(str(j) + '\n')
        count = count + 1
Example #10
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)

        for word, pos, label in zip(words, poses, labels):
            print label
        print
Example #11
def eval_model(options, iterable):
    model = MaxentModel()
    data = {}

    print >> sys.stderr, "*** Loading..."
    model.load(options.model + ".maxent")
    with open(options.model + ".data", "r") as handle:
        data = cPickle.load(handle)

    print >> sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_model_sentence(options, data, model, words, poses)

        for word, pos, label in zip(words, poses, labels):
            print label
        print
Example #12
def eval_ne_binary_model(options, iterable):
    model = MaxentModel()
    data = {}

    predicted_labels = []

    print >>sys.stderr, "*** Loading..."
    model.load(options.model + ".ne.binary.maxent")
    with open(options.model + ".ne.binary.data", "r") as handle:
        data = cPickle.load(handle)

    print >>sys.stderr, "*** Evaluating..."
    for n, sentence in enumerate(iterable):
        if (n % 100) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses = map(list, zip(*sentence))
        labels = eval_ne_binary_model_sentence(options, data, model, words, poses)
        predicted_labels += [labels]

    return predicted_labels
Example #13
def main():
    usage = "usage: %prog [options] -m model file"
    parser = OptionParser(usage)
    parser.add_option("-o", "--output", type="string",
            help="write tagged result to OUTPUT")
    parser.add_option("-m", "--model", type="string", 
            help="load trained model from MODEL")
    parser.add_option("-t", "--test", action="store_true",
            default=0, help="test mode, include original tag in output")
    parser.add_option("-v", "--verbose",
                    action="store_true", dest="verbose", default=1)
    parser.add_option("-q", "--quiet",
                    action="store_false", dest="verbose")
    parser.add_option("-T","--type",  type="int", default=None, 
            help="choose context type")

    (options, args) = parser.parse_args()

    if not options.model:
        print >> sys.stderr, 'Tagger model name not given!'
        parser.print_usage()
        sys.exit(1)

    model = options.model
    tag_dict = cPickle.load(open(model + '.tagdict'))

    me = MaxentModel()
    me.load(model)
    tagger = postagger.PosTagger(me, tag_dict, options.type)

    tag_in_file = sys.stdin
    if len(args) >=1:
        tag_in_file = open(args[0])

    tag_out_file = sys.stdout
    if options.output:
        tag_out_file = open(options.output, 'w')

    tag_file(tagger, tag_in_file, tag_out_file, options.test)
Example #14
class MMEMAlgorithm(object):

    # implementation of the algorithm based on an HMM
    def __init__(self, compute_features, N_filter_func = N_default):
        self.filter_func = N_filter_func
        self.me = MaxentModel()
        self.num_train_iters = 2000
        self.compute_features = compute_features

    def load_memm_model(self, filename):
        self.me.load( filename  )

    def init(self):
        pass

    


    def train_model_file_list(self, corpus_filelist, ambiguity_dir ):
        self.me.begin_add_event()

        for corpus_file in corpus_filelist:
            print "Training on file {0}".format( corpus_file )
            sentence = []
            morph_analys_file = os.path.join( ambiguity_dir, os.path.basename( corpus_file ) )

            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func = self.filter_func ) if os.path.exists( morph_analys_file ) else None
            if morph_analys_tokens:
                print "Using mystem features on file {0}".format( morph_analys_file )

            gold_tokens = get_tokens_from_file(corpus_file, N_filter_func = self.filter_func )
            for corpus_token in gold_tokens:

                morph_analys_token = morph_analys_tokens.next() if morph_analys_tokens else None


                gold_token_word = corpus_token[0].word
                morph_analys_token_word = morph_analys_token[0].word if morph_analys_token else None
                if morph_analys_token_word:
                    if gold_token_word != morph_analys_token_word:
                        '''
                        if ('-' in gold_token_word and '-' not in morph_analys_token_word) or ('\'' in gold_token_word and '\'' not in morph_analys_token_word):
                            morph_analys_token = morph_analys_tokens.next()
                        if ('.' in gold_token_word):
                            cnt_dots = '.'.count( gold_token_word )
                            for i in xrange( 0, cnt_dots ):
                                morph_analys_token = morph_analys_tokens.next()
                        '''
                        print >>sys.stderr, u"Start skipping sentence. Gold token wordform {0} morph token wordform {1}".format( gold_token_word, morph_analys_token_word )

                        sentence = []
                        try:
                            next_gold = gold_tokens.next()
                            while( next_gold !=  [EOS_TOKEN] ):
                                next_gold = gold_tokens.next()

                            next_gold = gold_tokens.next()
                            next_morph = morph_analys_tokens.next()
                            while( next_morph[0].word != next_gold[0].word ):
                                next_morph = morph_analys_tokens.next()

                        except StopIteration:
                            break



                if corpus_token[0] == EOS_TOKEN and len(sentence) > 0:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i,token_info in enumerate( sentence ):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]] if token_info[1] and morph_analys_token else None

                        if token_info[1] is not None:
                            if gold_token.word != token_info[1][0].word:
                                print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0}     morph analysis token : {1}".format( gold_token.word, token_info[1][0].word )
                                morph_analysises = None

                        word_features = list( self.compute_features( sentence = words, i = i , prev_label= labels[ i - 1 ] if i >0 else None, analysises = morph_analysises, labels = labels) )
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram )
                    sentence = []
                else:
                    sentence.append( (corpus_token[0], morph_analys_token)  )


        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train( self.num_train_iters, 'lbfgs', 0.0 )
        maxent.set_verbose(0)

    def train_model(self, corpus_dir, ambiguity_dir ):
        self.me.begin_add_event()
        #self.B = train_B_corpus(corpus_dir = corpus_dir,N_filter_func = N_filter_func)
        sentence = []

        corpus_files = get_corpus_files(corpus_dir)
        for corpus_file in corpus_files:

            morph_analys_file = os.path.join( ambiguity_dir, os.path.basename( corpus_file ) )
            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func = self.filter_func )

            for corpus_token in get_tokens_from_file(corpus_file, N_filter_func = self.filter_func ):

                morph_analys_token = morph_analys_tokens.next()
                if corpus_token[0] == EOS_TOKEN:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i,token_info in enumerate( sentence ):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]]
                        if gold_token.word != token_info[1][0].word:
                            print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0}     morph analysis token : {1}".format( gold_token.word, token_info[1][0].word )
                            morph_analysises = None
                        word_features = list( self.compute_features( sentence = words, i = i , prev_label= labels[ i - 1 ] if i >0 else None, analysises = morph_analysises, labels = labels) )
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram )
                    sentence = []
                else:
                    sentence.append( (corpus_token[0], morph_analys_token)  )

        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train( 50, 'lbfgs', 0.0 )
        maxent.set_verbose(0)

    def load_model(self, memm_filename):
        self.me.load( memm_filename )

    def save_model(self, memm_filename):
        self.me.save( memm_filename )
        #dump_object( B_stat_filename, self.B )

    def remove_ambiguity_file(self, file, outfile):
        out_f =  codecs.open( outfile, 'w', 'utf-8' )
        sentence = []
        for token in get_tokens_from_file(file, N_filter_func= self.filter_func):
            if len(token) == 1 and token[0] == EOS_TOKEN:
                if len(sentence)>0:
                    no_ambig_tokens = self.remove_ambiguity( sentence )
                    for no_ambig_token in no_ambig_tokens:
                        out_f.write( u"{0}\t{1}={2}\r\n".format(no_ambig_token[0], 'nolemma', no_ambig_token[1] ) )
                    out_f.write('\r\n')
                    sentence = []
                    continue
                else:
                    sentence = []
                    continue

            sentence.append( (token[0].word, token) )
        out_f.close()

    def remove_ambiguity_dir(self, dir):
        pass

    def remove_ambiguity(self, variants):
        """
        Структура variants = [ (word_form, [tokens ]), (...) , (  ) ]
        """
        words = [variant[0]  for variant in variants]
        analysises = [[token.gram for token in variant[1]]  for variant in variants ]
        viterbi_layers = [ None for i in xrange(len(words)) ]

        viterbi_backpointers = [ None for i in xrange(len(words) + 1) ]

        # Compute first layer directly.
        viterbi_layers[0] = self.me.eval_all(list(self.compute_features(sentence=words, i = 0 , prev_label= None, analysises = analysises[0], labels = None ) ) )

        filtered_viterbi_layer = dict( (k, v) for k, v in viterbi_layers[0] if k in analysises[0] )
        viterbi_layer_0_prob = sum( [v for v in filtered_viterbi_layer.values() ]  )
        viterbi_layers[0] = dict( (k, math.log(v/viterbi_layer_0_prob) ) for k, v in filtered_viterbi_layer.items() )


        viterbi_backpointers[0] = dict( (k, None) for k, v in viterbi_layers[0].iteritems() )

        # Compute intermediate layers.
        for i in xrange(1, len(words)):
            viterbi_layers[i] = defaultdict(lambda: float("-inf"))
            viterbi_backpointers[i] = defaultdict(lambda: None)
            for prev_label, prev_logprob in viterbi_layers[i - 1].iteritems():
                features = self.compute_features(sentence=words,i= i, prev_label= prev_label, analysises = analysises[i], labels = None)
                features = list(features)
                distribution =  self.me.eval_all(features)
                distribution = dict( (label, prob) for label, prob in  distribution if label in analysises[i])

                distribution_sum = sum( [v for v in distribution.values() ]  )
                distribution = dict( (k, v/ distribution_sum) for k, v in distribution.items() )
                for label, prob in distribution.items():
                    logprob = math.log(prob)
                    if prev_logprob + logprob > viterbi_layers[i][label]:
                        viterbi_layers[i][label] = prev_logprob + logprob
                        viterbi_backpointers[i][label] = prev_label

        # Most probable endpoint.
        max_logprob = float("-inf")
        max_label = None
        for label, logprob in viterbi_layers[len(words) - 1].iteritems():
            if logprob > max_logprob:
                max_logprob = logprob
                max_label = label

        # Most probable sequence.
        path = []
        label = max_label
        for i in reversed(xrange(len(words))):
            path.insert(0, label)
            try:
                label = viterbi_backpointers[i][label]
            except KeyError:
                pass

        return zip(words,path)
Example #15
class BatchSBD:
    def __init__(self, dictpath):
        util.Logger.info('Initializing sbd instance...')
        self.tokenizer = Tokenizer.Tokenizer()
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0
        util.Logger.info('sbd instance Initialized.')

    def load(self, modelname=None, threshold=0.0):
        util.Logger.info('Loading model...')
        assert(modelname != None)
        assert(modelname.strip() != '')
        try:
            util.Logger.debug("Started to load model...")
            self.model.load(modelname)
            self.threshold = threshold
            util.Logger.debug("Completed to load model '%s'" % modelname)
        except:
            raise
        util.Logger.info('Model loaded.')

    def run(self, input=None, output=None, syllable_length=1, merged_use=False):
        util.Logger.info('run ' + input + ',' + output)
        assert(input != None)
        assert(input.strip() != '')
        assert(output != None)
        assert(output.strip() != '')
        try:
            # load document 
            util.Logger.info("Started to load document.")
            document = Document.Document()
            ifile = open(input)
            # build document
            util.Logger.info("Adding token to document.")
            self.tokenizer.clear()
            for token in self.tokenizer.tokenize(ifile):
                document.add(token)
            ifile.close()
            # detect sentence boundaries
            util.Logger.info("Detecting sentence boundaries.")
            ofile = open(output, "w+")
            line = ''
            lineno = 1
            for id in range(document.length()):
                prev = document.prev(id)
                curr = document.token(id)
                next = document.next(id)
                eos = False
                # check every position
                eos = self.eval(document, id, prev, curr, next, syllable_length, merged_use)
                if eos == None:
                    continue; # null field found
                line += curr.value
                if curr.isEoe():
                    line += ' '
                if eos and len(line.strip()) > 0:
                    if line[0:1] == ' ':
                        ofile.write('\n')
                    ofile.write(line.strip() + '\n')
                    line = ''
            ofile.write(line.strip() + '\n')
            ofile.close()
            document.clear()
            util.Logger.info("Detecting '%s' document completed." % input)
        except:
            raise

    def eos(self, context):
        label = 'yes'
        prob = self.model.eval(context, label)
        buf = ''
        if prob >= self.threshold:
            return True
        else:
            return False

    # append property into list-buf
    def append_maxent_parameter(self, list, i, property):
        i += 1
        list.append(str(i) + ':' + str(property))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken, syllable_length=0, merged_use=False):
        dict = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0: prefixName = default * syllable_length
            else: prefixName = prevToken.syllable(-1*(length+1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dict.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0: suffixName = default * syllable_length
            else: suffixName = nextToken.syllable(length+1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dict.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dict.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0: current_token_name = default
        else: current_token_name = currToken.value
        current_token_prob = dict.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0: prefix_token_name = default
        else: prefix_token_name = prevToken.value
        prefix_token_prob = dict.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0: suffix_token_name = default
        else: suffix_token_name = nextToken.value
        suffix_token_prob = dict.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        eos = self.eos(context)
        return eos

    def calc(self, answer, rule):
        if answer == True and rule == True:
            result = 'TP'
        elif answer == True and rule == False:
            result = 'TN'
        elif answer == False and rule == True:
            result = 'FP'
        else:
            result = 'FN'
        self.statistics[result] += 1

    def summary(self):
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        tn = self.statistics['TN']
        fp = self.statistics['FP']
        util.Logger.info("tp:", tp, "tn:", tn, "fp:", fp)
        if (tp + tn) > 0:
            precision = tp * 1.0 / (tp + tn)
        if (tp + fp) > 0:
            recall = tp * 1.0 / (tp + fp)
        if (precision+recall) > 0:
            fscore = (2*precision*recall) / (precision+recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
Example #16
class MaxentBasedSBD:
    def __init__(self, dictpath):
        self.tokenizer = Tokenizer.Tokenizer()
        self.documents = defaultdict(Document.Document)
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0

    def set(self, modelname=None, threshold=0.0, filename=None):
        assert(modelname != None)
        assert(modelname.strip() != '')
        assert(filename != None)
        assert(filename.strip() != '')
        try:
            util.Logger.debug("Started to load model...")
            self.model.load(modelname)
            self.threshold = threshold
            util.Logger.debug("Completed to load model '%s'" % modelname)
        except:
            raise
        try:
            util.Logger.debug("Started to load document...")
            document = Document.Document()
            file = open(filename)
            for token in self.tokenizer.tokenize(file):
                document.add(token)
            file.close()
            self.documents[filename] = document
            util.Logger.debug("Competed to load document '%s'" % filename)
        except:
            raise

    def get(self, filename=None):
        assert(filename != None)
        assert(filename.strip() != '')
        if filename in self.documents:
            return self.documents[filename]
        else:
            return Document.Document()

    def eos(self, context):
        label = 'yes'
        prob = self.model.eval(context, label)
        buf = ''
        if prob >= self.threshold:
            return True
        else:
            return False

    # append property into list-buf
    def append_maxent_parameter(self, list, i, property):
        i += 1
        list.append(str(i) + ':' + str(property))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken, syllable_length=0, merged_use=False):
        dict = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0: prefixName = default * syllable_length
            else: prefixName = prevToken.syllable(-1*(length+1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dict.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0: suffixName = default * syllable_length
            else: suffixName = nextToken.syllable(length+1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dict.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dict.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0: current_token_name = default
        else: current_token_name = currToken.value
        current_token_prob = dict.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0: prefix_token_name = default
        else: prefix_token_name = prevToken.value
        prefix_token_prob = dict.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0: suffix_token_name = default
        else: suffix_token_name = nextToken.value
        suffix_token_prob = dict.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        eos = self.eos(context)
        return eos

    def calc(self, answer, rule):
        if answer == True and rule == True:
            result = 'TP'
        elif answer == True and rule == False:
            result = 'TN'
        elif answer == False and rule == True:
            result = 'FP'
        else:
            result = 'FN'
        self.statistics[result] += 1

    def summary(self):
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        tn = self.statistics['TN']
        fp = self.statistics['FP']
        util.Logger.info("tp:", tp, "tn:", tn, "fp:", fp)
        if (tp + tn) > 0:
            precision = tp * 1.0 / (tp + tn)
        if (tp + fp) > 0:
            recall = tp * 1.0 / (tp + fp)
        if (precision+recall) > 0:
            fscore = (2*precision*recall) / (precision+recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
Example #17
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Imports
import sys, os

# Load MaxEnt models
corpusPath = os.environ.get('CORPUS_PATH')
from maxent import MaxentModel

maxEntModel = MaxentModel()
maxEntModel.load(corpusPath + '/model_markers.txt')

for trainLine in sys.stdin.readlines():
    trainCols = trainLine.split('\t')
    modelMarkerProbas = maxEntModel.eval_all(trainCols[1:])
    probaFeats = []
    for modelMarkerProba in modelMarkerProbas:
        if modelMarkerProba[1] > 0.00001:
            probaFeats.append(modelMarkerProba[0] + ':' +
                              str(modelMarkerProba[1]))
    print trainCols[0] + '\t' + '\t'.join(probaFeats)
Example #18
                  'b': self._dummy,
                  'm': self._dummy}
        return action[tag]

    def segment(self):
        """ sent must be utf8 decoded. """
        if not self.sentence:
            return ''
        ts = heappop(self._segmentationTag()).segtags
        return ' '.join(self._get_words(ts))
    
    _newword = lambda self, result, nextword: (result+[nextword], '')
    _dummy =   lambda self, result, nextword: (result, nextword)


def test_tagger(inputfile, model, segmenterClass):
    for line in open(inputfile):
        line = line.decode('utf8').strip()
        segmenter = segmenterClass(line, model)
        print segmenter.segment().encode('utf8')

if __name__ == "__main__":   
    import sys
    if len(sys.argv) != 3:
        print 'Usage: segmenter.py modelfile inputfile'
        sys.exit()
    from maxent import MaxentModel
    model = MaxentModel()
    model.load(sys.argv[1])
    test_tagger(sys.argv[2], model, segmenterClass=WordSegmenter)
Example #19
#!/usr/bin/python2.5

from maxent import MaxentModel

import sys

model_file = sys.argv[1]

m = MaxentModel()
m.load(model_file)
m.save(model_file+'.txt')

Example #20
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Imports
import sys, os

# Load MaxEnt models
corpusPath = os.environ.get('CORPUS_PATH')
from maxent import MaxentModel
maxEntModel = MaxentModel()
maxEntModel.load(corpusPath+'/model_markers.txt')

for trainLine in sys.stdin.readlines():
	trainCols = trainLine.split('\t')
	modelMarkerProbas = maxEntModel.eval_all(trainCols[1:])
	probaFeats = []
	for modelMarkerProba in modelMarkerProbas:
		if modelMarkerProba[1] > 0.00001:
			probaFeats.append(modelMarkerProba[0] + ':' + str(modelMarkerProba[1]))
	print trainCols[0] + '\t' + '\t'.join(probaFeats)