import maxent
from maxent import MaxentModel


def baseline(sentences_path, labels_path):

    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()

    with open(sentences_path) as file_content:
        sentences = file_content.readlines()
    with open(labels_path) as file_content:
        labels = file_content.readlines()

    # Train on the first 3000 lines; split() (rather than split(" "))
    # also drops the trailing newline from the final token.
    for i in xrange(3000):
        m.add_event(sentences[i].split(), labels[i].strip())

    m.end_add_event()

    m.train()

    correct = 0
    incorrect = 0

    # Evaluate on the remaining lines: eval() returns the probability of
    # the given label, here "1".
    for i in xrange(3000, len(sentences)):
        result = m.eval(sentences[i].split(), "1")
        result = int(round(result))
        label = int(labels[i])
        if result == label:
            correct += 1
        else:
            incorrect += 1

    print "correct   :", correct
    print "incorrect :", incorrect
    print "accuracy  : %.2f%%" % (correct * 100.0 / (correct + incorrect))
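
# Usage sketch for baseline(); the file names are hypothetical, with one
# whitespace-tokenized sentence per line and one 0/1 label per line.
if __name__ == "__main__":
    baseline("sentences.txt", "labels.txt")
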
class MaximumEntropyClassifier(Classifier):
    def __init__(self, restrictFeatures=False):
        Classifier.__init__(self)
        print "MaximumEntropy: Creating model"
        self.model = MaxentModel()
        self.model.verbose = 1
        self.restrictFeatures = restrictFeatures
        self.model.begin_add_event()

    def addToIndex(self, trainingset):
        for (vec,cls) in trainingset:
            self.addFeatureVector(vec,cls)
        
    def addFeatureVector(self, vec, cls, value=1, binary=False):
        # Filter features only when an explicit feature set was supplied;
        # with the default restrictFeatures=False the membership test
        # below would raise a TypeError.
        if self.restrictFeatures:
            for key in vec.keys():
                if key not in self.restrictFeatures:
                    del vec[key]
        context = vec.keys()
        label = "%s" % cls
        self.model.add_event(context, label, value)

    def compile(self):
        self.model.end_add_event()
        # 30 L-BFGS iterations with a Gaussian prior and 1e-3 tolerance
        self.model.train(30, "lbfgs", 2, 1E-03)
        #self.model.train(100, 'gis', 2)
        print "> Models trained"

    def classify(self, point, label='1', binary=False):
        result = self.model.eval(point.keys(), label)
        if result >= 0.5:
            return 1
        return -1
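
# Usage sketch, assuming a Classifier base class is in scope; the feature
# vectors and labels below are hypothetical {feature: value} dicts.
clf = MaximumEntropyClassifier(restrictFeatures=set(["good", "bad"]))
clf.addToIndex([({"good": 1}, 1), ({"bad": 1}, -1)])
clf.compile()
print clf.classify({"good": 1})  # 1 if P("1"|features) >= 0.5, else -1
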
Example #4
import sys

def main():
    if len(sys.argv) != 2:
        print "Usage: MaxentTest.py modelName"
        sys.exit(1)

    model = MaxentModel()
    model.load(sys.argv[1])
    # maxent contexts are lists of string features, so use '0' rather
    # than the integer 0; eval() returns the probability of the label.
    context = ['0'] * 25
    prob = model.eval(context, str(0))
    #result = model.eval_all(context)
    print "Result: ", prob
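
# Sketch of the alternative in the commented-out line above: eval_all()
# returns (label, probability) pairs for every label the model knows
# (behaviour assumed from the maxent binding).
def print_distribution(model, context):
    for lbl, prob in model.eval_all(context):
        print lbl, prob
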
Example #6
def predict_tags(best_1_name, best_1_org, best_3_name, best_5_org, sentences,
                 f, op):
    rel = [
        'others', 'director', 'analyst', 'advisor', 'head', 'manager',
        'spokesperson', 'founder', 'professor', 'leave', 'lawyer'
    ]
    me = MaxentModel()
    me.load('../training/models/lbfgs/model3')
    count = 0
    for n1, o1, n3, o5, sent in zip(best_1_name, best_1_org, best_3_name,
                                    best_5_org, sentences):
        if len(n3) == 0 or len(o5) == 0:
            op.write(str((n1, o1)) + '\n')
        else:
            # best (name, org, relation, prob) seen so far for this sentence
            j = ('', '', '', 0.0)
            d = {}
            for name in n3:
                for org in o5:
                    context = get_context(name, org, sent)
                    relation = ''
                    prob = 0.0
                    if context is not None:
                        # keep the highest-probability relation label
                        for r in rel:
                            y = me.eval(context, r)
                            if y > prob:
                                prob = y
                                relation = r
                        #set_r.append((name,org,relation,prob))
                        d[(name, org)] = relation
                        if prob > j[3] and relation != 'others':
                            j = (name, org, relation, prob)
                    else:
                        d[(name, org)] = 'others'
            #print str(count)+' before : '+str(n1)+'\t'+str(o1)
            resolve_conflicts(n1, o1, j)
            #print str(count)+' after : '+str(n1)+'\t'+str(o1)
            #x = raw_input()
            op.write(str((n1, o1)) + '\n')
            f.write(str(j) + '\n')
        count += 1
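
# Equivalent inner-loop sketch using eval_all(), which returns
# (label, probability) pairs, assumed here to cover the same eleven
# relations listed in rel above:
def best_relation(me, context):
    label, prob = max(me.eval_all(context), key=lambda pair: pair[1])
    return label, prob
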
class BatchSBD:
    def __init__(self, dictpath):
        util.Logger.info('Initializing sbd instance...')
        self.tokenizer = Tokenizer.Tokenizer()
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0
        util.Logger.info('sbd instance initialized.')

    def load(self, modelname=None, threshold=0.0):
        util.Logger.info('Loading model...')
        assert modelname is not None
        assert modelname.strip() != ''
        util.Logger.debug("Started to load model...")
        self.model.load(modelname)
        self.threshold = threshold
        util.Logger.debug("Completed loading model '%s'" % modelname)
        util.Logger.info('Model loaded.')

    def run(self, input=None, output=None, syllable_length=1, merged_use=False):
        util.Logger.info('run ' + input + ',' + output)
        assert input is not None
        assert input.strip() != ''
        assert output is not None
        assert output.strip() != ''
        # load document
        util.Logger.info("Started to load document.")
        document = Document.Document()
        ifile = open(input)
        # build document
        util.Logger.info("Adding tokens to document.")
        self.tokenizer.clear()
        for token in self.tokenizer.tokenize(ifile):
            document.add(token)
        ifile.close()
        # detect sentence boundaries
        util.Logger.info("Detecting sentence boundaries.")
        ofile = open(output, "w+")
        line = ''
        for id in range(document.length()):
            prev = document.prev(id)
            curr = document.token(id)
            next = document.next(id)
            # check every position
            eos = self.eval(document, id, prev, curr, next, syllable_length, merged_use)
            if eos is None:
                continue  # null field found
            line += curr.value
            if curr.isEoe():
                line += ' '
            if eos and len(line.strip()) > 0:
                if line[0:1] == ' ':
                    ofile.write('\n')
                ofile.write(line.strip() + '\n')
                line = ''
        ofile.write(line.strip() + '\n')
        ofile.close()
        document.clear()
        util.Logger.info("Finished detecting sentences in '%s'." % input)

    def eos(self, context):
        label = 'yes'
        prob = self.model.eval(context, label)
        return prob >= self.threshold

    # append an indexed feature of the form "<i>:<value>" to the context
    def append_maxent_parameter(self, context, i, value):
        i += 1
        context.append(str(i) + ':' + str(value))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken, syllable_length=0, merged_use=False):
        dictionary = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0:
                prefixName = default * syllable_length
            else:
                prefixName = prevToken.syllable(-1 * (length + 1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dictionary.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0:
                suffixName = default * syllable_length
            else:
                suffixName = nextToken.syllable(length + 1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dictionary.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dictionary.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0:
            current_token_name = default
        else:
            current_token_name = currToken.value
        current_token_prob = dictionary.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0:
            prefix_token_name = default
        else:
            prefix_token_name = prevToken.value
        prefix_token_prob = dictionary.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0:
            suffix_token_name = default
        else:
            suffix_token_name = nextToken.value
        suffix_token_prob = dictionary.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        return self.eos(context)

    def calc(self, answer, rule):
        # answer is the gold label, rule the prediction: a missed boundary
        # is a false negative, a spurious one a false positive.
        if answer and rule:
            result = 'TP'
        elif answer and not rule:
            result = 'FN'
        elif not answer and rule:
            result = 'FP'
        else:
            result = 'TN'
        self.statistics[result] += 1

    def summary(self):
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        fp = self.statistics['FP']
        fn = self.statistics['FN']
        util.Logger.info("tp: %d fp: %d fn: %d" % (tp, fp, fn))
        if (tp + fp) > 0:
            precision = tp * 1.0 / (tp + fp)
        if (tp + fn) > 0:
            recall = tp * 1.0 / (tp + fn)
        if (precision + recall) > 0:
            fscore = (2 * precision * recall) / (precision + recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
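
# Usage sketch for BatchSBD; the dictionary path, model name, threshold,
# and file names below are all hypothetical:
sbd = BatchSBD('dict/')
sbd.load(modelname='sbd.model', threshold=0.5)
sbd.run(input='raw.txt', output='sentences.txt', syllable_length=2)
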
class MaxentBasedSBD:
    def __init__(self, dictpath):
        self.tokenizer = Tokenizer.Tokenizer()
        self.documents = defaultdict(Document.Document)
        self.statistics = defaultdict(int)
        self.dictionary = Dictionary.Dictionary(dictpath)
        self.dictionary.load('syllable')
        self.dictionary.load('token')
        self.dictionary.load('type')
        self.dictionary.load('length')
        self.model = MaxentModel()
        self.threshold = 0.0

    def set(self, modelname=None, threshold=0.0, filename=None):
        assert modelname is not None
        assert modelname.strip() != ''
        assert filename is not None
        assert filename.strip() != ''
        util.Logger.debug("Started to load model...")
        self.model.load(modelname)
        self.threshold = threshold
        util.Logger.debug("Completed loading model '%s'" % modelname)
        util.Logger.debug("Started to load document...")
        document = Document.Document()
        infile = open(filename)
        for token in self.tokenizer.tokenize(infile):
            document.add(token)
        infile.close()
        self.documents[filename] = document
        util.Logger.debug("Completed loading document '%s'" % filename)

    def get(self, filename=None):
        assert filename is not None
        assert filename.strip() != ''
        if filename in self.documents:
            return self.documents[filename]
        else:
            return Document.Document()

    def eos(self, context):
        label = 'yes'
        prob = self.model.eval(context, label)
        return prob >= self.threshold

    # append an indexed feature of the form "<i>:<value>" to the context
    def append_maxent_parameter(self, context, i, value):
        i += 1
        context.append(str(i) + ':' + str(value))
        return i

    # FIXME: code duplication with sbd.detector.Probabilistic.py
    def eval(self, document, id, prevToken, currToken, nextToken, syllable_length=0, merged_use=False):
        dictionary = self.dictionary
        common = util.Common()
        # default token value
        default = '_'
        # { pos-type, pos-name }
        current_pos_type = common.name_of_type(currToken)
        current_pos_name = common.name_of_pos(currToken)
        prefix_pos_type = common.name_of_type(prevToken)
        prefix_pos_name = common.name_of_pos(prevToken)
        suffix_pos_type = common.name_of_type(nextToken)
        suffix_pos_name = common.name_of_pos(nextToken)
        # { syllables }
        prefix_syllable_name = []
        prefix_syllable_prob = []
        suffix_syllable_name = []
        suffix_syllable_prob = []
        merged_syllable_name = []
        merged_syllable_prob = []
        for length in xrange(syllable_length):
            if prevToken.length == 0:
                prefixName = default * syllable_length
            else:
                prefixName = prevToken.syllable(-1 * (length + 1))
            prefix_syllable_name.append(prefixName)
            prefix_syllable_prob.append(dictionary.getPrefixSyllableProb(prefixName))
            if nextToken.length == 0:
                suffixName = default * syllable_length
            else:
                suffixName = nextToken.syllable(length + 1)
            suffix_syllable_name.append(suffixName)
            suffix_syllable_prob.append(dictionary.getSuffixSyllableProb(suffixName))
            if merged_use:
                mergedName = prefixName + '_' + suffixName
                merged_syllable_name.append(mergedName)
                merged_syllable_prob.append(dictionary.getMergedSyllableProb(mergedName))
        # { token-name, token-prob }
        if currToken.length == 0:
            current_token_name = default
        else:
            current_token_name = currToken.value
        current_token_prob = dictionary.getCurrentTokenProb(current_token_name)
        if prevToken.length == 0:
            prefix_token_name = default
        else:
            prefix_token_name = prevToken.value
        prefix_token_prob = dictionary.getPrefixTokenProb(prefix_token_name)
        if nextToken.length == 0:
            suffix_token_name = default
        else:
            suffix_token_name = nextToken.value
        suffix_token_prob = dictionary.getSuffixTokenProb(suffix_token_name)
        # { candidate-distance }
        prefix_candidate_dist = document.prevCandidateDist(id)
        suffix_candidate_dist = document.nextCandidateDist(id)
        # { punctuation-distance }
        prefix_punctuation_dist = document.prevPunctuationDist(id)
        suffix_punctuation_dist = document.nextPunctuationDist(id)
        # { token-length }
        current_token_length = currToken.length
        prefix_token_length = prevToken.length
        suffix_token_length = nextToken.length
        # { end-of-sentence }
        end_of_sentence = 'no'
        if currToken.end_of_sentence:
            end_of_sentence = 'yes'
        context = [end_of_sentence]
        i = 0
        # { building instances }
        i = self.append_maxent_parameter(context, i, current_pos_type)
        i = self.append_maxent_parameter(context, i, current_pos_name)
        i = self.append_maxent_parameter(context, i, prefix_pos_type)
        i = self.append_maxent_parameter(context, i, prefix_pos_name)
        i = self.append_maxent_parameter(context, i, suffix_pos_type)
        i = self.append_maxent_parameter(context, i, suffix_pos_name)
        # XXX: maxent uses NAME instead of PROBABILITY
        for length in xrange(syllable_length):
            i = self.append_maxent_parameter(context, i, prefix_syllable_name[length])
            i = self.append_maxent_parameter(context, i, suffix_syllable_name[length])
            if merged_use:
                i = self.append_maxent_parameter(context, i, merged_syllable_name[length])
        i = self.append_maxent_parameter(context, i, current_token_name)
        i = self.append_maxent_parameter(context, i, prefix_token_name)
        i = self.append_maxent_parameter(context, i, suffix_token_name)
        i = self.append_maxent_parameter(context, i, str(current_token_length))
        i = self.append_maxent_parameter(context, i, str(prefix_token_length))
        i = self.append_maxent_parameter(context, i, str(suffix_token_length))
        return self.eos(context)

    def calc(self, answer, rule):
        # answer is the gold label, rule the prediction: a missed boundary
        # is a false negative, a spurious one a false positive.
        if answer and rule:
            result = 'TP'
        elif answer and not rule:
            result = 'FN'
        elif not answer and rule:
            result = 'FP'
        else:
            result = 'TN'
        self.statistics[result] += 1

    def summary(self):
        precision = 0.0
        recall = 0.0
        fscore = 0.0
        tp = self.statistics['TP']
        fp = self.statistics['FP']
        fn = self.statistics['FN']
        util.Logger.info("tp: %d fp: %d fn: %d" % (tp, fp, fn))
        if (tp + fp) > 0:
            precision = tp * 1.0 / (tp + fp)
        if (tp + fn) > 0:
            recall = tp * 1.0 / (tp + fn)
        if (precision + recall) > 0:
            fscore = (2 * precision * recall) / (precision + recall)
        util.Logger.info("Precision:\t%0.3f%%" % (precision * 100.0))
        util.Logger.info("Recall:\t\t%0.3f%%" % (recall * 100.0))
        util.Logger.info("Fscore:\t\t%0.3f%%" % (fscore * 100.0))
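
# Usage sketch for MaxentBasedSBD; the dictionary path, model name, and
# file name are hypothetical:
sbd = MaxentBasedSBD('dict/')
sbd.set(modelname='sbd.model', threshold=0.5, filename='raw.txt')
document = sbd.get('raw.txt')
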