def train(self, filename):
    """Build and train a MaxentModel from the events in *filename*,
    then keep the trained model on self.model."""
    model = MaxentModel()

    # All events must be registered between begin_add_event() and
    # end_add_event() before the model can be trained.
    model.begin_add_event()
    for word, history in self.load_data(filename):
        model.add_event(self.generate_features(word, history), word)
    model.end_add_event()

    model.train()
    self.model = model
def main():
    """Train a maxent classifier on the data set returned by load_data()
    and evaluate one hand-picked test instance, printing the resulting
    label distribution."""
    m = MaxentModel()
    data = load_data()

    # Events must be registered between begin_add_event()/end_add_event().
    m.begin_add_event()
    for vals, label in data:
        m.add_event(extract_features(vals), label)
    m.end_add_event()

    m.train()
    # Parenthesized print: identical output on Python 2 for a single
    # argument, and also valid Python 3 syntax.
    print('Trained')

    # Single unlabeled instance of 4 feature values
    # (NOTE(review): looks like iris-style measurements -- unverified).
    test_instance = [6.7, 3.1, 5.6, 2.4]
    test_features = extract_features(test_instance)

    # eval_all returns the full (label, probability) distribution.
    result = m.eval_all(test_features)
    print('Result')
    print(result)
# Example #3
# 0
'SHS.RELIG': 'Religions',
'SHS.SCIPO': 'Political science',
'SHS.SOCIO': 'Sociology',
'SHS.STAT': 'Methods and statistics',
'SDV': 'Life sciences',
'SPI': 'Engineering sciences',
'STAT': 'Statistics',
'QFIN': 'Economy and quantitative finance',
'OTHER': 'Other',
}

possible_labels = readable_descriptions.keys()


print "Loading model..."
model = MaxentModel()
model.load(str('data/model.maxent'))
print "Done."
print "Loading dict"
with open('data/dict-hal.pkl', 'rb') as f:
    dct = cPickle.load(f)
corpus = HALCorpus(dct=dct)
print "Done."

def lookup_category(probas):
    readable_list = []
    for lst in probas:
        if lst[0] in readable_descriptions:
            readable_list.append(
                    {'code':lst[0],
                    'proba':lst[1],
 def load(self, filename):
     """Loads a maxent model from disk.

     filename: path to a model file previously written by save();
     any existing self.model is discarded and replaced.
     """
     self.model = MaxentModel()
     self.model.load(filename)
class Maxent(Model):
    """Maximum-entropy model that predicts a word from the preceding
    ``history_length`` words."""

    def __init__(self, history_length=30):
        Model.__init__(self)
        # Number of preceding words used as conditioning context.
        self.history_length = history_length

    def get_probability(self, word, history):
        """Return the model probability of `word` given `history`.

        Returns 0.0 when `word` does not appear in the distribution
        produced by eval_all().
        """
        features  = self.generate_features(word, history)
        prob_dist = self.model.eval_all(features)

        for tag, prob in prob_dist:
            if tag == word:
                return prob
        return 0.0

    def load_data(self, filename):
        """Yield (word, history) pairs from `filename` (one word per line).

        `history` is the list of up to `history_length` words that
        precede the yielded word in the file.
        """
        # Use a context manager: the original opened the file inside a
        # list comprehension and leaked the handle.
        with open(filename, 'r') as handle:
            words = [line.strip() for line in handle]

        for index, word in enumerate(words):
            # Look at the most recent history_length words only.
            start_index = max(0, index - self.history_length)
            yield word, words[start_index:index]

    def train(self, filename):
        """Train a MaxentModel on the events in `filename` and keep the
        trained model as self.model."""
        m = MaxentModel()

        # Events must be registered between begin/end_add_event().
        m.begin_add_event()
        for word, history in self.load_data(filename):
            m.add_event(self.generate_features(word, history), word)
        m.end_add_event()

        m.train()
        self.model = m

    def generate_features(self, word, history):
        """Return the feature vector for `word` given the padded history."""
        history = self.pad_history(history)
        return Feature.eval(word, history)

    def pad_history(self, history):
        """Pads 'START' tag if history length is less than history_length."""
        if len(history) < self.history_length:
            history = ['START'] * (self.history_length - len(history)) + history
        return history

    def save(self, filename):
        """Saves the model to a file. Just calls the internal model's
        save function. We discard the value of history_length and
        feature_funcs. We assume history_length will always be the same,
        and we just reload the feature_funcs.
        """
        self.model.save(filename)

    def load(self, filename):
        """Loads a maxent model from disk, replacing any current model."""
        self.model = MaxentModel()
        self.model.load(filename)