def train(self, filename):
    """Fit a fresh ``MaxentModel`` on the events read from ``filename``.

    ``self.load_data(filename)`` is iterated as ``(word, history)`` pairs.
    For each pair the features produced by ``self.generate_features`` are
    registered as one training event whose outcome is ``word``.  The
    trained model is stored on ``self.model`` only after training finished.
    """
    events = self.load_data(filename)
    trainer = MaxentModel()

    # All events have to be added between begin_add_event/end_add_event.
    trainer.begin_add_event()
    for outcome, context in events:
        trainer.add_event(self.generate_features(outcome, context), outcome)
    trainer.end_add_event()

    trainer.train()
    self.model = trainer
def main():
    """Train a maxent model on ``load_data()`` and evaluate one instance.

    Every ``(vals, label)`` pair returned by ``load_data()`` is converted to
    features with ``extract_features`` and added as a training event.  After
    training, a hard-coded four-value test instance is run through
    ``eval_all`` and the result is printed.
    """
    m = MaxentModel()
    data = load_data()

    # All events have to be added between begin_add_event/end_add_event.
    m.begin_add_event()
    for vals, label in data:
        m.add_event(extract_features(vals), label)
    m.end_add_event()

    m.train()
    # Single-argument parenthesised print produces the same output under
    # Python 2 and is also valid Python 3, unlike the bare print statement.
    print('Trained')

    test_instance = [6.7, 3.1, 5.6, 2.4]
    test_features = extract_features(test_instance)
    result = m.eval_all(test_features)
    print('Result')
    print(result)
'SHS.RELIG': 'Religions', 'SHS.SCIPO': 'Political science', 'SHS.SOCIO': 'Sociology', 'SHS.STAT': 'Methods and statistics', 'SDV': 'Life sciences', 'SPI': 'Engineering sciences', 'STAT': 'Statistics', 'QFIN': 'Economy and quantitative finance', 'OTHER': 'Other', } possible_labels = readable_descriptions.keys() print "Loading model..." model = MaxentModel() model.load(str('data/model.maxent')) print "Done." print "Loading dict" with open('data/dict-hal.pkl', 'rb') as f: dct = cPickle.load(f) corpus = HALCorpus(dct=dct) print "Done." def lookup_category(probas): readable_list = [] for lst in probas: if lst[0] in readable_descriptions: readable_list.append( {'code':lst[0], 'proba':lst[1],
def load(self, filename):
    """Replace ``self.model`` with a maxent model read from ``filename``.

    A new ``MaxentModel`` is bound to ``self.model`` first and is then asked
    to load itself from the given path.
    """
    fresh_model = MaxentModel()
    self.model = fresh_model
    fresh_model.load(filename)
class Maxent(Model):
    """Maximum-entropy model scoring a word given the words preceding it.

    The underlying ``MaxentModel`` lives in ``self.model``; it is created by
    ``train`` or ``load``.
    """

    def __init__(self, history_length=30):
        """Initialise the base ``Model`` and store the history window size.

        Args:
            history_length: Maximum number of preceding words handed to the
                feature functions; shorter histories are padded to this
                length (see ``pad_history``).
        """
        Model.__init__(self)
        self.history_length = history_length

    def get_probability(self, word, history):
        """Return the probability the model assigns to ``word``.

        Args:
            word: The outcome whose probability is wanted.
            history: List of the words that precede ``word``.

        Returns:
            The probability paired with ``word`` in the output of
            ``self.model.eval_all``, or ``0.0`` if ``word`` is not among
            the returned outcomes.
        """
        features = self.generate_features(word, history)
        for tag, prob in self.model.eval_all(features):
            if tag == word:
                return prob
        return 0.0

    def load_data(self, filename):
        """Yield a ``(word, history)`` pair for every line of ``filename``.

        Each line, stripped of surrounding whitespace, is one word.  The
        history is the list of up to ``history_length`` words immediately
        before it (an empty list for the first word).
        """
        # Read the whole file inside a ``with`` block so the handle is closed
        # as soon as the words are in memory instead of being leaked.
        with open(filename, 'r') as handle:
            words = [line.strip() for line in handle]

        for index, word in enumerate(words):
            # look at the most recent N tags
            start_index = max(index - self.history_length, 0)
            yield word, words[start_index:index]

    def train(self, filename):
        """Train a new ``MaxentModel`` on ``filename`` and keep it.

        Every ``(word, history)`` pair from ``load_data`` becomes one
        training event: the generated features with ``word`` as outcome.
        ``self.model`` is replaced only after training has finished.
        """
        data = self.load_data(filename)
        m = MaxentModel()
        # All events have to be added between begin_add_event/end_add_event.
        m.begin_add_event()
        for word, history in data:
            m.add_event(self.generate_features(word, history), word)
        m.end_add_event()
        m.train()
        self.model = m

    def generate_features(self, word, history):
        """Return ``Feature.eval`` applied to ``word`` and the padded history."""
        return Feature.eval(word, self.pad_history(history))

    def pad_history(self, history):
        """Left-pad ``history`` with ``'START'`` up to ``history_length``.

        A history that is already at least ``history_length`` long is
        returned unchanged; otherwise a new, padded list is returned.
        """
        missing = self.history_length - len(history)
        if missing > 0:
            history = ['START'] * missing + history
        return history

    def save(self, filename):
        """Save the model to a file.

        Just calls the internal model's save function.  The values of
        history_length and feature_funcs are discarded: history_length is
        assumed to always be the same, and the feature_funcs are simply
        reloaded.
        """
        self.model.save(filename)

    def load(self, filename):
        """Load a maxent model from disk into ``self.model``."""
        self.model = MaxentModel()
        self.model.load(filename)