def compute_mi(self):
    """Compute the ``mi`` score of every morph and store it on the morph.

    For each value ``morph_probs`` of ``self.morphs`` the score is the sum,
    over the labels of ``morph_probs.p_given_att`` whose probability ``p_g``
    is strictly positive, of::

        p_g * (log(p_g) - log(self.class_probs[label]) - log(morph_probs.p))

    The result is written to ``morph_probs.mi``; nothing is returned.

    NOTE(review): this function takes ``self`` and the code below builds an
    ``Analyzer``, so it is presumably a method of that class, whose header
    lies outside this chunk -- confirm and re-indent when splicing back.
    """
    # Only the values are needed; ``.values()``/``.items()`` work on both
    # Python 2 and Python 3, unlike ``iteritems()``.
    for morph_probs in self.morphs.values():
        mi = 0
        for label, p_g in morph_probs.p_given_att.items():
            # Zero-probability labels are skipped: log(0) is undefined.
            if p_g > 0:
                # The logs are deliberately evaluated only here, so a morph
                # with no positive label never triggers a log() call.
                mi += p_g * (math.log(p_g)
                             - math.log(self.class_probs[label])
                             - math.log(morph_probs.p))
        morph_probs.mi = mi


def analyze(users, model, target='gender'):
    """Segment every user's id, then print all morphs ordered by ``mi``.

    Args:
        users: Iterable of user objects. Each one is read through ``id``,
            gets a ``segmented`` attribute assigned, and is printed.
        model: Passed unchanged to ``segmenter.morph_segments``.
        target: Passed as second argument to ``Analyzer``.
    """
    for user in users:
        user.segmented = segmenter.morph_segments(model, user.id)
        print(user)

    analyzer = Analyzer(users, target)
    # ``sorted`` instead of sorting ``values()`` in place: on Python 3
    # ``dict.values()`` is a view without a ``sort`` method.
    ranked = sorted(analyzer.morphs.values(),
                    key=lambda probs: probs.mi, reverse=True)
    for morph_prob in ranked:
        print(morph_prob)


if __name__ == '__main__':
    target = 'gender'
    # Only restrict the query when the target attribute is gender.
    whereclause = 'where gender is not ""' if target == 'gender' else ''
    users = segmenter.get_users_from_db(whereclause=whereclause)
    model = segmenter.load_model('../models/idmorphs_pre10naworl.model')
    analyze(users, model, target=target)
semisup_classifier.segfun = None cPickle.dump(semisup_classifier, f) semisup_classifier.segfun = segfun return semisup_classifier if __name__ == '__main__': target = 'gender' whereclause = "where gender is not ''" if target == 'gender' else '' model = segmenter.load_model('../models/idmorphs_naworl.model') segfun = segmenter.morph_segmenter(model, match='[a-z]+') # model_semi = segmenter.load_model('../models/idmorphs.model') # segfun_semi = segmenter.morph_segmenter(model_semi, match='[a-z]+') users = segmenter.get_users_from_db(whereclause=whereclause) male_ids = [user.id for user in users if user.gender == 'M'] female_ids = [user.id for user in users if user.gender == 'F'] # unlabeled_users = segmenter.get_users_from_db(tablename='naver') # unknown_ids = [user.id for user in unlabeled_users] unknown_ids = None cls = DoTest(male_ids, female_ids, segfun, unknown_ids, balance=False) # cls = cPickle.load(open("../models/init_nonum_semi3.pkl")) # cls.segfun = segfun_semi test_users = segmenter.get_users_from_db(tablename='naworl_test', whereclause=whereclause) test_male_ids = [user.id for user in test_users if user.gender == 'M'] test_female_ids = [user.id for user in test_users if user.gender == 'F']
# Script: gathers the ids of the 'naworl_train' and 'naworl_test' users, loads
# a segmentation model and unpickles an HMM from disk.
# NOTE(review): judging by the disabled training code below, the pickle is
# presumably a NumberStateHMM learned from those ids -- confirm.
__author__ = 'hee'

import cPickle
from collections import defaultdict
from datetime import datetime

import numpy as np

import segmenter
from structurer import TwoNamecharStateHMM, NumberStateHMM

if __name__ == '__main__':
    # Print floats without scientific notation and with wide lines.
    np.set_printoptions(suppress=True, linewidth=200)

    # Disabled alternative user source:
    # users = segmenter.get_users_from_db(tablename='naver')
    users = list()
    users.extend(segmenter.get_users_from_db(tablename='naworl_train'))
    users.extend(segmenter.get_users_from_db(tablename='naworl_test'))
    ids = [user.id for user in users]

    model = segmenter.load_model('../models/idmorphs_naworl_num.model')
    file_path = "../models/numhmm_nomonend_naworl_bynaworl.pkl"
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(file_path)

    # Disabled training step that would (re)create the pickle at file_path:
    # hmm = NumberStateHMM(n_iter=100)
    # hmm.learn(ids, model)
    # with open(file_path, 'w') as f:
    #     cPickle.dump(hmm, f)
    # print file_path

    # Use a context manager so the file handle is closed even if loading
    # fails. The mode is kept as 'r' to match how the file was opened before.
    # NOTE(review): a binary mode may be required for non-ASCII pickle
    # protocols -- confirm against how the file is written.
    with open(file_path, 'r') as pickle_file:
        hmm = cPickle.load(pickle_file)