import math
import operator
from collections import defaultdict

import bayes
import toolkit


def best_features():
    """Rank all training-set words by their mutual information with
    the class variable, most informative first."""
    ham_train = bayes.HAM + bayes.TRAIN
    spam_train = bayes.SPAM + bayes.TRAIN
    # Wrap every word in '/'...'/' so it can be used as a regular
    # expression feature by toolkit.countre / toolkit.presentre.
    ham_words = ['/' + s + '/' for s in
                 set.union(*map(toolkit.word_bag,
                                toolkit.get_files(ham_train)))]
    spam_words = ['/' + s + '/' for s in
                  set.union(*map(toolkit.word_bag,
                                 toolkit.get_files(spam_train)))]
    all_words = set(ham_words + spam_words)
    # p(w_i | c): Laplace-smoothed per-class match frequencies; words
    # unseen in a class default to probability zero.
    p_w_ham = defaultdict(lambda: toolkit.ZERO,
                          zip(ham_words,
                              toolkit.countre(ham_train, ham_words,
                                              smoothing=toolkit.ONE)[0]))
    p_w_spam = defaultdict(lambda: toolkit.ZERO,
                           zip(spam_words,
                               toolkit.countre(spam_train, spam_words,
                                               smoothing=toolkit.ONE)[0]))
    mut_inf = dict()
    maxlog = math.log(toolkit.MAX)

    def no_error(x, y):
        # log(x / y), clamped to log(toolkit.MAX) when x or y is zero.
        # Wherever the clamp fires below, the result is multiplied by a
        # zero probability, so it only serves to avoid the exception.
        try:
            return math.log(x / y)
        except Exception:
            return maxlog

    for word in all_words:
        p_w_h = p_w_ham[word]
        p_nw_h = toolkit.ONE - p_w_h
        p_w_s = p_w_spam[word]
        p_nw_s = toolkit.ONE - p_w_s
        p_w = p_w_h * toolkit.PRIOR_HAM + p_w_s * toolkit.PRIOR_SPAM
        p_nw = toolkit.ONE - p_w
        # log p(w | c) / p(w): the class-conditional goes in the
        # numerator, per I(W; C) = sum p(w, c) log(p(w, c) / (p(w) p(c))).
        log_w_h = no_error(p_w_h, p_w)
        log_nw_h = no_error(p_nw_h, p_nw)
        log_w_s = no_error(p_w_s, p_w)
        log_nw_s = no_error(p_nw_s, p_nw)
        mut_inf[word] = (p_w_h * toolkit.PRIOR_HAM * log_w_h
                         + p_nw_h * toolkit.PRIOR_HAM * log_nw_h
                         + p_w_s * toolkit.PRIOR_SPAM * log_w_s
                         + p_nw_s * toolkit.PRIOR_SPAM * log_nw_s)
    return sorted(mut_inf.iteritems(), key=operator.itemgetter(1),
                  reverse=True)
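# Usage sketch (illustration only, not part of the assignment code):
# best_features() returns (word, mutual_information) pairs in decreasing
# order, so the k most informative regex features are simply the first k
# entries.  `top_k_features` is a hypothetical helper.
def top_k_features(k):
    return [word for word, _ in best_features()[:k]]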
def instance_feature_prob(instance, features, clss, train=True,
                          smoothing=toolkit.ZERO):
    """
    Corresponds to [p(x_i | C_k) for x_i in x] from the assignment.

    instance is a message belonging to either class HAM or class SPAM;
    features is a collection of (compiled) regular expressions.  For
    each feature, the fraction of training/testing instances of the
    class that match it serves as a Bernoulli parameter, which is
    evaluated at the feature's presence or absence in the instance.

    ``The probability of an observation (i.e., an email) given the
    class (i.e., ham or spam), p(x|Ck) is then modelled as the
    probability of seeing specific keywords in the email.''
    """
    folder = clss + bayes.TRAIN if train else clss + bayes.TEST
    # Fraction of instances in folder matching each feature.
    countre_result = toolkit.countre(folder, features,
                                     smoothing=smoothing)[0]
    # Per feature: 1 if it matches this instance, 0 otherwise.
    feature_presence = toolkit.presentre(instance, features)
    # Bernoulli likelihood c^p * (1 - c)^(1 - p), defined as 1 when
    # both c and p are zero to sidestep 0 ** 0.
    return map(lambda c, p: 1 if c == toolkit.ZERO and p == toolkit.ZERO
               else pow(c, p) * pow(1 - c, 1 - p),
               countre_result, feature_presence)
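# Minimal classification sketch (illustration only, not part of the
# assignment code): under the naive-independence assumption the joint
# likelihood p(x | C_k) is the product of the per-feature terms from
# instance_feature_prob, scaled by the class priors already used in
# best_features above.  `classify` is a hypothetical helper.
def classify(instance, features):
    p_ham = reduce(operator.mul,
                   instance_feature_prob(instance, features, bayes.HAM),
                   toolkit.PRIOR_HAM)
    p_spam = reduce(operator.mul,
                    instance_feature_prob(instance, features, bayes.SPAM),
                    toolkit.PRIOR_SPAM)
    return bayes.HAM if p_ham >= p_spam else bayes.SPAM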